diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,97533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.515025776436565, + "eval_steps": 500, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 251.78126049041748, + "epoch": 0.00033530323986755525, + "grad_norm": 0.2821206415206205, + "kl": 0.0, + "learning_rate": 2.4691358024691357e-10, + "loss": -0.0, + "reward": 1.7428572177886963, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7517857365310192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2 + }, + { + "completion_length": 251.26340675354004, + "epoch": 0.0006706064797351105, + "grad_norm": 0.23751592598417418, + "kl": 2.596992999315262e-05, + "learning_rate": 4.938271604938271e-10, + "loss": 0.0, + "reward": 1.7803571969270706, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4 + }, + { + "completion_length": 255.4955472946167, + "epoch": 0.0010059097196026656, + "grad_norm": 0.34231130475220095, + "kl": 0.00018531084060668945, + "learning_rate": 7.407407407407407e-10, + "loss": 0.0, + "reward": 1.7053572311997414, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.7187500223517418, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6 + }, + { + "completion_length": 242.8571538925171, + "epoch": 0.001341212959470221, + "grad_norm": 0.282011994670917, + "kl": 0.0002923011779785156, + "learning_rate": 9.876543209876543e-10, + "loss": 0.0, + "reward": 1.7816964909434319, + "reward_std": 0.08649431029334664, + "rewards/equation_reward_func": 0.793750025331974, + "rewards/format_reward_func": 0.9879464358091354, + "step": 8 + }, + { + "completion_length": 246.70090103149414, + "epoch": 0.0016765161993377762, + "grad_norm": 0.4075528008052861, + "kl": 0.0003095269203186035, + "learning_rate": 1.2345679012345679e-09, + "loss": 0.0, + "reward": 1.7267858013510704, + "reward_std": 0.11364216078072786, + "rewards/equation_reward_func": 0.7401785962283611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10 + }, + { + "completion_length": 243.15179634094238, + "epoch": 0.002011819439205331, + "grad_norm": 0.2539315791340793, + "kl": 0.00033289194107055664, + "learning_rate": 1.4814814814814814e-09, + "loss": 0.0, + "reward": 1.775000050663948, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 12 + }, + { + "completion_length": 246.633939743042, + "epoch": 0.0023471226790728866, + "grad_norm": 0.24597106986052078, + "kl": 0.0003146529197692871, + "learning_rate": 1.728395061728395e-09, + "loss": 0.0, + "reward": 1.687500074505806, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.691964328289032, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14 + }, + { + "completion_length": 248.13840293884277, + "epoch": 0.002682425918940442, + "grad_norm": 0.2283280840772994, + "kl": 0.0003129243850708008, + "learning_rate": 1.9753086419753086e-09, + "loss": 0.0, + "reward": 1.7732143476605415, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7776786126196384, + "rewards/format_reward_func": 0.9955357164144516, + "step": 16 + }, + { + "completion_length": 243.49108219146729, + "epoch": 0.003017729158807997, + "grad_norm": 0.25527874503701053, + "kl": 0.0002974867820739746, + "learning_rate": 2.222222222222222e-09, + "loss": 0.0, + "reward": 1.7375000715255737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 18 + }, + { + "completion_length": 257.25000858306885, + "epoch": 0.0033530323986755524, + "grad_norm": 0.3755577292202996, + "kl": 0.0003231167793273926, + "learning_rate": 2.4691358024691357e-09, + "loss": 0.0, + "reward": 1.751785784959793, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 20 + }, + { + "completion_length": 248.70090293884277, + "epoch": 0.0036883356385431073, + "grad_norm": 0.25981511886182895, + "kl": 0.00031244754791259766, + "learning_rate": 2.7160493827160493e-09, + "loss": 0.0, + "reward": 1.7410715147852898, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7544643171131611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 22 + }, + { + "completion_length": 254.64733219146729, + "epoch": 0.004023638878410662, + "grad_norm": 0.15723937143886035, + "kl": 0.0003066062927246094, + "learning_rate": 2.962962962962963e-09, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071823984385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 24 + }, + { + "completion_length": 240.6919765472412, + "epoch": 0.004358942118278218, + "grad_norm": 0.45759362392893915, + "kl": 0.000310361385345459, + "learning_rate": 3.209876543209876e-09, + "loss": 0.0, + "reward": 1.7950893640518188, + "reward_std": 0.0776554741896689, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 0.9986607171595097, + "step": 26 + }, + { + "completion_length": 250.5446548461914, + "epoch": 0.004694245358145773, + "grad_norm": 0.15906385599381753, + "kl": 0.0002815127372741699, + "learning_rate": 3.45679012345679e-09, + "loss": 0.0, + "reward": 1.7589286491274834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 28 + }, + { + "completion_length": 245.8571548461914, + "epoch": 0.005029548598013328, + "grad_norm": 0.20213909633387014, + "kl": 0.0003229975700378418, + "learning_rate": 3.7037037037037036e-09, + "loss": 0.0, + "reward": 1.7946429252624512, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071566939354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 30 + }, + { + "completion_length": 239.7812614440918, + "epoch": 0.005364851837880884, + "grad_norm": 0.2975602670727807, + "kl": 0.0003528594970703125, + "learning_rate": 3.950617283950617e-09, + "loss": 0.0, + "reward": 1.7732143327593803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 32 + }, + { + "completion_length": 244.4196548461914, + "epoch": 0.005700155077748439, + "grad_norm": 0.19631508669566625, + "kl": 0.00030624866485595703, + "learning_rate": 4.197530864197531e-09, + "loss": 0.0, + "reward": 1.7767857685685158, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 34 + }, + { + "completion_length": 248.04018878936768, + "epoch": 0.006035458317615994, + "grad_norm": 0.1693595352246246, + "kl": 0.0003243684768676758, + "learning_rate": 4.444444444444444e-09, + "loss": 0.0, + "reward": 1.6482143849134445, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.6616071630269289, + "rewards/format_reward_func": 0.9866071492433548, + "step": 36 + }, + { + "completion_length": 246.6294765472412, + "epoch": 0.006370761557483549, + "grad_norm": 0.2152008240652874, + "kl": 0.0003387331962585449, + "learning_rate": 4.6913580246913574e-09, + "loss": 0.0, + "reward": 1.7517857775092125, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.765178594738245, + "rewards/format_reward_func": 0.9866071492433548, + "step": 38 + }, + { + "completion_length": 242.79465579986572, + "epoch": 0.006706064797351105, + "grad_norm": 0.2081818106050967, + "kl": 0.00032889842987060547, + "learning_rate": 4.938271604938271e-09, + "loss": 0.0, + "reward": 1.7482143640518188, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526786122471094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 40 + }, + { + "completion_length": 247.04911708831787, + "epoch": 0.00704136803721866, + "grad_norm": 0.197765113235337, + "kl": 0.00030040740966796875, + "learning_rate": 5.1851851851851846e-09, + "loss": 0.0, + "reward": 1.8107143640518188, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 42 + }, + { + "completion_length": 233.49108219146729, + "epoch": 0.007376671277086215, + "grad_norm": 0.22273231251306044, + "kl": 0.00031495094299316406, + "learning_rate": 5.4320987654320985e-09, + "loss": 0.0, + "reward": 1.8017857447266579, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 44 + }, + { + "completion_length": 237.5714406967163, + "epoch": 0.00771197451695377, + "grad_norm": 0.18181933051003635, + "kl": 0.00033354759216308594, + "learning_rate": 5.679012345679012e-09, + "loss": 0.0, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 46 + }, + { + "completion_length": 246.5937614440918, + "epoch": 0.008047277756821325, + "grad_norm": 0.1606775878103379, + "kl": 0.00032961368560791016, + "learning_rate": 5.925925925925926e-09, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286018162966, + "rewards/format_reward_func": 1.0, + "step": 48 + }, + { + "completion_length": 239.64733219146729, + "epoch": 0.008382580996688881, + "grad_norm": 0.3020505688952993, + "kl": 0.00031131505966186523, + "learning_rate": 6.172839506172839e-09, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 50 + }, + { + "completion_length": 241.3169755935669, + "epoch": 0.008717884236556436, + "grad_norm": 0.28625665696322333, + "kl": 0.0003256797790527344, + "learning_rate": 6.419753086419752e-09, + "loss": 0.0, + "reward": 1.7821429148316383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 52 + }, + { + "completion_length": 248.96429443359375, + "epoch": 0.009053187476423991, + "grad_norm": 0.26129573486345886, + "kl": 0.0003027915954589844, + "learning_rate": 6.666666666666667e-09, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.766071455553174, + "rewards/format_reward_func": 0.9910714328289032, + "step": 54 + }, + { + "completion_length": 250.23215293884277, + "epoch": 0.009388490716291546, + "grad_norm": 0.27548165311054307, + "kl": 0.00035876035690307617, + "learning_rate": 6.91358024691358e-09, + "loss": 0.0, + "reward": 1.7053572162985802, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7098214589059353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 56 + }, + { + "completion_length": 244.34376049041748, + "epoch": 0.009723793956159101, + "grad_norm": 0.22151518083461771, + "kl": 0.00031697750091552734, + "learning_rate": 7.160493827160494e-09, + "loss": 0.0, + "reward": 1.8053571954369545, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8098214492201805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 58 + }, + { + "completion_length": 241.06697273254395, + "epoch": 0.010059097196026656, + "grad_norm": 0.22549107156528939, + "kl": 0.000319063663482666, + "learning_rate": 7.407407407407407e-09, + "loss": 0.0, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 60 + }, + { + "completion_length": 243.7544755935669, + "epoch": 0.010394400435894211, + "grad_norm": 0.17997447995320268, + "kl": 0.00031387805938720703, + "learning_rate": 7.654320987654321e-09, + "loss": 0.0, + "reward": 1.741071492433548, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.74553575925529, + "rewards/format_reward_func": 0.9955357164144516, + "step": 62 + }, + { + "completion_length": 239.93750953674316, + "epoch": 0.010729703675761768, + "grad_norm": 0.29247137737836243, + "kl": 0.00034987926483154297, + "learning_rate": 7.901234567901234e-09, + "loss": 0.0, + "reward": 1.7285714894533157, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7375000305473804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 64 + }, + { + "completion_length": 246.06697273254395, + "epoch": 0.011065006915629323, + "grad_norm": 0.2353839170492452, + "kl": 0.00032019615173339844, + "learning_rate": 8.148148148148147e-09, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286129921675, + "rewards/format_reward_func": 1.0, + "step": 66 + }, + { + "completion_length": 239.946439743042, + "epoch": 0.011400310155496878, + "grad_norm": 0.23810286576070455, + "kl": 0.0003294944763183594, + "learning_rate": 8.395061728395062e-09, + "loss": 0.0, + "reward": 1.7625000774860382, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 68 + }, + { + "completion_length": 244.83929538726807, + "epoch": 0.011735613395364433, + "grad_norm": 0.4474750559486092, + "kl": 0.0003364682197570801, + "learning_rate": 8.641975308641974e-09, + "loss": 0.0, + "reward": 1.7714286148548126, + "reward_std": 0.11111677531152964, + "rewards/equation_reward_func": 0.7714286185801029, + "rewards/format_reward_func": 1.0, + "step": 70 + }, + { + "completion_length": 242.31697463989258, + "epoch": 0.012070916635231988, + "grad_norm": 0.23940161865679357, + "kl": 0.000335693359375, + "learning_rate": 8.888888888888889e-09, + "loss": 0.0, + "reward": 1.6892857998609543, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6892857477068901, + "rewards/format_reward_func": 1.0, + "step": 72 + }, + { + "completion_length": 250.2500123977661, + "epoch": 0.012406219875099543, + "grad_norm": 0.3547914135677079, + "kl": 0.00031691789627075195, + "learning_rate": 9.135802469135803e-09, + "loss": 0.0, + "reward": 1.748214341700077, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7526785861700773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 74 + }, + { + "completion_length": 247.17858505249023, + "epoch": 0.012741523114967098, + "grad_norm": 0.37705598365442405, + "kl": 0.0003069639205932617, + "learning_rate": 9.382716049382715e-09, + "loss": 0.0, + "reward": 1.7142857983708382, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 76 + }, + { + "completion_length": 240.8080472946167, + "epoch": 0.013076826354834654, + "grad_norm": 0.2500330365327647, + "kl": 0.00031298398971557617, + "learning_rate": 9.62962962962963e-09, + "loss": 0.0, + "reward": 1.7339286282658577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928969502449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 78 + }, + { + "completion_length": 252.24554824829102, + "epoch": 0.01341212959470221, + "grad_norm": 0.24171287363674568, + "kl": 0.000303804874420166, + "learning_rate": 9.876543209876543e-09, + "loss": 0.0, + "reward": 1.7433036342263222, + "reward_std": 0.044825518038123846, + "rewards/equation_reward_func": 0.7580357380211353, + "rewards/format_reward_func": 0.9852678664028645, + "step": 80 + }, + { + "completion_length": 247.12054824829102, + "epoch": 0.013747432834569764, + "grad_norm": 0.3022600089442828, + "kl": 0.00033527612686157227, + "learning_rate": 1.0123456790123458e-08, + "loss": 0.0, + "reward": 1.6678572446107864, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6678571682423353, + "rewards/format_reward_func": 1.0, + "step": 82 + }, + { + "completion_length": 245.23662281036377, + "epoch": 0.01408273607443732, + "grad_norm": 0.27039366876832605, + "kl": 0.00029647350311279297, + "learning_rate": 1.0370370370370369e-08, + "loss": 0.0, + "reward": 1.80892863124609, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 84 + }, + { + "completion_length": 250.4330472946167, + "epoch": 0.014418039314304874, + "grad_norm": 0.7187583509006711, + "kl": 0.0003364086151123047, + "learning_rate": 1.0617283950617284e-08, + "loss": 0.0, + "reward": 1.7321429327130318, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 86 + }, + { + "completion_length": 247.8169765472412, + "epoch": 0.01475334255417243, + "grad_norm": 0.23848320000262993, + "kl": 0.0003071427345275879, + "learning_rate": 1.0864197530864197e-08, + "loss": 0.0, + "reward": 1.751785770058632, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 88 + }, + { + "completion_length": 240.4776906967163, + "epoch": 0.015088645794039984, + "grad_norm": 0.27460619568878064, + "kl": 0.0003440380096435547, + "learning_rate": 1.111111111111111e-08, + "loss": 0.0, + "reward": 1.798214353621006, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8026785980910063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 90 + }, + { + "completion_length": 251.821439743042, + "epoch": 0.01542394903390754, + "grad_norm": 0.29052291960498944, + "kl": 0.00032770633697509766, + "learning_rate": 1.1358024691358023e-08, + "loss": 0.0, + "reward": 1.7571429386734962, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 92 + }, + { + "completion_length": 245.17411613464355, + "epoch": 0.015759252273775094, + "grad_norm": 0.1360821787656637, + "kl": 0.00029844045639038086, + "learning_rate": 1.1604938271604938e-08, + "loss": 0.0, + "reward": 1.7446429282426834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 94 + }, + { + "completion_length": 256.1517963409424, + "epoch": 0.01609455551364265, + "grad_norm": 0.27993944292911604, + "kl": 0.0003134012222290039, + "learning_rate": 1.1851851851851851e-08, + "loss": 0.0, + "reward": 1.7660714983940125, + "reward_std": 0.09848987311124802, + "rewards/equation_reward_func": 0.7794643267989159, + "rewards/format_reward_func": 0.9866071492433548, + "step": 96 + }, + { + "completion_length": 242.4553689956665, + "epoch": 0.016429858753510204, + "grad_norm": 0.2106037899562362, + "kl": 0.00028455257415771484, + "learning_rate": 1.2098765432098765e-08, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 98 + }, + { + "completion_length": 239.50447368621826, + "epoch": 0.016765161993377763, + "grad_norm": 0.3103782359638075, + "kl": 0.0003159642219543457, + "learning_rate": 1.2345679012345678e-08, + "loss": 0.0, + "reward": 1.764285795390606, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857693135738, + "rewards/format_reward_func": 1.0, + "step": 100 + }, + { + "completion_length": 254.9285831451416, + "epoch": 0.017100465233245318, + "grad_norm": 0.17884850735394575, + "kl": 0.00032651424407958984, + "learning_rate": 1.2592592592592592e-08, + "loss": 0.0, + "reward": 1.8000000640749931, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000417232513, + "rewards/format_reward_func": 1.0, + "step": 102 + }, + { + "completion_length": 248.58483219146729, + "epoch": 0.017435768473112873, + "grad_norm": 0.29105440062209287, + "kl": 0.00033777952194213867, + "learning_rate": 1.2839506172839504e-08, + "loss": 0.0, + "reward": 1.7629464864730835, + "reward_std": 0.03219861118122935, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 0.9986607171595097, + "step": 104 + }, + { + "completion_length": 251.37500953674316, + "epoch": 0.017771071712980428, + "grad_norm": 0.24668718286887112, + "kl": 0.0003491640090942383, + "learning_rate": 1.3086419753086419e-08, + "loss": 0.0, + "reward": 1.7071429342031479, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071428932249546, + "rewards/format_reward_func": 1.0, + "step": 106 + }, + { + "completion_length": 241.62501049041748, + "epoch": 0.018106374952847983, + "grad_norm": 0.2734650656617372, + "kl": 0.0003235936164855957, + "learning_rate": 1.3333333333333334e-08, + "loss": 0.0, + "reward": 1.7803572043776512, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 108 + }, + { + "completion_length": 252.74108219146729, + "epoch": 0.018441678192715538, + "grad_norm": 0.2705035956067135, + "kl": 0.00032901763916015625, + "learning_rate": 1.3580246913580247e-08, + "loss": 0.0, + "reward": 1.7375000789761543, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.741964315995574, + "rewards/format_reward_func": 0.9955357164144516, + "step": 110 + }, + { + "completion_length": 240.09822463989258, + "epoch": 0.018776981432583092, + "grad_norm": 0.14575037715148337, + "kl": 0.0003132820129394531, + "learning_rate": 1.382716049382716e-08, + "loss": 0.0, + "reward": 1.8000000640749931, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 112 + }, + { + "completion_length": 251.90626049041748, + "epoch": 0.019112284672450647, + "grad_norm": 0.28566608159303575, + "kl": 0.0003253817558288574, + "learning_rate": 1.4074074074074073e-08, + "loss": 0.0, + "reward": 1.7392857745289803, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7482143174856901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 114 + }, + { + "completion_length": 245.4419755935669, + "epoch": 0.019447587912318202, + "grad_norm": 0.22007001347735153, + "kl": 0.0003275871276855469, + "learning_rate": 1.4320987654320988e-08, + "loss": 0.0, + "reward": 1.7660714983940125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 116 + }, + { + "completion_length": 246.47769165039062, + "epoch": 0.019782891152185757, + "grad_norm": 0.24532766247165583, + "kl": 0.00031566619873046875, + "learning_rate": 1.4567901234567901e-08, + "loss": 0.0, + "reward": 1.7267857939004898, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7312500327825546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 118 + }, + { + "completion_length": 246.93304634094238, + "epoch": 0.020118194392053312, + "grad_norm": 0.21397852222502559, + "kl": 0.00031769275665283203, + "learning_rate": 1.4814814814814814e-08, + "loss": 0.0, + "reward": 1.7232143506407738, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 120 + }, + { + "completion_length": 245.8035831451416, + "epoch": 0.020453497631920867, + "grad_norm": 0.44949071446508276, + "kl": 0.0003509521484375, + "learning_rate": 1.5061728395061727e-08, + "loss": 0.0, + "reward": 1.7589286342263222, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928768336773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 122 + }, + { + "completion_length": 250.5848331451416, + "epoch": 0.020788800871788422, + "grad_norm": 0.00010308908116451039, + "kl": 0.00032585859298706055, + "learning_rate": 1.5308641975308642e-08, + "loss": 0.0, + "reward": 1.732142947614193, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 124 + }, + { + "completion_length": 242.9062623977661, + "epoch": 0.021124104111655977, + "grad_norm": 0.2326498304971465, + "kl": 0.0003529787063598633, + "learning_rate": 1.5555555555555554e-08, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.775000037625432, + "rewards/format_reward_func": 1.0, + "step": 126 + }, + { + "completion_length": 248.71876049041748, + "epoch": 0.021459407351523536, + "grad_norm": 0.2724319389491217, + "kl": 0.0003319978713989258, + "learning_rate": 1.580246913580247e-08, + "loss": 0.0, + "reward": 1.767857201397419, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7767857573926449, + "rewards/format_reward_func": 0.9910714328289032, + "step": 128 + }, + { + "completion_length": 244.5982265472412, + "epoch": 0.02179471059139109, + "grad_norm": 0.28980555293971616, + "kl": 0.00033468008041381836, + "learning_rate": 1.6049382716049383e-08, + "loss": 0.0, + "reward": 1.796428620815277, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 130 + }, + { + "completion_length": 246.5491189956665, + "epoch": 0.022130013831258646, + "grad_norm": 0.22034484681712568, + "kl": 0.00032633543014526367, + "learning_rate": 1.6296296296296295e-08, + "loss": 0.0, + "reward": 1.7267857939004898, + "reward_std": 0.0732360603287816, + "rewards/equation_reward_func": 0.7401786036789417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 132 + }, + { + "completion_length": 251.6919765472412, + "epoch": 0.0224653170711262, + "grad_norm": 0.21857858965530919, + "kl": 0.00031572580337524414, + "learning_rate": 1.654320987654321e-08, + "loss": 0.0, + "reward": 1.7660714909434319, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7705357261002064, + "rewards/format_reward_func": 0.9955357164144516, + "step": 134 + }, + { + "completion_length": 244.83483505249023, + "epoch": 0.022800620310993756, + "grad_norm": 0.2697978233876309, + "kl": 0.00033473968505859375, + "learning_rate": 1.6790123456790124e-08, + "loss": 0.0, + "reward": 1.7500000447034836, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 136 + }, + { + "completion_length": 256.58483505249023, + "epoch": 0.02313592355086131, + "grad_norm": 0.300841254760372, + "kl": 0.0002970099449157715, + "learning_rate": 1.7037037037037036e-08, + "loss": 0.0, + "reward": 1.7285715192556381, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 138 + }, + { + "completion_length": 246.47768783569336, + "epoch": 0.023471226790728866, + "grad_norm": 0.2083502710840338, + "kl": 0.0003154277801513672, + "learning_rate": 1.7283950617283947e-08, + "loss": 0.0, + "reward": 1.733928643167019, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7383928969502449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 140 + }, + { + "completion_length": 253.37054538726807, + "epoch": 0.02380653003059642, + "grad_norm": 0.19329695372740538, + "kl": 0.00031769275665283203, + "learning_rate": 1.7530864197530862e-08, + "loss": 0.0, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 142 + }, + { + "completion_length": 244.33483123779297, + "epoch": 0.024141833270463976, + "grad_norm": 0.2657022566055933, + "kl": 0.00033587217330932617, + "learning_rate": 1.7777777777777777e-08, + "loss": 0.0, + "reward": 1.7642857655882835, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.773214302957058, + "rewards/format_reward_func": 0.9910714328289032, + "step": 144 + }, + { + "completion_length": 241.88840293884277, + "epoch": 0.02447713651033153, + "grad_norm": 0.24003225151699342, + "kl": 0.00033402442932128906, + "learning_rate": 1.8024691358024692e-08, + "loss": 0.0, + "reward": 1.7392857894301414, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 146 + }, + { + "completion_length": 254.4598331451416, + "epoch": 0.024812439750199086, + "grad_norm": 0.2815512180517347, + "kl": 0.0003273487091064453, + "learning_rate": 1.8271604938271607e-08, + "loss": 0.0, + "reward": 1.7017857804894447, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7062500528991222, + "rewards/format_reward_func": 0.9955357164144516, + "step": 148 + }, + { + "completion_length": 235.10268878936768, + "epoch": 0.02514774299006664, + "grad_norm": 0.32352647991566374, + "kl": 0.00031512975692749023, + "learning_rate": 1.8518518518518518e-08, + "loss": 0.0, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 150 + }, + { + "completion_length": 243.23215198516846, + "epoch": 0.025483046229934195, + "grad_norm": 0.1719160101460022, + "kl": 0.0003261566162109375, + "learning_rate": 1.876543209876543e-08, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 152 + }, + { + "completion_length": 247.1071548461914, + "epoch": 0.02581834946980175, + "grad_norm": 0.2568745935002292, + "kl": 0.00030517578125, + "learning_rate": 1.9012345679012344e-08, + "loss": 0.0, + "reward": 1.7089286521077156, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7133929040282965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 154 + }, + { + "completion_length": 242.92411994934082, + "epoch": 0.02615365270966931, + "grad_norm": 0.32403116735365, + "kl": 0.00030052661895751953, + "learning_rate": 1.925925925925926e-08, + "loss": 0.0, + "reward": 1.783928632736206, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 156 + }, + { + "completion_length": 245.2053680419922, + "epoch": 0.026488955949536864, + "grad_norm": 0.2945305207519835, + "kl": 0.00034099817276000977, + "learning_rate": 1.950617283950617e-08, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 158 + }, + { + "completion_length": 250.0312623977661, + "epoch": 0.02682425918940442, + "grad_norm": 0.2981654874681236, + "kl": 0.00033992528915405273, + "learning_rate": 1.9753086419753086e-08, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7875000201165676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 160 + }, + { + "completion_length": 252.8616180419922, + "epoch": 0.027159562429271974, + "grad_norm": 0.20431294977682624, + "kl": 0.0002976059913635254, + "learning_rate": 2e-08, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 162 + }, + { + "completion_length": 246.81697368621826, + "epoch": 0.02749486566913953, + "grad_norm": 0.1897492145271756, + "kl": 0.00032776594161987305, + "learning_rate": 2.0246913580246915e-08, + "loss": 0.0, + "reward": 1.7375000715255737, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 164 + }, + { + "completion_length": 242.5178689956665, + "epoch": 0.027830168909007084, + "grad_norm": 0.1981890035443643, + "kl": 0.00030219554901123047, + "learning_rate": 2.0493827160493823e-08, + "loss": 0.0, + "reward": 1.7089286670088768, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.713392898440361, + "rewards/format_reward_func": 0.9955357164144516, + "step": 166 + }, + { + "completion_length": 242.95536994934082, + "epoch": 0.02816547214887464, + "grad_norm": 0.27498948462332695, + "kl": 0.00031065940856933594, + "learning_rate": 2.0740740740740738e-08, + "loss": 0.0, + "reward": 1.7267858237028122, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 168 + }, + { + "completion_length": 244.3169765472412, + "epoch": 0.028500775388742194, + "grad_norm": 0.18119311717593303, + "kl": 0.0003190040588378906, + "learning_rate": 2.0987654320987653e-08, + "loss": 0.0, + "reward": 1.7535715103149414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 170 + }, + { + "completion_length": 250.15179824829102, + "epoch": 0.02883607862860975, + "grad_norm": 0.1600883159721621, + "kl": 0.0003666877746582031, + "learning_rate": 2.1234567901234568e-08, + "loss": 0.0, + "reward": 1.775000087916851, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 172 + }, + { + "completion_length": 252.1607255935669, + "epoch": 0.029171381868477304, + "grad_norm": 0.20143375779388584, + "kl": 0.00031435489654541016, + "learning_rate": 2.148148148148148e-08, + "loss": 0.0, + "reward": 1.7308036610484123, + "reward_std": 0.0473508988507092, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 0.9986607171595097, + "step": 174 + }, + { + "completion_length": 245.46429538726807, + "epoch": 0.02950668510834486, + "grad_norm": 0.38570735794849914, + "kl": 0.0003286004066467285, + "learning_rate": 2.1728395061728394e-08, + "loss": 0.0, + "reward": 1.7125001028180122, + "reward_std": 0.10354063380509615, + "rewards/equation_reward_func": 0.7169643118977547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 176 + }, + { + "completion_length": 247.22769165039062, + "epoch": 0.029841988348212414, + "grad_norm": 0.3972032973187878, + "kl": 0.0003095269203186035, + "learning_rate": 2.197530864197531e-08, + "loss": 0.0, + "reward": 1.748214341700077, + "reward_std": 0.10354063287377357, + "rewards/equation_reward_func": 0.7526786141097546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 178 + }, + { + "completion_length": 243.81697463989258, + "epoch": 0.03017729158807997, + "grad_norm": 0.3034668175618403, + "kl": 0.00033104419708251953, + "learning_rate": 2.222222222222222e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 180 + }, + { + "completion_length": 244.0089406967163, + "epoch": 0.030512594827947524, + "grad_norm": 0.20606761170561028, + "kl": 0.00030857324600219727, + "learning_rate": 2.2469135802469135e-08, + "loss": 0.0, + "reward": 1.7875000536441803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643055647612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 182 + }, + { + "completion_length": 250.7098331451416, + "epoch": 0.03084789806781508, + "grad_norm": 0.20840818777387127, + "kl": 0.00032150745391845703, + "learning_rate": 2.2716049382716047e-08, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 184 + }, + { + "completion_length": 238.39733219146729, + "epoch": 0.031183201307682637, + "grad_norm": 0.30613951822839514, + "kl": 0.00033026933670043945, + "learning_rate": 2.296296296296296e-08, + "loss": 0.0, + "reward": 1.8000000566244125, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 186 + }, + { + "completion_length": 242.68751335144043, + "epoch": 0.03151850454755019, + "grad_norm": 0.24889609063442605, + "kl": 0.0003234744071960449, + "learning_rate": 2.3209876543209876e-08, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428958326578, + "rewards/format_reward_func": 1.0, + "step": 188 + }, + { + "completion_length": 236.5714406967163, + "epoch": 0.03185380778741775, + "grad_norm": 0.25782909812310634, + "kl": 0.0003020763397216797, + "learning_rate": 2.345679012345679e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 190 + }, + { + "completion_length": 242.67412090301514, + "epoch": 0.0321891110272853, + "grad_norm": 0.16179825374337622, + "kl": 0.0003166794776916504, + "learning_rate": 2.3703703703703703e-08, + "loss": 0.0, + "reward": 1.8178571984171867, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.8267857432365417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 192 + }, + { + "completion_length": 246.08929634094238, + "epoch": 0.03252441426715286, + "grad_norm": 0.20594663668147256, + "kl": 0.00032722949981689453, + "learning_rate": 2.3950617283950614e-08, + "loss": 0.0, + "reward": 1.8089286237955093, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8133928775787354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 194 + }, + { + "completion_length": 249.73215770721436, + "epoch": 0.03285971750702041, + "grad_norm": 0.2088766158551277, + "kl": 0.00032842159271240234, + "learning_rate": 2.419753086419753e-08, + "loss": 0.0, + "reward": 1.7089286595582962, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928947150707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 196 + }, + { + "completion_length": 246.8571548461914, + "epoch": 0.03319502074688797, + "grad_norm": 0.19387291215229924, + "kl": 0.00029861927032470703, + "learning_rate": 2.4444444444444444e-08, + "loss": 0.0, + "reward": 1.7946429252624512, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071604192257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 198 + }, + { + "completion_length": 246.4910831451416, + "epoch": 0.033530323986755525, + "grad_norm": 0.14395851317716862, + "kl": 0.00030988454818725586, + "learning_rate": 2.4691358024691355e-08, + "loss": 0.0, + "reward": 1.7553572207689285, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214540630579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 200 + }, + { + "completion_length": 234.1294765472412, + "epoch": 0.03386562722662308, + "grad_norm": 0.24436453124931118, + "kl": 0.00032639503479003906, + "learning_rate": 2.493827160493827e-08, + "loss": 0.0, + "reward": 1.7517857775092125, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.75625004991889, + "rewards/format_reward_func": 0.9955357164144516, + "step": 202 + }, + { + "completion_length": 248.93304634094238, + "epoch": 0.034200930466490635, + "grad_norm": 0.2248675849511274, + "kl": 0.0003234744071960449, + "learning_rate": 2.5185185185185185e-08, + "loss": 0.0, + "reward": 1.7285715267062187, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 204 + }, + { + "completion_length": 246.19197368621826, + "epoch": 0.03453623370635819, + "grad_norm": 0.22934369353586878, + "kl": 0.0003287792205810547, + "learning_rate": 2.54320987654321e-08, + "loss": 0.0, + "reward": 1.7500000894069672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 206 + }, + { + "completion_length": 245.40179634094238, + "epoch": 0.034871536946225745, + "grad_norm": 0.29961009398828276, + "kl": 0.00034368038177490234, + "learning_rate": 2.5679012345679008e-08, + "loss": 0.0, + "reward": 1.7616072222590446, + "reward_std": 0.06944798538461328, + "rewards/equation_reward_func": 0.7723214626312256, + "rewards/format_reward_func": 0.9892857223749161, + "step": 208 + }, + { + "completion_length": 248.41518878936768, + "epoch": 0.0352068401860933, + "grad_norm": 0.15792865989980773, + "kl": 0.00034034252166748047, + "learning_rate": 2.5925925925925923e-08, + "loss": 0.0, + "reward": 1.769642896950245, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7741071861237288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 210 + }, + { + "completion_length": 242.93304538726807, + "epoch": 0.035542143425960855, + "grad_norm": 0.3301838137697797, + "kl": 0.0003197193145751953, + "learning_rate": 2.6172839506172838e-08, + "loss": 0.0, + "reward": 1.750446505844593, + "reward_std": 0.07007933082059026, + "rewards/equation_reward_func": 0.7562500145286322, + "rewards/format_reward_func": 0.9941964335739613, + "step": 212 + }, + { + "completion_length": 248.7276906967163, + "epoch": 0.03587744666582841, + "grad_norm": 0.35616311274054235, + "kl": 0.0003361701965332031, + "learning_rate": 2.6419753086419752e-08, + "loss": 0.0, + "reward": 1.7678572162985802, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 214 + }, + { + "completion_length": 248.99108505249023, + "epoch": 0.036212749905695965, + "grad_norm": 0.23756977130728407, + "kl": 0.00034689903259277344, + "learning_rate": 2.6666666666666667e-08, + "loss": 0.0, + "reward": 1.7017857804894447, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7062500342726707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 216 + }, + { + "completion_length": 248.22768688201904, + "epoch": 0.03654805314556352, + "grad_norm": 0.2756674602915548, + "kl": 0.00029724836349487305, + "learning_rate": 2.691358024691358e-08, + "loss": 0.0, + "reward": 1.705357238650322, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7098214626312256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 218 + }, + { + "completion_length": 233.95983219146729, + "epoch": 0.036883356385431075, + "grad_norm": 0.2544126349698565, + "kl": 0.0003007054328918457, + "learning_rate": 2.7160493827160494e-08, + "loss": 0.0, + "reward": 1.7625000700354576, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643107801676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 220 + }, + { + "completion_length": 250.9553680419922, + "epoch": 0.03721865962529863, + "grad_norm": 0.213788279693055, + "kl": 0.0003082156181335449, + "learning_rate": 2.740740740740741e-08, + "loss": 0.0, + "reward": 1.7589286267757416, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 222 + }, + { + "completion_length": 249.2500123977661, + "epoch": 0.037553962865166185, + "grad_norm": 0.2610069019462948, + "kl": 0.0003325939178466797, + "learning_rate": 2.765432098765432e-08, + "loss": 0.0, + "reward": 1.72857154160738, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.728571455925703, + "rewards/format_reward_func": 1.0, + "step": 224 + }, + { + "completion_length": 247.03572368621826, + "epoch": 0.037889266105033736, + "grad_norm": 0.2206000139730647, + "kl": 0.0003177523612976074, + "learning_rate": 2.790123456790123e-08, + "loss": 0.0, + "reward": 1.773214340209961, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 226 + }, + { + "completion_length": 243.68304634094238, + "epoch": 0.038224569344901295, + "grad_norm": 0.34760430842090373, + "kl": 0.0003180503845214844, + "learning_rate": 2.8148148148148146e-08, + "loss": 0.0, + "reward": 1.7267857789993286, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 228 + }, + { + "completion_length": 259.589298248291, + "epoch": 0.03855987258476885, + "grad_norm": 0.20944117193770692, + "kl": 0.000335693359375, + "learning_rate": 2.839506172839506e-08, + "loss": 0.0, + "reward": 1.6821429431438446, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.6910714581608772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 230 + }, + { + "completion_length": 241.665189743042, + "epoch": 0.038895175824636405, + "grad_norm": 0.2512126660543398, + "kl": 0.00031304359436035156, + "learning_rate": 2.8641975308641976e-08, + "loss": 0.0, + "reward": 1.7285714894533157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 232 + }, + { + "completion_length": 234.51786708831787, + "epoch": 0.03923047906450396, + "grad_norm": 0.18261909607261712, + "kl": 0.000345766544342041, + "learning_rate": 2.8888888888888887e-08, + "loss": 0.0, + "reward": 1.73392865806818, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7375000305473804, + "rewards/format_reward_func": 0.9964285790920258, + "step": 234 + }, + { + "completion_length": 252.02679634094238, + "epoch": 0.039565782304371515, + "grad_norm": 0.20775712922379, + "kl": 0.0003249645233154297, + "learning_rate": 2.9135802469135802e-08, + "loss": 0.0, + "reward": 1.7589286416769028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 236 + }, + { + "completion_length": 242.97768783569336, + "epoch": 0.03990108554423907, + "grad_norm": 0.1833960305143577, + "kl": 0.00034111738204956055, + "learning_rate": 2.9382716049382714e-08, + "loss": 0.0, + "reward": 1.7446429207921028, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 238 + }, + { + "completion_length": 244.35269165039062, + "epoch": 0.040236388784106625, + "grad_norm": 0.08606613377561309, + "kl": 0.0003235340118408203, + "learning_rate": 2.962962962962963e-08, + "loss": 0.0, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000186264515, + "rewards/format_reward_func": 1.0, + "step": 240 + }, + { + "completion_length": 242.70090675354004, + "epoch": 0.04057169202397418, + "grad_norm": 0.22739026239901686, + "kl": 0.00033462047576904297, + "learning_rate": 2.987654320987654e-08, + "loss": 0.0, + "reward": 1.7982143387198448, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.802678607404232, + "rewards/format_reward_func": 0.9955357164144516, + "step": 242 + }, + { + "completion_length": 240.8169755935669, + "epoch": 0.040906995263841735, + "grad_norm": 0.23189861598153808, + "kl": 0.00033158063888549805, + "learning_rate": 3.0123456790123455e-08, + "loss": 0.0, + "reward": 1.7357143685221672, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 244 + }, + { + "completion_length": 245.93305015563965, + "epoch": 0.04124229850370929, + "grad_norm": 0.3452260535215257, + "kl": 0.0003554821014404297, + "learning_rate": 3.037037037037037e-08, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 246 + }, + { + "completion_length": 254.03572750091553, + "epoch": 0.041577601743576845, + "grad_norm": 0.3311219707962642, + "kl": 0.0003204345703125, + "learning_rate": 3.0617283950617284e-08, + "loss": 0.0, + "reward": 1.6928572431206703, + "reward_std": 0.10101525112986565, + "rewards/equation_reward_func": 0.6928571779280901, + "rewards/format_reward_func": 1.0, + "step": 248 + }, + { + "completion_length": 249.513409614563, + "epoch": 0.0419129049834444, + "grad_norm": 0.37071766869235806, + "kl": 0.0003209114074707031, + "learning_rate": 3.086419753086419e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 250 + }, + { + "completion_length": 244.571439743042, + "epoch": 0.042248208223311955, + "grad_norm": 0.2739936128591675, + "kl": 0.0003472566604614258, + "learning_rate": 3.111111111111111e-08, + "loss": 0.0, + "reward": 1.7607143446803093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143130153418, + "rewards/format_reward_func": 1.0, + "step": 252 + }, + { + "completion_length": 232.39733028411865, + "epoch": 0.04258351146317951, + "grad_norm": 0.17136269147118577, + "kl": 0.00031685829162597656, + "learning_rate": 3.135802469135802e-08, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 254 + }, + { + "completion_length": 242.82144165039062, + "epoch": 0.04291881470304707, + "grad_norm": 0.24005201682447744, + "kl": 0.0003197789192199707, + "learning_rate": 3.160493827160494e-08, + "loss": 0.0, + "reward": 1.735714353621006, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 1.0, + "step": 256 + }, + { + "completion_length": 247.8571538925171, + "epoch": 0.04325411794291462, + "grad_norm": 0.25000441328741113, + "kl": 0.0003287792205810547, + "learning_rate": 3.185185185185185e-08, + "loss": 0.0, + "reward": 1.7410714998841286, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 258 + }, + { + "completion_length": 246.6428680419922, + "epoch": 0.04358942118278218, + "grad_norm": 0.18778229429765905, + "kl": 0.0002881288528442383, + "learning_rate": 3.2098765432098767e-08, + "loss": 0.0, + "reward": 1.7500000894069672, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7589285969734192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 260 + }, + { + "completion_length": 252.77679634094238, + "epoch": 0.04392472442264973, + "grad_norm": 0.2507058768188766, + "kl": 0.0003418922424316406, + "learning_rate": 3.234567901234568e-08, + "loss": 0.0, + "reward": 1.6964286491274834, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6964286118745804, + "rewards/format_reward_func": 1.0, + "step": 262 + }, + { + "completion_length": 250.5223331451416, + "epoch": 0.04426002766251729, + "grad_norm": 0.2145908398443362, + "kl": 0.00032597780227661133, + "learning_rate": 3.259259259259259e-08, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339286059141159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 264 + }, + { + "completion_length": 244.2946538925171, + "epoch": 0.04459533090238484, + "grad_norm": 0.1814048605402375, + "kl": 0.0003126859664916992, + "learning_rate": 3.2839506172839504e-08, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 266 + }, + { + "completion_length": 240.95090579986572, + "epoch": 0.0449306341422524, + "grad_norm": 0.573185463890021, + "kl": 0.00038886070251464844, + "learning_rate": 3.308641975308642e-08, + "loss": 0.0, + "reward": 1.7875000461935997, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919642999768257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 268 + }, + { + "completion_length": 251.35269165039062, + "epoch": 0.04526593738211995, + "grad_norm": 0.345458955989029, + "kl": 0.0003148317337036133, + "learning_rate": 3.3333333333333334e-08, + "loss": 0.0, + "reward": 1.7607143595814705, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 270 + }, + { + "completion_length": 240.8303680419922, + "epoch": 0.04560124062198751, + "grad_norm": 0.21387981859355704, + "kl": 0.00033462047576904297, + "learning_rate": 3.358024691358025e-08, + "loss": 0.0, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 272 + }, + { + "completion_length": 243.8482265472412, + "epoch": 0.04593654386185506, + "grad_norm": 0.1372445448617604, + "kl": 0.00030040740966796875, + "learning_rate": 3.382716049382716e-08, + "loss": 0.0, + "reward": 1.8017857745289803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.806250024586916, + "rewards/format_reward_func": 0.9955357164144516, + "step": 274 + }, + { + "completion_length": 242.38393878936768, + "epoch": 0.04627184710172262, + "grad_norm": 0.2538804221997863, + "kl": 0.0003288388252258301, + "learning_rate": 3.407407407407407e-08, + "loss": 0.0, + "reward": 1.7482143491506577, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526785973459482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 276 + }, + { + "completion_length": 243.4553680419922, + "epoch": 0.04660715034159017, + "grad_norm": 0.17161419413289314, + "kl": 0.0003286600112915039, + "learning_rate": 3.4320987654320987e-08, + "loss": 0.0, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 278 + }, + { + "completion_length": 253.1071548461914, + "epoch": 0.04694245358145773, + "grad_norm": 0.1583888212171527, + "kl": 0.0003293752670288086, + "learning_rate": 3.4567901234567895e-08, + "loss": 0.0, + "reward": 1.7857143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143152505159, + "rewards/format_reward_func": 1.0, + "step": 280 + }, + { + "completion_length": 247.33483600616455, + "epoch": 0.04727775682132528, + "grad_norm": 0.2364266681687748, + "kl": 0.0003266334533691406, + "learning_rate": 3.481481481481481e-08, + "loss": 0.0, + "reward": 1.725000075995922, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000461935997, + "rewards/format_reward_func": 1.0, + "step": 282 + }, + { + "completion_length": 247.9196548461914, + "epoch": 0.04761306006119284, + "grad_norm": 0.15755245023632508, + "kl": 0.0003324151039123535, + "learning_rate": 3.5061728395061724e-08, + "loss": 0.0, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 284 + }, + { + "completion_length": 244.0982265472412, + "epoch": 0.0479483633010604, + "grad_norm": 0.248909793697624, + "kl": 0.0002993345260620117, + "learning_rate": 3.530864197530864e-08, + "loss": 0.0, + "reward": 1.7017857879400253, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7062500305473804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 286 + }, + { + "completion_length": 245.4866180419922, + "epoch": 0.04828366654092795, + "grad_norm": 0.2979174520331835, + "kl": 0.0003222227096557617, + "learning_rate": 3.5555555555555554e-08, + "loss": 0.0, + "reward": 1.7160715088248253, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7205357514321804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 288 + }, + { + "completion_length": 245.1696548461914, + "epoch": 0.04861896978079551, + "grad_norm": 0.2121228295030136, + "kl": 0.0003127455711364746, + "learning_rate": 3.580246913580247e-08, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 290 + }, + { + "completion_length": 254.25001621246338, + "epoch": 0.04895427302066306, + "grad_norm": 0.2591013783701026, + "kl": 0.0003070831298828125, + "learning_rate": 3.6049382716049384e-08, + "loss": 0.0, + "reward": 1.7107143625617027, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7107143253087997, + "rewards/format_reward_func": 1.0, + "step": 292 + }, + { + "completion_length": 248.9285831451416, + "epoch": 0.04928957626053062, + "grad_norm": 0.25970339981154605, + "kl": 0.00030744075775146484, + "learning_rate": 3.62962962962963e-08, + "loss": 0.0, + "reward": 1.7142858132719994, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 294 + }, + { + "completion_length": 245.6607255935669, + "epoch": 0.04962487950039817, + "grad_norm": 0.1693241175450565, + "kl": 0.0003269314765930176, + "learning_rate": 3.6543209876543213e-08, + "loss": 0.0, + "reward": 1.7928572073578835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 296 + }, + { + "completion_length": 244.47768878936768, + "epoch": 0.04996018274026573, + "grad_norm": 0.28498148563751646, + "kl": 0.00032448768615722656, + "learning_rate": 3.679012345679012e-08, + "loss": 0.0, + "reward": 1.7232143804430962, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7276786062866449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 298 + }, + { + "completion_length": 249.23662090301514, + "epoch": 0.05029548598013328, + "grad_norm": 0.17194495222367387, + "kl": 0.00031006336212158203, + "learning_rate": 3.7037037037037036e-08, + "loss": 0.0, + "reward": 1.7250000908970833, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.725000036880374, + "rewards/format_reward_func": 1.0, + "step": 300 + }, + { + "completion_length": 241.9509038925171, + "epoch": 0.05063078922000084, + "grad_norm": 0.09369687516121022, + "kl": 0.0003318190574645996, + "learning_rate": 3.7283950617283945e-08, + "loss": 0.0, + "reward": 1.7589286640286446, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 302 + }, + { + "completion_length": 245.50893878936768, + "epoch": 0.05096609245986839, + "grad_norm": 0.20374576311730827, + "kl": 0.0003292560577392578, + "learning_rate": 3.753086419753086e-08, + "loss": 0.0, + "reward": 1.762500062584877, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643182307482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 304 + }, + { + "completion_length": 246.89286708831787, + "epoch": 0.05130139569973595, + "grad_norm": 0.33707103491103274, + "kl": 0.00032824277877807617, + "learning_rate": 3.7777777777777774e-08, + "loss": 0.0, + "reward": 1.7375000640749931, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 306 + }, + { + "completion_length": 251.9910831451416, + "epoch": 0.0516366989396035, + "grad_norm": 0.16043020844799977, + "kl": 0.0003122687339782715, + "learning_rate": 3.802469135802469e-08, + "loss": 0.0, + "reward": 1.7214286476373672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 308 + }, + { + "completion_length": 252.6696548461914, + "epoch": 0.05197200217947106, + "grad_norm": 0.32476087022862743, + "kl": 0.00031131505966186523, + "learning_rate": 3.8271604938271604e-08, + "loss": 0.0, + "reward": 1.7142858058214188, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7232143059372902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 310 + }, + { + "completion_length": 250.33037090301514, + "epoch": 0.05230730541933862, + "grad_norm": 0.20662703392624346, + "kl": 0.00035762786865234375, + "learning_rate": 3.851851851851852e-08, + "loss": 0.0, + "reward": 1.7232143431901932, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.727678619325161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 312 + }, + { + "completion_length": 245.37947368621826, + "epoch": 0.05264260865920617, + "grad_norm": 0.3116052051179476, + "kl": 0.00034052133560180664, + "learning_rate": 3.876543209876543e-08, + "loss": 0.0, + "reward": 1.7267857939004898, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7312500216066837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 314 + }, + { + "completion_length": 254.11608219146729, + "epoch": 0.05297791189907373, + "grad_norm": 0.23588516688376734, + "kl": 0.00032001733779907227, + "learning_rate": 3.901234567901234e-08, + "loss": 0.0, + "reward": 1.7589286491274834, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.772321455180645, + "rewards/format_reward_func": 0.9866071492433548, + "step": 316 + }, + { + "completion_length": 250.17858409881592, + "epoch": 0.05331321513894128, + "grad_norm": 0.21352114622407678, + "kl": 0.0003173947334289551, + "learning_rate": 3.9259259259259256e-08, + "loss": 0.0, + "reward": 1.708928644657135, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928779512644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 318 + }, + { + "completion_length": 241.33036708831787, + "epoch": 0.05364851837880884, + "grad_norm": 0.19064104806150775, + "kl": 0.000352323055267334, + "learning_rate": 3.950617283950617e-08, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 320 + }, + { + "completion_length": 247.5714406967163, + "epoch": 0.05398382161867639, + "grad_norm": 0.09471434497682275, + "kl": 0.00035202503204345703, + "learning_rate": 3.9753086419753086e-08, + "loss": 0.0, + "reward": 1.7357143759727478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357142977416515, + "rewards/format_reward_func": 1.0, + "step": 322 + }, + { + "completion_length": 247.8392972946167, + "epoch": 0.05431912485854395, + "grad_norm": 0.16439367857374707, + "kl": 0.0003255605697631836, + "learning_rate": 4e-08, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339286040514708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 324 + }, + { + "completion_length": 244.4821538925171, + "epoch": 0.0546544280984115, + "grad_norm": 0.28376400955422515, + "kl": 0.0003072023391723633, + "learning_rate": 4.0246913580246916e-08, + "loss": 0.0, + "reward": 1.7357143610715866, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7446429003030062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 326 + }, + { + "completion_length": 241.10715293884277, + "epoch": 0.05498973133827906, + "grad_norm": 0.21232563662149967, + "kl": 0.00036221742630004883, + "learning_rate": 4.049382716049383e-08, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 328 + }, + { + "completion_length": 244.8259048461914, + "epoch": 0.05532503457814661, + "grad_norm": 0.31284424481049833, + "kl": 0.00032901763916015625, + "learning_rate": 4.0740740740740745e-08, + "loss": 0.0, + "reward": 1.7642857655882835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 330 + }, + { + "completion_length": 233.58036708831787, + "epoch": 0.05566033781801417, + "grad_norm": 0.14740794798794307, + "kl": 0.00032466650009155273, + "learning_rate": 4.098765432098765e-08, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000484287739, + "rewards/format_reward_func": 1.0, + "step": 332 + }, + { + "completion_length": 247.05358219146729, + "epoch": 0.05599564105788172, + "grad_norm": 0.21326826487289274, + "kl": 0.0003064870834350586, + "learning_rate": 4.123456790123456e-08, + "loss": 0.0, + "reward": 1.7660714760422707, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 334 + }, + { + "completion_length": 252.5134038925171, + "epoch": 0.05633094429774928, + "grad_norm": 0.2820975150743322, + "kl": 0.0003514289855957031, + "learning_rate": 4.1481481481481476e-08, + "loss": 0.0, + "reward": 1.737500086426735, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 336 + }, + { + "completion_length": 245.87054538726807, + "epoch": 0.05666624753761683, + "grad_norm": 0.28892632014326425, + "kl": 0.00032830238342285156, + "learning_rate": 4.172839506172839e-08, + "loss": 0.0, + "reward": 1.7392857894301414, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.739285733550787, + "rewards/format_reward_func": 1.0, + "step": 338 + }, + { + "completion_length": 257.6428699493408, + "epoch": 0.05700155077748439, + "grad_norm": 0.20801041134465126, + "kl": 0.0003235936164855957, + "learning_rate": 4.1975308641975306e-08, + "loss": 0.0, + "reward": 1.7035714983940125, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.712500024586916, + "rewards/format_reward_func": 0.9910714328289032, + "step": 340 + }, + { + "completion_length": 239.0446538925171, + "epoch": 0.057336854017351946, + "grad_norm": 0.20466302680456666, + "kl": 0.00033277273178100586, + "learning_rate": 4.222222222222222e-08, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7875000350177288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 342 + }, + { + "completion_length": 247.04911994934082, + "epoch": 0.0576721572572195, + "grad_norm": 0.2544168425529169, + "kl": 0.0003313422203063965, + "learning_rate": 4.2469135802469136e-08, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 344 + }, + { + "completion_length": 247.80805110931396, + "epoch": 0.058007460497087056, + "grad_norm": 0.28697432321700045, + "kl": 0.0003412961959838867, + "learning_rate": 4.271604938271605e-08, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 346 + }, + { + "completion_length": 241.80358409881592, + "epoch": 0.05834276373695461, + "grad_norm": 0.2989078442481178, + "kl": 0.0003070235252380371, + "learning_rate": 4.296296296296296e-08, + "loss": 0.0, + "reward": 1.771428644657135, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 1.0, + "step": 348 + }, + { + "completion_length": 241.81697463989258, + "epoch": 0.058678066976822166, + "grad_norm": 0.18283142606513708, + "kl": 0.0003039836883544922, + "learning_rate": 4.3209876543209874e-08, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 350 + }, + { + "completion_length": 245.33929634094238, + "epoch": 0.05901337021668972, + "grad_norm": 0.2445142243174735, + "kl": 0.0003504753112792969, + "learning_rate": 4.345679012345679e-08, + "loss": 0.0, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 352 + }, + { + "completion_length": 246.90179920196533, + "epoch": 0.059348673456557276, + "grad_norm": 0.16569502991379484, + "kl": 0.00032913684844970703, + "learning_rate": 4.37037037037037e-08, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571429088711739, + "rewards/format_reward_func": 1.0, + "step": 354 + }, + { + "completion_length": 240.02679634094238, + "epoch": 0.05968397669642483, + "grad_norm": 0.2585568390691014, + "kl": 0.0003421306610107422, + "learning_rate": 4.395061728395062e-08, + "loss": 0.0, + "reward": 1.758928656578064, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 356 + }, + { + "completion_length": 245.41072463989258, + "epoch": 0.060019279936292386, + "grad_norm": 0.13192074596224, + "kl": 0.00033915042877197266, + "learning_rate": 4.419753086419753e-08, + "loss": 0.0, + "reward": 1.7678571939468384, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 358 + }, + { + "completion_length": 235.59375953674316, + "epoch": 0.06035458317615994, + "grad_norm": 0.1860175287251393, + "kl": 0.00028246641159057617, + "learning_rate": 4.444444444444444e-08, + "loss": 0.0, + "reward": 1.7875000536441803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 360 + }, + { + "completion_length": 250.3169755935669, + "epoch": 0.060689886416027496, + "grad_norm": 0.296173081498714, + "kl": 0.0003261566162109375, + "learning_rate": 4.4691358024691356e-08, + "loss": 0.0, + "reward": 1.7500000521540642, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 362 + }, + { + "completion_length": 245.74108028411865, + "epoch": 0.06102518965589505, + "grad_norm": 0.17763701007983315, + "kl": 0.0003126859664916992, + "learning_rate": 4.493827160493827e-08, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7803571783006191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 364 + }, + { + "completion_length": 249.30358219146729, + "epoch": 0.061360492895762606, + "grad_norm": 0.2121971068500702, + "kl": 0.00033473968505859375, + "learning_rate": 4.518518518518518e-08, + "loss": 0.0, + "reward": 1.751785770058632, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 366 + }, + { + "completion_length": 243.9285831451416, + "epoch": 0.06169579613563016, + "grad_norm": 0.15114489649405735, + "kl": 0.00029456615447998047, + "learning_rate": 4.5432098765432094e-08, + "loss": 0.0, + "reward": 1.7892857640981674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 368 + }, + { + "completion_length": 248.87500953674316, + "epoch": 0.062031099375497716, + "grad_norm": 0.2841277990992117, + "kl": 0.00031626224517822266, + "learning_rate": 4.567901234567901e-08, + "loss": 0.0, + "reward": 1.741071492433548, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7544643115252256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 370 + }, + { + "completion_length": 244.41965293884277, + "epoch": 0.062366402615365274, + "grad_norm": 0.4126531872874137, + "kl": 0.0003432035446166992, + "learning_rate": 4.592592592592592e-08, + "loss": 0.0, + "reward": 1.7303572222590446, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214574158192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 372 + }, + { + "completion_length": 249.9821548461914, + "epoch": 0.06270170585523283, + "grad_norm": 0.25716336513716315, + "kl": 0.0003498196601867676, + "learning_rate": 4.617283950617284e-08, + "loss": 0.0, + "reward": 1.7821429446339607, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 374 + }, + { + "completion_length": 258.8259086608887, + "epoch": 0.06303700909510038, + "grad_norm": 0.209967257582214, + "kl": 0.00030928850173950195, + "learning_rate": 4.641975308641975e-08, + "loss": 0.0, + "reward": 1.696428656578064, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7053571715950966, + "rewards/format_reward_func": 0.9910714328289032, + "step": 376 + }, + { + "completion_length": 243.48661613464355, + "epoch": 0.06337231233496794, + "grad_norm": 0.357820262167908, + "kl": 0.00035440921783447266, + "learning_rate": 4.666666666666667e-08, + "loss": 0.0, + "reward": 1.7553572058677673, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 378 + }, + { + "completion_length": 250.60269165039062, + "epoch": 0.0637076155748355, + "grad_norm": 0.2806225729005327, + "kl": 0.0003170967102050781, + "learning_rate": 4.691358024691358e-08, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143092900515, + "rewards/format_reward_func": 1.0, + "step": 380 + }, + { + "completion_length": 243.09376049041748, + "epoch": 0.06404291881470305, + "grad_norm": 0.18647090907402403, + "kl": 0.0003414154052734375, + "learning_rate": 4.716049382716049e-08, + "loss": 0.0, + "reward": 1.725000061094761, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339285928755999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 382 + }, + { + "completion_length": 246.5089406967163, + "epoch": 0.0643782220545706, + "grad_norm": 0.6344869457901994, + "kl": 0.0003355741500854492, + "learning_rate": 4.7407407407407405e-08, + "loss": 0.0, + "reward": 1.725000075995922, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.733928594738245, + "rewards/format_reward_func": 0.9910714328289032, + "step": 384 + }, + { + "completion_length": 242.91519260406494, + "epoch": 0.06471352529443816, + "grad_norm": 0.3172293127272545, + "kl": 0.0003371238708496094, + "learning_rate": 4.765432098765432e-08, + "loss": 0.0, + "reward": 1.762500062584877, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 386 + }, + { + "completion_length": 238.28126335144043, + "epoch": 0.06504882853430571, + "grad_norm": 0.2713314165205857, + "kl": 0.0003418922424316406, + "learning_rate": 4.790123456790123e-08, + "loss": 0.0, + "reward": 1.7482143566012383, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7526786141097546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 388 + }, + { + "completion_length": 243.88393783569336, + "epoch": 0.06538413177417327, + "grad_norm": 0.15007989523396975, + "kl": 0.0003464221954345703, + "learning_rate": 4.814814814814814e-08, + "loss": 0.0, + "reward": 1.8035714998841286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 390 + }, + { + "completion_length": 247.31697845458984, + "epoch": 0.06571943501404082, + "grad_norm": 0.4454137762015419, + "kl": 0.0003800392150878906, + "learning_rate": 4.839506172839506e-08, + "loss": 0.0, + "reward": 1.7357143759727478, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7446428947150707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 392 + }, + { + "completion_length": 252.01340579986572, + "epoch": 0.06605473825390838, + "grad_norm": 0.5044913641727443, + "kl": 0.00032645463943481445, + "learning_rate": 4.864197530864197e-08, + "loss": 0.0, + "reward": 1.698214367032051, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7026786152273417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 394 + }, + { + "completion_length": 241.75001430511475, + "epoch": 0.06639004149377593, + "grad_norm": 0.19490645814902496, + "kl": 0.0003345012664794922, + "learning_rate": 4.888888888888889e-08, + "loss": 0.0, + "reward": 1.733928643167019, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 396 + }, + { + "completion_length": 248.08036994934082, + "epoch": 0.06672534473364349, + "grad_norm": 0.2114488097393688, + "kl": 0.0003476142883300781, + "learning_rate": 4.91358024691358e-08, + "loss": 0.0, + "reward": 1.775000087916851, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 398 + }, + { + "completion_length": 249.24554920196533, + "epoch": 0.06706064797351105, + "grad_norm": 0.3736813422927126, + "kl": 0.0003236532211303711, + "learning_rate": 4.938271604938271e-08, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 400 + }, + { + "completion_length": 250.5491189956665, + "epoch": 0.0673959512133786, + "grad_norm": 0.09379494996574526, + "kl": 0.0003262758255004883, + "learning_rate": 4.9629629629629626e-08, + "loss": 0.0, + "reward": 1.8142857626080513, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8232143111526966, + "rewards/format_reward_func": 0.9910714328289032, + "step": 402 + }, + { + "completion_length": 242.2366180419922, + "epoch": 0.06773125445324615, + "grad_norm": 0.21504913509015106, + "kl": 0.0003097057342529297, + "learning_rate": 4.987654320987654e-08, + "loss": 0.0, + "reward": 1.744642935693264, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 404 + }, + { + "completion_length": 249.0446548461914, + "epoch": 0.0680665576931137, + "grad_norm": 0.25395455275495804, + "kl": 0.0003694295883178711, + "learning_rate": 5.0123456790123455e-08, + "loss": 0.0, + "reward": 1.7357143610715866, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 0.9821428656578064, + "step": 406 + }, + { + "completion_length": 238.26340579986572, + "epoch": 0.06840186093298127, + "grad_norm": 0.33754900952521866, + "kl": 0.0003122687339782715, + "learning_rate": 5.037037037037037e-08, + "loss": 0.0, + "reward": 1.7535715103149414, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714600235224, + "rewards/format_reward_func": 1.0, + "step": 408 + }, + { + "completion_length": 249.81697845458984, + "epoch": 0.06873716417284882, + "grad_norm": 0.1367116643336248, + "kl": 0.0003184080123901367, + "learning_rate": 5.0617283950617285e-08, + "loss": 0.0, + "reward": 1.728571504354477, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714522004128, + "rewards/format_reward_func": 1.0, + "step": 410 + }, + { + "completion_length": 246.19197463989258, + "epoch": 0.06907246741271637, + "grad_norm": 0.23975839196424972, + "kl": 0.00031936168670654297, + "learning_rate": 5.08641975308642e-08, + "loss": 0.0, + "reward": 1.7946428880095482, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.799107177183032, + "rewards/format_reward_func": 0.9955357164144516, + "step": 412 + }, + { + "completion_length": 245.99108505249023, + "epoch": 0.06940777065258392, + "grad_norm": 0.35602425619406325, + "kl": 0.00033867359161376953, + "learning_rate": 5.1111111111111114e-08, + "loss": 0.0, + "reward": 1.8196429014205933, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071589291096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 414 + }, + { + "completion_length": 250.1562614440918, + "epoch": 0.06974307389245149, + "grad_norm": 0.24837916834782592, + "kl": 0.0003298521041870117, + "learning_rate": 5.1358024691358016e-08, + "loss": 0.0, + "reward": 1.735714353621006, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7446428872644901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 416 + }, + { + "completion_length": 251.49107837677002, + "epoch": 0.07007837713231904, + "grad_norm": 0.35602402754236945, + "kl": 0.0003180503845214844, + "learning_rate": 5.160493827160493e-08, + "loss": 0.0, + "reward": 1.751785784959793, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 418 + }, + { + "completion_length": 238.81251049041748, + "epoch": 0.0704136803721866, + "grad_norm": 0.3111388382821028, + "kl": 0.0003452301025390625, + "learning_rate": 5.1851851851851846e-08, + "loss": 0.0, + "reward": 1.7392857894301414, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857633531094, + "rewards/format_reward_func": 1.0, + "step": 420 + }, + { + "completion_length": 241.9241180419922, + "epoch": 0.07074898361205414, + "grad_norm": 0.22274741286610353, + "kl": 0.0003383755683898926, + "learning_rate": 5.209876543209876e-08, + "loss": 0.0, + "reward": 1.7571429386734962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 422 + }, + { + "completion_length": 244.17411708831787, + "epoch": 0.07108428685192171, + "grad_norm": 0.19095279674450452, + "kl": 0.00030684471130371094, + "learning_rate": 5.2345679012345675e-08, + "loss": 0.0, + "reward": 1.703571505844593, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7125000283122063, + "rewards/format_reward_func": 0.9910714328289032, + "step": 424 + }, + { + "completion_length": 245.1562623977661, + "epoch": 0.07141959009178926, + "grad_norm": 0.21128512978496952, + "kl": 0.0003224611282348633, + "learning_rate": 5.259259259259259e-08, + "loss": 0.0, + "reward": 1.8107143491506577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 426 + }, + { + "completion_length": 232.4821548461914, + "epoch": 0.07175489333165681, + "grad_norm": 0.2414527596562914, + "kl": 0.0003307461738586426, + "learning_rate": 5.2839506172839505e-08, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 428 + }, + { + "completion_length": 253.75447750091553, + "epoch": 0.07209019657152438, + "grad_norm": 0.2552537262189747, + "kl": 0.00033414363861083984, + "learning_rate": 5.308641975308642e-08, + "loss": 0.0, + "reward": 1.707142949104309, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7160714641213417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 430 + }, + { + "completion_length": 251.34822368621826, + "epoch": 0.07242549981139193, + "grad_norm": 0.1560573542361731, + "kl": 0.0003288388252258301, + "learning_rate": 5.3333333333333334e-08, + "loss": 0.0, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 432 + }, + { + "completion_length": 247.5044755935669, + "epoch": 0.07276080305125948, + "grad_norm": 0.30241987654645447, + "kl": 0.0003216266632080078, + "learning_rate": 5.358024691358024e-08, + "loss": 0.0, + "reward": 1.7660715207457542, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 434 + }, + { + "completion_length": 246.5312614440918, + "epoch": 0.07309610629112703, + "grad_norm": 0.33054557128364087, + "kl": 0.0003314018249511719, + "learning_rate": 5.382716049382716e-08, + "loss": 0.0, + "reward": 1.7250000536441803, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7339286021888256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 436 + }, + { + "completion_length": 243.68304920196533, + "epoch": 0.0734314095309946, + "grad_norm": 0.19851593896021988, + "kl": 0.00032967329025268555, + "learning_rate": 5.407407407407407e-08, + "loss": 0.0, + "reward": 1.7107143625617027, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7107143215835094, + "rewards/format_reward_func": 1.0, + "step": 438 + }, + { + "completion_length": 249.64287185668945, + "epoch": 0.07376671277086215, + "grad_norm": 0.1713552603952388, + "kl": 0.0003451108932495117, + "learning_rate": 5.432098765432099e-08, + "loss": 0.0, + "reward": 1.7750000357627869, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7839285992085934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 440 + }, + { + "completion_length": 240.55358409881592, + "epoch": 0.0741020160107297, + "grad_norm": 0.21027297014803956, + "kl": 0.000316619873046875, + "learning_rate": 5.45679012345679e-08, + "loss": 0.0, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 442 + }, + { + "completion_length": 245.67411708831787, + "epoch": 0.07443731925059725, + "grad_norm": 0.33966947103680717, + "kl": 0.00033915042877197266, + "learning_rate": 5.481481481481482e-08, + "loss": 0.0, + "reward": 1.7660714983940125, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 444 + }, + { + "completion_length": 254.05358028411865, + "epoch": 0.07477262249046482, + "grad_norm": 0.4185328389589341, + "kl": 0.00033539533615112305, + "learning_rate": 5.5061728395061725e-08, + "loss": 0.0, + "reward": 1.6982143595814705, + "reward_std": 0.09848987031728029, + "rewards/equation_reward_func": 0.7116071805357933, + "rewards/format_reward_func": 0.9866071492433548, + "step": 446 + }, + { + "completion_length": 235.8437614440918, + "epoch": 0.07510792573033237, + "grad_norm": 0.31327344899543313, + "kl": 0.00031751394271850586, + "learning_rate": 5.530864197530864e-08, + "loss": 0.0, + "reward": 1.785714365541935, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 448 + }, + { + "completion_length": 244.25894260406494, + "epoch": 0.07544322897019992, + "grad_norm": 0.168519024379435, + "kl": 0.00033473968505859375, + "learning_rate": 5.555555555555555e-08, + "loss": 0.0, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 450 + }, + { + "completion_length": 241.50447463989258, + "epoch": 0.07577853221006747, + "grad_norm": 0.2579852036649518, + "kl": 0.0003389120101928711, + "learning_rate": 5.580246913580246e-08, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 452 + }, + { + "completion_length": 246.60715103149414, + "epoch": 0.07611383544993504, + "grad_norm": 0.2946907618719622, + "kl": 0.0003083944320678711, + "learning_rate": 5.604938271604938e-08, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 454 + }, + { + "completion_length": 243.24554634094238, + "epoch": 0.07644913868980259, + "grad_norm": 0.4449621858359876, + "kl": 0.0003170967102050781, + "learning_rate": 5.629629629629629e-08, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.746428607031703, + "rewards/format_reward_func": 1.0, + "step": 456 + }, + { + "completion_length": 244.9017972946167, + "epoch": 0.07678444192967014, + "grad_norm": 0.27342841301104115, + "kl": 0.00032132863998413086, + "learning_rate": 5.654320987654321e-08, + "loss": 0.0, + "reward": 1.778571479022503, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 458 + }, + { + "completion_length": 246.16518783569336, + "epoch": 0.0771197451695377, + "grad_norm": 0.19558656358425155, + "kl": 0.0003364682197570801, + "learning_rate": 5.679012345679012e-08, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7803571783006191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 460 + }, + { + "completion_length": 241.52233123779297, + "epoch": 0.07745504840940526, + "grad_norm": 0.169838691432525, + "kl": 0.00031113624572753906, + "learning_rate": 5.703703703703704e-08, + "loss": 0.0, + "reward": 1.7803572416305542, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 462 + }, + { + "completion_length": 253.54019165039062, + "epoch": 0.07779035164927281, + "grad_norm": 0.19518278420827403, + "kl": 0.00032633543014526367, + "learning_rate": 5.728395061728395e-08, + "loss": 0.0, + "reward": 1.7000001072883606, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7089285887777805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 464 + }, + { + "completion_length": 236.49554634094238, + "epoch": 0.07812565488914036, + "grad_norm": 0.21490568594797874, + "kl": 0.00033402442932128906, + "learning_rate": 5.7530864197530866e-08, + "loss": 0.0, + "reward": 1.8160714954137802, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8205357305705547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 466 + }, + { + "completion_length": 244.2857255935669, + "epoch": 0.07846095812900793, + "grad_norm": 0.11754674459427326, + "kl": 0.00033402442932128906, + "learning_rate": 5.7777777777777775e-08, + "loss": 0.0, + "reward": 1.8017857447266579, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 468 + }, + { + "completion_length": 244.0848331451416, + "epoch": 0.07879626136887548, + "grad_norm": 0.23226873868439502, + "kl": 0.00031507015228271484, + "learning_rate": 5.802469135802469e-08, + "loss": 0.0, + "reward": 1.70000009983778, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7089286036789417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 470 + }, + { + "completion_length": 252.51340770721436, + "epoch": 0.07913156460874303, + "grad_norm": 0.2389300517769942, + "kl": 0.00030809640884399414, + "learning_rate": 5.8271604938271604e-08, + "loss": 0.0, + "reward": 1.7339286506175995, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9776785783469677, + "step": 472 + }, + { + "completion_length": 258.8705463409424, + "epoch": 0.07946686784861058, + "grad_norm": 0.21870083869491905, + "kl": 0.00032639503479003906, + "learning_rate": 5.851851851851851e-08, + "loss": 0.0, + "reward": 1.6946429386734962, + "reward_std": 0.09848987124860287, + "rewards/equation_reward_func": 0.7080357503145933, + "rewards/format_reward_func": 0.9866071492433548, + "step": 474 + }, + { + "completion_length": 240.48661994934082, + "epoch": 0.07980217108847815, + "grad_norm": 0.22507093576887516, + "kl": 0.00033414363861083984, + "learning_rate": 5.876543209876543e-08, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428772062063, + "rewards/format_reward_func": 1.0, + "step": 476 + }, + { + "completion_length": 249.78572750091553, + "epoch": 0.0801374743283457, + "grad_norm": 0.2081506142291193, + "kl": 0.000325620174407959, + "learning_rate": 5.901234567901234e-08, + "loss": 0.0, + "reward": 1.7232143580913544, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 478 + }, + { + "completion_length": 248.21875762939453, + "epoch": 0.08047277756821325, + "grad_norm": 0.29807364642733086, + "kl": 0.0003132820129394531, + "learning_rate": 5.925925925925926e-08, + "loss": 0.0, + "reward": 1.6535715386271477, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6535714566707611, + "rewards/format_reward_func": 1.0, + "step": 480 + }, + { + "completion_length": 243.1919765472412, + "epoch": 0.08080808080808081, + "grad_norm": 0.20914536774706063, + "kl": 0.000324249267578125, + "learning_rate": 5.950617283950617e-08, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 482 + }, + { + "completion_length": 247.24554443359375, + "epoch": 0.08114338404794837, + "grad_norm": 0.235749099937035, + "kl": 0.00035053491592407227, + "learning_rate": 5.975308641975308e-08, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 484 + }, + { + "completion_length": 234.6830472946167, + "epoch": 0.08147868728781592, + "grad_norm": 0.28977720166304766, + "kl": 0.0003306269645690918, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 486 + }, + { + "completion_length": 249.5044755935669, + "epoch": 0.08181399052768347, + "grad_norm": 0.26207066498924353, + "kl": 0.00032585859298706055, + "learning_rate": 6.024691358024691e-08, + "loss": 0.0, + "reward": 1.716071479022503, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.729464327916503, + "rewards/format_reward_func": 0.9866071492433548, + "step": 488 + }, + { + "completion_length": 247.4196548461914, + "epoch": 0.08214929376755103, + "grad_norm": 0.18458472700771195, + "kl": 0.00031828880310058594, + "learning_rate": 6.049382716049382e-08, + "loss": 0.0, + "reward": 1.776785783469677, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 490 + }, + { + "completion_length": 235.79018783569336, + "epoch": 0.08248459700741859, + "grad_norm": 0.2414137740023195, + "kl": 0.00032210350036621094, + "learning_rate": 6.074074074074074e-08, + "loss": 0.0, + "reward": 1.7892857566475868, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7892857510596514, + "rewards/format_reward_func": 1.0, + "step": 492 + }, + { + "completion_length": 244.3482255935669, + "epoch": 0.08281990024728614, + "grad_norm": 0.223221207261264, + "kl": 0.00033855438232421875, + "learning_rate": 6.098765432098765e-08, + "loss": 0.0, + "reward": 1.721428669989109, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 494 + }, + { + "completion_length": 250.5089406967163, + "epoch": 0.08315520348715369, + "grad_norm": 0.16481525246585177, + "kl": 0.0003097057342529297, + "learning_rate": 6.123456790123457e-08, + "loss": 0.0, + "reward": 1.7410714849829674, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7544643208384514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 496 + }, + { + "completion_length": 243.70090198516846, + "epoch": 0.08349050672702125, + "grad_norm": 0.23407503260150778, + "kl": 0.000316619873046875, + "learning_rate": 6.148148148148148e-08, + "loss": 0.0, + "reward": 1.7178572192788124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 498 + }, + { + "completion_length": 251.65625953674316, + "epoch": 0.0838258099668888, + "grad_norm": 0.21363418666430034, + "kl": 0.00032788515090942383, + "learning_rate": 6.172839506172839e-08, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 500 + }, + { + "completion_length": 234.915189743042, + "epoch": 0.08416111320675636, + "grad_norm": 0.2728153197023007, + "kl": 0.00033909082412719727, + "learning_rate": 6.19753086419753e-08, + "loss": 0.0, + "reward": 1.7392858266830444, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 502 + }, + { + "completion_length": 245.33483409881592, + "epoch": 0.08449641644662391, + "grad_norm": 0.26575297012927046, + "kl": 0.00031566619873046875, + "learning_rate": 6.222222222222221e-08, + "loss": 0.0, + "reward": 1.7839286252856255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928809314966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 504 + }, + { + "completion_length": 248.352689743042, + "epoch": 0.08483171968649147, + "grad_norm": 0.1380297498926032, + "kl": 0.0003465414047241211, + "learning_rate": 6.246913580246913e-08, + "loss": 0.0, + "reward": 1.785714365541935, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143077999353, + "rewards/format_reward_func": 1.0, + "step": 506 + }, + { + "completion_length": 241.34375858306885, + "epoch": 0.08516702292635903, + "grad_norm": 0.2441522470456551, + "kl": 0.00032442808151245117, + "learning_rate": 6.271604938271604e-08, + "loss": 0.0, + "reward": 1.785714365541935, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 508 + }, + { + "completion_length": 246.17412090301514, + "epoch": 0.08550232616622658, + "grad_norm": 0.21561174473883762, + "kl": 0.0003211498260498047, + "learning_rate": 6.296296296296296e-08, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7250000294297934, + "rewards/format_reward_func": 1.0, + "step": 510 + }, + { + "completion_length": 248.7991180419922, + "epoch": 0.08583762940609414, + "grad_norm": 0.1975704900557329, + "kl": 0.00030416250228881836, + "learning_rate": 6.320987654320987e-08, + "loss": 0.0, + "reward": 1.79464291036129, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 512 + }, + { + "completion_length": 244.66518878936768, + "epoch": 0.0861729326459617, + "grad_norm": 0.31873917088865067, + "kl": 0.0003369450569152832, + "learning_rate": 6.345679012345679e-08, + "loss": 0.0, + "reward": 1.6946429535746574, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.6991071738302708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 514 + }, + { + "completion_length": 249.6785831451416, + "epoch": 0.08650823588582925, + "grad_norm": 0.1803545586351522, + "kl": 0.00034105777740478516, + "learning_rate": 6.37037037037037e-08, + "loss": 0.0, + "reward": 1.796428620815277, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285872876644, + "rewards/format_reward_func": 1.0, + "step": 516 + }, + { + "completion_length": 238.2366180419922, + "epoch": 0.0868435391256968, + "grad_norm": 0.19465113178922888, + "kl": 0.0003395676612854004, + "learning_rate": 6.39506172839506e-08, + "loss": 0.0, + "reward": 1.803571492433548, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 518 + }, + { + "completion_length": 243.0982255935669, + "epoch": 0.08717884236556436, + "grad_norm": 0.19674061927229855, + "kl": 0.00034350156784057617, + "learning_rate": 6.419753086419753e-08, + "loss": 0.0, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286055415869, + "rewards/format_reward_func": 1.0, + "step": 520 + }, + { + "completion_length": 248.9866180419922, + "epoch": 0.08751414560543191, + "grad_norm": 0.282739041232047, + "kl": 0.00033104419708251953, + "learning_rate": 6.444444444444443e-08, + "loss": 0.0, + "reward": 1.7089286670088768, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7133928947150707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 522 + }, + { + "completion_length": 237.4553689956665, + "epoch": 0.08784944884529947, + "grad_norm": 0.2913300631224223, + "kl": 0.00030612945556640625, + "learning_rate": 6.469135802469136e-08, + "loss": 0.0, + "reward": 1.7995536252856255, + "reward_std": 0.06124049657955766, + "rewards/equation_reward_func": 0.8026785850524902, + "rewards/format_reward_func": 0.9968750029802322, + "step": 524 + }, + { + "completion_length": 248.6517972946167, + "epoch": 0.08818475208516702, + "grad_norm": 0.2988093886621688, + "kl": 0.0003228187561035156, + "learning_rate": 6.493827160493826e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 526 + }, + { + "completion_length": 247.02233505249023, + "epoch": 0.08852005532503458, + "grad_norm": 0.31123073954807806, + "kl": 0.00034630298614501953, + "learning_rate": 6.518518518518518e-08, + "loss": 0.0, + "reward": 1.7321429252624512, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 528 + }, + { + "completion_length": 240.74108123779297, + "epoch": 0.08885535856490213, + "grad_norm": 0.2844834020779769, + "kl": 0.0003533363342285156, + "learning_rate": 6.54320987654321e-08, + "loss": 0.0, + "reward": 1.8178571909666061, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 530 + }, + { + "completion_length": 247.12947845458984, + "epoch": 0.08919066180476969, + "grad_norm": 0.3683831687012005, + "kl": 0.00037026405334472656, + "learning_rate": 6.567901234567901e-08, + "loss": 0.0, + "reward": 1.7214286252856255, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7392857503145933, + "rewards/format_reward_func": 0.9821428656578064, + "step": 532 + }, + { + "completion_length": 239.91965293884277, + "epoch": 0.08952596504463724, + "grad_norm": 0.23647114042655057, + "kl": 0.00031244754791259766, + "learning_rate": 6.592592592592592e-08, + "loss": 0.0, + "reward": 1.737500049173832, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643066823483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 534 + }, + { + "completion_length": 239.70537090301514, + "epoch": 0.0898612682845048, + "grad_norm": 0.2744914568098237, + "kl": 0.00034332275390625, + "learning_rate": 6.617283950617284e-08, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 536 + }, + { + "completion_length": 235.54465579986572, + "epoch": 0.09019657152437235, + "grad_norm": 0.24470360924408466, + "kl": 0.00032085180282592773, + "learning_rate": 6.641975308641975e-08, + "loss": 0.0, + "reward": 1.7714286521077156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 538 + }, + { + "completion_length": 252.58929634094238, + "epoch": 0.0905318747642399, + "grad_norm": 0.19399993162010581, + "kl": 0.0003203749656677246, + "learning_rate": 6.666666666666667e-08, + "loss": 0.0, + "reward": 1.7142857536673546, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 540 + }, + { + "completion_length": 242.52679443359375, + "epoch": 0.09086717800410747, + "grad_norm": 0.1805765854243748, + "kl": 0.0003228187561035156, + "learning_rate": 6.691358024691358e-08, + "loss": 0.0, + "reward": 1.803571492433548, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 542 + }, + { + "completion_length": 250.33930015563965, + "epoch": 0.09120248124397502, + "grad_norm": 0.2510144798661156, + "kl": 0.00032061338424682617, + "learning_rate": 6.71604938271605e-08, + "loss": 0.0, + "reward": 1.753571517765522, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7625000290572643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 544 + }, + { + "completion_length": 248.883939743042, + "epoch": 0.09153778448384257, + "grad_norm": 0.2655297568235366, + "kl": 0.0003451108932495117, + "learning_rate": 6.74074074074074e-08, + "loss": 0.0, + "reward": 1.751785784959793, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 546 + }, + { + "completion_length": 249.2009038925171, + "epoch": 0.09187308772371013, + "grad_norm": 0.26878506330376745, + "kl": 0.0003330707550048828, + "learning_rate": 6.765432098765431e-08, + "loss": 0.0, + "reward": 1.7482143640518188, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 548 + }, + { + "completion_length": 237.3348331451416, + "epoch": 0.09220839096357769, + "grad_norm": 0.2253938894358309, + "kl": 0.00033724308013916016, + "learning_rate": 6.790123456790123e-08, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571429051458836, + "rewards/format_reward_func": 1.0, + "step": 550 + }, + { + "completion_length": 251.540189743042, + "epoch": 0.09254369420344524, + "grad_norm": 0.22765833106871078, + "kl": 0.00031238794326782227, + "learning_rate": 6.814814814814814e-08, + "loss": 0.0, + "reward": 1.762500062584877, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7669643145054579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 552 + }, + { + "completion_length": 243.71876430511475, + "epoch": 0.0928789974433128, + "grad_norm": 0.2972380601698253, + "kl": 0.00034296512603759766, + "learning_rate": 6.839506172839506e-08, + "loss": 0.0, + "reward": 1.7625000700354576, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 554 + }, + { + "completion_length": 241.85269260406494, + "epoch": 0.09321430068318035, + "grad_norm": 0.18903812105876086, + "kl": 0.0003712177276611328, + "learning_rate": 6.864197530864197e-08, + "loss": 0.0, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 556 + }, + { + "completion_length": 240.98661708831787, + "epoch": 0.09354960392304791, + "grad_norm": 0.2808854496545026, + "kl": 0.00033342838287353516, + "learning_rate": 6.888888888888889e-08, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714488476515, + "rewards/format_reward_func": 1.0, + "step": 558 + }, + { + "completion_length": 247.7678680419922, + "epoch": 0.09388490716291546, + "grad_norm": 0.2433063193601212, + "kl": 0.00035250186920166016, + "learning_rate": 6.913580246913579e-08, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 560 + }, + { + "completion_length": 239.64733219146729, + "epoch": 0.09422021040278301, + "grad_norm": 0.2328401577713564, + "kl": 0.00031560659408569336, + "learning_rate": 6.938271604938272e-08, + "loss": 0.0, + "reward": 1.793750062584877, + "reward_std": 0.03914341004565358, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9982142895460129, + "step": 562 + }, + { + "completion_length": 243.7321548461914, + "epoch": 0.09455551364265057, + "grad_norm": 0.27133592508850835, + "kl": 0.00033986568450927734, + "learning_rate": 6.962962962962962e-08, + "loss": 0.0, + "reward": 1.8071429282426834, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428686380386, + "rewards/format_reward_func": 1.0, + "step": 564 + }, + { + "completion_length": 255.19197368621826, + "epoch": 0.09489081688251813, + "grad_norm": 0.2894428896849047, + "kl": 0.0003210902214050293, + "learning_rate": 6.987654320987655e-08, + "loss": 0.0, + "reward": 1.7375000789761543, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.74196432903409, + "rewards/format_reward_func": 0.9955357164144516, + "step": 566 + }, + { + "completion_length": 256.6250114440918, + "epoch": 0.09522612012238568, + "grad_norm": 0.24724991177580125, + "kl": 0.00033986568450927734, + "learning_rate": 7.012345679012345e-08, + "loss": 0.0, + "reward": 1.721428632736206, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 568 + }, + { + "completion_length": 246.13840103149414, + "epoch": 0.09556142336225323, + "grad_norm": 0.27228424645528715, + "kl": 0.0003192424774169922, + "learning_rate": 7.037037037037038e-08, + "loss": 0.0, + "reward": 1.778571479022503, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 570 + }, + { + "completion_length": 240.89286422729492, + "epoch": 0.0958967266021208, + "grad_norm": 0.17697003476147488, + "kl": 0.0003370046615600586, + "learning_rate": 7.061728395061728e-08, + "loss": 0.0, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 572 + }, + { + "completion_length": 245.89286613464355, + "epoch": 0.09623202984198835, + "grad_norm": 0.23976461206515592, + "kl": 0.0003515481948852539, + "learning_rate": 7.08641975308642e-08, + "loss": 0.0, + "reward": 1.7607143744826317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 574 + }, + { + "completion_length": 257.33483505249023, + "epoch": 0.0965673330818559, + "grad_norm": 0.24968865023102368, + "kl": 0.00032526254653930664, + "learning_rate": 7.111111111111111e-08, + "loss": 0.0, + "reward": 1.7696429342031479, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071581840515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 576 + }, + { + "completion_length": 236.85715579986572, + "epoch": 0.09690263632172345, + "grad_norm": 0.19104595177042635, + "kl": 0.00033676624298095703, + "learning_rate": 7.135802469135801e-08, + "loss": 0.0, + "reward": 1.8535714894533157, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8535714372992516, + "rewards/format_reward_func": 1.0, + "step": 578 + }, + { + "completion_length": 250.72322750091553, + "epoch": 0.09723793956159102, + "grad_norm": 0.26928102942838894, + "kl": 0.00033277273178100586, + "learning_rate": 7.160493827160494e-08, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 580 + }, + { + "completion_length": 250.25001049041748, + "epoch": 0.09757324280145857, + "grad_norm": 0.2822786859772726, + "kl": 0.0003406405448913574, + "learning_rate": 7.185185185185184e-08, + "loss": 0.0, + "reward": 1.7553572058677673, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 582 + }, + { + "completion_length": 231.52679824829102, + "epoch": 0.09790854604132612, + "grad_norm": 0.3094974337963821, + "kl": 0.00034546852111816406, + "learning_rate": 7.209876543209877e-08, + "loss": 0.0, + "reward": 1.7392857745289803, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143137603998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 584 + }, + { + "completion_length": 245.5134048461914, + "epoch": 0.09824384928119367, + "grad_norm": 0.29343542630310954, + "kl": 0.0003381967544555664, + "learning_rate": 7.234567901234567e-08, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 586 + }, + { + "completion_length": 246.6384038925171, + "epoch": 0.09857915252106124, + "grad_norm": 0.18806855527665864, + "kl": 0.00032448768615722656, + "learning_rate": 7.25925925925926e-08, + "loss": 0.0, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 588 + }, + { + "completion_length": 251.34822940826416, + "epoch": 0.09891445576092879, + "grad_norm": 0.2674941278667219, + "kl": 0.00032722949981689453, + "learning_rate": 7.28395061728395e-08, + "loss": 0.0, + "reward": 1.667857214808464, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6678571924567223, + "rewards/format_reward_func": 1.0, + "step": 590 + }, + { + "completion_length": 239.42858123779297, + "epoch": 0.09924975900079634, + "grad_norm": 0.17254025141382712, + "kl": 0.0003591179847717285, + "learning_rate": 7.308641975308643e-08, + "loss": 0.0, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 592 + }, + { + "completion_length": 247.8303689956665, + "epoch": 0.0995850622406639, + "grad_norm": 0.3699325611286679, + "kl": 0.0003343820571899414, + "learning_rate": 7.333333333333333e-08, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 594 + }, + { + "completion_length": 256.76340103149414, + "epoch": 0.09992036548053146, + "grad_norm": 0.3237781294586503, + "kl": 0.00031179189682006836, + "learning_rate": 7.358024691358024e-08, + "loss": 0.0, + "reward": 1.7428572252392769, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571823984385, + "rewards/format_reward_func": 1.0, + "step": 596 + }, + { + "completion_length": 247.66965579986572, + "epoch": 0.10025566872039901, + "grad_norm": 0.3258600776488885, + "kl": 0.0003464221954345703, + "learning_rate": 7.382716049382716e-08, + "loss": 0.0, + "reward": 1.6928571984171867, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7017857357859612, + "rewards/format_reward_func": 0.9910714328289032, + "step": 598 + }, + { + "completion_length": 249.6250114440918, + "epoch": 0.10059097196026656, + "grad_norm": 0.261689626399354, + "kl": 0.0003151893615722656, + "learning_rate": 7.407407407407407e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714488476515, + "rewards/format_reward_func": 1.0, + "step": 600 + }, + { + "completion_length": 242.17858123779297, + "epoch": 0.10092627520013413, + "grad_norm": 0.24581423709483807, + "kl": 0.00032001733779907227, + "learning_rate": 7.432098765432099e-08, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 602 + }, + { + "completion_length": 248.79912090301514, + "epoch": 0.10126157844000168, + "grad_norm": 0.3911909271722524, + "kl": 0.0003205537796020508, + "learning_rate": 7.456790123456789e-08, + "loss": 0.0, + "reward": 1.7214286774396896, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7303571663796902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 604 + }, + { + "completion_length": 240.64286994934082, + "epoch": 0.10159688167986923, + "grad_norm": 0.1934736455480886, + "kl": 0.0003267526626586914, + "learning_rate": 7.481481481481482e-08, + "loss": 0.0, + "reward": 1.7910714820027351, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357544124126, + "rewards/format_reward_func": 0.9955357164144516, + "step": 606 + }, + { + "completion_length": 244.6160831451416, + "epoch": 0.10193218491973678, + "grad_norm": 0.22264172706402283, + "kl": 0.00035625696182250977, + "learning_rate": 7.506172839506172e-08, + "loss": 0.0, + "reward": 1.796428620815277, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 608 + }, + { + "completion_length": 238.7232255935669, + "epoch": 0.10226748815960435, + "grad_norm": 0.1806141118501612, + "kl": 0.00031882524490356445, + "learning_rate": 7.530864197530865e-08, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7875000238418579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 610 + }, + { + "completion_length": 239.7678689956665, + "epoch": 0.1026027913994719, + "grad_norm": 0.17213793905290295, + "kl": 0.00036013126373291016, + "learning_rate": 7.555555555555555e-08, + "loss": 0.0, + "reward": 1.6964286267757416, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6964286155998707, + "rewards/format_reward_func": 1.0, + "step": 612 + }, + { + "completion_length": 257.57590770721436, + "epoch": 0.10293809463933945, + "grad_norm": 0.3303583332562279, + "kl": 0.0003082752227783203, + "learning_rate": 7.580246913580246e-08, + "loss": 0.0, + "reward": 1.723214365541935, + "reward_std": 0.09848987031728029, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 614 + }, + { + "completion_length": 238.5312614440918, + "epoch": 0.103273397879207, + "grad_norm": 0.30225920527212524, + "kl": 0.000339508056640625, + "learning_rate": 7.604938271604938e-08, + "loss": 0.0, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428716182709, + "rewards/format_reward_func": 1.0, + "step": 616 + }, + { + "completion_length": 237.79911994934082, + "epoch": 0.10360870111907457, + "grad_norm": 0.1956188765585153, + "kl": 0.00033271312713623047, + "learning_rate": 7.629629629629629e-08, + "loss": 0.0, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 618 + }, + { + "completion_length": 243.6651906967163, + "epoch": 0.10394400435894212, + "grad_norm": 0.21769077201626216, + "kl": 0.0003572702407836914, + "learning_rate": 7.654320987654321e-08, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 620 + }, + { + "completion_length": 252.90626335144043, + "epoch": 0.10427930759880967, + "grad_norm": 0.2545100315186332, + "kl": 0.00031685829162597656, + "learning_rate": 7.679012345679012e-08, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7625000402331352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 622 + }, + { + "completion_length": 247.33929634094238, + "epoch": 0.10461461083867724, + "grad_norm": 0.3575182728529308, + "kl": 0.0003485679626464844, + "learning_rate": 7.703703703703704e-08, + "loss": 0.0, + "reward": 1.7625000774860382, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 624 + }, + { + "completion_length": 256.81251525878906, + "epoch": 0.10494991407854479, + "grad_norm": 0.2494622615449293, + "kl": 0.00032192468643188477, + "learning_rate": 7.728395061728395e-08, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 626 + }, + { + "completion_length": 245.0357265472412, + "epoch": 0.10528521731841234, + "grad_norm": 0.22256146612232233, + "kl": 0.00034034252166748047, + "learning_rate": 7.753086419753085e-08, + "loss": 0.0, + "reward": 1.6964286863803864, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6964286006987095, + "rewards/format_reward_func": 1.0, + "step": 628 + }, + { + "completion_length": 245.7634048461914, + "epoch": 0.10562052055827989, + "grad_norm": 0.2509199441030853, + "kl": 0.00033855438232421875, + "learning_rate": 7.777777777777778e-08, + "loss": 0.0, + "reward": 1.7732143476605415, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 630 + }, + { + "completion_length": 252.70090675354004, + "epoch": 0.10595582379814746, + "grad_norm": 0.2160039639235301, + "kl": 0.0003249645233154297, + "learning_rate": 7.802469135802468e-08, + "loss": 0.0, + "reward": 1.7446429207921028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 632 + }, + { + "completion_length": 244.58929538726807, + "epoch": 0.10629112703801501, + "grad_norm": 0.2032292563867832, + "kl": 0.00032579898834228516, + "learning_rate": 7.82716049382716e-08, + "loss": 0.0, + "reward": 1.7160714864730835, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7205357402563095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 634 + }, + { + "completion_length": 248.6026906967163, + "epoch": 0.10662643027788256, + "grad_norm": 0.27863712660799594, + "kl": 0.00033843517303466797, + "learning_rate": 7.851851851851851e-08, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.717857176437974, + "rewards/format_reward_func": 1.0, + "step": 636 + }, + { + "completion_length": 245.8750114440918, + "epoch": 0.10696173351775011, + "grad_norm": 0.26748447053290475, + "kl": 0.0003311634063720703, + "learning_rate": 7.876543209876543e-08, + "loss": 0.0, + "reward": 1.8107143342494965, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 638 + }, + { + "completion_length": 237.75000953674316, + "epoch": 0.10729703675761768, + "grad_norm": 0.378121476129134, + "kl": 0.00033354759216308594, + "learning_rate": 7.901234567901234e-08, + "loss": 0.0, + "reward": 1.7196429446339607, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 640 + }, + { + "completion_length": 254.9241189956665, + "epoch": 0.10763233999748523, + "grad_norm": 0.31986577676480604, + "kl": 0.0003407001495361328, + "learning_rate": 7.925925925925926e-08, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000469386578, + "rewards/format_reward_func": 1.0, + "step": 642 + }, + { + "completion_length": 251.55804824829102, + "epoch": 0.10796764323735278, + "grad_norm": 0.17222404932069518, + "kl": 0.0003371238708496094, + "learning_rate": 7.950617283950617e-08, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000171363354, + "rewards/format_reward_func": 1.0, + "step": 644 + }, + { + "completion_length": 238.65179920196533, + "epoch": 0.10830294647722033, + "grad_norm": 0.30833933519304063, + "kl": 0.00034743547439575195, + "learning_rate": 7.975308641975307e-08, + "loss": 0.0, + "reward": 1.7625000774860382, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 646 + }, + { + "completion_length": 255.2500123977661, + "epoch": 0.1086382497170879, + "grad_norm": 0.2769674980707656, + "kl": 0.00034427642822265625, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": 1.7053572237491608, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7098214589059353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 648 + }, + { + "completion_length": 240.78572750091553, + "epoch": 0.10897355295695545, + "grad_norm": 0.21938149745780547, + "kl": 0.00034177303314208984, + "learning_rate": 8.02469135802469e-08, + "loss": 0.0, + "reward": 1.7892857939004898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 650 + }, + { + "completion_length": 247.77233219146729, + "epoch": 0.109308856196823, + "grad_norm": 0.3524412693319839, + "kl": 0.00033420324325561523, + "learning_rate": 8.049382716049383e-08, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 652 + }, + { + "completion_length": 239.45536994934082, + "epoch": 0.10964415943669056, + "grad_norm": 0.21691827702998728, + "kl": 0.00033783912658691406, + "learning_rate": 8.074074074074073e-08, + "loss": 0.0, + "reward": 1.762500062584877, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669642977416515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 654 + }, + { + "completion_length": 237.54018878936768, + "epoch": 0.10997946267655812, + "grad_norm": 0.36721379041357677, + "kl": 0.00033032894134521484, + "learning_rate": 8.098765432098766e-08, + "loss": 0.0, + "reward": 1.7375000566244125, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 656 + }, + { + "completion_length": 247.8303689956665, + "epoch": 0.11031476591642567, + "grad_norm": 0.22179464679431105, + "kl": 0.0003440380096435547, + "learning_rate": 8.123456790123456e-08, + "loss": 0.0, + "reward": 1.7375000715255737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 658 + }, + { + "completion_length": 245.22322463989258, + "epoch": 0.11065006915629322, + "grad_norm": 0.28631903396643277, + "kl": 0.0003451108932495117, + "learning_rate": 8.148148148148149e-08, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 660 + }, + { + "completion_length": 239.6026906967163, + "epoch": 0.11098537239616078, + "grad_norm": 0.23815337312005594, + "kl": 0.0003368854522705078, + "learning_rate": 8.172839506172839e-08, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 662 + }, + { + "completion_length": 240.05804538726807, + "epoch": 0.11132067563602834, + "grad_norm": 0.00011184530104431835, + "kl": 0.00032639503479003906, + "learning_rate": 8.19753086419753e-08, + "loss": 0.0, + "reward": 1.7285715118050575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 664 + }, + { + "completion_length": 238.4196548461914, + "epoch": 0.11165597887589589, + "grad_norm": 0.16343083171368733, + "kl": 0.0003247857093811035, + "learning_rate": 8.222222222222222e-08, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 666 + }, + { + "completion_length": 234.1919755935669, + "epoch": 0.11199128211576344, + "grad_norm": 0.2718751293232413, + "kl": 0.0003414750099182129, + "learning_rate": 8.246913580246912e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7625000402331352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 668 + }, + { + "completion_length": 247.2634048461914, + "epoch": 0.112326585355631, + "grad_norm": 0.44564106362192857, + "kl": 0.0003719329833984375, + "learning_rate": 8.271604938271605e-08, + "loss": 0.0, + "reward": 1.7732143625617027, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 670 + }, + { + "completion_length": 233.86608123779297, + "epoch": 0.11266188859549855, + "grad_norm": 0.24836091057957915, + "kl": 0.00035303831100463867, + "learning_rate": 8.296296296296295e-08, + "loss": 0.0, + "reward": 1.7678571939468384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 672 + }, + { + "completion_length": 250.02679634094238, + "epoch": 0.1129971918353661, + "grad_norm": 0.33629052891667766, + "kl": 0.0003447532653808594, + "learning_rate": 8.320987654320988e-08, + "loss": 0.0, + "reward": 1.7428571805357933, + "reward_std": 0.10101525206118822, + "rewards/equation_reward_func": 0.7517857626080513, + "rewards/format_reward_func": 0.9910714328289032, + "step": 674 + }, + { + "completion_length": 244.0669755935669, + "epoch": 0.11333249507523366, + "grad_norm": 0.22850472179793171, + "kl": 0.00035583972930908203, + "learning_rate": 8.345679012345678e-08, + "loss": 0.0, + "reward": 1.7214286476373672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 676 + }, + { + "completion_length": 242.03572750091553, + "epoch": 0.11366779831510122, + "grad_norm": 0.23854294689657238, + "kl": 0.00033485889434814453, + "learning_rate": 8.37037037037037e-08, + "loss": 0.0, + "reward": 1.776785783469677, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.781250037252903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 678 + }, + { + "completion_length": 245.96429443359375, + "epoch": 0.11400310155496877, + "grad_norm": 0.16755888824586532, + "kl": 0.0003756284713745117, + "learning_rate": 8.395061728395061e-08, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428883820772, + "rewards/format_reward_func": 1.0, + "step": 680 + }, + { + "completion_length": 233.83929347991943, + "epoch": 0.11433840479483633, + "grad_norm": 0.24174291075874793, + "kl": 0.0003483295440673828, + "learning_rate": 8.419753086419753e-08, + "loss": 0.0, + "reward": 1.7196429297327995, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071742027998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 682 + }, + { + "completion_length": 248.6160831451416, + "epoch": 0.11467370803470389, + "grad_norm": 0.22786130194654666, + "kl": 0.00032138824462890625, + "learning_rate": 8.444444444444444e-08, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 684 + }, + { + "completion_length": 237.14733123779297, + "epoch": 0.11500901127457144, + "grad_norm": 0.20303655961874897, + "kl": 0.00035130977630615234, + "learning_rate": 8.469135802469136e-08, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 686 + }, + { + "completion_length": 243.2767972946167, + "epoch": 0.115344314514439, + "grad_norm": 0.19912653343272865, + "kl": 0.0003395676612854004, + "learning_rate": 8.493827160493827e-08, + "loss": 0.0, + "reward": 1.7232143729925156, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7366071753203869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 688 + }, + { + "completion_length": 249.47322463989258, + "epoch": 0.11567961775430655, + "grad_norm": 0.2278184337257273, + "kl": 0.0003470182418823242, + "learning_rate": 8.518518518518517e-08, + "loss": 0.0, + "reward": 1.7312500551342964, + "reward_std": 0.06692260666750371, + "rewards/equation_reward_func": 0.7473214492201805, + "rewards/format_reward_func": 0.9839285835623741, + "step": 690 + }, + { + "completion_length": 238.8169755935669, + "epoch": 0.11601492099417411, + "grad_norm": 0.17911647827573768, + "kl": 0.0003763437271118164, + "learning_rate": 8.54320987654321e-08, + "loss": 0.0, + "reward": 1.7714286223053932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285850524902, + "rewards/format_reward_func": 1.0, + "step": 692 + }, + { + "completion_length": 255.2366180419922, + "epoch": 0.11635022423404166, + "grad_norm": 0.2756423672346394, + "kl": 0.0003293752670288086, + "learning_rate": 8.5679012345679e-08, + "loss": 0.0, + "reward": 1.7410714998841286, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7544643245637417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 694 + }, + { + "completion_length": 249.30804634094238, + "epoch": 0.11668552747390921, + "grad_norm": 0.2111679882859518, + "kl": 0.00035190582275390625, + "learning_rate": 8.592592592592592e-08, + "loss": 0.0, + "reward": 1.7196429073810577, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7330357506871223, + "rewards/format_reward_func": 0.9866071492433548, + "step": 696 + }, + { + "completion_length": 243.665189743042, + "epoch": 0.11702083071377677, + "grad_norm": 0.155534871523258, + "kl": 0.00037992000579833984, + "learning_rate": 8.617283950617283e-08, + "loss": 0.0, + "reward": 1.7196429446339607, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071686148643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 698 + }, + { + "completion_length": 251.8348331451416, + "epoch": 0.11735613395364433, + "grad_norm": 0.26079539156736914, + "kl": 0.0003502368927001953, + "learning_rate": 8.641975308641975e-08, + "loss": 0.0, + "reward": 1.712500087916851, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7258928939700127, + "rewards/format_reward_func": 0.9866071492433548, + "step": 700 + }, + { + "completion_length": 242.1071538925171, + "epoch": 0.11769143719351188, + "grad_norm": 0.2277414630854397, + "kl": 0.0003610849380493164, + "learning_rate": 8.666666666666666e-08, + "loss": 0.0, + "reward": 1.8321429193019867, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.832142885774374, + "rewards/format_reward_func": 1.0, + "step": 702 + }, + { + "completion_length": 253.36608409881592, + "epoch": 0.11802674043337943, + "grad_norm": 0.30555546254853067, + "kl": 0.0003439188003540039, + "learning_rate": 8.691358024691358e-08, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.746428607031703, + "rewards/format_reward_func": 1.0, + "step": 704 + }, + { + "completion_length": 239.258939743042, + "epoch": 0.11836204367324699, + "grad_norm": 0.2848334485064057, + "kl": 0.0003402233123779297, + "learning_rate": 8.716049382716049e-08, + "loss": 0.0, + "reward": 1.771428644657135, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 706 + }, + { + "completion_length": 239.32590293884277, + "epoch": 0.11869734691311455, + "grad_norm": 0.21435664986931335, + "kl": 0.00037920475006103516, + "learning_rate": 8.74074074074074e-08, + "loss": 0.0, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071879863739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 708 + }, + { + "completion_length": 243.0401906967163, + "epoch": 0.1190326501529821, + "grad_norm": 0.20084110899875415, + "kl": 0.0003509521484375, + "learning_rate": 8.765432098765432e-08, + "loss": 0.0, + "reward": 1.7660714909434319, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 710 + }, + { + "completion_length": 251.1339406967163, + "epoch": 0.11936795339284965, + "grad_norm": 0.6091294566896917, + "kl": 0.00038111209869384766, + "learning_rate": 8.790123456790124e-08, + "loss": 0.0, + "reward": 1.7428571954369545, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7517857626080513, + "rewards/format_reward_func": 0.9910714328289032, + "step": 712 + }, + { + "completion_length": 246.12947463989258, + "epoch": 0.11970325663271722, + "grad_norm": 0.24858780124660707, + "kl": 0.0003471970558166504, + "learning_rate": 8.814814814814814e-08, + "loss": 0.0, + "reward": 1.7714286595582962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 714 + }, + { + "completion_length": 260.6696557998657, + "epoch": 0.12003855987258477, + "grad_norm": 0.3235796854238328, + "kl": 0.00035631656646728516, + "learning_rate": 8.839506172839507e-08, + "loss": 0.0, + "reward": 1.680357240140438, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7026786040514708, + "rewards/format_reward_func": 0.977678582072258, + "step": 716 + }, + { + "completion_length": 244.77233219146729, + "epoch": 0.12037386311245232, + "grad_norm": 0.18663753240167508, + "kl": 0.0003312826156616211, + "learning_rate": 8.864197530864197e-08, + "loss": 0.0, + "reward": 1.8017857745289803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 718 + }, + { + "completion_length": 244.47768688201904, + "epoch": 0.12070916635231987, + "grad_norm": 0.2831141797933221, + "kl": 0.000345766544342041, + "learning_rate": 8.888888888888888e-08, + "loss": 0.0, + "reward": 1.7343750670552254, + "reward_std": 0.06250318652018905, + "rewards/equation_reward_func": 0.7357143349945545, + "rewards/format_reward_func": 0.9986607171595097, + "step": 720 + }, + { + "completion_length": 233.08483028411865, + "epoch": 0.12104446959218744, + "grad_norm": 0.2972022109726578, + "kl": 0.0003477931022644043, + "learning_rate": 8.91358024691358e-08, + "loss": 0.0, + "reward": 1.7500000596046448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 722 + }, + { + "completion_length": 238.62054634094238, + "epoch": 0.12137977283205499, + "grad_norm": 0.39101243176433587, + "kl": 0.00035965442657470703, + "learning_rate": 8.938271604938271e-08, + "loss": 0.0, + "reward": 1.7321429327130318, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 724 + }, + { + "completion_length": 246.7053680419922, + "epoch": 0.12171507607192254, + "grad_norm": 0.19475704922237347, + "kl": 0.0003685951232910156, + "learning_rate": 8.962962962962963e-08, + "loss": 0.0, + "reward": 1.7625000402331352, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 726 + }, + { + "completion_length": 235.30357837677002, + "epoch": 0.1220503793117901, + "grad_norm": 0.22615944528910542, + "kl": 0.0003758668899536133, + "learning_rate": 8.987654320987654e-08, + "loss": 0.0, + "reward": 1.735714353621006, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143312692642, + "rewards/format_reward_func": 1.0, + "step": 728 + }, + { + "completion_length": 243.78572750091553, + "epoch": 0.12238568255165766, + "grad_norm": 0.20370458125847893, + "kl": 0.00036966800689697266, + "learning_rate": 9.012345679012346e-08, + "loss": 0.0, + "reward": 1.791071504354477, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 730 + }, + { + "completion_length": 241.9732255935669, + "epoch": 0.12272098579152521, + "grad_norm": 0.2427979605331861, + "kl": 0.0003876686096191406, + "learning_rate": 9.037037037037036e-08, + "loss": 0.0, + "reward": 1.8285714760422707, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8285714536905289, + "rewards/format_reward_func": 1.0, + "step": 732 + }, + { + "completion_length": 242.00447273254395, + "epoch": 0.12305628903139276, + "grad_norm": 0.2729256204257734, + "kl": 0.00039398670196533203, + "learning_rate": 9.061728395061729e-08, + "loss": 0.0, + "reward": 1.8035714849829674, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 734 + }, + { + "completion_length": 238.852689743042, + "epoch": 0.12339159227126031, + "grad_norm": 0.12436679177862824, + "kl": 0.0003916025161743164, + "learning_rate": 9.086419753086419e-08, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 736 + }, + { + "completion_length": 238.6830472946167, + "epoch": 0.12372689551112788, + "grad_norm": 0.22044509282884264, + "kl": 0.00040662288665771484, + "learning_rate": 9.111111111111112e-08, + "loss": 0.0, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 738 + }, + { + "completion_length": 234.1473331451416, + "epoch": 0.12406219875099543, + "grad_norm": 0.10637798693708506, + "kl": 0.00035452842712402344, + "learning_rate": 9.135802469135802e-08, + "loss": 0.0, + "reward": 1.764285758137703, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 740 + }, + { + "completion_length": 247.75001049041748, + "epoch": 0.12439750199086298, + "grad_norm": 0.20902896319414058, + "kl": 0.00039124488830566406, + "learning_rate": 9.160493827160494e-08, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 742 + }, + { + "completion_length": 247.42858028411865, + "epoch": 0.12473280523073055, + "grad_norm": 0.17218299822927055, + "kl": 0.00037997961044311523, + "learning_rate": 9.185185185185185e-08, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 744 + }, + { + "completion_length": 242.24554634094238, + "epoch": 0.1250681084705981, + "grad_norm": 0.3318051559590746, + "kl": 0.0003941059112548828, + "learning_rate": 9.209876543209875e-08, + "loss": 0.0, + "reward": 1.7178572043776512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 746 + }, + { + "completion_length": 235.47322463989258, + "epoch": 0.12540341171046565, + "grad_norm": 0.31337923764354797, + "kl": 0.0003949403762817383, + "learning_rate": 9.234567901234568e-08, + "loss": 0.0, + "reward": 1.7839286401867867, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 748 + }, + { + "completion_length": 243.5000114440918, + "epoch": 0.1257387149503332, + "grad_norm": 0.17612462070677215, + "kl": 0.0003739595413208008, + "learning_rate": 9.259259259259258e-08, + "loss": 0.0, + "reward": 1.7696429044008255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 750 + }, + { + "completion_length": 251.64733028411865, + "epoch": 0.12607401819020075, + "grad_norm": 0.23943906846513874, + "kl": 0.00038892030715942383, + "learning_rate": 9.28395061728395e-08, + "loss": 0.0, + "reward": 1.74642863124609, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 752 + }, + { + "completion_length": 237.15625858306885, + "epoch": 0.1264093214300683, + "grad_norm": 0.2950467187688119, + "kl": 0.00037169456481933594, + "learning_rate": 9.308641975308641e-08, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 754 + }, + { + "completion_length": 251.42858409881592, + "epoch": 0.12674462466993588, + "grad_norm": 0.16807803968532695, + "kl": 0.00037419795989990234, + "learning_rate": 9.333333333333334e-08, + "loss": 0.0, + "reward": 1.7428572252392769, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7517857477068901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 756 + }, + { + "completion_length": 243.0178680419922, + "epoch": 0.12707992790980344, + "grad_norm": 0.30999128943235993, + "kl": 0.0003815889358520508, + "learning_rate": 9.358024691358024e-08, + "loss": 0.0, + "reward": 1.8035715073347092, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 758 + }, + { + "completion_length": 243.02233409881592, + "epoch": 0.127415231149671, + "grad_norm": 0.34858323722828805, + "kl": 0.00039207935333251953, + "learning_rate": 9.382716049382716e-08, + "loss": 0.0, + "reward": 1.7142857983708382, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857387661934, + "rewards/format_reward_func": 1.0, + "step": 760 + }, + { + "completion_length": 241.68751049041748, + "epoch": 0.12775053438953854, + "grad_norm": 0.13222748191498046, + "kl": 0.00041425228118896484, + "learning_rate": 9.407407407407407e-08, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571864962578, + "rewards/format_reward_func": 1.0, + "step": 762 + }, + { + "completion_length": 244.65626049041748, + "epoch": 0.1280858376294061, + "grad_norm": 0.30478521229008854, + "kl": 0.0004159212112426758, + "learning_rate": 9.432098765432098e-08, + "loss": 0.0, + "reward": 1.7089286670088768, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7133928947150707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 764 + }, + { + "completion_length": 237.8125114440918, + "epoch": 0.12842114086927364, + "grad_norm": 0.21084985263179185, + "kl": 0.00038814544677734375, + "learning_rate": 9.45679012345679e-08, + "loss": 0.0, + "reward": 1.7928572073578835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.792857164517045, + "rewards/format_reward_func": 1.0, + "step": 766 + }, + { + "completion_length": 234.95982933044434, + "epoch": 0.1287564441091412, + "grad_norm": 0.2433138284776026, + "kl": 0.00036156177520751953, + "learning_rate": 9.481481481481481e-08, + "loss": 0.0, + "reward": 1.7232143804430962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9964285716414452, + "step": 768 + }, + { + "completion_length": 242.1875114440918, + "epoch": 0.12909174734900877, + "grad_norm": 0.15249408191828723, + "kl": 0.00039446353912353516, + "learning_rate": 9.506172839506173e-08, + "loss": 0.0, + "reward": 1.7392857745289803, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857633531094, + "rewards/format_reward_func": 1.0, + "step": 770 + }, + { + "completion_length": 241.51786613464355, + "epoch": 0.12942705058887632, + "grad_norm": 0.19767611997197468, + "kl": 0.0004150867462158203, + "learning_rate": 9.530864197530864e-08, + "loss": 0.0, + "reward": 1.7767857611179352, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 772 + }, + { + "completion_length": 247.3259048461914, + "epoch": 0.12976235382874388, + "grad_norm": 0.2504453396389677, + "kl": 0.00043642520904541016, + "learning_rate": 9.555555555555556e-08, + "loss": 0.0, + "reward": 1.7517857998609543, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 774 + }, + { + "completion_length": 248.77679634094238, + "epoch": 0.13009765706861143, + "grad_norm": 0.1976896046009228, + "kl": 0.0004209280014038086, + "learning_rate": 9.580246913580246e-08, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 776 + }, + { + "completion_length": 238.41965579986572, + "epoch": 0.13043296030847898, + "grad_norm": 0.3026664034458984, + "kl": 0.0004271268844604492, + "learning_rate": 9.604938271604938e-08, + "loss": 0.0, + "reward": 1.73392865806818, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928820490837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 778 + }, + { + "completion_length": 242.62501049041748, + "epoch": 0.13076826354834653, + "grad_norm": 0.21444822804367636, + "kl": 0.00046432018280029297, + "learning_rate": 9.629629629629629e-08, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 780 + }, + { + "completion_length": 247.27679538726807, + "epoch": 0.13110356678821408, + "grad_norm": 0.257440107889928, + "kl": 0.0003908872604370117, + "learning_rate": 9.65432098765432e-08, + "loss": 0.0, + "reward": 1.783928632736206, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 782 + }, + { + "completion_length": 242.34822463989258, + "epoch": 0.13143887002808163, + "grad_norm": 0.2855112633919714, + "kl": 0.0004401206970214844, + "learning_rate": 9.679012345679012e-08, + "loss": 0.0, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 784 + }, + { + "completion_length": 252.64286613464355, + "epoch": 0.1317741732679492, + "grad_norm": 0.13781706493838075, + "kl": 0.0004125833511352539, + "learning_rate": 9.703703703703703e-08, + "loss": 0.0, + "reward": 1.687500074505806, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6919643171131611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 786 + }, + { + "completion_length": 241.74554538726807, + "epoch": 0.13210947650781676, + "grad_norm": 0.12654413814050075, + "kl": 0.00040209293365478516, + "learning_rate": 9.728395061728395e-08, + "loss": 0.0, + "reward": 1.7732143476605415, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 788 + }, + { + "completion_length": 243.47322750091553, + "epoch": 0.13244477974768432, + "grad_norm": 0.2837353459979307, + "kl": 0.0004172325134277344, + "learning_rate": 9.753086419753086e-08, + "loss": 0.0, + "reward": 1.728571504354477, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714503377676, + "rewards/format_reward_func": 1.0, + "step": 790 + }, + { + "completion_length": 248.37054634094238, + "epoch": 0.13278008298755187, + "grad_norm": 0.22323115630506868, + "kl": 0.00045418739318847656, + "learning_rate": 9.777777777777778e-08, + "loss": 0.0, + "reward": 1.7357143834233284, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 792 + }, + { + "completion_length": 249.6071548461914, + "epoch": 0.13311538622741942, + "grad_norm": 0.24346910568279878, + "kl": 0.000400543212890625, + "learning_rate": 9.802469135802469e-08, + "loss": 0.0, + "reward": 1.7803572118282318, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 794 + }, + { + "completion_length": 255.96875953674316, + "epoch": 0.13345068946728697, + "grad_norm": 0.3170654293138634, + "kl": 0.00042128562927246094, + "learning_rate": 9.82716049382716e-08, + "loss": 0.0, + "reward": 1.7089286372065544, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7133928816765547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 796 + }, + { + "completion_length": 242.2053680419922, + "epoch": 0.13378599270715452, + "grad_norm": 0.2949914528878829, + "kl": 0.0004379749298095703, + "learning_rate": 9.851851851851852e-08, + "loss": 0.0, + "reward": 1.7357143685221672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 798 + }, + { + "completion_length": 250.4375123977661, + "epoch": 0.1341212959470221, + "grad_norm": 0.21137036767136858, + "kl": 0.00042176246643066406, + "learning_rate": 9.876543209876542e-08, + "loss": 0.0, + "reward": 1.7678572162985802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571734577417, + "rewards/format_reward_func": 1.0, + "step": 800 + }, + { + "completion_length": 238.6919755935669, + "epoch": 0.13445659918688965, + "grad_norm": 0.2814166147627615, + "kl": 0.00045418739318847656, + "learning_rate": 9.901234567901235e-08, + "loss": 0.0, + "reward": 1.7857143580913544, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 802 + }, + { + "completion_length": 242.8705472946167, + "epoch": 0.1347919024267572, + "grad_norm": 0.30188794758016163, + "kl": 0.00044226646423339844, + "learning_rate": 9.925925925925925e-08, + "loss": 0.0, + "reward": 1.7366072237491608, + "reward_std": 0.05934646027162671, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9982142895460129, + "step": 804 + }, + { + "completion_length": 257.3794775009155, + "epoch": 0.13512720566662476, + "grad_norm": 0.25469855228198224, + "kl": 0.0004140138626098633, + "learning_rate": 9.950617283950617e-08, + "loss": 0.0, + "reward": 1.7446429505944252, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 806 + }, + { + "completion_length": 250.01787090301514, + "epoch": 0.1354625089064923, + "grad_norm": 0.27453581331030075, + "kl": 0.0004793405532836914, + "learning_rate": 9.975308641975308e-08, + "loss": 0.0, + "reward": 1.767857201397419, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 808 + }, + { + "completion_length": 244.21876049041748, + "epoch": 0.13579781214635986, + "grad_norm": 0.28017698150363773, + "kl": 0.0004837512969970703, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": 1.7232143878936768, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7276785895228386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 810 + }, + { + "completion_length": 247.321439743042, + "epoch": 0.1361331153862274, + "grad_norm": 0.29506744159707904, + "kl": 0.000514984130859375, + "learning_rate": 1.0024691358024691e-07, + "loss": 0.0, + "reward": 1.737500086426735, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643122702837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 812 + }, + { + "completion_length": 248.3794765472412, + "epoch": 0.13646841862609496, + "grad_norm": 0.45052300905816106, + "kl": 0.0004858970642089844, + "learning_rate": 1.0049382716049381e-07, + "loss": 0.0, + "reward": 1.807142935693264, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 814 + }, + { + "completion_length": 247.02679443359375, + "epoch": 0.13680372186596254, + "grad_norm": 0.23890198941501573, + "kl": 0.0005213022232055664, + "learning_rate": 1.0074074074074074e-07, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 816 + }, + { + "completion_length": 255.1428680419922, + "epoch": 0.1371390251058301, + "grad_norm": 0.3321438645772538, + "kl": 0.000493168830871582, + "learning_rate": 1.0098765432098764e-07, + "loss": 0.0, + "reward": 1.7232143431901932, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7276786118745804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 818 + }, + { + "completion_length": 244.43304824829102, + "epoch": 0.13747432834569764, + "grad_norm": 0.20810648283029934, + "kl": 0.0005180835723876953, + "learning_rate": 1.0123456790123457e-07, + "loss": 0.0, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.739285746589303, + "rewards/format_reward_func": 1.0, + "step": 820 + }, + { + "completion_length": 237.0134038925171, + "epoch": 0.1378096315855652, + "grad_norm": 0.22541616829027122, + "kl": 0.0004540681838989258, + "learning_rate": 1.0148148148148147e-07, + "loss": 0.0, + "reward": 1.8535714894533157, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.8625000156462193, + "rewards/format_reward_func": 0.9910714328289032, + "step": 822 + }, + { + "completion_length": 242.77233219146729, + "epoch": 0.13814493482543275, + "grad_norm": 0.27520042618499535, + "kl": 0.00048482418060302734, + "learning_rate": 1.017283950617284e-07, + "loss": 0.0, + "reward": 1.789285771548748, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 824 + }, + { + "completion_length": 248.8660831451416, + "epoch": 0.1384802380653003, + "grad_norm": 0.30717772584495134, + "kl": 0.0005295276641845703, + "learning_rate": 1.019753086419753e-07, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.06060914974659681, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 826 + }, + { + "completion_length": 241.49554824829102, + "epoch": 0.13881554130516785, + "grad_norm": 0.16709679133115038, + "kl": 0.0005042552947998047, + "learning_rate": 1.0222222222222223e-07, + "loss": 0.0, + "reward": 1.7964286282658577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 828 + }, + { + "completion_length": 258.34376335144043, + "epoch": 0.13915084454503543, + "grad_norm": 0.19247229448577885, + "kl": 0.0004944801330566406, + "learning_rate": 1.0246913580246913e-07, + "loss": 0.0, + "reward": 1.755357213318348, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 830 + }, + { + "completion_length": 236.6651906967163, + "epoch": 0.13948614778490298, + "grad_norm": 0.2414068343995031, + "kl": 0.00048828125, + "learning_rate": 1.0271604938271603e-07, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 832 + }, + { + "completion_length": 245.133939743042, + "epoch": 0.13982145102477053, + "grad_norm": 0.328663766531583, + "kl": 0.0005055665969848633, + "learning_rate": 1.0296296296296296e-07, + "loss": 0.0, + "reward": 1.689285784959793, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.6892857402563095, + "rewards/format_reward_func": 1.0, + "step": 834 + }, + { + "completion_length": 243.54018878936768, + "epoch": 0.14015675426463808, + "grad_norm": 0.4397604670140151, + "kl": 0.00045621395111083984, + "learning_rate": 1.0320987654320986e-07, + "loss": 0.0, + "reward": 1.7482143491506577, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 836 + }, + { + "completion_length": 241.74108219146729, + "epoch": 0.14049205750450564, + "grad_norm": 0.29775824487765673, + "kl": 0.00047719478607177734, + "learning_rate": 1.0345679012345679e-07, + "loss": 0.0, + "reward": 1.7517857626080513, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 838 + }, + { + "completion_length": 239.21429538726807, + "epoch": 0.1408273607443732, + "grad_norm": 0.3551956976091781, + "kl": 0.0004627704620361328, + "learning_rate": 1.0370370370370369e-07, + "loss": 0.0, + "reward": 1.757589340209961, + "reward_std": 0.09028238197788596, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9941964335739613, + "step": 840 + }, + { + "completion_length": 249.44197368621826, + "epoch": 0.14116266398424074, + "grad_norm": 0.16079556819889082, + "kl": 0.0005052089691162109, + "learning_rate": 1.0395061728395062e-07, + "loss": 0.0, + "reward": 1.701785795390606, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7062500417232513, + "rewards/format_reward_func": 0.9955357164144516, + "step": 842 + }, + { + "completion_length": 249.0178689956665, + "epoch": 0.1414979672241083, + "grad_norm": 0.43008578708661116, + "kl": 0.0005563497543334961, + "learning_rate": 1.0419753086419752e-07, + "loss": 0.0, + "reward": 1.7410714998841286, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7455357611179352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 844 + }, + { + "completion_length": 240.696439743042, + "epoch": 0.14183327046397587, + "grad_norm": 0.19942106819097674, + "kl": 0.0004343986511230469, + "learning_rate": 1.0444444444444445e-07, + "loss": 0.0, + "reward": 1.7803571745753288, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 846 + }, + { + "completion_length": 248.0937614440918, + "epoch": 0.14216857370384342, + "grad_norm": 0.32938655852109006, + "kl": 0.0005087852478027344, + "learning_rate": 1.0469135802469135e-07, + "loss": 0.0, + "reward": 1.7625000700354576, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7669643107801676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 848 + }, + { + "completion_length": 241.70983123779297, + "epoch": 0.14250387694371097, + "grad_norm": 0.20099853956894784, + "kl": 0.0004711151123046875, + "learning_rate": 1.0493827160493827e-07, + "loss": 0.0, + "reward": 1.7571429014205933, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428921073675, + "rewards/format_reward_func": 1.0, + "step": 850 + }, + { + "completion_length": 251.04911613464355, + "epoch": 0.14283918018357852, + "grad_norm": 0.26710235078942274, + "kl": 0.0005549192428588867, + "learning_rate": 1.0518518518518518e-07, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 852 + }, + { + "completion_length": 242.21875953674316, + "epoch": 0.14317448342344608, + "grad_norm": 0.24595597743321643, + "kl": 0.000561833381652832, + "learning_rate": 1.054320987654321e-07, + "loss": 0.0, + "reward": 1.7821429073810577, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 854 + }, + { + "completion_length": 239.6160831451416, + "epoch": 0.14350978666331363, + "grad_norm": 0.24191581359797343, + "kl": 0.0005096197128295898, + "learning_rate": 1.0567901234567901e-07, + "loss": 0.0, + "reward": 1.7125000730156898, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 856 + }, + { + "completion_length": 242.3705472946167, + "epoch": 0.14384508990318118, + "grad_norm": 0.3070888032318429, + "kl": 0.0004938840866088867, + "learning_rate": 1.0592592592592592e-07, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 858 + }, + { + "completion_length": 250.14733123779297, + "epoch": 0.14418039314304876, + "grad_norm": 0.2893268587254202, + "kl": 0.0005345344543457031, + "learning_rate": 1.0617283950617284e-07, + "loss": 0.0, + "reward": 1.7589286640286446, + "reward_std": 0.09848987031728029, + "rewards/equation_reward_func": 0.7633928768336773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 860 + }, + { + "completion_length": 244.47322463989258, + "epoch": 0.1445156963829163, + "grad_norm": 0.19801472144759966, + "kl": 0.0005357265472412109, + "learning_rate": 1.0641975308641974e-07, + "loss": 0.0, + "reward": 1.7178572490811348, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 862 + }, + { + "completion_length": 241.60715293884277, + "epoch": 0.14485099962278386, + "grad_norm": 0.2844514690303181, + "kl": 0.0005900859832763672, + "learning_rate": 1.0666666666666667e-07, + "loss": 0.0, + "reward": 1.8107143342494965, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 864 + }, + { + "completion_length": 246.8035831451416, + "epoch": 0.1451863028626514, + "grad_norm": 0.14321224353680495, + "kl": 0.0005359649658203125, + "learning_rate": 1.0691358024691357e-07, + "loss": 0.0, + "reward": 1.6946429386734962, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.6991071663796902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 866 + }, + { + "completion_length": 252.5446538925171, + "epoch": 0.14552160610251896, + "grad_norm": 0.2782425391415311, + "kl": 0.00047397613525390625, + "learning_rate": 1.0716049382716049e-07, + "loss": 0.0, + "reward": 1.778571479022503, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7875000424683094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 868 + }, + { + "completion_length": 240.04465293884277, + "epoch": 0.14585690934238651, + "grad_norm": 0.41490336823387663, + "kl": 0.0005705356597900391, + "learning_rate": 1.074074074074074e-07, + "loss": 0.0, + "reward": 1.7500000894069672, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7500000279396772, + "rewards/format_reward_func": 1.0, + "step": 870 + }, + { + "completion_length": 249.63840675354004, + "epoch": 0.14619221258225407, + "grad_norm": 0.2734543943620819, + "kl": 0.0005452632904052734, + "learning_rate": 1.0765432098765431e-07, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.771428594365716, + "rewards/format_reward_func": 1.0, + "step": 872 + }, + { + "completion_length": 239.69197273254395, + "epoch": 0.14652751582212162, + "grad_norm": 0.2274695812309938, + "kl": 0.000532984733581543, + "learning_rate": 1.0790123456790123e-07, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 874 + }, + { + "completion_length": 238.9330472946167, + "epoch": 0.1468628190619892, + "grad_norm": 0.16239306753807847, + "kl": 0.0005614757537841797, + "learning_rate": 1.0814814814814814e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 876 + }, + { + "completion_length": 246.38393783569336, + "epoch": 0.14719812230185675, + "grad_norm": 0.25783637098150386, + "kl": 0.0005737543106079102, + "learning_rate": 1.0839506172839506e-07, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7767857387661934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 878 + }, + { + "completion_length": 241.51786708831787, + "epoch": 0.1475334255417243, + "grad_norm": 0.3677828611432142, + "kl": 0.0005735158920288086, + "learning_rate": 1.0864197530864197e-07, + "loss": 0.0, + "reward": 1.7125000730156898, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.7169643249362707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 880 + }, + { + "completion_length": 242.76340675354004, + "epoch": 0.14786872878159185, + "grad_norm": 0.20185872070200894, + "kl": 0.0005519390106201172, + "learning_rate": 1.0888888888888888e-07, + "loss": 0.0, + "reward": 1.7696429267525673, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 882 + }, + { + "completion_length": 239.31697368621826, + "epoch": 0.1482040320214594, + "grad_norm": 0.24556548501686268, + "kl": 0.0005544424057006836, + "learning_rate": 1.091358024691358e-07, + "loss": 0.0, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857510596514, + "rewards/format_reward_func": 1.0, + "step": 884 + }, + { + "completion_length": 236.79465198516846, + "epoch": 0.14853933526132695, + "grad_norm": 0.19666918858541813, + "kl": 0.0005842447280883789, + "learning_rate": 1.093827160493827e-07, + "loss": 0.0, + "reward": 1.7142857909202576, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857201397419, + "rewards/format_reward_func": 1.0, + "step": 886 + }, + { + "completion_length": 243.69644165039062, + "epoch": 0.1488746385011945, + "grad_norm": 0.2543890898105721, + "kl": 0.0005837678909301758, + "learning_rate": 1.0962962962962963e-07, + "loss": 0.0, + "reward": 1.7946429252624512, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.799107164144516, + "rewards/format_reward_func": 0.9955357164144516, + "step": 888 + }, + { + "completion_length": 240.16518783569336, + "epoch": 0.14920994174106209, + "grad_norm": 0.2612662392595029, + "kl": 0.000578761100769043, + "learning_rate": 1.0987654320987653e-07, + "loss": 0.0, + "reward": 1.7535714879631996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.753571443259716, + "rewards/format_reward_func": 1.0, + "step": 890 + }, + { + "completion_length": 243.67412090301514, + "epoch": 0.14954524498092964, + "grad_norm": 0.2645548665031763, + "kl": 0.0005819797515869141, + "learning_rate": 1.1012345679012345e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7696428671479225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 892 + }, + { + "completion_length": 240.39733123779297, + "epoch": 0.1498805482207972, + "grad_norm": 0.202545137842961, + "kl": 0.0006400346755981445, + "learning_rate": 1.1037037037037036e-07, + "loss": 0.0, + "reward": 1.7142857760190964, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857573926449, + "rewards/format_reward_func": 1.0, + "step": 894 + }, + { + "completion_length": 244.81250953674316, + "epoch": 0.15021585146066474, + "grad_norm": 0.26342116615886524, + "kl": 0.0005612373352050781, + "learning_rate": 1.1061728395061728e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 896 + }, + { + "completion_length": 252.0714406967163, + "epoch": 0.1505511547005323, + "grad_norm": 0.19740268449770748, + "kl": 0.0004432201385498047, + "learning_rate": 1.108641975308642e-07, + "loss": 0.0, + "reward": 1.7375000715255737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.741964302957058, + "rewards/format_reward_func": 0.9955357164144516, + "step": 898 + }, + { + "completion_length": 244.9062623977661, + "epoch": 0.15088645794039984, + "grad_norm": 0.21749415281902099, + "kl": 0.0005629062652587891, + "learning_rate": 1.111111111111111e-07, + "loss": 0.0, + "reward": 1.755357213318348, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 900 + }, + { + "completion_length": 238.94643783569336, + "epoch": 0.1512217611802674, + "grad_norm": 0.19636977379194281, + "kl": 0.0005797147750854492, + "learning_rate": 1.1135802469135802e-07, + "loss": 0.0, + "reward": 1.7910714745521545, + "reward_std": 0.022728432901203632, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 902 + }, + { + "completion_length": 237.40179634094238, + "epoch": 0.15155706442013495, + "grad_norm": 0.0812537734176348, + "kl": 0.0005922317504882812, + "learning_rate": 1.1160493827160493e-07, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000111758709, + "rewards/format_reward_func": 1.0, + "step": 904 + }, + { + "completion_length": 240.58482837677002, + "epoch": 0.15189236766000253, + "grad_norm": 0.31431448086070735, + "kl": 0.0008060932159423828, + "learning_rate": 1.1185185185185185e-07, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7732143197208643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 906 + }, + { + "completion_length": 234.2857265472412, + "epoch": 0.15222767089987008, + "grad_norm": 0.13392711374206998, + "kl": 0.0005654096603393555, + "learning_rate": 1.1209876543209876e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 908 + }, + { + "completion_length": 244.5267972946167, + "epoch": 0.15256297413973763, + "grad_norm": 0.27221219675506475, + "kl": 0.0005095005035400391, + "learning_rate": 1.1234567901234568e-07, + "loss": 0.0, + "reward": 1.7267857789993286, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7312500383704901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 910 + }, + { + "completion_length": 241.03126049041748, + "epoch": 0.15289827737960518, + "grad_norm": 0.31530224582644223, + "kl": 0.0005794763565063477, + "learning_rate": 1.1259259259259258e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000238418579, + "rewards/format_reward_func": 1.0, + "step": 912 + }, + { + "completion_length": 246.7991180419922, + "epoch": 0.15323358061947273, + "grad_norm": 0.250379166539302, + "kl": 0.0006912946701049805, + "learning_rate": 1.1283950617283951e-07, + "loss": 0.0, + "reward": 1.7142858058214188, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 914 + }, + { + "completion_length": 238.97322463989258, + "epoch": 0.15356888385934028, + "grad_norm": 0.19226153813655147, + "kl": 0.0006650686264038086, + "learning_rate": 1.1308641975308641e-07, + "loss": 0.0, + "reward": 1.7267857640981674, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.740178594365716, + "rewards/format_reward_func": 0.9866071492433548, + "step": 916 + }, + { + "completion_length": 236.57143688201904, + "epoch": 0.15390418709920783, + "grad_norm": 0.29729296307514946, + "kl": 0.00064849853515625, + "learning_rate": 1.1333333333333332e-07, + "loss": 0.0, + "reward": 1.7892857566475868, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 918 + }, + { + "completion_length": 241.696439743042, + "epoch": 0.1542394903390754, + "grad_norm": 0.23932282850941852, + "kl": 0.0005295276641845703, + "learning_rate": 1.1358024691358024e-07, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 920 + }, + { + "completion_length": 240.95090293884277, + "epoch": 0.15457479357894297, + "grad_norm": 0.1940264202902731, + "kl": 0.0005745887756347656, + "learning_rate": 1.1382716049382715e-07, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071619093418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 922 + }, + { + "completion_length": 243.47322273254395, + "epoch": 0.15491009681881052, + "grad_norm": 0.21505952196582073, + "kl": 0.0005426406860351562, + "learning_rate": 1.1407407407407407e-07, + "loss": 0.0, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 924 + }, + { + "completion_length": 239.3616180419922, + "epoch": 0.15524540005867807, + "grad_norm": 0.2723087004689939, + "kl": 0.0005807876586914062, + "learning_rate": 1.1432098765432098e-07, + "loss": 0.0, + "reward": 1.8107143342494965, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8196429014205933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 926 + }, + { + "completion_length": 248.90179824829102, + "epoch": 0.15558070329854562, + "grad_norm": 0.4941249855932243, + "kl": 0.0005967617034912109, + "learning_rate": 1.145679012345679e-07, + "loss": 0.0, + "reward": 1.7767857536673546, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500186264515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 928 + }, + { + "completion_length": 241.01340293884277, + "epoch": 0.15591600653841317, + "grad_norm": 0.1552928542536375, + "kl": 0.0006102323532104492, + "learning_rate": 1.148148148148148e-07, + "loss": 0.0, + "reward": 1.7500000894069672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 930 + }, + { + "completion_length": 238.94197273254395, + "epoch": 0.15625130977828072, + "grad_norm": 0.418642637769558, + "kl": 0.0006206035614013672, + "learning_rate": 1.1506172839506173e-07, + "loss": 0.0, + "reward": 1.7428572177886963, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7428571749478579, + "rewards/format_reward_func": 1.0, + "step": 932 + }, + { + "completion_length": 243.3437614440918, + "epoch": 0.15658661301814827, + "grad_norm": 0.223657964390005, + "kl": 0.0006108283996582031, + "learning_rate": 1.1530864197530863e-07, + "loss": 0.0, + "reward": 1.723214402794838, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276786025613546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 934 + }, + { + "completion_length": 238.477689743042, + "epoch": 0.15692191625801585, + "grad_norm": 0.19391093772147885, + "kl": 0.0006520748138427734, + "learning_rate": 1.1555555555555555e-07, + "loss": 0.0, + "reward": 1.7750000804662704, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 936 + }, + { + "completion_length": 243.4955472946167, + "epoch": 0.1572572194978834, + "grad_norm": 0.29975605622207685, + "kl": 0.000649571418762207, + "learning_rate": 1.1580246913580246e-07, + "loss": 0.0, + "reward": 1.7625000849366188, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7669643014669418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 938 + }, + { + "completion_length": 240.0625114440918, + "epoch": 0.15759252273775096, + "grad_norm": 0.1899270930718893, + "kl": 0.0006060600280761719, + "learning_rate": 1.1604938271604938e-07, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 940 + }, + { + "completion_length": 246.10715293884277, + "epoch": 0.1579278259776185, + "grad_norm": 0.3378569868261778, + "kl": 0.0005865097045898438, + "learning_rate": 1.162962962962963e-07, + "loss": 0.0, + "reward": 1.7303572073578835, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7348214574158192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 942 + }, + { + "completion_length": 238.98661613464355, + "epoch": 0.15826312921748606, + "grad_norm": 0.23510469699532882, + "kl": 0.0005903244018554688, + "learning_rate": 1.1654320987654321e-07, + "loss": 0.0, + "reward": 1.8107143267989159, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 944 + }, + { + "completion_length": 239.17858219146729, + "epoch": 0.1585984324573536, + "grad_norm": 0.2531029628465875, + "kl": 0.0005866289138793945, + "learning_rate": 1.1679012345679012e-07, + "loss": 0.0, + "reward": 1.7553571909666061, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 946 + }, + { + "completion_length": 239.99108219146729, + "epoch": 0.15893373569722116, + "grad_norm": 0.1832711381666499, + "kl": 0.0005750656127929688, + "learning_rate": 1.1703703703703702e-07, + "loss": 0.0, + "reward": 1.8303571790456772, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.834821455180645, + "rewards/format_reward_func": 0.9955357164144516, + "step": 948 + }, + { + "completion_length": 248.0223331451416, + "epoch": 0.15926903893708874, + "grad_norm": 0.21616249364705065, + "kl": 0.0006320476531982422, + "learning_rate": 1.1728395061728394e-07, + "loss": 0.0, + "reward": 1.7892857939004898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 950 + }, + { + "completion_length": 247.7053689956665, + "epoch": 0.1596043421769563, + "grad_norm": 0.2808309498061027, + "kl": 0.0006732940673828125, + "learning_rate": 1.1753086419753085e-07, + "loss": 0.0, + "reward": 1.732142947614193, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428656578064, + "rewards/format_reward_func": 1.0, + "step": 952 + }, + { + "completion_length": 247.93304634094238, + "epoch": 0.15993964541682384, + "grad_norm": 0.28244344156282136, + "kl": 0.0006171464920043945, + "learning_rate": 1.1777777777777777e-07, + "loss": 0.0, + "reward": 1.7678571939468384, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 954 + }, + { + "completion_length": 249.13840293884277, + "epoch": 0.1602749486566914, + "grad_norm": 0.3471574848884779, + "kl": 0.0006940364837646484, + "learning_rate": 1.1802469135802468e-07, + "loss": 0.0, + "reward": 1.7535714879631996, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 956 + }, + { + "completion_length": 237.1696548461914, + "epoch": 0.16061025189655895, + "grad_norm": 0.24213326359003345, + "kl": 0.0006643533706665039, + "learning_rate": 1.182716049382716e-07, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 958 + }, + { + "completion_length": 246.29019260406494, + "epoch": 0.1609455551364265, + "grad_norm": 0.25180774730216865, + "kl": 0.0006979703903198242, + "learning_rate": 1.1851851851851851e-07, + "loss": 0.0, + "reward": 1.7196429297327995, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 960 + }, + { + "completion_length": 240.30804824829102, + "epoch": 0.16128085837629405, + "grad_norm": 0.27009721641478057, + "kl": 0.0005599260330200195, + "learning_rate": 1.1876543209876543e-07, + "loss": 0.0, + "reward": 1.7857143580913544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 962 + }, + { + "completion_length": 240.95090675354004, + "epoch": 0.16161616161616163, + "grad_norm": 0.20274197501049643, + "kl": 0.0005586147308349609, + "learning_rate": 1.1901234567901234e-07, + "loss": 0.0, + "reward": 1.7035715207457542, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.712500024586916, + "rewards/format_reward_func": 0.9910714328289032, + "step": 964 + }, + { + "completion_length": 234.34376049041748, + "epoch": 0.16195146485602918, + "grad_norm": 0.2154479276321837, + "kl": 0.0006890296936035156, + "learning_rate": 1.1925925925925924e-07, + "loss": 0.0, + "reward": 1.7821429148316383, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7910714615136385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 966 + }, + { + "completion_length": 246.4776906967163, + "epoch": 0.16228676809589673, + "grad_norm": 0.31223951314737436, + "kl": 0.000695347785949707, + "learning_rate": 1.1950617283950616e-07, + "loss": 0.0, + "reward": 1.74821437895298, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 968 + }, + { + "completion_length": 247.47322463989258, + "epoch": 0.16262207133576428, + "grad_norm": 0.26589705178869344, + "kl": 0.0006476640701293945, + "learning_rate": 1.1975308641975307e-07, + "loss": 0.0, + "reward": 1.7379465252161026, + "reward_std": 0.06755394977517426, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 0.9986607171595097, + "step": 970 + }, + { + "completion_length": 243.9241180419922, + "epoch": 0.16295737457563184, + "grad_norm": 0.2610188794854556, + "kl": 0.0007169246673583984, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": 1.733928695321083, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 972 + }, + { + "completion_length": 244.821439743042, + "epoch": 0.1632926778154994, + "grad_norm": 0.3007014229187292, + "kl": 0.0007772445678710938, + "learning_rate": 1.202469135802469e-07, + "loss": 0.0, + "reward": 1.7875000685453415, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 974 + }, + { + "completion_length": 244.1964406967163, + "epoch": 0.16362798105536694, + "grad_norm": 0.31254137964797346, + "kl": 0.0007396936416625977, + "learning_rate": 1.2049382716049382e-07, + "loss": 0.0, + "reward": 1.764285795390606, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 976 + }, + { + "completion_length": 245.65180015563965, + "epoch": 0.1639632842952345, + "grad_norm": 0.12124514568359325, + "kl": 0.0006595849990844727, + "learning_rate": 1.2074074074074073e-07, + "loss": 0.0, + "reward": 1.7089286372065544, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.713392898440361, + "rewards/format_reward_func": 0.9955357164144516, + "step": 978 + }, + { + "completion_length": 231.5982255935669, + "epoch": 0.16429858753510207, + "grad_norm": 0.30993064357758865, + "kl": 0.0006895065307617188, + "learning_rate": 1.2098765432098765e-07, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7651786059141159, + "rewards/format_reward_func": 0.9919642880558968, + "step": 980 + }, + { + "completion_length": 240.02679538726807, + "epoch": 0.16463389077496962, + "grad_norm": 0.2581272471044061, + "kl": 0.0006649494171142578, + "learning_rate": 1.2123456790123456e-07, + "loss": 0.0, + "reward": 1.7482143566012383, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 982 + }, + { + "completion_length": 241.2678689956665, + "epoch": 0.16496919401483717, + "grad_norm": 0.36361643382301556, + "kl": 0.00080108642578125, + "learning_rate": 1.2148148148148148e-07, + "loss": 0.0, + "reward": 1.7607143446803093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 984 + }, + { + "completion_length": 250.83483505249023, + "epoch": 0.16530449725470472, + "grad_norm": 0.21378059263062124, + "kl": 0.0007206201553344727, + "learning_rate": 1.217283950617284e-07, + "loss": 0.0, + "reward": 1.7017857730388641, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7062500435858965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 986 + }, + { + "completion_length": 238.31697463989258, + "epoch": 0.16563980049457228, + "grad_norm": 0.32740835817943337, + "kl": 0.0007141828536987305, + "learning_rate": 1.219753086419753e-07, + "loss": 0.0, + "reward": 1.6821429505944252, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.6910714693367481, + "rewards/format_reward_func": 0.9910714328289032, + "step": 988 + }, + { + "completion_length": 238.20983219146729, + "epoch": 0.16597510373443983, + "grad_norm": 0.19363083263718955, + "kl": 0.0007278919219970703, + "learning_rate": 1.2222222222222222e-07, + "loss": 0.0, + "reward": 1.7000000700354576, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7000000439584255, + "rewards/format_reward_func": 1.0, + "step": 990 + }, + { + "completion_length": 234.18751049041748, + "epoch": 0.16631040697430738, + "grad_norm": 0.23389482243909598, + "kl": 0.0006586313247680664, + "learning_rate": 1.2246913580246914e-07, + "loss": 0.0, + "reward": 1.7593750730156898, + "reward_std": 0.047350899782031775, + "rewards/equation_reward_func": 0.7696428969502449, + "rewards/format_reward_func": 0.9897321499884129, + "step": 992 + }, + { + "completion_length": 239.7009038925171, + "epoch": 0.16664571021417496, + "grad_norm": 0.2894204366418183, + "kl": 0.0007368326187133789, + "learning_rate": 1.2271604938271605e-07, + "loss": 0.0, + "reward": 1.7821429297327995, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7910714708268642, + "rewards/format_reward_func": 0.9910714328289032, + "step": 994 + }, + { + "completion_length": 232.94643878936768, + "epoch": 0.1669810134540425, + "grad_norm": 0.24697120320275165, + "kl": 0.0007069110870361328, + "learning_rate": 1.2296296296296297e-07, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 996 + }, + { + "completion_length": 240.09376335144043, + "epoch": 0.16731631669391006, + "grad_norm": 0.26157424089241804, + "kl": 0.0006704330444335938, + "learning_rate": 1.2320987654320988e-07, + "loss": 0.0, + "reward": 1.764285795390606, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.764285733923316, + "rewards/format_reward_func": 1.0, + "step": 998 + }, + { + "completion_length": 253.48215198516846, + "epoch": 0.1676516199337776, + "grad_norm": 0.2163902996279677, + "kl": 0.0006573200225830078, + "learning_rate": 1.2345679012345677e-07, + "loss": 0.0, + "reward": 1.7928572073578835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 1000 + }, + { + "completion_length": 248.79018878936768, + "epoch": 0.16798692317364516, + "grad_norm": 0.12645581691936147, + "kl": 0.0006426572799682617, + "learning_rate": 1.237037037037037e-07, + "loss": 0.0, + "reward": 1.7178572341799736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.71785718947649, + "rewards/format_reward_func": 1.0, + "step": 1002 + }, + { + "completion_length": 229.32143878936768, + "epoch": 0.16832222641351272, + "grad_norm": 0.15066270893991937, + "kl": 0.0006672143936157227, + "learning_rate": 1.239506172839506e-07, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 1004 + }, + { + "completion_length": 243.65179538726807, + "epoch": 0.16865752965338027, + "grad_norm": 0.19863441041666846, + "kl": 0.0008147954940795898, + "learning_rate": 1.2419753086419751e-07, + "loss": 0.0, + "reward": 1.7517857924103737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1006 + }, + { + "completion_length": 236.758939743042, + "epoch": 0.16899283289324782, + "grad_norm": 0.323289276257295, + "kl": 0.0006589889526367188, + "learning_rate": 1.2444444444444443e-07, + "loss": 0.0, + "reward": 1.816071480512619, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8205357156693935, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1008 + }, + { + "completion_length": 239.04465293884277, + "epoch": 0.1693281361331154, + "grad_norm": 0.21195653521421964, + "kl": 0.0006917715072631836, + "learning_rate": 1.2469135802469134e-07, + "loss": 0.0, + "reward": 1.7285715118050575, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7464285958558321, + "rewards/format_reward_func": 0.9821428619325161, + "step": 1010 + }, + { + "completion_length": 234.57143878936768, + "epoch": 0.16966343937298295, + "grad_norm": 0.14904220568452134, + "kl": 0.0006827116012573242, + "learning_rate": 1.2493827160493826e-07, + "loss": 0.0, + "reward": 1.7477679178118706, + "reward_std": 0.053664354141801596, + "rewards/equation_reward_func": 0.7553571909666061, + "rewards/format_reward_func": 0.9924107193946838, + "step": 1012 + }, + { + "completion_length": 239.071439743042, + "epoch": 0.1699987426128505, + "grad_norm": 0.32596573692825204, + "kl": 0.0007929801940917969, + "learning_rate": 1.2518518518518517e-07, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 1014 + }, + { + "completion_length": 241.7053689956665, + "epoch": 0.17033404585271805, + "grad_norm": 0.22868795871007735, + "kl": 0.0008907318115234375, + "learning_rate": 1.254320987654321e-07, + "loss": 0.0, + "reward": 1.7107143700122833, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7196428813040257, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1016 + }, + { + "completion_length": 242.83037185668945, + "epoch": 0.1706693490925856, + "grad_norm": 0.2450458836005228, + "kl": 0.0008082389831542969, + "learning_rate": 1.25679012345679e-07, + "loss": 0.0, + "reward": 1.764285795390606, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7732143215835094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1018 + }, + { + "completion_length": 241.7321538925171, + "epoch": 0.17100465233245316, + "grad_norm": 0.22439507701355918, + "kl": 0.0007718801498413086, + "learning_rate": 1.2592592592592592e-07, + "loss": 0.0, + "reward": 1.7607143744826317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143167406321, + "rewards/format_reward_func": 1.0, + "step": 1020 + }, + { + "completion_length": 239.7857208251953, + "epoch": 0.1713399555723207, + "grad_norm": 0.275089302136405, + "kl": 0.0007205009460449219, + "learning_rate": 1.2617283950617283e-07, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 1022 + }, + { + "completion_length": 236.66518783569336, + "epoch": 0.17167525881218829, + "grad_norm": 0.31408574841301085, + "kl": 0.0007870197296142578, + "learning_rate": 1.2641975308641975e-07, + "loss": 0.0, + "reward": 1.6982143744826317, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7026785835623741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1024 + }, + { + "completion_length": 244.68751049041748, + "epoch": 0.17201056205205584, + "grad_norm": 0.23233340761476015, + "kl": 0.0008363723754882812, + "learning_rate": 1.2666666666666666e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.0656599160283804, + "rewards/equation_reward_func": 0.7267857417464256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1026 + }, + { + "completion_length": 243.33483409881592, + "epoch": 0.1723458652919234, + "grad_norm": 0.2969390274202134, + "kl": 0.000970005989074707, + "learning_rate": 1.2691358024691358e-07, + "loss": 0.0, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 1028 + }, + { + "completion_length": 232.19643783569336, + "epoch": 0.17268116853179094, + "grad_norm": 0.267704808940672, + "kl": 0.0009148120880126953, + "learning_rate": 1.271604938271605e-07, + "loss": 0.0, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1030 + }, + { + "completion_length": 242.52679920196533, + "epoch": 0.1730164717716585, + "grad_norm": 0.41808152142365423, + "kl": 0.0009038448333740234, + "learning_rate": 1.274074074074074e-07, + "loss": 0.0, + "reward": 1.7160715162754059, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.720535745844245, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1032 + }, + { + "completion_length": 237.8348331451416, + "epoch": 0.17335177501152604, + "grad_norm": 0.2222729619708794, + "kl": 0.0007498264312744141, + "learning_rate": 1.276543209876543e-07, + "loss": 0.0, + "reward": 1.716071493923664, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.720535745844245, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1034 + }, + { + "completion_length": 239.33483219146729, + "epoch": 0.1736870782513936, + "grad_norm": 0.24150968139086826, + "kl": 0.0009173154830932617, + "learning_rate": 1.279012345679012e-07, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.06060914974659681, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1036 + }, + { + "completion_length": 234.47322463989258, + "epoch": 0.17402238149126115, + "grad_norm": 0.24846206951979183, + "kl": 0.0008068084716796875, + "learning_rate": 1.2814814814814815e-07, + "loss": 0.0, + "reward": 1.766071505844593, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1038 + }, + { + "completion_length": 237.48661708831787, + "epoch": 0.17435768473112873, + "grad_norm": 0.23149397808743663, + "kl": 0.0008314847946166992, + "learning_rate": 1.2839506172839507e-07, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 1040 + }, + { + "completion_length": 237.63840293884277, + "epoch": 0.17469298797099628, + "grad_norm": 0.272804539190451, + "kl": 0.0009119510650634766, + "learning_rate": 1.2864197530864195e-07, + "loss": 0.0, + "reward": 1.759375087916851, + "reward_std": 0.0473508988507092, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 0.9986607171595097, + "step": 1042 + }, + { + "completion_length": 232.0491180419922, + "epoch": 0.17502829121086383, + "grad_norm": 0.26158107193461716, + "kl": 0.0008912086486816406, + "learning_rate": 1.2888888888888887e-07, + "loss": 0.0, + "reward": 1.705357238650322, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.709821455180645, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1044 + }, + { + "completion_length": 233.5178680419922, + "epoch": 0.17536359445073138, + "grad_norm": 0.16438665975664443, + "kl": 0.0008466243743896484, + "learning_rate": 1.291358024691358e-07, + "loss": 0.0, + "reward": 1.799553632736206, + "reward_std": 0.061240497045218945, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9968750029802322, + "step": 1046 + }, + { + "completion_length": 232.6875123977661, + "epoch": 0.17569889769059893, + "grad_norm": 0.1428673378811846, + "kl": 0.0009279251098632812, + "learning_rate": 1.2938271604938273e-07, + "loss": 0.0, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607142999768257, + "rewards/format_reward_func": 1.0, + "step": 1048 + }, + { + "completion_length": 247.0312614440918, + "epoch": 0.17603420093046648, + "grad_norm": 0.24613124143913667, + "kl": 0.0008966922760009766, + "learning_rate": 1.2962962962962961e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 1050 + }, + { + "completion_length": 244.1875123977661, + "epoch": 0.17636950417033403, + "grad_norm": 0.20124636357271036, + "kl": 0.00095367431640625, + "learning_rate": 1.2987654320987653e-07, + "loss": 0.0, + "reward": 1.7178572341799736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571708500385, + "rewards/format_reward_func": 1.0, + "step": 1052 + }, + { + "completion_length": 243.13840675354004, + "epoch": 0.17670480741020161, + "grad_norm": 0.2653305938876837, + "kl": 0.0008423328399658203, + "learning_rate": 1.3012345679012347e-07, + "loss": 0.0, + "reward": 1.7285714969038963, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 1054 + }, + { + "completion_length": 238.96429824829102, + "epoch": 0.17704011065006917, + "grad_norm": 0.33954870085400096, + "kl": 0.0009109973907470703, + "learning_rate": 1.3037037037037036e-07, + "loss": 0.0, + "reward": 1.725000075995922, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7339286059141159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1056 + }, + { + "completion_length": 255.60269451141357, + "epoch": 0.17737541388993672, + "grad_norm": 0.336539878900831, + "kl": 0.0009992122650146484, + "learning_rate": 1.3061728395061727e-07, + "loss": 0.0, + "reward": 1.703571505844593, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7125000264495611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1058 + }, + { + "completion_length": 240.08929634094238, + "epoch": 0.17771071712980427, + "grad_norm": 0.26274674597932585, + "kl": 0.0010219812393188477, + "learning_rate": 1.308641975308642e-07, + "loss": 0.0, + "reward": 1.7392857745289803, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1060 + }, + { + "completion_length": 238.883939743042, + "epoch": 0.17804602036967182, + "grad_norm": 0.29858341249542397, + "kl": 0.000922083854675293, + "learning_rate": 1.3111111111111113e-07, + "loss": 0.0, + "reward": 1.7035715132951736, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 1.0, + "step": 1062 + }, + { + "completion_length": 238.93751049041748, + "epoch": 0.17838132360953937, + "grad_norm": 0.26838733869333925, + "kl": 0.0009055137634277344, + "learning_rate": 1.3135802469135802e-07, + "loss": 0.0, + "reward": 1.7125000730156898, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7169643323868513, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1064 + }, + { + "completion_length": 240.852689743042, + "epoch": 0.17871662684940692, + "grad_norm": 0.3392750138879499, + "kl": 0.0009553432464599609, + "learning_rate": 1.3160493827160493e-07, + "loss": 0.0, + "reward": 1.7785715088248253, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7875000238418579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1066 + }, + { + "completion_length": 235.77679824829102, + "epoch": 0.17905193008927447, + "grad_norm": 0.18746947628661784, + "kl": 0.001000523567199707, + "learning_rate": 1.3185185185185185e-07, + "loss": 0.0, + "reward": 1.778571479022503, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.778571454808116, + "rewards/format_reward_func": 1.0, + "step": 1068 + }, + { + "completion_length": 241.3928680419922, + "epoch": 0.17938723332914205, + "grad_norm": 0.3135635456080588, + "kl": 0.0010788440704345703, + "learning_rate": 1.3209876543209874e-07, + "loss": 0.0, + "reward": 1.7571429535746574, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 1070 + }, + { + "completion_length": 239.50001049041748, + "epoch": 0.1797225365690096, + "grad_norm": 0.15649356816838816, + "kl": 0.0009417533874511719, + "learning_rate": 1.3234567901234568e-07, + "loss": 0.0, + "reward": 1.7910714745521545, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1072 + }, + { + "completion_length": 242.91518878936768, + "epoch": 0.18005783980887716, + "grad_norm": 0.2412609119195414, + "kl": 0.0009436607360839844, + "learning_rate": 1.325925925925926e-07, + "loss": 0.0, + "reward": 1.775000050663948, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000264495611, + "rewards/format_reward_func": 1.0, + "step": 1074 + }, + { + "completion_length": 230.6696538925171, + "epoch": 0.1803931430487447, + "grad_norm": 0.3883932440560228, + "kl": 0.0012047290802001953, + "learning_rate": 1.328395061728395e-07, + "loss": 0.0, + "reward": 1.7714286521077156, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 1076 + }, + { + "completion_length": 243.3125114440918, + "epoch": 0.18072844628861226, + "grad_norm": 0.2765865793717639, + "kl": 0.0011830329895019531, + "learning_rate": 1.330864197530864e-07, + "loss": 0.0, + "reward": 1.7928572446107864, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 1078 + }, + { + "completion_length": 242.84822463989258, + "epoch": 0.1810637495284798, + "grad_norm": 0.3631849365890873, + "kl": 0.0010890960693359375, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.0, + "reward": 1.739285796880722, + "reward_std": 0.09596448950469494, + "rewards/equation_reward_func": 0.7482143230736256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1080 + }, + { + "completion_length": 231.1919765472412, + "epoch": 0.18139905276834736, + "grad_norm": 0.12764449962903732, + "kl": 0.0010178089141845703, + "learning_rate": 1.3358024691358025e-07, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 1082 + }, + { + "completion_length": 223.36161708831787, + "epoch": 0.18173435600821494, + "grad_norm": 0.223794147157261, + "kl": 0.0009124279022216797, + "learning_rate": 1.3382716049382717e-07, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 1084 + }, + { + "completion_length": 240.0357255935669, + "epoch": 0.1820696592480825, + "grad_norm": 0.21731252308037757, + "kl": 0.0009878873825073242, + "learning_rate": 1.3407407407407405e-07, + "loss": 0.0, + "reward": 1.775000087916851, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 1086 + }, + { + "completion_length": 238.94197463989258, + "epoch": 0.18240496248795005, + "grad_norm": 0.3074918487704999, + "kl": 0.0010647773742675781, + "learning_rate": 1.34320987654321e-07, + "loss": 0.0, + "reward": 1.7214286401867867, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7303571701049805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1088 + }, + { + "completion_length": 239.14286708831787, + "epoch": 0.1827402657278176, + "grad_norm": 0.13563050504252466, + "kl": 0.001009225845336914, + "learning_rate": 1.345679012345679e-07, + "loss": 0.0, + "reward": 1.7839286252856255, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7973214648663998, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1090 + }, + { + "completion_length": 244.07590293884277, + "epoch": 0.18307556896768515, + "grad_norm": 0.35995071786593574, + "kl": 0.0010128021240234375, + "learning_rate": 1.348148148148148e-07, + "loss": 0.0, + "reward": 1.764285795390606, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1092 + }, + { + "completion_length": 247.30358600616455, + "epoch": 0.1834108722075527, + "grad_norm": 0.20962785028909983, + "kl": 0.00118255615234375, + "learning_rate": 1.3506172839506171e-07, + "loss": 0.0, + "reward": 1.7321429327130318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 1094 + }, + { + "completion_length": 238.3571538925171, + "epoch": 0.18374617544742025, + "grad_norm": 0.27384299158814857, + "kl": 0.0010139942169189453, + "learning_rate": 1.3530864197530863e-07, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 1096 + }, + { + "completion_length": 239.65179538726807, + "epoch": 0.1840814786872878, + "grad_norm": 0.28567982737530095, + "kl": 0.0012295246124267578, + "learning_rate": 1.3555555555555557e-07, + "loss": 0.0, + "reward": 1.750892922282219, + "reward_std": 0.06944798585027456, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9982142895460129, + "step": 1098 + }, + { + "completion_length": 237.58929634094238, + "epoch": 0.18441678192715538, + "grad_norm": 0.2796157535424371, + "kl": 0.001096487045288086, + "learning_rate": 1.3580246913580246e-07, + "loss": 0.0, + "reward": 1.7803571969270706, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1100 + }, + { + "completion_length": 236.9107265472412, + "epoch": 0.18475208516702293, + "grad_norm": 0.23820606951477952, + "kl": 0.0011627674102783203, + "learning_rate": 1.3604938271604937e-07, + "loss": 0.0, + "reward": 1.8071429207921028, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 1102 + }, + { + "completion_length": 238.8705472946167, + "epoch": 0.18508738840689049, + "grad_norm": 0.17300731372670122, + "kl": 0.0011518001556396484, + "learning_rate": 1.362962962962963e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 1104 + }, + { + "completion_length": 241.66965293884277, + "epoch": 0.18542269164675804, + "grad_norm": 0.27020448765160554, + "kl": 0.0010493993759155273, + "learning_rate": 1.365432098765432e-07, + "loss": 0.0, + "reward": 1.784375049173832, + "reward_std": 0.05240166233852506, + "rewards/equation_reward_func": 0.7857143320143223, + "rewards/format_reward_func": 0.9986607171595097, + "step": 1106 + }, + { + "completion_length": 243.57590579986572, + "epoch": 0.1857579948866256, + "grad_norm": 0.2713591769752031, + "kl": 0.0012111663818359375, + "learning_rate": 1.3679012345679012e-07, + "loss": 0.0, + "reward": 1.789285808801651, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 1108 + }, + { + "completion_length": 247.6339406967163, + "epoch": 0.18609329812649314, + "grad_norm": 0.3714389827370947, + "kl": 0.0012345314025878906, + "learning_rate": 1.3703703703703703e-07, + "loss": 0.0, + "reward": 1.7500000596046448, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 1110 + }, + { + "completion_length": 241.4107255935669, + "epoch": 0.1864286013663607, + "grad_norm": 0.2053685320097415, + "kl": 0.0010557174682617188, + "learning_rate": 1.3728395061728395e-07, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 1112 + }, + { + "completion_length": 233.26786994934082, + "epoch": 0.18676390460622827, + "grad_norm": 0.33413966119251975, + "kl": 0.001161813735961914, + "learning_rate": 1.3753086419753086e-07, + "loss": 0.0, + "reward": 1.7910714820027351, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357581377029, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1114 + }, + { + "completion_length": 248.7053680419922, + "epoch": 0.18709920784609582, + "grad_norm": 0.19048515531842317, + "kl": 0.0014069080352783203, + "learning_rate": 1.3777777777777778e-07, + "loss": 0.0, + "reward": 1.7214286401867867, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7303571775555611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1116 + }, + { + "completion_length": 247.49108219146729, + "epoch": 0.18743451108596337, + "grad_norm": 0.23644455559250388, + "kl": 0.0011017322540283203, + "learning_rate": 1.380246913580247e-07, + "loss": 0.0, + "reward": 1.7392857819795609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 1118 + }, + { + "completion_length": 241.5982265472412, + "epoch": 0.18776981432583092, + "grad_norm": 0.20435446836020607, + "kl": 0.0010347366333007812, + "learning_rate": 1.3827160493827158e-07, + "loss": 0.0, + "reward": 1.733928643167019, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1120 + }, + { + "completion_length": 244.29018878936768, + "epoch": 0.18810511756569848, + "grad_norm": 0.2660093786588556, + "kl": 0.001314401626586914, + "learning_rate": 1.385185185185185e-07, + "loss": 0.0, + "reward": 1.7428572326898575, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 1122 + }, + { + "completion_length": 232.94643688201904, + "epoch": 0.18844042080556603, + "grad_norm": 0.2939042070750301, + "kl": 0.0010373592376708984, + "learning_rate": 1.3876543209876544e-07, + "loss": 0.0, + "reward": 1.7714286223053932, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7803571708500385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1124 + }, + { + "completion_length": 232.59822463989258, + "epoch": 0.18877572404543358, + "grad_norm": 0.2232515295771991, + "kl": 0.0011088848114013672, + "learning_rate": 1.3901234567901235e-07, + "loss": 0.0, + "reward": 1.751785770058632, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500461935997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1126 + }, + { + "completion_length": 235.14733123779297, + "epoch": 0.18911102728530113, + "grad_norm": 0.23194326544395175, + "kl": 0.0011610984802246094, + "learning_rate": 1.3925925925925924e-07, + "loss": 0.0, + "reward": 1.7428572177886963, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1128 + }, + { + "completion_length": 235.05804634094238, + "epoch": 0.1894463305251687, + "grad_norm": 0.2536261881719766, + "kl": 0.0012238025665283203, + "learning_rate": 1.3950617283950615e-07, + "loss": 0.0, + "reward": 1.7553571984171867, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214447498322, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1130 + }, + { + "completion_length": 224.6384038925171, + "epoch": 0.18978163376503626, + "grad_norm": 0.10219346905103746, + "kl": 0.001035451889038086, + "learning_rate": 1.397530864197531e-07, + "loss": 0.0, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143230736256, + "rewards/format_reward_func": 1.0, + "step": 1132 + }, + { + "completion_length": 237.4776906967163, + "epoch": 0.1901169370049038, + "grad_norm": 0.20131262973323824, + "kl": 0.0013401508331298828, + "learning_rate": 1.4e-07, + "loss": 0.0, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 1134 + }, + { + "completion_length": 242.58929443359375, + "epoch": 0.19045224024477136, + "grad_norm": 0.3540408384031364, + "kl": 0.0012521743774414062, + "learning_rate": 1.402469135802469e-07, + "loss": 0.0, + "reward": 1.7696429193019867, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1136 + }, + { + "completion_length": 244.79911613464355, + "epoch": 0.19078754348463892, + "grad_norm": 0.34346695563107493, + "kl": 0.001157999038696289, + "learning_rate": 1.404938271604938e-07, + "loss": 0.0, + "reward": 1.7196429371833801, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071611642838, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1138 + }, + { + "completion_length": 241.477689743042, + "epoch": 0.19112284672450647, + "grad_norm": 0.20868917961928712, + "kl": 0.0011320114135742188, + "learning_rate": 1.4074074074074075e-07, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 1140 + }, + { + "completion_length": 244.14733409881592, + "epoch": 0.19145814996437402, + "grad_norm": 0.32124170225006765, + "kl": 0.0014166831970214844, + "learning_rate": 1.4098765432098764e-07, + "loss": 0.0, + "reward": 1.7107143476605415, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7196428887546062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1142 + }, + { + "completion_length": 236.6116180419922, + "epoch": 0.1917934532042416, + "grad_norm": 0.3877598831982432, + "kl": 0.0013773441314697266, + "learning_rate": 1.4123456790123456e-07, + "loss": 0.0, + "reward": 1.7232143878936768, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1144 + }, + { + "completion_length": 241.6562623977661, + "epoch": 0.19212875644410915, + "grad_norm": 0.2101486308792021, + "kl": 0.0011439323425292969, + "learning_rate": 1.4148148148148147e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 1146 + }, + { + "completion_length": 238.7857265472412, + "epoch": 0.1924640596839767, + "grad_norm": 0.24726688519808554, + "kl": 0.0011260509490966797, + "learning_rate": 1.417283950617284e-07, + "loss": 0.0, + "reward": 1.8035714849829674, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714477300644, + "rewards/format_reward_func": 1.0, + "step": 1148 + }, + { + "completion_length": 237.46429634094238, + "epoch": 0.19279936292384425, + "grad_norm": 0.24592047025377717, + "kl": 0.001191854476928711, + "learning_rate": 1.419753086419753e-07, + "loss": 0.0, + "reward": 1.7107143849134445, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143141329288, + "rewards/format_reward_func": 1.0, + "step": 1150 + }, + { + "completion_length": 237.977689743042, + "epoch": 0.1931346661637118, + "grad_norm": 0.3134071733947118, + "kl": 0.001195669174194336, + "learning_rate": 1.4222222222222222e-07, + "loss": 0.0, + "reward": 1.8178571984171867, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.8267857357859612, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1152 + }, + { + "completion_length": 251.6651906967163, + "epoch": 0.19346996940357936, + "grad_norm": 0.25882729527590953, + "kl": 0.0014595985412597656, + "learning_rate": 1.4246913580246913e-07, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7660714462399483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1154 + }, + { + "completion_length": 239.1428689956665, + "epoch": 0.1938052726434469, + "grad_norm": 0.22827723141273715, + "kl": 0.0012450218200683594, + "learning_rate": 1.4271604938271602e-07, + "loss": 0.0, + "reward": 1.8142857626080513, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857290804386, + "rewards/format_reward_func": 1.0, + "step": 1156 + }, + { + "completion_length": 227.54911613464355, + "epoch": 0.19414057588331446, + "grad_norm": 0.2404619261901875, + "kl": 0.0012445449829101562, + "learning_rate": 1.4296296296296296e-07, + "loss": 0.0, + "reward": 1.7464286535978317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 1158 + }, + { + "completion_length": 239.5759038925171, + "epoch": 0.19447587912318204, + "grad_norm": 0.22192693701997795, + "kl": 0.0011388063430786133, + "learning_rate": 1.4320987654320988e-07, + "loss": 0.0, + "reward": 1.7714286521077156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 1160 + }, + { + "completion_length": 240.5357265472412, + "epoch": 0.1948111823630496, + "grad_norm": 0.29006089497662224, + "kl": 0.0012791156768798828, + "learning_rate": 1.434567901234568e-07, + "loss": 0.0, + "reward": 1.7678571939468384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571809083223, + "rewards/format_reward_func": 1.0, + "step": 1162 + }, + { + "completion_length": 232.29465293884277, + "epoch": 0.19514648560291714, + "grad_norm": 0.33240946056531395, + "kl": 0.0012426376342773438, + "learning_rate": 1.4370370370370368e-07, + "loss": 0.0, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 1164 + }, + { + "completion_length": 238.4196538925171, + "epoch": 0.1954817888427847, + "grad_norm": 0.27403505123708316, + "kl": 0.0012884140014648438, + "learning_rate": 1.4395061728395062e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000201165676, + "rewards/format_reward_func": 1.0, + "step": 1166 + }, + { + "completion_length": 237.09822368621826, + "epoch": 0.19581709208265224, + "grad_norm": 0.4095224412410846, + "kl": 0.0012373924255371094, + "learning_rate": 1.4419753086419753e-07, + "loss": 0.0, + "reward": 1.8107143267989159, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143249362707, + "rewards/format_reward_func": 1.0, + "step": 1168 + }, + { + "completion_length": 241.94197273254395, + "epoch": 0.1961523953225198, + "grad_norm": 0.4084487835326827, + "kl": 0.0016338825225830078, + "learning_rate": 1.4444444444444442e-07, + "loss": 0.0, + "reward": 1.7625000700354576, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1170 + }, + { + "completion_length": 230.22322368621826, + "epoch": 0.19648769856238735, + "grad_norm": 0.22951686862873494, + "kl": 0.0014760494232177734, + "learning_rate": 1.4469135802469134e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 1172 + }, + { + "completion_length": 234.08483219146729, + "epoch": 0.19682300180225493, + "grad_norm": 0.4243391288107956, + "kl": 0.001817464828491211, + "learning_rate": 1.4493827160493828e-07, + "loss": 0.0, + "reward": 1.7357143759727478, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7446428872644901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1174 + }, + { + "completion_length": 238.15625858306885, + "epoch": 0.19715830504212248, + "grad_norm": 0.40595547340881505, + "kl": 0.0014379024505615234, + "learning_rate": 1.451851851851852e-07, + "loss": 0.0, + "reward": 1.6946429386734962, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7080357484519482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1176 + }, + { + "completion_length": 236.58483123779297, + "epoch": 0.19749360828199003, + "grad_norm": 0.20405546121413234, + "kl": 0.0012536048889160156, + "learning_rate": 1.4543209876543208e-07, + "loss": 0.0, + "reward": 1.7214286774396896, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 1178 + }, + { + "completion_length": 241.3884048461914, + "epoch": 0.19782891152185758, + "grad_norm": 0.531077990728977, + "kl": 0.001644134521484375, + "learning_rate": 1.45679012345679e-07, + "loss": 0.0, + "reward": 1.7196429371833801, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1180 + }, + { + "completion_length": 236.0535831451416, + "epoch": 0.19816421476172513, + "grad_norm": 0.16600747936281435, + "kl": 0.001375436782836914, + "learning_rate": 1.459259259259259e-07, + "loss": 0.0, + "reward": 1.7071429193019867, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071428894996643, + "rewards/format_reward_func": 1.0, + "step": 1182 + }, + { + "completion_length": 239.51786613464355, + "epoch": 0.19849951800159268, + "grad_norm": 0.18321459355862832, + "kl": 0.0015826225280761719, + "learning_rate": 1.4617283950617285e-07, + "loss": 0.0, + "reward": 1.76607146859169, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1184 + }, + { + "completion_length": 234.85268688201904, + "epoch": 0.19883482124146024, + "grad_norm": 0.2741705293832859, + "kl": 0.002036571502685547, + "learning_rate": 1.4641975308641974e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143055647612, + "rewards/format_reward_func": 1.0, + "step": 1186 + }, + { + "completion_length": 242.77679443359375, + "epoch": 0.1991701244813278, + "grad_norm": 0.2830046064019713, + "kl": 0.0015716552734375, + "learning_rate": 1.4666666666666666e-07, + "loss": 0.0, + "reward": 1.6839286535978317, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.6883928831666708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1188 + }, + { + "completion_length": 245.2276906967163, + "epoch": 0.19950542772119537, + "grad_norm": 0.3715662769891078, + "kl": 0.0017573833465576172, + "learning_rate": 1.4691358024691357e-07, + "loss": 0.0, + "reward": 1.7375000789761543, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1190 + }, + { + "completion_length": 244.12054538726807, + "epoch": 0.19984073096106292, + "grad_norm": 0.27907119303296496, + "kl": 0.0016562938690185547, + "learning_rate": 1.4716049382716049e-07, + "loss": 0.0, + "reward": 1.8035714849829674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714402794838, + "rewards/format_reward_func": 1.0, + "step": 1192 + }, + { + "completion_length": 243.1205472946167, + "epoch": 0.20017603420093047, + "grad_norm": 0.45567982927097334, + "kl": 0.0018799304962158203, + "learning_rate": 1.474074074074074e-07, + "loss": 0.0, + "reward": 1.7321429401636124, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1194 + }, + { + "completion_length": 232.75447368621826, + "epoch": 0.20051133744079802, + "grad_norm": 0.21608914943333704, + "kl": 0.0015969276428222656, + "learning_rate": 1.4765432098765432e-07, + "loss": 0.0, + "reward": 1.8000000417232513, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 1196 + }, + { + "completion_length": 241.44197368621826, + "epoch": 0.20084664068066557, + "grad_norm": 0.26748599126218425, + "kl": 0.0018237829208374023, + "learning_rate": 1.4790123456790123e-07, + "loss": 0.0, + "reward": 1.814285784959793, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 1198 + }, + { + "completion_length": 228.45983219146729, + "epoch": 0.20118194392053312, + "grad_norm": 0.28378913534322847, + "kl": 0.0016875267028808594, + "learning_rate": 1.4814814814814815e-07, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 1200 + }, + { + "completion_length": 241.91519165039062, + "epoch": 0.20151724716040068, + "grad_norm": 0.2504716674270821, + "kl": 0.0017731189727783203, + "learning_rate": 1.4839506172839506e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1202 + }, + { + "completion_length": 243.80804538726807, + "epoch": 0.20185255040026825, + "grad_norm": 0.33704098188644993, + "kl": 0.002542257308959961, + "learning_rate": 1.4864197530864197e-07, + "loss": 0.0, + "reward": 1.7053572237491608, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7098214663565159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1204 + }, + { + "completion_length": 234.44643878936768, + "epoch": 0.2021878536401358, + "grad_norm": 0.2747468585504003, + "kl": 0.0015654563903808594, + "learning_rate": 1.4888888888888886e-07, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964285872876644, + "rewards/format_reward_func": 1.0, + "step": 1206 + }, + { + "completion_length": 237.40625953674316, + "epoch": 0.20252315688000336, + "grad_norm": 0.24314632357287652, + "kl": 0.001814126968383789, + "learning_rate": 1.4913580246913578e-07, + "loss": 0.0, + "reward": 1.7107143551111221, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7107143215835094, + "rewards/format_reward_func": 1.0, + "step": 1208 + }, + { + "completion_length": 239.18304824829102, + "epoch": 0.2028584601198709, + "grad_norm": 0.24318172658651124, + "kl": 0.002092123031616211, + "learning_rate": 1.4938271604938272e-07, + "loss": 0.0, + "reward": 1.828571505844593, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 1210 + }, + { + "completion_length": 238.27679824829102, + "epoch": 0.20319376335973846, + "grad_norm": 0.32728322145173633, + "kl": 0.003085613250732422, + "learning_rate": 1.4962962962962963e-07, + "loss": 0.0, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1212 + }, + { + "completion_length": 240.79465198516846, + "epoch": 0.203529066599606, + "grad_norm": 0.283836659998682, + "kl": 0.0021562576293945312, + "learning_rate": 1.4987654320987652e-07, + "loss": 0.0, + "reward": 1.716071493923664, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7294643130153418, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1214 + }, + { + "completion_length": 244.0491180419922, + "epoch": 0.20386436983947356, + "grad_norm": 0.26081031140969885, + "kl": 0.0022537708282470703, + "learning_rate": 1.5012345679012344e-07, + "loss": 0.0, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857189849019, + "rewards/format_reward_func": 1.0, + "step": 1216 + }, + { + "completion_length": 241.91072845458984, + "epoch": 0.20419967307934114, + "grad_norm": 0.30485410684658837, + "kl": 0.0026428699493408203, + "learning_rate": 1.5037037037037038e-07, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7732143215835094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1218 + }, + { + "completion_length": 241.0669755935669, + "epoch": 0.2045349763192087, + "grad_norm": 0.2265319246913318, + "kl": 0.0030205249786376953, + "learning_rate": 1.506172839506173e-07, + "loss": 0.0, + "reward": 1.8232143446803093, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8276785835623741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1220 + }, + { + "completion_length": 238.30358028411865, + "epoch": 0.20487027955907625, + "grad_norm": 0.2382537179736265, + "kl": 0.0023665428161621094, + "learning_rate": 1.5086419753086418e-07, + "loss": 0.0, + "reward": 1.8017857819795609, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1222 + }, + { + "completion_length": 245.17411708831787, + "epoch": 0.2052055827989438, + "grad_norm": 0.3324516681199202, + "kl": 0.0020515918731689453, + "learning_rate": 1.511111111111111e-07, + "loss": 0.0, + "reward": 1.7857143357396126, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 1224 + }, + { + "completion_length": 239.4196548461914, + "epoch": 0.20554088603881135, + "grad_norm": 0.25251488896253566, + "kl": 0.0025861263275146484, + "learning_rate": 1.5135802469135804e-07, + "loss": 0.0, + "reward": 1.7250000983476639, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 1226 + }, + { + "completion_length": 236.84822463989258, + "epoch": 0.2058761892786789, + "grad_norm": 0.23428687641725407, + "kl": 0.0025599002838134766, + "learning_rate": 1.5160493827160493e-07, + "loss": 0.0, + "reward": 1.6964286267757416, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6964286118745804, + "rewards/format_reward_func": 1.0, + "step": 1228 + }, + { + "completion_length": 252.34375953674316, + "epoch": 0.20621149251854645, + "grad_norm": 0.32242002681200926, + "kl": 0.0028755664825439453, + "learning_rate": 1.5185185185185184e-07, + "loss": 0.0, + "reward": 1.7125000804662704, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643137603998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1230 + }, + { + "completion_length": 242.9107265472412, + "epoch": 0.206546795758414, + "grad_norm": 0.24004483575380084, + "kl": 0.002190113067626953, + "learning_rate": 1.5209876543209876e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339285984635353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1232 + }, + { + "completion_length": 237.33036708831787, + "epoch": 0.20688209899828158, + "grad_norm": 0.24368559931981323, + "kl": 0.0017604827880859375, + "learning_rate": 1.523456790123457e-07, + "loss": 0.0, + "reward": 1.7660715132951736, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1234 + }, + { + "completion_length": 231.47322368621826, + "epoch": 0.20721740223814913, + "grad_norm": 0.5328530490006232, + "kl": 0.0025615692138671875, + "learning_rate": 1.5259259259259259e-07, + "loss": 0.0, + "reward": 1.7785715162754059, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 1236 + }, + { + "completion_length": 240.35268878936768, + "epoch": 0.20755270547801669, + "grad_norm": 0.18652124710992135, + "kl": 0.002346515655517578, + "learning_rate": 1.528395061728395e-07, + "loss": 0.0, + "reward": 1.776785783469677, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1238 + }, + { + "completion_length": 236.0446548461914, + "epoch": 0.20788800871788424, + "grad_norm": 0.30917406808952935, + "kl": 0.0021347999572753906, + "learning_rate": 1.5308641975308642e-07, + "loss": 0.0, + "reward": 1.7455357909202576, + "reward_std": 0.07702413015067577, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9937500059604645, + "step": 1240 + }, + { + "completion_length": 246.4285831451416, + "epoch": 0.2082233119577518, + "grad_norm": 0.24049563158557588, + "kl": 0.004585981369018555, + "learning_rate": 1.533333333333333e-07, + "loss": 0.0, + "reward": 1.7035715207457542, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7035714723169804, + "rewards/format_reward_func": 1.0, + "step": 1242 + }, + { + "completion_length": 252.42858123779297, + "epoch": 0.20855861519761934, + "grad_norm": 0.28232304155666915, + "kl": 0.004839181900024414, + "learning_rate": 1.5358024691358024e-07, + "loss": 0.0, + "reward": 1.8035715073347092, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.8125000149011612, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1244 + }, + { + "completion_length": 237.1562614440918, + "epoch": 0.2088939184374869, + "grad_norm": 0.252566996772104, + "kl": 0.0032820701599121094, + "learning_rate": 1.5382716049382716e-07, + "loss": 0.0, + "reward": 1.8035714626312256, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 1246 + }, + { + "completion_length": 224.24554538726807, + "epoch": 0.20922922167735447, + "grad_norm": 0.4379151122153357, + "kl": 0.0019059181213378906, + "learning_rate": 1.5407407407407407e-07, + "loss": 0.0, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 1248 + }, + { + "completion_length": 243.2366189956665, + "epoch": 0.20956452491722202, + "grad_norm": 0.2435262045125475, + "kl": 0.0021789073944091797, + "learning_rate": 1.5432098765432096e-07, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7839285843074322, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1250 + }, + { + "completion_length": 240.821439743042, + "epoch": 0.20989982815708957, + "grad_norm": 0.3508141244807139, + "kl": 0.002946138381958008, + "learning_rate": 1.545679012345679e-07, + "loss": 0.0, + "reward": 1.6964286416769028, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6964286155998707, + "rewards/format_reward_func": 1.0, + "step": 1252 + }, + { + "completion_length": 247.0669755935669, + "epoch": 0.21023513139695713, + "grad_norm": 0.24180904017268992, + "kl": 0.002747058868408203, + "learning_rate": 1.5481481481481482e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7946428805589676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1254 + }, + { + "completion_length": 229.96875858306885, + "epoch": 0.21057043463682468, + "grad_norm": 0.2411987768476652, + "kl": 0.002772808074951172, + "learning_rate": 1.550617283950617e-07, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 1256 + }, + { + "completion_length": 236.8303680419922, + "epoch": 0.21090573787669223, + "grad_norm": 0.32063347431687766, + "kl": 0.002824068069458008, + "learning_rate": 1.5530864197530862e-07, + "loss": 0.0, + "reward": 1.7892857789993286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 1258 + }, + { + "completion_length": 241.53125953674316, + "epoch": 0.21124104111655978, + "grad_norm": 0.31144432521616555, + "kl": 0.004393339157104492, + "learning_rate": 1.5555555555555556e-07, + "loss": 0.0, + "reward": 1.782142885029316, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821429073810577, + "rewards/format_reward_func": 1.0, + "step": 1260 + }, + { + "completion_length": 238.9642972946167, + "epoch": 0.21157634435642733, + "grad_norm": 0.29715450473241195, + "kl": 0.004702568054199219, + "learning_rate": 1.5580246913580248e-07, + "loss": 0.0, + "reward": 1.7357143610715866, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143312692642, + "rewards/format_reward_func": 1.0, + "step": 1262 + }, + { + "completion_length": 241.18751049041748, + "epoch": 0.2119116475962949, + "grad_norm": 0.1951487271047744, + "kl": 0.0032672882080078125, + "learning_rate": 1.5604938271604937e-07, + "loss": 0.0, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 1264 + }, + { + "completion_length": 241.1384038925171, + "epoch": 0.21224695083616246, + "grad_norm": 0.24012888678173097, + "kl": 0.0034089088439941406, + "learning_rate": 1.5629629629629628e-07, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 1266 + }, + { + "completion_length": 232.91965293884277, + "epoch": 0.21258225407603001, + "grad_norm": 0.25773928795116036, + "kl": 0.003378152847290039, + "learning_rate": 1.565432098765432e-07, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.8053571693599224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1268 + }, + { + "completion_length": 243.12054920196533, + "epoch": 0.21291755731589757, + "grad_norm": 0.17985039137077788, + "kl": 0.003942012786865234, + "learning_rate": 1.5679012345679014e-07, + "loss": 0.0, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786010712385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1270 + }, + { + "completion_length": 236.90626049041748, + "epoch": 0.21325286055576512, + "grad_norm": 0.1548271078866053, + "kl": 0.005479335784912109, + "learning_rate": 1.5703703703703703e-07, + "loss": 0.0, + "reward": 1.7303572297096252, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7348214648663998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1272 + }, + { + "completion_length": 241.95536708831787, + "epoch": 0.21358816379563267, + "grad_norm": 0.27982895939363556, + "kl": 0.004017353057861328, + "learning_rate": 1.5728395061728394e-07, + "loss": 0.0, + "reward": 1.79642865806818, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 1274 + }, + { + "completion_length": 246.4598331451416, + "epoch": 0.21392346703550022, + "grad_norm": 0.24055112645480836, + "kl": 0.00449061393737793, + "learning_rate": 1.5753086419753086e-07, + "loss": 0.0, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 1276 + }, + { + "completion_length": 242.5312623977661, + "epoch": 0.2142587702753678, + "grad_norm": 0.19247495138177573, + "kl": 0.002553701400756836, + "learning_rate": 1.5777777777777777e-07, + "loss": 0.0, + "reward": 1.8089286163449287, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8133928887546062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1278 + }, + { + "completion_length": 242.32144260406494, + "epoch": 0.21459407351523535, + "grad_norm": 0.1627487076419372, + "kl": 0.0036182403564453125, + "learning_rate": 1.5802469135802468e-07, + "loss": 0.0, + "reward": 1.7607143372297287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 1280 + }, + { + "completion_length": 240.27233123779297, + "epoch": 0.2149293767551029, + "grad_norm": 0.22177047448633702, + "kl": 0.005349874496459961, + "learning_rate": 1.582716049382716e-07, + "loss": 0.0, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 1282 + }, + { + "completion_length": 234.53572463989258, + "epoch": 0.21526467999497045, + "grad_norm": 0.2958328217477393, + "kl": 0.004651069641113281, + "learning_rate": 1.5851851851851851e-07, + "loss": 0.0, + "reward": 1.7214286476373672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 1284 + }, + { + "completion_length": 241.8080472946167, + "epoch": 0.215599983234838, + "grad_norm": 0.33299459227609995, + "kl": 0.004356861114501953, + "learning_rate": 1.5876543209876543e-07, + "loss": 0.0, + "reward": 1.7285714969038963, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7375000417232513, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1286 + }, + { + "completion_length": 235.60268878936768, + "epoch": 0.21593528647470556, + "grad_norm": 0.35271283163499656, + "kl": 0.005709171295166016, + "learning_rate": 1.5901234567901234e-07, + "loss": 0.0, + "reward": 1.7089286595582962, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7133928686380386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1288 + }, + { + "completion_length": 244.56251049041748, + "epoch": 0.2162705897145731, + "grad_norm": 0.1859424366173882, + "kl": 0.003217935562133789, + "learning_rate": 1.5925925925925926e-07, + "loss": 0.0, + "reward": 1.7625000476837158, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7669642996042967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1290 + }, + { + "completion_length": 240.76786708831787, + "epoch": 0.21660589295444066, + "grad_norm": 0.24532934625705524, + "kl": 0.0031633377075195312, + "learning_rate": 1.5950617283950615e-07, + "loss": 0.0, + "reward": 1.6625000908970833, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.6669643167406321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1292 + }, + { + "completion_length": 243.30358505249023, + "epoch": 0.21694119619430824, + "grad_norm": 0.2586537390088073, + "kl": 0.002529144287109375, + "learning_rate": 1.5975308641975306e-07, + "loss": 0.0, + "reward": 1.776785783469677, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1294 + }, + { + "completion_length": 234.05357933044434, + "epoch": 0.2172764994341758, + "grad_norm": 0.20464408868137368, + "kl": 0.006105184555053711, + "learning_rate": 1.6e-07, + "loss": 0.0, + "reward": 1.7696429193019867, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1296 + }, + { + "completion_length": 243.42858123779297, + "epoch": 0.21761180267404334, + "grad_norm": 0.4311301072230461, + "kl": 0.004184722900390625, + "learning_rate": 1.6024691358024692e-07, + "loss": 0.0, + "reward": 1.7160714864730835, + "reward_std": 0.11364215798676014, + "rewards/equation_reward_func": 0.7294643148779869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1298 + }, + { + "completion_length": 233.65626049041748, + "epoch": 0.2179471059139109, + "grad_norm": 0.5118573409819533, + "kl": 0.006073713302612305, + "learning_rate": 1.604938271604938e-07, + "loss": 0.0, + "reward": 1.7464286535978317, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7553571686148643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1300 + }, + { + "completion_length": 242.00894165039062, + "epoch": 0.21828240915377844, + "grad_norm": 0.18901941957665191, + "kl": 0.004821300506591797, + "learning_rate": 1.6074074074074072e-07, + "loss": 0.0, + "reward": 1.7214286476373672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 1302 + }, + { + "completion_length": 243.61608123779297, + "epoch": 0.218617712393646, + "grad_norm": 0.2868402947325543, + "kl": 0.004178762435913086, + "learning_rate": 1.6098765432098766e-07, + "loss": 0.0, + "reward": 1.7732143253087997, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776785865426064, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1304 + }, + { + "completion_length": 231.63840293884277, + "epoch": 0.21895301563351355, + "grad_norm": 0.28362586224310077, + "kl": 0.0027713775634765625, + "learning_rate": 1.6123456790123455e-07, + "loss": 0.0, + "reward": 1.7857143357396126, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7857143357396126, + "rewards/format_reward_func": 1.0, + "step": 1306 + }, + { + "completion_length": 244.44643783569336, + "epoch": 0.21928831887338113, + "grad_norm": 0.2076709051554259, + "kl": 0.0029850006103515625, + "learning_rate": 1.6148148148148147e-07, + "loss": 0.0, + "reward": 1.703571505844593, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7125000357627869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1308 + }, + { + "completion_length": 233.55358219146729, + "epoch": 0.21962362211324868, + "grad_norm": 0.25018701924401665, + "kl": 0.0067141056060791016, + "learning_rate": 1.6172839506172838e-07, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 1310 + }, + { + "completion_length": 239.6026906967163, + "epoch": 0.21995892535311623, + "grad_norm": 0.3511161464206752, + "kl": 0.00603485107421875, + "learning_rate": 1.6197530864197532e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 1312 + }, + { + "completion_length": 240.23215579986572, + "epoch": 0.22029422859298378, + "grad_norm": 0.28800201598541564, + "kl": 0.003075122833251953, + "learning_rate": 1.622222222222222e-07, + "loss": 0.0, + "reward": 1.7714286670088768, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7803571633994579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1314 + }, + { + "completion_length": 234.04019165039062, + "epoch": 0.22062953183285133, + "grad_norm": 0.2676481126389581, + "kl": 0.004632472991943359, + "learning_rate": 1.6246913580246912e-07, + "loss": 0.0, + "reward": 1.7339286282658577, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383929006755352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1316 + }, + { + "completion_length": 244.27679443359375, + "epoch": 0.22096483507271888, + "grad_norm": 0.3087973565894431, + "kl": 0.010606765747070312, + "learning_rate": 1.6271604938271604e-07, + "loss": 0.0, + "reward": 1.6928572058677673, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7017857395112514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1318 + }, + { + "completion_length": 241.70983123779297, + "epoch": 0.22130013831258644, + "grad_norm": 0.25034205125370274, + "kl": 0.00440216064453125, + "learning_rate": 1.6296296296296298e-07, + "loss": 0.0, + "reward": 1.7589286267757416, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7723214589059353, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1320 + }, + { + "completion_length": 234.6785831451416, + "epoch": 0.221635441552454, + "grad_norm": 0.41038167641507267, + "kl": 0.00315093994140625, + "learning_rate": 1.6320987654320987e-07, + "loss": 0.0, + "reward": 1.7392857819795609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 1322 + }, + { + "completion_length": 245.37947368621826, + "epoch": 0.22197074479232157, + "grad_norm": 0.806420438279851, + "kl": 0.009297370910644531, + "learning_rate": 1.6345679012345678e-07, + "loss": 0.0, + "reward": 1.7625000551342964, + "reward_std": 0.0732360603287816, + "rewards/equation_reward_func": 0.7758928723633289, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1324 + }, + { + "completion_length": 229.21429538726807, + "epoch": 0.22230604803218912, + "grad_norm": 0.29786161917524756, + "kl": 0.004057168960571289, + "learning_rate": 1.637037037037037e-07, + "loss": 0.0, + "reward": 1.7857143580913544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 1326 + }, + { + "completion_length": 241.70090293884277, + "epoch": 0.22264135127205667, + "grad_norm": 0.2705780895983189, + "kl": 0.0032672882080078125, + "learning_rate": 1.639506172839506e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 1328 + }, + { + "completion_length": 244.21429538726807, + "epoch": 0.22297665451192422, + "grad_norm": 0.24169684934286853, + "kl": 0.003974437713623047, + "learning_rate": 1.6419753086419753e-07, + "loss": 0.0, + "reward": 1.771428644657135, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 1330 + }, + { + "completion_length": 236.27679634094238, + "epoch": 0.22331195775179177, + "grad_norm": 0.2513951306851172, + "kl": 0.003792285919189453, + "learning_rate": 1.6444444444444444e-07, + "loss": 0.0, + "reward": 1.7232143506407738, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1332 + }, + { + "completion_length": 239.6830472946167, + "epoch": 0.22364726099165932, + "grad_norm": 0.26017730409226486, + "kl": 0.006929159164428711, + "learning_rate": 1.6469135802469136e-07, + "loss": 0.0, + "reward": 1.8214286267757416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 1334 + }, + { + "completion_length": 240.4062614440918, + "epoch": 0.22398256423152688, + "grad_norm": 0.33342545268150153, + "kl": 0.007687568664550781, + "learning_rate": 1.6493827160493825e-07, + "loss": 0.0, + "reward": 1.765178643167019, + "reward_std": 0.03914341004565358, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9982142895460129, + "step": 1336 + }, + { + "completion_length": 238.1294755935669, + "epoch": 0.22431786747139446, + "grad_norm": 0.28918367802683476, + "kl": 0.003994941711425781, + "learning_rate": 1.651851851851852e-07, + "loss": 0.0, + "reward": 1.7035715281963348, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714536905289, + "rewards/format_reward_func": 1.0, + "step": 1338 + }, + { + "completion_length": 237.70090103149414, + "epoch": 0.224653170711262, + "grad_norm": 0.2494069780734928, + "kl": 0.011424541473388672, + "learning_rate": 1.654320987654321e-07, + "loss": 0.0, + "reward": 1.741071492433548, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7455357518047094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1340 + }, + { + "completion_length": 237.7991180419922, + "epoch": 0.22498847395112956, + "grad_norm": 0.303123570931485, + "kl": 0.003299713134765625, + "learning_rate": 1.65679012345679e-07, + "loss": 0.0, + "reward": 1.6857143640518188, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.6946428939700127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1342 + }, + { + "completion_length": 245.04911708831787, + "epoch": 0.2253237771909971, + "grad_norm": 0.2523171541215057, + "kl": 0.005613803863525391, + "learning_rate": 1.659259259259259e-07, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7660714723169804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1344 + }, + { + "completion_length": 243.0937614440918, + "epoch": 0.22565908043086466, + "grad_norm": 0.14018283087264063, + "kl": 0.0050811767578125, + "learning_rate": 1.6617283950617285e-07, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000443309546, + "rewards/format_reward_func": 1.0, + "step": 1346 + }, + { + "completion_length": 241.71429443359375, + "epoch": 0.2259943836707322, + "grad_norm": 0.41487774404342265, + "kl": 0.005957126617431641, + "learning_rate": 1.6641975308641976e-07, + "loss": 0.0, + "reward": 1.7410714998841286, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7455357573926449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1348 + }, + { + "completion_length": 230.00893878936768, + "epoch": 0.22632968691059976, + "grad_norm": 0.2806775670690932, + "kl": 0.00467681884765625, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7803571708500385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1350 + }, + { + "completion_length": 231.69643878936768, + "epoch": 0.22666499015046732, + "grad_norm": 0.24691130694143, + "kl": 0.005158901214599609, + "learning_rate": 1.6691358024691357e-07, + "loss": 0.0, + "reward": 1.7321429401636124, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7321428805589676, + "rewards/format_reward_func": 1.0, + "step": 1352 + }, + { + "completion_length": 240.7857255935669, + "epoch": 0.2270002933903349, + "grad_norm": 0.24994952871151663, + "kl": 0.005618095397949219, + "learning_rate": 1.6716049382716048e-07, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428921073675, + "rewards/format_reward_func": 1.0, + "step": 1354 + }, + { + "completion_length": 248.33929824829102, + "epoch": 0.22733559663020245, + "grad_norm": 0.09872595517896748, + "kl": 0.009372234344482422, + "learning_rate": 1.674074074074074e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7267857305705547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1356 + }, + { + "completion_length": 245.62054634094238, + "epoch": 0.22767089987007, + "grad_norm": 0.2373904924245633, + "kl": 0.008755207061767578, + "learning_rate": 1.676543209876543e-07, + "loss": 0.0, + "reward": 1.710714377462864, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7107143122702837, + "rewards/format_reward_func": 1.0, + "step": 1358 + }, + { + "completion_length": 234.75893878936768, + "epoch": 0.22800620310993755, + "grad_norm": 0.20854539089599378, + "kl": 0.007511138916015625, + "learning_rate": 1.6790123456790122e-07, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 1360 + }, + { + "completion_length": 234.74108028411865, + "epoch": 0.2283415063498051, + "grad_norm": 0.28694424946821906, + "kl": 0.0039539337158203125, + "learning_rate": 1.6814814814814814e-07, + "loss": 0.0, + "reward": 1.7000000700354576, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7000000551342964, + "rewards/format_reward_func": 1.0, + "step": 1362 + }, + { + "completion_length": 241.7366189956665, + "epoch": 0.22867680958967265, + "grad_norm": 0.3141697765365759, + "kl": 0.010674476623535156, + "learning_rate": 1.6839506172839505e-07, + "loss": 0.0, + "reward": 1.8071429431438446, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 1364 + }, + { + "completion_length": 239.72322368621826, + "epoch": 0.2290121128295402, + "grad_norm": 0.2671441737251063, + "kl": 0.005108356475830078, + "learning_rate": 1.6864197530864197e-07, + "loss": 0.0, + "reward": 1.7696429267525673, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1366 + }, + { + "completion_length": 228.58483219146729, + "epoch": 0.22934741606940778, + "grad_norm": 0.21440951279482529, + "kl": 0.003632068634033203, + "learning_rate": 1.6888888888888888e-07, + "loss": 0.0, + "reward": 1.74642863124609, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 1368 + }, + { + "completion_length": 235.47322463989258, + "epoch": 0.22968271930927533, + "grad_norm": 0.20954095826447186, + "kl": 0.008241653442382812, + "learning_rate": 1.691358024691358e-07, + "loss": 0.0, + "reward": 1.7107143700122833, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.710714315995574, + "rewards/format_reward_func": 1.0, + "step": 1370 + }, + { + "completion_length": 237.51786994934082, + "epoch": 0.2300180225491429, + "grad_norm": 0.17949433295497624, + "kl": 0.007944583892822266, + "learning_rate": 1.693827160493827e-07, + "loss": 0.0, + "reward": 1.7250000536441803, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.72500004991889, + "rewards/format_reward_func": 1.0, + "step": 1372 + }, + { + "completion_length": 239.4687614440918, + "epoch": 0.23035332578901044, + "grad_norm": 0.3259964424478922, + "kl": 0.004845619201660156, + "learning_rate": 1.6962962962962963e-07, + "loss": 0.0, + "reward": 1.7285715192556381, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 1374 + }, + { + "completion_length": 241.49108409881592, + "epoch": 0.230688629028878, + "grad_norm": 0.27244331118122284, + "kl": 0.009942054748535156, + "learning_rate": 1.6987654320987654e-07, + "loss": 0.0, + "reward": 1.7714286595582962, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 1376 + }, + { + "completion_length": 236.321439743042, + "epoch": 0.23102393226874554, + "grad_norm": 0.07781785904774369, + "kl": 0.0042362213134765625, + "learning_rate": 1.7012345679012343e-07, + "loss": 0.0, + "reward": 1.7178572192788124, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7258928790688515, + "rewards/format_reward_func": 0.9919642955064774, + "step": 1378 + }, + { + "completion_length": 237.6562623977661, + "epoch": 0.2313592355086131, + "grad_norm": 0.3306974061678042, + "kl": 0.02078533172607422, + "learning_rate": 1.7037037037037035e-07, + "loss": 0.0, + "reward": 1.7660714909434319, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1380 + }, + { + "completion_length": 234.9107265472412, + "epoch": 0.23169453874848064, + "grad_norm": 0.15438550905304013, + "kl": 0.010261058807373047, + "learning_rate": 1.706172839506173e-07, + "loss": 0.0, + "reward": 1.7000000476837158, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7089286036789417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1382 + }, + { + "completion_length": 243.08483505249023, + "epoch": 0.23202984198834822, + "grad_norm": 0.26681977267282586, + "kl": 0.017525196075439453, + "learning_rate": 1.708641975308642e-07, + "loss": 0.0, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857633531094, + "rewards/format_reward_func": 1.0, + "step": 1384 + }, + { + "completion_length": 235.85715103149414, + "epoch": 0.23236514522821577, + "grad_norm": 0.2000816579266356, + "kl": 0.01715850830078125, + "learning_rate": 1.711111111111111e-07, + "loss": 0.0, + "reward": 1.7321429327130318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 1.0, + "step": 1386 + }, + { + "completion_length": 242.9017972946167, + "epoch": 0.23270044846808333, + "grad_norm": 0.22182251115978738, + "kl": 0.006072044372558594, + "learning_rate": 1.71358024691358e-07, + "loss": 0.0, + "reward": 1.7125000655651093, + "reward_std": 0.09343911055475473, + "rewards/equation_reward_func": 0.7258928790688515, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1388 + }, + { + "completion_length": 237.6830472946167, + "epoch": 0.23303575170795088, + "grad_norm": 0.14505985813023767, + "kl": 0.006579399108886719, + "learning_rate": 1.7160493827160495e-07, + "loss": 0.0, + "reward": 1.830357201397419, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8348214477300644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1390 + }, + { + "completion_length": 221.95536518096924, + "epoch": 0.23337105494781843, + "grad_norm": 0.3887991399283552, + "kl": 0.009667396545410156, + "learning_rate": 1.7185185185185183e-07, + "loss": 0.0, + "reward": 1.7843750938773155, + "reward_std": 0.06250318652018905, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 0.9986607171595097, + "step": 1392 + }, + { + "completion_length": 239.72322750091553, + "epoch": 0.23370635818768598, + "grad_norm": 0.31497940932668206, + "kl": 0.00586700439453125, + "learning_rate": 1.7209876543209875e-07, + "loss": 0.0, + "reward": 1.778571493923664, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714417695999, + "rewards/format_reward_func": 1.0, + "step": 1394 + }, + { + "completion_length": 237.57143783569336, + "epoch": 0.23404166142755353, + "grad_norm": 0.1722442961054919, + "kl": 0.01059722900390625, + "learning_rate": 1.7234567901234566e-07, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 1396 + }, + { + "completion_length": 241.54018878936768, + "epoch": 0.2343769646674211, + "grad_norm": 0.23289390607293414, + "kl": 0.016029834747314453, + "learning_rate": 1.725925925925926e-07, + "loss": 0.0, + "reward": 1.69821435213089, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7116071581840515, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1398 + }, + { + "completion_length": 240.33036613464355, + "epoch": 0.23471226790728866, + "grad_norm": 0.3716827959785001, + "kl": 0.014810562133789062, + "learning_rate": 1.728395061728395e-07, + "loss": 0.0, + "reward": 1.6892858073115349, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.6982143186032772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1400 + }, + { + "completion_length": 235.44197463989258, + "epoch": 0.23504757114715621, + "grad_norm": 0.15371024300134672, + "kl": 0.017034530639648438, + "learning_rate": 1.730864197530864e-07, + "loss": 0.0, + "reward": 1.7178572118282318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571820259094, + "rewards/format_reward_func": 1.0, + "step": 1402 + }, + { + "completion_length": 228.0089406967163, + "epoch": 0.23538287438702377, + "grad_norm": 0.280897369010513, + "kl": 0.011105060577392578, + "learning_rate": 1.7333333333333332e-07, + "loss": 0.0, + "reward": 1.7964286655187607, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 1404 + }, + { + "completion_length": 242.0759048461914, + "epoch": 0.23571817762689132, + "grad_norm": 0.4060917647274439, + "kl": 0.014769554138183594, + "learning_rate": 1.7358024691358027e-07, + "loss": 0.0, + "reward": 1.6803572252392769, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.684821454808116, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1406 + }, + { + "completion_length": 234.25000953674316, + "epoch": 0.23605348086675887, + "grad_norm": 0.29418746031538556, + "kl": 0.007956981658935547, + "learning_rate": 1.7382716049382715e-07, + "loss": 0.0, + "reward": 1.6821429431438446, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6821429021656513, + "rewards/format_reward_func": 1.0, + "step": 1408 + }, + { + "completion_length": 238.62501049041748, + "epoch": 0.23638878410662642, + "grad_norm": 0.5412517362596901, + "kl": 0.016735076904296875, + "learning_rate": 1.7407407407407407e-07, + "loss": 0.0, + "reward": 1.7750000432133675, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 0.9821428656578064, + "step": 1410 + }, + { + "completion_length": 244.31697273254395, + "epoch": 0.23672408734649397, + "grad_norm": 0.2775285412382821, + "kl": 0.013233184814453125, + "learning_rate": 1.7432098765432098e-07, + "loss": 0.0, + "reward": 1.751785784959793, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1412 + }, + { + "completion_length": 230.88393878936768, + "epoch": 0.23705939058636155, + "grad_norm": 0.30478659922904655, + "kl": 0.005757331848144531, + "learning_rate": 1.7456790123456787e-07, + "loss": 0.0, + "reward": 1.8285714760422707, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8285714387893677, + "rewards/format_reward_func": 1.0, + "step": 1414 + }, + { + "completion_length": 230.1384038925171, + "epoch": 0.2373946938262291, + "grad_norm": 0.159625589618849, + "kl": 0.018886566162109375, + "learning_rate": 1.748148148148148e-07, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 1416 + }, + { + "completion_length": 233.52233409881592, + "epoch": 0.23772999706609665, + "grad_norm": 0.3840361324210306, + "kl": 0.018432140350341797, + "learning_rate": 1.7506172839506173e-07, + "loss": 0.0, + "reward": 1.7607143372297287, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428969502449, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1418 + }, + { + "completion_length": 233.1428689956665, + "epoch": 0.2380653003059642, + "grad_norm": 0.31642404943545666, + "kl": 0.019349098205566406, + "learning_rate": 1.7530864197530864e-07, + "loss": 0.0, + "reward": 1.7107143551111221, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 1.0, + "step": 1420 + }, + { + "completion_length": 245.95983219146729, + "epoch": 0.23840060354583176, + "grad_norm": 0.1705597345975343, + "kl": 0.008008956909179688, + "learning_rate": 1.7555555555555553e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 1422 + }, + { + "completion_length": 232.80358123779297, + "epoch": 0.2387359067856993, + "grad_norm": 0.6085086681901897, + "kl": 0.05005693435668945, + "learning_rate": 1.7580246913580247e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1424 + }, + { + "completion_length": 237.4107255935669, + "epoch": 0.23907121002556686, + "grad_norm": 0.22994413881569736, + "kl": 0.02605152130126953, + "learning_rate": 1.7604938271604939e-07, + "loss": 0.0, + "reward": 1.7214286550879478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 1426 + }, + { + "completion_length": 241.17411994934082, + "epoch": 0.23940651326543444, + "grad_norm": 0.4663555822879698, + "kl": 0.04881477355957031, + "learning_rate": 1.7629629629629627e-07, + "loss": 0.0, + "reward": 1.6642858311533928, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.6732143200933933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1428 + }, + { + "completion_length": 229.7678680419922, + "epoch": 0.239741816505302, + "grad_norm": 0.08922893400986967, + "kl": 0.012132644653320312, + "learning_rate": 1.765432098765432e-07, + "loss": 0.0, + "reward": 1.7000000923871994, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7000000365078449, + "rewards/format_reward_func": 1.0, + "step": 1430 + }, + { + "completion_length": 235.18304443359375, + "epoch": 0.24007711974516954, + "grad_norm": 0.2590381600205453, + "kl": 0.028981685638427734, + "learning_rate": 1.7679012345679013e-07, + "loss": 0.0, + "reward": 1.7321429401636124, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7410714440047741, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1432 + }, + { + "completion_length": 235.06697368621826, + "epoch": 0.2404124229850371, + "grad_norm": 0.21589017259195808, + "kl": 0.09281253814697266, + "learning_rate": 1.7703703703703705e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 1434 + }, + { + "completion_length": 235.43304634094238, + "epoch": 0.24074772622490465, + "grad_norm": 0.11790548457588271, + "kl": 0.08521080017089844, + "learning_rate": 1.7728395061728393e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 1436 + }, + { + "completion_length": 234.1696538925171, + "epoch": 0.2410830294647722, + "grad_norm": 0.23563333190513872, + "kl": 0.02545928955078125, + "learning_rate": 1.7753086419753085e-07, + "loss": 0.0, + "reward": 1.7071429416537285, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071428894996643, + "rewards/format_reward_func": 1.0, + "step": 1438 + }, + { + "completion_length": 230.19197463989258, + "epoch": 0.24141833270463975, + "grad_norm": 0.2772527679123808, + "kl": 0.06699180603027344, + "learning_rate": 1.7777777777777776e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 1440 + }, + { + "completion_length": 233.6294755935669, + "epoch": 0.2417536359445073, + "grad_norm": 0.10134875392539826, + "kl": 0.036830902099609375, + "learning_rate": 1.7802469135802468e-07, + "loss": 0.0, + "reward": 1.725000061094761, + "reward_std": 0.005050762556493282, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 1442 + }, + { + "completion_length": 236.0178680419922, + "epoch": 0.24208893918437488, + "grad_norm": 0.7041311897794371, + "kl": 0.15368270874023438, + "learning_rate": 1.782716049382716e-07, + "loss": 0.0002, + "reward": 1.7839286252856255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1444 + }, + { + "completion_length": 231.64733219146729, + "epoch": 0.24242424242424243, + "grad_norm": 0.26936496812526695, + "kl": 0.08620834350585938, + "learning_rate": 1.785185185185185e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643267989159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1446 + }, + { + "completion_length": 234.21875858306885, + "epoch": 0.24275954566410998, + "grad_norm": 0.29822283393676635, + "kl": 0.137451171875, + "learning_rate": 1.7876543209876542e-07, + "loss": 0.0001, + "reward": 1.721428632736206, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7214286271482706, + "rewards/format_reward_func": 1.0, + "step": 1448 + }, + { + "completion_length": 233.08483123779297, + "epoch": 0.24309484890397753, + "grad_norm": 0.13332326691597654, + "kl": 0.16257810592651367, + "learning_rate": 1.7901234567901234e-07, + "loss": 0.0002, + "reward": 1.7089286521077156, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7133928947150707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1450 + }, + { + "completion_length": 234.89733219146729, + "epoch": 0.24343015214384509, + "grad_norm": 0.17016165008545617, + "kl": 0.09032154083251953, + "learning_rate": 1.7925925925925925e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1452 + }, + { + "completion_length": 237.5848331451416, + "epoch": 0.24376545538371264, + "grad_norm": 0.21223775500209358, + "kl": 0.168792724609375, + "learning_rate": 1.7950617283950617e-07, + "loss": 0.0002, + "reward": 1.7071429565548897, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.716071454808116, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1454 + }, + { + "completion_length": 236.383939743042, + "epoch": 0.2441007586235802, + "grad_norm": 0.22677331825081817, + "kl": 0.09221458435058594, + "learning_rate": 1.7975308641975308e-07, + "loss": 0.0001, + "reward": 1.734375074505806, + "reward_std": 0.05240166210569441, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 0.9986607171595097, + "step": 1456 + }, + { + "completion_length": 236.5357265472412, + "epoch": 0.24443606186344777, + "grad_norm": 0.3695446121649956, + "kl": 0.1760234832763672, + "learning_rate": 1.8e-07, + "loss": 0.0002, + "reward": 1.7285715341567993, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7375000342726707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1458 + }, + { + "completion_length": 235.0535831451416, + "epoch": 0.24477136510331532, + "grad_norm": 0.19789398802687613, + "kl": 0.11786079406738281, + "learning_rate": 1.802469135802469e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7651786003261805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1460 + }, + { + "completion_length": 231.3928680419922, + "epoch": 0.24510666834318287, + "grad_norm": 0.49119529018607455, + "kl": 0.14634132385253906, + "learning_rate": 1.8049382716049383e-07, + "loss": 0.0001, + "reward": 1.7946429029107094, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1462 + }, + { + "completion_length": 237.61608409881592, + "epoch": 0.24544197158305042, + "grad_norm": 0.34020786263175756, + "kl": 0.2747945785522461, + "learning_rate": 1.8074074074074072e-07, + "loss": 0.0003, + "reward": 1.778571479022503, + "reward_std": 0.1010152529925108, + "rewards/equation_reward_func": 0.7875000312924385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1464 + }, + { + "completion_length": 238.42858219146729, + "epoch": 0.24577727482291797, + "grad_norm": 0.2181514729696241, + "kl": 0.48163700103759766, + "learning_rate": 1.8098765432098763e-07, + "loss": 0.0005, + "reward": 1.717857226729393, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1466 + }, + { + "completion_length": 228.74108028411865, + "epoch": 0.24611257806278553, + "grad_norm": 0.22860592494644247, + "kl": 0.4790782928466797, + "learning_rate": 1.8123456790123457e-07, + "loss": 0.0005, + "reward": 1.7303571850061417, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214704543352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1468 + }, + { + "completion_length": 234.67411994934082, + "epoch": 0.24644788130265308, + "grad_norm": 0.24797354188614557, + "kl": 0.008512496948242188, + "learning_rate": 1.8148148148148149e-07, + "loss": 0.0, + "reward": 1.7625000700354576, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1470 + }, + { + "completion_length": 243.23215770721436, + "epoch": 0.24678318454252063, + "grad_norm": 0.20385499798706525, + "kl": 0.18136978149414062, + "learning_rate": 1.8172839506172837e-07, + "loss": 0.0002, + "reward": 1.7107143476605415, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7107143308967352, + "rewards/format_reward_func": 1.0, + "step": 1472 + }, + { + "completion_length": 230.69197463989258, + "epoch": 0.2471184877823882, + "grad_norm": 0.4924804061728603, + "kl": 0.16703414916992188, + "learning_rate": 1.819753086419753e-07, + "loss": 0.0002, + "reward": 1.723214365541935, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1474 + }, + { + "completion_length": 230.3750123977661, + "epoch": 0.24745379102225576, + "grad_norm": 0.41683170206161424, + "kl": 0.630040168762207, + "learning_rate": 1.8222222222222223e-07, + "loss": 0.0006, + "reward": 1.669642947614193, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.6741071697324514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1476 + }, + { + "completion_length": 235.13840103149414, + "epoch": 0.2477890942621233, + "grad_norm": 0.31188651409494367, + "kl": 0.8284721374511719, + "learning_rate": 1.8246913580246912e-07, + "loss": 0.0008, + "reward": 1.7071429193019867, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7160714641213417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1478 + }, + { + "completion_length": 234.34375858306885, + "epoch": 0.24812439750199086, + "grad_norm": 0.3133969512879245, + "kl": 0.012765884399414062, + "learning_rate": 1.8271604938271603e-07, + "loss": 0.0, + "reward": 1.7303572222590446, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.734821455553174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1480 + }, + { + "completion_length": 236.3259038925171, + "epoch": 0.2484597007418584, + "grad_norm": 0.20803832575885528, + "kl": 0.3232555389404297, + "learning_rate": 1.8296296296296295e-07, + "loss": 0.0003, + "reward": 1.7571428939700127, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 1482 + }, + { + "completion_length": 241.29465293884277, + "epoch": 0.24879500398172597, + "grad_norm": 0.32787663426007396, + "kl": 0.4119834899902344, + "learning_rate": 1.832098765432099e-07, + "loss": 0.0004, + "reward": 1.7107143849134445, + "reward_std": 0.07576144114136696, + "rewards/equation_reward_func": 0.719642885029316, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1484 + }, + { + "completion_length": 234.43304538726807, + "epoch": 0.24913030722159352, + "grad_norm": 0.2691108188962023, + "kl": 0.06020641326904297, + "learning_rate": 1.8345679012345678e-07, + "loss": 0.0001, + "reward": 1.7767857685685158, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1486 + }, + { + "completion_length": 235.89286994934082, + "epoch": 0.2494656104614611, + "grad_norm": 0.34431069891908556, + "kl": 0.8705596923828125, + "learning_rate": 1.837037037037037e-07, + "loss": 0.0009, + "reward": 1.775000050663948, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000301748514, + "rewards/format_reward_func": 1.0, + "step": 1488 + }, + { + "completion_length": 227.37500953674316, + "epoch": 0.24980091370132865, + "grad_norm": 0.6695050569787687, + "kl": 0.19437503814697266, + "learning_rate": 1.839506172839506e-07, + "loss": 0.0002, + "reward": 1.7946429252624512, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1490 + }, + { + "completion_length": 228.99108123779297, + "epoch": 0.2501362169411962, + "grad_norm": 0.30169039119381347, + "kl": 0.5019960403442383, + "learning_rate": 1.841975308641975e-07, + "loss": 0.0005, + "reward": 1.744642935693264, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1492 + }, + { + "completion_length": 225.2142972946167, + "epoch": 0.2504715201810637, + "grad_norm": 0.14293763277183175, + "kl": 0.21941280364990234, + "learning_rate": 1.8444444444444444e-07, + "loss": 0.0002, + "reward": 1.7214286550879478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286029338837, + "rewards/format_reward_func": 1.0, + "step": 1494 + }, + { + "completion_length": 242.78126049041748, + "epoch": 0.2508068234209313, + "grad_norm": 0.19395245164727634, + "kl": 0.25534629821777344, + "learning_rate": 1.8469135802469135e-07, + "loss": 0.0003, + "reward": 1.7732143551111221, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1496 + }, + { + "completion_length": 225.95536708831787, + "epoch": 0.2511421266607989, + "grad_norm": 0.21436022346581826, + "kl": 0.3440971374511719, + "learning_rate": 1.8493827160493827e-07, + "loss": 0.0003, + "reward": 1.8075893446803093, + "reward_std": 0.0460882093757391, + "rewards/equation_reward_func": 0.813392873853445, + "rewards/format_reward_func": 0.994196429848671, + "step": 1498 + }, + { + "completion_length": 233.38393878936768, + "epoch": 0.2514774299006664, + "grad_norm": 0.3264524194551336, + "kl": 0.22805309295654297, + "learning_rate": 1.8518518518518516e-07, + "loss": 0.0002, + "reward": 1.6928572431206703, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7017857395112514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1500 + }, + { + "completion_length": 229.2991189956665, + "epoch": 0.251812733140534, + "grad_norm": 0.20478533160273893, + "kl": 0.14220809936523438, + "learning_rate": 1.854320987654321e-07, + "loss": 0.0001, + "reward": 1.7910714745521545, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1502 + }, + { + "completion_length": 237.68750762939453, + "epoch": 0.2521480363804015, + "grad_norm": 0.22934017891844902, + "kl": 0.11223888397216797, + "learning_rate": 1.85679012345679e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 1504 + }, + { + "completion_length": 235.20536708831787, + "epoch": 0.2524833396202691, + "grad_norm": 0.4432363920330559, + "kl": 0.08673858642578125, + "learning_rate": 1.8592592592592593e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071581840515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1506 + }, + { + "completion_length": 230.18304443359375, + "epoch": 0.2528186428601366, + "grad_norm": 0.21735637485812978, + "kl": 0.16140174865722656, + "learning_rate": 1.8617283950617281e-07, + "loss": 0.0002, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 1508 + }, + { + "completion_length": 232.8884038925171, + "epoch": 0.2531539461000042, + "grad_norm": 0.2751427765130532, + "kl": 0.03536796569824219, + "learning_rate": 1.8641975308641976e-07, + "loss": 0.0, + "reward": 1.8071428909897804, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 1.0, + "step": 1510 + }, + { + "completion_length": 241.68304634094238, + "epoch": 0.25348924933987177, + "grad_norm": 0.3222525911957361, + "kl": 0.14169883728027344, + "learning_rate": 1.8666666666666667e-07, + "loss": 0.0001, + "reward": 1.6553572192788124, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.6687500290572643, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1512 + }, + { + "completion_length": 235.7053689956665, + "epoch": 0.2538245525797393, + "grad_norm": 0.6563885167540162, + "kl": 0.09424400329589844, + "learning_rate": 1.8691358024691356e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143349945545, + "rewards/format_reward_func": 1.0, + "step": 1514 + }, + { + "completion_length": 233.75447368621826, + "epoch": 0.2541598558196069, + "grad_norm": 0.5746414100535419, + "kl": 0.06946945190429688, + "learning_rate": 1.8716049382716047e-07, + "loss": 0.0001, + "reward": 1.7678572311997414, + "reward_std": 0.07576144114136696, + "rewards/equation_reward_func": 0.7767857387661934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1516 + }, + { + "completion_length": 223.63393783569336, + "epoch": 0.2544951590594744, + "grad_norm": 0.1957711124441575, + "kl": 0.10572624206542969, + "learning_rate": 1.8740740740740742e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8017857410013676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1518 + }, + { + "completion_length": 236.38393783569336, + "epoch": 0.254830462299342, + "grad_norm": 0.30273470214807835, + "kl": 0.15691757202148438, + "learning_rate": 1.8765432098765433e-07, + "loss": 0.0002, + "reward": 1.7214286476373672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 1520 + }, + { + "completion_length": 226.78572463989258, + "epoch": 0.2551657655392095, + "grad_norm": 0.3477983467378772, + "kl": 0.047286033630371094, + "learning_rate": 1.8790123456790122e-07, + "loss": 0.0, + "reward": 1.7339286357164383, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.73839289881289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1522 + }, + { + "completion_length": 225.79465198516846, + "epoch": 0.2555010687790771, + "grad_norm": 0.21323643527682343, + "kl": 0.05094718933105469, + "learning_rate": 1.8814814814814813e-07, + "loss": 0.0001, + "reward": 1.7392858192324638, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 1524 + }, + { + "completion_length": 237.12947845458984, + "epoch": 0.25583637201894466, + "grad_norm": 0.2633799298401754, + "kl": 0.10648727416992188, + "learning_rate": 1.8839506172839505e-07, + "loss": 0.0001, + "reward": 1.6892857775092125, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6892857439815998, + "rewards/format_reward_func": 1.0, + "step": 1526 + }, + { + "completion_length": 233.39286613464355, + "epoch": 0.2561716752588122, + "grad_norm": 0.3611704280068289, + "kl": 0.039473533630371094, + "learning_rate": 1.8864197530864196e-07, + "loss": 0.0, + "reward": 1.7428572177886963, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 1528 + }, + { + "completion_length": 238.5580472946167, + "epoch": 0.25650697849867976, + "grad_norm": 0.32581177303136893, + "kl": 0.12656688690185547, + "learning_rate": 1.8888888888888888e-07, + "loss": 0.0001, + "reward": 1.7446429505944252, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7491071633994579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1530 + }, + { + "completion_length": 227.6741180419922, + "epoch": 0.2568422817385473, + "grad_norm": 0.3034539012000064, + "kl": 0.046042442321777344, + "learning_rate": 1.891358024691358e-07, + "loss": 0.0, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 1532 + }, + { + "completion_length": 239.6250114440918, + "epoch": 0.25717758497841486, + "grad_norm": 0.48368909019447376, + "kl": 0.06012153625488281, + "learning_rate": 1.893827160493827e-07, + "loss": 0.0001, + "reward": 1.737500086426735, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1534 + }, + { + "completion_length": 237.06251049041748, + "epoch": 0.2575128882182824, + "grad_norm": 0.26842429522669164, + "kl": 0.06570243835449219, + "learning_rate": 1.8962962962962962e-07, + "loss": 0.0001, + "reward": 1.7267857789993286, + "reward_std": 0.10354063380509615, + "rewards/equation_reward_func": 0.7401786055415869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1536 + }, + { + "completion_length": 228.790189743042, + "epoch": 0.25784819145814997, + "grad_norm": 0.4018896461427612, + "kl": 0.035400390625, + "learning_rate": 1.8987654320987654e-07, + "loss": 0.0, + "reward": 1.737500086426735, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1538 + }, + { + "completion_length": 229.99554634094238, + "epoch": 0.25818349469801755, + "grad_norm": 0.2506558632879474, + "kl": 0.0076465606689453125, + "learning_rate": 1.9012345679012345e-07, + "loss": 0.0, + "reward": 1.805357187986374, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8187500257045031, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1540 + }, + { + "completion_length": 230.18304347991943, + "epoch": 0.25851879793788507, + "grad_norm": 0.29934272366305864, + "kl": 0.06483650207519531, + "learning_rate": 1.9037037037037037e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7303571719676256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1542 + }, + { + "completion_length": 224.05358219146729, + "epoch": 0.25885410117775265, + "grad_norm": 0.3433338629396455, + "kl": 0.02986907958984375, + "learning_rate": 1.9061728395061728e-07, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 1544 + }, + { + "completion_length": 239.09375953674316, + "epoch": 0.2591894044176202, + "grad_norm": 0.31108987070476973, + "kl": 0.058963775634765625, + "learning_rate": 1.908641975308642e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1546 + }, + { + "completion_length": 228.30358123779297, + "epoch": 0.25952470765748775, + "grad_norm": 0.2713275566948536, + "kl": 0.023523330688476562, + "learning_rate": 1.911111111111111e-07, + "loss": 0.0, + "reward": 1.766071505844593, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1548 + }, + { + "completion_length": 224.51340293884277, + "epoch": 0.2598600108973553, + "grad_norm": 0.31686450102462, + "kl": 0.010550498962402344, + "learning_rate": 1.91358024691358e-07, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7839285973459482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1550 + }, + { + "completion_length": 225.01786708831787, + "epoch": 0.26019531413722286, + "grad_norm": 0.515107201500274, + "kl": 0.0798177719116211, + "learning_rate": 1.9160493827160491e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7875000238418579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1552 + }, + { + "completion_length": 228.12947463989258, + "epoch": 0.2605306173770904, + "grad_norm": 0.3542582192758546, + "kl": 0.040058135986328125, + "learning_rate": 1.9185185185185186e-07, + "loss": 0.0, + "reward": 1.7946429029107094, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1554 + }, + { + "completion_length": 236.19644165039062, + "epoch": 0.26086592061695796, + "grad_norm": 0.31969628871868205, + "kl": 0.06807518005371094, + "learning_rate": 1.9209876543209877e-07, + "loss": 0.0001, + "reward": 1.7142858132719994, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 1556 + }, + { + "completion_length": 240.19643878936768, + "epoch": 0.26120122385682554, + "grad_norm": 0.41985963092768086, + "kl": 0.11923789978027344, + "learning_rate": 1.9234567901234566e-07, + "loss": 0.0001, + "reward": 1.6767858043313026, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.689285745844245, + "rewards/format_reward_func": 0.9875000044703484, + "step": 1558 + }, + { + "completion_length": 238.20537185668945, + "epoch": 0.26153652709669306, + "grad_norm": 0.26763216571838205, + "kl": 0.036202430725097656, + "learning_rate": 1.9259259259259257e-07, + "loss": 0.0, + "reward": 1.8017857670783997, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.806250024586916, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1560 + }, + { + "completion_length": 241.16072368621826, + "epoch": 0.26187183033656064, + "grad_norm": 0.7795636460003993, + "kl": 0.08544540405273438, + "learning_rate": 1.9283950617283951e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 0.9821428619325161, + "step": 1562 + }, + { + "completion_length": 226.32590198516846, + "epoch": 0.26220713357642816, + "grad_norm": 0.26431885626858, + "kl": 0.031281471252441406, + "learning_rate": 1.930864197530864e-07, + "loss": 0.0, + "reward": 1.7482143566012383, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1564 + }, + { + "completion_length": 230.01786708831787, + "epoch": 0.26254243681629574, + "grad_norm": 0.25896091143440436, + "kl": 0.19019317626953125, + "learning_rate": 1.9333333333333332e-07, + "loss": 0.0002, + "reward": 1.778571479022503, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1566 + }, + { + "completion_length": 228.42858219146729, + "epoch": 0.26287774005616327, + "grad_norm": 0.22123026075224217, + "kl": 0.11663246154785156, + "learning_rate": 1.9358024691358023e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1568 + }, + { + "completion_length": 223.83929634094238, + "epoch": 0.26321304329603085, + "grad_norm": 0.34891901610585063, + "kl": 0.0828256607055664, + "learning_rate": 1.9382716049382717e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.07576143741607666, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1570 + }, + { + "completion_length": 234.86607933044434, + "epoch": 0.2635483465358984, + "grad_norm": 0.7646794550333904, + "kl": 0.0537567138671875, + "learning_rate": 1.9407407407407406e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.777678582817316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1572 + }, + { + "completion_length": 231.30804538726807, + "epoch": 0.26388364977576595, + "grad_norm": 0.20786625219993726, + "kl": 0.06808948516845703, + "learning_rate": 1.9432098765432098e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9964285716414452, + "step": 1574 + }, + { + "completion_length": 234.66072463989258, + "epoch": 0.26421895301563353, + "grad_norm": 0.17919898669534326, + "kl": 0.19156932830810547, + "learning_rate": 1.945679012345679e-07, + "loss": 0.0002, + "reward": 1.8160715103149414, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.820535734295845, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1576 + }, + { + "completion_length": 234.99107933044434, + "epoch": 0.26455425625550105, + "grad_norm": 0.25375846103409333, + "kl": 0.1639862060546875, + "learning_rate": 1.9481481481481478e-07, + "loss": 0.0002, + "reward": 1.7392857819795609, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 1578 + }, + { + "completion_length": 232.89733028411865, + "epoch": 0.26488955949536863, + "grad_norm": 0.20598495484765386, + "kl": 0.32021331787109375, + "learning_rate": 1.9506172839506172e-07, + "loss": 0.0003, + "reward": 1.7625000849366188, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7758928909897804, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1580 + }, + { + "completion_length": 229.95090293884277, + "epoch": 0.26522486273523616, + "grad_norm": 0.23689001841514462, + "kl": 0.028433799743652344, + "learning_rate": 1.9530864197530864e-07, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7839285992085934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1582 + }, + { + "completion_length": 237.3973331451416, + "epoch": 0.26556016597510373, + "grad_norm": 0.30083193411151415, + "kl": 0.0551910400390625, + "learning_rate": 1.9555555555555555e-07, + "loss": 0.0001, + "reward": 1.7236607745289803, + "reward_std": 0.0776554741896689, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 0.9986607171595097, + "step": 1584 + }, + { + "completion_length": 233.04018783569336, + "epoch": 0.2658954692149713, + "grad_norm": 0.28273465688517146, + "kl": 0.12781333923339844, + "learning_rate": 1.9580246913580244e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7241071742027998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1586 + }, + { + "completion_length": 236.07143878936768, + "epoch": 0.26623077245483884, + "grad_norm": 0.13755953595441434, + "kl": 0.26348114013671875, + "learning_rate": 1.9604938271604938e-07, + "loss": 0.0003, + "reward": 1.7428572252392769, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571674972773, + "rewards/format_reward_func": 1.0, + "step": 1588 + }, + { + "completion_length": 231.6205472946167, + "epoch": 0.2665660756947064, + "grad_norm": 0.24383190621635564, + "kl": 0.07479667663574219, + "learning_rate": 1.962962962962963e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357536673546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1590 + }, + { + "completion_length": 239.19197368621826, + "epoch": 0.26690137893457394, + "grad_norm": 0.2082010962256202, + "kl": 0.21643352508544922, + "learning_rate": 1.965432098765432e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 1592 + }, + { + "completion_length": 220.26786613464355, + "epoch": 0.2672366821744415, + "grad_norm": 0.33452171117357393, + "kl": 0.12536048889160156, + "learning_rate": 1.967901234567901e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7410714514553547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1594 + }, + { + "completion_length": 226.27233219146729, + "epoch": 0.26757198541430904, + "grad_norm": 0.3662231272162297, + "kl": 0.12168693542480469, + "learning_rate": 1.9703703703703704e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 1596 + }, + { + "completion_length": 223.64286708831787, + "epoch": 0.2679072886541766, + "grad_norm": 0.45793697052291993, + "kl": 0.15146446228027344, + "learning_rate": 1.9728395061728395e-07, + "loss": 0.0002, + "reward": 1.8000000268220901, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.8089286014437675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1598 + }, + { + "completion_length": 227.0178689956665, + "epoch": 0.2682425918940442, + "grad_norm": 0.5879741708402264, + "kl": 0.11877250671386719, + "learning_rate": 1.9753086419753084e-07, + "loss": 0.0001, + "reward": 1.7714286670088768, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 1600 + }, + { + "completion_length": 225.34822463989258, + "epoch": 0.2685778951339117, + "grad_norm": 0.3172965825133875, + "kl": 0.1242218017578125, + "learning_rate": 1.9777777777777776e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 1602 + }, + { + "completion_length": 225.34376049041748, + "epoch": 0.2689131983737793, + "grad_norm": 0.19786567305881758, + "kl": 0.041957855224609375, + "learning_rate": 1.980246913580247e-07, + "loss": 0.0, + "reward": 1.7821429297327995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 1604 + }, + { + "completion_length": 231.17411613464355, + "epoch": 0.26924850161364683, + "grad_norm": 0.199203557943275, + "kl": 0.2041006088256836, + "learning_rate": 1.9827160493827161e-07, + "loss": 0.0002, + "reward": 1.807142935693264, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 1606 + }, + { + "completion_length": 244.96429824829102, + "epoch": 0.2695838048535144, + "grad_norm": 0.5953925603348911, + "kl": 0.39769744873046875, + "learning_rate": 1.985185185185185e-07, + "loss": 0.0004, + "reward": 1.721428632736206, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.7392857391387224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 1608 + }, + { + "completion_length": 227.01340198516846, + "epoch": 0.26991910809338193, + "grad_norm": 0.6255436231056462, + "kl": 0.01708221435546875, + "learning_rate": 1.9876543209876542e-07, + "loss": 0.0, + "reward": 1.6892858073115349, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6892857495695353, + "rewards/format_reward_func": 1.0, + "step": 1610 + }, + { + "completion_length": 227.75001049041748, + "epoch": 0.2702544113332495, + "grad_norm": 0.23378469972731863, + "kl": 0.2529106140136719, + "learning_rate": 1.9901234567901233e-07, + "loss": 0.0003, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 1612 + }, + { + "completion_length": 241.0134048461914, + "epoch": 0.27058971457311704, + "grad_norm": 0.2597360936259886, + "kl": 0.17315101623535156, + "learning_rate": 1.9925925925925925e-07, + "loss": 0.0002, + "reward": 1.717857226729393, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 1614 + }, + { + "completion_length": 231.7946548461914, + "epoch": 0.2709250178129846, + "grad_norm": 0.46129278988052735, + "kl": 0.10466766357421875, + "learning_rate": 1.9950617283950616e-07, + "loss": 0.0001, + "reward": 1.7446428909897804, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1616 + }, + { + "completion_length": 224.696439743042, + "epoch": 0.2712603210528522, + "grad_norm": 0.4255237430650551, + "kl": 0.17496871948242188, + "learning_rate": 1.9975308641975308e-07, + "loss": 0.0002, + "reward": 1.7714286521077156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 1618 + }, + { + "completion_length": 229.60715198516846, + "epoch": 0.2715956242927197, + "grad_norm": 0.189806473011921, + "kl": 0.1701374053955078, + "learning_rate": 2e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428906172514, + "rewards/format_reward_func": 1.0, + "step": 1620 + }, + { + "completion_length": 223.0089406967163, + "epoch": 0.2719309275325873, + "grad_norm": 0.2910744729086953, + "kl": 0.29755210876464844, + "learning_rate": 2.002469135802469e-07, + "loss": 0.0003, + "reward": 1.6928572282195091, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7017857581377029, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1622 + }, + { + "completion_length": 221.98215007781982, + "epoch": 0.2722662307724548, + "grad_norm": 0.22473947784807705, + "kl": 0.17630863189697266, + "learning_rate": 2.0049382716049382e-07, + "loss": 0.0002, + "reward": 1.7428571805357933, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 1624 + }, + { + "completion_length": 244.11608219146729, + "epoch": 0.2726015340123224, + "grad_norm": 0.40284671702968305, + "kl": 0.11990642547607422, + "learning_rate": 2.0074074074074074e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1626 + }, + { + "completion_length": 227.2187614440918, + "epoch": 0.2729368372521899, + "grad_norm": 0.26856909090880804, + "kl": 0.08802032470703125, + "learning_rate": 2.0098765432098762e-07, + "loss": 0.0001, + "reward": 1.6857143640518188, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.6857143305242062, + "rewards/format_reward_func": 1.0, + "step": 1628 + }, + { + "completion_length": 220.85715103149414, + "epoch": 0.2732721404920575, + "grad_norm": 0.20613533704375775, + "kl": 0.019829750061035156, + "learning_rate": 2.0123456790123457e-07, + "loss": 0.0, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 1630 + }, + { + "completion_length": 227.7321548461914, + "epoch": 0.2736074437319251, + "grad_norm": 0.20721345610666672, + "kl": 0.10821723937988281, + "learning_rate": 2.0148148148148148e-07, + "loss": 0.0001, + "reward": 1.7071429267525673, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7160714566707611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1632 + }, + { + "completion_length": 224.10268783569336, + "epoch": 0.2739427469717926, + "grad_norm": 0.2862899956793819, + "kl": 0.10094642639160156, + "learning_rate": 2.017283950617284e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 1634 + }, + { + "completion_length": 221.59375953674316, + "epoch": 0.2742780502116602, + "grad_norm": 0.3526294409199377, + "kl": 0.10612869262695312, + "learning_rate": 2.0197530864197528e-07, + "loss": 0.0001, + "reward": 1.8129464909434319, + "reward_std": 0.04230013629421592, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 0.9986607171595097, + "step": 1636 + }, + { + "completion_length": 240.34822368621826, + "epoch": 0.2746133534515277, + "grad_norm": 0.22284661939811562, + "kl": 0.19942092895507812, + "learning_rate": 2.022222222222222e-07, + "loss": 0.0002, + "reward": 1.7035714983940125, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714723169804, + "rewards/format_reward_func": 1.0, + "step": 1638 + }, + { + "completion_length": 228.54465103149414, + "epoch": 0.2749486566913953, + "grad_norm": 0.23164365281361313, + "kl": 0.035366058349609375, + "learning_rate": 2.0246913580246914e-07, + "loss": 0.0, + "reward": 1.700000062584877, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7000000216066837, + "rewards/format_reward_func": 1.0, + "step": 1640 + }, + { + "completion_length": 224.1071538925171, + "epoch": 0.2752839599312628, + "grad_norm": 0.3284594210277132, + "kl": 0.09695816040039062, + "learning_rate": 2.0271604938271605e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 1642 + }, + { + "completion_length": 229.30804634094238, + "epoch": 0.2756192631711304, + "grad_norm": 0.383168344007737, + "kl": 0.11481475830078125, + "learning_rate": 2.0296296296296294e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7258928753435612, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1644 + }, + { + "completion_length": 228.91518783569336, + "epoch": 0.27595456641099797, + "grad_norm": 0.4223823876902598, + "kl": 0.1330738067626953, + "learning_rate": 2.0320987654320986e-07, + "loss": 0.0001, + "reward": 1.7196429371833801, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.724107189103961, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1646 + }, + { + "completion_length": 230.92858219146729, + "epoch": 0.2762898696508655, + "grad_norm": 0.19741134994974258, + "kl": 0.24378204345703125, + "learning_rate": 2.034567901234568e-07, + "loss": 0.0002, + "reward": 1.7535714730620384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 1648 + }, + { + "completion_length": 238.7812623977661, + "epoch": 0.2766251728907331, + "grad_norm": 0.23204192482524613, + "kl": 0.17847061157226562, + "learning_rate": 2.0370370370370369e-07, + "loss": 0.0002, + "reward": 1.7232143431901932, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1650 + }, + { + "completion_length": 227.9196538925171, + "epoch": 0.2769604761306006, + "grad_norm": 0.43032013537420893, + "kl": 0.3296394348144531, + "learning_rate": 2.039506172839506e-07, + "loss": 0.0003, + "reward": 1.7875000536441803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1652 + }, + { + "completion_length": 237.1696538925171, + "epoch": 0.2772957793704682, + "grad_norm": 0.2077137808223099, + "kl": 0.31763267517089844, + "learning_rate": 2.0419753086419752e-07, + "loss": 0.0003, + "reward": 1.7214286550879478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 1654 + }, + { + "completion_length": 231.7098331451416, + "epoch": 0.2776310826103357, + "grad_norm": 0.08385124065366012, + "kl": 0.20266056060791016, + "learning_rate": 2.0444444444444446e-07, + "loss": 0.0002, + "reward": 1.725000075995922, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7339286096394062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1656 + }, + { + "completion_length": 238.44643878936768, + "epoch": 0.2779663858502033, + "grad_norm": 0.22718182123890435, + "kl": 0.027462005615234375, + "learning_rate": 2.0469135802469135e-07, + "loss": 0.0, + "reward": 1.7660714909434319, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1658 + }, + { + "completion_length": 229.26340198516846, + "epoch": 0.27830168909007086, + "grad_norm": 0.30742028326270204, + "kl": 0.30859375, + "learning_rate": 2.0493827160493826e-07, + "loss": 0.0003, + "reward": 1.7589286267757416, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1660 + }, + { + "completion_length": 229.0580472946167, + "epoch": 0.2786369923299384, + "grad_norm": 0.44218696016693715, + "kl": 0.05316638946533203, + "learning_rate": 2.0518518518518518e-07, + "loss": 0.0001, + "reward": 1.7598214894533157, + "reward_std": 0.07702413015067577, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9937500059604645, + "step": 1662 + }, + { + "completion_length": 230.37947463989258, + "epoch": 0.27897229556980596, + "grad_norm": 0.1850784449678431, + "kl": 0.06500816345214844, + "learning_rate": 2.0543209876543206e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 1664 + }, + { + "completion_length": 222.53126049041748, + "epoch": 0.2793075988096735, + "grad_norm": 0.279684266768164, + "kl": 0.011688232421875, + "learning_rate": 2.05679012345679e-07, + "loss": 0.0, + "reward": 1.766964353621006, + "reward_std": 0.06692260596901178, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9928571507334709, + "step": 1666 + }, + { + "completion_length": 223.3303689956665, + "epoch": 0.27964290204954106, + "grad_norm": 0.2295089233084164, + "kl": 0.034732818603515625, + "learning_rate": 2.0592592592592592e-07, + "loss": 0.0, + "reward": 1.7821429371833801, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 1668 + }, + { + "completion_length": 224.62501049041748, + "epoch": 0.2799782052894086, + "grad_norm": 0.16455998588464119, + "kl": 0.03418731689453125, + "learning_rate": 2.0617283950617283e-07, + "loss": 0.0, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 1670 + }, + { + "completion_length": 228.66965103149414, + "epoch": 0.28031350852927617, + "grad_norm": 0.16323662515693607, + "kl": 0.1743144989013672, + "learning_rate": 2.0641975308641972e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 1672 + }, + { + "completion_length": 231.9107265472412, + "epoch": 0.28064881176914375, + "grad_norm": 0.17971824743419645, + "kl": 0.07829761505126953, + "learning_rate": 2.0666666666666666e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7598214540630579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1674 + }, + { + "completion_length": 236.86162090301514, + "epoch": 0.28098411500901127, + "grad_norm": 0.18232481891962177, + "kl": 0.1730976104736328, + "learning_rate": 2.0691358024691358e-07, + "loss": 0.0002, + "reward": 1.755357213318348, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7687500342726707, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1676 + }, + { + "completion_length": 233.86161613464355, + "epoch": 0.28131941824887885, + "grad_norm": 0.2875084708059203, + "kl": 0.08128738403320312, + "learning_rate": 2.0716049382716047e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 1678 + }, + { + "completion_length": 230.69644260406494, + "epoch": 0.2816547214887464, + "grad_norm": 0.23955868610264447, + "kl": 0.12451171875, + "learning_rate": 2.0740740740740738e-07, + "loss": 0.0001, + "reward": 1.7160715088248253, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7205357421189547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1680 + }, + { + "completion_length": 225.41965293884277, + "epoch": 0.28199002472861395, + "grad_norm": 0.24345243540818787, + "kl": 0.27367496490478516, + "learning_rate": 2.0765432098765432e-07, + "loss": 0.0003, + "reward": 1.776785783469677, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1682 + }, + { + "completion_length": 223.46875953674316, + "epoch": 0.2823253279684815, + "grad_norm": 0.25014435964613124, + "kl": 0.030881881713867188, + "learning_rate": 2.0790123456790124e-07, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 1684 + }, + { + "completion_length": 224.60268783569336, + "epoch": 0.28266063120834906, + "grad_norm": 0.2666289593787631, + "kl": 0.03217506408691406, + "learning_rate": 2.0814814814814813e-07, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571429051458836, + "rewards/format_reward_func": 1.0, + "step": 1686 + }, + { + "completion_length": 224.8214406967163, + "epoch": 0.2829959344482166, + "grad_norm": 0.2344435869553199, + "kl": 0.014842987060546875, + "learning_rate": 2.0839506172839504e-07, + "loss": 0.0, + "reward": 1.7553572207689285, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1688 + }, + { + "completion_length": 222.98661613464355, + "epoch": 0.28333123768808416, + "grad_norm": 0.2974750754107423, + "kl": 0.10511493682861328, + "learning_rate": 2.0864197530864198e-07, + "loss": 0.0001, + "reward": 1.7678572311997414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 1690 + }, + { + "completion_length": 225.80804538726807, + "epoch": 0.28366654092795174, + "grad_norm": 0.40492073155295566, + "kl": 0.19135475158691406, + "learning_rate": 2.088888888888889e-07, + "loss": 0.0002, + "reward": 1.7625000551342964, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7758928760886192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1692 + }, + { + "completion_length": 225.75000858306885, + "epoch": 0.28400184416781926, + "grad_norm": 0.21306977700182334, + "kl": 0.2654247283935547, + "learning_rate": 2.0913580246913579e-07, + "loss": 0.0003, + "reward": 1.6964286789298058, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6964285988360643, + "rewards/format_reward_func": 1.0, + "step": 1694 + }, + { + "completion_length": 224.08036708831787, + "epoch": 0.28433714740768684, + "grad_norm": 0.25962073008852654, + "kl": 0.022909164428710938, + "learning_rate": 2.093827160493827e-07, + "loss": 0.0, + "reward": 1.7464286163449287, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 1.0, + "step": 1696 + }, + { + "completion_length": 220.73661708831787, + "epoch": 0.28467245064755436, + "grad_norm": 0.2998414003107778, + "kl": 0.34929847717285156, + "learning_rate": 2.0962962962962962e-07, + "loss": 0.0003, + "reward": 1.7428572326898575, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 1698 + }, + { + "completion_length": 222.66965198516846, + "epoch": 0.28500775388742194, + "grad_norm": 0.21281974812319984, + "kl": 0.18515777587890625, + "learning_rate": 2.0987654320987653e-07, + "loss": 0.0002, + "reward": 1.775000050663948, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 1700 + }, + { + "completion_length": 220.08036613464355, + "epoch": 0.28534305712728947, + "grad_norm": 0.3371442250969937, + "kl": 0.10428047180175781, + "learning_rate": 2.1012345679012345e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571864962578, + "rewards/format_reward_func": 1.0, + "step": 1702 + }, + { + "completion_length": 222.62054538726807, + "epoch": 0.28567836036715705, + "grad_norm": 0.5105200926308263, + "kl": 0.22168350219726562, + "learning_rate": 2.1037037037037036e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 1704 + }, + { + "completion_length": 226.08929538726807, + "epoch": 0.2860136636070246, + "grad_norm": 0.4370022389501131, + "kl": 0.1873798370361328, + "learning_rate": 2.1061728395061727e-07, + "loss": 0.0002, + "reward": 1.773214340209961, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1706 + }, + { + "completion_length": 226.66518878936768, + "epoch": 0.28634896684689215, + "grad_norm": 0.11375834492782687, + "kl": 0.1496267318725586, + "learning_rate": 2.108641975308642e-07, + "loss": 0.0002, + "reward": 1.8160714656114578, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8205357566475868, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1708 + }, + { + "completion_length": 230.83483123779297, + "epoch": 0.28668427008675973, + "grad_norm": 0.46898913636215384, + "kl": 0.1600666046142578, + "learning_rate": 2.111111111111111e-07, + "loss": 0.0002, + "reward": 1.7767858058214188, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7812500149011612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1710 + }, + { + "completion_length": 219.47768783569336, + "epoch": 0.28701957332662725, + "grad_norm": 0.0017106252395304078, + "kl": 0.03565216064453125, + "learning_rate": 2.1135802469135802e-07, + "loss": 0.0, + "reward": 1.7357143685221672, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143107801676, + "rewards/format_reward_func": 1.0, + "step": 1712 + }, + { + "completion_length": 221.4509048461914, + "epoch": 0.28735487656649483, + "grad_norm": 0.29632758745427196, + "kl": 0.10880851745605469, + "learning_rate": 2.116049382716049e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 1714 + }, + { + "completion_length": 236.94197463989258, + "epoch": 0.28769017980636236, + "grad_norm": 0.2917454687041177, + "kl": 0.2630786895751953, + "learning_rate": 2.1185185185185185e-07, + "loss": 0.0003, + "reward": 1.787500061094761, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1716 + }, + { + "completion_length": 222.91965293884277, + "epoch": 0.28802548304622994, + "grad_norm": 0.3137815271928074, + "kl": 0.04746055603027344, + "learning_rate": 2.1209876543209876e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 1718 + }, + { + "completion_length": 231.45983123779297, + "epoch": 0.2883607862860975, + "grad_norm": 0.28325392986376813, + "kl": 0.1917266845703125, + "learning_rate": 2.1234567901234568e-07, + "loss": 0.0002, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428686380386, + "rewards/format_reward_func": 1.0, + "step": 1720 + }, + { + "completion_length": 226.11608219146729, + "epoch": 0.28869608952596504, + "grad_norm": 0.6358652431461438, + "kl": 0.370849609375, + "learning_rate": 2.1259259259259257e-07, + "loss": 0.0004, + "reward": 1.7017857879400253, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7062500268220901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1722 + }, + { + "completion_length": 232.03572463989258, + "epoch": 0.2890313927658326, + "grad_norm": 0.1300999799664777, + "kl": 0.3420524597167969, + "learning_rate": 2.1283950617283948e-07, + "loss": 0.0003, + "reward": 1.7178572341799736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 1724 + }, + { + "completion_length": 223.0357255935669, + "epoch": 0.28936669600570014, + "grad_norm": 0.269776246641907, + "kl": 0.7043094635009766, + "learning_rate": 2.1308641975308642e-07, + "loss": 0.0007, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 1726 + }, + { + "completion_length": 229.25447177886963, + "epoch": 0.2897019992455677, + "grad_norm": 0.26928261439673645, + "kl": 0.1688404083251953, + "learning_rate": 2.1333333333333334e-07, + "loss": 0.0002, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000461935997, + "rewards/format_reward_func": 1.0, + "step": 1728 + }, + { + "completion_length": 227.66518688201904, + "epoch": 0.29003730248543524, + "grad_norm": 0.25291738156225757, + "kl": 0.019290924072265625, + "learning_rate": 2.1358024691358023e-07, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 1730 + }, + { + "completion_length": 232.74108123779297, + "epoch": 0.2903726057253028, + "grad_norm": 0.3125680880835837, + "kl": 0.11983871459960938, + "learning_rate": 2.1382716049382714e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428958326578, + "rewards/format_reward_func": 1.0, + "step": 1732 + }, + { + "completion_length": 234.12501049041748, + "epoch": 0.2907079089651704, + "grad_norm": 0.16799823386356563, + "kl": 0.32921409606933594, + "learning_rate": 2.1407407407407408e-07, + "loss": 0.0003, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 1734 + }, + { + "completion_length": 229.38840293884277, + "epoch": 0.2910432122050379, + "grad_norm": 0.2370698650557852, + "kl": 0.29618072509765625, + "learning_rate": 2.1432098765432097e-07, + "loss": 0.0003, + "reward": 1.767857201397419, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 1736 + }, + { + "completion_length": 233.07590198516846, + "epoch": 0.2913785154449055, + "grad_norm": 0.31089682021611725, + "kl": 0.3679676055908203, + "learning_rate": 2.1456790123456789e-07, + "loss": 0.0004, + "reward": 1.746428668498993, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 1738 + }, + { + "completion_length": 220.51786518096924, + "epoch": 0.29171381868477303, + "grad_norm": 0.3056727159750165, + "kl": 0.5169277191162109, + "learning_rate": 2.148148148148148e-07, + "loss": 0.0005, + "reward": 1.7142857983708382, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857443541288, + "rewards/format_reward_func": 1.0, + "step": 1740 + }, + { + "completion_length": 229.81250953674316, + "epoch": 0.2920491219246406, + "grad_norm": 0.1519463725257348, + "kl": 0.013641357421875, + "learning_rate": 2.1506172839506174e-07, + "loss": 0.0, + "reward": 1.707142911851406, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7071428913623095, + "rewards/format_reward_func": 1.0, + "step": 1742 + }, + { + "completion_length": 233.602689743042, + "epoch": 0.29238442516450813, + "grad_norm": 0.3022476612112489, + "kl": 0.06960678100585938, + "learning_rate": 2.1530864197530863e-07, + "loss": 0.0001, + "reward": 1.70089291036129, + "reward_std": 0.069447988178581, + "rewards/equation_reward_func": 0.7205357626080513, + "rewards/format_reward_func": 0.9803571552038193, + "step": 1744 + }, + { + "completion_length": 225.93304443359375, + "epoch": 0.2927197284043757, + "grad_norm": 0.16061238412841028, + "kl": 0.06715774536132812, + "learning_rate": 2.1555555555555554e-07, + "loss": 0.0001, + "reward": 1.7910714745521545, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1746 + }, + { + "completion_length": 231.93304634094238, + "epoch": 0.29305503164424324, + "grad_norm": 0.1863992188614141, + "kl": 0.30138206481933594, + "learning_rate": 2.1580246913580246e-07, + "loss": 0.0003, + "reward": 1.7571429014205933, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1748 + }, + { + "completion_length": 234.3259038925171, + "epoch": 0.2933903348841108, + "grad_norm": 0.4137435993933847, + "kl": 0.1421070098876953, + "learning_rate": 2.1604938271604935e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 1750 + }, + { + "completion_length": 225.38840579986572, + "epoch": 0.2937256381239784, + "grad_norm": 0.24968596600314152, + "kl": 0.10309982299804688, + "learning_rate": 2.162962962962963e-07, + "loss": 0.0001, + "reward": 1.7160715162754059, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7205357551574707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1752 + }, + { + "completion_length": 228.83036708831787, + "epoch": 0.2940609413638459, + "grad_norm": 0.3084047530337667, + "kl": 0.11126136779785156, + "learning_rate": 2.165432098765432e-07, + "loss": 0.0001, + "reward": 1.687500074505806, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6919643208384514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1754 + }, + { + "completion_length": 241.81697368621826, + "epoch": 0.2943962446037135, + "grad_norm": 0.18964625385824094, + "kl": 0.3287525177001953, + "learning_rate": 2.1679012345679012e-07, + "loss": 0.0003, + "reward": 1.650000087916851, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.65892861969769, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1756 + }, + { + "completion_length": 236.86608219146729, + "epoch": 0.294731547843581, + "grad_norm": 0.3619400638075198, + "kl": 0.15472793579101562, + "learning_rate": 2.17037037037037e-07, + "loss": 0.0002, + "reward": 1.773214340209961, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7776785921305418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1758 + }, + { + "completion_length": 235.95983123779297, + "epoch": 0.2950668510834486, + "grad_norm": 0.3397716552384549, + "kl": 0.6170005798339844, + "learning_rate": 2.1728395061728395e-07, + "loss": 0.0006, + "reward": 1.7392857894301414, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 1760 + }, + { + "completion_length": 233.0759048461914, + "epoch": 0.2954021543233161, + "grad_norm": 0.23654675030018038, + "kl": 0.013632774353027344, + "learning_rate": 2.1753086419753086e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 1762 + }, + { + "completion_length": 226.56250953674316, + "epoch": 0.2957374575631837, + "grad_norm": 0.288814634064109, + "kl": 0.12615585327148438, + "learning_rate": 2.1777777777777775e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000163912773, + "rewards/format_reward_func": 1.0, + "step": 1764 + }, + { + "completion_length": 229.02679824829102, + "epoch": 0.2960727608030513, + "grad_norm": 0.24413996834108162, + "kl": 0.1447772979736328, + "learning_rate": 2.1802469135802467e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 1766 + }, + { + "completion_length": 231.7634038925171, + "epoch": 0.2964080640429188, + "grad_norm": 0.4000458770990593, + "kl": 0.15684127807617188, + "learning_rate": 2.182716049382716e-07, + "loss": 0.0002, + "reward": 1.7267858013510704, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7312500234693289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1768 + }, + { + "completion_length": 223.84375953674316, + "epoch": 0.2967433672827864, + "grad_norm": 0.2284878268099746, + "kl": 0.13471412658691406, + "learning_rate": 2.1851851851851852e-07, + "loss": 0.0001, + "reward": 1.767857238650322, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 1770 + }, + { + "completion_length": 215.60715293884277, + "epoch": 0.2970786705226539, + "grad_norm": 0.19678904064037697, + "kl": 0.05077934265136719, + "learning_rate": 2.187654320987654e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 1772 + }, + { + "completion_length": 227.2946538925171, + "epoch": 0.2974139737625215, + "grad_norm": 0.33814303171679666, + "kl": 0.11888504028320312, + "learning_rate": 2.1901234567901233e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 1774 + }, + { + "completion_length": 223.72322463989258, + "epoch": 0.297749277002389, + "grad_norm": 0.19717208048606372, + "kl": 0.24188232421875, + "learning_rate": 2.1925925925925927e-07, + "loss": 0.0002, + "reward": 1.7928572222590446, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 1776 + }, + { + "completion_length": 228.0625123977661, + "epoch": 0.2980845802422566, + "grad_norm": 0.271144369694773, + "kl": 0.2233419418334961, + "learning_rate": 2.1950617283950618e-07, + "loss": 0.0002, + "reward": 1.7839286178350449, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.788392897695303, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1778 + }, + { + "completion_length": 237.03572368621826, + "epoch": 0.29841988348212417, + "grad_norm": 0.2724767955336668, + "kl": 0.021409988403320312, + "learning_rate": 2.1975308641975307e-07, + "loss": 0.0, + "reward": 1.775000087916851, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 1780 + }, + { + "completion_length": 230.90179538726807, + "epoch": 0.2987551867219917, + "grad_norm": 0.1573001174468948, + "kl": 0.029628753662109375, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "reward": 1.7125000953674316, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 0.987500011920929, + "step": 1782 + }, + { + "completion_length": 230.42411994934082, + "epoch": 0.2990904899618593, + "grad_norm": 0.29308188881949576, + "kl": 0.06266975402832031, + "learning_rate": 2.202469135802469e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 1784 + }, + { + "completion_length": 225.75447368621826, + "epoch": 0.2994257932017268, + "grad_norm": 0.37598447598224566, + "kl": 0.06233024597167969, + "learning_rate": 2.2049382716049381e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 1786 + }, + { + "completion_length": 239.06697368621826, + "epoch": 0.2997610964415944, + "grad_norm": 0.2410658800415634, + "kl": 0.08719062805175781, + "learning_rate": 2.2074074074074073e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7232143227010965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1788 + }, + { + "completion_length": 237.25001430511475, + "epoch": 0.3000963996814619, + "grad_norm": 0.27428234718509725, + "kl": 0.0132598876953125, + "learning_rate": 2.2098765432098764e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 1790 + }, + { + "completion_length": 230.80804538726807, + "epoch": 0.3004317029213295, + "grad_norm": 0.20216357930418644, + "kl": 0.03976631164550781, + "learning_rate": 2.2123456790123456e-07, + "loss": 0.0, + "reward": 1.7232143729925156, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7276785839349031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1792 + }, + { + "completion_length": 230.75893783569336, + "epoch": 0.30076700616119706, + "grad_norm": 0.3042072979752627, + "kl": 0.09014701843261719, + "learning_rate": 2.2148148148148147e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 1794 + }, + { + "completion_length": 227.02679443359375, + "epoch": 0.3011023094010646, + "grad_norm": 0.27795542549745866, + "kl": 0.04588603973388672, + "learning_rate": 2.217283950617284e-07, + "loss": 0.0, + "reward": 1.7678571939468384, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 1796 + }, + { + "completion_length": 236.84822750091553, + "epoch": 0.30143761264093216, + "grad_norm": 0.4820397237602499, + "kl": 0.4211463928222656, + "learning_rate": 2.219753086419753e-07, + "loss": 0.0004, + "reward": 1.7625000402331352, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7758928947150707, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1798 + }, + { + "completion_length": 226.65179538726807, + "epoch": 0.3017729158807997, + "grad_norm": 0.2887272445792341, + "kl": 0.016399383544921875, + "learning_rate": 2.222222222222222e-07, + "loss": 0.0, + "reward": 1.7678571939468384, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571660071611, + "rewards/format_reward_func": 1.0, + "step": 1800 + }, + { + "completion_length": 235.01340293884277, + "epoch": 0.30210821912066727, + "grad_norm": 0.6643608729826055, + "kl": 0.1521320343017578, + "learning_rate": 2.2246913580246913e-07, + "loss": 0.0002, + "reward": 1.7410714998841286, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1802 + }, + { + "completion_length": 225.6384038925171, + "epoch": 0.3024435223605348, + "grad_norm": 0.26640352513056065, + "kl": 0.03233146667480469, + "learning_rate": 2.2271604938271605e-07, + "loss": 0.0, + "reward": 1.7500000596046448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 1804 + }, + { + "completion_length": 228.18750953674316, + "epoch": 0.30277882560040237, + "grad_norm": 0.14731187386849665, + "kl": 0.027124404907226562, + "learning_rate": 2.2296296296296296e-07, + "loss": 0.0, + "reward": 1.7089286521077156, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7133928928524256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1806 + }, + { + "completion_length": 239.9464406967163, + "epoch": 0.3031141288402699, + "grad_norm": 0.29234930155500477, + "kl": 0.020626068115234375, + "learning_rate": 2.2320987654320985e-07, + "loss": 0.0, + "reward": 1.7607143372297287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 1808 + }, + { + "completion_length": 235.46875858306885, + "epoch": 0.30344943208013747, + "grad_norm": 0.1040229140664818, + "kl": 0.015781402587890625, + "learning_rate": 2.2345679012345677e-07, + "loss": 0.0, + "reward": 1.7285715118050575, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 1810 + }, + { + "completion_length": 237.58929920196533, + "epoch": 0.30378473532000505, + "grad_norm": 0.10295620756709041, + "kl": 0.16724872589111328, + "learning_rate": 2.237037037037037e-07, + "loss": 0.0002, + "reward": 1.7696429267525673, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7830357439815998, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1812 + }, + { + "completion_length": 228.08929634094238, + "epoch": 0.3041200385598726, + "grad_norm": 0.15218741534858393, + "kl": 0.2906198501586914, + "learning_rate": 2.239506172839506e-07, + "loss": 0.0003, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 1814 + }, + { + "completion_length": 224.18750762939453, + "epoch": 0.30445534179974015, + "grad_norm": 0.19074701356213938, + "kl": 0.06517219543457031, + "learning_rate": 2.241975308641975e-07, + "loss": 0.0001, + "reward": 1.7142858058214188, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7142857518047094, + "rewards/format_reward_func": 1.0, + "step": 1816 + }, + { + "completion_length": 222.05804443359375, + "epoch": 0.3047906450396077, + "grad_norm": 0.1476048923105393, + "kl": 0.012349128723144531, + "learning_rate": 2.2444444444444442e-07, + "loss": 0.0, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 1818 + }, + { + "completion_length": 233.8750114440918, + "epoch": 0.30512594827947526, + "grad_norm": 0.17331623039744623, + "kl": 0.01285552978515625, + "learning_rate": 2.2469135802469137e-07, + "loss": 0.0, + "reward": 1.8000000566244125, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 1820 + }, + { + "completion_length": 231.1250114440918, + "epoch": 0.3054612515193428, + "grad_norm": 0.3307768489017182, + "kl": 0.1770782470703125, + "learning_rate": 2.2493827160493825e-07, + "loss": 0.0002, + "reward": 1.744642935693264, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1822 + }, + { + "completion_length": 233.92858028411865, + "epoch": 0.30579655475921036, + "grad_norm": 0.22175695720606406, + "kl": 0.16938114166259766, + "learning_rate": 2.2518518518518517e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 1824 + }, + { + "completion_length": 230.14733219146729, + "epoch": 0.30613185799907794, + "grad_norm": 0.27675034844790847, + "kl": 0.020910263061523438, + "learning_rate": 2.2543209876543208e-07, + "loss": 0.0, + "reward": 1.8107143640518188, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 1826 + }, + { + "completion_length": 247.6160831451416, + "epoch": 0.30646716123894546, + "grad_norm": 0.44542823477944804, + "kl": 0.24846267700195312, + "learning_rate": 2.2567901234567903e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7482143267989159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1828 + }, + { + "completion_length": 226.4062623977661, + "epoch": 0.30680246447881304, + "grad_norm": 0.3461096636929037, + "kl": 0.05735588073730469, + "learning_rate": 2.2592592592592591e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857618629932, + "rewards/format_reward_func": 1.0, + "step": 1830 + }, + { + "completion_length": 237.54465198516846, + "epoch": 0.30713776771868057, + "grad_norm": 0.4290936324075917, + "kl": 0.08965682983398438, + "learning_rate": 2.2617283950617283e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1832 + }, + { + "completion_length": 234.8839406967163, + "epoch": 0.30747307095854814, + "grad_norm": 0.28822848873036533, + "kl": 0.035064697265625, + "learning_rate": 2.2641975308641974e-07, + "loss": 0.0, + "reward": 1.7821428924798965, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428906172514, + "rewards/format_reward_func": 1.0, + "step": 1834 + }, + { + "completion_length": 226.7634038925171, + "epoch": 0.30780837419841567, + "grad_norm": 0.14622024873230458, + "kl": 0.35060882568359375, + "learning_rate": 2.2666666666666663e-07, + "loss": 0.0004, + "reward": 1.7767857611179352, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7812500484287739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1836 + }, + { + "completion_length": 241.6919755935669, + "epoch": 0.30814367743828325, + "grad_norm": 0.35153050154322957, + "kl": 0.012337684631347656, + "learning_rate": 2.2691358024691357e-07, + "loss": 0.0, + "reward": 1.7267858162522316, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7312500290572643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1838 + }, + { + "completion_length": 224.90179634094238, + "epoch": 0.3084789806781508, + "grad_norm": 0.2655829602093783, + "kl": 0.0985097885131836, + "learning_rate": 2.271604938271605e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1840 + }, + { + "completion_length": 241.04911994934082, + "epoch": 0.30881428391801835, + "grad_norm": 0.2453221248498777, + "kl": 0.4122467041015625, + "learning_rate": 2.274074074074074e-07, + "loss": 0.0004, + "reward": 1.7589286044239998, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928991854191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1842 + }, + { + "completion_length": 231.6741180419922, + "epoch": 0.30914958715788593, + "grad_norm": 0.2234689362529436, + "kl": 0.09752082824707031, + "learning_rate": 2.276543209876543e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857518047094, + "rewards/format_reward_func": 1.0, + "step": 1844 + }, + { + "completion_length": 229.68751049041748, + "epoch": 0.30948489039775345, + "grad_norm": 0.33753612337173233, + "kl": 0.017671585083007812, + "learning_rate": 2.2790123456790123e-07, + "loss": 0.0, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 1846 + }, + { + "completion_length": 236.9821538925171, + "epoch": 0.30982019363762103, + "grad_norm": 0.34463075474830307, + "kl": 0.06949996948242188, + "learning_rate": 2.2814814814814815e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 1848 + }, + { + "completion_length": 222.60268783569336, + "epoch": 0.31015549687748856, + "grad_norm": 0.3017657562993159, + "kl": 0.17605209350585938, + "learning_rate": 2.2839506172839504e-07, + "loss": 0.0002, + "reward": 1.7410714998841286, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1850 + }, + { + "completion_length": 233.35268878936768, + "epoch": 0.31049080011735614, + "grad_norm": 0.19114987608251482, + "kl": 0.11333656311035156, + "learning_rate": 2.2864197530864195e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.74910718947649, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1852 + }, + { + "completion_length": 239.08929443359375, + "epoch": 0.3108261033572237, + "grad_norm": 0.38085970590024615, + "kl": 0.1593780517578125, + "learning_rate": 2.288888888888889e-07, + "loss": 0.0002, + "reward": 1.7250000536441803, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000461935997, + "rewards/format_reward_func": 1.0, + "step": 1854 + }, + { + "completion_length": 231.82143878936768, + "epoch": 0.31116140659709124, + "grad_norm": 0.7838052230054621, + "kl": 0.08046340942382812, + "learning_rate": 2.291358024691358e-07, + "loss": 0.0001, + "reward": 1.8071429207921028, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 1856 + }, + { + "completion_length": 231.53126049041748, + "epoch": 0.3114967098369588, + "grad_norm": 0.22527110312924714, + "kl": 0.16312026977539062, + "learning_rate": 2.293827160493827e-07, + "loss": 0.0002, + "reward": 1.8035714775323868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714402794838, + "rewards/format_reward_func": 1.0, + "step": 1858 + }, + { + "completion_length": 226.75447273254395, + "epoch": 0.31183201307682634, + "grad_norm": 0.20671855052520396, + "kl": 0.08982467651367188, + "learning_rate": 2.296296296296296e-07, + "loss": 0.0001, + "reward": 1.7392857670783997, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 1860 + }, + { + "completion_length": 229.11161518096924, + "epoch": 0.3121673163166939, + "grad_norm": 0.23576458309190282, + "kl": 0.07481575012207031, + "learning_rate": 2.2987654320987655e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286107569933, + "rewards/format_reward_func": 1.0, + "step": 1862 + }, + { + "completion_length": 220.23215293884277, + "epoch": 0.31250261955656145, + "grad_norm": 0.2948337250310543, + "kl": 0.048417091369628906, + "learning_rate": 2.3012345679012347e-07, + "loss": 0.0, + "reward": 1.7910714745521545, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357506871223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1864 + }, + { + "completion_length": 231.8035831451416, + "epoch": 0.312837922796429, + "grad_norm": 0.3180397707419142, + "kl": 0.17154312133789062, + "learning_rate": 2.3037037037037035e-07, + "loss": 0.0002, + "reward": 1.6535715013742447, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.6535714697092772, + "rewards/format_reward_func": 1.0, + "step": 1866 + }, + { + "completion_length": 222.42858219146729, + "epoch": 0.31317322603629655, + "grad_norm": 0.23005009438891452, + "kl": 0.07344245910644531, + "learning_rate": 2.3061728395061727e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7410714626312256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1868 + }, + { + "completion_length": 241.3794765472412, + "epoch": 0.3135085292761641, + "grad_norm": 0.27632991514710453, + "kl": 0.26630401611328125, + "learning_rate": 2.3086419753086418e-07, + "loss": 0.0003, + "reward": 1.807142935693264, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.807142898440361, + "rewards/format_reward_func": 1.0, + "step": 1870 + }, + { + "completion_length": 231.10268783569336, + "epoch": 0.3138438325160317, + "grad_norm": 0.24458798323898348, + "kl": 0.024705886840820312, + "learning_rate": 2.311111111111111e-07, + "loss": 0.0, + "reward": 1.7160714864730835, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7205357383936644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1872 + }, + { + "completion_length": 231.321439743042, + "epoch": 0.31417913575589923, + "grad_norm": 0.22131545732884386, + "kl": 0.034271240234375, + "learning_rate": 2.31358024691358e-07, + "loss": 0.0, + "reward": 1.8000000342726707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 1874 + }, + { + "completion_length": 229.5089406967163, + "epoch": 0.3145144389957668, + "grad_norm": 0.22465986688885642, + "kl": 0.165924072265625, + "learning_rate": 2.3160493827160493e-07, + "loss": 0.0002, + "reward": 1.7660714834928513, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1876 + }, + { + "completion_length": 233.3884038925171, + "epoch": 0.31484974223563433, + "grad_norm": 0.23910415798432272, + "kl": 0.18875694274902344, + "learning_rate": 2.3185185185185184e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 1878 + }, + { + "completion_length": 233.0357265472412, + "epoch": 0.3151850454755019, + "grad_norm": 0.40358730438180807, + "kl": 0.04170417785644531, + "learning_rate": 2.3209876543209876e-07, + "loss": 0.0, + "reward": 1.7607143595814705, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1880 + }, + { + "completion_length": 225.48661994934082, + "epoch": 0.31552034871536944, + "grad_norm": 0.2931343957440977, + "kl": 0.08726978302001953, + "learning_rate": 2.3234567901234567e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 1882 + }, + { + "completion_length": 231.7009048461914, + "epoch": 0.315855651955237, + "grad_norm": 0.5066000684278723, + "kl": 0.06025505065917969, + "learning_rate": 2.325925925925926e-07, + "loss": 0.0001, + "reward": 1.7053572237491608, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214663565159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1884 + }, + { + "completion_length": 225.73661613464355, + "epoch": 0.3161909551951046, + "grad_norm": 0.11965847860757209, + "kl": 0.09498214721679688, + "learning_rate": 2.3283950617283948e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 1886 + }, + { + "completion_length": 235.83483123779297, + "epoch": 0.3165262584349721, + "grad_norm": 0.27708246702194467, + "kl": 0.26693153381347656, + "learning_rate": 2.3308641975308642e-07, + "loss": 0.0003, + "reward": 1.7107143625617027, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.71071432903409, + "rewards/format_reward_func": 1.0, + "step": 1888 + }, + { + "completion_length": 230.55358219146729, + "epoch": 0.3168615616748397, + "grad_norm": 0.2196535389068709, + "kl": 0.1350536346435547, + "learning_rate": 2.3333333333333333e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143256813288, + "rewards/format_reward_func": 1.0, + "step": 1890 + }, + { + "completion_length": 233.94643783569336, + "epoch": 0.3171968649147072, + "grad_norm": 0.16739288082692094, + "kl": 0.018312454223632812, + "learning_rate": 2.3358024691358025e-07, + "loss": 0.0, + "reward": 1.6982143446803093, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.702678618952632, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1892 + }, + { + "completion_length": 227.74554634094238, + "epoch": 0.3175321681545748, + "grad_norm": 0.23554998821278342, + "kl": 0.04662895202636719, + "learning_rate": 2.3382716049382713e-07, + "loss": 0.0, + "reward": 1.764285758137703, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 1894 + }, + { + "completion_length": 230.40625953674316, + "epoch": 0.3178674713944423, + "grad_norm": 0.3893511360014083, + "kl": 0.03877544403076172, + "learning_rate": 2.3407407407407405e-07, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1896 + }, + { + "completion_length": 235.1250114440918, + "epoch": 0.3182027746343099, + "grad_norm": 0.2841157377433794, + "kl": 0.19602394104003906, + "learning_rate": 2.34320987654321e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 1898 + }, + { + "completion_length": 226.2366189956665, + "epoch": 0.3185380778741775, + "grad_norm": 0.17757173837637225, + "kl": 0.0174407958984375, + "learning_rate": 2.3456790123456788e-07, + "loss": 0.0, + "reward": 1.7357143387198448, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 1900 + }, + { + "completion_length": 237.0000123977661, + "epoch": 0.318873381114045, + "grad_norm": 0.43075952842159304, + "kl": 0.13301658630371094, + "learning_rate": 2.348148148148148e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857301980257, + "rewards/format_reward_func": 1.0, + "step": 1902 + }, + { + "completion_length": 230.7544755935669, + "epoch": 0.3192086843539126, + "grad_norm": 0.2751534698278088, + "kl": 0.07329368591308594, + "learning_rate": 2.350617283950617e-07, + "loss": 0.0001, + "reward": 1.7053572237491608, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7098214663565159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1904 + }, + { + "completion_length": 228.5669755935669, + "epoch": 0.3195439875937801, + "grad_norm": 0.25202227295897695, + "kl": 0.03522491455078125, + "learning_rate": 2.3530864197530865e-07, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 1906 + }, + { + "completion_length": 238.35715675354004, + "epoch": 0.3198792908336477, + "grad_norm": 0.17488179771033238, + "kl": 0.021076202392578125, + "learning_rate": 2.3555555555555554e-07, + "loss": 0.0, + "reward": 1.7714286595582962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 1908 + }, + { + "completion_length": 226.04465579986572, + "epoch": 0.3202145940735152, + "grad_norm": 0.29303369817907765, + "kl": 0.011088371276855469, + "learning_rate": 2.3580246913580245e-07, + "loss": 0.0, + "reward": 1.764285795390606, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 1910 + }, + { + "completion_length": 238.8259048461914, + "epoch": 0.3205498973133828, + "grad_norm": 0.28677067456421385, + "kl": 0.06038475036621094, + "learning_rate": 2.3604938271604937e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1912 + }, + { + "completion_length": 230.89286613464355, + "epoch": 0.32088520055325037, + "grad_norm": 0.23104718016621956, + "kl": 0.023987770080566406, + "learning_rate": 2.362962962962963e-07, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 1.0, + "step": 1914 + }, + { + "completion_length": 238.77679824829102, + "epoch": 0.3212205037931179, + "grad_norm": 0.24706180736743527, + "kl": 0.2645835876464844, + "learning_rate": 2.365432098765432e-07, + "loss": 0.0003, + "reward": 1.7500000596046448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 1916 + }, + { + "completion_length": 240.2544755935669, + "epoch": 0.3215558070329855, + "grad_norm": 0.2133498248289064, + "kl": 0.12011337280273438, + "learning_rate": 2.367901234567901e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1918 + }, + { + "completion_length": 228.94197368621826, + "epoch": 0.321891110272853, + "grad_norm": 0.43973771460402583, + "kl": 0.11147499084472656, + "learning_rate": 2.3703703703703703e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 1920 + }, + { + "completion_length": 230.3169755935669, + "epoch": 0.3222264135127206, + "grad_norm": 0.22943147719990709, + "kl": 0.04773712158203125, + "learning_rate": 2.3728395061728394e-07, + "loss": 0.0, + "reward": 1.7803571969270706, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1922 + }, + { + "completion_length": 231.18751049041748, + "epoch": 0.3225617167525881, + "grad_norm": 0.35686116682750085, + "kl": 0.07299137115478516, + "learning_rate": 2.3753086419753086e-07, + "loss": 0.0001, + "reward": 1.7125001028180122, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1924 + }, + { + "completion_length": 224.07143783569336, + "epoch": 0.3228970199924557, + "grad_norm": 0.30472172697594374, + "kl": 0.029535293579101562, + "learning_rate": 2.3777777777777777e-07, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 1926 + }, + { + "completion_length": 244.20983409881592, + "epoch": 0.32323232323232326, + "grad_norm": 0.2661952587072812, + "kl": 0.019370079040527344, + "learning_rate": 2.3802469135802469e-07, + "loss": 0.0, + "reward": 1.7642857655882835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 1928 + }, + { + "completion_length": 231.15179824829102, + "epoch": 0.3235676264721908, + "grad_norm": 0.2501130921501854, + "kl": 0.027837753295898438, + "learning_rate": 2.3827160493827157e-07, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 1930 + }, + { + "completion_length": 233.75001430511475, + "epoch": 0.32390292971205836, + "grad_norm": 0.44629294557250104, + "kl": 0.07960128784179688, + "learning_rate": 2.385185185185185e-07, + "loss": 0.0001, + "reward": 1.6946429312229156, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.6991071682423353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1932 + }, + { + "completion_length": 235.3928680419922, + "epoch": 0.3242382329519259, + "grad_norm": 0.5119444231013779, + "kl": 0.10437774658203125, + "learning_rate": 2.3876543209876543e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.739285746589303, + "rewards/format_reward_func": 1.0, + "step": 1934 + }, + { + "completion_length": 236.93304538726807, + "epoch": 0.32457353619179347, + "grad_norm": 0.2119830430647975, + "kl": 0.03171539306640625, + "learning_rate": 2.390123456790123e-07, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 1936 + }, + { + "completion_length": 228.1294755935669, + "epoch": 0.324908839431661, + "grad_norm": 0.27788889937507205, + "kl": 0.01113128662109375, + "learning_rate": 2.3925925925925926e-07, + "loss": 0.0, + "reward": 1.6607143729925156, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6607143338769674, + "rewards/format_reward_func": 1.0, + "step": 1938 + }, + { + "completion_length": 237.4955472946167, + "epoch": 0.32524414267152857, + "grad_norm": 0.24299818770137305, + "kl": 0.013406753540039062, + "learning_rate": 2.3950617283950615e-07, + "loss": 0.0, + "reward": 1.7428572252392769, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 1940 + }, + { + "completion_length": 233.36608409881592, + "epoch": 0.3255794459113961, + "grad_norm": 0.1810489507798674, + "kl": 0.03422355651855469, + "learning_rate": 2.397530864197531e-07, + "loss": 0.0, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 1942 + }, + { + "completion_length": 229.81697368621826, + "epoch": 0.32591474915126367, + "grad_norm": 0.1674598005786771, + "kl": 0.044902801513671875, + "learning_rate": 2.4e-07, + "loss": 0.0, + "reward": 1.7607143446803093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 1944 + }, + { + "completion_length": 229.18304634094238, + "epoch": 0.32625005239113125, + "grad_norm": 0.1620064245637525, + "kl": 0.017425537109375, + "learning_rate": 2.402469135802469e-07, + "loss": 0.0, + "reward": 1.7000000923871994, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.700000025331974, + "rewards/format_reward_func": 1.0, + "step": 1946 + }, + { + "completion_length": 236.3392972946167, + "epoch": 0.3265853556309988, + "grad_norm": 0.20727220749960834, + "kl": 0.010746002197265625, + "learning_rate": 2.404938271604938e-07, + "loss": 0.0, + "reward": 1.7107143551111221, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7196428794413805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1948 + }, + { + "completion_length": 234.52679634094238, + "epoch": 0.32692065887086635, + "grad_norm": 0.3255722185826685, + "kl": 0.07966041564941406, + "learning_rate": 2.407407407407407e-07, + "loss": 0.0001, + "reward": 1.7839286476373672, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1950 + }, + { + "completion_length": 229.72768783569336, + "epoch": 0.3272559621107339, + "grad_norm": 0.32015743982788275, + "kl": 0.08532905578613281, + "learning_rate": 2.4098765432098764e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 1952 + }, + { + "completion_length": 227.7232265472412, + "epoch": 0.32759126535060146, + "grad_norm": 0.21736216860562788, + "kl": 0.04352378845214844, + "learning_rate": 2.412345679012346e-07, + "loss": 0.0, + "reward": 1.785714328289032, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143115252256, + "rewards/format_reward_func": 1.0, + "step": 1954 + }, + { + "completion_length": 229.696439743042, + "epoch": 0.327926568590469, + "grad_norm": 0.214901028849602, + "kl": 0.054378509521484375, + "learning_rate": 2.4148148148148147e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 1956 + }, + { + "completion_length": 230.96429538726807, + "epoch": 0.32826187183033656, + "grad_norm": 0.35751988952802904, + "kl": 0.04533100128173828, + "learning_rate": 2.4172839506172836e-07, + "loss": 0.0, + "reward": 1.7142857909202576, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7142857611179352, + "rewards/format_reward_func": 1.0, + "step": 1958 + }, + { + "completion_length": 225.32590293884277, + "epoch": 0.32859717507020414, + "grad_norm": 0.29350278528039353, + "kl": 0.014001846313476562, + "learning_rate": 2.419753086419753e-07, + "loss": 0.0, + "reward": 1.7321429327130318, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 1.0, + "step": 1960 + }, + { + "completion_length": 226.90626049041748, + "epoch": 0.32893247831007166, + "grad_norm": 0.2693844120927604, + "kl": 0.07091999053955078, + "learning_rate": 2.4222222222222224e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8080357313156128, + "rewards/format_reward_func": 0.9866071455180645, + "step": 1962 + }, + { + "completion_length": 232.9732255935669, + "epoch": 0.32926778154993924, + "grad_norm": 0.36427653784791236, + "kl": 0.00850677490234375, + "learning_rate": 2.4246913580246913e-07, + "loss": 0.0, + "reward": 1.7107143476605415, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7196428999304771, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1964 + }, + { + "completion_length": 238.4821538925171, + "epoch": 0.32960308478980677, + "grad_norm": 0.18662587745852874, + "kl": 0.014192581176757812, + "learning_rate": 2.42716049382716e-07, + "loss": 0.0, + "reward": 1.7178571969270706, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571801632643, + "rewards/format_reward_func": 1.0, + "step": 1966 + }, + { + "completion_length": 235.21429538726807, + "epoch": 0.32993838802967435, + "grad_norm": 0.3017454432462034, + "kl": 0.0271148681640625, + "learning_rate": 2.4296296296296296e-07, + "loss": 0.0, + "reward": 1.755357213318348, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214447498322, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1968 + }, + { + "completion_length": 222.56697368621826, + "epoch": 0.33027369126954187, + "grad_norm": 0.3683075178582106, + "kl": 0.008819580078125, + "learning_rate": 2.432098765432099e-07, + "loss": 0.0, + "reward": 1.8504464700818062, + "reward_std": 0.07007933221757412, + "rewards/equation_reward_func": 0.8562500178813934, + "rewards/format_reward_func": 0.9941964335739613, + "step": 1970 + }, + { + "completion_length": 225.54018688201904, + "epoch": 0.33060899450940945, + "grad_norm": 0.254113309813442, + "kl": 0.015944480895996094, + "learning_rate": 2.434567901234568e-07, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 1972 + }, + { + "completion_length": 228.3303680419922, + "epoch": 0.330944297749277, + "grad_norm": 0.3838229118855729, + "kl": 0.012636184692382812, + "learning_rate": 2.437037037037037e-07, + "loss": 0.0, + "reward": 1.7071429416537285, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7160714659839869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 1974 + }, + { + "completion_length": 237.89733028411865, + "epoch": 0.33127960098914455, + "grad_norm": 0.33245679288182095, + "kl": 0.03874683380126953, + "learning_rate": 2.439506172839506e-07, + "loss": 0.0, + "reward": 1.7553572058677673, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1976 + }, + { + "completion_length": 228.4241180419922, + "epoch": 0.33161490422901213, + "grad_norm": 0.1817437147379376, + "kl": 0.06694793701171875, + "learning_rate": 2.4419753086419756e-07, + "loss": 0.0001, + "reward": 1.8107143566012383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 1978 + }, + { + "completion_length": 231.20090293884277, + "epoch": 0.33195020746887965, + "grad_norm": 0.23502872535450134, + "kl": 0.08605766296386719, + "learning_rate": 2.4444444444444445e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000432133675, + "rewards/format_reward_func": 1.0, + "step": 1980 + }, + { + "completion_length": 228.21429538726807, + "epoch": 0.33228551070874723, + "grad_norm": 0.1645380719145372, + "kl": 0.022317886352539062, + "learning_rate": 2.4469135802469133e-07, + "loss": 0.0, + "reward": 1.7571429386734962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 1982 + }, + { + "completion_length": 230.04018878936768, + "epoch": 0.33262081394861476, + "grad_norm": 0.5832458221239087, + "kl": 0.07063102722167969, + "learning_rate": 2.449382716049383e-07, + "loss": 0.0001, + "reward": 1.7303572222590446, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7437500357627869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 1984 + }, + { + "completion_length": 230.77679634094238, + "epoch": 0.33295611718848234, + "grad_norm": 0.32034128630556463, + "kl": 0.05539894104003906, + "learning_rate": 2.4518518518518516e-07, + "loss": 0.0001, + "reward": 1.7125000581145287, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643081724644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 1986 + }, + { + "completion_length": 221.37054538726807, + "epoch": 0.3332914204283499, + "grad_norm": 0.3104718829687837, + "kl": 0.0174102783203125, + "learning_rate": 2.454320987654321e-07, + "loss": 0.0, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 1988 + }, + { + "completion_length": 230.3973331451416, + "epoch": 0.33362672366821744, + "grad_norm": 0.28791233501393015, + "kl": 0.02490234375, + "learning_rate": 2.45679012345679e-07, + "loss": 0.0, + "reward": 1.814285770058632, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 1990 + }, + { + "completion_length": 231.79911708831787, + "epoch": 0.333962026908085, + "grad_norm": 0.1954296500371838, + "kl": 0.022606849670410156, + "learning_rate": 2.4592592592592593e-07, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 1992 + }, + { + "completion_length": 236.7053680419922, + "epoch": 0.33429733014795254, + "grad_norm": 0.21342070200715885, + "kl": 0.10403156280517578, + "learning_rate": 2.461728395061728e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286029338837, + "rewards/format_reward_func": 1.0, + "step": 1994 + }, + { + "completion_length": 225.05358409881592, + "epoch": 0.3346326333878201, + "grad_norm": 0.5948907828800926, + "kl": 0.0234222412109375, + "learning_rate": 2.4641975308641976e-07, + "loss": 0.0, + "reward": 1.7357143685221672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143070548773, + "rewards/format_reward_func": 1.0, + "step": 1996 + }, + { + "completion_length": 229.7366189956665, + "epoch": 0.33496793662768765, + "grad_norm": 0.14636131178410403, + "kl": 0.037792205810546875, + "learning_rate": 2.4666666666666665e-07, + "loss": 0.0, + "reward": 1.7142857983708382, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 1998 + }, + { + "completion_length": 228.67858219146729, + "epoch": 0.3353032398675552, + "grad_norm": 0.1664830030099182, + "kl": 0.01624774932861328, + "learning_rate": 2.4691358024691354e-07, + "loss": 0.0, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 2000 + }, + { + "completion_length": 222.88840293884277, + "epoch": 0.33563854310742275, + "grad_norm": 0.26362387045617086, + "kl": 0.012630462646484375, + "learning_rate": 2.471604938271605e-07, + "loss": 0.0, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286133646965, + "rewards/format_reward_func": 1.0, + "step": 2002 + }, + { + "completion_length": 223.72322273254395, + "epoch": 0.33597384634729033, + "grad_norm": 0.0014755454691630647, + "kl": 0.009225845336914062, + "learning_rate": 2.474074074074074e-07, + "loss": 0.0, + "reward": 1.7285715118050575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714708268642, + "rewards/format_reward_func": 1.0, + "step": 2004 + }, + { + "completion_length": 220.60715198516846, + "epoch": 0.3363091495871579, + "grad_norm": 0.310672175130401, + "kl": 0.016305923461914062, + "learning_rate": 2.476543209876543e-07, + "loss": 0.0, + "reward": 1.8000000417232513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 2006 + }, + { + "completion_length": 227.81250953674316, + "epoch": 0.33664445282702543, + "grad_norm": 0.2702079969416635, + "kl": 0.10282135009765625, + "learning_rate": 2.479012345679012e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2008 + }, + { + "completion_length": 225.70536613464355, + "epoch": 0.336979756066893, + "grad_norm": 0.20606098388880464, + "kl": 0.018157005310058594, + "learning_rate": 2.4814814814814814e-07, + "loss": 0.0, + "reward": 1.8053572103381157, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214641213417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2010 + }, + { + "completion_length": 237.58036708831787, + "epoch": 0.33731505930676053, + "grad_norm": 0.2482946763221189, + "kl": 0.05824089050292969, + "learning_rate": 2.4839506172839503e-07, + "loss": 0.0001, + "reward": 1.7035715132951736, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714760422707, + "rewards/format_reward_func": 1.0, + "step": 2012 + }, + { + "completion_length": 233.10269260406494, + "epoch": 0.3376503625466281, + "grad_norm": 0.28002200040351294, + "kl": 0.0264739990234375, + "learning_rate": 2.4864197530864197e-07, + "loss": 0.0, + "reward": 1.7267857864499092, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7312500234693289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2014 + }, + { + "completion_length": 236.15626049041748, + "epoch": 0.33798566578649564, + "grad_norm": 0.47691443247370385, + "kl": 0.02268695831298828, + "learning_rate": 2.4888888888888886e-07, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 2016 + }, + { + "completion_length": 220.9732255935669, + "epoch": 0.3383209690263632, + "grad_norm": 0.4744433084933257, + "kl": 0.03647422790527344, + "learning_rate": 2.491358024691358e-07, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 2018 + }, + { + "completion_length": 227.18304538726807, + "epoch": 0.3386562722662308, + "grad_norm": 0.16060134070242493, + "kl": 0.05447578430175781, + "learning_rate": 2.493827160493827e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 2020 + }, + { + "completion_length": 230.07143783569336, + "epoch": 0.3389915755060983, + "grad_norm": 0.349626308958822, + "kl": 0.09692955017089844, + "learning_rate": 2.4962962962962963e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 2022 + }, + { + "completion_length": 227.34375953674316, + "epoch": 0.3393268787459659, + "grad_norm": 0.229310222683427, + "kl": 0.022785186767578125, + "learning_rate": 2.498765432098765e-07, + "loss": 0.0, + "reward": 1.7803572192788124, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214562982321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2024 + }, + { + "completion_length": 224.8839406967163, + "epoch": 0.3396621819858334, + "grad_norm": 0.13934129466630318, + "kl": 0.05133819580078125, + "learning_rate": 2.501234567901234e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 2026 + }, + { + "completion_length": 229.83036708831787, + "epoch": 0.339997485225701, + "grad_norm": 0.11982324549628261, + "kl": 0.03446388244628906, + "learning_rate": 2.5037037037037035e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 2028 + }, + { + "completion_length": 220.665189743042, + "epoch": 0.3403327884655685, + "grad_norm": 0.14265061273334256, + "kl": 0.014574050903320312, + "learning_rate": 2.506172839506173e-07, + "loss": 0.0, + "reward": 1.8107143267989159, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 2030 + }, + { + "completion_length": 228.60715293884277, + "epoch": 0.3406680917054361, + "grad_norm": 0.08317394544730386, + "kl": 0.05511188507080078, + "learning_rate": 2.508641975308642e-07, + "loss": 0.0001, + "reward": 1.6892857998609543, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.689285745844245, + "rewards/format_reward_func": 1.0, + "step": 2032 + }, + { + "completion_length": 224.03572368621826, + "epoch": 0.3410033949453037, + "grad_norm": 0.13839855068066612, + "kl": 0.013246536254882812, + "learning_rate": 2.511111111111111e-07, + "loss": 0.0, + "reward": 1.7714286148548126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 2034 + }, + { + "completion_length": 236.71876335144043, + "epoch": 0.3413386981851712, + "grad_norm": 0.46152422665186366, + "kl": 0.03157615661621094, + "learning_rate": 2.51358024691358e-07, + "loss": 0.0, + "reward": 1.7232143431901932, + "reward_std": 0.10859139822423458, + "rewards/equation_reward_func": 0.7366071827709675, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2036 + }, + { + "completion_length": 219.82143783569336, + "epoch": 0.3416740014250388, + "grad_norm": 0.2956090340920406, + "kl": 0.007686614990234375, + "learning_rate": 2.5160493827160495e-07, + "loss": 0.0, + "reward": 1.792857214808464, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 2038 + }, + { + "completion_length": 237.73215293884277, + "epoch": 0.3420093046649063, + "grad_norm": 0.3170164235407117, + "kl": 0.04242229461669922, + "learning_rate": 2.5185185185185184e-07, + "loss": 0.0, + "reward": 1.7392857819795609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857596278191, + "rewards/format_reward_func": 1.0, + "step": 2040 + }, + { + "completion_length": 229.84375858306885, + "epoch": 0.3423446079047739, + "grad_norm": 0.18831475868217637, + "kl": 0.022202491760253906, + "learning_rate": 2.520987654320987e-07, + "loss": 0.0, + "reward": 1.7785715162754059, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 2042 + }, + { + "completion_length": 225.55358219146729, + "epoch": 0.3426799111446414, + "grad_norm": 0.2642266312318678, + "kl": 0.07865715026855469, + "learning_rate": 2.5234567901234567e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2044 + }, + { + "completion_length": 235.05357933044434, + "epoch": 0.343015214384509, + "grad_norm": 0.2661198557562209, + "kl": 0.028797149658203125, + "learning_rate": 2.5259259259259255e-07, + "loss": 0.0, + "reward": 1.7446429207921028, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2046 + }, + { + "completion_length": 236.3259048461914, + "epoch": 0.34335051762437657, + "grad_norm": 0.3349485363833583, + "kl": 0.029720306396484375, + "learning_rate": 2.528395061728395e-07, + "loss": 0.0, + "reward": 1.7357143759727478, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 2048 + }, + { + "completion_length": 231.45983123779297, + "epoch": 0.3436858208642441, + "grad_norm": 0.4207071809083083, + "kl": 0.06764984130859375, + "learning_rate": 2.5308641975308644e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 2050 + }, + { + "completion_length": 229.0580472946167, + "epoch": 0.3440211241041117, + "grad_norm": 0.2937621352389509, + "kl": 0.02901458740234375, + "learning_rate": 2.533333333333333e-07, + "loss": 0.0, + "reward": 1.7321429401636124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 2052 + }, + { + "completion_length": 232.07590198516846, + "epoch": 0.3443564273439792, + "grad_norm": 0.291425140305454, + "kl": 0.015499114990234375, + "learning_rate": 2.535802469135802e-07, + "loss": 0.0, + "reward": 1.714285783469677, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857387661934, + "rewards/format_reward_func": 1.0, + "step": 2054 + }, + { + "completion_length": 234.2053680419922, + "epoch": 0.3446917305838468, + "grad_norm": 0.37230523225702566, + "kl": 0.01446533203125, + "learning_rate": 2.5382716049382716e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.09596448857337236, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 2056 + }, + { + "completion_length": 231.3526906967163, + "epoch": 0.3450270338237143, + "grad_norm": 0.13254691883933248, + "kl": 0.015127182006835938, + "learning_rate": 2.5407407407407404e-07, + "loss": 0.0, + "reward": 1.8571428805589676, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8571428842842579, + "rewards/format_reward_func": 1.0, + "step": 2058 + }, + { + "completion_length": 224.11161708831787, + "epoch": 0.3453623370635819, + "grad_norm": 0.2855218059164058, + "kl": 0.021585464477539062, + "learning_rate": 2.54320987654321e-07, + "loss": 0.0, + "reward": 1.7392858043313026, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 2060 + }, + { + "completion_length": 230.78572463989258, + "epoch": 0.3456976403034494, + "grad_norm": 0.2911414082155755, + "kl": 0.01221466064453125, + "learning_rate": 2.5456790123456787e-07, + "loss": 0.0, + "reward": 1.7321429550647736, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 1.0, + "step": 2062 + }, + { + "completion_length": 233.9642972946167, + "epoch": 0.346032943543317, + "grad_norm": 0.3881993855084975, + "kl": 0.0128936767578125, + "learning_rate": 2.548148148148148e-07, + "loss": 0.0, + "reward": 1.7785715237259865, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7785714380443096, + "rewards/format_reward_func": 1.0, + "step": 2064 + }, + { + "completion_length": 221.6384038925171, + "epoch": 0.34636824678318456, + "grad_norm": 0.26392784790783114, + "kl": 0.0257110595703125, + "learning_rate": 2.5506172839506176e-07, + "loss": 0.0, + "reward": 1.792857214808464, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 2066 + }, + { + "completion_length": 227.79911613464355, + "epoch": 0.3467035500230521, + "grad_norm": 0.32096854524746926, + "kl": 0.026147842407226562, + "learning_rate": 2.553086419753086e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143204659224, + "rewards/format_reward_func": 1.0, + "step": 2068 + }, + { + "completion_length": 235.91072750091553, + "epoch": 0.34703885326291967, + "grad_norm": 0.4583612266863497, + "kl": 0.03256988525390625, + "learning_rate": 2.5555555555555553e-07, + "loss": 0.0, + "reward": 1.8196429461240768, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8241071552038193, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2070 + }, + { + "completion_length": 228.0000114440918, + "epoch": 0.3473741565027872, + "grad_norm": 0.18039195795853832, + "kl": 0.03345489501953125, + "learning_rate": 2.558024691358024e-07, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000242143869, + "rewards/format_reward_func": 1.0, + "step": 2072 + }, + { + "completion_length": 228.9687614440918, + "epoch": 0.34770945974265477, + "grad_norm": 0.19847492504938696, + "kl": 0.011264801025390625, + "learning_rate": 2.5604938271604936e-07, + "loss": 0.0, + "reward": 1.7736607789993286, + "reward_std": 0.027147850021719933, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 0.9986607171595097, + "step": 2074 + }, + { + "completion_length": 233.9241189956665, + "epoch": 0.3480447629825223, + "grad_norm": 0.21015001646006334, + "kl": 0.009357452392578125, + "learning_rate": 2.562962962962963e-07, + "loss": 0.0, + "reward": 1.7732143476605415, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2076 + }, + { + "completion_length": 232.8482255935669, + "epoch": 0.3483800662223899, + "grad_norm": 0.33872680822882006, + "kl": 0.022771835327148438, + "learning_rate": 2.565432098765432e-07, + "loss": 0.0, + "reward": 1.8000000640749931, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.8089285977184772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2078 + }, + { + "completion_length": 229.36161708831787, + "epoch": 0.34871536946225745, + "grad_norm": 0.28679987177075117, + "kl": 0.054790496826171875, + "learning_rate": 2.5679012345679013e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 2080 + }, + { + "completion_length": 217.6384038925171, + "epoch": 0.349050672702125, + "grad_norm": 0.27843672987692414, + "kl": 0.00815582275390625, + "learning_rate": 2.570370370370371e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 2082 + }, + { + "completion_length": 237.37501049041748, + "epoch": 0.34938597594199255, + "grad_norm": 0.37663868418897395, + "kl": 0.09327888488769531, + "learning_rate": 2.572839506172839e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286185801029, + "rewards/format_reward_func": 1.0, + "step": 2084 + }, + { + "completion_length": 229.415189743042, + "epoch": 0.3497212791818601, + "grad_norm": 0.2059930556278535, + "kl": 0.07577705383300781, + "learning_rate": 2.5753086419753085e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 2086 + }, + { + "completion_length": 223.26786613464355, + "epoch": 0.35005658242172766, + "grad_norm": 0.26716203101438885, + "kl": 0.01068878173828125, + "learning_rate": 2.5777777777777774e-07, + "loss": 0.0, + "reward": 1.7482143640518188, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2088 + }, + { + "completion_length": 223.92858505249023, + "epoch": 0.3503918856615952, + "grad_norm": 0.5845833867815128, + "kl": 0.16082000732421875, + "learning_rate": 2.580246913580247e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 2090 + }, + { + "completion_length": 230.1384038925171, + "epoch": 0.35072718890146276, + "grad_norm": 0.30983776550084197, + "kl": 0.0106658935546875, + "learning_rate": 2.582716049382716e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 2092 + }, + { + "completion_length": 225.102689743042, + "epoch": 0.35106249214133034, + "grad_norm": 0.342229109593764, + "kl": 0.019098281860351562, + "learning_rate": 2.585185185185185e-07, + "loss": 0.0, + "reward": 1.778571479022503, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7785714790225029, + "rewards/format_reward_func": 1.0, + "step": 2094 + }, + { + "completion_length": 231.91518783569336, + "epoch": 0.35139779538119786, + "grad_norm": 0.34481677702260877, + "kl": 0.04109764099121094, + "learning_rate": 2.5876543209876545e-07, + "loss": 0.0, + "reward": 1.7464286535978317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.746428620070219, + "rewards/format_reward_func": 1.0, + "step": 2096 + }, + { + "completion_length": 233.06697368621826, + "epoch": 0.35173309862106544, + "grad_norm": 0.31827473583036603, + "kl": 0.01319122314453125, + "learning_rate": 2.590123456790123e-07, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2098 + }, + { + "completion_length": 230.33036708831787, + "epoch": 0.35206840186093297, + "grad_norm": 0.351989871127145, + "kl": 0.07884025573730469, + "learning_rate": 2.5925925925925923e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714540630579, + "rewards/format_reward_func": 1.0, + "step": 2100 + }, + { + "completion_length": 220.97768783569336, + "epoch": 0.35240370510080055, + "grad_norm": 0.09114508670738188, + "kl": 0.024440765380859375, + "learning_rate": 2.5950617283950617e-07, + "loss": 0.0, + "reward": 1.8107143566012383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 2102 + }, + { + "completion_length": 232.7946538925171, + "epoch": 0.35273900834066807, + "grad_norm": 0.2583027938351751, + "kl": 0.04311180114746094, + "learning_rate": 2.5975308641975306e-07, + "loss": 0.0, + "reward": 1.7607143223285675, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7696428950875998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2104 + }, + { + "completion_length": 225.14286613464355, + "epoch": 0.35307431158053565, + "grad_norm": 0.2558472070866311, + "kl": 0.009954452514648438, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": 1.8250000402331352, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000383704901, + "rewards/format_reward_func": 1.0, + "step": 2106 + }, + { + "completion_length": 230.54465198516846, + "epoch": 0.35340961482040323, + "grad_norm": 0.37407279058475107, + "kl": 0.0380706787109375, + "learning_rate": 2.6024691358024694e-07, + "loss": 0.0, + "reward": 1.7214286401867867, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214286141097546, + "rewards/format_reward_func": 1.0, + "step": 2108 + }, + { + "completion_length": 221.95536518096924, + "epoch": 0.35374491806027075, + "grad_norm": 0.16814010224897627, + "kl": 0.08795928955078125, + "learning_rate": 2.6049382716049383e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 2110 + }, + { + "completion_length": 228.25893783569336, + "epoch": 0.35408022130013833, + "grad_norm": 0.45033267511988145, + "kl": 0.026628494262695312, + "learning_rate": 2.607407407407407e-07, + "loss": 0.0, + "reward": 1.730357214808464, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7348214685916901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2112 + }, + { + "completion_length": 234.99108219146729, + "epoch": 0.35441552454000586, + "grad_norm": 0.13415719938476556, + "kl": 0.062084197998046875, + "learning_rate": 2.609876543209876e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 2114 + }, + { + "completion_length": 224.31697368621826, + "epoch": 0.35475082777987343, + "grad_norm": 0.4264411382430659, + "kl": 0.043720245361328125, + "learning_rate": 2.6123456790123455e-07, + "loss": 0.0, + "reward": 1.7303572073578835, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214592784643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2116 + }, + { + "completion_length": 226.071439743042, + "epoch": 0.35508613101974096, + "grad_norm": 0.40260476168590575, + "kl": 0.015716552734375, + "learning_rate": 2.614814814814815e-07, + "loss": 0.0, + "reward": 1.7392858043313026, + "reward_std": 0.09596448857337236, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 2118 + }, + { + "completion_length": 233.508939743042, + "epoch": 0.35542143425960854, + "grad_norm": 0.24759868181676284, + "kl": 0.024053573608398438, + "learning_rate": 2.617283950617284e-07, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 2120 + }, + { + "completion_length": 224.0267972946167, + "epoch": 0.35575673749947606, + "grad_norm": 0.10767464037699406, + "kl": 0.14308738708496094, + "learning_rate": 2.619753086419753e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 2122 + }, + { + "completion_length": 231.52232933044434, + "epoch": 0.35609204073934364, + "grad_norm": 0.3298464882369941, + "kl": 0.01337432861328125, + "learning_rate": 2.6222222222222226e-07, + "loss": 0.0, + "reward": 1.7232143580913544, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2124 + }, + { + "completion_length": 223.696439743042, + "epoch": 0.3564273439792112, + "grad_norm": 0.24994278995491304, + "kl": 0.025163650512695312, + "learning_rate": 2.624691358024691e-07, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.06060914974659681, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 2126 + }, + { + "completion_length": 224.03125953674316, + "epoch": 0.35676264721907874, + "grad_norm": 0.423221266506577, + "kl": 0.08066177368164062, + "learning_rate": 2.6271604938271604e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214285973459482, + "rewards/format_reward_func": 1.0, + "step": 2128 + }, + { + "completion_length": 231.25447273254395, + "epoch": 0.3570979504589463, + "grad_norm": 0.22862671789859407, + "kl": 0.025022506713867188, + "learning_rate": 2.629629629629629e-07, + "loss": 0.0, + "reward": 1.7303571924567223, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.734821455553174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2130 + }, + { + "completion_length": 226.540189743042, + "epoch": 0.35743325369881385, + "grad_norm": 0.19159948818631775, + "kl": 0.011707305908203125, + "learning_rate": 2.6320987654320986e-07, + "loss": 0.0, + "reward": 1.7428572326898575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 2132 + }, + { + "completion_length": 221.31697463989258, + "epoch": 0.3577685569386814, + "grad_norm": 0.29371425581013727, + "kl": 0.04029273986816406, + "learning_rate": 2.634567901234568e-07, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 2134 + }, + { + "completion_length": 228.50447463989258, + "epoch": 0.35810386017854895, + "grad_norm": 0.12820217400008221, + "kl": 0.10086727142333984, + "learning_rate": 2.637037037037037e-07, + "loss": 0.0001, + "reward": 1.8107143491506577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143007218838, + "rewards/format_reward_func": 1.0, + "step": 2136 + }, + { + "completion_length": 231.02233219146729, + "epoch": 0.35843916341841653, + "grad_norm": 0.23459826960008642, + "kl": 0.043140411376953125, + "learning_rate": 2.6395061728395064e-07, + "loss": 0.0, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 2138 + }, + { + "completion_length": 221.94643878936768, + "epoch": 0.3587744666582841, + "grad_norm": 0.3008130333023876, + "kl": 0.010679244995117188, + "learning_rate": 2.6419753086419747e-07, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 2140 + }, + { + "completion_length": 220.62947273254395, + "epoch": 0.35910976989815163, + "grad_norm": 0.32973760375580985, + "kl": 0.09985923767089844, + "learning_rate": 2.644444444444444e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571850061417, + "rewards/format_reward_func": 1.0, + "step": 2142 + }, + { + "completion_length": 228.62501049041748, + "epoch": 0.3594450731380192, + "grad_norm": 0.48222064013341237, + "kl": 0.10888290405273438, + "learning_rate": 2.6469135802469135e-07, + "loss": 0.0001, + "reward": 1.7000000700354576, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7000000458210707, + "rewards/format_reward_func": 1.0, + "step": 2144 + }, + { + "completion_length": 229.02233219146729, + "epoch": 0.35978037637788673, + "grad_norm": 0.23380887912828222, + "kl": 0.08852958679199219, + "learning_rate": 2.6493827160493824e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857670783997, + "rewards/format_reward_func": 1.0, + "step": 2146 + }, + { + "completion_length": 228.0669755935669, + "epoch": 0.3601156796177543, + "grad_norm": 0.32278175684584354, + "kl": 0.17383193969726562, + "learning_rate": 2.651851851851852e-07, + "loss": 0.0002, + "reward": 1.735714353621006, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 2148 + }, + { + "completion_length": 227.0759048461914, + "epoch": 0.36045098285762184, + "grad_norm": 0.26353297086437466, + "kl": 0.0684661865234375, + "learning_rate": 2.654320987654321e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714477300644, + "rewards/format_reward_func": 1.0, + "step": 2150 + }, + { + "completion_length": 228.1428680419922, + "epoch": 0.3607862860974894, + "grad_norm": 0.22570297887110932, + "kl": 0.05149078369140625, + "learning_rate": 2.65679012345679e-07, + "loss": 0.0001, + "reward": 1.7000000774860382, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7000000402331352, + "rewards/format_reward_func": 1.0, + "step": 2152 + }, + { + "completion_length": 220.92858028411865, + "epoch": 0.361121589337357, + "grad_norm": 0.3708480771834063, + "kl": 0.08710098266601562, + "learning_rate": 2.659259259259259e-07, + "loss": 0.0001, + "reward": 1.84285718947649, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8428571671247482, + "rewards/format_reward_func": 1.0, + "step": 2154 + }, + { + "completion_length": 235.52679443359375, + "epoch": 0.3614568925772245, + "grad_norm": 0.1637799996002816, + "kl": 0.05851173400878906, + "learning_rate": 2.661728395061728e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 2156 + }, + { + "completion_length": 226.90626049041748, + "epoch": 0.3617921958170921, + "grad_norm": 0.3360529398155563, + "kl": 0.021284103393554688, + "learning_rate": 2.6641975308641973e-07, + "loss": 0.0, + "reward": 1.7821429371833801, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 2158 + }, + { + "completion_length": 233.8973331451416, + "epoch": 0.3621274990569596, + "grad_norm": 0.31128516923824884, + "kl": 0.15531539916992188, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.0002, + "reward": 1.764285758137703, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 2160 + }, + { + "completion_length": 223.227689743042, + "epoch": 0.3624628022968272, + "grad_norm": 0.23111936609267744, + "kl": 0.038791656494140625, + "learning_rate": 2.6691358024691356e-07, + "loss": 0.0, + "reward": 1.817857176065445, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 2162 + }, + { + "completion_length": 221.3035831451416, + "epoch": 0.3627981055366947, + "grad_norm": 0.4127253192986052, + "kl": 0.04503440856933594, + "learning_rate": 2.671604938271605e-07, + "loss": 0.0, + "reward": 1.7821429446339607, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 2164 + }, + { + "completion_length": 221.6428680419922, + "epoch": 0.3631334087765623, + "grad_norm": 0.223704776342276, + "kl": 0.061389923095703125, + "learning_rate": 2.674074074074074e-07, + "loss": 0.0001, + "reward": 1.7428572252392769, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571712225676, + "rewards/format_reward_func": 1.0, + "step": 2166 + }, + { + "completion_length": 228.24108409881592, + "epoch": 0.3634687120164299, + "grad_norm": 0.22441233228251525, + "kl": 0.03566169738769531, + "learning_rate": 2.6765432098765433e-07, + "loss": 0.0, + "reward": 1.7410714998841286, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2168 + }, + { + "completion_length": 225.6830472946167, + "epoch": 0.3638040152562974, + "grad_norm": 0.13680117104986803, + "kl": 0.015272140502929688, + "learning_rate": 2.679012345679012e-07, + "loss": 0.0, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 2170 + }, + { + "completion_length": 222.26340293884277, + "epoch": 0.364139318496165, + "grad_norm": 0.26671750198225047, + "kl": 0.02013397216796875, + "learning_rate": 2.681481481481481e-07, + "loss": 0.0, + "reward": 1.7214286550879478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 2172 + }, + { + "completion_length": 221.37947463989258, + "epoch": 0.3644746217360325, + "grad_norm": 0.2374098871508374, + "kl": 0.06538772583007812, + "learning_rate": 2.6839506172839505e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 2174 + }, + { + "completion_length": 237.63393878936768, + "epoch": 0.3648099249759001, + "grad_norm": 0.2467816451908553, + "kl": 0.03937530517578125, + "learning_rate": 2.68641975308642e-07, + "loss": 0.0, + "reward": 1.7446429431438446, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2176 + }, + { + "completion_length": 230.14733123779297, + "epoch": 0.3651452282157676, + "grad_norm": 0.3179348050690457, + "kl": 0.01219940185546875, + "learning_rate": 2.688888888888889e-07, + "loss": 0.0, + "reward": 1.8178572207689285, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.8178571499884129, + "rewards/format_reward_func": 1.0, + "step": 2178 + }, + { + "completion_length": 228.75447368621826, + "epoch": 0.3654805314556352, + "grad_norm": 0.2525358576089782, + "kl": 0.02394866943359375, + "learning_rate": 2.691358024691358e-07, + "loss": 0.0, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 2180 + }, + { + "completion_length": 222.07590293884277, + "epoch": 0.3658158346955028, + "grad_norm": 0.2556722238310013, + "kl": 0.022491455078125, + "learning_rate": 2.693827160493827e-07, + "loss": 0.0, + "reward": 1.6678572222590446, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6678571812808514, + "rewards/format_reward_func": 1.0, + "step": 2182 + }, + { + "completion_length": 236.40179634094238, + "epoch": 0.3661511379353703, + "grad_norm": 0.29412853954255963, + "kl": 0.07938003540039062, + "learning_rate": 2.696296296296296e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571429051458836, + "rewards/format_reward_func": 1.0, + "step": 2184 + }, + { + "completion_length": 234.1428689956665, + "epoch": 0.3664864411752379, + "grad_norm": 0.2060815902787529, + "kl": 0.10279083251953125, + "learning_rate": 2.6987654320987654e-07, + "loss": 0.0001, + "reward": 1.698214367032051, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7026786133646965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2186 + }, + { + "completion_length": 227.27233219146729, + "epoch": 0.3668217444151054, + "grad_norm": 0.3214990798853585, + "kl": 0.06783485412597656, + "learning_rate": 2.7012345679012343e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 2188 + }, + { + "completion_length": 232.0803680419922, + "epoch": 0.367157047654973, + "grad_norm": 0.29116456137569957, + "kl": 0.06546974182128906, + "learning_rate": 2.7037037037037037e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 2190 + }, + { + "completion_length": 236.8259048461914, + "epoch": 0.3674923508948405, + "grad_norm": 0.275873402245631, + "kl": 0.3006572723388672, + "learning_rate": 2.7061728395061726e-07, + "loss": 0.0003, + "reward": 1.7125000655651093, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7169643267989159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2192 + }, + { + "completion_length": 235.76786613464355, + "epoch": 0.3678276541347081, + "grad_norm": 0.37498091593844435, + "kl": 0.12526512145996094, + "learning_rate": 2.708641975308642e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 2194 + }, + { + "completion_length": 238.69197463989258, + "epoch": 0.3681629573745756, + "grad_norm": 0.19498387354928298, + "kl": 0.19458770751953125, + "learning_rate": 2.7111111111111114e-07, + "loss": 0.0002, + "reward": 1.7267857939004898, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2196 + }, + { + "completion_length": 232.6339406967163, + "epoch": 0.3684982606144432, + "grad_norm": 0.16930298305774352, + "kl": 0.04517364501953125, + "learning_rate": 2.71358024691358e-07, + "loss": 0.0, + "reward": 1.7821429148316383, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2198 + }, + { + "completion_length": 225.65179538726807, + "epoch": 0.36883356385431076, + "grad_norm": 0.2732590185925259, + "kl": 0.09507369995117188, + "learning_rate": 2.716049382716049e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571697324514, + "rewards/format_reward_func": 1.0, + "step": 2200 + }, + { + "completion_length": 220.92858028411865, + "epoch": 0.3691688670941783, + "grad_norm": 0.3480826700798807, + "kl": 0.2006244659423828, + "learning_rate": 2.7185185185185186e-07, + "loss": 0.0002, + "reward": 1.7571429386734962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 2202 + }, + { + "completion_length": 228.95983028411865, + "epoch": 0.36950417033404587, + "grad_norm": 0.23230064612406418, + "kl": 0.09466743469238281, + "learning_rate": 2.7209876543209875e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2204 + }, + { + "completion_length": 225.86161613464355, + "epoch": 0.3698394735739134, + "grad_norm": 0.37423496124042877, + "kl": 0.07233428955078125, + "learning_rate": 2.723456790123457e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7000000383704901, + "rewards/format_reward_func": 1.0, + "step": 2206 + }, + { + "completion_length": 235.37501049041748, + "epoch": 0.37017477681378097, + "grad_norm": 0.20144566733771846, + "kl": 0.08749961853027344, + "learning_rate": 2.725925925925926e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2208 + }, + { + "completion_length": 231.55804538726807, + "epoch": 0.3705100800536485, + "grad_norm": 0.9344322170031661, + "kl": 0.2306499481201172, + "learning_rate": 2.728395061728395e-07, + "loss": 0.0002, + "reward": 1.7214286476373672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214286029338837, + "rewards/format_reward_func": 1.0, + "step": 2210 + }, + { + "completion_length": 232.2946538925171, + "epoch": 0.3708453832935161, + "grad_norm": 0.1253578302593943, + "kl": 0.07514762878417969, + "learning_rate": 2.730864197530864e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.753571467474103, + "rewards/format_reward_func": 1.0, + "step": 2212 + }, + { + "completion_length": 232.62054824829102, + "epoch": 0.37118068653338365, + "grad_norm": 0.20158043710907292, + "kl": 0.1405200958251953, + "learning_rate": 2.733333333333333e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7589286118745804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2214 + }, + { + "completion_length": 231.17411518096924, + "epoch": 0.3715159897732512, + "grad_norm": 0.5074480366301922, + "kl": 0.08559036254882812, + "learning_rate": 2.7358024691358023e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 2216 + }, + { + "completion_length": 228.22768783569336, + "epoch": 0.37185129301311876, + "grad_norm": 0.16069692799159083, + "kl": 0.05332183837890625, + "learning_rate": 2.738271604938271e-07, + "loss": 0.0001, + "reward": 1.7714286595582962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 2218 + }, + { + "completion_length": 233.99108219146729, + "epoch": 0.3721865962529863, + "grad_norm": 0.45528804532488343, + "kl": 0.2368755340576172, + "learning_rate": 2.7407407407407406e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 2220 + }, + { + "completion_length": 226.79465198516846, + "epoch": 0.37252189949285386, + "grad_norm": 0.4392680631342267, + "kl": 0.0537261962890625, + "learning_rate": 2.74320987654321e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 2222 + }, + { + "completion_length": 240.25000858306885, + "epoch": 0.3728572027327214, + "grad_norm": 0.42132753666272715, + "kl": 0.18262672424316406, + "learning_rate": 2.745679012345679e-07, + "loss": 0.0002, + "reward": 1.7696429044008255, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2224 + }, + { + "completion_length": 227.9419755935669, + "epoch": 0.37319250597258896, + "grad_norm": 0.29602895586363986, + "kl": 0.07367324829101562, + "learning_rate": 2.748148148148148e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 2226 + }, + { + "completion_length": 222.40179443359375, + "epoch": 0.37352780921245654, + "grad_norm": 0.18568951191388172, + "kl": 0.08404731750488281, + "learning_rate": 2.750617283950617e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 1.0, + "step": 2228 + }, + { + "completion_length": 230.79018878936768, + "epoch": 0.37386311245232406, + "grad_norm": 0.2065159293766093, + "kl": 0.041645050048828125, + "learning_rate": 2.753086419753086e-07, + "loss": 0.0, + "reward": 1.796428643167019, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 2230 + }, + { + "completion_length": 224.0625114440918, + "epoch": 0.37419841569219164, + "grad_norm": 0.33197065162865186, + "kl": 0.0833282470703125, + "learning_rate": 2.7555555555555555e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 2232 + }, + { + "completion_length": 235.4776906967163, + "epoch": 0.37453371893205917, + "grad_norm": 0.25584059088231476, + "kl": 0.040119171142578125, + "learning_rate": 2.7580246913580244e-07, + "loss": 0.0, + "reward": 1.7107143625617027, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7107143253087997, + "rewards/format_reward_func": 1.0, + "step": 2234 + }, + { + "completion_length": 231.67411708831787, + "epoch": 0.37486902217192675, + "grad_norm": 0.35658627996828857, + "kl": 0.6641654968261719, + "learning_rate": 2.760493827160494e-07, + "loss": 0.0007, + "reward": 1.7589286491274834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2236 + }, + { + "completion_length": 231.4196548461914, + "epoch": 0.37520432541179427, + "grad_norm": 0.36641847714887776, + "kl": 0.057590484619140625, + "learning_rate": 2.762962962962963e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 2238 + }, + { + "completion_length": 228.5223331451416, + "epoch": 0.37553962865166185, + "grad_norm": 0.24546364124161799, + "kl": 0.059139251708984375, + "learning_rate": 2.7654320987654316e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928898721933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2240 + }, + { + "completion_length": 233.8750123977661, + "epoch": 0.37587493189152943, + "grad_norm": 0.2251870227230481, + "kl": 0.18067550659179688, + "learning_rate": 2.767901234567901e-07, + "loss": 0.0002, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2242 + }, + { + "completion_length": 225.83036613464355, + "epoch": 0.37621023513139695, + "grad_norm": 0.18177423931815112, + "kl": 0.13021469116210938, + "learning_rate": 2.77037037037037e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428898721933, + "rewards/format_reward_func": 1.0, + "step": 2244 + }, + { + "completion_length": 224.49108028411865, + "epoch": 0.37654553837126453, + "grad_norm": 0.156748013206083, + "kl": 0.1608562469482422, + "learning_rate": 2.7728395061728393e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 2246 + }, + { + "completion_length": 236.4464406967163, + "epoch": 0.37688084161113206, + "grad_norm": 0.25260715945203926, + "kl": 0.2948493957519531, + "learning_rate": 2.7753086419753087e-07, + "loss": 0.0003, + "reward": 1.6928572282195091, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.6928571742027998, + "rewards/format_reward_func": 1.0, + "step": 2248 + }, + { + "completion_length": 235.70983409881592, + "epoch": 0.37721614485099964, + "grad_norm": 0.17868279769604858, + "kl": 0.021900177001953125, + "learning_rate": 2.7777777777777776e-07, + "loss": 0.0, + "reward": 1.7464286535978317, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285902678967, + "rewards/format_reward_func": 1.0, + "step": 2250 + }, + { + "completion_length": 228.9509038925171, + "epoch": 0.37755144809086716, + "grad_norm": 0.24309000324636804, + "kl": 0.2515907287597656, + "learning_rate": 2.780246913580247e-07, + "loss": 0.0003, + "reward": 1.7696429267525673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2252 + }, + { + "completion_length": 226.20983219146729, + "epoch": 0.37788675133073474, + "grad_norm": 0.2029313002323469, + "kl": 0.1733074188232422, + "learning_rate": 2.782716049382716e-07, + "loss": 0.0002, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 2254 + }, + { + "completion_length": 228.9732265472412, + "epoch": 0.37822205457060226, + "grad_norm": 0.3040059058343939, + "kl": 0.12060165405273438, + "learning_rate": 2.785185185185185e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 2256 + }, + { + "completion_length": 233.83036518096924, + "epoch": 0.37855735781046984, + "grad_norm": 0.3822294853412589, + "kl": 0.17465782165527344, + "learning_rate": 2.787654320987654e-07, + "loss": 0.0002, + "reward": 1.7446429133415222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2258 + }, + { + "completion_length": 228.77233028411865, + "epoch": 0.3788926610503374, + "grad_norm": 0.3139393157242552, + "kl": 0.5619621276855469, + "learning_rate": 2.790123456790123e-07, + "loss": 0.0006, + "reward": 1.7410714849829674, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2260 + }, + { + "completion_length": 228.11608123779297, + "epoch": 0.37922796429020494, + "grad_norm": 0.1977537144990948, + "kl": 0.09502792358398438, + "learning_rate": 2.7925925925925925e-07, + "loss": 0.0001, + "reward": 1.7000000923871994, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.700000025331974, + "rewards/format_reward_func": 1.0, + "step": 2262 + }, + { + "completion_length": 231.3035831451416, + "epoch": 0.3795632675300725, + "grad_norm": 0.262065947559945, + "kl": 0.09780120849609375, + "learning_rate": 2.795061728395062e-07, + "loss": 0.0001, + "reward": 1.7035714983940125, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714611411095, + "rewards/format_reward_func": 1.0, + "step": 2264 + }, + { + "completion_length": 230.77679634094238, + "epoch": 0.37989857076994005, + "grad_norm": 0.37533314523360195, + "kl": 0.018627166748046875, + "learning_rate": 2.797530864197531e-07, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 2266 + }, + { + "completion_length": 229.883939743042, + "epoch": 0.3802338740098076, + "grad_norm": 0.2793404214719423, + "kl": 0.12833023071289062, + "learning_rate": 2.8e-07, + "loss": 0.0001, + "reward": 1.7553571835160255, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2268 + }, + { + "completion_length": 223.883939743042, + "epoch": 0.38056917724967515, + "grad_norm": 0.12149878381355204, + "kl": 0.021205902099609375, + "learning_rate": 2.8024691358024685e-07, + "loss": 0.0, + "reward": 1.7714286148548126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286148548126, + "rewards/format_reward_func": 1.0, + "step": 2270 + }, + { + "completion_length": 225.12947463989258, + "epoch": 0.38090448048954273, + "grad_norm": 0.20368255516424766, + "kl": 0.05774688720703125, + "learning_rate": 2.804938271604938e-07, + "loss": 0.0001, + "reward": 1.6821429431438446, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6821428947150707, + "rewards/format_reward_func": 1.0, + "step": 2272 + }, + { + "completion_length": 229.54465293884277, + "epoch": 0.3812397837294103, + "grad_norm": 0.2472125611890382, + "kl": 0.2638816833496094, + "learning_rate": 2.8074074074074074e-07, + "loss": 0.0003, + "reward": 1.7392857819795609, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143081724644, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2274 + }, + { + "completion_length": 224.26340293884277, + "epoch": 0.38157508696927783, + "grad_norm": 0.33149997659240854, + "kl": 0.07962608337402344, + "learning_rate": 2.809876543209876e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 2276 + }, + { + "completion_length": 227.98661708831787, + "epoch": 0.3819103902091454, + "grad_norm": 0.3172056878755379, + "kl": 0.09324264526367188, + "learning_rate": 2.8123456790123457e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 2278 + }, + { + "completion_length": 225.8660831451416, + "epoch": 0.38224569344901294, + "grad_norm": 0.5301463048688589, + "kl": 0.4278106689453125, + "learning_rate": 2.814814814814815e-07, + "loss": 0.0004, + "reward": 1.74642863124609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 1.0, + "step": 2280 + }, + { + "completion_length": 229.35715293884277, + "epoch": 0.3825809966888805, + "grad_norm": 0.38356873327089946, + "kl": 0.5712699890136719, + "learning_rate": 2.817283950617284e-07, + "loss": 0.0006, + "reward": 1.7267857864499092, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.731250049546361, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2282 + }, + { + "completion_length": 235.3884038925171, + "epoch": 0.38291629992874804, + "grad_norm": 0.934076897552111, + "kl": 0.042568206787109375, + "learning_rate": 2.819753086419753e-07, + "loss": 0.0, + "reward": 1.7892857566475868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 2284 + }, + { + "completion_length": 227.19643878936768, + "epoch": 0.3832516031686156, + "grad_norm": 0.16375574573485474, + "kl": 0.04029083251953125, + "learning_rate": 2.8222222222222217e-07, + "loss": 0.0, + "reward": 1.789285771548748, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 2286 + }, + { + "completion_length": 227.34375953674316, + "epoch": 0.3835869064084832, + "grad_norm": 0.3301045316257568, + "kl": 0.05418968200683594, + "learning_rate": 2.824691358024691e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 2288 + }, + { + "completion_length": 224.05804443359375, + "epoch": 0.3839222096483507, + "grad_norm": 0.2926858428633974, + "kl": 0.021707534790039062, + "learning_rate": 2.8271604938271606e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7732143104076385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2290 + }, + { + "completion_length": 230.9553689956665, + "epoch": 0.3842575128882183, + "grad_norm": 0.32705642260561363, + "kl": 0.13091659545898438, + "learning_rate": 2.8296296296296294e-07, + "loss": 0.0001, + "reward": 1.6982143595814705, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7026786096394062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2292 + }, + { + "completion_length": 229.50447273254395, + "epoch": 0.3845928161280858, + "grad_norm": 0.21564649196585978, + "kl": 0.039546966552734375, + "learning_rate": 2.832098765432099e-07, + "loss": 0.0, + "reward": 1.7464286759495735, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 2294 + }, + { + "completion_length": 219.53125953674316, + "epoch": 0.3849281193679534, + "grad_norm": 0.26692613221581457, + "kl": 0.033473968505859375, + "learning_rate": 2.834567901234568e-07, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 2296 + }, + { + "completion_length": 233.65179538726807, + "epoch": 0.3852634226078209, + "grad_norm": 0.2678121392019473, + "kl": 0.06494903564453125, + "learning_rate": 2.8370370370370366e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714782774448, + "rewards/format_reward_func": 1.0, + "step": 2298 + }, + { + "completion_length": 228.22322368621826, + "epoch": 0.3855987258476885, + "grad_norm": 0.39709523911952316, + "kl": 0.040225982666015625, + "learning_rate": 2.839506172839506e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7339286096394062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2300 + }, + { + "completion_length": 217.65179634094238, + "epoch": 0.3859340290875561, + "grad_norm": 0.39208732176897304, + "kl": 0.064727783203125, + "learning_rate": 2.841975308641975e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 2302 + }, + { + "completion_length": 229.55804920196533, + "epoch": 0.3862693323274236, + "grad_norm": 0.31110812623321255, + "kl": 0.036376953125, + "learning_rate": 2.8444444444444443e-07, + "loss": 0.0, + "reward": 1.8035714998841286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714663565159, + "rewards/format_reward_func": 1.0, + "step": 2304 + }, + { + "completion_length": 227.11161708831787, + "epoch": 0.3866046355672912, + "grad_norm": 0.2816325042184154, + "kl": 0.028156280517578125, + "learning_rate": 2.846913580246914e-07, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 2306 + }, + { + "completion_length": 216.4241189956665, + "epoch": 0.3869399388071587, + "grad_norm": 0.14324961565346786, + "kl": 0.02130126953125, + "learning_rate": 2.8493827160493826e-07, + "loss": 0.0, + "reward": 1.821428619325161, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.821428582072258, + "rewards/format_reward_func": 1.0, + "step": 2308 + }, + { + "completion_length": 221.38393878936768, + "epoch": 0.3872752420470263, + "grad_norm": 0.26443223145301653, + "kl": 0.026519775390625, + "learning_rate": 2.851851851851852e-07, + "loss": 0.0, + "reward": 1.742857187986374, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 2310 + }, + { + "completion_length": 234.01340293884277, + "epoch": 0.3876105452868938, + "grad_norm": 0.16721530630211082, + "kl": 0.02655792236328125, + "learning_rate": 2.8543209876543204e-07, + "loss": 0.0, + "reward": 1.7142857909202576, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.714285746216774, + "rewards/format_reward_func": 1.0, + "step": 2312 + }, + { + "completion_length": 236.1384048461914, + "epoch": 0.3879458485267614, + "grad_norm": 0.18018195259232184, + "kl": 0.025665283203125, + "learning_rate": 2.85679012345679e-07, + "loss": 0.0, + "reward": 1.7142857760190964, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857611179352, + "rewards/format_reward_func": 1.0, + "step": 2314 + }, + { + "completion_length": 231.79465293884277, + "epoch": 0.3882811517666289, + "grad_norm": 0.2801403066500493, + "kl": 0.034091949462890625, + "learning_rate": 2.859259259259259e-07, + "loss": 0.0, + "reward": 1.7464286535978317, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.746428620070219, + "rewards/format_reward_func": 1.0, + "step": 2316 + }, + { + "completion_length": 222.54465103149414, + "epoch": 0.3886164550064965, + "grad_norm": 0.19240949356448653, + "kl": 0.026641845703125, + "learning_rate": 2.861728395061728e-07, + "loss": 0.0, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 2318 + }, + { + "completion_length": 227.81251335144043, + "epoch": 0.3889517582463641, + "grad_norm": 0.14613030642705596, + "kl": 0.030071258544921875, + "learning_rate": 2.8641975308641975e-07, + "loss": 0.0, + "reward": 1.7035714983940125, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7035714592784643, + "rewards/format_reward_func": 1.0, + "step": 2320 + }, + { + "completion_length": 227.46876049041748, + "epoch": 0.3892870614862316, + "grad_norm": 0.39846413926302476, + "kl": 0.030303955078125, + "learning_rate": 2.866666666666667e-07, + "loss": 0.0, + "reward": 1.7767857685685158, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2322 + }, + { + "completion_length": 221.89733123779297, + "epoch": 0.3896223647260992, + "grad_norm": 0.41518256668271264, + "kl": 0.040134429931640625, + "learning_rate": 2.869135802469136e-07, + "loss": 0.0, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 2324 + }, + { + "completion_length": 234.54465579986572, + "epoch": 0.3899576679659667, + "grad_norm": 0.4979212066521887, + "kl": 0.04520416259765625, + "learning_rate": 2.8716049382716047e-07, + "loss": 0.0, + "reward": 1.7500000596046448, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 2326 + }, + { + "completion_length": 225.66518783569336, + "epoch": 0.3902929712058343, + "grad_norm": 0.68471046288006, + "kl": 0.06603240966796875, + "learning_rate": 2.8740740740740736e-07, + "loss": 0.0001, + "reward": 1.7232143580913544, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276786118745804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2328 + }, + { + "completion_length": 230.5134038925171, + "epoch": 0.3906282744457018, + "grad_norm": 0.1734750680599172, + "kl": 0.022487640380859375, + "learning_rate": 2.876543209876543e-07, + "loss": 0.0, + "reward": 1.7642857879400253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 2330 + }, + { + "completion_length": 226.7009038925171, + "epoch": 0.3909635776855694, + "grad_norm": 0.17619003683755471, + "kl": 0.052631378173828125, + "learning_rate": 2.8790123456790124e-07, + "loss": 0.0001, + "reward": 1.6821429207921028, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.682142898440361, + "rewards/format_reward_func": 1.0, + "step": 2332 + }, + { + "completion_length": 223.3928680419922, + "epoch": 0.39129888092543696, + "grad_norm": 0.2130792522496975, + "kl": 0.0273590087890625, + "learning_rate": 2.8814814814814813e-07, + "loss": 0.0, + "reward": 1.7625000849366188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2334 + }, + { + "completion_length": 230.91518878936768, + "epoch": 0.3916341841653045, + "grad_norm": 0.16563670418519555, + "kl": 0.0242462158203125, + "learning_rate": 2.8839506172839507e-07, + "loss": 0.0, + "reward": 1.7607143223285675, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 2336 + }, + { + "completion_length": 223.83036708831787, + "epoch": 0.39196948740517207, + "grad_norm": 0.2537452887729266, + "kl": 0.0279083251953125, + "learning_rate": 2.8864197530864196e-07, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 2338 + }, + { + "completion_length": 233.96876430511475, + "epoch": 0.3923047906450396, + "grad_norm": 0.25310078074433373, + "kl": 0.04262542724609375, + "learning_rate": 2.8888888888888885e-07, + "loss": 0.0, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 2340 + }, + { + "completion_length": 231.4732255935669, + "epoch": 0.39264009388490717, + "grad_norm": 0.6108374179336262, + "kl": 0.07244873046875, + "learning_rate": 2.891358024691358e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000096857548, + "rewards/format_reward_func": 1.0, + "step": 2342 + }, + { + "completion_length": 230.8437614440918, + "epoch": 0.3929753971247747, + "grad_norm": 0.20767365117804806, + "kl": 0.031772613525390625, + "learning_rate": 2.893827160493827e-07, + "loss": 0.0, + "reward": 1.6892857998609543, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6892857402563095, + "rewards/format_reward_func": 1.0, + "step": 2344 + }, + { + "completion_length": 229.22768783569336, + "epoch": 0.3933107003646423, + "grad_norm": 0.18029574615767352, + "kl": 0.055999755859375, + "learning_rate": 2.896296296296296e-07, + "loss": 0.0001, + "reward": 1.7357143759727478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 2346 + }, + { + "completion_length": 231.33483123779297, + "epoch": 0.39364600360450985, + "grad_norm": 0.174887596588021, + "kl": 0.020839691162109375, + "learning_rate": 2.8987654320987656e-07, + "loss": 0.0, + "reward": 1.7142857983708382, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7142857443541288, + "rewards/format_reward_func": 1.0, + "step": 2348 + }, + { + "completion_length": 239.0803680419922, + "epoch": 0.3939813068443774, + "grad_norm": 0.24591155442162888, + "kl": 0.028659820556640625, + "learning_rate": 2.9012345679012345e-07, + "loss": 0.0, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 2350 + }, + { + "completion_length": 232.0580472946167, + "epoch": 0.39431661008424496, + "grad_norm": 0.16364288828454512, + "kl": 0.031280517578125, + "learning_rate": 2.903703703703704e-07, + "loss": 0.0, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571823984385, + "rewards/format_reward_func": 1.0, + "step": 2352 + }, + { + "completion_length": 238.96429634094238, + "epoch": 0.3946519133241125, + "grad_norm": 0.25228292667784574, + "kl": 0.03216552734375, + "learning_rate": 2.906172839506173e-07, + "loss": 0.0, + "reward": 1.7625000700354576, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2354 + }, + { + "completion_length": 237.01786994934082, + "epoch": 0.39498721656398006, + "grad_norm": 0.2681810554388981, + "kl": 0.0478515625, + "learning_rate": 2.9086419753086416e-07, + "loss": 0.0, + "reward": 1.7214286476373672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 2356 + }, + { + "completion_length": 231.8660831451416, + "epoch": 0.3953225198038476, + "grad_norm": 0.2985047069341516, + "kl": 0.048858642578125, + "learning_rate": 2.911111111111111e-07, + "loss": 0.0, + "reward": 1.7732143327593803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776785958558321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2358 + }, + { + "completion_length": 240.35268878936768, + "epoch": 0.39565782304371516, + "grad_norm": 0.2855045435472761, + "kl": 0.023204803466796875, + "learning_rate": 2.91358024691358e-07, + "loss": 0.0, + "reward": 1.7428571954369545, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7517857439815998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2360 + }, + { + "completion_length": 237.57143878936768, + "epoch": 0.39599312628358274, + "grad_norm": 0.290504962458472, + "kl": 0.030864715576171875, + "learning_rate": 2.9160493827160494e-07, + "loss": 0.0, + "reward": 1.7607143744826317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 2362 + }, + { + "completion_length": 231.0669755935669, + "epoch": 0.39632842952345027, + "grad_norm": 0.3066050981425015, + "kl": 0.025148391723632812, + "learning_rate": 2.918518518518518e-07, + "loss": 0.0, + "reward": 1.791071504354477, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2364 + }, + { + "completion_length": 238.04465579986572, + "epoch": 0.39666373276331784, + "grad_norm": 0.1959750175843398, + "kl": 0.034114837646484375, + "learning_rate": 2.9209876543209877e-07, + "loss": 0.0, + "reward": 1.6928572431206703, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.692857164889574, + "rewards/format_reward_func": 1.0, + "step": 2366 + }, + { + "completion_length": 224.20090293884277, + "epoch": 0.39699903600318537, + "grad_norm": 0.18280818205958912, + "kl": 0.0224151611328125, + "learning_rate": 2.923456790123457e-07, + "loss": 0.0, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 2368 + }, + { + "completion_length": 228.18750953674316, + "epoch": 0.39733433924305295, + "grad_norm": 0.40446503207075296, + "kl": 0.059787750244140625, + "learning_rate": 2.9259259259259254e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 2370 + }, + { + "completion_length": 238.78572750091553, + "epoch": 0.39766964248292047, + "grad_norm": 0.3511749525167252, + "kl": 0.02414703369140625, + "learning_rate": 2.928395061728395e-07, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 2372 + }, + { + "completion_length": 250.41518783569336, + "epoch": 0.39800494572278805, + "grad_norm": 0.4279324335140687, + "kl": 0.04547119140625, + "learning_rate": 2.930864197530864e-07, + "loss": 0.0, + "reward": 1.712500087916851, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7169643044471741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2374 + }, + { + "completion_length": 233.25893878936768, + "epoch": 0.3983402489626556, + "grad_norm": 0.3870702027447503, + "kl": 0.03582000732421875, + "learning_rate": 2.933333333333333e-07, + "loss": 0.0, + "reward": 1.6857143640518188, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6857143249362707, + "rewards/format_reward_func": 1.0, + "step": 2376 + }, + { + "completion_length": 243.75000953674316, + "epoch": 0.39867555220252315, + "grad_norm": 0.2756062262293934, + "kl": 0.03237152099609375, + "learning_rate": 2.9358024691358025e-07, + "loss": 0.0, + "reward": 1.7607143744826317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 2378 + }, + { + "completion_length": 232.4821538925171, + "epoch": 0.39901085544239073, + "grad_norm": 0.21033752055447466, + "kl": 0.022663116455078125, + "learning_rate": 2.9382716049382714e-07, + "loss": 0.0, + "reward": 1.7017858177423477, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7062500212341547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2380 + }, + { + "completion_length": 233.0535831451416, + "epoch": 0.39934615868225826, + "grad_norm": 0.30371212478863435, + "kl": 0.02435302734375, + "learning_rate": 2.940740740740741e-07, + "loss": 0.0, + "reward": 1.7821429446339607, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 2382 + }, + { + "completion_length": 233.5491180419922, + "epoch": 0.39968146192212584, + "grad_norm": 0.29400370741808635, + "kl": 0.0242462158203125, + "learning_rate": 2.9432098765432097e-07, + "loss": 0.0, + "reward": 1.7357143387198448, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143182307482, + "rewards/format_reward_func": 1.0, + "step": 2384 + }, + { + "completion_length": 235.47768878936768, + "epoch": 0.40001676516199336, + "grad_norm": 0.24495863289215739, + "kl": 0.0254669189453125, + "learning_rate": 2.9456790123456786e-07, + "loss": 0.0, + "reward": 1.7607143372297287, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7696428783237934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2386 + }, + { + "completion_length": 234.45090293884277, + "epoch": 0.40035206840186094, + "grad_norm": 0.18587902604819076, + "kl": 0.0247650146484375, + "learning_rate": 2.948148148148148e-07, + "loss": 0.0, + "reward": 1.807142898440361, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.807142898440361, + "rewards/format_reward_func": 1.0, + "step": 2388 + }, + { + "completion_length": 233.30358409881592, + "epoch": 0.40068737164172846, + "grad_norm": 0.41121728830485704, + "kl": 0.036991119384765625, + "learning_rate": 2.950617283950617e-07, + "loss": 0.0, + "reward": 1.7928572073578835, + "reward_std": 0.07071067485958338, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 2390 + }, + { + "completion_length": 234.9732255935669, + "epoch": 0.40102267488159604, + "grad_norm": 0.2577178462756106, + "kl": 0.027858734130859375, + "learning_rate": 2.9530864197530863e-07, + "loss": 0.0, + "reward": 1.839285746216774, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.839285746216774, + "rewards/format_reward_func": 1.0, + "step": 2392 + }, + { + "completion_length": 233.92411994934082, + "epoch": 0.4013579781214636, + "grad_norm": 0.08764588825744794, + "kl": 0.02735137939453125, + "learning_rate": 2.9555555555555557e-07, + "loss": 0.0, + "reward": 1.8035714700818062, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 2394 + }, + { + "completion_length": 238.86161708831787, + "epoch": 0.40169328136133114, + "grad_norm": 0.1686954172881697, + "kl": 0.026401519775390625, + "learning_rate": 2.9580246913580246e-07, + "loss": 0.0, + "reward": 1.7428571954369545, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7517857570201159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2396 + }, + { + "completion_length": 240.46875953674316, + "epoch": 0.4020285846011987, + "grad_norm": 0.28063510826087984, + "kl": 0.034069061279296875, + "learning_rate": 2.9604938271604935e-07, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 2398 + }, + { + "completion_length": 243.3928680419922, + "epoch": 0.40236388784106625, + "grad_norm": 0.4889857363092545, + "kl": 0.03224945068359375, + "learning_rate": 2.962962962962963e-07, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.762500025331974, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2400 + }, + { + "completion_length": 236.2500123977661, + "epoch": 0.4026991910809338, + "grad_norm": 0.10869538161812134, + "kl": 0.03144073486328125, + "learning_rate": 2.965432098765432e-07, + "loss": 0.0, + "reward": 1.7178572118282318, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571708500385, + "rewards/format_reward_func": 1.0, + "step": 2402 + }, + { + "completion_length": 244.38840579986572, + "epoch": 0.40303449432080135, + "grad_norm": 0.40208088618904964, + "kl": 0.033901214599609375, + "learning_rate": 2.967901234567901e-07, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7803571596741676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2404 + }, + { + "completion_length": 232.1964406967163, + "epoch": 0.40336979756066893, + "grad_norm": 0.290996152868013, + "kl": 0.026607513427734375, + "learning_rate": 2.97037037037037e-07, + "loss": 0.0, + "reward": 1.7875000461935997, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2406 + }, + { + "completion_length": 243.1785831451416, + "epoch": 0.4037051008005365, + "grad_norm": 0.17181050036006004, + "kl": 0.06661605834960938, + "learning_rate": 2.9728395061728395e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 2408 + }, + { + "completion_length": 239.33483219146729, + "epoch": 0.40404040404040403, + "grad_norm": 0.20456214359825808, + "kl": 0.06856155395507812, + "learning_rate": 2.975308641975309e-07, + "loss": 0.0001, + "reward": 1.7625000402331352, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2410 + }, + { + "completion_length": 236.35268878936768, + "epoch": 0.4043757072802716, + "grad_norm": 0.10846825466310321, + "kl": 0.02588653564453125, + "learning_rate": 2.9777777777777773e-07, + "loss": 0.0, + "reward": 1.8053571805357933, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8098214603960514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2412 + }, + { + "completion_length": 227.97768783569336, + "epoch": 0.40471101052013914, + "grad_norm": 0.3076350136465618, + "kl": 0.043216705322265625, + "learning_rate": 2.9802469135802467e-07, + "loss": 0.0, + "reward": 1.7839286103844643, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2414 + }, + { + "completion_length": 241.9509048461914, + "epoch": 0.4050463137600067, + "grad_norm": 0.22497192914816633, + "kl": 0.0537261962890625, + "learning_rate": 2.9827160493827156e-07, + "loss": 0.0001, + "reward": 1.6750000715255737, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.6839286033064127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2416 + }, + { + "completion_length": 237.3259048461914, + "epoch": 0.40538161699987424, + "grad_norm": 0.2892909531482391, + "kl": 0.0372314453125, + "learning_rate": 2.985185185185185e-07, + "loss": 0.0, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 2418 + }, + { + "completion_length": 248.46429920196533, + "epoch": 0.4057169202397418, + "grad_norm": 0.20048739142980532, + "kl": 0.025386810302734375, + "learning_rate": 2.9876543209876544e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 2420 + }, + { + "completion_length": 238.56250953674316, + "epoch": 0.4060522234796094, + "grad_norm": 0.3425412260910249, + "kl": 0.06465911865234375, + "learning_rate": 2.9901234567901233e-07, + "loss": 0.0001, + "reward": 1.783928632736206, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.7883928958326578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2422 + }, + { + "completion_length": 243.9151906967163, + "epoch": 0.4063875267194769, + "grad_norm": 0.2759994660120673, + "kl": 0.034027099609375, + "learning_rate": 2.9925925925925927e-07, + "loss": 0.0, + "reward": 1.703571505844593, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714723169804, + "rewards/format_reward_func": 1.0, + "step": 2424 + }, + { + "completion_length": 241.92858123779297, + "epoch": 0.4067228299593445, + "grad_norm": 0.21101376319229112, + "kl": 0.023448944091796875, + "learning_rate": 2.9950617283950616e-07, + "loss": 0.0, + "reward": 1.70000009983778, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7000000402331352, + "rewards/format_reward_func": 1.0, + "step": 2426 + }, + { + "completion_length": 239.5357265472412, + "epoch": 0.407058133199212, + "grad_norm": 0.1853824828635671, + "kl": 0.022472381591796875, + "learning_rate": 2.9975308641975305e-07, + "loss": 0.0, + "reward": 1.719642922282219, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071835160255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2428 + }, + { + "completion_length": 230.90625953674316, + "epoch": 0.4073934364390796, + "grad_norm": 0.10183301453604178, + "kl": 0.033657073974609375, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": 1.7553572058677673, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214540630579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2430 + }, + { + "completion_length": 242.81251049041748, + "epoch": 0.4077287396789471, + "grad_norm": 0.046270323750636944, + "kl": 0.024127960205078125, + "learning_rate": 3.002469135802469e-07, + "loss": 0.0, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071917116642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2432 + }, + { + "completion_length": 240.46875953674316, + "epoch": 0.4080640429188147, + "grad_norm": 0.3617812467572274, + "kl": 0.0596771240234375, + "learning_rate": 3.004938271604938e-07, + "loss": 0.0001, + "reward": 1.7482143640518188, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2434 + }, + { + "completion_length": 238.13840293884277, + "epoch": 0.4083993461586823, + "grad_norm": 0.474064872648244, + "kl": 0.042621612548828125, + "learning_rate": 3.0074074074074076e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857417464256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2436 + }, + { + "completion_length": 229.0267972946167, + "epoch": 0.4087346493985498, + "grad_norm": 0.25945331903624114, + "kl": 0.061847686767578125, + "learning_rate": 3.0098765432098765e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 2438 + }, + { + "completion_length": 246.94197845458984, + "epoch": 0.4090699526384174, + "grad_norm": 0.29150718449013036, + "kl": 0.19774627685546875, + "learning_rate": 3.012345679012346e-07, + "loss": 0.0002, + "reward": 1.7267857939004898, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7401785925030708, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2440 + }, + { + "completion_length": 243.5937623977661, + "epoch": 0.4094052558782849, + "grad_norm": 0.47884281229914716, + "kl": 0.08807373046875, + "learning_rate": 3.014814814814814e-07, + "loss": 0.0001, + "reward": 1.7017857804894447, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7151786088943481, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2442 + }, + { + "completion_length": 233.33036994934082, + "epoch": 0.4097405591181525, + "grad_norm": 0.39819654959775574, + "kl": 0.07984161376953125, + "learning_rate": 3.0172839506172836e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 2444 + }, + { + "completion_length": 247.2232265472412, + "epoch": 0.41007586235802, + "grad_norm": 0.2930877870059224, + "kl": 0.07854843139648438, + "learning_rate": 3.019753086419753e-07, + "loss": 0.0001, + "reward": 1.6589286550879478, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6633928846567869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2446 + }, + { + "completion_length": 242.20983219146729, + "epoch": 0.4104111655978876, + "grad_norm": 0.327769941363608, + "kl": 0.04544830322265625, + "learning_rate": 3.022222222222222e-07, + "loss": 0.0, + "reward": 1.6892857924103737, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.6982143186032772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2448 + }, + { + "completion_length": 234.95090293884277, + "epoch": 0.4107464688377551, + "grad_norm": 0.2819124888649559, + "kl": 0.103240966796875, + "learning_rate": 3.0246913580246913e-07, + "loss": 0.0001, + "reward": 1.710714340209961, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7196428794413805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2450 + }, + { + "completion_length": 238.6250114440918, + "epoch": 0.4110817720776227, + "grad_norm": 0.13389653861381598, + "kl": 0.024539947509765625, + "learning_rate": 3.027160493827161e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 2452 + }, + { + "completion_length": 249.633939743042, + "epoch": 0.4114170753174903, + "grad_norm": 0.2228403364562828, + "kl": 0.033233642578125, + "learning_rate": 3.0296296296296296e-07, + "loss": 0.0, + "reward": 1.7946429029107094, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2454 + }, + { + "completion_length": 239.16519260406494, + "epoch": 0.4117523785573578, + "grad_norm": 0.29195230568320907, + "kl": 0.02500152587890625, + "learning_rate": 3.0320987654320985e-07, + "loss": 0.0, + "reward": 1.8071429133415222, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 2456 + }, + { + "completion_length": 243.6875114440918, + "epoch": 0.4120876817972254, + "grad_norm": 0.17428188867849037, + "kl": 0.048297882080078125, + "learning_rate": 3.0345679012345674e-07, + "loss": 0.0, + "reward": 1.7571429535746574, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2458 + }, + { + "completion_length": 236.196439743042, + "epoch": 0.4124229850370929, + "grad_norm": 0.36680589539865105, + "kl": 0.061859130859375, + "learning_rate": 3.037037037037037e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7946428768336773, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2460 + }, + { + "completion_length": 241.49554824829102, + "epoch": 0.4127582882769605, + "grad_norm": 0.482576488789002, + "kl": 0.05983734130859375, + "learning_rate": 3.039506172839506e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.10354063659906387, + "rewards/equation_reward_func": 0.7758928947150707, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2462 + }, + { + "completion_length": 251.9509048461914, + "epoch": 0.413093591516828, + "grad_norm": 0.5102705709365054, + "kl": 0.23926925659179688, + "learning_rate": 3.041975308641975e-07, + "loss": 0.0002, + "reward": 1.7821429148316383, + "reward_std": 0.11616754066199064, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2464 + }, + { + "completion_length": 243.87054538726807, + "epoch": 0.4134288947566956, + "grad_norm": 0.6208793238312089, + "kl": 0.6168899536132812, + "learning_rate": 3.0444444444444445e-07, + "loss": 0.0006, + "reward": 1.78035718947649, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214507102966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2466 + }, + { + "completion_length": 230.75447463989258, + "epoch": 0.41376419799656317, + "grad_norm": 0.20628736823470975, + "kl": 0.09053802490234375, + "learning_rate": 3.046913580246914e-07, + "loss": 0.0001, + "reward": 1.8357143327593803, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8357142955064774, + "rewards/format_reward_func": 1.0, + "step": 2468 + }, + { + "completion_length": 247.08036708831787, + "epoch": 0.4140995012364307, + "grad_norm": 0.34160066484399093, + "kl": 0.14976119995117188, + "learning_rate": 3.0493827160493823e-07, + "loss": 0.0001, + "reward": 1.726785771548748, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2470 + }, + { + "completion_length": 240.43304824829102, + "epoch": 0.41443480447629827, + "grad_norm": 0.22342851163494104, + "kl": 0.24057769775390625, + "learning_rate": 3.0518518518518517e-07, + "loss": 0.0002, + "reward": 1.7160714864730835, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7205357383936644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2472 + }, + { + "completion_length": 241.9062623977661, + "epoch": 0.4147701077161658, + "grad_norm": 0.1707649901922291, + "kl": 0.0610198974609375, + "learning_rate": 3.0543209876543206e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643219560385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2474 + }, + { + "completion_length": 244.6607255935669, + "epoch": 0.41510541095603337, + "grad_norm": 0.28532945804476273, + "kl": 0.042633056640625, + "learning_rate": 3.05679012345679e-07, + "loss": 0.0, + "reward": 1.8250000402331352, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 2476 + }, + { + "completion_length": 246.70983409881592, + "epoch": 0.4154407141959009, + "grad_norm": 0.20875711813517592, + "kl": 0.06849288940429688, + "learning_rate": 3.0592592592592594e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 2478 + }, + { + "completion_length": 242.1428680419922, + "epoch": 0.4157760174357685, + "grad_norm": 0.16125349654244475, + "kl": 0.026233673095703125, + "learning_rate": 3.0617283950617283e-07, + "loss": 0.0, + "reward": 1.7321429252624512, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428768336773, + "rewards/format_reward_func": 1.0, + "step": 2480 + }, + { + "completion_length": 242.0625123977661, + "epoch": 0.41611132067563605, + "grad_norm": 0.24992527870915915, + "kl": 0.032123565673828125, + "learning_rate": 3.0641975308641977e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 2482 + }, + { + "completion_length": 251.56251335144043, + "epoch": 0.4164466239155036, + "grad_norm": 0.277668143360467, + "kl": 0.05127716064453125, + "learning_rate": 3.066666666666666e-07, + "loss": 0.0001, + "reward": 1.7000001072883606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7000000290572643, + "rewards/format_reward_func": 1.0, + "step": 2484 + }, + { + "completion_length": 244.11608123779297, + "epoch": 0.41678192715537116, + "grad_norm": 0.1966415408927424, + "kl": 0.033660888671875, + "learning_rate": 3.0691358024691355e-07, + "loss": 0.0, + "reward": 1.678571529686451, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.6785714775323868, + "rewards/format_reward_func": 1.0, + "step": 2486 + }, + { + "completion_length": 243.00000858306885, + "epoch": 0.4171172303952387, + "grad_norm": 0.4143451684580404, + "kl": 0.0799713134765625, + "learning_rate": 3.071604938271605e-07, + "loss": 0.0001, + "reward": 1.7665179148316383, + "reward_std": 0.06755394861102104, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 0.9986607171595097, + "step": 2488 + }, + { + "completion_length": 237.1473331451416, + "epoch": 0.41745253363510626, + "grad_norm": 0.26423638528297066, + "kl": 0.03952789306640625, + "learning_rate": 3.074074074074074e-07, + "loss": 0.0, + "reward": 1.814285784959793, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 2490 + }, + { + "completion_length": 244.64286708831787, + "epoch": 0.4177878368749738, + "grad_norm": 0.2312057942169059, + "kl": 0.04471588134765625, + "learning_rate": 3.076543209876543e-07, + "loss": 0.0, + "reward": 1.7593750730156898, + "reward_std": 0.027147849323228, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 0.9986607171595097, + "step": 2492 + }, + { + "completion_length": 246.00447273254395, + "epoch": 0.41812314011484136, + "grad_norm": 0.4950536770067875, + "kl": 0.04170989990234375, + "learning_rate": 3.0790123456790126e-07, + "loss": 0.0, + "reward": 1.7196429297327995, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071779280901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2494 + }, + { + "completion_length": 244.5491189956665, + "epoch": 0.41845844335470894, + "grad_norm": 0.32448324165354375, + "kl": 0.02623748779296875, + "learning_rate": 3.0814814814814815e-07, + "loss": 0.0, + "reward": 1.7821429446339607, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7910714540630579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2496 + }, + { + "completion_length": 247.46430015563965, + "epoch": 0.41879374659457647, + "grad_norm": 0.23471168214360996, + "kl": 0.0288848876953125, + "learning_rate": 3.0839506172839504e-07, + "loss": 0.0, + "reward": 1.7267857864499092, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7401786036789417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2498 + }, + { + "completion_length": 244.1384038925171, + "epoch": 0.41912904983444405, + "grad_norm": 0.2761535326936902, + "kl": 0.025539398193359375, + "learning_rate": 3.086419753086419e-07, + "loss": 0.0, + "reward": 1.783928632736206, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2500 + }, + { + "completion_length": 248.1651906967163, + "epoch": 0.41946435307431157, + "grad_norm": 0.2309364519086521, + "kl": 0.037647247314453125, + "learning_rate": 3.0888888888888887e-07, + "loss": 0.0, + "reward": 1.7642858028411865, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 2502 + }, + { + "completion_length": 250.14733219146729, + "epoch": 0.41979965631417915, + "grad_norm": 0.29956528286167283, + "kl": 0.07854461669921875, + "learning_rate": 3.091358024691358e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2504 + }, + { + "completion_length": 246.00893783569336, + "epoch": 0.42013495955404667, + "grad_norm": 0.3199713423763226, + "kl": 0.08681488037109375, + "learning_rate": 3.093827160493827e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7517857439815998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2506 + }, + { + "completion_length": 243.0134048461914, + "epoch": 0.42047026279391425, + "grad_norm": 0.11689773260946061, + "kl": 0.06568527221679688, + "learning_rate": 3.0962962962962964e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 2508 + }, + { + "completion_length": 240.65179920196533, + "epoch": 0.4208055660337818, + "grad_norm": 0.3331850418354822, + "kl": 0.0395355224609375, + "learning_rate": 3.098765432098765e-07, + "loss": 0.0, + "reward": 1.7553572058677673, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2510 + }, + { + "completion_length": 239.3125114440918, + "epoch": 0.42114086927364935, + "grad_norm": 0.24332880806945797, + "kl": 0.026187896728515625, + "learning_rate": 3.101234567901234e-07, + "loss": 0.0, + "reward": 1.771428644657135, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2512 + }, + { + "completion_length": 256.1294775009155, + "epoch": 0.42147617251351693, + "grad_norm": 0.2725284304146817, + "kl": 0.04680633544921875, + "learning_rate": 3.1037037037037036e-07, + "loss": 0.0, + "reward": 1.721428632736206, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214285973459482, + "rewards/format_reward_func": 1.0, + "step": 2514 + }, + { + "completion_length": 242.76340103149414, + "epoch": 0.42181147575338446, + "grad_norm": 0.4089842216356524, + "kl": 0.0549163818359375, + "learning_rate": 3.1061728395061724e-07, + "loss": 0.0001, + "reward": 1.700000062584877, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7000000327825546, + "rewards/format_reward_func": 1.0, + "step": 2516 + }, + { + "completion_length": 239.29465103149414, + "epoch": 0.42214677899325204, + "grad_norm": 0.22043840909969697, + "kl": 0.05002593994140625, + "learning_rate": 3.108641975308642e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000149011612, + "rewards/format_reward_func": 1.0, + "step": 2518 + }, + { + "completion_length": 243.62947368621826, + "epoch": 0.42248208223311956, + "grad_norm": 0.1826835616573898, + "kl": 0.0255279541015625, + "learning_rate": 3.111111111111111e-07, + "loss": 0.0, + "reward": 1.7714286670088768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 2520 + }, + { + "completion_length": 243.39287090301514, + "epoch": 0.42281738547298714, + "grad_norm": 0.216918530888218, + "kl": 0.035877227783203125, + "learning_rate": 3.11358024691358e-07, + "loss": 0.0, + "reward": 1.7750000730156898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 2522 + }, + { + "completion_length": 242.32590579986572, + "epoch": 0.42315268871285466, + "grad_norm": 0.16724150439793084, + "kl": 0.03157806396484375, + "learning_rate": 3.1160493827160496e-07, + "loss": 0.0, + "reward": 1.7928571999073029, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 2524 + }, + { + "completion_length": 244.15179634094238, + "epoch": 0.42348799195272224, + "grad_norm": 0.2824979326339329, + "kl": 0.04889678955078125, + "learning_rate": 3.118518518518518e-07, + "loss": 0.0, + "reward": 1.7571429461240768, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2526 + }, + { + "completion_length": 243.45536708831787, + "epoch": 0.4238232951925898, + "grad_norm": 0.37376178454239156, + "kl": 0.03272247314453125, + "learning_rate": 3.1209876543209873e-07, + "loss": 0.0, + "reward": 1.7660714983940125, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2528 + }, + { + "completion_length": 235.4509048461914, + "epoch": 0.42415859843245735, + "grad_norm": 0.40747437616592286, + "kl": 0.04550933837890625, + "learning_rate": 3.123456790123457e-07, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 2530 + }, + { + "completion_length": 248.3750123977661, + "epoch": 0.4244939016723249, + "grad_norm": 0.2179129790820734, + "kl": 0.059764862060546875, + "learning_rate": 3.1259259259259256e-07, + "loss": 0.0001, + "reward": 1.7553571984171867, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214577883482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2532 + }, + { + "completion_length": 255.93304538726807, + "epoch": 0.42482920491219245, + "grad_norm": 0.253351262982618, + "kl": 0.0557861328125, + "learning_rate": 3.128395061728395e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.77321432903409, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2534 + }, + { + "completion_length": 242.6294755935669, + "epoch": 0.42516450815206003, + "grad_norm": 0.21174747359151216, + "kl": 0.0252227783203125, + "learning_rate": 3.130864197530864e-07, + "loss": 0.0, + "reward": 1.8178572058677673, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.817857176065445, + "rewards/format_reward_func": 1.0, + "step": 2536 + }, + { + "completion_length": 236.34376049041748, + "epoch": 0.42549981139192755, + "grad_norm": 0.07787793090487856, + "kl": 0.049343109130859375, + "learning_rate": 3.1333333333333333e-07, + "loss": 0.0, + "reward": 1.735714353621006, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143182307482, + "rewards/format_reward_func": 1.0, + "step": 2538 + }, + { + "completion_length": 244.9241189956665, + "epoch": 0.42583511463179513, + "grad_norm": 0.2622517959405339, + "kl": 0.04746246337890625, + "learning_rate": 3.135802469135803e-07, + "loss": 0.0, + "reward": 1.7517858073115349, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2540 + }, + { + "completion_length": 239.76786613464355, + "epoch": 0.4261704178716627, + "grad_norm": 0.34941696728550165, + "kl": 0.02500152587890625, + "learning_rate": 3.138271604938271e-07, + "loss": 0.0, + "reward": 1.7303572073578835, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2542 + }, + { + "completion_length": 237.49554443359375, + "epoch": 0.42650572111153023, + "grad_norm": 0.20175321039359187, + "kl": 0.022891998291015625, + "learning_rate": 3.1407407407407405e-07, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.735714303329587, + "rewards/format_reward_func": 1.0, + "step": 2544 + }, + { + "completion_length": 251.40626335144043, + "epoch": 0.4268410243513978, + "grad_norm": 0.13603414263062696, + "kl": 0.08626937866210938, + "learning_rate": 3.14320987654321e-07, + "loss": 0.0001, + "reward": 1.7178572490811348, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2546 + }, + { + "completion_length": 249.23215293884277, + "epoch": 0.42717632759126534, + "grad_norm": 0.25842443169289875, + "kl": 0.039806365966796875, + "learning_rate": 3.145679012345679e-07, + "loss": 0.0, + "reward": 1.723214365541935, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276785802096128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2548 + }, + { + "completion_length": 248.36608219146729, + "epoch": 0.4275116308311329, + "grad_norm": 0.2714677153001056, + "kl": 0.12255859375, + "learning_rate": 3.148148148148148e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 2550 + }, + { + "completion_length": 236.23215293884277, + "epoch": 0.42784693407100044, + "grad_norm": 0.35098529470563555, + "kl": 0.03569793701171875, + "learning_rate": 3.150617283950617e-07, + "loss": 0.0, + "reward": 1.7535714879631996, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 2552 + }, + { + "completion_length": 251.02679443359375, + "epoch": 0.428182237310868, + "grad_norm": 0.12539207610264913, + "kl": 0.08616256713867188, + "learning_rate": 3.1530864197530865e-07, + "loss": 0.0001, + "reward": 1.7946429029107094, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.799107164144516, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2554 + }, + { + "completion_length": 238.7901906967163, + "epoch": 0.4285175405507356, + "grad_norm": 0.3178237413778879, + "kl": 0.023960113525390625, + "learning_rate": 3.1555555555555554e-07, + "loss": 0.0, + "reward": 1.7446429207921028, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2556 + }, + { + "completion_length": 245.50001049041748, + "epoch": 0.4288528437906031, + "grad_norm": 0.45006753486366646, + "kl": 0.10959625244140625, + "learning_rate": 3.1580246913580243e-07, + "loss": 0.0001, + "reward": 1.7267857939004898, + "reward_std": 0.0732360566034913, + "rewards/equation_reward_func": 0.7312500216066837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2558 + }, + { + "completion_length": 247.4196548461914, + "epoch": 0.4291881470304707, + "grad_norm": 0.8192667075625969, + "kl": 0.22243881225585938, + "learning_rate": 3.1604938271604937e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7839286047965288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2560 + }, + { + "completion_length": 251.14286708831787, + "epoch": 0.4295234502703382, + "grad_norm": 0.22454144415636515, + "kl": 0.024169921875, + "learning_rate": 3.1629629629629626e-07, + "loss": 0.0, + "reward": 1.726785808801651, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7312500290572643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2562 + }, + { + "completion_length": 246.77679634094238, + "epoch": 0.4298587535102058, + "grad_norm": 0.3261496817001146, + "kl": 0.19002532958984375, + "learning_rate": 3.165432098765432e-07, + "loss": 0.0002, + "reward": 1.7892857939004898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 2564 + }, + { + "completion_length": 243.477689743042, + "epoch": 0.43019405675007333, + "grad_norm": 0.33328892831898055, + "kl": 0.037197113037109375, + "learning_rate": 3.1679012345679014e-07, + "loss": 0.0, + "reward": 1.737500049173832, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7508929036557674, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2566 + }, + { + "completion_length": 236.32590198516846, + "epoch": 0.4305293599899409, + "grad_norm": 0.145820177741581, + "kl": 0.02503204345703125, + "learning_rate": 3.1703703703703703e-07, + "loss": 0.0, + "reward": 1.7500000521540642, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7589285969734192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2568 + }, + { + "completion_length": 246.85715293884277, + "epoch": 0.43086466322980843, + "grad_norm": 0.29553026796035753, + "kl": 0.06223297119140625, + "learning_rate": 3.172839506172839e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7758928798139095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2570 + }, + { + "completion_length": 247.07590198516846, + "epoch": 0.431199966469676, + "grad_norm": 0.15917956082740858, + "kl": 0.30030059814453125, + "learning_rate": 3.1753086419753086e-07, + "loss": 0.0003, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 2572 + }, + { + "completion_length": 259.2946538925171, + "epoch": 0.4315352697095436, + "grad_norm": 0.45168312035874253, + "kl": 0.3671760559082031, + "learning_rate": 3.1777777777777775e-07, + "loss": 0.0004, + "reward": 1.7446429282426834, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2574 + }, + { + "completion_length": 240.11608409881592, + "epoch": 0.4318705729494111, + "grad_norm": 0.3496117852276753, + "kl": 0.02508544921875, + "learning_rate": 3.180246913580247e-07, + "loss": 0.0, + "reward": 1.7375000715255737, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2576 + }, + { + "completion_length": 243.9821538925171, + "epoch": 0.4322058761892787, + "grad_norm": 0.3225535714095935, + "kl": 0.02301025390625, + "learning_rate": 3.182716049382716e-07, + "loss": 0.0, + "reward": 1.7250001057982445, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2578 + }, + { + "completion_length": 250.12054634094238, + "epoch": 0.4325411794291462, + "grad_norm": 0.26559242472932765, + "kl": 0.034000396728515625, + "learning_rate": 3.185185185185185e-07, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143320143223, + "rewards/format_reward_func": 1.0, + "step": 2580 + }, + { + "completion_length": 250.25447463989258, + "epoch": 0.4328764826690138, + "grad_norm": 0.2759127477860899, + "kl": 0.06327056884765625, + "learning_rate": 3.1876543209876546e-07, + "loss": 0.0001, + "reward": 1.7017857730388641, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7151785958558321, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2582 + }, + { + "completion_length": 244.22769165039062, + "epoch": 0.4332117859088813, + "grad_norm": 0.207482672686499, + "kl": 0.1029205322265625, + "learning_rate": 3.190123456790123e-07, + "loss": 0.0001, + "reward": 1.7089286521077156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928947150707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2584 + }, + { + "completion_length": 245.727689743042, + "epoch": 0.4335470891487489, + "grad_norm": 0.5793295166969608, + "kl": 0.1691436767578125, + "learning_rate": 3.1925925925925924e-07, + "loss": 0.0002, + "reward": 1.7196429371833801, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2586 + }, + { + "completion_length": 251.60715293884277, + "epoch": 0.4338823923886165, + "grad_norm": 0.5828281237924078, + "kl": 0.9253044128417969, + "learning_rate": 3.195061728395061e-07, + "loss": 0.0009, + "reward": 1.7607143223285675, + "reward_std": 0.07576144114136696, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2588 + }, + { + "completion_length": 255.88394260406494, + "epoch": 0.434217695628484, + "grad_norm": 0.25950153452146346, + "kl": 0.05170440673828125, + "learning_rate": 3.1975308641975307e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 2590 + }, + { + "completion_length": 251.83929443359375, + "epoch": 0.4345529988683516, + "grad_norm": 0.31718516852729894, + "kl": 0.11458206176757812, + "learning_rate": 3.2e-07, + "loss": 0.0001, + "reward": 1.7803571820259094, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2592 + }, + { + "completion_length": 251.4419765472412, + "epoch": 0.4348883021082191, + "grad_norm": 0.1884583384444149, + "kl": 0.12668609619140625, + "learning_rate": 3.202469135802469e-07, + "loss": 0.0001, + "reward": 1.7339286357164383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.73839289881289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2594 + }, + { + "completion_length": 252.9375123977661, + "epoch": 0.4352236053480867, + "grad_norm": 0.22589381749613982, + "kl": 0.035221099853515625, + "learning_rate": 3.2049382716049384e-07, + "loss": 0.0, + "reward": 1.8482143506407738, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.852678582072258, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2596 + }, + { + "completion_length": 246.90179538726807, + "epoch": 0.4355589085879542, + "grad_norm": 0.15749758438094288, + "kl": 0.04557037353515625, + "learning_rate": 3.207407407407407e-07, + "loss": 0.0, + "reward": 1.7821429073810577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 2598 + }, + { + "completion_length": 265.1696529388428, + "epoch": 0.4358942118278218, + "grad_norm": 0.22695902010162736, + "kl": 0.19362640380859375, + "learning_rate": 3.209876543209876e-07, + "loss": 0.0002, + "reward": 1.7160714864730835, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7294643111526966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2600 + }, + { + "completion_length": 246.68751049041748, + "epoch": 0.43622951506768937, + "grad_norm": 0.3494971527672536, + "kl": 0.2670745849609375, + "learning_rate": 3.2123456790123455e-07, + "loss": 0.0003, + "reward": 1.7857143729925156, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7857142984867096, + "rewards/format_reward_func": 1.0, + "step": 2602 + }, + { + "completion_length": 252.92858505249023, + "epoch": 0.4365648183075569, + "grad_norm": 0.2570008866666796, + "kl": 0.08303451538085938, + "learning_rate": 3.2148148148148144e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2604 + }, + { + "completion_length": 254.10269165039062, + "epoch": 0.43690012154742447, + "grad_norm": 0.3532176636632374, + "kl": 1.0090484619140625, + "learning_rate": 3.217283950617284e-07, + "loss": 0.001, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2606 + }, + { + "completion_length": 252.3526906967163, + "epoch": 0.437235424787292, + "grad_norm": 0.1619935140323748, + "kl": 0.17085647583007812, + "learning_rate": 3.219753086419753e-07, + "loss": 0.0002, + "reward": 1.7375000789761543, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.750892873853445, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2608 + }, + { + "completion_length": 253.14286708831787, + "epoch": 0.4375707280271596, + "grad_norm": 0.19724744628716312, + "kl": 0.15912628173828125, + "learning_rate": 3.222222222222222e-07, + "loss": 0.0002, + "reward": 1.7232143878936768, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276786081492901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2610 + }, + { + "completion_length": 257.7009057998657, + "epoch": 0.4379060312670271, + "grad_norm": 0.500823014531597, + "kl": 0.095916748046875, + "learning_rate": 3.224691358024691e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2612 + }, + { + "completion_length": 242.83929824829102, + "epoch": 0.4382413345068947, + "grad_norm": 0.30820036330593226, + "kl": 0.0233306884765625, + "learning_rate": 3.22716049382716e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143055647612, + "rewards/format_reward_func": 1.0, + "step": 2614 + }, + { + "completion_length": 253.11608219146729, + "epoch": 0.43857663774676225, + "grad_norm": 0.2536925262111839, + "kl": 0.04438018798828125, + "learning_rate": 3.2296296296296293e-07, + "loss": 0.0, + "reward": 1.708928644657135, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7133928816765547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2616 + }, + { + "completion_length": 258.24554920196533, + "epoch": 0.4389119409866298, + "grad_norm": 0.28150699335408913, + "kl": 0.10482025146484375, + "learning_rate": 3.2320987654320987e-07, + "loss": 0.0001, + "reward": 1.7339286357164383, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7473214454948902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2618 + }, + { + "completion_length": 254.0580472946167, + "epoch": 0.43924724422649736, + "grad_norm": 0.12402202969919433, + "kl": 0.6347122192382812, + "learning_rate": 3.2345679012345676e-07, + "loss": 0.0006, + "reward": 1.7178572192788124, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2620 + }, + { + "completion_length": 257.6473321914673, + "epoch": 0.4395825474663649, + "grad_norm": 0.3870609356353709, + "kl": 0.027385711669921875, + "learning_rate": 3.237037037037037e-07, + "loss": 0.0, + "reward": 1.7107143551111221, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7196428887546062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2622 + }, + { + "completion_length": 250.04912090301514, + "epoch": 0.43991785070623246, + "grad_norm": 0.4512728132951566, + "kl": 0.14263916015625, + "learning_rate": 3.2395061728395064e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7348214723169804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2624 + }, + { + "completion_length": 266.42858695983887, + "epoch": 0.4402531539461, + "grad_norm": 0.2475325032392611, + "kl": 0.025585174560546875, + "learning_rate": 3.2419753086419753e-07, + "loss": 0.0, + "reward": 1.7392857670783997, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2626 + }, + { + "completion_length": 240.79019260406494, + "epoch": 0.44058845718596756, + "grad_norm": 0.24164859505408506, + "kl": 0.3480949401855469, + "learning_rate": 3.244444444444444e-07, + "loss": 0.0003, + "reward": 1.7964286357164383, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8053571805357933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2628 + }, + { + "completion_length": 254.13393878936768, + "epoch": 0.4409237604258351, + "grad_norm": 0.31436550452414275, + "kl": 0.12886428833007812, + "learning_rate": 3.246913580246913e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7553571872413158, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2630 + }, + { + "completion_length": 251.0580472946167, + "epoch": 0.44125906366570267, + "grad_norm": 0.29791950286400054, + "kl": 0.037143707275390625, + "learning_rate": 3.2493827160493825e-07, + "loss": 0.0, + "reward": 1.783928632736206, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2632 + }, + { + "completion_length": 270.63840675354004, + "epoch": 0.44159436690557025, + "grad_norm": 0.24728104547594096, + "kl": 0.06447219848632812, + "learning_rate": 3.251851851851852e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7544643133878708, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2634 + }, + { + "completion_length": 248.92858409881592, + "epoch": 0.44192967014543777, + "grad_norm": 0.31478920467178306, + "kl": 0.051105499267578125, + "learning_rate": 3.254320987654321e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7696428894996643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2636 + }, + { + "completion_length": 249.5759038925171, + "epoch": 0.44226497338530535, + "grad_norm": 0.24993023096821998, + "kl": 0.13302993774414062, + "learning_rate": 3.25679012345679e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.045456865802407265, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2638 + }, + { + "completion_length": 260.30804920196533, + "epoch": 0.4426002766251729, + "grad_norm": 1.3821262018529439, + "kl": 0.258209228515625, + "learning_rate": 3.2592592592592596e-07, + "loss": 0.0003, + "reward": 1.7089286297559738, + "reward_std": 0.08838834799826145, + "rewards/equation_reward_func": 0.7312500216066837, + "rewards/format_reward_func": 0.977678582072258, + "step": 2640 + }, + { + "completion_length": 262.3973340988159, + "epoch": 0.44293557986504045, + "grad_norm": 0.23587870226894336, + "kl": 0.12923049926757812, + "learning_rate": 3.261728395061728e-07, + "loss": 0.0001, + "reward": 1.7589286044239998, + "reward_std": 0.10859139915555716, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.977678582072258, + "step": 2642 + }, + { + "completion_length": 265.0803699493408, + "epoch": 0.443270883104908, + "grad_norm": 0.260573659490083, + "kl": 0.06291961669921875, + "learning_rate": 3.2641975308641974e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7232143115252256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2644 + }, + { + "completion_length": 266.5848331451416, + "epoch": 0.44360618634477555, + "grad_norm": 0.29066419116889264, + "kl": 0.4171791076660156, + "learning_rate": 3.2666666666666663e-07, + "loss": 0.0004, + "reward": 1.7571428939700127, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2646 + }, + { + "completion_length": 255.9464406967163, + "epoch": 0.44394148958464313, + "grad_norm": 0.2812603360108659, + "kl": 0.05731964111328125, + "learning_rate": 3.2691358024691357e-07, + "loss": 0.0001, + "reward": 1.7303571701049805, + "reward_std": 0.1085914010182023, + "rewards/equation_reward_func": 0.7616071719676256, + "rewards/format_reward_func": 0.9687500149011612, + "step": 2648 + }, + { + "completion_length": 262.03572940826416, + "epoch": 0.44427679282451066, + "grad_norm": 0.30695546284, + "kl": 0.057254791259765625, + "learning_rate": 3.271604938271605e-07, + "loss": 0.0001, + "reward": 1.7053572461009026, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7187500298023224, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2650 + }, + { + "completion_length": 264.9910831451416, + "epoch": 0.44461209606437824, + "grad_norm": 0.3474524782514886, + "kl": 0.06933212280273438, + "learning_rate": 3.274074074074074e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7660714648663998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2652 + }, + { + "completion_length": 249.60268878936768, + "epoch": 0.44494739930424576, + "grad_norm": 0.2084635078884685, + "kl": 0.029857635498046875, + "learning_rate": 3.2765432098765434e-07, + "loss": 0.0, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2654 + }, + { + "completion_length": 247.1919755935669, + "epoch": 0.44528270254411334, + "grad_norm": 0.1889165270559453, + "kl": 0.0886993408203125, + "learning_rate": 3.279012345679012e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2656 + }, + { + "completion_length": 245.50894165039062, + "epoch": 0.44561800578398086, + "grad_norm": 0.1117686353305998, + "kl": 0.03253936767578125, + "learning_rate": 3.281481481481481e-07, + "loss": 0.0, + "reward": 1.7714286521077156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 2658 + }, + { + "completion_length": 250.4375123977661, + "epoch": 0.44595330902384844, + "grad_norm": 0.2595808740548275, + "kl": 0.1915740966796875, + "learning_rate": 3.2839506172839506e-07, + "loss": 0.0002, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 2660 + }, + { + "completion_length": 251.02233600616455, + "epoch": 0.446288612263716, + "grad_norm": 0.2233516502035073, + "kl": 0.02516937255859375, + "learning_rate": 3.2864197530864195e-07, + "loss": 0.0, + "reward": 1.7214286401867867, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 2662 + }, + { + "completion_length": 257.571439743042, + "epoch": 0.44662391550358355, + "grad_norm": 0.2758449396807126, + "kl": 0.046142578125, + "learning_rate": 3.288888888888889e-07, + "loss": 0.0, + "reward": 1.7553572207689285, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2664 + }, + { + "completion_length": 270.15179920196533, + "epoch": 0.4469592187434511, + "grad_norm": 0.16203992665582476, + "kl": 0.20227813720703125, + "learning_rate": 3.2913580246913583e-07, + "loss": 0.0002, + "reward": 1.748214341700077, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.761607164517045, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2666 + }, + { + "completion_length": 257.2455472946167, + "epoch": 0.44729452198331865, + "grad_norm": 0.31669039361071327, + "kl": 0.17920303344726562, + "learning_rate": 3.293827160493827e-07, + "loss": 0.0002, + "reward": 1.803571492433548, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 2668 + }, + { + "completion_length": 248.63393783569336, + "epoch": 0.44762982522318623, + "grad_norm": 0.14210074166673994, + "kl": 0.06937408447265625, + "learning_rate": 3.296296296296296e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2670 + }, + { + "completion_length": 250.87947368621826, + "epoch": 0.44796512846305375, + "grad_norm": 0.6957656299935766, + "kl": 0.3000144958496094, + "learning_rate": 3.298765432098765e-07, + "loss": 0.0003, + "reward": 1.7718750685453415, + "reward_std": 0.059977806406095624, + "rewards/equation_reward_func": 0.7776785902678967, + "rewards/format_reward_func": 0.9941964335739613, + "step": 2672 + }, + { + "completion_length": 249.4732265472412, + "epoch": 0.44830043170292133, + "grad_norm": 0.28613974030082806, + "kl": 0.361419677734375, + "learning_rate": 3.3012345679012343e-07, + "loss": 0.0004, + "reward": 1.82321435213089, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.8276785984635353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2674 + }, + { + "completion_length": 255.8571538925171, + "epoch": 0.4486357349427889, + "grad_norm": 0.15867424298767457, + "kl": 0.026119232177734375, + "learning_rate": 3.303703703703704e-07, + "loss": 0.0, + "reward": 1.7232143431901932, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7366071715950966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2676 + }, + { + "completion_length": 253.98215579986572, + "epoch": 0.44897103818265643, + "grad_norm": 0.7509986400487191, + "kl": 0.14146804809570312, + "learning_rate": 3.3061728395061726e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2678 + }, + { + "completion_length": 252.8482255935669, + "epoch": 0.449306341422524, + "grad_norm": 0.38029749797861767, + "kl": 0.07612991333007812, + "learning_rate": 3.308641975308642e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2680 + }, + { + "completion_length": 251.9285831451416, + "epoch": 0.44964164466239154, + "grad_norm": 0.39717612267346547, + "kl": 0.026416778564453125, + "learning_rate": 3.311111111111111e-07, + "loss": 0.0, + "reward": 1.7428572252392769, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 2682 + }, + { + "completion_length": 253.34822463989258, + "epoch": 0.4499769479022591, + "grad_norm": 0.643773820689036, + "kl": 1.5482711791992188, + "learning_rate": 3.31358024691358e-07, + "loss": 0.0016, + "reward": 1.7107143700122833, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7196429036557674, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2684 + }, + { + "completion_length": 251.17858600616455, + "epoch": 0.45031225114212664, + "grad_norm": 0.34670280364640704, + "kl": 0.042018890380859375, + "learning_rate": 3.316049382716049e-07, + "loss": 0.0, + "reward": 1.7267857864499092, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7312500346451998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2686 + }, + { + "completion_length": 245.5312614440918, + "epoch": 0.4506475543819942, + "grad_norm": 0.25670407143953416, + "kl": 0.06372833251953125, + "learning_rate": 3.318518518518518e-07, + "loss": 0.0001, + "reward": 1.8107143640518188, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 2688 + }, + { + "completion_length": 244.03572463989258, + "epoch": 0.45098285762186174, + "grad_norm": 0.23765421348038518, + "kl": 0.027862548828125, + "learning_rate": 3.3209876543209875e-07, + "loss": 0.0, + "reward": 1.7857143357396126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 2690 + }, + { + "completion_length": 247.9151906967163, + "epoch": 0.4513181608617293, + "grad_norm": 0.33600943743007694, + "kl": 0.559478759765625, + "learning_rate": 3.323456790123457e-07, + "loss": 0.0006, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 2692 + }, + { + "completion_length": 237.571439743042, + "epoch": 0.4516534641015969, + "grad_norm": 0.5969202483502274, + "kl": 0.6981925964355469, + "learning_rate": 3.325925925925926e-07, + "loss": 0.0007, + "reward": 1.7446429505944252, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2694 + }, + { + "completion_length": 244.20090103149414, + "epoch": 0.4519887673414644, + "grad_norm": 1.2490766104629545, + "kl": 0.19190216064453125, + "learning_rate": 3.328395061728395e-07, + "loss": 0.0002, + "reward": 1.7553572207689285, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7687500305473804, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2696 + }, + { + "completion_length": 256.09375953674316, + "epoch": 0.452324070581332, + "grad_norm": 0.3169020842125548, + "kl": 0.19821548461914062, + "learning_rate": 3.3308641975308636e-07, + "loss": 0.0002, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2698 + }, + { + "completion_length": 251.383939743042, + "epoch": 0.45265937382119953, + "grad_norm": 0.17454350064179114, + "kl": 0.209808349609375, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0002, + "reward": 1.796428643167019, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 2700 + }, + { + "completion_length": 250.05805110931396, + "epoch": 0.4529946770610671, + "grad_norm": 0.270399027082715, + "kl": 0.09576034545898438, + "learning_rate": 3.3358024691358024e-07, + "loss": 0.0001, + "reward": 1.7964286655187607, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285910129547, + "rewards/format_reward_func": 1.0, + "step": 2702 + }, + { + "completion_length": 235.66072463989258, + "epoch": 0.45332998030093463, + "grad_norm": 0.25215380030085344, + "kl": 0.033405303955078125, + "learning_rate": 3.3382716049382713e-07, + "loss": 0.0, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 2704 + }, + { + "completion_length": 258.839298248291, + "epoch": 0.4536652835408022, + "grad_norm": 0.507861698652949, + "kl": 0.32860565185546875, + "learning_rate": 3.3407407407407407e-07, + "loss": 0.0003, + "reward": 1.7625000923871994, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2706 + }, + { + "completion_length": 248.53572368621826, + "epoch": 0.4540005867806698, + "grad_norm": 0.30873013848794667, + "kl": 0.034900665283203125, + "learning_rate": 3.3432098765432096e-07, + "loss": 0.0, + "reward": 1.7625000551342964, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2708 + }, + { + "completion_length": 242.1696548461914, + "epoch": 0.4543358900205373, + "grad_norm": 0.36290902428851235, + "kl": 0.1857452392578125, + "learning_rate": 3.345679012345679e-07, + "loss": 0.0002, + "reward": 1.7875000685453415, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2710 + }, + { + "completion_length": 265.44197273254395, + "epoch": 0.4546711932604049, + "grad_norm": 0.18967670433015887, + "kl": 0.16097640991210938, + "learning_rate": 3.348148148148148e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 2712 + }, + { + "completion_length": 266.29019260406494, + "epoch": 0.4550064965002724, + "grad_norm": 0.2752528674594683, + "kl": 0.06867218017578125, + "learning_rate": 3.350617283950617e-07, + "loss": 0.0001, + "reward": 1.6803572475910187, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.6848214659839869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2714 + }, + { + "completion_length": 249.27233505249023, + "epoch": 0.45534179974014, + "grad_norm": 0.17644565403173249, + "kl": 0.35277557373046875, + "learning_rate": 3.353086419753086e-07, + "loss": 0.0004, + "reward": 1.7946429252624512, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2716 + }, + { + "completion_length": 257.3259038925171, + "epoch": 0.4556771029800075, + "grad_norm": 0.18222821575997314, + "kl": 0.06207275390625, + "learning_rate": 3.3555555555555556e-07, + "loss": 0.0001, + "reward": 1.7410715147852898, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2718 + }, + { + "completion_length": 258.589298248291, + "epoch": 0.4560124062198751, + "grad_norm": 0.40252505186651977, + "kl": 0.049884796142578125, + "learning_rate": 3.3580246913580245e-07, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7589285857975483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2720 + }, + { + "completion_length": 244.1696548461914, + "epoch": 0.4563477094597427, + "grad_norm": 0.44376256831190036, + "kl": 0.5896148681640625, + "learning_rate": 3.360493827160494e-07, + "loss": 0.0006, + "reward": 1.7410715147852898, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357406288385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2722 + }, + { + "completion_length": 246.1919765472412, + "epoch": 0.4566830126996102, + "grad_norm": 0.29343191916555844, + "kl": 0.07892227172851562, + "learning_rate": 3.362962962962963e-07, + "loss": 0.0001, + "reward": 1.700000062584877, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7000000327825546, + "rewards/format_reward_func": 1.0, + "step": 2724 + }, + { + "completion_length": 245.77679443359375, + "epoch": 0.4570183159394778, + "grad_norm": 0.2667512601424003, + "kl": 0.09537887573242188, + "learning_rate": 3.365432098765432e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2726 + }, + { + "completion_length": 249.2634038925171, + "epoch": 0.4573536191793453, + "grad_norm": 0.38442369886561234, + "kl": 0.36904144287109375, + "learning_rate": 3.367901234567901e-07, + "loss": 0.0004, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 2728 + }, + { + "completion_length": 254.82144260406494, + "epoch": 0.4576889224192129, + "grad_norm": 0.3137500522359342, + "kl": 0.061252593994140625, + "learning_rate": 3.37037037037037e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7625000402331352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2730 + }, + { + "completion_length": 246.1785831451416, + "epoch": 0.4580242256590804, + "grad_norm": 0.2809572400397591, + "kl": 0.11748123168945312, + "learning_rate": 3.3728395061728394e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2732 + }, + { + "completion_length": 255.915189743042, + "epoch": 0.458359528898948, + "grad_norm": 0.2971358870435252, + "kl": 0.028736114501953125, + "learning_rate": 3.375308641975308e-07, + "loss": 0.0, + "reward": 1.8017857819795609, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2734 + }, + { + "completion_length": 257.7544765472412, + "epoch": 0.45869483213881557, + "grad_norm": 0.3195023386657291, + "kl": 0.030345916748046875, + "learning_rate": 3.3777777777777777e-07, + "loss": 0.0, + "reward": 1.7517857551574707, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7562500201165676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2736 + }, + { + "completion_length": 246.2366180419922, + "epoch": 0.4590301353786831, + "grad_norm": 0.9720315208811928, + "kl": 0.024379730224609375, + "learning_rate": 3.380246913580247e-07, + "loss": 0.0, + "reward": 1.79464291036129, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2738 + }, + { + "completion_length": 250.5803689956665, + "epoch": 0.45936543861855067, + "grad_norm": 0.26845835890331887, + "kl": 0.0423736572265625, + "learning_rate": 3.382716049382716e-07, + "loss": 0.0, + "reward": 1.7696429267525673, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2740 + }, + { + "completion_length": 261.7544765472412, + "epoch": 0.4597007418584182, + "grad_norm": 0.19155715201137047, + "kl": 0.08993148803710938, + "learning_rate": 3.385185185185185e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 1.0, + "step": 2742 + }, + { + "completion_length": 255.4375123977661, + "epoch": 0.4600360450982858, + "grad_norm": 0.24483028716397143, + "kl": 0.07487106323242188, + "learning_rate": 3.387654320987654e-07, + "loss": 0.0001, + "reward": 1.6910715326666832, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.704464316368103, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2744 + }, + { + "completion_length": 251.3973331451416, + "epoch": 0.4603713483381533, + "grad_norm": 0.208316662402822, + "kl": 0.02557373046875, + "learning_rate": 3.390123456790123e-07, + "loss": 0.0, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 2746 + }, + { + "completion_length": 259.19197845458984, + "epoch": 0.4607066515780209, + "grad_norm": 0.1999658236725068, + "kl": 0.037628173828125, + "learning_rate": 3.3925925925925926e-07, + "loss": 0.0, + "reward": 1.7767857536673546, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2748 + }, + { + "completion_length": 247.6741180419922, + "epoch": 0.46104195481788846, + "grad_norm": 0.26139475662583994, + "kl": 0.0295257568359375, + "learning_rate": 3.3950617283950614e-07, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 2750 + }, + { + "completion_length": 258.54018783569336, + "epoch": 0.461377258057756, + "grad_norm": 0.13583613800255978, + "kl": 0.0282440185546875, + "learning_rate": 3.397530864197531e-07, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2752 + }, + { + "completion_length": 251.55358123779297, + "epoch": 0.46171256129762356, + "grad_norm": 0.24682817309475594, + "kl": 0.0307464599609375, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "reward": 1.8017857745289803, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2754 + }, + { + "completion_length": 257.0848321914673, + "epoch": 0.4620478645374911, + "grad_norm": 0.15096030885914136, + "kl": 0.02770233154296875, + "learning_rate": 3.4024691358024686e-07, + "loss": 0.0, + "reward": 1.7678571790456772, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2756 + }, + { + "completion_length": 252.02680206298828, + "epoch": 0.46238316777735866, + "grad_norm": 0.23076502570985755, + "kl": 0.026927947998046875, + "learning_rate": 3.404938271604938e-07, + "loss": 0.0, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2758 + }, + { + "completion_length": 260.3348331451416, + "epoch": 0.4627184710172262, + "grad_norm": 0.34134054770080197, + "kl": 0.050151824951171875, + "learning_rate": 3.407407407407407e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2760 + }, + { + "completion_length": 255.1071548461914, + "epoch": 0.46305377425709376, + "grad_norm": 0.43200431098358355, + "kl": 0.082275390625, + "learning_rate": 3.4098765432098763e-07, + "loss": 0.0001, + "reward": 1.7482143715023994, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7616071663796902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2762 + }, + { + "completion_length": 262.69643783569336, + "epoch": 0.4633890774969613, + "grad_norm": 0.37310963538113023, + "kl": 0.16982269287109375, + "learning_rate": 3.412345679012346e-07, + "loss": 0.0002, + "reward": 1.8035714700818062, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 2764 + }, + { + "completion_length": 251.98215579986572, + "epoch": 0.46372438073682887, + "grad_norm": 0.5049977633001252, + "kl": 0.03232574462890625, + "learning_rate": 3.4148148148148146e-07, + "loss": 0.0, + "reward": 1.7392857745289803, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 2766 + }, + { + "completion_length": 249.11608219146729, + "epoch": 0.46405968397669645, + "grad_norm": 0.16067851846385553, + "kl": 0.03436279296875, + "learning_rate": 3.417283950617284e-07, + "loss": 0.0, + "reward": 1.7410714775323868, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357480794191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2768 + }, + { + "completion_length": 247.83929634094238, + "epoch": 0.46439498721656397, + "grad_norm": 0.23395496207649558, + "kl": 0.054126739501953125, + "learning_rate": 3.419753086419753e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2770 + }, + { + "completion_length": 250.0937623977661, + "epoch": 0.46473029045643155, + "grad_norm": 0.18523182105694547, + "kl": 0.024806976318359375, + "learning_rate": 3.422222222222222e-07, + "loss": 0.0, + "reward": 1.821428619325161, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 1.0, + "step": 2772 + }, + { + "completion_length": 256.6875114440918, + "epoch": 0.4650655936962991, + "grad_norm": 0.24536718633510723, + "kl": 0.19739151000976562, + "learning_rate": 3.424691358024691e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 2774 + }, + { + "completion_length": 256.1562614440918, + "epoch": 0.46540089693616665, + "grad_norm": 0.3292800179362696, + "kl": 0.12223052978515625, + "learning_rate": 3.42716049382716e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7232143152505159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2776 + }, + { + "completion_length": 242.8571538925171, + "epoch": 0.4657362001760342, + "grad_norm": 0.40450119722326106, + "kl": 0.03858184814453125, + "learning_rate": 3.4296296296296295e-07, + "loss": 0.0, + "reward": 1.8250000551342964, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 2778 + }, + { + "completion_length": 250.78572463989258, + "epoch": 0.46607150341590176, + "grad_norm": 0.40163561926349534, + "kl": 0.05098724365234375, + "learning_rate": 3.432098765432099e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 2780 + }, + { + "completion_length": 247.16072750091553, + "epoch": 0.46640680665576933, + "grad_norm": 0.2388646035496535, + "kl": 0.050304412841796875, + "learning_rate": 3.434567901234568e-07, + "loss": 0.0001, + "reward": 1.7982143610715866, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2782 + }, + { + "completion_length": 251.508939743042, + "epoch": 0.46674210989563686, + "grad_norm": 0.23875992787468778, + "kl": 0.04918670654296875, + "learning_rate": 3.4370370370370367e-07, + "loss": 0.0, + "reward": 1.7571429088711739, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714592784643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2784 + }, + { + "completion_length": 251.3437623977661, + "epoch": 0.46707741313550444, + "grad_norm": 0.188319172046665, + "kl": 0.3220100402832031, + "learning_rate": 3.4395061728395056e-07, + "loss": 0.0003, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714510828257, + "rewards/format_reward_func": 1.0, + "step": 2786 + }, + { + "completion_length": 247.6607255935669, + "epoch": 0.46741271637537196, + "grad_norm": 0.3122224174936983, + "kl": 0.0294189453125, + "learning_rate": 3.441975308641975e-07, + "loss": 0.0, + "reward": 1.776785746216774, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500167638063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2788 + }, + { + "completion_length": 254.26787090301514, + "epoch": 0.46774801961523954, + "grad_norm": 0.22040518881569834, + "kl": 0.098358154296875, + "learning_rate": 3.4444444444444444e-07, + "loss": 0.0001, + "reward": 1.7178572341799736, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 2790 + }, + { + "completion_length": 242.29911708831787, + "epoch": 0.46808332285510706, + "grad_norm": 0.3139530838726713, + "kl": 0.039592742919921875, + "learning_rate": 3.4469135802469133e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 2792 + }, + { + "completion_length": 254.37947463989258, + "epoch": 0.46841862609497464, + "grad_norm": 0.2681590517820234, + "kl": 0.03741455078125, + "learning_rate": 3.4493827160493827e-07, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7946428842842579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2794 + }, + { + "completion_length": 256.9241180419922, + "epoch": 0.4687539293348422, + "grad_norm": 0.17318988896415513, + "kl": 0.1388397216796875, + "learning_rate": 3.451851851851852e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.749107176437974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2796 + }, + { + "completion_length": 264.62500953674316, + "epoch": 0.46908923257470975, + "grad_norm": 0.5645059371676063, + "kl": 0.10883331298828125, + "learning_rate": 3.4543209876543205e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7946428842842579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2798 + }, + { + "completion_length": 264.3080463409424, + "epoch": 0.4694245358145773, + "grad_norm": 0.3930393143745379, + "kl": 0.0624847412109375, + "learning_rate": 3.45679012345679e-07, + "loss": 0.0001, + "reward": 1.698214367032051, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7026786059141159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2800 + }, + { + "completion_length": 257.2098340988159, + "epoch": 0.46975983905444485, + "grad_norm": 0.2631303652587504, + "kl": 0.04084014892578125, + "learning_rate": 3.459259259259259e-07, + "loss": 0.0, + "reward": 1.7107143625617027, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7196428962051868, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2802 + }, + { + "completion_length": 257.65179443359375, + "epoch": 0.47009514229431243, + "grad_norm": 0.22529626467128386, + "kl": 0.0889739990234375, + "learning_rate": 3.461728395061728e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776785921305418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2804 + }, + { + "completion_length": 254.2678689956665, + "epoch": 0.47043044553417995, + "grad_norm": 0.6203961669806676, + "kl": 0.20491790771484375, + "learning_rate": 3.4641975308641976e-07, + "loss": 0.0002, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2806 + }, + { + "completion_length": 256.9107275009155, + "epoch": 0.47076574877404753, + "grad_norm": 0.24761485503433714, + "kl": 0.09076690673828125, + "learning_rate": 3.4666666666666665e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2808 + }, + { + "completion_length": 258.24108028411865, + "epoch": 0.4711010520139151, + "grad_norm": 0.2496260686532331, + "kl": 0.03143310546875, + "learning_rate": 3.469135802469136e-07, + "loss": 0.0, + "reward": 1.7482143864035606, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526785917580128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2810 + }, + { + "completion_length": 254.9509038925171, + "epoch": 0.47143635525378264, + "grad_norm": 0.6028251611292678, + "kl": 0.3297767639160156, + "learning_rate": 3.4716049382716053e-07, + "loss": 0.0003, + "reward": 1.7464286237955093, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.755357164889574, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2812 + }, + { + "completion_length": 269.39286708831787, + "epoch": 0.4717716584936502, + "grad_norm": 0.3415395676744013, + "kl": 0.1169281005859375, + "learning_rate": 3.4740740740740737e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2814 + }, + { + "completion_length": 263.53125953674316, + "epoch": 0.47210696173351774, + "grad_norm": 0.23960314502568703, + "kl": 0.050708770751953125, + "learning_rate": 3.476543209876543e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.7750000171363354, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2816 + }, + { + "completion_length": 268.6651887893677, + "epoch": 0.4724422649733853, + "grad_norm": 0.22502083720491892, + "kl": 0.3942832946777344, + "learning_rate": 3.479012345679012e-07, + "loss": 0.0004, + "reward": 1.7125000581145287, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7258929070085287, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2818 + }, + { + "completion_length": 266.383939743042, + "epoch": 0.47277756821325284, + "grad_norm": 0.3852008894067241, + "kl": 0.5472488403320312, + "learning_rate": 3.4814814814814814e-07, + "loss": 0.0005, + "reward": 1.7785715088248253, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 2820 + }, + { + "completion_length": 277.2678737640381, + "epoch": 0.4731128714531204, + "grad_norm": 0.4898862429233096, + "kl": 0.44490814208984375, + "learning_rate": 3.483950617283951e-07, + "loss": 0.0004, + "reward": 1.7464286535978317, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2822 + }, + { + "completion_length": 271.20983505249023, + "epoch": 0.47344817469298794, + "grad_norm": 0.3050807925241347, + "kl": 0.32114410400390625, + "learning_rate": 3.4864197530864197e-07, + "loss": 0.0003, + "reward": 1.7107143327593803, + "reward_std": 0.05555839091539383, + "rewards/equation_reward_func": 0.7285714615136385, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2824 + }, + { + "completion_length": 262.2857255935669, + "epoch": 0.4737834779328555, + "grad_norm": 0.32417959842074145, + "kl": 0.27597808837890625, + "learning_rate": 3.488888888888889e-07, + "loss": 0.0003, + "reward": 1.7196429446339607, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7330357395112514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2826 + }, + { + "completion_length": 256.62947940826416, + "epoch": 0.4741187811727231, + "grad_norm": 0.43158639531492315, + "kl": 0.3496856689453125, + "learning_rate": 3.4913580246913574e-07, + "loss": 0.0003, + "reward": 1.7857143506407738, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7946428786963224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2828 + }, + { + "completion_length": 257.06251430511475, + "epoch": 0.4744540844125906, + "grad_norm": 0.16878279995351914, + "kl": 0.02948760986328125, + "learning_rate": 3.493827160493827e-07, + "loss": 0.0, + "reward": 1.7964286133646965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 2830 + }, + { + "completion_length": 268.3035821914673, + "epoch": 0.4747893876524582, + "grad_norm": 1.3822497181999542, + "kl": 1.5914840698242188, + "learning_rate": 3.496296296296296e-07, + "loss": 0.0016, + "reward": 1.723214365541935, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7366071864962578, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2832 + }, + { + "completion_length": 272.46876430511475, + "epoch": 0.47512469089232573, + "grad_norm": 0.31405924815564323, + "kl": 0.60906982421875, + "learning_rate": 3.498765432098765e-07, + "loss": 0.0006, + "reward": 1.721428632736206, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2834 + }, + { + "completion_length": 265.4419765472412, + "epoch": 0.4754599941321933, + "grad_norm": 0.3334541089600988, + "kl": 0.1193695068359375, + "learning_rate": 3.5012345679012345e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7330357357859612, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2836 + }, + { + "completion_length": 261.68751430511475, + "epoch": 0.47579529737206083, + "grad_norm": 0.2359020883684203, + "kl": 0.02530670166015625, + "learning_rate": 3.503703703703704e-07, + "loss": 0.0, + "reward": 1.7392857819795609, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 2838 + }, + { + "completion_length": 270.17858123779297, + "epoch": 0.4761306006119284, + "grad_norm": 0.19232319568658027, + "kl": 0.028003692626953125, + "learning_rate": 3.506172839506173e-07, + "loss": 0.0, + "reward": 1.6857143491506577, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7035714518278837, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2840 + }, + { + "completion_length": 248.52679824829102, + "epoch": 0.476465903851796, + "grad_norm": 0.14569116729208106, + "kl": 0.08068084716796875, + "learning_rate": 3.5086419753086417e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 2842 + }, + { + "completion_length": 254.58929634094238, + "epoch": 0.4768012070916635, + "grad_norm": 1.0018321660589764, + "kl": 0.047954559326171875, + "learning_rate": 3.5111111111111106e-07, + "loss": 0.0, + "reward": 1.8000000640749931, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8089286014437675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2844 + }, + { + "completion_length": 242.83483219146729, + "epoch": 0.4771365103315311, + "grad_norm": 0.1876792247373185, + "kl": 0.19379425048828125, + "learning_rate": 3.51358024691358e-07, + "loss": 0.0002, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 2846 + }, + { + "completion_length": 257.9866199493408, + "epoch": 0.4774718135713986, + "grad_norm": 0.21949993388796094, + "kl": 0.054508209228515625, + "learning_rate": 3.5160493827160494e-07, + "loss": 0.0001, + "reward": 1.7696429416537285, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2848 + }, + { + "completion_length": 248.25447463989258, + "epoch": 0.4778071168112662, + "grad_norm": 0.47622915884966727, + "kl": 0.5040206909179688, + "learning_rate": 3.5185185185185183e-07, + "loss": 0.0005, + "reward": 1.7625000849366188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2850 + }, + { + "completion_length": 249.97768783569336, + "epoch": 0.4781424200511337, + "grad_norm": 0.40265000030374465, + "kl": 0.05511474609375, + "learning_rate": 3.5209876543209877e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2852 + }, + { + "completion_length": 264.2812614440918, + "epoch": 0.4784777232910013, + "grad_norm": 0.38387369362331086, + "kl": 0.31607818603515625, + "learning_rate": 3.5234567901234566e-07, + "loss": 0.0003, + "reward": 1.7267857939004898, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7401785887777805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2854 + }, + { + "completion_length": 257.3392972946167, + "epoch": 0.4788130265308689, + "grad_norm": 0.5951750074514578, + "kl": 0.13494873046875, + "learning_rate": 3.5259259259259255e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2856 + }, + { + "completion_length": 254.5134048461914, + "epoch": 0.4791483297707364, + "grad_norm": 0.17450689785228546, + "kl": 0.03231048583984375, + "learning_rate": 3.528395061728395e-07, + "loss": 0.0, + "reward": 1.741071492433548, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357536673546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2858 + }, + { + "completion_length": 257.2321529388428, + "epoch": 0.479483633010604, + "grad_norm": 0.1944576284948148, + "kl": 0.288116455078125, + "learning_rate": 3.530864197530864e-07, + "loss": 0.0003, + "reward": 1.7428571954369545, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7517857421189547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2860 + }, + { + "completion_length": 260.0357275009155, + "epoch": 0.4798189362504715, + "grad_norm": 0.19695027022142356, + "kl": 0.067718505859375, + "learning_rate": 3.533333333333333e-07, + "loss": 0.0001, + "reward": 1.805357202887535, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214454948902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2862 + }, + { + "completion_length": 255.81251430511475, + "epoch": 0.4801542394903391, + "grad_norm": 0.23358145138386263, + "kl": 0.33197021484375, + "learning_rate": 3.5358024691358026e-07, + "loss": 0.0003, + "reward": 1.733928643167019, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2864 + }, + { + "completion_length": 248.89733505249023, + "epoch": 0.4804895427302066, + "grad_norm": 0.349013914148386, + "kl": 0.0921783447265625, + "learning_rate": 3.5382716049382715e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7339286021888256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2866 + }, + { + "completion_length": 246.35715675354004, + "epoch": 0.4808248459700742, + "grad_norm": 0.5646698298864009, + "kl": 0.31024169921875, + "learning_rate": 3.540740740740741e-07, + "loss": 0.0003, + "reward": 1.6839286461472511, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.6973214633762836, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2868 + }, + { + "completion_length": 258.14287090301514, + "epoch": 0.48116014920994177, + "grad_norm": 0.52150068072927, + "kl": 0.208526611328125, + "learning_rate": 3.5432098765432093e-07, + "loss": 0.0002, + "reward": 1.73214291036129, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7410714700818062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2870 + }, + { + "completion_length": 249.07590198516846, + "epoch": 0.4814954524498093, + "grad_norm": 0.31669922163507147, + "kl": 0.0349273681640625, + "learning_rate": 3.5456790123456787e-07, + "loss": 0.0, + "reward": 1.7464286237955093, + "reward_std": 0.07576144114136696, + "rewards/equation_reward_func": 0.755357189103961, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2872 + }, + { + "completion_length": 249.3392972946167, + "epoch": 0.48183075568967687, + "grad_norm": 0.25581827702063864, + "kl": 0.05368804931640625, + "learning_rate": 3.548148148148148e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 2874 + }, + { + "completion_length": 259.40179538726807, + "epoch": 0.4821660589295444, + "grad_norm": 0.2812439109626435, + "kl": 1.0510787963867188, + "learning_rate": 3.550617283950617e-07, + "loss": 0.0011, + "reward": 1.6892857775092125, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.6982143223285675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2876 + }, + { + "completion_length": 250.75447273254395, + "epoch": 0.482501362169412, + "grad_norm": 0.19207748486652276, + "kl": 0.322998046875, + "learning_rate": 3.5530864197530864e-07, + "loss": 0.0003, + "reward": 1.7035715207457542, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 1.0, + "step": 2878 + }, + { + "completion_length": 260.4017963409424, + "epoch": 0.4828366654092795, + "grad_norm": 0.22444574368780876, + "kl": 0.17897415161132812, + "learning_rate": 3.5555555555555553e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7732143364846706, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2880 + }, + { + "completion_length": 250.5178680419922, + "epoch": 0.4831719686491471, + "grad_norm": 0.5271841775692403, + "kl": 0.5541343688964844, + "learning_rate": 3.5580246913580247e-07, + "loss": 0.0006, + "reward": 1.7732143476605415, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2882 + }, + { + "completion_length": 255.05804634094238, + "epoch": 0.4835072718890146, + "grad_norm": 0.465173051193132, + "kl": 0.31304931640625, + "learning_rate": 3.5604938271604936e-07, + "loss": 0.0003, + "reward": 1.694642923772335, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.6991071719676256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2884 + }, + { + "completion_length": 261.0982255935669, + "epoch": 0.4838425751288822, + "grad_norm": 0.3209358019665937, + "kl": 0.6431045532226562, + "learning_rate": 3.5629629629629625e-07, + "loss": 0.0006, + "reward": 1.742857202887535, + "reward_std": 0.1010152529925108, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2886 + }, + { + "completion_length": 253.11608409881592, + "epoch": 0.48417787836874976, + "grad_norm": 0.2680813222428663, + "kl": 0.21021652221679688, + "learning_rate": 3.565432098765432e-07, + "loss": 0.0002, + "reward": 1.7303572222590446, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7348214685916901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2888 + }, + { + "completion_length": 253.08929443359375, + "epoch": 0.4845131816086173, + "grad_norm": 0.5583623489192696, + "kl": 0.2989044189453125, + "learning_rate": 3.5679012345679013e-07, + "loss": 0.0003, + "reward": 1.753571480512619, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7625000402331352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2890 + }, + { + "completion_length": 248.2455472946167, + "epoch": 0.48484848484848486, + "grad_norm": 0.2820007992710127, + "kl": 0.31207275390625, + "learning_rate": 3.57037037037037e-07, + "loss": 0.0003, + "reward": 1.7875000536441803, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7910714484751225, + "rewards/format_reward_func": 0.9964285790920258, + "step": 2892 + }, + { + "completion_length": 253.58483409881592, + "epoch": 0.4851837880883524, + "grad_norm": 0.7035434638322244, + "kl": 0.6702957153320312, + "learning_rate": 3.5728395061728396e-07, + "loss": 0.0007, + "reward": 1.7000000923871994, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7089285925030708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2894 + }, + { + "completion_length": 256.2410831451416, + "epoch": 0.48551909132821996, + "grad_norm": 0.3080211232059757, + "kl": 0.267303466796875, + "learning_rate": 3.5753086419753085e-07, + "loss": 0.0003, + "reward": 1.7410714998841286, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7544643096625805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2896 + }, + { + "completion_length": 257.6384029388428, + "epoch": 0.4858543945680875, + "grad_norm": 0.28553151949283306, + "kl": 0.15687179565429688, + "learning_rate": 3.5777777777777773e-07, + "loss": 0.0002, + "reward": 1.7339286133646965, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7473214752972126, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2898 + }, + { + "completion_length": 255.65625667572021, + "epoch": 0.48618969780795507, + "grad_norm": 0.21765843503980004, + "kl": 0.5928115844726562, + "learning_rate": 3.580246913580247e-07, + "loss": 0.0006, + "reward": 1.6803572103381157, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.6937500424683094, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2900 + }, + { + "completion_length": 254.33929920196533, + "epoch": 0.48652500104782265, + "grad_norm": 0.44552416758581037, + "kl": 0.3790931701660156, + "learning_rate": 3.5827160493827156e-07, + "loss": 0.0004, + "reward": 1.6964286491274834, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7053571753203869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2902 + }, + { + "completion_length": 253.09376335144043, + "epoch": 0.48686030428769017, + "grad_norm": 0.9647608524452777, + "kl": 1.1598663330078125, + "learning_rate": 3.585185185185185e-07, + "loss": 0.0012, + "reward": 1.714285783469677, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7142857518047094, + "rewards/format_reward_func": 1.0, + "step": 2904 + }, + { + "completion_length": 255.12054920196533, + "epoch": 0.48719560752755775, + "grad_norm": 0.29048605992776716, + "kl": 0.21368408203125, + "learning_rate": 3.587654320987654e-07, + "loss": 0.0002, + "reward": 1.7589286342263222, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7723214663565159, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2906 + }, + { + "completion_length": 255.02679920196533, + "epoch": 0.4875309107674253, + "grad_norm": 0.2027351239625565, + "kl": 0.7143478393554688, + "learning_rate": 3.5901234567901234e-07, + "loss": 0.0007, + "reward": 1.7660714983940125, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7794643118977547, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2908 + }, + { + "completion_length": 252.3973331451416, + "epoch": 0.48786621400729285, + "grad_norm": 0.19049513041306962, + "kl": 0.08922576904296875, + "learning_rate": 3.592592592592593e-07, + "loss": 0.0001, + "reward": 1.791071504354477, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2910 + }, + { + "completion_length": 256.40625953674316, + "epoch": 0.4882015172471604, + "grad_norm": 0.35376853781284395, + "kl": 0.48523712158203125, + "learning_rate": 3.5950617283950616e-07, + "loss": 0.0005, + "reward": 1.769642911851406, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2912 + }, + { + "completion_length": 263.09822273254395, + "epoch": 0.48853682048702796, + "grad_norm": 0.36369623998455325, + "kl": 0.08524322509765625, + "learning_rate": 3.5975308641975305e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 2914 + }, + { + "completion_length": 245.4553689956665, + "epoch": 0.48887212372689554, + "grad_norm": 0.1363731743599683, + "kl": 0.03830718994140625, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2916 + }, + { + "completion_length": 259.6875123977661, + "epoch": 0.48920742696676306, + "grad_norm": 0.22073589348512726, + "kl": 0.175567626953125, + "learning_rate": 3.602469135802469e-07, + "loss": 0.0002, + "reward": 1.7821428999304771, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.791071455925703, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2918 + }, + { + "completion_length": 256.28572845458984, + "epoch": 0.48954273020663064, + "grad_norm": 0.2817423794616662, + "kl": 0.23409271240234375, + "learning_rate": 3.604938271604938e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 2920 + }, + { + "completion_length": 262.0446500778198, + "epoch": 0.48987803344649816, + "grad_norm": 0.5882471776016766, + "kl": 0.3173255920410156, + "learning_rate": 3.607407407407407e-07, + "loss": 0.0003, + "reward": 1.6803572252392769, + "reward_std": 0.06818529777228832, + "rewards/equation_reward_func": 0.6937500387430191, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2922 + }, + { + "completion_length": 247.25447463989258, + "epoch": 0.49021333668636574, + "grad_norm": 0.2754932860486546, + "kl": 0.6283416748046875, + "learning_rate": 3.6098765432098765e-07, + "loss": 0.0006, + "reward": 1.758928656578064, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2924 + }, + { + "completion_length": 242.98661994934082, + "epoch": 0.49054863992623327, + "grad_norm": 0.31778700835528595, + "kl": 0.11323165893554688, + "learning_rate": 3.612345679012346e-07, + "loss": 0.0001, + "reward": 1.7339286729693413, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2926 + }, + { + "completion_length": 255.58929920196533, + "epoch": 0.49088394316610084, + "grad_norm": 0.2705317676366948, + "kl": 1.1042404174804688, + "learning_rate": 3.6148148148148143e-07, + "loss": 0.0011, + "reward": 1.7803571820259094, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214693367481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2928 + }, + { + "completion_length": 248.96430015563965, + "epoch": 0.4912192464059684, + "grad_norm": 0.1774856462598626, + "kl": 0.22403717041015625, + "learning_rate": 3.6172839506172837e-07, + "loss": 0.0002, + "reward": 1.810714341700077, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 2930 + }, + { + "completion_length": 250.04911613464355, + "epoch": 0.49155454964583595, + "grad_norm": 0.2009737631916188, + "kl": 0.3294029235839844, + "learning_rate": 3.6197530864197526e-07, + "loss": 0.0003, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 2932 + }, + { + "completion_length": 252.9062623977661, + "epoch": 0.4918898528857035, + "grad_norm": 0.24142261752824526, + "kl": 0.37865447998046875, + "learning_rate": 3.622222222222222e-07, + "loss": 0.0004, + "reward": 1.7660714760422707, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2934 + }, + { + "completion_length": 245.23215198516846, + "epoch": 0.49222515612557105, + "grad_norm": 0.3224585490033016, + "kl": 1.2538604736328125, + "learning_rate": 3.6246913580246914e-07, + "loss": 0.0013, + "reward": 1.8285714834928513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 2936 + }, + { + "completion_length": 242.7098331451416, + "epoch": 0.49256045936543863, + "grad_norm": 0.07482152881354606, + "kl": 0.6891937255859375, + "learning_rate": 3.6271604938271603e-07, + "loss": 0.0007, + "reward": 1.7821429446339607, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 2938 + }, + { + "completion_length": 250.76340293884277, + "epoch": 0.49289576260530615, + "grad_norm": 0.3688486923883066, + "kl": 0.46811676025390625, + "learning_rate": 3.6296296296296297e-07, + "loss": 0.0005, + "reward": 1.7053572237491608, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.709821468219161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2940 + }, + { + "completion_length": 247.9598331451416, + "epoch": 0.49323106584517373, + "grad_norm": 0.38334694612665543, + "kl": 0.5134353637695312, + "learning_rate": 3.6320987654320986e-07, + "loss": 0.0005, + "reward": 1.814285770058632, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8142857346683741, + "rewards/format_reward_func": 1.0, + "step": 2942 + }, + { + "completion_length": 253.24108505249023, + "epoch": 0.49356636908504126, + "grad_norm": 0.2466986529569209, + "kl": 0.073577880859375, + "learning_rate": 3.6345679012345675e-07, + "loss": 0.0001, + "reward": 1.682142935693264, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.7000000309199095, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2944 + }, + { + "completion_length": 253.5044755935669, + "epoch": 0.49390167232490884, + "grad_norm": 0.22385096048864217, + "kl": 0.6382865905761719, + "learning_rate": 3.637037037037037e-07, + "loss": 0.0006, + "reward": 1.7464286461472511, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7553571835160255, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2946 + }, + { + "completion_length": 244.7544755935669, + "epoch": 0.4942369755647764, + "grad_norm": 0.3057270423350129, + "kl": 0.124908447265625, + "learning_rate": 3.639506172839506e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 2948 + }, + { + "completion_length": 251.68304634094238, + "epoch": 0.49457227880464394, + "grad_norm": 0.8529840796001801, + "kl": 1.887176513671875, + "learning_rate": 3.641975308641975e-07, + "loss": 0.0019, + "reward": 1.7017858028411865, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7062500342726707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2950 + }, + { + "completion_length": 250.33929824829102, + "epoch": 0.4949075820445115, + "grad_norm": 0.32756161445028503, + "kl": 0.14838409423828125, + "learning_rate": 3.6444444444444446e-07, + "loss": 0.0001, + "reward": 1.7089286372065544, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7133928779512644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2952 + }, + { + "completion_length": 259.4419775009155, + "epoch": 0.49524288528437904, + "grad_norm": 0.7213068467496084, + "kl": 0.8767471313476562, + "learning_rate": 3.6469135802469135e-07, + "loss": 0.0009, + "reward": 1.7714286148548126, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7803571783006191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2954 + }, + { + "completion_length": 253.06250953674316, + "epoch": 0.4955781885242466, + "grad_norm": 0.3223780611086043, + "kl": 0.16445159912109375, + "learning_rate": 3.6493827160493824e-07, + "loss": 0.0002, + "reward": 1.7607143223285675, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2956 + }, + { + "completion_length": 250.9732265472412, + "epoch": 0.49591349176411414, + "grad_norm": 0.31015288975976857, + "kl": 0.32781982421875, + "learning_rate": 3.651851851851851e-07, + "loss": 0.0003, + "reward": 1.7000000774860382, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7089285999536514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2958 + }, + { + "completion_length": 244.3750123977661, + "epoch": 0.4962487950039817, + "grad_norm": 0.24747093123257072, + "kl": 0.4291877746582031, + "learning_rate": 3.6543209876543207e-07, + "loss": 0.0004, + "reward": 1.7482143491506577, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7616071701049805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2960 + }, + { + "completion_length": 257.76786708831787, + "epoch": 0.4965840982438493, + "grad_norm": 0.3119423738396043, + "kl": 0.40831756591796875, + "learning_rate": 3.65679012345679e-07, + "loss": 0.0004, + "reward": 1.732142947614193, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2962 + }, + { + "completion_length": 259.17858505249023, + "epoch": 0.4969194014837168, + "grad_norm": 0.1830517098733492, + "kl": 0.05135345458984375, + "learning_rate": 3.659259259259259e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 0.9821428619325161, + "step": 2964 + }, + { + "completion_length": 243.9419755935669, + "epoch": 0.4972547047235844, + "grad_norm": 0.21681102140064856, + "kl": 0.10626220703125, + "learning_rate": 3.6617283950617284e-07, + "loss": 0.0001, + "reward": 1.7964286133646965, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.8053571581840515, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2966 + }, + { + "completion_length": 255.5312614440918, + "epoch": 0.49759000796345193, + "grad_norm": 0.30420774625152075, + "kl": 0.27170562744140625, + "learning_rate": 3.664197530864198e-07, + "loss": 0.0003, + "reward": 1.7839286178350449, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7973214574158192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2968 + }, + { + "completion_length": 260.1562623977661, + "epoch": 0.4979253112033195, + "grad_norm": 0.5679008529737339, + "kl": 0.6922416687011719, + "learning_rate": 3.666666666666666e-07, + "loss": 0.0007, + "reward": 1.703571505844593, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7303571589291096, + "rewards/format_reward_func": 0.9732142984867096, + "step": 2970 + }, + { + "completion_length": 256.95983123779297, + "epoch": 0.49826061444318703, + "grad_norm": 0.21251929924314933, + "kl": 0.3111114501953125, + "learning_rate": 3.6691358024691356e-07, + "loss": 0.0003, + "reward": 1.7339286282658577, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7473214603960514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2972 + }, + { + "completion_length": 250.0044755935669, + "epoch": 0.4985959176830546, + "grad_norm": 0.2855125484052691, + "kl": 0.6980438232421875, + "learning_rate": 3.6716049382716044e-07, + "loss": 0.0007, + "reward": 1.7303571924567223, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7437500283122063, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2974 + }, + { + "completion_length": 266.54019260406494, + "epoch": 0.4989312209229222, + "grad_norm": 0.4551964805783462, + "kl": 0.3116912841796875, + "learning_rate": 3.674074074074074e-07, + "loss": 0.0003, + "reward": 1.8089286163449287, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.8223214447498322, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2976 + }, + { + "completion_length": 257.308048248291, + "epoch": 0.4992665241627897, + "grad_norm": 0.38176881311264943, + "kl": 0.36710357666015625, + "learning_rate": 3.6765432098765433e-07, + "loss": 0.0004, + "reward": 1.7035714983940125, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2978 + }, + { + "completion_length": 260.9598340988159, + "epoch": 0.4996018274026573, + "grad_norm": 0.14641630559689583, + "kl": 0.6771926879882812, + "learning_rate": 3.679012345679012e-07, + "loss": 0.0007, + "reward": 1.74642863124609, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.755357176065445, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2980 + }, + { + "completion_length": 258.0625114440918, + "epoch": 0.4999371306425248, + "grad_norm": 0.2595195319935882, + "kl": 0.0545196533203125, + "learning_rate": 3.6814814814814816e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2982 + }, + { + "completion_length": 263.83929920196533, + "epoch": 0.5002724338823924, + "grad_norm": 0.24109920616356117, + "kl": 0.05385589599609375, + "learning_rate": 3.68395061728395e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8116071783006191, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2984 + }, + { + "completion_length": 252.6160831451416, + "epoch": 0.5006077371222599, + "grad_norm": 0.31178103313116434, + "kl": 0.11287689208984375, + "learning_rate": 3.6864197530864193e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2986 + }, + { + "completion_length": 258.7276906967163, + "epoch": 0.5009430403621274, + "grad_norm": 0.257026450835211, + "kl": 0.25646209716796875, + "learning_rate": 3.688888888888889e-07, + "loss": 0.0003, + "reward": 1.7714286297559738, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7803571596741676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 2988 + }, + { + "completion_length": 251.8125123977661, + "epoch": 0.5012783436019951, + "grad_norm": 0.26271403970330076, + "kl": 0.6702995300292969, + "learning_rate": 3.6913580246913576e-07, + "loss": 0.0007, + "reward": 1.8125000521540642, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.8169642947614193, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2990 + }, + { + "completion_length": 256.6384038925171, + "epoch": 0.5016136468418626, + "grad_norm": 0.27592010057674143, + "kl": 0.1610565185546875, + "learning_rate": 3.693827160493827e-07, + "loss": 0.0002, + "reward": 1.6785715073347092, + "reward_std": 0.08081220369786024, + "rewards/equation_reward_func": 0.6964286081492901, + "rewards/format_reward_func": 0.9821428656578064, + "step": 2992 + }, + { + "completion_length": 257.120548248291, + "epoch": 0.5019489500817301, + "grad_norm": 0.5610497082441622, + "kl": 0.25261688232421875, + "learning_rate": 3.6962962962962965e-07, + "loss": 0.0003, + "reward": 1.680357240140438, + "reward_std": 0.10859139636158943, + "rewards/equation_reward_func": 0.7026785984635353, + "rewards/format_reward_func": 0.977678582072258, + "step": 2994 + }, + { + "completion_length": 268.7410840988159, + "epoch": 0.5022842533215978, + "grad_norm": 0.5487386942065536, + "kl": 0.42702484130859375, + "learning_rate": 3.6987654320987653e-07, + "loss": 0.0004, + "reward": 1.7446429282426834, + "reward_std": 0.08838834799826145, + "rewards/equation_reward_func": 0.7580357454717159, + "rewards/format_reward_func": 0.9866071492433548, + "step": 2996 + }, + { + "completion_length": 250.97322463989258, + "epoch": 0.5026195565614653, + "grad_norm": 0.3198474395173545, + "kl": 0.5860786437988281, + "learning_rate": 3.701234567901235e-07, + "loss": 0.0006, + "reward": 1.7339286506175995, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928876370192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 2998 + }, + { + "completion_length": 255.59822463989258, + "epoch": 0.5029548598013328, + "grad_norm": 0.26785004048529404, + "kl": 1.1002006530761719, + "learning_rate": 3.703703703703703e-07, + "loss": 0.0011, + "reward": 1.748214341700077, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7616071663796902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3000 + }, + { + "completion_length": 261.35269260406494, + "epoch": 0.5032901630412003, + "grad_norm": 0.3310002273693556, + "kl": 0.20697784423828125, + "learning_rate": 3.7061728395061725e-07, + "loss": 0.0002, + "reward": 1.6982143595814705, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7116071823984385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3002 + }, + { + "completion_length": 253.9732255935669, + "epoch": 0.503625466281068, + "grad_norm": 0.13514736768128646, + "kl": 0.20084381103515625, + "learning_rate": 3.708641975308642e-07, + "loss": 0.0002, + "reward": 1.7875000461935997, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.800892872735858, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3004 + }, + { + "completion_length": 259.071439743042, + "epoch": 0.5039607695209355, + "grad_norm": 0.3078015700685397, + "kl": 0.37459564208984375, + "learning_rate": 3.711111111111111e-07, + "loss": 0.0004, + "reward": 1.7839286252856255, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7973214499652386, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3006 + }, + { + "completion_length": 263.93751335144043, + "epoch": 0.504296072760803, + "grad_norm": 0.36645424767648727, + "kl": 0.47135162353515625, + "learning_rate": 3.71358024691358e-07, + "loss": 0.0005, + "reward": 1.7571429163217545, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3008 + }, + { + "completion_length": 255.758939743042, + "epoch": 0.5046313760006707, + "grad_norm": 0.7864030190498497, + "kl": 1.5985565185546875, + "learning_rate": 3.7160493827160496e-07, + "loss": 0.0016, + "reward": 1.7482143342494965, + "reward_std": 0.07828682288527489, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9776785783469677, + "step": 3010 + }, + { + "completion_length": 248.2634038925171, + "epoch": 0.5049666792405382, + "grad_norm": 0.2667122406171907, + "kl": 0.03884124755859375, + "learning_rate": 3.7185185185185185e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3012 + }, + { + "completion_length": 267.8482303619385, + "epoch": 0.5053019824804057, + "grad_norm": 0.3173331245909394, + "kl": 1.0220565795898438, + "learning_rate": 3.7209876543209874e-07, + "loss": 0.001, + "reward": 1.7321429252624512, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.7589285932481289, + "rewards/format_reward_func": 0.9732142984867096, + "step": 3014 + }, + { + "completion_length": 243.81697368621826, + "epoch": 0.5056372857202732, + "grad_norm": 0.34077550562632575, + "kl": 0.040744781494140625, + "learning_rate": 3.7234567901234563e-07, + "loss": 0.0, + "reward": 1.7589286267757416, + "reward_std": 0.09848987217992544, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.977678582072258, + "step": 3016 + }, + { + "completion_length": 261.33929347991943, + "epoch": 0.5059725889601409, + "grad_norm": 0.15313021699348667, + "kl": 0.43424224853515625, + "learning_rate": 3.7259259259259257e-07, + "loss": 0.0004, + "reward": 1.716071479022503, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7294643260538578, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3018 + }, + { + "completion_length": 258.4509029388428, + "epoch": 0.5063078922000084, + "grad_norm": 0.5652791164362287, + "kl": 0.9497528076171875, + "learning_rate": 3.728395061728395e-07, + "loss": 0.0009, + "reward": 1.6750000938773155, + "reward_std": 0.11616754159331322, + "rewards/equation_reward_func": 0.6928571835160255, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3020 + }, + { + "completion_length": 267.790189743042, + "epoch": 0.5066431954398759, + "grad_norm": 0.2861186061735038, + "kl": 0.532745361328125, + "learning_rate": 3.730864197530864e-07, + "loss": 0.0005, + "reward": 1.7785714715719223, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7875000201165676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3022 + }, + { + "completion_length": 266.2500162124634, + "epoch": 0.5069784986797435, + "grad_norm": 0.4827885536321817, + "kl": 0.32605743408203125, + "learning_rate": 3.7333333333333334e-07, + "loss": 0.0003, + "reward": 1.6928572058677673, + "reward_std": 0.1010152529925108, + "rewards/equation_reward_func": 0.7107143234461546, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3024 + }, + { + "completion_length": 256.75447368621826, + "epoch": 0.5073138019196111, + "grad_norm": 0.29762340493815626, + "kl": 0.08817291259765625, + "learning_rate": 3.7358024691358023e-07, + "loss": 0.0001, + "reward": 1.6732143610715866, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.6955357305705547, + "rewards/format_reward_func": 0.977678582072258, + "step": 3026 + }, + { + "completion_length": 250.3794765472412, + "epoch": 0.5076491051594786, + "grad_norm": 0.29653367322563745, + "kl": 0.04032135009765625, + "learning_rate": 3.738271604938271e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7267857417464256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3028 + }, + { + "completion_length": 256.73661708831787, + "epoch": 0.5079844083993461, + "grad_norm": 0.19002438532993526, + "kl": 0.0325164794921875, + "learning_rate": 3.7407407407407406e-07, + "loss": 0.0, + "reward": 1.7125000953674316, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3030 + }, + { + "completion_length": 264.87947940826416, + "epoch": 0.5083197116392137, + "grad_norm": 0.47697001771334524, + "kl": 0.1249542236328125, + "learning_rate": 3.7432098765432095e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3032 + }, + { + "completion_length": 252.37501335144043, + "epoch": 0.5086550148790813, + "grad_norm": 0.19164757321243217, + "kl": 0.16481781005859375, + "learning_rate": 3.745679012345679e-07, + "loss": 0.0002, + "reward": 1.6839286610484123, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.6973214689642191, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3034 + }, + { + "completion_length": 242.11608409881592, + "epoch": 0.5089903181189488, + "grad_norm": 0.2965385453786233, + "kl": 0.102325439453125, + "learning_rate": 3.7481481481481483e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3036 + }, + { + "completion_length": 264.0134038925171, + "epoch": 0.5093256213588164, + "grad_norm": 0.26828654844694805, + "kl": 0.50030517578125, + "learning_rate": 3.750617283950617e-07, + "loss": 0.0005, + "reward": 1.7232143431901932, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9598214477300644, + "step": 3038 + }, + { + "completion_length": 254.5625114440918, + "epoch": 0.509660924598684, + "grad_norm": 0.193990185560616, + "kl": 0.0839996337890625, + "learning_rate": 3.7530864197530866e-07, + "loss": 0.0001, + "reward": 1.739285759627819, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143323868513, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3040 + }, + { + "completion_length": 246.9419755935669, + "epoch": 0.5099962278385515, + "grad_norm": 0.19313521551747367, + "kl": 0.03554534912109375, + "learning_rate": 3.755555555555555e-07, + "loss": 0.0, + "reward": 1.7839286252856255, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928753435612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3042 + }, + { + "completion_length": 255.01340675354004, + "epoch": 0.510331531078419, + "grad_norm": 0.7765180616026391, + "kl": 0.51654052734375, + "learning_rate": 3.7580246913580244e-07, + "loss": 0.0005, + "reward": 1.7482143342494965, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.977678582072258, + "step": 3044 + }, + { + "completion_length": 256.6339416503906, + "epoch": 0.5106668343182866, + "grad_norm": 1.1364308614205811, + "kl": 0.6205825805664062, + "learning_rate": 3.760493827160494e-07, + "loss": 0.0006, + "reward": 1.7375000715255737, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7508928962051868, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3046 + }, + { + "completion_length": 246.1830472946167, + "epoch": 0.5110021375581542, + "grad_norm": 0.2493496018800458, + "kl": 0.158294677734375, + "learning_rate": 3.7629629629629627e-07, + "loss": 0.0002, + "reward": 1.7750000432133675, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7839285992085934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3048 + }, + { + "completion_length": 261.41072940826416, + "epoch": 0.5113374407980217, + "grad_norm": 0.19985846821705897, + "kl": 0.2151336669921875, + "learning_rate": 3.765432098765432e-07, + "loss": 0.0002, + "reward": 1.7446428835391998, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.977678582072258, + "step": 3050 + }, + { + "completion_length": 257.71429920196533, + "epoch": 0.5116727440378893, + "grad_norm": 0.22633980489201774, + "kl": 0.155548095703125, + "learning_rate": 3.767901234567901e-07, + "loss": 0.0002, + "reward": 1.694642961025238, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7071428969502449, + "rewards/format_reward_func": 0.9875000044703484, + "step": 3052 + }, + { + "completion_length": 259.6160840988159, + "epoch": 0.5120080472777568, + "grad_norm": 0.13005143693972465, + "kl": 0.12020111083984375, + "learning_rate": 3.7703703703703704e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3054 + }, + { + "completion_length": 252.43751049041748, + "epoch": 0.5123433505176244, + "grad_norm": 0.29216567176643266, + "kl": 0.304046630859375, + "learning_rate": 3.772839506172839e-07, + "loss": 0.0003, + "reward": 1.7129464820027351, + "reward_std": 0.08270623860880733, + "rewards/equation_reward_func": 0.7232143189758062, + "rewards/format_reward_func": 0.9897321499884129, + "step": 3056 + }, + { + "completion_length": 251.09376525878906, + "epoch": 0.5126786537574919, + "grad_norm": 0.6033130029832788, + "kl": 0.08933258056640625, + "learning_rate": 3.775308641975308e-07, + "loss": 0.0001, + "reward": 1.7767857685685158, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7901786081492901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3058 + }, + { + "completion_length": 252.43751335144043, + "epoch": 0.5130139569973595, + "grad_norm": 0.20902483387252335, + "kl": 0.0396728515625, + "learning_rate": 3.7777777777777775e-07, + "loss": 0.0, + "reward": 1.7375000715255737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3060 + }, + { + "completion_length": 249.22769165039062, + "epoch": 0.513349260237227, + "grad_norm": 0.260245315584681, + "kl": 0.18543243408203125, + "learning_rate": 3.780246913580247e-07, + "loss": 0.0002, + "reward": 1.7446429207921028, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3062 + }, + { + "completion_length": 254.43751049041748, + "epoch": 0.5136845634770946, + "grad_norm": 0.26409884638490183, + "kl": 0.037139892578125, + "learning_rate": 3.782716049382716e-07, + "loss": 0.0, + "reward": 1.696428656578064, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6964286081492901, + "rewards/format_reward_func": 1.0, + "step": 3064 + }, + { + "completion_length": 260.54465103149414, + "epoch": 0.5140198667169622, + "grad_norm": 0.1428687655533802, + "kl": 0.0744171142578125, + "learning_rate": 3.785185185185185e-07, + "loss": 0.0001, + "reward": 1.7482143640518188, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3066 + }, + { + "completion_length": 246.53572368621826, + "epoch": 0.5143551699568297, + "grad_norm": 0.16185149934506265, + "kl": 0.0672454833984375, + "learning_rate": 3.787654320987654e-07, + "loss": 0.0001, + "reward": 1.7589286491274834, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3068 + }, + { + "completion_length": 258.9821557998657, + "epoch": 0.5146904731966973, + "grad_norm": 0.2306501331554964, + "kl": 0.0682220458984375, + "learning_rate": 3.790123456790123e-07, + "loss": 0.0001, + "reward": 1.7035715207457542, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7125000543892384, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3070 + }, + { + "completion_length": 264.87947845458984, + "epoch": 0.5150257764365648, + "grad_norm": 0.22456347881762204, + "kl": 0.06907272338867188, + "learning_rate": 3.7925925925925924e-07, + "loss": 0.0001, + "reward": 1.6892858147621155, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6892857514321804, + "rewards/format_reward_func": 1.0, + "step": 3072 + }, + { + "completion_length": 258.65179538726807, + "epoch": 0.5153610796764324, + "grad_norm": 0.35293272384908336, + "kl": 0.2332611083984375, + "learning_rate": 3.7950617283950613e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7482143044471741, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3074 + }, + { + "completion_length": 251.87054443359375, + "epoch": 0.5156963829162999, + "grad_norm": 0.1800318548596281, + "kl": 0.1234283447265625, + "learning_rate": 3.7975308641975307e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928902447224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3076 + }, + { + "completion_length": 246.4553689956665, + "epoch": 0.5160316861561675, + "grad_norm": 0.1740786371284468, + "kl": 0.02825164794921875, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "reward": 1.7267857939004898, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3078 + }, + { + "completion_length": 258.22769260406494, + "epoch": 0.5163669893960351, + "grad_norm": 0.3160476219985606, + "kl": 0.057590484619140625, + "learning_rate": 3.802469135802469e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3080 + }, + { + "completion_length": 252.28572750091553, + "epoch": 0.5167022926359026, + "grad_norm": 0.4762047675136293, + "kl": 0.20147705078125, + "learning_rate": 3.8049382716049384e-07, + "loss": 0.0002, + "reward": 1.751785770058632, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7651785984635353, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3082 + }, + { + "completion_length": 257.0401906967163, + "epoch": 0.5170375958757701, + "grad_norm": 0.4295341728220505, + "kl": 0.029018402099609375, + "learning_rate": 3.8074074074074073e-07, + "loss": 0.0, + "reward": 1.7089286372065544, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7133928965777159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3084 + }, + { + "completion_length": 258.2321529388428, + "epoch": 0.5173728991156377, + "grad_norm": 0.2085002371188389, + "kl": 0.2578277587890625, + "learning_rate": 3.809876543209876e-07, + "loss": 0.0003, + "reward": 1.7178572043776512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571913391352, + "rewards/format_reward_func": 1.0, + "step": 3086 + }, + { + "completion_length": 263.6384038925171, + "epoch": 0.5177082023555053, + "grad_norm": 0.564076640872581, + "kl": 0.08950042724609375, + "learning_rate": 3.8123456790123456e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 3088 + }, + { + "completion_length": 252.54465675354004, + "epoch": 0.5180435055953728, + "grad_norm": 0.18742434123139506, + "kl": 0.1312103271484375, + "learning_rate": 3.8148148148148145e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500163912773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3090 + }, + { + "completion_length": 250.01786708831787, + "epoch": 0.5183788088352403, + "grad_norm": 0.20853683279872454, + "kl": 0.11835479736328125, + "learning_rate": 3.817283950617284e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285835623741, + "rewards/format_reward_func": 1.0, + "step": 3092 + }, + { + "completion_length": 261.95983695983887, + "epoch": 0.518714112075108, + "grad_norm": 0.20069159944437373, + "kl": 0.052577972412109375, + "learning_rate": 3.819753086419753e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.783928606659174, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3094 + }, + { + "completion_length": 248.60268878936768, + "epoch": 0.5190494153149755, + "grad_norm": 0.18308556647430432, + "kl": 0.05411529541015625, + "learning_rate": 3.822222222222222e-07, + "loss": 0.0001, + "reward": 1.7196429520845413, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071797907352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3096 + }, + { + "completion_length": 252.9107255935669, + "epoch": 0.519384718554843, + "grad_norm": 0.23069855632685662, + "kl": 0.0457916259765625, + "learning_rate": 3.8246913580246916e-07, + "loss": 0.0, + "reward": 1.744642935693264, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3098 + }, + { + "completion_length": 250.46429920196533, + "epoch": 0.5197200217947106, + "grad_norm": 0.259615918588719, + "kl": 0.05632781982421875, + "learning_rate": 3.82716049382716e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633929066359997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3100 + }, + { + "completion_length": 255.2142972946167, + "epoch": 0.5200553250345782, + "grad_norm": 0.2773907131668989, + "kl": 0.36530303955078125, + "learning_rate": 3.8296296296296294e-07, + "loss": 0.0004, + "reward": 1.758928656578064, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928693830967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3102 + }, + { + "completion_length": 248.92412090301514, + "epoch": 0.5203906282744457, + "grad_norm": 0.37992031338955395, + "kl": 0.17604446411132812, + "learning_rate": 3.8320987654320983e-07, + "loss": 0.0002, + "reward": 1.7678572162985802, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7767857499420643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3104 + }, + { + "completion_length": 258.3125114440918, + "epoch": 0.5207259315143132, + "grad_norm": 0.394132585956962, + "kl": 0.433685302734375, + "learning_rate": 3.8345679012345677e-07, + "loss": 0.0004, + "reward": 1.6821429505944252, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.6910714581608772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3106 + }, + { + "completion_length": 264.12501430511475, + "epoch": 0.5210612347541808, + "grad_norm": 0.18801212145262286, + "kl": 0.64044189453125, + "learning_rate": 3.837037037037037e-07, + "loss": 0.0006, + "reward": 1.7093750983476639, + "reward_std": 0.037249374436214566, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 0.9986607171595097, + "step": 3108 + }, + { + "completion_length": 251.20983409881592, + "epoch": 0.5213965379940484, + "grad_norm": 0.25513628192429766, + "kl": 0.08748626708984375, + "learning_rate": 3.839506172839506e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428906172514, + "rewards/format_reward_func": 1.0, + "step": 3110 + }, + { + "completion_length": 255.2857265472412, + "epoch": 0.5217318412339159, + "grad_norm": 0.28046900188641055, + "kl": 0.3914947509765625, + "learning_rate": 3.8419753086419754e-07, + "loss": 0.0004, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 3112 + }, + { + "completion_length": 256.214298248291, + "epoch": 0.5220671444737834, + "grad_norm": 0.22531422784150337, + "kl": 0.24646377563476562, + "learning_rate": 3.8444444444444443e-07, + "loss": 0.0002, + "reward": 1.796428605914116, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286170899868, + "rewards/format_reward_func": 1.0, + "step": 3114 + }, + { + "completion_length": 257.6250104904175, + "epoch": 0.5224024477136511, + "grad_norm": 0.2730117358782168, + "kl": 0.4022064208984375, + "learning_rate": 3.846913580246913e-07, + "loss": 0.0004, + "reward": 1.7392857745289803, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3116 + }, + { + "completion_length": 250.9687623977661, + "epoch": 0.5227377509535186, + "grad_norm": 0.6576136637774147, + "kl": 0.7910614013671875, + "learning_rate": 3.8493827160493826e-07, + "loss": 0.0008, + "reward": 1.7821428924798965, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 3118 + }, + { + "completion_length": 260.790189743042, + "epoch": 0.5230730541933861, + "grad_norm": 0.24315733472477172, + "kl": 0.04315185546875, + "learning_rate": 3.8518518518518515e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7946428805589676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3120 + }, + { + "completion_length": 264.5000114440918, + "epoch": 0.5234083574332536, + "grad_norm": 0.22778397283566784, + "kl": 0.7026290893554688, + "learning_rate": 3.854320987654321e-07, + "loss": 0.0007, + "reward": 1.823214329779148, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8276785910129547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3122 + }, + { + "completion_length": 262.43751335144043, + "epoch": 0.5237436606731213, + "grad_norm": 0.21985933881844663, + "kl": 0.5026016235351562, + "learning_rate": 3.8567901234567903e-07, + "loss": 0.0005, + "reward": 1.742857202887535, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 3124 + }, + { + "completion_length": 249.99108219146729, + "epoch": 0.5240789639129888, + "grad_norm": 0.33885053896790346, + "kl": 0.574188232421875, + "learning_rate": 3.859259259259259e-07, + "loss": 0.0006, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857566475868, + "rewards/format_reward_func": 1.0, + "step": 3126 + }, + { + "completion_length": 257.49108505249023, + "epoch": 0.5244142671528563, + "grad_norm": 0.2792327924735609, + "kl": 0.32171630859375, + "learning_rate": 3.861728395061728e-07, + "loss": 0.0003, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 3128 + }, + { + "completion_length": 263.89733505249023, + "epoch": 0.524749570392724, + "grad_norm": 0.5251332741259391, + "kl": 0.3920097351074219, + "learning_rate": 3.864197530864197e-07, + "loss": 0.0004, + "reward": 1.7375000715255737, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.750892898067832, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3130 + }, + { + "completion_length": 258.620548248291, + "epoch": 0.5250848736325915, + "grad_norm": 0.17825779621215138, + "kl": 0.44925689697265625, + "learning_rate": 3.8666666666666664e-07, + "loss": 0.0004, + "reward": 1.7982143387198448, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.8116071708500385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3132 + }, + { + "completion_length": 259.758939743042, + "epoch": 0.525420176872459, + "grad_norm": 0.22491108314728606, + "kl": 0.089263916015625, + "learning_rate": 3.869135802469136e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3134 + }, + { + "completion_length": 265.76787090301514, + "epoch": 0.5257554801123265, + "grad_norm": 0.25530360444858247, + "kl": 0.16037750244140625, + "learning_rate": 3.8716049382716046e-07, + "loss": 0.0002, + "reward": 1.7232143506407738, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276786155998707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3136 + }, + { + "completion_length": 266.25894260406494, + "epoch": 0.5260907833521942, + "grad_norm": 0.21909712127541298, + "kl": 0.12729644775390625, + "learning_rate": 3.874074074074074e-07, + "loss": 0.0001, + "reward": 1.7250000983476639, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000238418579, + "rewards/format_reward_func": 1.0, + "step": 3138 + }, + { + "completion_length": 254.43304538726807, + "epoch": 0.5264260865920617, + "grad_norm": 0.36150859827321014, + "kl": 0.08214187622070312, + "learning_rate": 3.8765432098765435e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.7366071734577417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3140 + }, + { + "completion_length": 264.1160831451416, + "epoch": 0.5267613898319292, + "grad_norm": 0.2555141927292338, + "kl": 0.055820465087890625, + "learning_rate": 3.879012345679012e-07, + "loss": 0.0001, + "reward": 1.698214367032051, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7116071879863739, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3142 + }, + { + "completion_length": 258.9196548461914, + "epoch": 0.5270966930717969, + "grad_norm": 0.1265249629832738, + "kl": 0.09015274047851562, + "learning_rate": 3.881481481481481e-07, + "loss": 0.0001, + "reward": 1.8392857387661934, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8392857313156128, + "rewards/format_reward_func": 1.0, + "step": 3144 + }, + { + "completion_length": 262.70983600616455, + "epoch": 0.5274319963116644, + "grad_norm": 0.2757642721961975, + "kl": 0.04498291015625, + "learning_rate": 3.88395061728395e-07, + "loss": 0.0, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 3146 + }, + { + "completion_length": 265.4866199493408, + "epoch": 0.5277672995515319, + "grad_norm": 0.21547518845810912, + "kl": 0.05883026123046875, + "learning_rate": 3.8864197530864195e-07, + "loss": 0.0001, + "reward": 1.6821429505944252, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6821428947150707, + "rewards/format_reward_func": 1.0, + "step": 3148 + }, + { + "completion_length": 262.4687623977661, + "epoch": 0.5281026027913994, + "grad_norm": 0.11776663498490876, + "kl": 0.15918731689453125, + "learning_rate": 3.888888888888889e-07, + "loss": 0.0002, + "reward": 1.8321428894996643, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8321428801864386, + "rewards/format_reward_func": 1.0, + "step": 3150 + }, + { + "completion_length": 259.6116247177124, + "epoch": 0.5284379060312671, + "grad_norm": 0.35745340475596804, + "kl": 0.0491790771484375, + "learning_rate": 3.891358024691358e-07, + "loss": 0.0, + "reward": 1.7142857983708382, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3152 + }, + { + "completion_length": 262.5401906967163, + "epoch": 0.5287732092711346, + "grad_norm": 0.21916166918525395, + "kl": 0.047977447509765625, + "learning_rate": 3.893827160493827e-07, + "loss": 0.0, + "reward": 1.7660714760422707, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3154 + }, + { + "completion_length": 264.5000123977661, + "epoch": 0.5291085125110021, + "grad_norm": 0.30194854312892355, + "kl": 0.06650543212890625, + "learning_rate": 3.8962962962962956e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214648663998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3156 + }, + { + "completion_length": 260.9910821914673, + "epoch": 0.5294438157508697, + "grad_norm": 0.08596848806407578, + "kl": 0.05260467529296875, + "learning_rate": 3.898765432098765e-07, + "loss": 0.0001, + "reward": 1.8125000670552254, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8169643096625805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3158 + }, + { + "completion_length": 259.5000104904175, + "epoch": 0.5297791189907373, + "grad_norm": 0.16451427183843081, + "kl": 0.037384033203125, + "learning_rate": 3.9012345679012344e-07, + "loss": 0.0, + "reward": 1.791071467101574, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3160 + }, + { + "completion_length": 267.2187614440918, + "epoch": 0.5301144222306048, + "grad_norm": 0.2709270291715736, + "kl": 0.031284332275390625, + "learning_rate": 3.9037037037037033e-07, + "loss": 0.0, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 3162 + }, + { + "completion_length": 271.5491180419922, + "epoch": 0.5304497254704723, + "grad_norm": 0.3913483799722536, + "kl": 0.08670806884765625, + "learning_rate": 3.9061728395061727e-07, + "loss": 0.0001, + "reward": 1.7875000536441803, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7919643223285675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3164 + }, + { + "completion_length": 257.41965675354004, + "epoch": 0.53078502871034, + "grad_norm": 0.17074821649440147, + "kl": 0.03655242919921875, + "learning_rate": 3.908641975308642e-07, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 3166 + }, + { + "completion_length": 268.9821538925171, + "epoch": 0.5311203319502075, + "grad_norm": 0.19454170410731372, + "kl": 0.0432891845703125, + "learning_rate": 3.911111111111111e-07, + "loss": 0.0, + "reward": 1.6910715103149414, + "reward_std": 0.0732360603287816, + "rewards/equation_reward_func": 0.7044643256813288, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3168 + }, + { + "completion_length": 259.4464406967163, + "epoch": 0.531455635190075, + "grad_norm": 0.2802742714672685, + "kl": 0.03183746337890625, + "learning_rate": 3.91358024691358e-07, + "loss": 0.0, + "reward": 1.7357143685221672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143312692642, + "rewards/format_reward_func": 1.0, + "step": 3170 + }, + { + "completion_length": 255.79465579986572, + "epoch": 0.5317909384299426, + "grad_norm": 0.24431833873124947, + "kl": 0.07769012451171875, + "learning_rate": 3.916049382716049e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428943425417, + "rewards/format_reward_func": 1.0, + "step": 3172 + }, + { + "completion_length": 261.43305015563965, + "epoch": 0.5321262416698102, + "grad_norm": 0.2838905343856962, + "kl": 0.0420074462890625, + "learning_rate": 3.918518518518518e-07, + "loss": 0.0, + "reward": 1.7053572162985802, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214700818062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3174 + }, + { + "completion_length": 261.9464406967163, + "epoch": 0.5324615449096777, + "grad_norm": 0.383031704099419, + "kl": 0.052783966064453125, + "learning_rate": 3.9209876543209876e-07, + "loss": 0.0001, + "reward": 1.7660715132951736, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3176 + }, + { + "completion_length": 253.23661994934082, + "epoch": 0.5327968481495452, + "grad_norm": 0.3703123406378154, + "kl": 0.0786895751953125, + "learning_rate": 3.9234567901234565e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3178 + }, + { + "completion_length": 267.46876525878906, + "epoch": 0.5331321513894128, + "grad_norm": 0.2207114618581663, + "kl": 0.0382232666015625, + "learning_rate": 3.925925925925926e-07, + "loss": 0.0, + "reward": 1.725000061094761, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7339286021888256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3180 + }, + { + "completion_length": 260.8259086608887, + "epoch": 0.5334674546292804, + "grad_norm": 0.3279488891762705, + "kl": 0.03820037841796875, + "learning_rate": 3.9283950617283953e-07, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 3182 + }, + { + "completion_length": 253.90625858306885, + "epoch": 0.5338027578691479, + "grad_norm": 0.2417637339334101, + "kl": 0.031524658203125, + "learning_rate": 3.930864197530864e-07, + "loss": 0.0, + "reward": 1.7375000789761543, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3184 + }, + { + "completion_length": 255.61608600616455, + "epoch": 0.5341380611090155, + "grad_norm": 0.20489918160535506, + "kl": 0.056060791015625, + "learning_rate": 3.933333333333333e-07, + "loss": 0.0001, + "reward": 1.7875000461935997, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3186 + }, + { + "completion_length": 257.433048248291, + "epoch": 0.534473364348883, + "grad_norm": 0.25187838214239483, + "kl": 0.026729583740234375, + "learning_rate": 3.935802469135802e-07, + "loss": 0.0, + "reward": 1.7714286595582962, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7803571820259094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3188 + }, + { + "completion_length": 269.12501335144043, + "epoch": 0.5348086675887506, + "grad_norm": 0.28126525645205525, + "kl": 0.05770111083984375, + "learning_rate": 3.9382716049382714e-07, + "loss": 0.0001, + "reward": 1.6446429342031479, + "reward_std": 0.10859139915555716, + "rewards/equation_reward_func": 0.666964327916503, + "rewards/format_reward_func": 0.977678582072258, + "step": 3190 + }, + { + "completion_length": 264.58929920196533, + "epoch": 0.5351439708286181, + "grad_norm": 0.21792582285765727, + "kl": 0.02896881103515625, + "learning_rate": 3.940740740740741e-07, + "loss": 0.0, + "reward": 1.7214286401867867, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7214286141097546, + "rewards/format_reward_func": 1.0, + "step": 3192 + }, + { + "completion_length": 261.4419775009155, + "epoch": 0.5354792740684857, + "grad_norm": 0.26582125873677714, + "kl": 0.1292572021484375, + "learning_rate": 3.9432098765432097e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143297791481, + "rewards/format_reward_func": 1.0, + "step": 3194 + }, + { + "completion_length": 254.1830472946167, + "epoch": 0.5358145773083532, + "grad_norm": 0.12337232155198748, + "kl": 0.09566116333007812, + "learning_rate": 3.945679012345679e-07, + "loss": 0.0001, + "reward": 1.7767857611179352, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7812500167638063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3196 + }, + { + "completion_length": 246.31250858306885, + "epoch": 0.5361498805482208, + "grad_norm": 0.13809997347069525, + "kl": 0.02978515625, + "learning_rate": 3.948148148148148e-07, + "loss": 0.0, + "reward": 1.7107143551111221, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7107143253087997, + "rewards/format_reward_func": 1.0, + "step": 3198 + }, + { + "completion_length": 252.37055015563965, + "epoch": 0.5364851837880884, + "grad_norm": 0.2090765586833693, + "kl": 0.037384033203125, + "learning_rate": 3.950617283950617e-07, + "loss": 0.0, + "reward": 1.7642858028411865, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 3200 + }, + { + "completion_length": 256.308048248291, + "epoch": 0.5368204870279559, + "grad_norm": 0.2867788682863003, + "kl": 0.027191162109375, + "learning_rate": 3.9530864197530863e-07, + "loss": 0.0, + "reward": 1.7428572326898575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 3202 + }, + { + "completion_length": 242.46876049041748, + "epoch": 0.5371557902678235, + "grad_norm": 0.19248161610849127, + "kl": 0.0301055908203125, + "learning_rate": 3.955555555555555e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 3204 + }, + { + "completion_length": 253.46876049041748, + "epoch": 0.537491093507691, + "grad_norm": 0.3401432023399993, + "kl": 0.047245025634765625, + "learning_rate": 3.9580246913580246e-07, + "loss": 0.0, + "reward": 1.7553572207689285, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3206 + }, + { + "completion_length": 251.1205472946167, + "epoch": 0.5378263967475586, + "grad_norm": 0.3760622304921827, + "kl": 0.032135009765625, + "learning_rate": 3.960493827160494e-07, + "loss": 0.0, + "reward": 1.7767857611179352, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7812500353902578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3208 + }, + { + "completion_length": 252.9330472946167, + "epoch": 0.5381616999874261, + "grad_norm": 0.3604519919978038, + "kl": 0.10263824462890625, + "learning_rate": 3.962962962962963e-07, + "loss": 0.0001, + "reward": 1.7071429565548897, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.707142885774374, + "rewards/format_reward_func": 1.0, + "step": 3210 + }, + { + "completion_length": 251.75893878936768, + "epoch": 0.5384970032272937, + "grad_norm": 0.22457165395344272, + "kl": 0.15355682373046875, + "learning_rate": 3.9654320987654323e-07, + "loss": 0.0002, + "reward": 1.7464286237955093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428620070219, + "rewards/format_reward_func": 1.0, + "step": 3212 + }, + { + "completion_length": 256.7812623977661, + "epoch": 0.5388323064671613, + "grad_norm": 0.21129731942518568, + "kl": 0.11245346069335938, + "learning_rate": 3.9679012345679006e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3214 + }, + { + "completion_length": 246.5000114440918, + "epoch": 0.5391676097070288, + "grad_norm": 0.18051497905212877, + "kl": 0.03157806396484375, + "learning_rate": 3.97037037037037e-07, + "loss": 0.0, + "reward": 1.7625000849366188, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3216 + }, + { + "completion_length": 245.3794765472412, + "epoch": 0.5395029129468963, + "grad_norm": 0.9203589714968138, + "kl": 0.261016845703125, + "learning_rate": 3.9728395061728395e-07, + "loss": 0.0003, + "reward": 1.6696429401636124, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6741071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3218 + }, + { + "completion_length": 250.7232255935669, + "epoch": 0.5398382161867639, + "grad_norm": 0.3017788178994304, + "kl": 0.03530120849609375, + "learning_rate": 3.9753086419753083e-07, + "loss": 0.0, + "reward": 1.677678644657135, + "reward_std": 0.07197336759418249, + "rewards/equation_reward_func": 0.683928593993187, + "rewards/format_reward_func": 0.9937500059604645, + "step": 3220 + }, + { + "completion_length": 249.57144165039062, + "epoch": 0.5401735194266315, + "grad_norm": 0.5284210803368328, + "kl": 0.11692047119140625, + "learning_rate": 3.977777777777778e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 3222 + }, + { + "completion_length": 246.1919765472412, + "epoch": 0.540508822666499, + "grad_norm": 0.13747975941449442, + "kl": 0.24882888793945312, + "learning_rate": 3.9802469135802466e-07, + "loss": 0.0002, + "reward": 1.8142857626080513, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 3224 + }, + { + "completion_length": 255.8080472946167, + "epoch": 0.5408441259063665, + "grad_norm": 0.2599070366859934, + "kl": 0.08871078491210938, + "learning_rate": 3.982716049382716e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3226 + }, + { + "completion_length": 250.17411613464355, + "epoch": 0.5411794291462341, + "grad_norm": 0.5749702489024938, + "kl": 0.1636505126953125, + "learning_rate": 3.985185185185185e-07, + "loss": 0.0002, + "reward": 1.7000000923871994, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7000000309199095, + "rewards/format_reward_func": 1.0, + "step": 3228 + }, + { + "completion_length": 248.71429347991943, + "epoch": 0.5415147323861017, + "grad_norm": 0.4718805170337936, + "kl": 0.0569000244140625, + "learning_rate": 3.987654320987654e-07, + "loss": 0.0001, + "reward": 1.6589286774396896, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.6633928883820772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3230 + }, + { + "completion_length": 251.4196538925171, + "epoch": 0.5418500356259692, + "grad_norm": 0.36286334724653574, + "kl": 0.04682159423828125, + "learning_rate": 3.990123456790123e-07, + "loss": 0.0, + "reward": 1.796428620815277, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7964286133646965, + "rewards/format_reward_func": 1.0, + "step": 3232 + }, + { + "completion_length": 253.1026906967163, + "epoch": 0.5421853388658368, + "grad_norm": 0.24718759183417172, + "kl": 0.08046340942382812, + "learning_rate": 3.9925925925925926e-07, + "loss": 0.0001, + "reward": 1.7875000461935997, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643297791481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3234 + }, + { + "completion_length": 253.43750953674316, + "epoch": 0.5425206421057044, + "grad_norm": 0.14230902666933232, + "kl": 0.028270721435546875, + "learning_rate": 3.9950617283950615e-07, + "loss": 0.0, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 3236 + }, + { + "completion_length": 254.06697940826416, + "epoch": 0.5428559453455719, + "grad_norm": 0.6630361438819221, + "kl": 0.07397842407226562, + "learning_rate": 3.997530864197531e-07, + "loss": 0.0001, + "reward": 1.7107143551111221, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7196429036557674, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3238 + }, + { + "completion_length": 243.4062623977661, + "epoch": 0.5431912485854394, + "grad_norm": 0.15745796244051208, + "kl": 0.023761749267578125, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 1.8089286088943481, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.8133928924798965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3240 + }, + { + "completion_length": 247.17411994934082, + "epoch": 0.543526551825307, + "grad_norm": 0.11801541444411404, + "kl": 0.045162200927734375, + "learning_rate": 4.0024691358024687e-07, + "loss": 0.0, + "reward": 1.667857214808464, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6678571905940771, + "rewards/format_reward_func": 1.0, + "step": 3242 + }, + { + "completion_length": 246.5937614440918, + "epoch": 0.5438618550651746, + "grad_norm": 0.24052847857381288, + "kl": 0.038970947265625, + "learning_rate": 4.004938271604938e-07, + "loss": 0.0, + "reward": 1.7142858058214188, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857480794191, + "rewards/format_reward_func": 1.0, + "step": 3244 + }, + { + "completion_length": 246.37054824829102, + "epoch": 0.5441971583050421, + "grad_norm": 0.20182668701059084, + "kl": 0.0252532958984375, + "learning_rate": 4.007407407407407e-07, + "loss": 0.0, + "reward": 1.791071467101574, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3246 + }, + { + "completion_length": 248.64733505249023, + "epoch": 0.5445324615449096, + "grad_norm": 0.25054728863777276, + "kl": 0.023464202880859375, + "learning_rate": 4.0098765432098764e-07, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3248 + }, + { + "completion_length": 252.946439743042, + "epoch": 0.5448677647847773, + "grad_norm": 0.2787722432356983, + "kl": 0.05184173583984375, + "learning_rate": 4.0123456790123453e-07, + "loss": 0.0001, + "reward": 1.7125001102685928, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643118977547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3250 + }, + { + "completion_length": 242.03572463989258, + "epoch": 0.5452030680246448, + "grad_norm": 0.18001654363123426, + "kl": 0.0496063232421875, + "learning_rate": 4.0148148148148147e-07, + "loss": 0.0, + "reward": 1.8392857611179352, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8392857238650322, + "rewards/format_reward_func": 1.0, + "step": 3252 + }, + { + "completion_length": 244.8839406967163, + "epoch": 0.5455383712645123, + "grad_norm": 0.3107235931047464, + "kl": 0.032939910888671875, + "learning_rate": 4.017283950617284e-07, + "loss": 0.0, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 3254 + }, + { + "completion_length": 254.87054443359375, + "epoch": 0.5458736745043798, + "grad_norm": 0.5024745088879808, + "kl": 0.0463104248046875, + "learning_rate": 4.0197530864197525e-07, + "loss": 0.0, + "reward": 1.7660714983940125, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3256 + }, + { + "completion_length": 243.50447368621826, + "epoch": 0.5462089777442475, + "grad_norm": 0.19769072115630126, + "kl": 0.028018951416015625, + "learning_rate": 4.022222222222222e-07, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 3258 + }, + { + "completion_length": 252.5134038925171, + "epoch": 0.546544280984115, + "grad_norm": 0.25804087534525366, + "kl": 0.037353515625, + "learning_rate": 4.0246913580246913e-07, + "loss": 0.0, + "reward": 1.7875000685453415, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3260 + }, + { + "completion_length": 249.3125114440918, + "epoch": 0.5468795842239825, + "grad_norm": 0.3788881858860527, + "kl": 0.036830902099609375, + "learning_rate": 4.02716049382716e-07, + "loss": 0.0, + "reward": 1.7178572192788124, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.717857176437974, + "rewards/format_reward_func": 1.0, + "step": 3262 + }, + { + "completion_length": 249.19643878936768, + "epoch": 0.5472148874638502, + "grad_norm": 0.15403221802227582, + "kl": 0.03723907470703125, + "learning_rate": 4.0296296296296296e-07, + "loss": 0.0, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 3264 + }, + { + "completion_length": 242.4509048461914, + "epoch": 0.5475501907037177, + "grad_norm": 0.4151835339606924, + "kl": 0.03864288330078125, + "learning_rate": 4.0320987654320985e-07, + "loss": 0.0, + "reward": 1.7982143461704254, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3266 + }, + { + "completion_length": 257.71876335144043, + "epoch": 0.5478854939435852, + "grad_norm": 0.28616567137191395, + "kl": 0.03456878662109375, + "learning_rate": 4.034567901234568e-07, + "loss": 0.0, + "reward": 1.6892858296632767, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.698214303702116, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3268 + }, + { + "completion_length": 247.76786613464355, + "epoch": 0.5482207971834527, + "grad_norm": 0.33138120271755234, + "kl": 0.027130126953125, + "learning_rate": 4.0370370370370373e-07, + "loss": 0.0, + "reward": 1.807142898440361, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 3270 + }, + { + "completion_length": 253.633939743042, + "epoch": 0.5485561004233204, + "grad_norm": 0.2025119387803373, + "kl": 0.07606124877929688, + "learning_rate": 4.0395061728395057e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3272 + }, + { + "completion_length": 249.10269165039062, + "epoch": 0.5488914036631879, + "grad_norm": 0.3509880306028421, + "kl": 0.06534576416015625, + "learning_rate": 4.041975308641975e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 3274 + }, + { + "completion_length": 248.8884048461914, + "epoch": 0.5492267069030554, + "grad_norm": 0.3225348638068555, + "kl": 0.06076812744140625, + "learning_rate": 4.044444444444444e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 3276 + }, + { + "completion_length": 246.52679347991943, + "epoch": 0.549562010142923, + "grad_norm": 0.1652321697093135, + "kl": 0.045623779296875, + "learning_rate": 4.0469135802469134e-07, + "loss": 0.0, + "reward": 1.7678572237491608, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 3278 + }, + { + "completion_length": 248.196439743042, + "epoch": 0.5498973133827906, + "grad_norm": 0.27022884678968045, + "kl": 0.03060150146484375, + "learning_rate": 4.049382716049383e-07, + "loss": 0.0, + "reward": 1.8160714954137802, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8205357268452644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3280 + }, + { + "completion_length": 257.5267972946167, + "epoch": 0.5502326166226581, + "grad_norm": 0.4939771906485696, + "kl": 0.14651107788085938, + "learning_rate": 4.0518518518518517e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7544643189758062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3282 + }, + { + "completion_length": 248.93750858306885, + "epoch": 0.5505679198625256, + "grad_norm": 0.3348022744898028, + "kl": 0.032135009765625, + "learning_rate": 4.054320987654321e-07, + "loss": 0.0, + "reward": 1.7125000655651093, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3284 + }, + { + "completion_length": 253.27679538726807, + "epoch": 0.5509032231023933, + "grad_norm": 0.4286314695204813, + "kl": 0.06396102905273438, + "learning_rate": 4.05679012345679e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7169643193483353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3286 + }, + { + "completion_length": 244.12947750091553, + "epoch": 0.5512385263422608, + "grad_norm": 0.23906097739214408, + "kl": 0.027721405029296875, + "learning_rate": 4.059259259259259e-07, + "loss": 0.0, + "reward": 1.7767857983708382, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3288 + }, + { + "completion_length": 248.7991189956665, + "epoch": 0.5515738295821283, + "grad_norm": 0.3127992260300639, + "kl": 0.031581878662109375, + "learning_rate": 4.061728395061728e-07, + "loss": 0.0, + "reward": 1.7071429342031479, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7071428820490837, + "rewards/format_reward_func": 1.0, + "step": 3290 + }, + { + "completion_length": 249.4330472946167, + "epoch": 0.5519091328219959, + "grad_norm": 0.20008099014907435, + "kl": 0.0303955078125, + "learning_rate": 4.064197530864197e-07, + "loss": 0.0, + "reward": 1.748214341700077, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7526785936206579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3292 + }, + { + "completion_length": 256.7366180419922, + "epoch": 0.5522444360618635, + "grad_norm": 0.2468886215141364, + "kl": 0.029834747314453125, + "learning_rate": 4.0666666666666666e-07, + "loss": 0.0, + "reward": 1.7267858013510704, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3294 + }, + { + "completion_length": 255.7053680419922, + "epoch": 0.552579739301731, + "grad_norm": 0.34286991600304806, + "kl": 0.032001495361328125, + "learning_rate": 4.069135802469136e-07, + "loss": 0.0, + "reward": 1.792857177555561, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 3296 + }, + { + "completion_length": 246.50001049041748, + "epoch": 0.5529150425415985, + "grad_norm": 0.24654089324264858, + "kl": 0.024234771728515625, + "learning_rate": 4.071604938271605e-07, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 3298 + }, + { + "completion_length": 256.11162090301514, + "epoch": 0.5532503457814661, + "grad_norm": 0.28781653670523644, + "kl": 0.028278350830078125, + "learning_rate": 4.0740740740740737e-07, + "loss": 0.0, + "reward": 1.7642857655882835, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3300 + }, + { + "completion_length": 249.9642972946167, + "epoch": 0.5535856490213337, + "grad_norm": 0.3905942900862037, + "kl": 0.026691436767578125, + "learning_rate": 4.0765432098765426e-07, + "loss": 0.0, + "reward": 1.778571493923664, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 3302 + }, + { + "completion_length": 241.17858219146729, + "epoch": 0.5539209522612012, + "grad_norm": 0.29323430298274383, + "kl": 0.028812408447265625, + "learning_rate": 4.079012345679012e-07, + "loss": 0.0, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 3304 + }, + { + "completion_length": 242.20983219146729, + "epoch": 0.5542562555010688, + "grad_norm": 0.15078015109364112, + "kl": 0.033367156982421875, + "learning_rate": 4.0814814814814814e-07, + "loss": 0.0, + "reward": 1.8178571835160255, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 3306 + }, + { + "completion_length": 265.4196557998657, + "epoch": 0.5545915587409364, + "grad_norm": 0.3085516247377834, + "kl": 0.03081512451171875, + "learning_rate": 4.0839506172839503e-07, + "loss": 0.0, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 3308 + }, + { + "completion_length": 247.75447463989258, + "epoch": 0.5549268619808039, + "grad_norm": 0.3167820411329688, + "kl": 0.0263214111328125, + "learning_rate": 4.08641975308642e-07, + "loss": 0.0, + "reward": 1.735714390873909, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7446428947150707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3310 + }, + { + "completion_length": 253.93304538726807, + "epoch": 0.5552621652206714, + "grad_norm": 0.26818302974849845, + "kl": 0.02646636962890625, + "learning_rate": 4.088888888888889e-07, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000443309546, + "rewards/format_reward_func": 1.0, + "step": 3312 + }, + { + "completion_length": 244.9107255935669, + "epoch": 0.555597468460539, + "grad_norm": 0.18918188197499258, + "kl": 0.030887603759765625, + "learning_rate": 4.0913580246913575e-07, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 3314 + }, + { + "completion_length": 253.24108123779297, + "epoch": 0.5559327717004066, + "grad_norm": 0.1587575101234306, + "kl": 0.03311920166015625, + "learning_rate": 4.093827160493827e-07, + "loss": 0.0, + "reward": 1.7482143267989159, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3316 + }, + { + "completion_length": 243.60715579986572, + "epoch": 0.5562680749402741, + "grad_norm": 0.1467960955689759, + "kl": 0.02486419677734375, + "learning_rate": 4.096296296296296e-07, + "loss": 0.0, + "reward": 1.7714286521077156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 3318 + }, + { + "completion_length": 266.9151906967163, + "epoch": 0.5566033781801417, + "grad_norm": 0.13892360946264437, + "kl": 0.045444488525390625, + "learning_rate": 4.098765432098765e-07, + "loss": 0.0, + "reward": 1.7714286595582962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 3320 + }, + { + "completion_length": 254.00447463989258, + "epoch": 0.5569386814200092, + "grad_norm": 0.24898486187793703, + "kl": 0.030483245849609375, + "learning_rate": 4.1012345679012346e-07, + "loss": 0.0, + "reward": 1.7946429327130318, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3322 + }, + { + "completion_length": 250.87501335144043, + "epoch": 0.5572739846598768, + "grad_norm": 0.2500029856545992, + "kl": 0.03379058837890625, + "learning_rate": 4.1037037037037035e-07, + "loss": 0.0, + "reward": 1.7535714879631996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 3324 + }, + { + "completion_length": 255.415189743042, + "epoch": 0.5576092878997443, + "grad_norm": 0.29364029372225736, + "kl": 0.020549774169921875, + "learning_rate": 4.106172839506173e-07, + "loss": 0.0, + "reward": 1.7285715192556381, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 3326 + }, + { + "completion_length": 266.12054920196533, + "epoch": 0.5579445911396119, + "grad_norm": 0.25937008559805713, + "kl": 0.05391693115234375, + "learning_rate": 4.1086419753086413e-07, + "loss": 0.0001, + "reward": 1.6928572207689285, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7017857618629932, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3328 + }, + { + "completion_length": 254.53126525878906, + "epoch": 0.5582798943794794, + "grad_norm": 0.25669521443692506, + "kl": 0.018245697021484375, + "learning_rate": 4.1111111111111107e-07, + "loss": 0.0, + "reward": 1.805357187986374, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.809821454808116, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3330 + }, + { + "completion_length": 271.227689743042, + "epoch": 0.558615197619347, + "grad_norm": 0.3242229226510621, + "kl": 0.025936126708984375, + "learning_rate": 4.11358024691358e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7946428842842579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3332 + }, + { + "completion_length": 257.14733123779297, + "epoch": 0.5589505008592146, + "grad_norm": 0.16505049686226236, + "kl": 0.01825714111328125, + "learning_rate": 4.116049382716049e-07, + "loss": 0.0, + "reward": 1.733928643167019, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7383929006755352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3334 + }, + { + "completion_length": 243.60268878936768, + "epoch": 0.5592858040990821, + "grad_norm": 0.24723268697187017, + "kl": 0.021701812744140625, + "learning_rate": 4.1185185185185184e-07, + "loss": 0.0, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428921073675, + "rewards/format_reward_func": 1.0, + "step": 3336 + }, + { + "completion_length": 253.2500114440918, + "epoch": 0.5596211073389497, + "grad_norm": 0.2512410261507857, + "kl": 0.032398223876953125, + "learning_rate": 4.120987654320988e-07, + "loss": 0.0, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500201165676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3338 + }, + { + "completion_length": 247.8928680419922, + "epoch": 0.5599564105788172, + "grad_norm": 0.20871903585274287, + "kl": 0.02616119384765625, + "learning_rate": 4.1234567901234567e-07, + "loss": 0.0, + "reward": 1.8214286267757416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 3340 + }, + { + "completion_length": 262.75893783569336, + "epoch": 0.5602917138186848, + "grad_norm": 0.3195999706923693, + "kl": 0.020320892333984375, + "learning_rate": 4.1259259259259256e-07, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3342 + }, + { + "completion_length": 258.5803699493408, + "epoch": 0.5606270170585523, + "grad_norm": 0.12534649378964235, + "kl": 0.021961212158203125, + "learning_rate": 4.1283950617283945e-07, + "loss": 0.0, + "reward": 1.7589286267757416, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928768336773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3344 + }, + { + "completion_length": 269.0892963409424, + "epoch": 0.5609623202984199, + "grad_norm": 0.18819239134083926, + "kl": 0.022518157958984375, + "learning_rate": 4.130864197530864e-07, + "loss": 0.0, + "reward": 1.7785715088248253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 3346 + }, + { + "completion_length": 256.68304443359375, + "epoch": 0.5612976235382875, + "grad_norm": 0.17556385032649469, + "kl": 0.034328460693359375, + "learning_rate": 4.1333333333333333e-07, + "loss": 0.0, + "reward": 1.714285783469677, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.722321467474103, + "rewards/format_reward_func": 0.9919642880558968, + "step": 3348 + }, + { + "completion_length": 258.80804538726807, + "epoch": 0.561632926778155, + "grad_norm": 0.19901055099402243, + "kl": 0.05564117431640625, + "learning_rate": 4.135802469135802e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 3350 + }, + { + "completion_length": 258.51786708831787, + "epoch": 0.5619682300180225, + "grad_norm": 0.2753161669446979, + "kl": 0.25748443603515625, + "learning_rate": 4.1382716049382716e-07, + "loss": 0.0003, + "reward": 1.7946429252624512, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.8080357350409031, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3352 + }, + { + "completion_length": 268.1294775009155, + "epoch": 0.5623035332578901, + "grad_norm": 0.25622783243812475, + "kl": 0.02396392822265625, + "learning_rate": 4.140740740740741e-07, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.10606601554900408, + "rewards/equation_reward_func": 0.7428571581840515, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3354 + }, + { + "completion_length": 259.7812614440918, + "epoch": 0.5626388364977577, + "grad_norm": 0.11022749186091951, + "kl": 0.02692413330078125, + "learning_rate": 4.1432098765432094e-07, + "loss": 0.0, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 3356 + }, + { + "completion_length": 251.7053689956665, + "epoch": 0.5629741397376252, + "grad_norm": 0.25546924480034083, + "kl": 0.02170562744140625, + "learning_rate": 4.145679012345679e-07, + "loss": 0.0, + "reward": 1.8000000640749931, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 3358 + }, + { + "completion_length": 268.7857246398926, + "epoch": 0.5633094429774927, + "grad_norm": 0.26079986354717527, + "kl": 0.06954193115234375, + "learning_rate": 4.1481481481481476e-07, + "loss": 0.0001, + "reward": 1.7910714820027351, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3360 + }, + { + "completion_length": 254.7366180419922, + "epoch": 0.5636447462173603, + "grad_norm": 0.10312987099632315, + "kl": 0.0501708984375, + "learning_rate": 4.150617283950617e-07, + "loss": 0.0001, + "reward": 1.8660714626312256, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8705357313156128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3362 + }, + { + "completion_length": 271.85715770721436, + "epoch": 0.5639800494572279, + "grad_norm": 0.21244909731089875, + "kl": 0.06446456909179688, + "learning_rate": 4.1530864197530865e-07, + "loss": 0.0001, + "reward": 1.666071504354477, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.67946432903409, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3364 + }, + { + "completion_length": 263.2009057998657, + "epoch": 0.5643153526970954, + "grad_norm": 0.282676138928249, + "kl": 0.0485687255859375, + "learning_rate": 4.1555555555555554e-07, + "loss": 0.0, + "reward": 1.7142857983708382, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.723214328289032, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3366 + }, + { + "completion_length": 249.79465579986572, + "epoch": 0.564650655936963, + "grad_norm": 0.2782074907909122, + "kl": 0.0198974609375, + "learning_rate": 4.158024691358025e-07, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 3368 + }, + { + "completion_length": 265.97768783569336, + "epoch": 0.5649859591768306, + "grad_norm": 0.28559346638349703, + "kl": 0.0273895263671875, + "learning_rate": 4.1604938271604937e-07, + "loss": 0.0, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 3370 + }, + { + "completion_length": 257.76786708831787, + "epoch": 0.5653212624166981, + "grad_norm": 0.318607705266382, + "kl": 0.022426605224609375, + "learning_rate": 4.1629629629629625e-07, + "loss": 0.0, + "reward": 1.7642858028411865, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7642857450991869, + "rewards/format_reward_func": 1.0, + "step": 3372 + }, + { + "completion_length": 269.2634057998657, + "epoch": 0.5656565656565656, + "grad_norm": 0.23426763199945552, + "kl": 0.02051544189453125, + "learning_rate": 4.165432098765432e-07, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 3374 + }, + { + "completion_length": 273.3705463409424, + "epoch": 0.5659918688964332, + "grad_norm": 0.23627110930730033, + "kl": 0.06494903564453125, + "learning_rate": 4.167901234567901e-07, + "loss": 0.0001, + "reward": 1.667857214808464, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6767857521772385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3376 + }, + { + "completion_length": 261.133939743042, + "epoch": 0.5663271721363008, + "grad_norm": 0.2841826348282849, + "kl": 0.051708221435546875, + "learning_rate": 4.17037037037037e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 3378 + }, + { + "completion_length": 258.04911708831787, + "epoch": 0.5666624753761683, + "grad_norm": 0.12243006146358627, + "kl": 0.0408172607421875, + "learning_rate": 4.1728395061728397e-07, + "loss": 0.0, + "reward": 1.7357143238186836, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.744642898440361, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3380 + }, + { + "completion_length": 264.5535831451416, + "epoch": 0.5669977786160358, + "grad_norm": 0.185726261550338, + "kl": 0.03836822509765625, + "learning_rate": 4.1753086419753085e-07, + "loss": 0.0, + "reward": 1.7839286178350449, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928902447224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3382 + }, + { + "completion_length": 268.5580472946167, + "epoch": 0.5673330818559035, + "grad_norm": 0.646482019678551, + "kl": 0.1957550048828125, + "learning_rate": 4.177777777777778e-07, + "loss": 0.0002, + "reward": 1.725000075995922, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7428571581840515, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3384 + }, + { + "completion_length": 262.4910840988159, + "epoch": 0.567668385095771, + "grad_norm": 0.28343271824101846, + "kl": 0.032367706298828125, + "learning_rate": 4.1802469135802463e-07, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071823984385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3386 + }, + { + "completion_length": 267.2009048461914, + "epoch": 0.5680036883356385, + "grad_norm": 0.30762902087198446, + "kl": 0.22566604614257812, + "learning_rate": 4.1827160493827157e-07, + "loss": 0.0002, + "reward": 1.7607143372297287, + "reward_std": 0.09596448950469494, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3388 + }, + { + "completion_length": 261.4509029388428, + "epoch": 0.568338991575506, + "grad_norm": 0.11987462910732502, + "kl": 0.06043243408203125, + "learning_rate": 4.185185185185185e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7375000324100256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3390 + }, + { + "completion_length": 264.91072368621826, + "epoch": 0.5686742948153737, + "grad_norm": 0.19351392333550568, + "kl": 0.05767250061035156, + "learning_rate": 4.187654320987654e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3392 + }, + { + "completion_length": 259.0089406967163, + "epoch": 0.5690095980552412, + "grad_norm": 0.19611064120910207, + "kl": 0.3055915832519531, + "learning_rate": 4.1901234567901234e-07, + "loss": 0.0003, + "reward": 1.6803572252392769, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.684821467846632, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3394 + }, + { + "completion_length": 255.05804824829102, + "epoch": 0.5693449012951087, + "grad_norm": 0.23378041111291098, + "kl": 0.034976959228515625, + "learning_rate": 4.1925925925925923e-07, + "loss": 0.0, + "reward": 1.7642857655882835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857562750578, + "rewards/format_reward_func": 1.0, + "step": 3396 + }, + { + "completion_length": 255.34822463989258, + "epoch": 0.5696802045349764, + "grad_norm": 0.20520753045702905, + "kl": 0.12027740478515625, + "learning_rate": 4.1950617283950617e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 3398 + }, + { + "completion_length": 263.2142972946167, + "epoch": 0.5700155077748439, + "grad_norm": 0.23000060551566762, + "kl": 0.09436416625976562, + "learning_rate": 4.1975308641975306e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571429014205933, + "rewards/format_reward_func": 1.0, + "step": 3400 + }, + { + "completion_length": 257.0535840988159, + "epoch": 0.5703508110147114, + "grad_norm": 0.27011411244907235, + "kl": 0.02211761474609375, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "reward": 1.7678572162985802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 3402 + }, + { + "completion_length": 260.5580472946167, + "epoch": 0.5706861142545789, + "grad_norm": 0.35785158580122634, + "kl": 0.030055999755859375, + "learning_rate": 4.202469135802469e-07, + "loss": 0.0, + "reward": 1.773214377462864, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3404 + }, + { + "completion_length": 269.73662090301514, + "epoch": 0.5710214174944466, + "grad_norm": 0.2128801567734277, + "kl": 0.246490478515625, + "learning_rate": 4.2049382716049383e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 3406 + }, + { + "completion_length": 270.5312623977661, + "epoch": 0.5713567207343141, + "grad_norm": 0.21184046708081816, + "kl": 0.039031982421875, + "learning_rate": 4.207407407407407e-07, + "loss": 0.0, + "reward": 1.7464286610484123, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 1.0, + "step": 3408 + }, + { + "completion_length": 268.50001335144043, + "epoch": 0.5716920239741816, + "grad_norm": 0.46570394726523234, + "kl": 0.2350921630859375, + "learning_rate": 4.2098765432098766e-07, + "loss": 0.0002, + "reward": 1.735267922282219, + "reward_std": 0.08144354820251465, + "rewards/equation_reward_func": 0.7473214603960514, + "rewards/format_reward_func": 0.9879464358091354, + "step": 3410 + }, + { + "completion_length": 267.25894260406494, + "epoch": 0.5720273272140493, + "grad_norm": 0.2918455764955246, + "kl": 0.12889862060546875, + "learning_rate": 4.2123456790123455e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 3412 + }, + { + "completion_length": 262.6294765472412, + "epoch": 0.5723626304539168, + "grad_norm": 0.24566651888157798, + "kl": 0.15311050415039062, + "learning_rate": 4.2148148148148144e-07, + "loss": 0.0002, + "reward": 1.7660714983940125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3414 + }, + { + "completion_length": 265.6651906967163, + "epoch": 0.5726979336937843, + "grad_norm": 0.3001935923900465, + "kl": 0.13809585571289062, + "learning_rate": 4.217283950617284e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7732143215835094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3416 + }, + { + "completion_length": 257.58483505249023, + "epoch": 0.5730332369336518, + "grad_norm": 0.2690218679515261, + "kl": 0.030422210693359375, + "learning_rate": 4.2197530864197527e-07, + "loss": 0.0, + "reward": 1.7464286461472511, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7553571704775095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3418 + }, + { + "completion_length": 260.55358600616455, + "epoch": 0.5733685401735195, + "grad_norm": 0.5002997706686756, + "kl": 0.168212890625, + "learning_rate": 4.222222222222222e-07, + "loss": 0.0002, + "reward": 1.6875000596046448, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.6919643338769674, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3420 + }, + { + "completion_length": 254.7410831451416, + "epoch": 0.573703843413387, + "grad_norm": 0.2889726297534045, + "kl": 0.024105072021484375, + "learning_rate": 4.224691358024691e-07, + "loss": 0.0, + "reward": 1.7678572088479996, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571902215481, + "rewards/format_reward_func": 1.0, + "step": 3422 + }, + { + "completion_length": 253.49108600616455, + "epoch": 0.5740391466532545, + "grad_norm": 0.21100531871607167, + "kl": 0.02407073974609375, + "learning_rate": 4.2271604938271604e-07, + "loss": 0.0, + "reward": 1.8285714983940125, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 3424 + }, + { + "completion_length": 262.3169775009155, + "epoch": 0.5743744498931221, + "grad_norm": 0.33953151575700286, + "kl": 0.038204193115234375, + "learning_rate": 4.22962962962963e-07, + "loss": 0.0, + "reward": 1.7232143580913544, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3426 + }, + { + "completion_length": 265.00447845458984, + "epoch": 0.5747097531329897, + "grad_norm": 0.33966484859625834, + "kl": 0.032154083251953125, + "learning_rate": 4.232098765432098e-07, + "loss": 0.0, + "reward": 1.7285714820027351, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 3428 + }, + { + "completion_length": 263.45537185668945, + "epoch": 0.5750450563728572, + "grad_norm": 0.2174688881241481, + "kl": 0.09495925903320312, + "learning_rate": 4.2345679012345676e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7553571686148643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3430 + }, + { + "completion_length": 267.9419765472412, + "epoch": 0.5753803596127247, + "grad_norm": 0.18092360650560677, + "kl": 0.2818183898925781, + "learning_rate": 4.237037037037037e-07, + "loss": 0.0003, + "reward": 1.757142923772335, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 3432 + }, + { + "completion_length": 268.40180015563965, + "epoch": 0.5757156628525923, + "grad_norm": 0.36947735776015517, + "kl": 0.067230224609375, + "learning_rate": 4.239506172839506e-07, + "loss": 0.0001, + "reward": 1.7517858073115349, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3434 + }, + { + "completion_length": 264.8303737640381, + "epoch": 0.5760509660924599, + "grad_norm": 0.18742152942426193, + "kl": 0.06931304931640625, + "learning_rate": 4.2419753086419753e-07, + "loss": 0.0001, + "reward": 1.8160714954137802, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8205357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3436 + }, + { + "completion_length": 264.5312614440918, + "epoch": 0.5763862693323274, + "grad_norm": 0.2804896692149197, + "kl": 0.05013275146484375, + "learning_rate": 4.244444444444444e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 3438 + }, + { + "completion_length": 271.4732275009155, + "epoch": 0.576721572572195, + "grad_norm": 0.18135283016552467, + "kl": 0.050182342529296875, + "learning_rate": 4.2469135802469136e-07, + "loss": 0.0001, + "reward": 1.7107143849134445, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7107143141329288, + "rewards/format_reward_func": 1.0, + "step": 3440 + }, + { + "completion_length": 269.2544765472412, + "epoch": 0.5770568758120626, + "grad_norm": 0.2010359293113445, + "kl": 0.035064697265625, + "learning_rate": 4.2493827160493825e-07, + "loss": 0.0, + "reward": 1.7428572177886963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 3442 + }, + { + "completion_length": 271.80358505249023, + "epoch": 0.5773921790519301, + "grad_norm": 0.23671808515711454, + "kl": 0.025234222412109375, + "learning_rate": 4.2518518518518513e-07, + "loss": 0.0, + "reward": 1.725000075995922, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 3444 + }, + { + "completion_length": 252.01340579986572, + "epoch": 0.5777274822917976, + "grad_norm": 0.1996366433463103, + "kl": 0.024082183837890625, + "learning_rate": 4.254320987654321e-07, + "loss": 0.0, + "reward": 1.785714328289032, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 3446 + }, + { + "completion_length": 273.3348340988159, + "epoch": 0.5780627855316652, + "grad_norm": 0.14140392980556388, + "kl": 0.033966064453125, + "learning_rate": 4.2567901234567896e-07, + "loss": 0.0, + "reward": 1.803571492433548, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 3448 + }, + { + "completion_length": 261.0446548461914, + "epoch": 0.5783980887715328, + "grad_norm": 0.25123462292319676, + "kl": 0.0379486083984375, + "learning_rate": 4.259259259259259e-07, + "loss": 0.0, + "reward": 1.7392857670783997, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857298254967, + "rewards/format_reward_func": 1.0, + "step": 3450 + }, + { + "completion_length": 267.0893020629883, + "epoch": 0.5787333920114003, + "grad_norm": 0.2699059485478521, + "kl": 0.11783599853515625, + "learning_rate": 4.2617283950617285e-07, + "loss": 0.0001, + "reward": 1.69821435213089, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7026785928755999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3452 + }, + { + "completion_length": 268.49108600616455, + "epoch": 0.5790686952512679, + "grad_norm": 0.2084339363468767, + "kl": 0.033172607421875, + "learning_rate": 4.2641975308641973e-07, + "loss": 0.0, + "reward": 1.758482187986374, + "reward_std": 0.03851206600666046, + "rewards/equation_reward_func": 0.7660714592784643, + "rewards/format_reward_func": 0.9924107193946838, + "step": 3454 + }, + { + "completion_length": 259.3839406967163, + "epoch": 0.5794039984911354, + "grad_norm": 0.17434411741363384, + "kl": 0.0621795654296875, + "learning_rate": 4.266666666666667e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 3456 + }, + { + "completion_length": 265.0089387893677, + "epoch": 0.579739301731003, + "grad_norm": 0.3167726445807968, + "kl": 0.03658294677734375, + "learning_rate": 4.2691358024691356e-07, + "loss": 0.0, + "reward": 1.7785714715719223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 3458 + }, + { + "completion_length": 259.3928699493408, + "epoch": 0.5800746049708705, + "grad_norm": 0.2945341184109623, + "kl": 0.0406341552734375, + "learning_rate": 4.2716049382716045e-07, + "loss": 0.0, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 3460 + }, + { + "completion_length": 265.3660821914673, + "epoch": 0.5804099082107381, + "grad_norm": 0.0889414542345476, + "kl": 0.028598785400390625, + "learning_rate": 4.274074074074074e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 3462 + }, + { + "completion_length": 262.91072940826416, + "epoch": 0.5807452114506056, + "grad_norm": 0.13428184690834213, + "kl": 0.03705596923828125, + "learning_rate": 4.276543209876543e-07, + "loss": 0.0, + "reward": 1.7196429371833801, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3464 + }, + { + "completion_length": 263.2098340988159, + "epoch": 0.5810805146904732, + "grad_norm": 0.2829331040542585, + "kl": 0.05352783203125, + "learning_rate": 4.279012345679012e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7553571686148643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3466 + }, + { + "completion_length": 272.1339416503906, + "epoch": 0.5814158179303408, + "grad_norm": 0.15916034359450193, + "kl": 0.05171966552734375, + "learning_rate": 4.2814814814814816e-07, + "loss": 0.0001, + "reward": 1.7196429446339607, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3468 + }, + { + "completion_length": 256.48215770721436, + "epoch": 0.5817511211702083, + "grad_norm": 0.3651088268384066, + "kl": 0.0333251953125, + "learning_rate": 4.2839506172839505e-07, + "loss": 0.0, + "reward": 1.7107143700122833, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7196428962051868, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3470 + }, + { + "completion_length": 259.0134029388428, + "epoch": 0.5820864244100759, + "grad_norm": 0.41088836334113304, + "kl": 0.04825592041015625, + "learning_rate": 4.2864197530864194e-07, + "loss": 0.0, + "reward": 1.7553572282195091, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3472 + }, + { + "completion_length": 256.48662090301514, + "epoch": 0.5824217276499434, + "grad_norm": 0.14851198065644858, + "kl": 0.032390594482421875, + "learning_rate": 4.2888888888888883e-07, + "loss": 0.0, + "reward": 1.79464291036129, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3474 + }, + { + "completion_length": 258.7901916503906, + "epoch": 0.582757030889811, + "grad_norm": 0.1969503633329373, + "kl": 0.0368194580078125, + "learning_rate": 4.2913580246913577e-07, + "loss": 0.0, + "reward": 1.7089286595582962, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928835391998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3476 + }, + { + "completion_length": 257.8928699493408, + "epoch": 0.5830923341296785, + "grad_norm": 0.3246895776320101, + "kl": 0.12652206420898438, + "learning_rate": 4.293827160493827e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 3478 + }, + { + "completion_length": 258.3392963409424, + "epoch": 0.5834276373695461, + "grad_norm": 0.28282227891625317, + "kl": 0.02780914306640625, + "learning_rate": 4.296296296296296e-07, + "loss": 0.0, + "reward": 1.7375000789761543, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7508928887546062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3480 + }, + { + "completion_length": 257.0491189956665, + "epoch": 0.5837629406094136, + "grad_norm": 0.18441674849853126, + "kl": 0.02960205078125, + "learning_rate": 4.2987654320987654e-07, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 3482 + }, + { + "completion_length": 260.7812614440918, + "epoch": 0.5840982438492812, + "grad_norm": 0.23324259920480803, + "kl": 0.03562164306640625, + "learning_rate": 4.301234567901235e-07, + "loss": 0.0, + "reward": 1.725000075995922, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000238418579, + "rewards/format_reward_func": 1.0, + "step": 3484 + }, + { + "completion_length": 263.1651906967163, + "epoch": 0.5844335470891487, + "grad_norm": 0.41779317202713057, + "kl": 0.08310699462890625, + "learning_rate": 4.303703703703703e-07, + "loss": 0.0001, + "reward": 1.6625000908970833, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.6758929044008255, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3486 + }, + { + "completion_length": 254.05804824829102, + "epoch": 0.5847688503290163, + "grad_norm": 0.21896304164973182, + "kl": 0.16289520263671875, + "learning_rate": 4.3061728395061726e-07, + "loss": 0.0002, + "reward": 1.7589286267757416, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3488 + }, + { + "completion_length": 250.95090103149414, + "epoch": 0.5851041535688839, + "grad_norm": 0.15230906377968959, + "kl": 0.1225433349609375, + "learning_rate": 4.3086419753086415e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 3490 + }, + { + "completion_length": 253.61161708831787, + "epoch": 0.5854394568087514, + "grad_norm": 0.13634640897201905, + "kl": 0.06240081787109375, + "learning_rate": 4.311111111111111e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.792857164517045, + "rewards/format_reward_func": 1.0, + "step": 3492 + }, + { + "completion_length": 259.2410831451416, + "epoch": 0.585774760048619, + "grad_norm": 0.20498559394312932, + "kl": 0.02706146240234375, + "learning_rate": 4.3135802469135803e-07, + "loss": 0.0, + "reward": 1.8125000298023224, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.8169643245637417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3494 + }, + { + "completion_length": 249.1384048461914, + "epoch": 0.5861100632884865, + "grad_norm": 0.31684540637992265, + "kl": 0.05062103271484375, + "learning_rate": 4.316049382716049e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 3496 + }, + { + "completion_length": 265.8660821914673, + "epoch": 0.5864453665283541, + "grad_norm": 0.2820708311134308, + "kl": 0.08450698852539062, + "learning_rate": 4.3185185185185186e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8339285887777805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3498 + }, + { + "completion_length": 258.276798248291, + "epoch": 0.5867806697682216, + "grad_norm": 0.18988002197728343, + "kl": 0.0531158447265625, + "learning_rate": 4.320987654320987e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8017857559025288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3500 + }, + { + "completion_length": 253.03572940826416, + "epoch": 0.5871159730080892, + "grad_norm": 0.17893344507554287, + "kl": 0.02783966064453125, + "learning_rate": 4.3234567901234564e-07, + "loss": 0.0, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 3502 + }, + { + "completion_length": 259.8973321914673, + "epoch": 0.5874512762479568, + "grad_norm": 0.22936645917658965, + "kl": 0.02904510498046875, + "learning_rate": 4.325925925925926e-07, + "loss": 0.0, + "reward": 1.7857143580913544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 3504 + }, + { + "completion_length": 260.5357275009155, + "epoch": 0.5877865794878243, + "grad_norm": 0.41946924390943746, + "kl": 0.3809051513671875, + "learning_rate": 4.3283950617283947e-07, + "loss": 0.0004, + "reward": 1.7357143387198448, + "reward_std": 0.10606601741164923, + "rewards/equation_reward_func": 0.7625000216066837, + "rewards/format_reward_func": 0.9732142984867096, + "step": 3506 + }, + { + "completion_length": 259.0982275009155, + "epoch": 0.5881218827276918, + "grad_norm": 0.22516693768991086, + "kl": 0.03868865966796875, + "learning_rate": 4.330864197530864e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7696428932249546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3508 + }, + { + "completion_length": 246.68304634094238, + "epoch": 0.5884571859675594, + "grad_norm": 0.16891949258278482, + "kl": 0.02500152587890625, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.0, + "reward": 1.7660715207457542, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3510 + }, + { + "completion_length": 256.5044746398926, + "epoch": 0.588792489207427, + "grad_norm": 0.16682964616175375, + "kl": 0.03672027587890625, + "learning_rate": 4.3358024691358024e-07, + "loss": 0.0, + "reward": 1.7732143551111221, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776785865426064, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3512 + }, + { + "completion_length": 262.71429538726807, + "epoch": 0.5891277924472945, + "grad_norm": 0.22590571064361892, + "kl": 0.036014556884765625, + "learning_rate": 4.338271604938271e-07, + "loss": 0.0, + "reward": 1.7714286297559738, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7803571689873934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3514 + }, + { + "completion_length": 262.2366199493408, + "epoch": 0.589463095687162, + "grad_norm": 0.2227653303159639, + "kl": 0.05431365966796875, + "learning_rate": 4.34074074074074e-07, + "loss": 0.0001, + "reward": 1.716071493923664, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7294643130153418, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3516 + }, + { + "completion_length": 263.89287090301514, + "epoch": 0.5897983989270297, + "grad_norm": 0.22453191240129117, + "kl": 0.161773681640625, + "learning_rate": 4.3432098765432096e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 3518 + }, + { + "completion_length": 261.9509029388428, + "epoch": 0.5901337021668972, + "grad_norm": 0.25398918227897027, + "kl": 0.0648956298828125, + "learning_rate": 4.345679012345679e-07, + "loss": 0.0001, + "reward": 1.7357143834233284, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 3520 + }, + { + "completion_length": 262.4598340988159, + "epoch": 0.5904690054067647, + "grad_norm": 0.2944405516763985, + "kl": 0.025665283203125, + "learning_rate": 4.348148148148148e-07, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7732143104076385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3522 + }, + { + "completion_length": 271.6696586608887, + "epoch": 0.5908043086466322, + "grad_norm": 0.24076922620858562, + "kl": 0.0668487548828125, + "learning_rate": 4.350617283950617e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3524 + }, + { + "completion_length": 262.7812604904175, + "epoch": 0.5911396118864999, + "grad_norm": 0.3099566866844783, + "kl": 0.07646942138671875, + "learning_rate": 4.3530864197530867e-07, + "loss": 0.0001, + "reward": 1.746428668498993, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3526 + }, + { + "completion_length": 258.2232255935669, + "epoch": 0.5914749151263674, + "grad_norm": 0.17710092998351407, + "kl": 0.031280517578125, + "learning_rate": 4.355555555555555e-07, + "loss": 0.0, + "reward": 1.8071429207921028, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8160714544355869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3528 + }, + { + "completion_length": 249.54465579986572, + "epoch": 0.5918102183662349, + "grad_norm": 0.2899026154248438, + "kl": 0.0284576416015625, + "learning_rate": 4.3580246913580244e-07, + "loss": 0.0, + "reward": 1.7875000685453415, + "reward_std": 0.06818529777228832, + "rewards/equation_reward_func": 0.8008928876370192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3530 + }, + { + "completion_length": 255.883939743042, + "epoch": 0.5921455216061026, + "grad_norm": 0.15538153919110007, + "kl": 0.02648162841796875, + "learning_rate": 4.3604938271604933e-07, + "loss": 0.0, + "reward": 1.800000049173832, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 3532 + }, + { + "completion_length": 263.08929920196533, + "epoch": 0.5924808248459701, + "grad_norm": 0.3173318092500211, + "kl": 0.06317138671875, + "learning_rate": 4.362962962962963e-07, + "loss": 0.0001, + "reward": 1.7875000685453415, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3534 + }, + { + "completion_length": 252.59376049041748, + "epoch": 0.5928161280858376, + "grad_norm": 0.345473218389504, + "kl": 0.054718017578125, + "learning_rate": 4.365432098765432e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3536 + }, + { + "completion_length": 266.7723331451416, + "epoch": 0.5931514313257051, + "grad_norm": 0.1637067972754746, + "kl": 0.04750823974609375, + "learning_rate": 4.367901234567901e-07, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.7446428872644901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3538 + }, + { + "completion_length": 259.96875858306885, + "epoch": 0.5934867345655728, + "grad_norm": 0.30374336459795204, + "kl": 0.037841796875, + "learning_rate": 4.3703703703703704e-07, + "loss": 0.0, + "reward": 1.762500062584877, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3540 + }, + { + "completion_length": 257.20537090301514, + "epoch": 0.5938220378054403, + "grad_norm": 0.18775558730289618, + "kl": 0.18558502197265625, + "learning_rate": 4.372839506172839e-07, + "loss": 0.0002, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3542 + }, + { + "completion_length": 267.7589416503906, + "epoch": 0.5941573410453078, + "grad_norm": 0.27798905796014955, + "kl": 0.0243682861328125, + "learning_rate": 4.375308641975308e-07, + "loss": 0.0, + "reward": 1.7464286237955093, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7553571723401546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3544 + }, + { + "completion_length": 254.3839406967163, + "epoch": 0.5944926442851755, + "grad_norm": 0.16482240300876788, + "kl": 0.2323150634765625, + "learning_rate": 4.3777777777777776e-07, + "loss": 0.0002, + "reward": 1.7535714954137802, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7535714767873287, + "rewards/format_reward_func": 1.0, + "step": 3546 + }, + { + "completion_length": 269.70090103149414, + "epoch": 0.594827947525043, + "grad_norm": 0.21117970471010705, + "kl": 0.17474365234375, + "learning_rate": 4.3802469135802465e-07, + "loss": 0.0002, + "reward": 1.7589286267757416, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.772321455180645, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3548 + }, + { + "completion_length": 260.1517972946167, + "epoch": 0.5951632507649105, + "grad_norm": 0.2455256251100981, + "kl": 0.029510498046875, + "learning_rate": 4.382716049382716e-07, + "loss": 0.0, + "reward": 1.8000000715255737, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3550 + }, + { + "completion_length": 267.6428699493408, + "epoch": 0.595498554004778, + "grad_norm": 0.21551056799708354, + "kl": 0.03183746337890625, + "learning_rate": 4.3851851851851853e-07, + "loss": 0.0, + "reward": 1.7196429520845413, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3552 + }, + { + "completion_length": 258.040189743042, + "epoch": 0.5958338572446457, + "grad_norm": 0.1783330798048646, + "kl": 0.14756011962890625, + "learning_rate": 4.387654320987654e-07, + "loss": 0.0001, + "reward": 1.707142949104309, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7071428969502449, + "rewards/format_reward_func": 1.0, + "step": 3554 + }, + { + "completion_length": 249.8259038925171, + "epoch": 0.5961691604845132, + "grad_norm": 0.21122598255990901, + "kl": 0.02812957763671875, + "learning_rate": 4.3901234567901236e-07, + "loss": 0.0, + "reward": 1.773214340209961, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776786088943481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3556 + }, + { + "completion_length": 250.7991132736206, + "epoch": 0.5965044637243807, + "grad_norm": 0.1805206880065137, + "kl": 0.0556182861328125, + "learning_rate": 4.392592592592592e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 3558 + }, + { + "completion_length": 250.87054634094238, + "epoch": 0.5968397669642483, + "grad_norm": 0.7938875469714126, + "kl": 0.3453216552734375, + "learning_rate": 4.3950617283950614e-07, + "loss": 0.0003, + "reward": 1.7946429401636124, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.799107164144516, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3560 + }, + { + "completion_length": 260.9732265472412, + "epoch": 0.5971750702041159, + "grad_norm": 0.19210387107786725, + "kl": 0.04297637939453125, + "learning_rate": 4.397530864197531e-07, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286144822836, + "rewards/format_reward_func": 1.0, + "step": 3562 + }, + { + "completion_length": 245.0669755935669, + "epoch": 0.5975103734439834, + "grad_norm": 0.2801304059499941, + "kl": 0.0260467529296875, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 3564 + }, + { + "completion_length": 271.33037090301514, + "epoch": 0.5978456766838509, + "grad_norm": 0.11718352236281365, + "kl": 0.06449508666992188, + "learning_rate": 4.402469135802469e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7544643245637417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3566 + }, + { + "completion_length": 260.3348321914673, + "epoch": 0.5981809799237185, + "grad_norm": 0.3075517351831485, + "kl": 0.08880615234375, + "learning_rate": 4.404938271604938e-07, + "loss": 0.0001, + "reward": 1.687500074505806, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7098214663565159, + "rewards/format_reward_func": 0.977678582072258, + "step": 3568 + }, + { + "completion_length": 263.1026906967163, + "epoch": 0.5985162831635861, + "grad_norm": 0.267592381406727, + "kl": 0.07773208618164062, + "learning_rate": 4.4074074074074074e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143014669418, + "rewards/format_reward_func": 1.0, + "step": 3570 + }, + { + "completion_length": 259.83036708831787, + "epoch": 0.5988515864034536, + "grad_norm": 0.16486227865906747, + "kl": 0.0298004150390625, + "learning_rate": 4.4098765432098763e-07, + "loss": 0.0, + "reward": 1.7589286342263222, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3572 + }, + { + "completion_length": 263.4285831451416, + "epoch": 0.5991868896433212, + "grad_norm": 0.2593585175782869, + "kl": 0.02993011474609375, + "learning_rate": 4.412345679012345e-07, + "loss": 0.0, + "reward": 1.717857226729393, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3574 + }, + { + "completion_length": 253.36161994934082, + "epoch": 0.5995221928831888, + "grad_norm": 0.22218048407968422, + "kl": 0.029876708984375, + "learning_rate": 4.4148148148148146e-07, + "loss": 0.0, + "reward": 1.8071429133415222, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 3576 + }, + { + "completion_length": 255.6785831451416, + "epoch": 0.5998574961230563, + "grad_norm": 0.27713844671160104, + "kl": 0.02791595458984375, + "learning_rate": 4.417283950617284e-07, + "loss": 0.0, + "reward": 1.7214286550879478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 3578 + }, + { + "completion_length": 258.5223331451416, + "epoch": 0.6001927993629238, + "grad_norm": 0.2268908937876009, + "kl": 0.14574432373046875, + "learning_rate": 4.419753086419753e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7973214574158192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3580 + }, + { + "completion_length": 260.96876430511475, + "epoch": 0.6005281026027914, + "grad_norm": 0.008560615600363496, + "kl": 0.04407501220703125, + "learning_rate": 4.4222222222222223e-07, + "loss": 0.0, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 3582 + }, + { + "completion_length": 263.3794746398926, + "epoch": 0.600863405842659, + "grad_norm": 0.3125592782204781, + "kl": 0.039703369140625, + "learning_rate": 4.424691358024691e-07, + "loss": 0.0, + "reward": 1.7321429401636124, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3584 + }, + { + "completion_length": 265.0401945114136, + "epoch": 0.6011987090825265, + "grad_norm": 0.23566926170906846, + "kl": 0.064239501953125, + "learning_rate": 4.42716049382716e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3586 + }, + { + "completion_length": 260.8169765472412, + "epoch": 0.6015340123223941, + "grad_norm": 0.18104321652659505, + "kl": 0.06757354736328125, + "learning_rate": 4.4296296296296295e-07, + "loss": 0.0001, + "reward": 1.7142857685685158, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857518047094, + "rewards/format_reward_func": 1.0, + "step": 3588 + }, + { + "completion_length": 247.33929824829102, + "epoch": 0.6018693155622616, + "grad_norm": 0.2856698746219867, + "kl": 0.051727294921875, + "learning_rate": 4.4320987654320984e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 3590 + }, + { + "completion_length": 258.3259029388428, + "epoch": 0.6022046188021292, + "grad_norm": 0.4413945663867954, + "kl": 0.070465087890625, + "learning_rate": 4.434567901234568e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 3592 + }, + { + "completion_length": 260.0669746398926, + "epoch": 0.6025399220419967, + "grad_norm": 0.20788258491713757, + "kl": 0.076507568359375, + "learning_rate": 4.4370370370370367e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7642857320606709, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3594 + }, + { + "completion_length": 257.54019260406494, + "epoch": 0.6028752252818643, + "grad_norm": 0.35534726550307105, + "kl": 0.02875518798828125, + "learning_rate": 4.439506172839506e-07, + "loss": 0.0, + "reward": 1.7821429297327995, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 3596 + }, + { + "completion_length": 265.495548248291, + "epoch": 0.6032105285217318, + "grad_norm": 0.32475422464455267, + "kl": 0.25457000732421875, + "learning_rate": 4.4419753086419755e-07, + "loss": 0.0003, + "reward": 1.691071517765522, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.6955357398837805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3598 + }, + { + "completion_length": 256.9910821914673, + "epoch": 0.6035458317615994, + "grad_norm": 0.3155386853226163, + "kl": 0.033954620361328125, + "learning_rate": 4.444444444444444e-07, + "loss": 0.0, + "reward": 1.7339286506175995, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3600 + }, + { + "completion_length": 249.37501049041748, + "epoch": 0.603881135001467, + "grad_norm": 0.3366549119682878, + "kl": 0.15158843994140625, + "learning_rate": 4.446913580246913e-07, + "loss": 0.0002, + "reward": 1.8053571954369545, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214510828257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3602 + }, + { + "completion_length": 261.1160831451416, + "epoch": 0.6042164382413345, + "grad_norm": 0.2374163328003629, + "kl": 0.10363006591796875, + "learning_rate": 4.4493827160493827e-07, + "loss": 0.0001, + "reward": 1.7964286506175995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 3604 + }, + { + "completion_length": 265.6518039703369, + "epoch": 0.604551741481202, + "grad_norm": 0.24493017949672824, + "kl": 0.33367919921875, + "learning_rate": 4.4518518518518515e-07, + "loss": 0.0003, + "reward": 1.7375001013278961, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7508928775787354, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3606 + }, + { + "completion_length": 267.93304920196533, + "epoch": 0.6048870447210696, + "grad_norm": 0.4602830429080869, + "kl": 0.5742149353027344, + "learning_rate": 4.454320987654321e-07, + "loss": 0.0006, + "reward": 1.778571479022503, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7875000312924385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3608 + }, + { + "completion_length": 269.2678699493408, + "epoch": 0.6052223479609372, + "grad_norm": 0.3401300966547623, + "kl": 0.16431427001953125, + "learning_rate": 4.45679012345679e-07, + "loss": 0.0002, + "reward": 1.7714286372065544, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3610 + }, + { + "completion_length": 274.7009029388428, + "epoch": 0.6055576512008047, + "grad_norm": 0.32991585835717985, + "kl": 0.12230682373046875, + "learning_rate": 4.459259259259259e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7330357395112514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3612 + }, + { + "completion_length": 270.4866189956665, + "epoch": 0.6058929544406723, + "grad_norm": 0.255009936150431, + "kl": 0.03687286376953125, + "learning_rate": 4.461728395061728e-07, + "loss": 0.0, + "reward": 1.7625000551342964, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7758928798139095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3614 + }, + { + "completion_length": 275.08929920196533, + "epoch": 0.6062282576805398, + "grad_norm": 0.18435769219493214, + "kl": 0.14650726318359375, + "learning_rate": 4.464197530864197e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7589286118745804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3616 + }, + { + "completion_length": 275.9955472946167, + "epoch": 0.6065635609204074, + "grad_norm": 0.50317108046287, + "kl": 0.8645095825195312, + "learning_rate": 4.4666666666666664e-07, + "loss": 0.0009, + "reward": 1.7982143312692642, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.8116071671247482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3618 + }, + { + "completion_length": 265.30804538726807, + "epoch": 0.6068988641602749, + "grad_norm": 0.2825123505770023, + "kl": 0.03264617919921875, + "learning_rate": 4.4691358024691353e-07, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.755357176065445, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3620 + }, + { + "completion_length": 271.2857246398926, + "epoch": 0.6072341674001425, + "grad_norm": 0.18990244417672877, + "kl": 0.08856964111328125, + "learning_rate": 4.4716049382716047e-07, + "loss": 0.0001, + "reward": 1.7053572162985802, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7187500335276127, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3622 + }, + { + "completion_length": 255.42411994934082, + "epoch": 0.6075694706400101, + "grad_norm": 0.28196388269226436, + "kl": 0.16127777099609375, + "learning_rate": 4.474074074074074e-07, + "loss": 0.0002, + "reward": 1.8125000521540642, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8169643022119999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3624 + }, + { + "completion_length": 273.1428689956665, + "epoch": 0.6079047738798776, + "grad_norm": 0.4959674196608172, + "kl": 1.2173080444335938, + "learning_rate": 4.476543209876543e-07, + "loss": 0.0012, + "reward": 1.714285783469677, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.732142886146903, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3626 + }, + { + "completion_length": 264.4330472946167, + "epoch": 0.6082400771197451, + "grad_norm": 0.18397191856889522, + "kl": 0.22011566162109375, + "learning_rate": 4.479012345679012e-07, + "loss": 0.0002, + "reward": 1.7714286521077156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 3628 + }, + { + "completion_length": 265.06697845458984, + "epoch": 0.6085753803596127, + "grad_norm": 0.27923133250657345, + "kl": 0.48724365234375, + "learning_rate": 4.4814814814814813e-07, + "loss": 0.0005, + "reward": 1.7446429207921028, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7580357510596514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3630 + }, + { + "completion_length": 263.29912185668945, + "epoch": 0.6089106835994803, + "grad_norm": 0.2536893755418448, + "kl": 0.7797088623046875, + "learning_rate": 4.48395061728395e-07, + "loss": 0.0008, + "reward": 1.8160715252161026, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.820535734295845, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3632 + }, + { + "completion_length": 263.401798248291, + "epoch": 0.6092459868393478, + "grad_norm": 0.18328565704488098, + "kl": 0.48796844482421875, + "learning_rate": 4.4864197530864196e-07, + "loss": 0.0005, + "reward": 1.7196429297327995, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7241071872413158, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3634 + }, + { + "completion_length": 257.8973331451416, + "epoch": 0.6095812900792154, + "grad_norm": 0.4203365404663292, + "kl": 0.1136932373046875, + "learning_rate": 4.4888888888888885e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7732142955064774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3636 + }, + { + "completion_length": 261.6384038925171, + "epoch": 0.609916593319083, + "grad_norm": 0.27518745976881687, + "kl": 0.206268310546875, + "learning_rate": 4.491358024691358e-07, + "loss": 0.0002, + "reward": 1.6767858192324638, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.681250024586916, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3638 + }, + { + "completion_length": 258.3660840988159, + "epoch": 0.6102518965589505, + "grad_norm": 0.28078115457804403, + "kl": 0.970550537109375, + "learning_rate": 4.4938271604938273e-07, + "loss": 0.001, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 3640 + }, + { + "completion_length": 261.7500104904175, + "epoch": 0.610587199798818, + "grad_norm": 0.38733232869582146, + "kl": 1.2825469970703125, + "learning_rate": 4.496296296296296e-07, + "loss": 0.0013, + "reward": 1.7160715013742447, + "reward_std": 0.09848987404257059, + "rewards/equation_reward_func": 0.7383928783237934, + "rewards/format_reward_func": 0.977678582072258, + "step": 3642 + }, + { + "completion_length": 254.3928680419922, + "epoch": 0.6109225030386856, + "grad_norm": 0.22281246987673115, + "kl": 0.1315765380859375, + "learning_rate": 4.498765432098765e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 3644 + }, + { + "completion_length": 275.2098331451416, + "epoch": 0.6112578062785532, + "grad_norm": 0.3146735029654327, + "kl": 1.4842376708984375, + "learning_rate": 4.501234567901234e-07, + "loss": 0.0015, + "reward": 1.676785796880722, + "reward_std": 0.11364216171205044, + "rewards/equation_reward_func": 0.6991071626543999, + "rewards/format_reward_func": 0.977678582072258, + "step": 3646 + }, + { + "completion_length": 255.4241180419922, + "epoch": 0.6115931095184207, + "grad_norm": 0.2005355282523996, + "kl": 0.8999481201171875, + "learning_rate": 4.5037037037037034e-07, + "loss": 0.0009, + "reward": 1.7816964611411095, + "reward_std": 0.03598668519407511, + "rewards/equation_reward_func": 0.7937500290572643, + "rewards/format_reward_func": 0.9879464358091354, + "step": 3648 + }, + { + "completion_length": 274.8169755935669, + "epoch": 0.6119284127582882, + "grad_norm": 0.14003841077332135, + "kl": 2.5995941162109375, + "learning_rate": 4.506172839506173e-07, + "loss": 0.0026, + "reward": 1.7392857894301414, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143137603998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3650 + }, + { + "completion_length": 259.9642972946167, + "epoch": 0.6122637159981559, + "grad_norm": 0.4061581313184363, + "kl": 0.16374969482421875, + "learning_rate": 4.5086419753086417e-07, + "loss": 0.0002, + "reward": 1.7892857491970062, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7982143089175224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3652 + }, + { + "completion_length": 268.370548248291, + "epoch": 0.6125990192380234, + "grad_norm": 0.38859381055339315, + "kl": 1.5375289916992188, + "learning_rate": 4.511111111111111e-07, + "loss": 0.0015, + "reward": 1.7678572162985802, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3654 + }, + { + "completion_length": 253.7812623977661, + "epoch": 0.6129343224778909, + "grad_norm": 0.22522029722982645, + "kl": 0.46714019775390625, + "learning_rate": 4.5135802469135805e-07, + "loss": 0.0005, + "reward": 1.7160715013742447, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7294643335044384, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3656 + }, + { + "completion_length": 257.25001430511475, + "epoch": 0.6132696257177584, + "grad_norm": 0.32075938946960136, + "kl": 0.14533233642578125, + "learning_rate": 4.516049382716049e-07, + "loss": 0.0001, + "reward": 1.812500074505806, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.8258928768336773, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3658 + }, + { + "completion_length": 262.0535840988159, + "epoch": 0.6136049289576261, + "grad_norm": 0.16659359770005583, + "kl": 0.3695220947265625, + "learning_rate": 4.5185185185185183e-07, + "loss": 0.0004, + "reward": 1.744642935693264, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3660 + }, + { + "completion_length": 259.53126430511475, + "epoch": 0.6139402321974936, + "grad_norm": 0.3132566015456766, + "kl": 0.03826141357421875, + "learning_rate": 4.520987654320987e-07, + "loss": 0.0, + "reward": 1.7339286357164383, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7473214641213417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3662 + }, + { + "completion_length": 256.51787090301514, + "epoch": 0.6142755354373611, + "grad_norm": 0.2757417864081997, + "kl": 0.06600189208984375, + "learning_rate": 4.5234567901234566e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 3664 + }, + { + "completion_length": 245.51340198516846, + "epoch": 0.6146108386772288, + "grad_norm": 0.17796460260570873, + "kl": 0.2266845703125, + "learning_rate": 4.525925925925926e-07, + "loss": 0.0002, + "reward": 1.751785784959793, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3666 + }, + { + "completion_length": 258.04912281036377, + "epoch": 0.6149461419170963, + "grad_norm": 0.3287939137579827, + "kl": 0.10906219482421875, + "learning_rate": 4.528395061728395e-07, + "loss": 0.0001, + "reward": 1.758928656578064, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7633928898721933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3668 + }, + { + "completion_length": 248.98661994934082, + "epoch": 0.6152814451569638, + "grad_norm": 0.12255448020441631, + "kl": 0.041259765625, + "learning_rate": 4.5308641975308643e-07, + "loss": 0.0, + "reward": 1.7946429029107094, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3670 + }, + { + "completion_length": 260.6919746398926, + "epoch": 0.6156167483968313, + "grad_norm": 0.26965476202319116, + "kl": 0.07506561279296875, + "learning_rate": 4.5333333333333326e-07, + "loss": 0.0001, + "reward": 1.7392857670783997, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3672 + }, + { + "completion_length": 253.81697940826416, + "epoch": 0.615952051636699, + "grad_norm": 0.22782034526467285, + "kl": 0.09051513671875, + "learning_rate": 4.535802469135802e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3674 + }, + { + "completion_length": 254.91519165039062, + "epoch": 0.6162873548765665, + "grad_norm": 0.2151209777437395, + "kl": 0.23415374755859375, + "learning_rate": 4.5382716049382715e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.06060915347188711, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3676 + }, + { + "completion_length": 257.40626335144043, + "epoch": 0.616622658116434, + "grad_norm": 0.16959510394683283, + "kl": 0.04297637939453125, + "learning_rate": 4.5407407407407403e-07, + "loss": 0.0, + "reward": 1.7196429371833801, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3678 + }, + { + "completion_length": 264.46876335144043, + "epoch": 0.6169579613563017, + "grad_norm": 0.4565196614990558, + "kl": 0.9423294067382812, + "learning_rate": 4.54320987654321e-07, + "loss": 0.0009, + "reward": 1.6928572207689285, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7196428887546062, + "rewards/format_reward_func": 0.9732142984867096, + "step": 3680 + }, + { + "completion_length": 249.86161708831787, + "epoch": 0.6172932645961692, + "grad_norm": 0.20489827746347586, + "kl": 0.0494842529296875, + "learning_rate": 4.545679012345679e-07, + "loss": 0.0, + "reward": 1.728571504354477, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714522004128, + "rewards/format_reward_func": 1.0, + "step": 3682 + }, + { + "completion_length": 260.1294765472412, + "epoch": 0.6176285678360367, + "grad_norm": 0.8204189816885677, + "kl": 1.1546859741210938, + "learning_rate": 4.548148148148148e-07, + "loss": 0.0012, + "reward": 1.7500000596046448, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3684 + }, + { + "completion_length": 250.92858219146729, + "epoch": 0.6179638710759042, + "grad_norm": 0.3160806438632435, + "kl": 0.0600128173828125, + "learning_rate": 4.550617283950617e-07, + "loss": 0.0001, + "reward": 1.7160715088248253, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7294643092900515, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3686 + }, + { + "completion_length": 257.26786708831787, + "epoch": 0.6182991743157719, + "grad_norm": 0.36269127757830905, + "kl": 0.04349517822265625, + "learning_rate": 4.553086419753086e-07, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 3688 + }, + { + "completion_length": 253.61608219146729, + "epoch": 0.6186344775556394, + "grad_norm": 0.5216867037004947, + "kl": 1.238555908203125, + "learning_rate": 4.555555555555555e-07, + "loss": 0.0012, + "reward": 1.7321429252624512, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.741071468219161, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3690 + }, + { + "completion_length": 259.3839387893677, + "epoch": 0.6189697807955069, + "grad_norm": 0.8136101314294178, + "kl": 3.5810470581054688, + "learning_rate": 4.5580246913580246e-07, + "loss": 0.0036, + "reward": 1.7357143834233284, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3692 + }, + { + "completion_length": 260.73215675354004, + "epoch": 0.6193050840353745, + "grad_norm": 0.2764728454179333, + "kl": 0.43869781494140625, + "learning_rate": 4.5604938271604935e-07, + "loss": 0.0004, + "reward": 1.7339286506175995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928913623095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3694 + }, + { + "completion_length": 255.40625953674316, + "epoch": 0.6196403872752421, + "grad_norm": 0.27911070506951186, + "kl": 0.2676239013671875, + "learning_rate": 4.562962962962963e-07, + "loss": 0.0003, + "reward": 1.748214341700077, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3696 + }, + { + "completion_length": 243.78572368621826, + "epoch": 0.6199756905151096, + "grad_norm": 0.23063148507154832, + "kl": 0.04782867431640625, + "learning_rate": 4.5654320987654324e-07, + "loss": 0.0, + "reward": 1.7517857998609543, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3698 + }, + { + "completion_length": 249.63394260406494, + "epoch": 0.6203109937549771, + "grad_norm": 0.250609945184822, + "kl": 0.130401611328125, + "learning_rate": 4.5679012345679007e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428932249546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3700 + }, + { + "completion_length": 252.46876430511475, + "epoch": 0.6206462969948447, + "grad_norm": 0.3030883228478924, + "kl": 0.6184768676757812, + "learning_rate": 4.57037037037037e-07, + "loss": 0.0006, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 3702 + }, + { + "completion_length": 245.9821538925171, + "epoch": 0.6209816002347123, + "grad_norm": 0.22697660760153765, + "kl": 0.03972625732421875, + "learning_rate": 4.572839506172839e-07, + "loss": 0.0, + "reward": 1.7428572177886963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 3704 + }, + { + "completion_length": 246.85268878936768, + "epoch": 0.6213169034745798, + "grad_norm": 0.692975794353227, + "kl": 1.3213653564453125, + "learning_rate": 4.5753086419753084e-07, + "loss": 0.0013, + "reward": 1.7714286148548126, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7803571857511997, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3706 + }, + { + "completion_length": 246.72768878936768, + "epoch": 0.6216522067144474, + "grad_norm": 0.2097287152304815, + "kl": 0.0494232177734375, + "learning_rate": 4.577777777777778e-07, + "loss": 0.0, + "reward": 1.7160715162754059, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7205357514321804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3708 + }, + { + "completion_length": 252.20536708831787, + "epoch": 0.621987509954315, + "grad_norm": 0.3316795839811799, + "kl": 0.046783447265625, + "learning_rate": 4.5802469135802467e-07, + "loss": 0.0, + "reward": 1.6767858117818832, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.6812500283122063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3710 + }, + { + "completion_length": 247.56251049041748, + "epoch": 0.6223228131941825, + "grad_norm": 0.1709766239519439, + "kl": 0.240081787109375, + "learning_rate": 4.582716049382716e-07, + "loss": 0.0002, + "reward": 1.7843750640749931, + "reward_std": 0.02209708606824279, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 0.9986607171595097, + "step": 3712 + }, + { + "completion_length": 255.37947750091553, + "epoch": 0.62265811643405, + "grad_norm": 0.24275119545561563, + "kl": 0.20494842529296875, + "learning_rate": 4.5851851851851845e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7910714596509933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3714 + }, + { + "completion_length": 257.3348331451416, + "epoch": 0.6229934196739176, + "grad_norm": 0.1281208930031389, + "kl": 0.1625823974609375, + "learning_rate": 4.587654320987654e-07, + "loss": 0.0002, + "reward": 1.7625000551342964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669642996042967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3716 + }, + { + "completion_length": 257.07590103149414, + "epoch": 0.6233287229137852, + "grad_norm": 0.30500522924730594, + "kl": 0.0452117919921875, + "learning_rate": 4.5901234567901233e-07, + "loss": 0.0, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 3718 + }, + { + "completion_length": 263.6384048461914, + "epoch": 0.6236640261536527, + "grad_norm": 0.21559576261696073, + "kl": 0.29811859130859375, + "learning_rate": 4.592592592592592e-07, + "loss": 0.0003, + "reward": 1.7428572177886963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 3720 + }, + { + "completion_length": 250.7053689956665, + "epoch": 0.6239993293935203, + "grad_norm": 0.2677336786068505, + "kl": 0.135406494140625, + "learning_rate": 4.5950617283950616e-07, + "loss": 0.0001, + "reward": 1.7125000581145287, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7258928865194321, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3722 + }, + { + "completion_length": 254.31697750091553, + "epoch": 0.6243346326333878, + "grad_norm": 0.20902682284429114, + "kl": 0.03919219970703125, + "learning_rate": 4.597530864197531e-07, + "loss": 0.0, + "reward": 1.7375000789761543, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7508928887546062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3724 + }, + { + "completion_length": 257.21876335144043, + "epoch": 0.6246699358732554, + "grad_norm": 0.31779276105932713, + "kl": 0.034637451171875, + "learning_rate": 4.6e-07, + "loss": 0.0, + "reward": 1.701785795390606, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7062500342726707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3726 + }, + { + "completion_length": 262.53572368621826, + "epoch": 0.6250052391131229, + "grad_norm": 0.2653994448875676, + "kl": 0.065643310546875, + "learning_rate": 4.6024691358024693e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7848214693367481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3728 + }, + { + "completion_length": 254.70536994934082, + "epoch": 0.6253405423529905, + "grad_norm": 0.19647143960852814, + "kl": 0.039215087890625, + "learning_rate": 4.6049382716049377e-07, + "loss": 0.0, + "reward": 1.7392858117818832, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 3730 + }, + { + "completion_length": 252.8794755935669, + "epoch": 0.625675845592858, + "grad_norm": 0.2663136311118988, + "kl": 0.04831695556640625, + "learning_rate": 4.607407407407407e-07, + "loss": 0.0, + "reward": 1.791071467101574, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357320606709, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3732 + }, + { + "completion_length": 261.4866180419922, + "epoch": 0.6260111488327256, + "grad_norm": 0.5976991428641791, + "kl": 0.09447479248046875, + "learning_rate": 4.6098765432098765e-07, + "loss": 0.0001, + "reward": 1.6808036416769028, + "reward_std": 0.05745242489501834, + "rewards/equation_reward_func": 0.6910714711993933, + "rewards/format_reward_func": 0.9897321499884129, + "step": 3734 + }, + { + "completion_length": 249.4062623977661, + "epoch": 0.6263464520725931, + "grad_norm": 0.20909877453812703, + "kl": 0.03856658935546875, + "learning_rate": 4.6123456790123454e-07, + "loss": 0.0, + "reward": 1.7517857551574707, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500406056643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3736 + }, + { + "completion_length": 258.77679920196533, + "epoch": 0.6266817553124607, + "grad_norm": 0.20655282498276678, + "kl": 0.04911041259765625, + "learning_rate": 4.614814814814815e-07, + "loss": 0.0, + "reward": 1.7875000685453415, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3738 + }, + { + "completion_length": 247.31697463989258, + "epoch": 0.6270170585523283, + "grad_norm": 0.2317755787603404, + "kl": 0.07225799560546875, + "learning_rate": 4.6172839506172837e-07, + "loss": 0.0001, + "reward": 1.7767857685685158, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3740 + }, + { + "completion_length": 241.24554824829102, + "epoch": 0.6273523617921958, + "grad_norm": 0.24892404972568108, + "kl": 0.21956634521484375, + "learning_rate": 4.619753086419753e-07, + "loss": 0.0002, + "reward": 1.750000074505806, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7589286044239998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3742 + }, + { + "completion_length": 250.0357265472412, + "epoch": 0.6276876650320634, + "grad_norm": 0.2655812732823818, + "kl": 0.07500457763671875, + "learning_rate": 4.622222222222222e-07, + "loss": 0.0001, + "reward": 1.7517857626080513, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3744 + }, + { + "completion_length": 255.49108505249023, + "epoch": 0.6280229682719309, + "grad_norm": 0.37752015835330077, + "kl": 0.08672332763671875, + "learning_rate": 4.624691358024691e-07, + "loss": 0.0001, + "reward": 1.7910714894533157, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3746 + }, + { + "completion_length": 242.4598331451416, + "epoch": 0.6283582715117985, + "grad_norm": 0.22297982995630966, + "kl": 0.07720947265625, + "learning_rate": 4.62716049382716e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071574389935, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3748 + }, + { + "completion_length": 247.51340579986572, + "epoch": 0.628693574751666, + "grad_norm": 0.2722172443493719, + "kl": 0.3848876953125, + "learning_rate": 4.6296296296296297e-07, + "loss": 0.0004, + "reward": 1.7553572207689285, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7687500268220901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3750 + }, + { + "completion_length": 244.4509048461914, + "epoch": 0.6290288779915336, + "grad_norm": 0.23104650402682889, + "kl": 0.04361724853515625, + "learning_rate": 4.6320987654320986e-07, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3752 + }, + { + "completion_length": 248.19197845458984, + "epoch": 0.6293641812314011, + "grad_norm": 0.2429922187286504, + "kl": 0.41692352294921875, + "learning_rate": 4.634567901234568e-07, + "loss": 0.0004, + "reward": 1.7375000566244125, + "reward_std": 0.10859139543026686, + "rewards/equation_reward_func": 0.7508928887546062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3754 + }, + { + "completion_length": 243.48661994934082, + "epoch": 0.6296994844712687, + "grad_norm": 0.5067240377764414, + "kl": 1.2078857421875, + "learning_rate": 4.637037037037037e-07, + "loss": 0.0012, + "reward": 1.8008929044008255, + "reward_std": 0.049244935158640146, + "rewards/equation_reward_func": 0.8026785850524902, + "rewards/format_reward_func": 0.9982142895460129, + "step": 3756 + }, + { + "completion_length": 254.66965770721436, + "epoch": 0.6300347877111363, + "grad_norm": 0.20975011767382296, + "kl": 0.07564544677734375, + "learning_rate": 4.639506172839506e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7660714611411095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3758 + }, + { + "completion_length": 262.8035840988159, + "epoch": 0.6303700909510038, + "grad_norm": 0.9492404850582921, + "kl": 3.7341232299804688, + "learning_rate": 4.641975308641975e-07, + "loss": 0.0037, + "reward": 1.7589286118745804, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7812500186264515, + "rewards/format_reward_func": 0.977678582072258, + "step": 3760 + }, + { + "completion_length": 273.1250123977661, + "epoch": 0.6307053941908713, + "grad_norm": 0.4768543484406165, + "kl": 0.6936798095703125, + "learning_rate": 4.644444444444444e-07, + "loss": 0.0007, + "reward": 1.6446429416537285, + "reward_std": 0.13889597728848457, + "rewards/equation_reward_func": 0.6937500238418579, + "rewards/format_reward_func": 0.9508928805589676, + "step": 3762 + }, + { + "completion_length": 253.3169765472412, + "epoch": 0.6310406974307389, + "grad_norm": 0.545748958630044, + "kl": 0.9039535522460938, + "learning_rate": 4.6469135802469134e-07, + "loss": 0.0009, + "reward": 1.6625000536441803, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7026786003261805, + "rewards/format_reward_func": 0.9598214477300644, + "step": 3764 + }, + { + "completion_length": 262.6830463409424, + "epoch": 0.6313760006706065, + "grad_norm": 0.39882267008820893, + "kl": 0.4140472412109375, + "learning_rate": 4.6493827160493823e-07, + "loss": 0.0004, + "reward": 1.708928644657135, + "reward_std": 0.12879444938153028, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9598214477300644, + "step": 3766 + }, + { + "completion_length": 275.0089416503906, + "epoch": 0.631711303910474, + "grad_norm": 0.4620834014351106, + "kl": 0.78497314453125, + "learning_rate": 4.651851851851852e-07, + "loss": 0.0008, + "reward": 1.6575893387198448, + "reward_std": 0.20139916171319783, + "rewards/equation_reward_func": 0.7080357428640127, + "rewards/format_reward_func": 0.9495535977184772, + "step": 3768 + }, + { + "completion_length": 261.3437623977661, + "epoch": 0.6320466071503416, + "grad_norm": 0.40476944953139327, + "kl": 2.376129150390625, + "learning_rate": 4.654320987654321e-07, + "loss": 0.0024, + "reward": 1.6946428939700127, + "reward_std": 0.10859139915555716, + "rewards/equation_reward_func": 0.734821442514658, + "rewards/format_reward_func": 0.9598214477300644, + "step": 3770 + }, + { + "completion_length": 248.98215579986572, + "epoch": 0.6323819103902092, + "grad_norm": 0.18001335211723607, + "kl": 1.412933349609375, + "learning_rate": 4.6567901234567895e-07, + "loss": 0.0014, + "reward": 1.7330357730388641, + "reward_std": 0.08460027631372213, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9803571552038193, + "step": 3772 + }, + { + "completion_length": 263.4330463409424, + "epoch": 0.6327172136300767, + "grad_norm": 0.3190957941370173, + "kl": 0.8990097045898438, + "learning_rate": 4.659259259259259e-07, + "loss": 0.0009, + "reward": 1.68392863124609, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.706250037997961, + "rewards/format_reward_func": 0.977678582072258, + "step": 3774 + }, + { + "completion_length": 254.8794755935669, + "epoch": 0.6330525168699442, + "grad_norm": 0.31257213744887996, + "kl": 0.89825439453125, + "learning_rate": 4.6617283950617283e-07, + "loss": 0.0009, + "reward": 1.7276786416769028, + "reward_std": 0.07197337062098086, + "rewards/equation_reward_func": 0.7437500357627869, + "rewards/format_reward_func": 0.9839285798370838, + "step": 3776 + }, + { + "completion_length": 258.2276916503906, + "epoch": 0.6333878201098118, + "grad_norm": 0.7055444014800766, + "kl": 8.225692749023438, + "learning_rate": 4.664197530864197e-07, + "loss": 0.0082, + "reward": 1.6964286342263222, + "reward_std": 0.10606601648032665, + "rewards/equation_reward_func": 0.7232143171131611, + "rewards/format_reward_func": 0.9732142984867096, + "step": 3778 + }, + { + "completion_length": 265.3482275009155, + "epoch": 0.6337231233496794, + "grad_norm": 0.9790034779343154, + "kl": 4.100105285644531, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.0041, + "reward": 1.657142959535122, + "reward_std": 0.1616244027391076, + "rewards/equation_reward_func": 0.6928571723401546, + "rewards/format_reward_func": 0.9642857313156128, + "step": 3780 + }, + { + "completion_length": 256.03572845458984, + "epoch": 0.6340584265895469, + "grad_norm": 0.29187955878990074, + "kl": 0.35442352294921875, + "learning_rate": 4.6691358024691355e-07, + "loss": 0.0004, + "reward": 1.6660715118050575, + "reward_std": 0.10859139915555716, + "rewards/equation_reward_func": 0.6883928924798965, + "rewards/format_reward_func": 0.977678582072258, + "step": 3782 + }, + { + "completion_length": 252.56697750091553, + "epoch": 0.6343937298294144, + "grad_norm": 0.41425872484802523, + "kl": 0.2930908203125, + "learning_rate": 4.671604938271605e-07, + "loss": 0.0003, + "reward": 1.7535714730620384, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7625000216066837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3784 + }, + { + "completion_length": 244.29018878936768, + "epoch": 0.6347290330692821, + "grad_norm": 0.4734383263821688, + "kl": 0.1266326904296875, + "learning_rate": 4.674074074074074e-07, + "loss": 0.0001, + "reward": 1.6875000819563866, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6919643208384514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3786 + }, + { + "completion_length": 255.27679634094238, + "epoch": 0.6350643363091496, + "grad_norm": 0.5475088235706634, + "kl": 0.07224273681640625, + "learning_rate": 4.6765432098765427e-07, + "loss": 0.0001, + "reward": 1.657142959535122, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.6741071790456772, + "rewards/format_reward_func": 0.9830357283353806, + "step": 3788 + }, + { + "completion_length": 241.01340293884277, + "epoch": 0.6353996395490171, + "grad_norm": 0.20196700802888495, + "kl": 0.046722412109375, + "learning_rate": 4.679012345679012e-07, + "loss": 0.0, + "reward": 1.7357143759727478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 3790 + }, + { + "completion_length": 260.00893783569336, + "epoch": 0.6357349427888846, + "grad_norm": 0.6002225994492422, + "kl": 0.1472930908203125, + "learning_rate": 4.681481481481481e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3792 + }, + { + "completion_length": 253.6026906967163, + "epoch": 0.6360702460287523, + "grad_norm": 0.23931617298852403, + "kl": 0.0593109130859375, + "learning_rate": 4.6839506172839504e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3794 + }, + { + "completion_length": 250.92411708831787, + "epoch": 0.6364055492686198, + "grad_norm": 0.2321628612327699, + "kl": 0.057647705078125, + "learning_rate": 4.68641975308642e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7303571663796902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3796 + }, + { + "completion_length": 253.66965103149414, + "epoch": 0.6367408525084873, + "grad_norm": 0.34411481195861043, + "kl": 0.0452880859375, + "learning_rate": 4.6888888888888887e-07, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7339286170899868, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3798 + }, + { + "completion_length": 249.4017972946167, + "epoch": 0.637076155748355, + "grad_norm": 0.3156697298150423, + "kl": 0.0444183349609375, + "learning_rate": 4.6913580246913576e-07, + "loss": 0.0, + "reward": 1.8125000596046448, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3800 + }, + { + "completion_length": 251.35268688201904, + "epoch": 0.6374114589882225, + "grad_norm": 0.39588853568629256, + "kl": 0.0497589111328125, + "learning_rate": 4.693827160493827e-07, + "loss": 0.0, + "reward": 1.7366072237491608, + "reward_std": 0.06944798771291971, + "rewards/equation_reward_func": 0.7473214641213417, + "rewards/format_reward_func": 0.9892857223749161, + "step": 3802 + }, + { + "completion_length": 269.97769260406494, + "epoch": 0.63774676222809, + "grad_norm": 0.2850366769440957, + "kl": 0.05326080322265625, + "learning_rate": 4.696296296296296e-07, + "loss": 0.0001, + "reward": 1.692857250571251, + "reward_std": 0.11111677810549736, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3804 + }, + { + "completion_length": 244.63840293884277, + "epoch": 0.6380820654679575, + "grad_norm": 0.27088312156428124, + "kl": 0.0478973388671875, + "learning_rate": 4.6987654320987653e-07, + "loss": 0.0, + "reward": 1.7982143461704254, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3806 + }, + { + "completion_length": 255.5223331451416, + "epoch": 0.6384173687078252, + "grad_norm": 0.15589935664162494, + "kl": 0.05022430419921875, + "learning_rate": 4.701234567901234e-07, + "loss": 0.0001, + "reward": 1.6910715103149414, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.6955357454717159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3808 + }, + { + "completion_length": 260.58483123779297, + "epoch": 0.6387526719476927, + "grad_norm": 0.46828900118421973, + "kl": 0.0674285888671875, + "learning_rate": 4.7037037037037036e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7991071566939354, + "rewards/format_reward_func": 0.977678582072258, + "step": 3810 + }, + { + "completion_length": 243.7053689956665, + "epoch": 0.6390879751875602, + "grad_norm": 0.1797720332442718, + "kl": 0.05417633056640625, + "learning_rate": 4.706172839506173e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 3812 + }, + { + "completion_length": 252.46429634094238, + "epoch": 0.6394232784274279, + "grad_norm": 1.0245365579568664, + "kl": 0.049041748046875, + "learning_rate": 4.7086419753086414e-07, + "loss": 0.0, + "reward": 1.7839286252856255, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928939700127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3814 + }, + { + "completion_length": 248.0803680419922, + "epoch": 0.6397585816672954, + "grad_norm": 0.1690612877408516, + "kl": 0.04561614990234375, + "learning_rate": 4.711111111111111e-07, + "loss": 0.0, + "reward": 1.8089286237955093, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3816 + }, + { + "completion_length": 242.28125953674316, + "epoch": 0.6400938849071629, + "grad_norm": 0.1799869429711767, + "kl": 0.04736328125, + "learning_rate": 4.7135802469135797e-07, + "loss": 0.0, + "reward": 1.8000000566244125, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8089286014437675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3818 + }, + { + "completion_length": 247.89286994934082, + "epoch": 0.6404291881470304, + "grad_norm": 0.2382277237406641, + "kl": 0.04730224609375, + "learning_rate": 4.716049382716049e-07, + "loss": 0.0, + "reward": 1.7089286744594574, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7133928835391998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3820 + }, + { + "completion_length": 253.94643878936768, + "epoch": 0.6407644913868981, + "grad_norm": 0.28596620542102036, + "kl": 0.0505523681640625, + "learning_rate": 4.7185185185185185e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3822 + }, + { + "completion_length": 247.9732255935669, + "epoch": 0.6410997946267656, + "grad_norm": 0.11720032512746174, + "kl": 0.05326080322265625, + "learning_rate": 4.7209876543209874e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7580357398837805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3824 + }, + { + "completion_length": 263.95090675354004, + "epoch": 0.6414350978666331, + "grad_norm": 0.17071112444417788, + "kl": 0.06207275390625, + "learning_rate": 4.723456790123457e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7437500283122063, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3826 + }, + { + "completion_length": 259.48661613464355, + "epoch": 0.6417704011065007, + "grad_norm": 0.2601322695367908, + "kl": 0.06282806396484375, + "learning_rate": 4.725925925925926e-07, + "loss": 0.0001, + "reward": 1.7633929252624512, + "reward_std": 0.10227794293314219, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9750000089406967, + "step": 3828 + }, + { + "completion_length": 258.39733600616455, + "epoch": 0.6421057043463683, + "grad_norm": 0.22779481358853262, + "kl": 0.04955291748046875, + "learning_rate": 4.7283950617283945e-07, + "loss": 0.0, + "reward": 1.7535714879631996, + "reward_std": 0.07576144207268953, + "rewards/equation_reward_func": 0.771428594365716, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3830 + }, + { + "completion_length": 250.21876430511475, + "epoch": 0.6424410075862358, + "grad_norm": 0.43527318397332027, + "kl": 0.05060577392578125, + "learning_rate": 4.730864197530864e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3832 + }, + { + "completion_length": 262.4598340988159, + "epoch": 0.6427763108261033, + "grad_norm": 0.21911360098756538, + "kl": 0.06902313232421875, + "learning_rate": 4.733333333333333e-07, + "loss": 0.0001, + "reward": 1.6392858177423477, + "reward_std": 0.09596449136734009, + "rewards/equation_reward_func": 0.6660714633762836, + "rewards/format_reward_func": 0.9732142984867096, + "step": 3834 + }, + { + "completion_length": 273.68304920196533, + "epoch": 0.643111614065971, + "grad_norm": 1.615608755048004, + "kl": 0.154632568359375, + "learning_rate": 4.735802469135802e-07, + "loss": 0.0002, + "reward": 1.6790179163217545, + "reward_std": 0.12058696104213595, + "rewards/equation_reward_func": 0.7294643055647612, + "rewards/format_reward_func": 0.949553593993187, + "step": 3836 + }, + { + "completion_length": 264.53572273254395, + "epoch": 0.6434469173058385, + "grad_norm": 0.2778056266442193, + "kl": 0.07929229736328125, + "learning_rate": 4.7382716049382717e-07, + "loss": 0.0001, + "reward": 1.680357240140438, + "reward_std": 0.13384521193802357, + "rewards/equation_reward_func": 0.7294643186032772, + "rewards/format_reward_func": 0.9508928768336773, + "step": 3838 + }, + { + "completion_length": 267.4062623977661, + "epoch": 0.643782220545706, + "grad_norm": 0.18156955523379578, + "kl": 0.06781005859375, + "learning_rate": 4.7407407407407405e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 0.9642857275903225, + "step": 3840 + }, + { + "completion_length": 260.8839406967163, + "epoch": 0.6441175237855736, + "grad_norm": 0.31977780335247097, + "kl": 0.153656005859375, + "learning_rate": 4.74320987654321e-07, + "loss": 0.0002, + "reward": 1.7285714894533157, + "reward_std": 0.07071068044751883, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3842 + }, + { + "completion_length": 261.85269355773926, + "epoch": 0.6444528270254412, + "grad_norm": 0.3417284451794752, + "kl": 0.09317779541015625, + "learning_rate": 4.745679012345679e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 3844 + }, + { + "completion_length": 246.83036994934082, + "epoch": 0.6447881302653087, + "grad_norm": 0.17477561853506282, + "kl": 0.06887054443359375, + "learning_rate": 4.7481481481481477e-07, + "loss": 0.0001, + "reward": 1.7625000327825546, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7758928872644901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 3846 + }, + { + "completion_length": 255.99554634094238, + "epoch": 0.6451234335051762, + "grad_norm": 0.44253711066314283, + "kl": 0.1705474853515625, + "learning_rate": 4.750617283950617e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428794413805, + "rewards/format_reward_func": 1.0, + "step": 3848 + }, + { + "completion_length": 248.18750953674316, + "epoch": 0.6454587367450438, + "grad_norm": 0.2387987965129388, + "kl": 0.067352294921875, + "learning_rate": 4.753086419753086e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7732143215835094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3850 + }, + { + "completion_length": 251.83483409881592, + "epoch": 0.6457940399849114, + "grad_norm": 0.18727904381625438, + "kl": 0.0418701171875, + "learning_rate": 4.7555555555555554e-07, + "loss": 0.0, + "reward": 1.7571429312229156, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3852 + }, + { + "completion_length": 253.39733219146729, + "epoch": 0.6461293432247789, + "grad_norm": 0.18840059404995516, + "kl": 0.23563385009765625, + "learning_rate": 4.758024691358025e-07, + "loss": 0.0002, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428973227739, + "rewards/format_reward_func": 1.0, + "step": 3854 + }, + { + "completion_length": 259.3571557998657, + "epoch": 0.6464646464646465, + "grad_norm": 0.24891815306132903, + "kl": 0.1553192138671875, + "learning_rate": 4.7604938271604937e-07, + "loss": 0.0002, + "reward": 1.7517857551574707, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500536441803, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3856 + }, + { + "completion_length": 249.96875953674316, + "epoch": 0.646799949704514, + "grad_norm": 0.1939251343094505, + "kl": 0.10715484619140625, + "learning_rate": 4.7629629629629626e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.73214291036129, + "rewards/format_reward_func": 1.0, + "step": 3858 + }, + { + "completion_length": 264.3616189956665, + "epoch": 0.6471352529443816, + "grad_norm": 0.31045812888398727, + "kl": 0.240264892578125, + "learning_rate": 4.7654320987654315e-07, + "loss": 0.0002, + "reward": 1.7089286521077156, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7133929040282965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3860 + }, + { + "completion_length": 248.1741189956665, + "epoch": 0.6474705561842491, + "grad_norm": 0.29967271897328407, + "kl": 0.142913818359375, + "learning_rate": 4.767901234567901e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3862 + }, + { + "completion_length": 244.29465293884277, + "epoch": 0.6478058594241167, + "grad_norm": 0.34126496872835815, + "kl": 0.0392608642578125, + "learning_rate": 4.77037037037037e-07, + "loss": 0.0, + "reward": 1.7178572192788124, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7178571820259094, + "rewards/format_reward_func": 1.0, + "step": 3864 + }, + { + "completion_length": 253.07144165039062, + "epoch": 0.6481411626639842, + "grad_norm": 5.56472514014859, + "kl": 5.6567535400390625, + "learning_rate": 4.772839506172839e-07, + "loss": 0.0057, + "reward": 1.775000050663948, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.783928606659174, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3866 + }, + { + "completion_length": 260.7812614440918, + "epoch": 0.6484764659038518, + "grad_norm": 0.2560663927017658, + "kl": 0.22957611083984375, + "learning_rate": 4.775308641975309e-07, + "loss": 0.0002, + "reward": 1.7428571954369545, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3868 + }, + { + "completion_length": 254.96876335144043, + "epoch": 0.6488117691437193, + "grad_norm": 0.21644056151909313, + "kl": 0.15191650390625, + "learning_rate": 4.777777777777778e-07, + "loss": 0.0002, + "reward": 1.764285795390606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 3870 + }, + { + "completion_length": 254.66072463989258, + "epoch": 0.6491470723835869, + "grad_norm": 0.346480276669036, + "kl": 0.04920196533203125, + "learning_rate": 4.780246913580246e-07, + "loss": 0.0, + "reward": 1.748214341700077, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3872 + }, + { + "completion_length": 267.8482265472412, + "epoch": 0.6494823756234545, + "grad_norm": 0.29899998204476824, + "kl": 0.03923797607421875, + "learning_rate": 4.782716049382716e-07, + "loss": 0.0, + "reward": 1.7321429550647736, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 3874 + }, + { + "completion_length": 258.6026906967163, + "epoch": 0.649817678863322, + "grad_norm": 0.08752132121753611, + "kl": 0.0488739013671875, + "learning_rate": 4.785185185185185e-07, + "loss": 0.0, + "reward": 1.7267858013510704, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500216066837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3876 + }, + { + "completion_length": 261.4687662124634, + "epoch": 0.6501529821031896, + "grad_norm": 0.32540356794934916, + "kl": 0.0558624267578125, + "learning_rate": 4.787654320987654e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7517857421189547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3878 + }, + { + "completion_length": 239.008939743042, + "epoch": 0.6504882853430571, + "grad_norm": 0.23190866512432148, + "kl": 0.072418212890625, + "learning_rate": 4.790123456790123e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 3880 + }, + { + "completion_length": 254.6875123977661, + "epoch": 0.6508235885829247, + "grad_norm": 0.6270072702489933, + "kl": 0.17505645751953125, + "learning_rate": 4.792592592592592e-07, + "loss": 0.0002, + "reward": 1.726785808801651, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3882 + }, + { + "completion_length": 260.0134048461914, + "epoch": 0.6511588918227922, + "grad_norm": 0.23875093940047093, + "kl": 0.08642578125, + "learning_rate": 4.795061728395062e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7553571797907352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3884 + }, + { + "completion_length": 245.8928689956665, + "epoch": 0.6514941950626598, + "grad_norm": 0.15408522982548276, + "kl": 0.04544830322265625, + "learning_rate": 4.79753086419753e-07, + "loss": 0.0, + "reward": 1.7160715237259865, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3886 + }, + { + "completion_length": 260.94644355773926, + "epoch": 0.6518294983025273, + "grad_norm": 0.1560889557012829, + "kl": 0.0451812744140625, + "learning_rate": 4.8e-07, + "loss": 0.0, + "reward": 1.7142857760190964, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857443541288, + "rewards/format_reward_func": 1.0, + "step": 3888 + }, + { + "completion_length": 250.03125953674316, + "epoch": 0.6521648015423949, + "grad_norm": 0.1550896631696827, + "kl": 0.04175567626953125, + "learning_rate": 4.802469135802469e-07, + "loss": 0.0, + "reward": 1.74821437895298, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3890 + }, + { + "completion_length": 244.3660831451416, + "epoch": 0.6525001047822625, + "grad_norm": 0.3224570837771038, + "kl": 0.05951690673828125, + "learning_rate": 4.804938271604938e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 3892 + }, + { + "completion_length": 233.0625114440918, + "epoch": 0.65283540802213, + "grad_norm": 0.23181470359460926, + "kl": 0.04352569580078125, + "learning_rate": 4.807407407407407e-07, + "loss": 0.0, + "reward": 1.7535714879631996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714488476515, + "rewards/format_reward_func": 1.0, + "step": 3894 + }, + { + "completion_length": 247.50447273254395, + "epoch": 0.6531707112619975, + "grad_norm": 0.4284565414178115, + "kl": 0.0639801025390625, + "learning_rate": 4.809876543209876e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 3896 + }, + { + "completion_length": 254.9419755935669, + "epoch": 0.6535060145018651, + "grad_norm": 0.2229631953694362, + "kl": 0.09334564208984375, + "learning_rate": 4.812345679012346e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.728571455925703, + "rewards/format_reward_func": 1.0, + "step": 3898 + }, + { + "completion_length": 244.6071548461914, + "epoch": 0.6538413177417327, + "grad_norm": 0.1938204139192448, + "kl": 0.06211090087890625, + "learning_rate": 4.814814814814814e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7383928801864386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3900 + }, + { + "completion_length": 257.8392963409424, + "epoch": 0.6541766209816002, + "grad_norm": 0.21032308816890635, + "kl": 0.29449462890625, + "learning_rate": 4.817283950617283e-07, + "loss": 0.0003, + "reward": 1.7571429088711739, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 3902 + }, + { + "completion_length": 247.4509038925171, + "epoch": 0.6545119242214678, + "grad_norm": 0.3046733976307481, + "kl": 0.14829254150390625, + "learning_rate": 4.819753086419753e-07, + "loss": 0.0001, + "reward": 1.8089286163449287, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8133928775787354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3904 + }, + { + "completion_length": 246.89733695983887, + "epoch": 0.6548472274613354, + "grad_norm": 0.39941927413480616, + "kl": 0.2657928466796875, + "learning_rate": 4.822222222222222e-07, + "loss": 0.0003, + "reward": 1.7285714969038963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 3906 + }, + { + "completion_length": 251.4241180419922, + "epoch": 0.6551825307012029, + "grad_norm": 0.16084090620029584, + "kl": 0.05809783935546875, + "learning_rate": 4.824691358024692e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3908 + }, + { + "completion_length": 253.85269165039062, + "epoch": 0.6555178339410704, + "grad_norm": 0.19861177988293185, + "kl": 0.0594329833984375, + "learning_rate": 4.82716049382716e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 3910 + }, + { + "completion_length": 246.08929634094238, + "epoch": 0.655853137180938, + "grad_norm": 0.24120335029514853, + "kl": 0.047882080078125, + "learning_rate": 4.829629629629629e-07, + "loss": 0.0, + "reward": 1.707142949104309, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7071428913623095, + "rewards/format_reward_func": 1.0, + "step": 3912 + }, + { + "completion_length": 245.1428689956665, + "epoch": 0.6561884404208056, + "grad_norm": 0.30892198013938815, + "kl": 0.05008697509765625, + "learning_rate": 4.832098765432099e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 3914 + }, + { + "completion_length": 256.74554920196533, + "epoch": 0.6565237436606731, + "grad_norm": 0.2089904775804934, + "kl": 0.05450439453125, + "learning_rate": 4.834567901234567e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7839286141097546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3916 + }, + { + "completion_length": 250.4196548461914, + "epoch": 0.6568590469005406, + "grad_norm": 0.26592058060680673, + "kl": 0.0504913330078125, + "learning_rate": 4.837037037037037e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 3918 + }, + { + "completion_length": 249.6785831451416, + "epoch": 0.6571943501404083, + "grad_norm": 0.040893189818078074, + "kl": 0.04575347900390625, + "learning_rate": 4.839506172839506e-07, + "loss": 0.0, + "reward": 1.782142922282219, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3920 + }, + { + "completion_length": 259.4509057998657, + "epoch": 0.6575296533802758, + "grad_norm": 0.13055749625194799, + "kl": 0.18926239013671875, + "learning_rate": 4.841975308641975e-07, + "loss": 0.0002, + "reward": 1.7000000700354576, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7089285999536514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3922 + }, + { + "completion_length": 257.4285821914673, + "epoch": 0.6578649566201433, + "grad_norm": 0.2842193792194698, + "kl": 0.5179595947265625, + "learning_rate": 4.844444444444445e-07, + "loss": 0.0005, + "reward": 1.7035714983940125, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.712500037625432, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3924 + }, + { + "completion_length": 257.8616199493408, + "epoch": 0.6582002598600109, + "grad_norm": 0.21372307018495712, + "kl": 0.04400634765625, + "learning_rate": 4.846913580246913e-07, + "loss": 0.0, + "reward": 1.6982143595814705, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.702678607776761, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3926 + }, + { + "completion_length": 250.1741189956665, + "epoch": 0.6585355630998785, + "grad_norm": 0.2179283235343074, + "kl": 0.0731353759765625, + "learning_rate": 4.849382716049383e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 3928 + }, + { + "completion_length": 255.47769165039062, + "epoch": 0.658870866339746, + "grad_norm": 0.20049233410239328, + "kl": 0.0641937255859375, + "learning_rate": 4.851851851851852e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 3930 + }, + { + "completion_length": 248.6294755935669, + "epoch": 0.6592061695796135, + "grad_norm": 0.41126545761337224, + "kl": 0.062530517578125, + "learning_rate": 4.85432098765432e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7589285969734192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3932 + }, + { + "completion_length": 243.2500114440918, + "epoch": 0.6595414728194812, + "grad_norm": 0.3831697730735232, + "kl": 0.14061737060546875, + "learning_rate": 4.85679012345679e-07, + "loss": 0.0001, + "reward": 1.7517857998609543, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3934 + }, + { + "completion_length": 244.65625953674316, + "epoch": 0.6598767760593487, + "grad_norm": 0.19433989843152025, + "kl": 0.05853271484375, + "learning_rate": 4.859259259259259e-07, + "loss": 0.0001, + "reward": 1.723214365541935, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276786118745804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3936 + }, + { + "completion_length": 247.46429824829102, + "epoch": 0.6602120792992162, + "grad_norm": 0.27797701510862327, + "kl": 0.07159423828125, + "learning_rate": 4.861728395061729e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 3938 + }, + { + "completion_length": 245.87054824829102, + "epoch": 0.6605473825390837, + "grad_norm": 0.3248319516246433, + "kl": 0.05426025390625, + "learning_rate": 4.864197530864198e-07, + "loss": 0.0001, + "reward": 1.7000000923871994, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7000000365078449, + "rewards/format_reward_func": 1.0, + "step": 3940 + }, + { + "completion_length": 256.95536613464355, + "epoch": 0.6608826857789514, + "grad_norm": 0.21364917257066787, + "kl": 0.9769058227539062, + "learning_rate": 4.866666666666666e-07, + "loss": 0.001, + "reward": 1.7607143446803093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 3942 + }, + { + "completion_length": 242.3303680419922, + "epoch": 0.6612179890188189, + "grad_norm": 0.6023822317409888, + "kl": 1.2734832763671875, + "learning_rate": 4.869135802469136e-07, + "loss": 0.0013, + "reward": 1.7776786386966705, + "reward_std": 0.08207489224150777, + "rewards/equation_reward_func": 0.7839285992085934, + "rewards/format_reward_func": 0.9937500059604645, + "step": 3944 + }, + { + "completion_length": 247.9821538925171, + "epoch": 0.6615532922586864, + "grad_norm": 0.21272371324088035, + "kl": 0.08065032958984375, + "learning_rate": 4.871604938271604e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 3946 + }, + { + "completion_length": 255.55804824829102, + "epoch": 0.661888595498554, + "grad_norm": 0.2834225170622278, + "kl": 0.0803070068359375, + "learning_rate": 4.874074074074073e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3948 + }, + { + "completion_length": 249.44197845458984, + "epoch": 0.6622238987384216, + "grad_norm": 0.16218911032112202, + "kl": 0.06241607666015625, + "learning_rate": 4.876543209876543e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 3950 + }, + { + "completion_length": 241.6071548461914, + "epoch": 0.6625592019782891, + "grad_norm": 0.24746330003861983, + "kl": 0.06597900390625, + "learning_rate": 4.879012345679012e-07, + "loss": 0.0001, + "reward": 1.8058036267757416, + "reward_std": 0.03219861118122935, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 0.9986607171595097, + "step": 3952 + }, + { + "completion_length": 256.14733505249023, + "epoch": 0.6628945052181566, + "grad_norm": 0.26130082812710603, + "kl": 0.0624542236328125, + "learning_rate": 4.881481481481482e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 3954 + }, + { + "completion_length": 238.16519165039062, + "epoch": 0.6632298084580243, + "grad_norm": 0.5290773175802812, + "kl": 0.268890380859375, + "learning_rate": 4.883950617283951e-07, + "loss": 0.0003, + "reward": 1.7696429267525673, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3956 + }, + { + "completion_length": 247.38840675354004, + "epoch": 0.6635651116978918, + "grad_norm": 0.18398423923323337, + "kl": 0.072296142578125, + "learning_rate": 4.88641975308642e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 3958 + }, + { + "completion_length": 238.4776906967163, + "epoch": 0.6639004149377593, + "grad_norm": 0.9794345870204398, + "kl": 0.08960723876953125, + "learning_rate": 4.888888888888889e-07, + "loss": 0.0001, + "reward": 1.742857240140438, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571656346321, + "rewards/format_reward_func": 1.0, + "step": 3960 + }, + { + "completion_length": 239.95983409881592, + "epoch": 0.6642357181776269, + "grad_norm": 0.27360081177453655, + "kl": 0.092071533203125, + "learning_rate": 4.891358024691357e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 3962 + }, + { + "completion_length": 251.196439743042, + "epoch": 0.6645710214174945, + "grad_norm": 0.31778832714402766, + "kl": 0.05353546142578125, + "learning_rate": 4.893827160493827e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 3964 + }, + { + "completion_length": 242.4687623977661, + "epoch": 0.664906324657362, + "grad_norm": 0.17589893133387763, + "kl": 0.10141754150390625, + "learning_rate": 4.896296296296296e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 3966 + }, + { + "completion_length": 233.05357933044434, + "epoch": 0.6652416278972295, + "grad_norm": 0.21849168483193168, + "kl": 0.052154541015625, + "learning_rate": 4.898765432098765e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571660071611, + "rewards/format_reward_func": 1.0, + "step": 3968 + }, + { + "completion_length": 245.81697273254395, + "epoch": 0.6655769311370971, + "grad_norm": 0.17133072243592265, + "kl": 0.060943603515625, + "learning_rate": 4.901234567901235e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500283122063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3970 + }, + { + "completion_length": 230.18304920196533, + "epoch": 0.6659122343769647, + "grad_norm": 0.28829507057273934, + "kl": 0.063507080078125, + "learning_rate": 4.903703703703703e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 3972 + }, + { + "completion_length": 245.37500953674316, + "epoch": 0.6662475376168322, + "grad_norm": 0.38017604329471877, + "kl": 0.047882080078125, + "learning_rate": 4.906172839506173e-07, + "loss": 0.0, + "reward": 1.773214340209961, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3974 + }, + { + "completion_length": 243.52233123779297, + "epoch": 0.6665828408566998, + "grad_norm": 0.15516051295757663, + "kl": 0.05037689208984375, + "learning_rate": 4.908641975308642e-07, + "loss": 0.0001, + "reward": 1.716071531176567, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7205357439815998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 3976 + }, + { + "completion_length": 238.6875123977661, + "epoch": 0.6669181440965674, + "grad_norm": 0.24182324189802537, + "kl": 0.04228973388671875, + "learning_rate": 4.91111111111111e-07, + "loss": 0.0, + "reward": 1.7285714820027351, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7375000454485416, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3978 + }, + { + "completion_length": 234.69643688201904, + "epoch": 0.6672534473364349, + "grad_norm": 0.2698467568318753, + "kl": 0.0405731201171875, + "learning_rate": 4.91358024691358e-07, + "loss": 0.0, + "reward": 1.7500000819563866, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 3980 + }, + { + "completion_length": 236.25893783569336, + "epoch": 0.6675887505763024, + "grad_norm": 0.23898061740767884, + "kl": 0.0444488525390625, + "learning_rate": 4.916049382716049e-07, + "loss": 0.0, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 3982 + }, + { + "completion_length": 246.40626049041748, + "epoch": 0.66792405381617, + "grad_norm": 0.20696305255706168, + "kl": 0.05846405029296875, + "learning_rate": 4.918518518518519e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428824216127, + "rewards/format_reward_func": 1.0, + "step": 3984 + }, + { + "completion_length": 248.3214406967163, + "epoch": 0.6682593570560376, + "grad_norm": 0.21276370391828747, + "kl": 0.04009246826171875, + "learning_rate": 4.920987654320987e-07, + "loss": 0.0, + "reward": 1.8107143342494965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 3986 + }, + { + "completion_length": 254.3259038925171, + "epoch": 0.6685946602959051, + "grad_norm": 0.10395895819260763, + "kl": 0.0443267822265625, + "learning_rate": 4.923456790123456e-07, + "loss": 0.0, + "reward": 1.68571437895298, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.6857143174856901, + "rewards/format_reward_func": 1.0, + "step": 3988 + }, + { + "completion_length": 233.83929824829102, + "epoch": 0.6689299635357726, + "grad_norm": 0.3865026619681927, + "kl": 0.055450439453125, + "learning_rate": 4.925925925925926e-07, + "loss": 0.0001, + "reward": 1.8285714983940125, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.8375000171363354, + "rewards/format_reward_func": 0.9910714328289032, + "step": 3990 + }, + { + "completion_length": 247.30358219146729, + "epoch": 0.6692652667756402, + "grad_norm": 0.1731516426047963, + "kl": 0.0422210693359375, + "learning_rate": 4.928395061728395e-07, + "loss": 0.0, + "reward": 1.8142857626080513, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857514321804, + "rewards/format_reward_func": 1.0, + "step": 3992 + }, + { + "completion_length": 242.26340579986572, + "epoch": 0.6696005700155078, + "grad_norm": 0.3130980713610786, + "kl": 0.0402679443359375, + "learning_rate": 4.930864197530864e-07, + "loss": 0.0, + "reward": 1.7785715013742447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 3994 + }, + { + "completion_length": 240.4776906967163, + "epoch": 0.6699358732553753, + "grad_norm": 0.18480171777833662, + "kl": 0.04180145263671875, + "learning_rate": 4.933333333333333e-07, + "loss": 0.0, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 3996 + }, + { + "completion_length": 239.3214406967163, + "epoch": 0.6702711764952429, + "grad_norm": 0.2287792806277298, + "kl": 0.05632781982421875, + "learning_rate": 4.935802469135802e-07, + "loss": 0.0001, + "reward": 1.8071428909897804, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 3998 + }, + { + "completion_length": 233.94643783569336, + "epoch": 0.6706064797351105, + "grad_norm": 0.05918938448798421, + "kl": 0.038482666015625, + "learning_rate": 4.938271604938271e-07, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7732143104076385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4000 + }, + { + "completion_length": 253.99554920196533, + "epoch": 0.670941782974978, + "grad_norm": 0.1558838101555109, + "kl": 0.039703369140625, + "learning_rate": 4.94074074074074e-07, + "loss": 0.0, + "reward": 1.730357214808464, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7348214518278837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4002 + }, + { + "completion_length": 237.4062623977661, + "epoch": 0.6712770862148455, + "grad_norm": 0.23672466601064102, + "kl": 0.03436279296875, + "learning_rate": 4.94320987654321e-07, + "loss": 0.0, + "reward": 1.755357213318348, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4004 + }, + { + "completion_length": 235.45983028411865, + "epoch": 0.6716123894547131, + "grad_norm": 0.23630688175269002, + "kl": 0.04888153076171875, + "learning_rate": 4.945679012345679e-07, + "loss": 0.0, + "reward": 1.7625000774860382, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643275439739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4006 + }, + { + "completion_length": 234.4107255935669, + "epoch": 0.6719476926945807, + "grad_norm": 0.1509615780935135, + "kl": 0.0428009033203125, + "learning_rate": 4.948148148148148e-07, + "loss": 0.0, + "reward": 1.7142857909202576, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 4008 + }, + { + "completion_length": 248.5669755935669, + "epoch": 0.6722829959344482, + "grad_norm": 0.24903427922550173, + "kl": 0.03661346435546875, + "learning_rate": 4.950617283950617e-07, + "loss": 0.0, + "reward": 1.7750000804662704, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7839285917580128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4010 + }, + { + "completion_length": 239.47322463989258, + "epoch": 0.6726182991743158, + "grad_norm": 0.3243591425526177, + "kl": 0.0417327880859375, + "learning_rate": 4.953086419753086e-07, + "loss": 0.0, + "reward": 1.7803572118282318, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7848214469850063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4012 + }, + { + "completion_length": 245.58483028411865, + "epoch": 0.6729536024141833, + "grad_norm": 0.19513335100841234, + "kl": 0.055511474609375, + "learning_rate": 4.955555555555556e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 4014 + }, + { + "completion_length": 232.09822273254395, + "epoch": 0.6732889056540509, + "grad_norm": 0.2645182804473894, + "kl": 0.0370941162109375, + "learning_rate": 4.958024691358024e-07, + "loss": 0.0, + "reward": 1.8321428894996643, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321428894996643, + "rewards/format_reward_func": 1.0, + "step": 4016 + }, + { + "completion_length": 232.3392972946167, + "epoch": 0.6736242088939184, + "grad_norm": 0.2761508130046895, + "kl": 0.04465484619140625, + "learning_rate": 4.960493827160493e-07, + "loss": 0.0, + "reward": 1.7642857804894447, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 4018 + }, + { + "completion_length": 244.05358123779297, + "epoch": 0.673959512133786, + "grad_norm": 0.29287932155581453, + "kl": 0.085784912109375, + "learning_rate": 4.962962962962963e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 4020 + }, + { + "completion_length": 240.5223331451416, + "epoch": 0.6742948153736535, + "grad_norm": 0.224555054040133, + "kl": 0.03826904296875, + "learning_rate": 4.965432098765432e-07, + "loss": 0.0, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 4022 + }, + { + "completion_length": 234.50001335144043, + "epoch": 0.6746301186135211, + "grad_norm": 0.24682979395578233, + "kl": 0.037628173828125, + "learning_rate": 4.967901234567901e-07, + "loss": 0.0, + "reward": 1.7964286506175995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 4024 + }, + { + "completion_length": 233.65179634094238, + "epoch": 0.6749654218533887, + "grad_norm": 0.1787319429847436, + "kl": 0.04058074951171875, + "learning_rate": 4.97037037037037e-07, + "loss": 0.0, + "reward": 1.7892857566475868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 4026 + }, + { + "completion_length": 252.76340675354004, + "epoch": 0.6753007250932562, + "grad_norm": 0.39406210419938975, + "kl": 0.0468902587890625, + "learning_rate": 4.972839506172839e-07, + "loss": 0.0, + "reward": 1.7428572103381157, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7428571656346321, + "rewards/format_reward_func": 1.0, + "step": 4028 + }, + { + "completion_length": 242.0937614440918, + "epoch": 0.6756360283331238, + "grad_norm": 0.4016790575300383, + "kl": 0.043731689453125, + "learning_rate": 4.975308641975308e-07, + "loss": 0.0, + "reward": 1.764285758137703, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 4030 + }, + { + "completion_length": 244.1384048461914, + "epoch": 0.6759713315729913, + "grad_norm": 0.33463115537827764, + "kl": 0.04061126708984375, + "learning_rate": 4.977777777777777e-07, + "loss": 0.0, + "reward": 1.7857143431901932, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7946428768336773, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4032 + }, + { + "completion_length": 252.40179443359375, + "epoch": 0.6763066348128589, + "grad_norm": 0.16931830679373155, + "kl": 0.0411834716796875, + "learning_rate": 4.980246913580247e-07, + "loss": 0.0, + "reward": 1.8017857521772385, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4034 + }, + { + "completion_length": 241.4687614440918, + "epoch": 0.6766419380527264, + "grad_norm": 0.3814898253396864, + "kl": 0.0712890625, + "learning_rate": 4.982716049382716e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 4036 + }, + { + "completion_length": 247.8750123977661, + "epoch": 0.676977241292594, + "grad_norm": 0.16397861442788428, + "kl": 0.0486602783203125, + "learning_rate": 4.985185185185185e-07, + "loss": 0.0, + "reward": 1.7410715147852898, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7544643171131611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4038 + }, + { + "completion_length": 252.11608409881592, + "epoch": 0.6773125445324616, + "grad_norm": 0.6544560134450959, + "kl": 0.1189422607421875, + "learning_rate": 4.987654320987654e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7482143267989159, + "rewards/format_reward_func": 0.9964285716414452, + "step": 4040 + }, + { + "completion_length": 245.38394165039062, + "epoch": 0.6776478477723291, + "grad_norm": 0.2508024090704286, + "kl": 0.0443267822265625, + "learning_rate": 4.990123456790123e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 4042 + }, + { + "completion_length": 251.5044765472412, + "epoch": 0.6779831510121966, + "grad_norm": 0.2267515588216866, + "kl": 0.03842926025390625, + "learning_rate": 4.992592592592593e-07, + "loss": 0.0, + "reward": 1.7410714849829674, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357573926449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4044 + }, + { + "completion_length": 248.20090579986572, + "epoch": 0.6783184542520642, + "grad_norm": 0.24181546344204055, + "kl": 0.04335784912109375, + "learning_rate": 4.995061728395061e-07, + "loss": 0.0, + "reward": 1.7535715103149414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 4046 + }, + { + "completion_length": 243.383939743042, + "epoch": 0.6786537574919318, + "grad_norm": 0.1358614247171582, + "kl": 0.03560638427734375, + "learning_rate": 4.99753086419753e-07, + "loss": 0.0, + "reward": 1.7392858043313026, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 4048 + }, + { + "completion_length": 253.8303689956665, + "epoch": 0.6789890607317993, + "grad_norm": 0.17895015083217394, + "kl": 0.0395355224609375, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": 1.7450893595814705, + "reward_std": 0.037249373737722635, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4050 + }, + { + "completion_length": 255.852689743042, + "epoch": 0.6793243639716668, + "grad_norm": 0.28552055154012596, + "kl": 0.038604736328125, + "learning_rate": 4.999999997122036e-07, + "loss": 0.0, + "reward": 1.7285715192556381, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.728571442887187, + "rewards/format_reward_func": 1.0, + "step": 4052 + }, + { + "completion_length": 243.11608123779297, + "epoch": 0.6796596672115345, + "grad_norm": 0.4019371586263685, + "kl": 0.04984283447265625, + "learning_rate": 4.999999988488147e-07, + "loss": 0.0, + "reward": 1.682142972946167, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6821428947150707, + "rewards/format_reward_func": 1.0, + "step": 4054 + }, + { + "completion_length": 255.87947845458984, + "epoch": 0.679994970451402, + "grad_norm": 0.21289432083989482, + "kl": 0.04183197021484375, + "learning_rate": 4.999999974098329e-07, + "loss": 0.0, + "reward": 1.7607143744826317, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 4056 + }, + { + "completion_length": 247.18304538726807, + "epoch": 0.6803302736912695, + "grad_norm": 0.5173896017748014, + "kl": 0.05936431884765625, + "learning_rate": 4.999999953952585e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 4058 + }, + { + "completion_length": 254.18751335144043, + "epoch": 0.680665576931137, + "grad_norm": 0.21114889600510645, + "kl": 0.03630828857421875, + "learning_rate": 4.999999928050913e-07, + "loss": 0.0, + "reward": 1.6946429163217545, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.6991071812808514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4060 + }, + { + "completion_length": 263.2276916503906, + "epoch": 0.6810008801710047, + "grad_norm": 0.7612326196090565, + "kl": 0.044281005859375, + "learning_rate": 4.999999896393315e-07, + "loss": 0.0, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 4062 + }, + { + "completion_length": 249.9509038925171, + "epoch": 0.6813361834108722, + "grad_norm": 0.3232769892257542, + "kl": 0.0448150634765625, + "learning_rate": 4.99999985897979e-07, + "loss": 0.0, + "reward": 1.7696429193019867, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4064 + }, + { + "completion_length": 250.33483409881592, + "epoch": 0.6816714866507397, + "grad_norm": 0.2483480478469988, + "kl": 0.03652191162109375, + "learning_rate": 4.99999981581034e-07, + "loss": 0.0, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4066 + }, + { + "completion_length": 254.665189743042, + "epoch": 0.6820067898906074, + "grad_norm": 0.18635947460012328, + "kl": 0.03978729248046875, + "learning_rate": 4.999999766884962e-07, + "loss": 0.0, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 4068 + }, + { + "completion_length": 251.08483123779297, + "epoch": 0.6823420931304749, + "grad_norm": 0.16341314696001333, + "kl": 0.03818511962890625, + "learning_rate": 4.999999712203659e-07, + "loss": 0.0, + "reward": 1.7321429252624512, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7321428805589676, + "rewards/format_reward_func": 1.0, + "step": 4070 + }, + { + "completion_length": 248.5803689956665, + "epoch": 0.6826773963703424, + "grad_norm": 0.22882997275793895, + "kl": 0.03765106201171875, + "learning_rate": 4.999999651766429e-07, + "loss": 0.0, + "reward": 1.8285714909434319, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 4072 + }, + { + "completion_length": 247.45983028411865, + "epoch": 0.6830126996102099, + "grad_norm": 0.30347725721095986, + "kl": 0.03338623046875, + "learning_rate": 4.999999585573273e-07, + "loss": 0.0, + "reward": 1.769642911851406, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4074 + }, + { + "completion_length": 243.571439743042, + "epoch": 0.6833480028500776, + "grad_norm": 0.09929448446690448, + "kl": 0.0332489013671875, + "learning_rate": 4.99999951362419e-07, + "loss": 0.0, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 4076 + }, + { + "completion_length": 248.22322463989258, + "epoch": 0.6836833060899451, + "grad_norm": 0.23674906761103792, + "kl": 0.03179168701171875, + "learning_rate": 4.999999435919181e-07, + "loss": 0.0, + "reward": 1.8196429163217545, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071738302708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4078 + }, + { + "completion_length": 249.00001430511475, + "epoch": 0.6840186093298126, + "grad_norm": 0.2803521117329266, + "kl": 0.03696441650390625, + "learning_rate": 4.999999352458248e-07, + "loss": 0.0, + "reward": 1.7571429461240768, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.757142873480916, + "rewards/format_reward_func": 1.0, + "step": 4080 + }, + { + "completion_length": 255.54465293884277, + "epoch": 0.6843539125696803, + "grad_norm": 0.18009514947831426, + "kl": 0.08721160888671875, + "learning_rate": 4.999999263241389e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7803571708500385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4082 + }, + { + "completion_length": 255.66072463989258, + "epoch": 0.6846892158095478, + "grad_norm": 0.2365843923361676, + "kl": 0.04498291015625, + "learning_rate": 4.999999168268604e-07, + "loss": 0.0, + "reward": 1.7642857730388641, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 4084 + }, + { + "completion_length": 246.0134048461914, + "epoch": 0.6850245190494153, + "grad_norm": 0.4343459100999908, + "kl": 0.03946685791015625, + "learning_rate": 4.999999067539895e-07, + "loss": 0.0, + "reward": 1.7660714909434319, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4086 + }, + { + "completion_length": 255.75001335144043, + "epoch": 0.6853598222892828, + "grad_norm": 0.22611496179012103, + "kl": 0.03882598876953125, + "learning_rate": 4.99999896105526e-07, + "loss": 0.0, + "reward": 1.7589286491274834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.763392873108387, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4088 + }, + { + "completion_length": 245.85268878936768, + "epoch": 0.6856951255291505, + "grad_norm": 0.1820744807846706, + "kl": 0.055633544921875, + "learning_rate": 4.999998848814701e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 4090 + }, + { + "completion_length": 245.1785831451416, + "epoch": 0.686030428769018, + "grad_norm": 0.2699339435142035, + "kl": 0.0371856689453125, + "learning_rate": 4.999998730818219e-07, + "loss": 0.0, + "reward": 1.7767857760190964, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500409781933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4092 + }, + { + "completion_length": 250.8794755935669, + "epoch": 0.6863657320088855, + "grad_norm": 0.18753002933538254, + "kl": 0.06398773193359375, + "learning_rate": 4.999998607065812e-07, + "loss": 0.0001, + "reward": 1.7285714820027351, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714726895094, + "rewards/format_reward_func": 1.0, + "step": 4094 + }, + { + "completion_length": 250.4509038925171, + "epoch": 0.6867010352487531, + "grad_norm": 0.23536728398067072, + "kl": 0.0369110107421875, + "learning_rate": 4.999998477557482e-07, + "loss": 0.0, + "reward": 1.7250000834465027, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 4096 + }, + { + "completion_length": 249.23215770721436, + "epoch": 0.6870363384886207, + "grad_norm": 0.2694385693254488, + "kl": 0.04195404052734375, + "learning_rate": 4.999998342293226e-07, + "loss": 0.0, + "reward": 1.7125000655651093, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7169643379747868, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4098 + }, + { + "completion_length": 237.4642972946167, + "epoch": 0.6873716417284882, + "grad_norm": 0.20593764322751265, + "kl": 0.05100250244140625, + "learning_rate": 4.999998201273049e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7982143014669418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4100 + }, + { + "completion_length": 243.3303689956665, + "epoch": 0.6877069449683557, + "grad_norm": 0.2052447336531401, + "kl": 0.03682708740234375, + "learning_rate": 4.999998054496949e-07, + "loss": 0.0, + "reward": 1.7892857864499092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 4102 + }, + { + "completion_length": 248.73215293884277, + "epoch": 0.6880422482082234, + "grad_norm": 0.19766473428498785, + "kl": 0.0457916259765625, + "learning_rate": 4.999997901964926e-07, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 4104 + }, + { + "completion_length": 252.6250123977661, + "epoch": 0.6883775514480909, + "grad_norm": 0.2928526606003355, + "kl": 0.04019927978515625, + "learning_rate": 4.999997743676982e-07, + "loss": 0.0, + "reward": 1.7214286625385284, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7303571812808514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4106 + }, + { + "completion_length": 237.3973331451416, + "epoch": 0.6887128546879584, + "grad_norm": 0.11667197186742236, + "kl": 0.034725189208984375, + "learning_rate": 4.999997579633115e-07, + "loss": 0.0, + "reward": 1.8178571909666061, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.8267857357859612, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4108 + }, + { + "completion_length": 244.5446538925171, + "epoch": 0.689048157927826, + "grad_norm": 0.2764998637390368, + "kl": 0.06385040283203125, + "learning_rate": 4.999997409833327e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 4110 + }, + { + "completion_length": 244.508939743042, + "epoch": 0.6893834611676936, + "grad_norm": 0.2871939727209474, + "kl": 0.038726806640625, + "learning_rate": 4.999997234277618e-07, + "loss": 0.0, + "reward": 1.725000075995922, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 4112 + }, + { + "completion_length": 240.6428680419922, + "epoch": 0.6897187644075611, + "grad_norm": 0.26143479711552936, + "kl": 0.0357818603515625, + "learning_rate": 4.999997052965989e-07, + "loss": 0.0, + "reward": 1.767857201397419, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.767857177183032, + "rewards/format_reward_func": 1.0, + "step": 4114 + }, + { + "completion_length": 251.58929824829102, + "epoch": 0.6900540676474286, + "grad_norm": 0.3161348591284228, + "kl": 0.09564971923828125, + "learning_rate": 4.99999686589844e-07, + "loss": 0.0001, + "reward": 1.7107143551111221, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7107143215835094, + "rewards/format_reward_func": 1.0, + "step": 4116 + }, + { + "completion_length": 247.52232933044434, + "epoch": 0.6903893708872962, + "grad_norm": 0.45746669207629176, + "kl": 0.18476104736328125, + "learning_rate": 4.999996673074971e-07, + "loss": 0.0002, + "reward": 1.7696429193019867, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071600466967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4118 + }, + { + "completion_length": 249.89733123779297, + "epoch": 0.6907246741271638, + "grad_norm": 0.275198361958013, + "kl": 0.0482330322265625, + "learning_rate": 4.999996474495583e-07, + "loss": 0.0, + "reward": 1.796428643167019, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964285910129547, + "rewards/format_reward_func": 1.0, + "step": 4120 + }, + { + "completion_length": 250.66072940826416, + "epoch": 0.6910599773670313, + "grad_norm": 0.1616753946632268, + "kl": 0.0462493896484375, + "learning_rate": 4.999996270160275e-07, + "loss": 0.0, + "reward": 1.7839286103844643, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4122 + }, + { + "completion_length": 242.08929920196533, + "epoch": 0.6913952806068988, + "grad_norm": 0.28224358954519824, + "kl": 0.03643035888671875, + "learning_rate": 4.999996060069049e-07, + "loss": 0.0, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 4124 + }, + { + "completion_length": 239.62054538726807, + "epoch": 0.6917305838467664, + "grad_norm": 0.2959435225661601, + "kl": 0.32910919189453125, + "learning_rate": 4.999995844221906e-07, + "loss": 0.0003, + "reward": 1.7642857506871223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 4126 + }, + { + "completion_length": 256.1160840988159, + "epoch": 0.692065887086634, + "grad_norm": 0.16159897450649705, + "kl": 0.04415130615234375, + "learning_rate": 4.999995622618846e-07, + "loss": 0.0, + "reward": 1.7196428999304771, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4128 + }, + { + "completion_length": 241.51787185668945, + "epoch": 0.6924011903265015, + "grad_norm": 0.16545788625877894, + "kl": 0.0444793701171875, + "learning_rate": 4.999995395259868e-07, + "loss": 0.0, + "reward": 1.7250000685453415, + "reward_std": 0.025253813713788986, + "rewards/equation_reward_func": 0.7339285910129547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4130 + }, + { + "completion_length": 245.4241180419922, + "epoch": 0.6927364935663691, + "grad_norm": 0.1501842352304599, + "kl": 0.6236724853515625, + "learning_rate": 4.999995162144974e-07, + "loss": 0.0006, + "reward": 1.79464291036129, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.8080357499420643, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4132 + }, + { + "completion_length": 248.04911708831787, + "epoch": 0.6930717968062367, + "grad_norm": 0.12794145815186433, + "kl": 0.04578399658203125, + "learning_rate": 4.999994923274164e-07, + "loss": 0.0, + "reward": 1.8071429133415222, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 4134 + }, + { + "completion_length": 253.52233219146729, + "epoch": 0.6934071000461042, + "grad_norm": 0.24779154668910555, + "kl": 0.340484619140625, + "learning_rate": 4.999994678647439e-07, + "loss": 0.0003, + "reward": 1.817857213318348, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 4136 + }, + { + "completion_length": 252.89733409881592, + "epoch": 0.6937424032859717, + "grad_norm": 0.06910974345915508, + "kl": 0.4642486572265625, + "learning_rate": 4.999994428264799e-07, + "loss": 0.0005, + "reward": 1.7196429371833801, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071704775095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4138 + }, + { + "completion_length": 245.62501430511475, + "epoch": 0.6940777065258393, + "grad_norm": 0.24250637427227054, + "kl": 0.04045867919921875, + "learning_rate": 4.999994172126245e-07, + "loss": 0.0, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4140 + }, + { + "completion_length": 252.32143783569336, + "epoch": 0.6944130097657069, + "grad_norm": 0.21264235248878327, + "kl": 0.040924072265625, + "learning_rate": 4.999993910231778e-07, + "loss": 0.0, + "reward": 1.7875000685453415, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.791964303702116, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4142 + }, + { + "completion_length": 256.4776887893677, + "epoch": 0.6947483130055744, + "grad_norm": 0.18883792480654485, + "kl": 0.0831146240234375, + "learning_rate": 4.999993642581397e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.793750025331974, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4144 + }, + { + "completion_length": 256.1026906967163, + "epoch": 0.695083616245442, + "grad_norm": 0.4030051813383355, + "kl": 0.08502197265625, + "learning_rate": 4.999993369175105e-07, + "loss": 0.0001, + "reward": 1.7750000953674316, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7839285954833031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4146 + }, + { + "completion_length": 250.8705472946167, + "epoch": 0.6954189194853095, + "grad_norm": 0.25329291872625403, + "kl": 0.055938720703125, + "learning_rate": 4.999993090012901e-07, + "loss": 0.0001, + "reward": 1.723214365541935, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4148 + }, + { + "completion_length": 261.0134029388428, + "epoch": 0.6957542227251771, + "grad_norm": 0.28460622009753217, + "kl": 0.0702972412109375, + "learning_rate": 4.999992805094786e-07, + "loss": 0.0001, + "reward": 1.764285758137703, + "reward_std": 0.11111677717417479, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 0.9821428656578064, + "step": 4150 + }, + { + "completion_length": 260.727689743042, + "epoch": 0.6960895259650446, + "grad_norm": 0.24220639520506346, + "kl": 0.05754852294921875, + "learning_rate": 4.99999251442076e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.7651786059141159, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4152 + }, + { + "completion_length": 253.7053680419922, + "epoch": 0.6964248292049122, + "grad_norm": 0.20059391948110178, + "kl": 0.0672760009765625, + "learning_rate": 4.999992217990825e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 4154 + }, + { + "completion_length": 257.23661708831787, + "epoch": 0.6967601324447797, + "grad_norm": 0.17352793125955543, + "kl": 0.09174346923828125, + "learning_rate": 4.99999191580498e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4156 + }, + { + "completion_length": 247.43304538726807, + "epoch": 0.6970954356846473, + "grad_norm": 0.4455503723352921, + "kl": 0.055694580078125, + "learning_rate": 4.999991607863228e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4158 + }, + { + "completion_length": 249.290189743042, + "epoch": 0.6974307389245149, + "grad_norm": 0.207070863422582, + "kl": 0.05780029296875, + "learning_rate": 4.999991294165567e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785843074322, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4160 + }, + { + "completion_length": 243.20536613464355, + "epoch": 0.6977660421643824, + "grad_norm": 0.30572698157206896, + "kl": 0.05780029296875, + "learning_rate": 4.999990974712001e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 4162 + }, + { + "completion_length": 250.08483219146729, + "epoch": 0.69810134540425, + "grad_norm": 0.2265343376034305, + "kl": 0.0560302734375, + "learning_rate": 4.999990649502528e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 4164 + }, + { + "completion_length": 250.10715579986572, + "epoch": 0.6984366486441175, + "grad_norm": 0.204340719128157, + "kl": 0.04798126220703125, + "learning_rate": 4.999990318537149e-07, + "loss": 0.0, + "reward": 1.7571429014205933, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 4166 + }, + { + "completion_length": 261.6071548461914, + "epoch": 0.6987719518839851, + "grad_norm": 0.2263993650313919, + "kl": 0.0576019287109375, + "learning_rate": 4.999989981815865e-07, + "loss": 0.0001, + "reward": 1.7660715132951736, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4168 + }, + { + "completion_length": 258.6785840988159, + "epoch": 0.6991072551238526, + "grad_norm": 0.5688840890288541, + "kl": 0.08251953125, + "learning_rate": 4.999989639338678e-07, + "loss": 0.0001, + "reward": 1.6928572356700897, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.6928571872413158, + "rewards/format_reward_func": 1.0, + "step": 4170 + }, + { + "completion_length": 251.9866189956665, + "epoch": 0.6994425583637202, + "grad_norm": 0.26599224494018464, + "kl": 0.06134796142578125, + "learning_rate": 4.999989291105588e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7232143376022577, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4172 + }, + { + "completion_length": 243.78572463989258, + "epoch": 0.6997778616035878, + "grad_norm": 0.2615405086321927, + "kl": 0.0572052001953125, + "learning_rate": 4.999988937116595e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4174 + }, + { + "completion_length": 246.68751335144043, + "epoch": 0.7001131648434553, + "grad_norm": 0.27148599882970154, + "kl": 0.060150146484375, + "learning_rate": 4.999988577371702e-07, + "loss": 0.0001, + "reward": 1.7089286372065544, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4176 + }, + { + "completion_length": 246.977689743042, + "epoch": 0.7004484680833228, + "grad_norm": 0.28624932053045216, + "kl": 0.0514373779296875, + "learning_rate": 4.999988211870907e-07, + "loss": 0.0001, + "reward": 1.817857213318348, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 4178 + }, + { + "completion_length": 247.38840579986572, + "epoch": 0.7007837713231904, + "grad_norm": 0.23016414642039873, + "kl": 0.06423187255859375, + "learning_rate": 4.999987840614212e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4180 + }, + { + "completion_length": 244.47322845458984, + "epoch": 0.701119074563058, + "grad_norm": 0.35210518296275867, + "kl": 0.0606842041015625, + "learning_rate": 4.999987463601619e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 4182 + }, + { + "completion_length": 235.9598331451416, + "epoch": 0.7014543778029255, + "grad_norm": 0.19201963590977644, + "kl": 0.0580291748046875, + "learning_rate": 4.999987080833128e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 4184 + }, + { + "completion_length": 240.67411518096924, + "epoch": 0.701789681042793, + "grad_norm": 0.13459371154887387, + "kl": 0.04982757568359375, + "learning_rate": 4.999986692308739e-07, + "loss": 0.0, + "reward": 1.7392857745289803, + "reward_std": 0.025253813713788986, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4186 + }, + { + "completion_length": 248.06697750091553, + "epoch": 0.7021249842826607, + "grad_norm": 0.3074579710955277, + "kl": 0.05918121337890625, + "learning_rate": 4.999986298028454e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4188 + }, + { + "completion_length": 246.7500114440918, + "epoch": 0.7024602875225282, + "grad_norm": 0.3280048381695006, + "kl": 0.0514984130859375, + "learning_rate": 4.999985897992274e-07, + "loss": 0.0001, + "reward": 1.7214286252856255, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7214286029338837, + "rewards/format_reward_func": 1.0, + "step": 4190 + }, + { + "completion_length": 237.38840293884277, + "epoch": 0.7027955907623957, + "grad_norm": 0.2873894479997041, + "kl": 0.0552215576171875, + "learning_rate": 4.999985492200199e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000111758709, + "rewards/format_reward_func": 1.0, + "step": 4192 + }, + { + "completion_length": 235.9821538925171, + "epoch": 0.7031308940022633, + "grad_norm": 0.2801387774210403, + "kl": 0.0472564697265625, + "learning_rate": 4.999985080652232e-07, + "loss": 0.0, + "reward": 1.7357143461704254, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4194 + }, + { + "completion_length": 241.6071548461914, + "epoch": 0.7034661972421309, + "grad_norm": 0.2864929528982983, + "kl": 0.0567474365234375, + "learning_rate": 4.99998466334837e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7375000268220901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4196 + }, + { + "completion_length": 233.7232255935669, + "epoch": 0.7038015004819984, + "grad_norm": 0.27030146938564104, + "kl": 0.0601806640625, + "learning_rate": 4.999984240288618e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7928571682423353, + "rewards/format_reward_func": 1.0, + "step": 4198 + }, + { + "completion_length": 229.2187614440918, + "epoch": 0.7041368037218659, + "grad_norm": 0.12054623921859627, + "kl": 0.061126708984375, + "learning_rate": 4.999983811472975e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 4200 + }, + { + "completion_length": 234.73215293884277, + "epoch": 0.7044721069617336, + "grad_norm": 0.19157688673168496, + "kl": 0.05464935302734375, + "learning_rate": 4.999983376901442e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 4202 + }, + { + "completion_length": 232.62500953674316, + "epoch": 0.7048074102016011, + "grad_norm": 0.31481143427392383, + "kl": 0.05657958984375, + "learning_rate": 4.99998293657402e-07, + "loss": 0.0001, + "reward": 1.7767857909202576, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4204 + }, + { + "completion_length": 241.8125114440918, + "epoch": 0.7051427134414686, + "grad_norm": 0.7161200664730465, + "kl": 0.06140899658203125, + "learning_rate": 4.999982490490711e-07, + "loss": 0.0001, + "reward": 1.7500001043081284, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 4206 + }, + { + "completion_length": 237.290189743042, + "epoch": 0.7054780166813361, + "grad_norm": 0.2976173951165174, + "kl": 0.074066162109375, + "learning_rate": 4.999982038651515e-07, + "loss": 0.0001, + "reward": 1.7142858058214188, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857313156128, + "rewards/format_reward_func": 1.0, + "step": 4208 + }, + { + "completion_length": 237.55804634094238, + "epoch": 0.7058133199212038, + "grad_norm": 1.294270106977747, + "kl": 0.0904998779296875, + "learning_rate": 4.999981581056434e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 4210 + }, + { + "completion_length": 267.2544746398926, + "epoch": 0.7061486231610713, + "grad_norm": 0.6168138104071692, + "kl": 0.399688720703125, + "learning_rate": 4.999981117705468e-07, + "loss": 0.0004, + "reward": 1.526785783469677, + "reward_std": 0.33082496002316475, + "rewards/equation_reward_func": 0.6741071753203869, + "rewards/format_reward_func": 0.8526786118745804, + "step": 4212 + }, + { + "completion_length": 260.12500858306885, + "epoch": 0.7064839264009388, + "grad_norm": 0.31882764649050893, + "kl": 0.3432769775390625, + "learning_rate": 4.999980648598619e-07, + "loss": 0.0003, + "reward": 1.548214353621006, + "reward_std": 0.154048265889287, + "rewards/equation_reward_func": 0.6419643182307482, + "rewards/format_reward_func": 0.9062500335276127, + "step": 4214 + }, + { + "completion_length": 246.56697750091553, + "epoch": 0.7068192296408065, + "grad_norm": 0.24098316483255156, + "kl": 0.2245941162109375, + "learning_rate": 4.999980173735887e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.09596449136734009, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 0.9821428656578064, + "step": 4216 + }, + { + "completion_length": 234.25893783569336, + "epoch": 0.707154532880674, + "grad_norm": 0.316137458068654, + "kl": 0.201934814453125, + "learning_rate": 4.999979693117275e-07, + "loss": 0.0002, + "reward": 1.7839286178350449, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4218 + }, + { + "completion_length": 237.37947368621826, + "epoch": 0.7074898361205415, + "grad_norm": 0.08814150042085736, + "kl": 0.1511077880859375, + "learning_rate": 4.999979206742782e-07, + "loss": 0.0002, + "reward": 1.7214286625385284, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7214286010712385, + "rewards/format_reward_func": 1.0, + "step": 4220 + }, + { + "completion_length": 238.258939743042, + "epoch": 0.707825139360409, + "grad_norm": 0.18570467030693735, + "kl": 0.11859130859375, + "learning_rate": 4.99997871461241e-07, + "loss": 0.0001, + "reward": 1.7178572118282318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571987897158, + "rewards/format_reward_func": 1.0, + "step": 4222 + }, + { + "completion_length": 231.2009048461914, + "epoch": 0.7081604426002767, + "grad_norm": 0.2520463039518999, + "kl": 0.0881805419921875, + "learning_rate": 4.99997821672616e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4224 + }, + { + "completion_length": 221.31250953674316, + "epoch": 0.7084957458401442, + "grad_norm": 0.40034280305593867, + "kl": 0.0886993408203125, + "learning_rate": 4.999977713084033e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7883928939700127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4226 + }, + { + "completion_length": 232.2857255935669, + "epoch": 0.7088310490800117, + "grad_norm": 0.18969120135227732, + "kl": 0.161529541015625, + "learning_rate": 4.99997720368603e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 4228 + }, + { + "completion_length": 224.8750114440918, + "epoch": 0.7091663523198793, + "grad_norm": 0.17195233399392781, + "kl": 0.1042633056640625, + "learning_rate": 4.999976688532153e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 4230 + }, + { + "completion_length": 216.96429538726807, + "epoch": 0.7095016555597469, + "grad_norm": 0.26720859900169286, + "kl": 0.0937652587890625, + "learning_rate": 4.999976167622403e-07, + "loss": 0.0001, + "reward": 1.7892858013510704, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 4232 + }, + { + "completion_length": 230.7321548461914, + "epoch": 0.7098369587996144, + "grad_norm": 0.15169793509915727, + "kl": 0.101806640625, + "learning_rate": 4.99997564095678e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 4234 + }, + { + "completion_length": 224.22768783569336, + "epoch": 0.7101722620394819, + "grad_norm": 0.0039682440196801665, + "kl": 0.0744781494140625, + "learning_rate": 4.999975108535288e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 4236 + }, + { + "completion_length": 231.23215293884277, + "epoch": 0.7105075652793496, + "grad_norm": 0.25855297377161357, + "kl": 0.1085968017578125, + "learning_rate": 4.999974570357925e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 4238 + }, + { + "completion_length": 221.85715293884277, + "epoch": 0.7108428685192171, + "grad_norm": 0.33264525079129287, + "kl": 0.124664306640625, + "learning_rate": 4.999974026424694e-07, + "loss": 0.0001, + "reward": 1.7665179520845413, + "reward_std": 0.037249373737722635, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4240 + }, + { + "completion_length": 230.91072273254395, + "epoch": 0.7111781717590846, + "grad_norm": 0.30380484394856, + "kl": 0.088958740234375, + "learning_rate": 4.999973476735596e-07, + "loss": 0.0001, + "reward": 1.7142858058214188, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 4242 + }, + { + "completion_length": 209.61161518096924, + "epoch": 0.7115134749989521, + "grad_norm": 0.4488191314283875, + "kl": 0.10036468505859375, + "learning_rate": 4.999972921290632e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4244 + }, + { + "completion_length": 215.65626049041748, + "epoch": 0.7118487782388198, + "grad_norm": 0.2875484664537562, + "kl": 0.10089874267578125, + "learning_rate": 4.999972360089804e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 4246 + }, + { + "completion_length": 224.06697368621826, + "epoch": 0.7121840814786873, + "grad_norm": 0.25228779296360854, + "kl": 0.1021575927734375, + "learning_rate": 4.99997179313311e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857142947614193, + "rewards/format_reward_func": 1.0, + "step": 4248 + }, + { + "completion_length": 221.2991189956665, + "epoch": 0.7125193847185548, + "grad_norm": 0.21487976932657415, + "kl": 0.101104736328125, + "learning_rate": 4.999971220420557e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928824216127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4250 + }, + { + "completion_length": 221.16518878936768, + "epoch": 0.7128546879584224, + "grad_norm": 0.17815512314744297, + "kl": 0.1637115478515625, + "learning_rate": 4.999970641952142e-07, + "loss": 0.0002, + "reward": 1.7464286163449287, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464286349713802, + "rewards/format_reward_func": 1.0, + "step": 4252 + }, + { + "completion_length": 229.12054634094238, + "epoch": 0.71318999119829, + "grad_norm": 0.28107228216462243, + "kl": 0.1016998291015625, + "learning_rate": 4.999970057727869e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 4254 + }, + { + "completion_length": 224.0803680419922, + "epoch": 0.7135252944381575, + "grad_norm": 0.2769542511971505, + "kl": 0.09307861328125, + "learning_rate": 4.999969467747736e-07, + "loss": 0.0001, + "reward": 1.6785715073347092, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.678571455180645, + "rewards/format_reward_func": 1.0, + "step": 4256 + }, + { + "completion_length": 220.14286708831787, + "epoch": 0.713860597678025, + "grad_norm": 0.3152799908343859, + "kl": 0.1051177978515625, + "learning_rate": 4.999968872011748e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 4258 + }, + { + "completion_length": 218.1741180419922, + "epoch": 0.7141959009178926, + "grad_norm": 0.3424092259120418, + "kl": 0.09275054931640625, + "learning_rate": 4.999968270519905e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4260 + }, + { + "completion_length": 217.66965198516846, + "epoch": 0.7145312041577602, + "grad_norm": 0.24849842953591986, + "kl": 0.1216278076171875, + "learning_rate": 4.999967663272207e-07, + "loss": 0.0001, + "reward": 1.817857176065445, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571574389935, + "rewards/format_reward_func": 1.0, + "step": 4262 + }, + { + "completion_length": 216.36161518096924, + "epoch": 0.7148665073976277, + "grad_norm": 0.1771942860689893, + "kl": 0.1179046630859375, + "learning_rate": 4.999967050268657e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 4264 + }, + { + "completion_length": 227.24108028411865, + "epoch": 0.7152018106374953, + "grad_norm": 0.23422982109973303, + "kl": 0.180267333984375, + "learning_rate": 4.999966431509255e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 4266 + }, + { + "completion_length": 222.05357933044434, + "epoch": 0.7155371138773629, + "grad_norm": 0.2085439923439692, + "kl": 0.128448486328125, + "learning_rate": 4.999965806994005e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 4268 + }, + { + "completion_length": 222.42411994934082, + "epoch": 0.7158724171172304, + "grad_norm": 0.2230259143420745, + "kl": 0.12346649169921875, + "learning_rate": 4.999965176722905e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 4270 + }, + { + "completion_length": 228.90625858306885, + "epoch": 0.7162077203570979, + "grad_norm": 0.14893081654121634, + "kl": 0.11907958984375, + "learning_rate": 4.999964540695959e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 4272 + }, + { + "completion_length": 225.11608123779297, + "epoch": 0.7165430235969655, + "grad_norm": 0.28803787165074113, + "kl": 0.1007537841796875, + "learning_rate": 4.999963898913168e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 4274 + }, + { + "completion_length": 228.2232255935669, + "epoch": 0.7168783268368331, + "grad_norm": 0.3384674104032473, + "kl": 0.105621337890625, + "learning_rate": 4.999963251374533e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 4276 + }, + { + "completion_length": 226.52679634094238, + "epoch": 0.7172136300767006, + "grad_norm": 0.2255294947243617, + "kl": 0.087432861328125, + "learning_rate": 4.999962598080055e-07, + "loss": 0.0001, + "reward": 1.6875000894069672, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6919643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4278 + }, + { + "completion_length": 229.29911708831787, + "epoch": 0.7175489333165682, + "grad_norm": 0.17192694908242845, + "kl": 0.1046295166015625, + "learning_rate": 4.999961939029738e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 1.0, + "step": 4280 + }, + { + "completion_length": 228.3571538925171, + "epoch": 0.7178842365564357, + "grad_norm": 0.2238045079730679, + "kl": 0.1636505126953125, + "learning_rate": 4.999961274223581e-07, + "loss": 0.0002, + "reward": 1.8071428835391998, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 4282 + }, + { + "completion_length": 229.05358028411865, + "epoch": 0.7182195397963033, + "grad_norm": 0.1545800715199412, + "kl": 0.1165924072265625, + "learning_rate": 4.999960603661585e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 4284 + }, + { + "completion_length": 221.13393878936768, + "epoch": 0.7185548430361708, + "grad_norm": 0.16588131399199002, + "kl": 0.10345458984375, + "learning_rate": 4.999959927343754e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 4286 + }, + { + "completion_length": 227.6250114440918, + "epoch": 0.7188901462760384, + "grad_norm": 0.23818363949622876, + "kl": 0.1177215576171875, + "learning_rate": 4.999959245270088e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428921073675, + "rewards/format_reward_func": 1.0, + "step": 4288 + }, + { + "completion_length": 223.75000762939453, + "epoch": 0.719225449515906, + "grad_norm": 0.30625169067460517, + "kl": 0.1202545166015625, + "learning_rate": 4.999958557440589e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4290 + }, + { + "completion_length": 223.0803680419922, + "epoch": 0.7195607527557735, + "grad_norm": 0.3376410563758561, + "kl": 0.11126708984375, + "learning_rate": 4.999957863855257e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857525497675, + "rewards/format_reward_func": 1.0, + "step": 4292 + }, + { + "completion_length": 220.6741180419922, + "epoch": 0.7198960559956411, + "grad_norm": 0.1431025431630177, + "kl": 0.0991058349609375, + "learning_rate": 4.999957164514097e-07, + "loss": 0.0001, + "reward": 1.7571428939700127, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 4294 + }, + { + "completion_length": 218.46429538726807, + "epoch": 0.7202313592355086, + "grad_norm": 0.20456145213274624, + "kl": 0.118072509765625, + "learning_rate": 4.999956459417109e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 4296 + }, + { + "completion_length": 220.79911422729492, + "epoch": 0.7205666624753762, + "grad_norm": 0.34602565657175177, + "kl": 0.10986328125, + "learning_rate": 4.999955748564293e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 4298 + }, + { + "completion_length": 226.42858219146729, + "epoch": 0.7209019657152437, + "grad_norm": 0.30351734013178966, + "kl": 0.1095733642578125, + "learning_rate": 4.999955031955653e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 4300 + }, + { + "completion_length": 224.81697177886963, + "epoch": 0.7212372689551113, + "grad_norm": 0.22631733827458714, + "kl": 0.1202392578125, + "learning_rate": 4.999954309591188e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4302 + }, + { + "completion_length": 222.08929538726807, + "epoch": 0.7215725721949788, + "grad_norm": 0.19482129354219505, + "kl": 0.1111297607421875, + "learning_rate": 4.999953581470903e-07, + "loss": 0.0001, + "reward": 1.81428574770689, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857421189547, + "rewards/format_reward_func": 1.0, + "step": 4304 + }, + { + "completion_length": 228.1116189956665, + "epoch": 0.7219078754348464, + "grad_norm": 0.15421096995458153, + "kl": 0.106353759765625, + "learning_rate": 4.999952847594796e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 4306 + }, + { + "completion_length": 222.49108028411865, + "epoch": 0.722243178674714, + "grad_norm": 0.2832644661349313, + "kl": 0.1219482421875, + "learning_rate": 4.999952107962873e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4308 + }, + { + "completion_length": 228.3526906967163, + "epoch": 0.7225784819145815, + "grad_norm": 0.2739123316787506, + "kl": 0.1038055419921875, + "learning_rate": 4.999951362575131e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.770535746589303, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4310 + }, + { + "completion_length": 224.91965293884277, + "epoch": 0.722913785154449, + "grad_norm": 0.33445050842398866, + "kl": 0.131072998046875, + "learning_rate": 4.999950611431576e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 4312 + }, + { + "completion_length": 226.31697463989258, + "epoch": 0.7232490883943166, + "grad_norm": 0.21341353048079031, + "kl": 0.12420654296875, + "learning_rate": 4.999949854532206e-07, + "loss": 0.0001, + "reward": 1.7160715162754059, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7205357402563095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4314 + }, + { + "completion_length": 229.93304443359375, + "epoch": 0.7235843916341842, + "grad_norm": 0.30601848106418333, + "kl": 0.1174163818359375, + "learning_rate": 4.999949091877026e-07, + "loss": 0.0001, + "reward": 1.7589286640286446, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4316 + }, + { + "completion_length": 232.65179920196533, + "epoch": 0.7239196948740517, + "grad_norm": 0.1687238975285787, + "kl": 0.10968017578125, + "learning_rate": 4.999948323466035e-07, + "loss": 0.0001, + "reward": 1.6839286535978317, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.6883928887546062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4318 + }, + { + "completion_length": 224.08483123779297, + "epoch": 0.7242549981139192, + "grad_norm": 0.004285123511293161, + "kl": 0.11199951171875, + "learning_rate": 4.999947549299238e-07, + "loss": 0.0001, + "reward": 1.782142885029316, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7910714726895094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4320 + }, + { + "completion_length": 224.9196538925171, + "epoch": 0.7245903013537869, + "grad_norm": 0.19576979486681245, + "kl": 0.107696533203125, + "learning_rate": 4.999946769376633e-07, + "loss": 0.0001, + "reward": 1.7357143312692642, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 4322 + }, + { + "completion_length": 221.56697177886963, + "epoch": 0.7249256045936544, + "grad_norm": 0.28962875146992173, + "kl": 0.10064697265625, + "learning_rate": 4.999945983698224e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714715719223, + "rewards/format_reward_func": 1.0, + "step": 4324 + }, + { + "completion_length": 233.17411708831787, + "epoch": 0.7252609078335219, + "grad_norm": 0.46336086157682704, + "kl": 0.10491943359375, + "learning_rate": 4.999945192264013e-07, + "loss": 0.0001, + "reward": 1.7665179446339607, + "reward_std": 0.06755394907668233, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4326 + }, + { + "completion_length": 230.23661613464355, + "epoch": 0.7255962110733895, + "grad_norm": 0.2822536367827795, + "kl": 0.1027679443359375, + "learning_rate": 4.999944395074001e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 4328 + }, + { + "completion_length": 233.15626049041748, + "epoch": 0.7259315143132571, + "grad_norm": 0.19606233734620573, + "kl": 0.0909881591796875, + "learning_rate": 4.99994359212819e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 4330 + }, + { + "completion_length": 229.98215293884277, + "epoch": 0.7262668175531246, + "grad_norm": 0.3140268857004368, + "kl": 0.103546142578125, + "learning_rate": 4.999942783426583e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214648663998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4332 + }, + { + "completion_length": 241.1294765472412, + "epoch": 0.7266021207929921, + "grad_norm": 0.0038540600931844153, + "kl": 0.0908050537109375, + "learning_rate": 4.999941968969179e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 4334 + }, + { + "completion_length": 237.35268783569336, + "epoch": 0.7269374240328598, + "grad_norm": 0.28605535444102753, + "kl": 0.094390869140625, + "learning_rate": 4.999941148755983e-07, + "loss": 0.0001, + "reward": 1.8165179342031479, + "reward_std": 0.0473508988507092, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4336 + }, + { + "completion_length": 248.6830472946167, + "epoch": 0.7272727272727273, + "grad_norm": 0.14850043176738945, + "kl": 0.0977783203125, + "learning_rate": 4.999940322786994e-07, + "loss": 0.0001, + "reward": 1.8232143372297287, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8276785910129547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4338 + }, + { + "completion_length": 244.63840293884277, + "epoch": 0.7276080305125948, + "grad_norm": 0.3221783517308769, + "kl": 0.1132965087890625, + "learning_rate": 4.999939491062217e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 4340 + }, + { + "completion_length": 241.77679634094238, + "epoch": 0.7279433337524623, + "grad_norm": 0.182633691742018, + "kl": 0.111907958984375, + "learning_rate": 4.999938653581652e-07, + "loss": 0.0001, + "reward": 1.7107143625617027, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143122702837, + "rewards/format_reward_func": 1.0, + "step": 4342 + }, + { + "completion_length": 248.05358028411865, + "epoch": 0.72827863699233, + "grad_norm": 0.18817809176508976, + "kl": 0.1070709228515625, + "learning_rate": 4.999937810345301e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571850061417, + "rewards/format_reward_func": 1.0, + "step": 4344 + }, + { + "completion_length": 243.54018878936768, + "epoch": 0.7286139402321975, + "grad_norm": 0.22139500314548327, + "kl": 0.121734619140625, + "learning_rate": 4.999936961353166e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 4346 + }, + { + "completion_length": 238.27233409881592, + "epoch": 0.728949243472065, + "grad_norm": 0.2784243303823706, + "kl": 0.097625732421875, + "learning_rate": 4.999936106605251e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 4348 + }, + { + "completion_length": 243.17858219146729, + "epoch": 0.7292845467119327, + "grad_norm": 0.23200086785651602, + "kl": 0.102752685546875, + "learning_rate": 4.999935246101554e-07, + "loss": 0.0001, + "reward": 1.8214286044239998, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214286081492901, + "rewards/format_reward_func": 1.0, + "step": 4350 + }, + { + "completion_length": 243.65626049041748, + "epoch": 0.7296198499518002, + "grad_norm": 0.2619354559845573, + "kl": 0.103515625, + "learning_rate": 4.99993437984208e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 4352 + }, + { + "completion_length": 244.821439743042, + "epoch": 0.7299551531916677, + "grad_norm": 0.38513346945983323, + "kl": 0.101348876953125, + "learning_rate": 4.99993350782683e-07, + "loss": 0.0001, + "reward": 1.8232143372297287, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.827678594738245, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4354 + }, + { + "completion_length": 229.3884038925171, + "epoch": 0.7302904564315352, + "grad_norm": 0.12321027482845619, + "kl": 0.107513427734375, + "learning_rate": 4.999932630055807e-07, + "loss": 0.0001, + "reward": 1.8250000327825546, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8250000327825546, + "rewards/format_reward_func": 1.0, + "step": 4356 + }, + { + "completion_length": 242.5446548461914, + "epoch": 0.7306257596714029, + "grad_norm": 0.30145710161379585, + "kl": 0.223480224609375, + "learning_rate": 4.99993174652901e-07, + "loss": 0.0002, + "reward": 1.728571504354477, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7375000454485416, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4358 + }, + { + "completion_length": 236.34822273254395, + "epoch": 0.7309610629112704, + "grad_norm": 0.3049816173707025, + "kl": 0.103057861328125, + "learning_rate": 4.999930857246445e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 4360 + }, + { + "completion_length": 245.19644260406494, + "epoch": 0.7312963661511379, + "grad_norm": 0.262340460674097, + "kl": 0.0929107666015625, + "learning_rate": 4.999929962208113e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 4362 + }, + { + "completion_length": 238.6339406967163, + "epoch": 0.7316316693910055, + "grad_norm": 0.21572310070899314, + "kl": 0.097137451171875, + "learning_rate": 4.999929061414014e-07, + "loss": 0.0001, + "reward": 1.7446429058909416, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071838885546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4364 + }, + { + "completion_length": 236.9866180419922, + "epoch": 0.7319669726308731, + "grad_norm": 0.20339241801146096, + "kl": 0.1023712158203125, + "learning_rate": 4.999928154864152e-07, + "loss": 0.0001, + "reward": 1.7558036297559738, + "reward_std": 0.04230013629421592, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4366 + }, + { + "completion_length": 239.3705472946167, + "epoch": 0.7323022758707406, + "grad_norm": 0.055339300543254144, + "kl": 0.09466552734375, + "learning_rate": 4.999927242558527e-07, + "loss": 0.0001, + "reward": 1.719642922282219, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7241071742027998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4368 + }, + { + "completion_length": 237.75001049041748, + "epoch": 0.7326375791106081, + "grad_norm": 0.2988596314576556, + "kl": 0.091400146484375, + "learning_rate": 4.999926324497145e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 4370 + }, + { + "completion_length": 234.7321538925171, + "epoch": 0.7329728823504758, + "grad_norm": 0.12317099081226329, + "kl": 0.095947265625, + "learning_rate": 4.999925400680004e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 4372 + }, + { + "completion_length": 231.85268783569336, + "epoch": 0.7333081855903433, + "grad_norm": 0.2941982386958749, + "kl": 0.09271240234375, + "learning_rate": 4.999924471107108e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 4374 + }, + { + "completion_length": 229.54018783569336, + "epoch": 0.7336434888302108, + "grad_norm": 0.24718352959271161, + "kl": 0.0890960693359375, + "learning_rate": 4.99992353577846e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4376 + }, + { + "completion_length": 237.3794755935669, + "epoch": 0.7339787920700783, + "grad_norm": 0.21844899000323725, + "kl": 0.0857391357421875, + "learning_rate": 4.999922594694059e-07, + "loss": 0.0001, + "reward": 1.7982143312692642, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4378 + }, + { + "completion_length": 228.34822463989258, + "epoch": 0.734314095309946, + "grad_norm": 0.31139955126338215, + "kl": 0.0860137939453125, + "learning_rate": 4.999921647853911e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 4380 + }, + { + "completion_length": 236.80804538726807, + "epoch": 0.7346493985498135, + "grad_norm": 0.25288666916101726, + "kl": 0.0904083251953125, + "learning_rate": 4.999920695258016e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 4382 + }, + { + "completion_length": 237.9509038925171, + "epoch": 0.734984701789681, + "grad_norm": 0.15349708899045258, + "kl": 0.0903472900390625, + "learning_rate": 4.999919736906377e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 4384 + }, + { + "completion_length": 233.35715198516846, + "epoch": 0.7353200050295486, + "grad_norm": 0.20999069531459416, + "kl": 0.0869598388671875, + "learning_rate": 4.999918772798995e-07, + "loss": 0.0001, + "reward": 1.6910715103149414, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.6955357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4386 + }, + { + "completion_length": 225.27679538726807, + "epoch": 0.7356553082694162, + "grad_norm": 0.21644641902809753, + "kl": 0.0892486572265625, + "learning_rate": 4.999917802935875e-07, + "loss": 0.0001, + "reward": 1.7607143744826317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 4388 + }, + { + "completion_length": 242.883939743042, + "epoch": 0.7359906115092837, + "grad_norm": 0.2097634616773623, + "kl": 0.10491943359375, + "learning_rate": 4.999916827317016e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 4390 + }, + { + "completion_length": 234.321439743042, + "epoch": 0.7363259147491512, + "grad_norm": 0.301361903974932, + "kl": 0.0983428955078125, + "learning_rate": 4.99991584594242e-07, + "loss": 0.0001, + "reward": 1.7267857491970062, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7312500327825546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4392 + }, + { + "completion_length": 240.28572463989258, + "epoch": 0.7366612179890188, + "grad_norm": 0.2971735765340939, + "kl": 0.106536865234375, + "learning_rate": 4.999914858812094e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 4394 + }, + { + "completion_length": 236.8035831451416, + "epoch": 0.7369965212288864, + "grad_norm": 0.24862167834729915, + "kl": 0.1034393310546875, + "learning_rate": 4.999913865926035e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4396 + }, + { + "completion_length": 236.5892972946167, + "epoch": 0.7373318244687539, + "grad_norm": 0.17730660559017838, + "kl": 0.1024017333984375, + "learning_rate": 4.999912867284247e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4398 + }, + { + "completion_length": 235.58929538726807, + "epoch": 0.7376671277086215, + "grad_norm": 0.19618760815384542, + "kl": 0.0888214111328125, + "learning_rate": 4.999911862886735e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 4400 + }, + { + "completion_length": 241.71429824829102, + "epoch": 0.738002430948489, + "grad_norm": 0.23361102730300204, + "kl": 0.097503662109375, + "learning_rate": 4.999910852733496e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 0.9821428656578064, + "step": 4402 + }, + { + "completion_length": 230.92858123779297, + "epoch": 0.7383377341883566, + "grad_norm": 0.13161910784321273, + "kl": 0.102325439453125, + "learning_rate": 4.999909836824538e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 4404 + }, + { + "completion_length": 238.383939743042, + "epoch": 0.7386730374282241, + "grad_norm": 0.11006102889865772, + "kl": 0.088531494140625, + "learning_rate": 4.999908815159859e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4406 + }, + { + "completion_length": 242.133939743042, + "epoch": 0.7390083406680917, + "grad_norm": 0.21817612106604028, + "kl": 0.0933990478515625, + "learning_rate": 4.999907787739464e-07, + "loss": 0.0001, + "reward": 1.7571429535746574, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 4408 + }, + { + "completion_length": 240.34375858306885, + "epoch": 0.7393436439079593, + "grad_norm": 0.15413740306942578, + "kl": 0.0780181884765625, + "learning_rate": 4.999906754563354e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4410 + }, + { + "completion_length": 231.25893783569336, + "epoch": 0.7396789471478268, + "grad_norm": 0.14813305726517229, + "kl": 0.089752197265625, + "learning_rate": 4.999905715631532e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 4412 + }, + { + "completion_length": 236.66072273254395, + "epoch": 0.7400142503876944, + "grad_norm": 0.46415342718216857, + "kl": 0.08685302734375, + "learning_rate": 4.999904670943999e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7892857231199741, + "rewards/format_reward_func": 1.0, + "step": 4414 + }, + { + "completion_length": 231.31697463989258, + "epoch": 0.7403495536275619, + "grad_norm": 0.2764112079171355, + "kl": 0.081451416015625, + "learning_rate": 4.999903620500759e-07, + "loss": 0.0001, + "reward": 1.805357202887535, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8098214566707611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4416 + }, + { + "completion_length": 230.25447368621826, + "epoch": 0.7406848568674295, + "grad_norm": 0.11346544928980262, + "kl": 0.0750732421875, + "learning_rate": 4.999902564301815e-07, + "loss": 0.0001, + "reward": 1.8035714626312256, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 4418 + }, + { + "completion_length": 227.22768878936768, + "epoch": 0.741020160107297, + "grad_norm": 0.23948562808508556, + "kl": 0.0700836181640625, + "learning_rate": 4.999901502347168e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 4420 + }, + { + "completion_length": 230.86608409881592, + "epoch": 0.7413554633471646, + "grad_norm": 0.2489915856046395, + "kl": 0.0712127685546875, + "learning_rate": 4.99990043463682e-07, + "loss": 0.0001, + "reward": 1.6915179416537285, + "reward_std": 0.06250318652018905, + "rewards/equation_reward_func": 0.6928571667522192, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4422 + }, + { + "completion_length": 228.29911613464355, + "epoch": 0.7416907665870321, + "grad_norm": 0.1812571795709786, + "kl": 0.077301025390625, + "learning_rate": 4.999899361170775e-07, + "loss": 0.0001, + "reward": 1.7290179431438446, + "reward_std": 0.0599778073374182, + "rewards/equation_reward_func": 0.7348214685916901, + "rewards/format_reward_func": 0.9941964335739613, + "step": 4424 + }, + { + "completion_length": 230.34375858306885, + "epoch": 0.7420260698268997, + "grad_norm": 0.26999021740137746, + "kl": 0.0660858154296875, + "learning_rate": 4.999898281949035e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.708928631618619, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4426 + }, + { + "completion_length": 225.80358028411865, + "epoch": 0.7423613730667673, + "grad_norm": 0.30457239085627186, + "kl": 0.0674896240234375, + "learning_rate": 4.999897196971602e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 4428 + }, + { + "completion_length": 224.40626049041748, + "epoch": 0.7426966763066348, + "grad_norm": 0.21223745143930695, + "kl": 0.0684814453125, + "learning_rate": 4.999896106238479e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 4430 + }, + { + "completion_length": 227.92411708831787, + "epoch": 0.7430319795465024, + "grad_norm": 0.26564997925657297, + "kl": 0.06092071533203125, + "learning_rate": 4.999895009749667e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 4432 + }, + { + "completion_length": 218.03126049041748, + "epoch": 0.7433672827863699, + "grad_norm": 0.23386483907548092, + "kl": 0.068023681640625, + "learning_rate": 4.999893907505172e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 4434 + }, + { + "completion_length": 227.23661518096924, + "epoch": 0.7437025860262375, + "grad_norm": 0.2580465030816322, + "kl": 0.065032958984375, + "learning_rate": 4.999892799504992e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 4436 + }, + { + "completion_length": 218.92411613464355, + "epoch": 0.744037889266105, + "grad_norm": 0.3291190988547193, + "kl": 0.087890625, + "learning_rate": 4.999891685749133e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 4438 + }, + { + "completion_length": 230.83483219146729, + "epoch": 0.7443731925059726, + "grad_norm": 0.20748276719564904, + "kl": 0.0592041015625, + "learning_rate": 4.999890566237597e-07, + "loss": 0.0001, + "reward": 1.8232143446803093, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.827678594738245, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4440 + }, + { + "completion_length": 223.0892972946167, + "epoch": 0.7447084957458402, + "grad_norm": 0.3388534696826737, + "kl": 0.067169189453125, + "learning_rate": 4.999889440970385e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714562982321, + "rewards/format_reward_func": 1.0, + "step": 4442 + }, + { + "completion_length": 222.49554634094238, + "epoch": 0.7450437989857077, + "grad_norm": 0.2706600096126691, + "kl": 0.0620880126953125, + "learning_rate": 4.999888309947501e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4444 + }, + { + "completion_length": 221.94197368621826, + "epoch": 0.7453791022255752, + "grad_norm": 0.24547408044411922, + "kl": 0.06488037109375, + "learning_rate": 4.999887173168947e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 4446 + }, + { + "completion_length": 214.54018783569336, + "epoch": 0.7457144054654428, + "grad_norm": 0.1853125626531355, + "kl": 0.0753631591796875, + "learning_rate": 4.999886030634727e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 4448 + }, + { + "completion_length": 221.7053680419922, + "epoch": 0.7460497087053104, + "grad_norm": 0.19931767396673297, + "kl": 0.0742950439453125, + "learning_rate": 4.999884882344842e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 4450 + }, + { + "completion_length": 217.12947273254395, + "epoch": 0.7463850119451779, + "grad_norm": 0.1636954950166591, + "kl": 0.06884765625, + "learning_rate": 4.999883728299294e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 4452 + }, + { + "completion_length": 215.13840103149414, + "epoch": 0.7467203151850454, + "grad_norm": 0.2250329654524265, + "kl": 0.07806396484375, + "learning_rate": 4.999882568498087e-07, + "loss": 0.0001, + "reward": 1.7107143476605415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143308967352, + "rewards/format_reward_func": 1.0, + "step": 4454 + }, + { + "completion_length": 213.21429443359375, + "epoch": 0.7470556184249131, + "grad_norm": 0.2505331278808723, + "kl": 0.0777130126953125, + "learning_rate": 4.999881402941225e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.792857164517045, + "rewards/format_reward_func": 1.0, + "step": 4456 + }, + { + "completion_length": 222.19197463989258, + "epoch": 0.7473909216647806, + "grad_norm": 0.16466956778998829, + "kl": 0.0843505859375, + "learning_rate": 4.999880231628707e-07, + "loss": 0.0001, + "reward": 1.74910718947649, + "reward_std": 0.05177031829953194, + "rewards/equation_reward_func": 0.7553571872413158, + "rewards/format_reward_func": 0.9937500059604645, + "step": 4458 + }, + { + "completion_length": 221.11161613464355, + "epoch": 0.7477262249046481, + "grad_norm": 0.2396358314615966, + "kl": 0.0821380615234375, + "learning_rate": 4.999879054560539e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 4460 + }, + { + "completion_length": 227.1651906967163, + "epoch": 0.7480615281445157, + "grad_norm": 0.33908675843157704, + "kl": 0.0939178466796875, + "learning_rate": 4.999877871736723e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 4462 + }, + { + "completion_length": 216.33929538726807, + "epoch": 0.7483968313843833, + "grad_norm": 0.14003622530992152, + "kl": 0.09588623046875, + "learning_rate": 4.999876683157261e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 4464 + }, + { + "completion_length": 218.0446538925171, + "epoch": 0.7487321346242508, + "grad_norm": 0.2560114087268138, + "kl": 0.09271240234375, + "learning_rate": 4.999875488822155e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 4466 + }, + { + "completion_length": 224.3437614440918, + "epoch": 0.7490674378641183, + "grad_norm": 0.31758820202141924, + "kl": 0.07741546630859375, + "learning_rate": 4.999874288731409e-07, + "loss": 0.0001, + "reward": 1.7232143729925156, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7276786118745804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4468 + }, + { + "completion_length": 213.69643878936768, + "epoch": 0.749402741103986, + "grad_norm": 0.44047391205664077, + "kl": 0.089691162109375, + "learning_rate": 4.999873082885027e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857387661934, + "rewards/format_reward_func": 1.0, + "step": 4470 + }, + { + "completion_length": 207.86608123779297, + "epoch": 0.7497380443438535, + "grad_norm": 0.4861281163334542, + "kl": 0.09527587890625, + "learning_rate": 4.999871871283008e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 4472 + }, + { + "completion_length": 209.946439743042, + "epoch": 0.750073347583721, + "grad_norm": 0.19385061512985016, + "kl": 0.1087646484375, + "learning_rate": 4.999870653925359e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 4474 + }, + { + "completion_length": 214.26786518096924, + "epoch": 0.7504086508235885, + "grad_norm": 0.15678270263892813, + "kl": 0.098419189453125, + "learning_rate": 4.99986943081208e-07, + "loss": 0.0001, + "reward": 1.737500086426735, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.741964315995574, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4476 + }, + { + "completion_length": 203.92411613464355, + "epoch": 0.7507439540634562, + "grad_norm": 0.2732266849605991, + "kl": 0.11285400390625, + "learning_rate": 4.999868201943175e-07, + "loss": 0.0001, + "reward": 1.7089286595582962, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7133928798139095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4478 + }, + { + "completion_length": 215.11161613464355, + "epoch": 0.7510792573033237, + "grad_norm": 0.17807125877809504, + "kl": 0.111358642578125, + "learning_rate": 4.999866967318645e-07, + "loss": 0.0001, + "reward": 1.7071429342031479, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7071428913623095, + "rewards/format_reward_func": 1.0, + "step": 4480 + }, + { + "completion_length": 216.14286708831787, + "epoch": 0.7514145605431912, + "grad_norm": 0.25479605394237403, + "kl": 0.1113739013671875, + "learning_rate": 4.999865726938497e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 4482 + }, + { + "completion_length": 208.52233028411865, + "epoch": 0.7517498637830589, + "grad_norm": 0.3540568480128976, + "kl": 0.128082275390625, + "learning_rate": 4.999864480802729e-07, + "loss": 0.0001, + "reward": 1.7142858281731606, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.7232143059372902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4484 + }, + { + "completion_length": 222.41072273254395, + "epoch": 0.7520851670229264, + "grad_norm": 0.28369640416103614, + "kl": 0.104766845703125, + "learning_rate": 4.999863228911347e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7294643130153418, + "rewards/format_reward_func": 0.9919642955064774, + "step": 4486 + }, + { + "completion_length": 213.75893783569336, + "epoch": 0.7524204702627939, + "grad_norm": 0.0049135323066809514, + "kl": 0.1141357421875, + "learning_rate": 4.999861971264353e-07, + "loss": 0.0001, + "reward": 1.6982143744826317, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7026786096394062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4488 + }, + { + "completion_length": 221.0803680419922, + "epoch": 0.7527557735026614, + "grad_norm": 0.20714617745583425, + "kl": 0.104827880859375, + "learning_rate": 4.999860707861751e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 4490 + }, + { + "completion_length": 216.79911708831787, + "epoch": 0.7530910767425291, + "grad_norm": 0.3164850402145241, + "kl": 0.0961151123046875, + "learning_rate": 4.999859438703541e-07, + "loss": 0.0001, + "reward": 1.8017857670783997, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4492 + }, + { + "completion_length": 216.8303680419922, + "epoch": 0.7534263799823966, + "grad_norm": 0.41792000510914323, + "kl": 0.1044158935546875, + "learning_rate": 4.999858163789728e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7776785902678967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4494 + }, + { + "completion_length": 220.0446548461914, + "epoch": 0.7537616832222641, + "grad_norm": 0.3411173294484826, + "kl": 0.0836029052734375, + "learning_rate": 4.999856883120314e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 4496 + }, + { + "completion_length": 220.37500858306885, + "epoch": 0.7540969864621316, + "grad_norm": 0.2682306203706297, + "kl": 0.0904388427734375, + "learning_rate": 4.999855596695304e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000316649675, + "rewards/format_reward_func": 1.0, + "step": 4498 + }, + { + "completion_length": 219.66072273254395, + "epoch": 0.7544322897019993, + "grad_norm": 0.25123051432297105, + "kl": 0.0814666748046875, + "learning_rate": 4.999854304514699e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4500 + }, + { + "completion_length": 221.74554538726807, + "epoch": 0.7547675929418668, + "grad_norm": 0.25857901554901946, + "kl": 0.0801544189453125, + "learning_rate": 4.999853006578503e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 4502 + }, + { + "completion_length": 227.44643878936768, + "epoch": 0.7551028961817343, + "grad_norm": 0.37851301366513695, + "kl": 0.092071533203125, + "learning_rate": 4.999851702886719e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7455357555299997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4504 + }, + { + "completion_length": 225.27233219146729, + "epoch": 0.755438199421602, + "grad_norm": 0.2513086311013245, + "kl": 0.0858306884765625, + "learning_rate": 4.999850393439348e-07, + "loss": 0.0001, + "reward": 1.7035714983940125, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714853554964, + "rewards/format_reward_func": 1.0, + "step": 4506 + }, + { + "completion_length": 226.72768688201904, + "epoch": 0.7557735026614695, + "grad_norm": 0.3411021833760753, + "kl": 0.0855865478515625, + "learning_rate": 4.999849078236395e-07, + "loss": 0.0001, + "reward": 1.7035715132951736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7035714630037546, + "rewards/format_reward_func": 1.0, + "step": 4508 + }, + { + "completion_length": 223.00001049041748, + "epoch": 0.756108805901337, + "grad_norm": 0.2224263673764874, + "kl": 0.09161376953125, + "learning_rate": 4.999847757277862e-07, + "loss": 0.0001, + "reward": 1.8500000089406967, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8500000238418579, + "rewards/format_reward_func": 1.0, + "step": 4510 + }, + { + "completion_length": 233.04911708831787, + "epoch": 0.7564441091412045, + "grad_norm": 0.003793286746061747, + "kl": 0.0831298828125, + "learning_rate": 4.999846430563753e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 4512 + }, + { + "completion_length": 226.94197368621826, + "epoch": 0.7567794123810722, + "grad_norm": 0.27291864041588626, + "kl": 0.0889892578125, + "learning_rate": 4.999845098094071e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 4514 + }, + { + "completion_length": 224.39733219146729, + "epoch": 0.7571147156209397, + "grad_norm": 0.22265584848231698, + "kl": 0.087921142578125, + "learning_rate": 4.999843759868818e-07, + "loss": 0.0001, + "reward": 1.7589286118745804, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4516 + }, + { + "completion_length": 228.15179538726807, + "epoch": 0.7574500188608072, + "grad_norm": 0.20859738186600213, + "kl": 0.08905029296875, + "learning_rate": 4.999842415887999e-07, + "loss": 0.0001, + "reward": 1.8214286118745804, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 4518 + }, + { + "completion_length": 235.78125858306885, + "epoch": 0.7577853221006748, + "grad_norm": 0.2778610372869086, + "kl": 0.10809326171875, + "learning_rate": 4.999841066151615e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 4520 + }, + { + "completion_length": 222.90625858306885, + "epoch": 0.7581206253405424, + "grad_norm": 0.47212251372452274, + "kl": 0.087066650390625, + "learning_rate": 4.99983971065967e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.763392886146903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4522 + }, + { + "completion_length": 226.86161613464355, + "epoch": 0.7584559285804099, + "grad_norm": 0.17318141957091432, + "kl": 0.08984375, + "learning_rate": 4.999838349412168e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143357396126, + "rewards/format_reward_func": 1.0, + "step": 4524 + }, + { + "completion_length": 235.1741180419922, + "epoch": 0.7587912318202774, + "grad_norm": 0.2131832104092294, + "kl": 0.0755767822265625, + "learning_rate": 4.99983698240911e-07, + "loss": 0.0001, + "reward": 1.7290179133415222, + "reward_std": 0.04987628059461713, + "rewards/equation_reward_func": 0.7348214574158192, + "rewards/format_reward_func": 0.9941964335739613, + "step": 4526 + }, + { + "completion_length": 236.27233123779297, + "epoch": 0.759126535060145, + "grad_norm": 0.13201408256364378, + "kl": 0.06096649169921875, + "learning_rate": 4.999835609650501e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 4528 + }, + { + "completion_length": 240.81697273254395, + "epoch": 0.7594618383000126, + "grad_norm": 0.21052478496355254, + "kl": 0.0613861083984375, + "learning_rate": 4.999834231136344e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 4530 + }, + { + "completion_length": 241.55358219146729, + "epoch": 0.7597971415398801, + "grad_norm": 0.13556321579045813, + "kl": 0.058197021484375, + "learning_rate": 4.999832846866641e-07, + "loss": 0.0001, + "reward": 1.7843750566244125, + "reward_std": 0.04230013629421592, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4532 + }, + { + "completion_length": 227.63840579986572, + "epoch": 0.7601324447797477, + "grad_norm": 0.23882174356311658, + "kl": 0.069305419921875, + "learning_rate": 4.999831456841395e-07, + "loss": 0.0001, + "reward": 1.8125000670552254, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8169643171131611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4534 + }, + { + "completion_length": 237.0134048461914, + "epoch": 0.7604677480196153, + "grad_norm": 0.18469807951411582, + "kl": 0.059722900390625, + "learning_rate": 4.999830061060613e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 4536 + }, + { + "completion_length": 233.2991180419922, + "epoch": 0.7608030512594828, + "grad_norm": 0.23399028758696575, + "kl": 0.05745697021484375, + "learning_rate": 4.999828659524293e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714562982321, + "rewards/format_reward_func": 1.0, + "step": 4538 + }, + { + "completion_length": 239.99108219146729, + "epoch": 0.7611383544993503, + "grad_norm": 0.18787150584745277, + "kl": 0.0531463623046875, + "learning_rate": 4.999827252232441e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428883820772, + "rewards/format_reward_func": 1.0, + "step": 4540 + }, + { + "completion_length": 231.78125953674316, + "epoch": 0.7614736577392179, + "grad_norm": 0.3122036679038995, + "kl": 0.05462646484375, + "learning_rate": 4.99982583918506e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276786062866449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4542 + }, + { + "completion_length": 237.63393878936768, + "epoch": 0.7618089609790855, + "grad_norm": 0.3230807616701815, + "kl": 0.0553741455078125, + "learning_rate": 4.999824420382153e-07, + "loss": 0.0001, + "reward": 1.7285715267062187, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714484751225, + "rewards/format_reward_func": 1.0, + "step": 4544 + }, + { + "completion_length": 236.14286994934082, + "epoch": 0.762144264218953, + "grad_norm": 0.27987783469978955, + "kl": 0.0552215576171875, + "learning_rate": 4.999822995823724e-07, + "loss": 0.0001, + "reward": 1.7839286550879478, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4546 + }, + { + "completion_length": 235.84822368621826, + "epoch": 0.7624795674588206, + "grad_norm": 0.297388880050552, + "kl": 0.0589752197265625, + "learning_rate": 4.999821565509774e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857503145933, + "rewards/format_reward_func": 1.0, + "step": 4548 + }, + { + "completion_length": 227.46429538726807, + "epoch": 0.7628148706986881, + "grad_norm": 0.18228938587993593, + "kl": 0.0537109375, + "learning_rate": 4.999820129440309e-07, + "loss": 0.0001, + "reward": 1.7593750804662704, + "reward_std": 0.02714784862473607, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4550 + }, + { + "completion_length": 238.2500114440918, + "epoch": 0.7631501739385557, + "grad_norm": 0.29408530140054356, + "kl": 0.0583953857421875, + "learning_rate": 4.999818687615332e-07, + "loss": 0.0001, + "reward": 1.7379465028643608, + "reward_std": 0.037249373737722635, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4552 + }, + { + "completion_length": 237.05358505249023, + "epoch": 0.7634854771784232, + "grad_norm": 0.21134377845046912, + "kl": 0.0565338134765625, + "learning_rate": 4.999817240034843e-07, + "loss": 0.0001, + "reward": 1.7964286506175995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 4554 + }, + { + "completion_length": 237.1785831451416, + "epoch": 0.7638207804182908, + "grad_norm": 0.3860091198419233, + "kl": 0.0579986572265625, + "learning_rate": 4.99981578669885e-07, + "loss": 0.0001, + "reward": 1.7107143625617027, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7107143253087997, + "rewards/format_reward_func": 1.0, + "step": 4556 + }, + { + "completion_length": 233.02233219146729, + "epoch": 0.7641560836581583, + "grad_norm": 0.22527084686531942, + "kl": 0.05401611328125, + "learning_rate": 4.999814327607354e-07, + "loss": 0.0001, + "reward": 1.8196429163217545, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.8241071775555611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4558 + }, + { + "completion_length": 235.0357255935669, + "epoch": 0.7644913868980259, + "grad_norm": 0.20327881479355508, + "kl": 0.0784149169921875, + "learning_rate": 4.999812862760358e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 4560 + }, + { + "completion_length": 246.8794765472412, + "epoch": 0.7648266901378935, + "grad_norm": 0.27998225865935156, + "kl": 0.05908203125, + "learning_rate": 4.999811392157866e-07, + "loss": 0.0001, + "reward": 1.7178572118282318, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7267857566475868, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4562 + }, + { + "completion_length": 230.05804634094238, + "epoch": 0.765161993377761, + "grad_norm": 0.22216394342281265, + "kl": 0.0649871826171875, + "learning_rate": 4.999809915799882e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4564 + }, + { + "completion_length": 238.883939743042, + "epoch": 0.7654972966176286, + "grad_norm": 0.31322171637204343, + "kl": 0.060089111328125, + "learning_rate": 4.999808433686408e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 4566 + }, + { + "completion_length": 233.1607265472412, + "epoch": 0.7658325998574961, + "grad_norm": 0.2450290221357226, + "kl": 0.0533905029296875, + "learning_rate": 4.999806945817447e-07, + "loss": 0.0001, + "reward": 1.7910714820027351, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357506871223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4568 + }, + { + "completion_length": 241.15626049041748, + "epoch": 0.7661679030973637, + "grad_norm": 0.189232939152715, + "kl": 0.050811767578125, + "learning_rate": 4.999805452193006e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 4570 + }, + { + "completion_length": 234.21429634094238, + "epoch": 0.7665032063372312, + "grad_norm": 0.2246744520966242, + "kl": 0.05870819091796875, + "learning_rate": 4.999803952813084e-07, + "loss": 0.0001, + "reward": 1.6964286491274834, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6964286137372255, + "rewards/format_reward_func": 1.0, + "step": 4572 + }, + { + "completion_length": 232.60268783569336, + "epoch": 0.7668385095770988, + "grad_norm": 0.3987041317522369, + "kl": 0.05547332763671875, + "learning_rate": 4.999802447677688e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 4574 + }, + { + "completion_length": 233.89733123779297, + "epoch": 0.7671738128169664, + "grad_norm": 0.285725683188748, + "kl": 0.0704803466796875, + "learning_rate": 4.999800936786818e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 4576 + }, + { + "completion_length": 226.71429538726807, + "epoch": 0.7675091160568339, + "grad_norm": 0.13415930213725472, + "kl": 0.0756378173828125, + "learning_rate": 4.999799420140481e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4578 + }, + { + "completion_length": 223.12054824829102, + "epoch": 0.7678444192967014, + "grad_norm": 0.26553482124859545, + "kl": 0.0814056396484375, + "learning_rate": 4.999797897738679e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 4580 + }, + { + "completion_length": 227.18750858306885, + "epoch": 0.768179722536569, + "grad_norm": 0.25025503420441153, + "kl": 0.0826568603515625, + "learning_rate": 4.999796369581414e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 4582 + }, + { + "completion_length": 227.69643783569336, + "epoch": 0.7685150257764366, + "grad_norm": 0.24404022688359478, + "kl": 0.0773773193359375, + "learning_rate": 4.999794835668692e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 4584 + }, + { + "completion_length": 230.09375953674316, + "epoch": 0.7688503290163041, + "grad_norm": 0.29632477108847594, + "kl": 0.0727691650390625, + "learning_rate": 4.999793296000515e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428947150707, + "rewards/format_reward_func": 1.0, + "step": 4586 + }, + { + "completion_length": 227.6205472946167, + "epoch": 0.7691856322561716, + "grad_norm": 0.13950592949953058, + "kl": 0.0792388916015625, + "learning_rate": 4.999791750576887e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 4588 + }, + { + "completion_length": 226.1071548461914, + "epoch": 0.7695209354960393, + "grad_norm": 0.16795992322820116, + "kl": 0.0785980224609375, + "learning_rate": 4.999790199397813e-07, + "loss": 0.0001, + "reward": 1.7892857566475868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857436090708, + "rewards/format_reward_func": 1.0, + "step": 4590 + }, + { + "completion_length": 233.9732255935669, + "epoch": 0.7698562387359068, + "grad_norm": 0.2591469522626041, + "kl": 0.0863189697265625, + "learning_rate": 4.999788642463293e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 4592 + }, + { + "completion_length": 230.99108219146729, + "epoch": 0.7701915419757743, + "grad_norm": 0.18624617499016574, + "kl": 0.083465576171875, + "learning_rate": 4.999787079773333e-07, + "loss": 0.0001, + "reward": 1.710714377462864, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 1.0, + "step": 4594 + }, + { + "completion_length": 226.89733123779297, + "epoch": 0.7705268452156419, + "grad_norm": 0.32854213372203656, + "kl": 0.06622314453125, + "learning_rate": 4.999785511327936e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 4596 + }, + { + "completion_length": 228.39733123779297, + "epoch": 0.7708621484555095, + "grad_norm": 0.27153695010869333, + "kl": 0.080352783203125, + "learning_rate": 4.999783937127107e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 4598 + }, + { + "completion_length": 229.54018878936768, + "epoch": 0.771197451695377, + "grad_norm": 0.20648485712931536, + "kl": 0.068572998046875, + "learning_rate": 4.999782357170849e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571734577417, + "rewards/format_reward_func": 1.0, + "step": 4600 + }, + { + "completion_length": 221.12947463989258, + "epoch": 0.7715327549352445, + "grad_norm": 0.2005184739035029, + "kl": 0.070556640625, + "learning_rate": 4.999780771459164e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.814285758882761, + "rewards/format_reward_func": 1.0, + "step": 4602 + }, + { + "completion_length": 238.4553689956665, + "epoch": 0.7718680581751122, + "grad_norm": 0.31052239666651466, + "kl": 0.065032958984375, + "learning_rate": 4.999779179992057e-07, + "loss": 0.0001, + "reward": 1.7808036282658577, + "reward_std": 0.037249373737722635, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4604 + }, + { + "completion_length": 238.0089406967163, + "epoch": 0.7722033614149797, + "grad_norm": 0.1775028297389513, + "kl": 0.071075439453125, + "learning_rate": 4.999777582769532e-07, + "loss": 0.0001, + "reward": 1.7107143849134445, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 1.0, + "step": 4606 + }, + { + "completion_length": 240.54465293884277, + "epoch": 0.7725386646548472, + "grad_norm": 0.4550589791379301, + "kl": 0.0648345947265625, + "learning_rate": 4.999775979791591e-07, + "loss": 0.0001, + "reward": 1.7071429565548897, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7071429006755352, + "rewards/format_reward_func": 1.0, + "step": 4608 + }, + { + "completion_length": 235.96875953674316, + "epoch": 0.7728739678947147, + "grad_norm": 0.2854396965216469, + "kl": 0.067779541015625, + "learning_rate": 4.999774371058239e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857596278191, + "rewards/format_reward_func": 1.0, + "step": 4610 + }, + { + "completion_length": 233.13393878936768, + "epoch": 0.7732092711345824, + "grad_norm": 0.3207019572513585, + "kl": 0.085296630859375, + "learning_rate": 4.999772756569482e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000573694706, + "rewards/format_reward_func": 1.0, + "step": 4612 + }, + { + "completion_length": 236.45983219146729, + "epoch": 0.7735445743744499, + "grad_norm": 0.3600593221437672, + "kl": 0.0930633544921875, + "learning_rate": 4.999771136325318e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4614 + }, + { + "completion_length": 242.47322463989258, + "epoch": 0.7738798776143174, + "grad_norm": 0.4511198912935247, + "kl": 0.069305419921875, + "learning_rate": 4.999769510325756e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071838885546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4616 + }, + { + "completion_length": 235.93304634094238, + "epoch": 0.7742151808541851, + "grad_norm": 0.26591008144932465, + "kl": 0.0665130615234375, + "learning_rate": 4.999767878570797e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 4618 + }, + { + "completion_length": 245.4062614440918, + "epoch": 0.7745504840940526, + "grad_norm": 0.2758105569697166, + "kl": 0.062347412109375, + "learning_rate": 4.999766241060446e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857540398836, + "rewards/format_reward_func": 1.0, + "step": 4620 + }, + { + "completion_length": 239.96429443359375, + "epoch": 0.7748857873339201, + "grad_norm": 0.1850762925480341, + "kl": 0.0596771240234375, + "learning_rate": 4.999764597794706e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 4622 + }, + { + "completion_length": 233.30358219146729, + "epoch": 0.7752210905737876, + "grad_norm": 0.2923727962502333, + "kl": 0.0526580810546875, + "learning_rate": 4.999762948773581e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 4624 + }, + { + "completion_length": 237.18750953674316, + "epoch": 0.7755563938136553, + "grad_norm": 0.29674990766493126, + "kl": 0.0712127685546875, + "learning_rate": 4.999761293997074e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357573926449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4626 + }, + { + "completion_length": 243.34376335144043, + "epoch": 0.7758916970535228, + "grad_norm": 0.12933399523501038, + "kl": 0.065521240234375, + "learning_rate": 4.999759633465191e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 4628 + }, + { + "completion_length": 243.59375953674316, + "epoch": 0.7762270002933903, + "grad_norm": 0.3046586914770116, + "kl": 0.0637664794921875, + "learning_rate": 4.999757967177934e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 4630 + }, + { + "completion_length": 242.4151906967163, + "epoch": 0.7765623035332578, + "grad_norm": 0.18912232830456813, + "kl": 0.056549072265625, + "learning_rate": 4.999756295135306e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 4632 + }, + { + "completion_length": 251.13393783569336, + "epoch": 0.7768976067731255, + "grad_norm": 0.45607946377712877, + "kl": 0.0615692138671875, + "learning_rate": 4.999754617337315e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 4634 + }, + { + "completion_length": 254.1919765472412, + "epoch": 0.777232910012993, + "grad_norm": 0.20415688926947373, + "kl": 0.0582427978515625, + "learning_rate": 4.99975293378396e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4636 + }, + { + "completion_length": 258.8482265472412, + "epoch": 0.7775682132528605, + "grad_norm": 0.35244348616472587, + "kl": 0.06280517578125, + "learning_rate": 4.999751244475247e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.741964302957058, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4638 + }, + { + "completion_length": 242.8080472946167, + "epoch": 0.7779035164927282, + "grad_norm": 0.1982457603260857, + "kl": 0.0550079345703125, + "learning_rate": 4.99974954941118e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 4640 + }, + { + "completion_length": 244.2142972946167, + "epoch": 0.7782388197325957, + "grad_norm": 0.31280826798701716, + "kl": 0.0586395263671875, + "learning_rate": 4.999747848591763e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 4642 + }, + { + "completion_length": 245.86608219146729, + "epoch": 0.7785741229724632, + "grad_norm": 0.19968901429705566, + "kl": 0.08355712890625, + "learning_rate": 4.999746142017e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 4644 + }, + { + "completion_length": 250.78126525878906, + "epoch": 0.7789094262123307, + "grad_norm": 0.305616407661564, + "kl": 0.063568115234375, + "learning_rate": 4.999744429686894e-07, + "loss": 0.0001, + "reward": 1.7875000685453415, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4646 + }, + { + "completion_length": 250.68304824829102, + "epoch": 0.7792447294521984, + "grad_norm": 0.20280940768286793, + "kl": 0.0625, + "learning_rate": 4.99974271160145e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 4648 + }, + { + "completion_length": 249.08929920196533, + "epoch": 0.7795800326920659, + "grad_norm": 0.26340091091539314, + "kl": 0.0640411376953125, + "learning_rate": 4.999740987760671e-07, + "loss": 0.0001, + "reward": 1.7504464983940125, + "reward_std": 0.05997780757024884, + "rewards/equation_reward_func": 0.7651786133646965, + "rewards/format_reward_func": 0.9852678664028645, + "step": 4650 + }, + { + "completion_length": 248.95090579986572, + "epoch": 0.7799153359319334, + "grad_norm": 0.3629256277036554, + "kl": 0.0641937255859375, + "learning_rate": 4.999739258164561e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 4652 + }, + { + "completion_length": 243.58929538726807, + "epoch": 0.780250639171801, + "grad_norm": 0.1749645065771544, + "kl": 0.0629425048828125, + "learning_rate": 4.999737522813124e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571589291096, + "rewards/format_reward_func": 1.0, + "step": 4654 + }, + { + "completion_length": 250.03125953674316, + "epoch": 0.7805859424116686, + "grad_norm": 0.5312740210004152, + "kl": 0.0813446044921875, + "learning_rate": 4.999735781706365e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7910714615136385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4656 + }, + { + "completion_length": 248.3616189956665, + "epoch": 0.7809212456515361, + "grad_norm": 0.1763252818665863, + "kl": 0.0592498779296875, + "learning_rate": 4.999734034844289e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 4658 + }, + { + "completion_length": 248.52233028411865, + "epoch": 0.7812565488914036, + "grad_norm": 0.27396564397788564, + "kl": 0.080841064453125, + "learning_rate": 4.999732282226896e-07, + "loss": 0.0001, + "reward": 1.7160715237259865, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4660 + }, + { + "completion_length": 249.57590579986572, + "epoch": 0.7815918521312712, + "grad_norm": 0.29187870403616006, + "kl": 0.067626953125, + "learning_rate": 4.999730523854194e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7589286006987095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4662 + }, + { + "completion_length": 244.07143878936768, + "epoch": 0.7819271553711388, + "grad_norm": 0.2820194000485553, + "kl": 0.0643157958984375, + "learning_rate": 4.999728759726185e-07, + "loss": 0.0001, + "reward": 1.708928644657135, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4664 + }, + { + "completion_length": 246.00001049041748, + "epoch": 0.7822624586110063, + "grad_norm": 0.21389940795165543, + "kl": 0.0571746826171875, + "learning_rate": 4.999726989842874e-07, + "loss": 0.0001, + "reward": 1.8267857655882835, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8312500230967999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4666 + }, + { + "completion_length": 250.70536994934082, + "epoch": 0.7825977618508739, + "grad_norm": 0.20641362672396715, + "kl": 0.0699310302734375, + "learning_rate": 4.999725214204263e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4668 + }, + { + "completion_length": 249.31697368621826, + "epoch": 0.7829330650907415, + "grad_norm": 0.25385673762727295, + "kl": 0.059173583984375, + "learning_rate": 4.999723432810359e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 4670 + }, + { + "completion_length": 246.61608505249023, + "epoch": 0.783268368330609, + "grad_norm": 0.2611840826725265, + "kl": 0.0884552001953125, + "learning_rate": 4.999721645661165e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4672 + }, + { + "completion_length": 250.42411994934082, + "epoch": 0.7836036715704765, + "grad_norm": 0.6915758002266459, + "kl": 0.102630615234375, + "learning_rate": 4.999719852756685e-07, + "loss": 0.0001, + "reward": 1.7857143729925156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143022119999, + "rewards/format_reward_func": 1.0, + "step": 4674 + }, + { + "completion_length": 244.9375114440918, + "epoch": 0.7839389748103441, + "grad_norm": 0.2730890094350734, + "kl": 0.0650482177734375, + "learning_rate": 4.999718054096922e-07, + "loss": 0.0001, + "reward": 1.7580358013510704, + "reward_std": 0.059346460737288, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9982142895460129, + "step": 4676 + }, + { + "completion_length": 252.31251430511475, + "epoch": 0.7842742780502117, + "grad_norm": 0.21485735246991344, + "kl": 0.05242919921875, + "learning_rate": 4.999716249681883e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 4678 + }, + { + "completion_length": 242.25447463989258, + "epoch": 0.7846095812900792, + "grad_norm": 0.20691538510174362, + "kl": 0.0582122802734375, + "learning_rate": 4.999714439511568e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 4680 + }, + { + "completion_length": 253.27679824829102, + "epoch": 0.7849448845299468, + "grad_norm": 0.21287063557356575, + "kl": 0.05328369140625, + "learning_rate": 4.999712623585985e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4682 + }, + { + "completion_length": 261.8303689956665, + "epoch": 0.7852801877698143, + "grad_norm": 0.3163254069693305, + "kl": 0.052947998046875, + "learning_rate": 4.999710801905137e-07, + "loss": 0.0001, + "reward": 1.7553571835160255, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4684 + }, + { + "completion_length": 255.2634048461914, + "epoch": 0.7856154910096819, + "grad_norm": 0.13242133409732526, + "kl": 0.04907989501953125, + "learning_rate": 4.999708974469028e-07, + "loss": 0.0, + "reward": 1.7357143685221672, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7446428760886192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4686 + }, + { + "completion_length": 261.0491189956665, + "epoch": 0.7859507942495494, + "grad_norm": 0.30653420990937486, + "kl": 0.0510711669921875, + "learning_rate": 4.999707141277662e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 4688 + }, + { + "completion_length": 248.20090007781982, + "epoch": 0.786286097489417, + "grad_norm": 0.1144669042372897, + "kl": 0.04682159423828125, + "learning_rate": 4.999705302331042e-07, + "loss": 0.0, + "reward": 1.7392857819795609, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 4690 + }, + { + "completion_length": 249.48215579986572, + "epoch": 0.7866214007292845, + "grad_norm": 0.10345151057283293, + "kl": 0.0513916015625, + "learning_rate": 4.999703457629175e-07, + "loss": 0.0001, + "reward": 1.832142911851406, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321428708732128, + "rewards/format_reward_func": 1.0, + "step": 4692 + }, + { + "completion_length": 250.82590293884277, + "epoch": 0.7869567039691521, + "grad_norm": 0.3815717918348069, + "kl": 0.056793212890625, + "learning_rate": 4.999701607172063e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 4694 + }, + { + "completion_length": 244.66072750091553, + "epoch": 0.7872920072090197, + "grad_norm": 0.21475385548445075, + "kl": 0.06305694580078125, + "learning_rate": 4.999699750959712e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 4696 + }, + { + "completion_length": 244.7321548461914, + "epoch": 0.7876273104488872, + "grad_norm": 0.30145265431663126, + "kl": 0.0600738525390625, + "learning_rate": 4.999697888992124e-07, + "loss": 0.0001, + "reward": 1.739732213318348, + "reward_std": 0.05745242489501834, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9950892888009548, + "step": 4698 + }, + { + "completion_length": 248.17411708831787, + "epoch": 0.7879626136887548, + "grad_norm": 0.167633043860595, + "kl": 0.04900360107421875, + "learning_rate": 4.999696021269305e-07, + "loss": 0.0, + "reward": 1.7285714969038963, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7285714615136385, + "rewards/format_reward_func": 1.0, + "step": 4700 + }, + { + "completion_length": 244.21429538726807, + "epoch": 0.7882979169286223, + "grad_norm": 0.18431015600634554, + "kl": 0.069671630859375, + "learning_rate": 4.99969414779126e-07, + "loss": 0.0001, + "reward": 1.7928571850061417, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 4702 + }, + { + "completion_length": 243.78126049041748, + "epoch": 0.7886332201684899, + "grad_norm": 0.15342521106660034, + "kl": 0.0619964599609375, + "learning_rate": 4.999692268557992e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 4704 + }, + { + "completion_length": 244.00001335144043, + "epoch": 0.7889685234083574, + "grad_norm": 0.2383847607881836, + "kl": 0.06097412109375, + "learning_rate": 4.999690383569505e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 4706 + }, + { + "completion_length": 243.5759038925171, + "epoch": 0.789303826648225, + "grad_norm": 0.3709918619220796, + "kl": 0.05507659912109375, + "learning_rate": 4.999688492825803e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.7839285992085934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4708 + }, + { + "completion_length": 237.4776906967163, + "epoch": 0.7896391298880926, + "grad_norm": 0.13338077156805117, + "kl": 0.04788970947265625, + "learning_rate": 4.999686596326894e-07, + "loss": 0.0, + "reward": 1.7803572118282318, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7848214358091354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4710 + }, + { + "completion_length": 250.5625123977661, + "epoch": 0.7899744331279601, + "grad_norm": 0.34551931468587194, + "kl": 0.0578155517578125, + "learning_rate": 4.999684694072776e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285980910063, + "rewards/format_reward_func": 1.0, + "step": 4712 + }, + { + "completion_length": 241.7053680419922, + "epoch": 0.7903097363678276, + "grad_norm": 0.24656622735147873, + "kl": 0.0606536865234375, + "learning_rate": 4.99968278606346e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 4714 + }, + { + "completion_length": 242.2321548461914, + "epoch": 0.7906450396076952, + "grad_norm": 0.2908306029397776, + "kl": 0.06658172607421875, + "learning_rate": 4.999680872298946e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4716 + }, + { + "completion_length": 235.7767972946167, + "epoch": 0.7909803428475628, + "grad_norm": 0.1948066984295881, + "kl": 0.0508880615234375, + "learning_rate": 4.999678952779241e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 4718 + }, + { + "completion_length": 238.86608123779297, + "epoch": 0.7913156460874303, + "grad_norm": 0.22120546745892436, + "kl": 0.0527191162109375, + "learning_rate": 4.999677027504347e-07, + "loss": 0.0001, + "reward": 1.8267857804894447, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8312500230967999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4720 + }, + { + "completion_length": 245.20983123779297, + "epoch": 0.7916509493272978, + "grad_norm": 0.1871303555582509, + "kl": 0.05560302734375, + "learning_rate": 4.99967509647427e-07, + "loss": 0.0001, + "reward": 1.7178572043776512, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 4722 + }, + { + "completion_length": 238.77233028411865, + "epoch": 0.7919862525671655, + "grad_norm": 0.4842632365108479, + "kl": 0.0672454833984375, + "learning_rate": 4.999673159689015e-07, + "loss": 0.0001, + "reward": 1.7803571745753288, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4724 + }, + { + "completion_length": 243.75000953674316, + "epoch": 0.792321555807033, + "grad_norm": 0.2471484051407418, + "kl": 0.0733184814453125, + "learning_rate": 4.999671217148585e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 4726 + }, + { + "completion_length": 239.40180015563965, + "epoch": 0.7926568590469005, + "grad_norm": 0.21497333312573835, + "kl": 0.0543670654296875, + "learning_rate": 4.999669268852984e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643066823483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4728 + }, + { + "completion_length": 247.93304634094238, + "epoch": 0.792992162286768, + "grad_norm": 0.1317931786223707, + "kl": 0.05374908447265625, + "learning_rate": 4.999667314802218e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 4730 + }, + { + "completion_length": 240.290189743042, + "epoch": 0.7933274655266357, + "grad_norm": 0.2581755180585634, + "kl": 0.05332183837890625, + "learning_rate": 4.999665354996292e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 4732 + }, + { + "completion_length": 239.62054824829102, + "epoch": 0.7936627687665032, + "grad_norm": 0.20370485342766165, + "kl": 0.0496673583984375, + "learning_rate": 4.99966338943521e-07, + "loss": 0.0, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 4734 + }, + { + "completion_length": 242.45537090301514, + "epoch": 0.7939980720063707, + "grad_norm": 0.29268332541299974, + "kl": 0.0518035888671875, + "learning_rate": 4.999661418118975e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 4736 + }, + { + "completion_length": 244.91518783569336, + "epoch": 0.7943333752462384, + "grad_norm": 0.16324022350176343, + "kl": 0.090301513671875, + "learning_rate": 4.999659441047592e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4738 + }, + { + "completion_length": 246.04465675354004, + "epoch": 0.7946686784861059, + "grad_norm": 0.1255347734918534, + "kl": 0.04901123046875, + "learning_rate": 4.999657458221067e-07, + "loss": 0.0, + "reward": 1.7642857879400253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 4740 + }, + { + "completion_length": 249.23215579986572, + "epoch": 0.7950039817259734, + "grad_norm": 0.16224730747256555, + "kl": 0.0569610595703125, + "learning_rate": 4.999655469639404e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714767873287, + "rewards/format_reward_func": 1.0, + "step": 4742 + }, + { + "completion_length": 238.4955472946167, + "epoch": 0.7953392849658409, + "grad_norm": 0.18577863111781406, + "kl": 0.05908203125, + "learning_rate": 4.999653475302607e-07, + "loss": 0.0001, + "reward": 1.8232143223285675, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8276786021888256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4744 + }, + { + "completion_length": 243.20090293884277, + "epoch": 0.7956745882057086, + "grad_norm": 0.21887712276758306, + "kl": 0.047943115234375, + "learning_rate": 4.99965147521068e-07, + "loss": 0.0, + "reward": 1.8214285895228386, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214286155998707, + "rewards/format_reward_func": 1.0, + "step": 4746 + }, + { + "completion_length": 247.2455472946167, + "epoch": 0.7960098914455761, + "grad_norm": 0.3119421742863889, + "kl": 0.0528106689453125, + "learning_rate": 4.999649469363629e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 4748 + }, + { + "completion_length": 239.9419755935669, + "epoch": 0.7963451946854436, + "grad_norm": 0.28155752485961605, + "kl": 0.05231475830078125, + "learning_rate": 4.999647457761459e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 4750 + }, + { + "completion_length": 252.48661708831787, + "epoch": 0.7966804979253111, + "grad_norm": 0.3151087780616481, + "kl": 0.04834747314453125, + "learning_rate": 4.999645440404173e-07, + "loss": 0.0, + "reward": 1.767857201397419, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 4752 + }, + { + "completion_length": 241.24554824829102, + "epoch": 0.7970158011651788, + "grad_norm": 0.2119678596279154, + "kl": 0.057403564453125, + "learning_rate": 4.999643417291776e-07, + "loss": 0.0001, + "reward": 1.8250000551342964, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000141561031, + "rewards/format_reward_func": 1.0, + "step": 4754 + }, + { + "completion_length": 250.9196548461914, + "epoch": 0.7973511044050463, + "grad_norm": 0.19341988481809425, + "kl": 0.0551300048828125, + "learning_rate": 4.999641388424274e-07, + "loss": 0.0001, + "reward": 1.719642922282219, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4756 + }, + { + "completion_length": 244.50447368621826, + "epoch": 0.7976864076449138, + "grad_norm": 0.18595105296353032, + "kl": 0.06219482421875, + "learning_rate": 4.99963935380167e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785936206579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4758 + }, + { + "completion_length": 248.3348331451416, + "epoch": 0.7980217108847815, + "grad_norm": 0.2563888180930464, + "kl": 0.053497314453125, + "learning_rate": 4.99963731342397e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 1.0, + "step": 4760 + }, + { + "completion_length": 254.7500114440918, + "epoch": 0.798357014124649, + "grad_norm": 0.22967847403831104, + "kl": 0.05162811279296875, + "learning_rate": 4.999635267291178e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571786731482, + "rewards/format_reward_func": 1.0, + "step": 4762 + }, + { + "completion_length": 251.24554538726807, + "epoch": 0.7986923173645165, + "grad_norm": 0.18666718271717592, + "kl": 0.0740814208984375, + "learning_rate": 4.999633215403298e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 4764 + }, + { + "completion_length": 254.54911518096924, + "epoch": 0.799027620604384, + "grad_norm": 0.20649343618390373, + "kl": 0.0608673095703125, + "learning_rate": 4.999631157760337e-07, + "loss": 0.0001, + "reward": 1.717857226729393, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571857511997, + "rewards/format_reward_func": 1.0, + "step": 4766 + }, + { + "completion_length": 258.2901916503906, + "epoch": 0.7993629238442517, + "grad_norm": 0.21921901486791798, + "kl": 0.0643157958984375, + "learning_rate": 4.999629094362298e-07, + "loss": 0.0001, + "reward": 1.735714390873909, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143089175224, + "rewards/format_reward_func": 1.0, + "step": 4768 + }, + { + "completion_length": 249.4464406967163, + "epoch": 0.7996982270841192, + "grad_norm": 0.2943164089246413, + "kl": 0.05615234375, + "learning_rate": 4.999627025209186e-07, + "loss": 0.0001, + "reward": 1.7723214775323868, + "reward_std": 0.06944798585027456, + "rewards/equation_reward_func": 0.7741071917116642, + "rewards/format_reward_func": 0.9982142895460129, + "step": 4770 + }, + { + "completion_length": 256.4866180419922, + "epoch": 0.8000335303239867, + "grad_norm": 0.17801495941107487, + "kl": 0.057403564453125, + "learning_rate": 4.999624950301005e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 4772 + }, + { + "completion_length": 253.21876430511475, + "epoch": 0.8003688335638544, + "grad_norm": 0.22709532160911014, + "kl": 0.0803985595703125, + "learning_rate": 4.999622869637761e-07, + "loss": 0.0001, + "reward": 1.8285714909434319, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714574158192, + "rewards/format_reward_func": 1.0, + "step": 4774 + }, + { + "completion_length": 255.7321548461914, + "epoch": 0.8007041368037219, + "grad_norm": 0.43890108257018107, + "kl": 0.06640625, + "learning_rate": 4.999620783219457e-07, + "loss": 0.0001, + "reward": 1.817857213318348, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 4776 + }, + { + "completion_length": 249.24108505249023, + "epoch": 0.8010394400435894, + "grad_norm": 0.21523694406650426, + "kl": 0.060211181640625, + "learning_rate": 4.999618691046101e-07, + "loss": 0.0001, + "reward": 1.8160715103149414, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8205357380211353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4778 + }, + { + "completion_length": 259.96875953674316, + "epoch": 0.8013747432834569, + "grad_norm": 0.30152421281682196, + "kl": 0.062408447265625, + "learning_rate": 4.999616593117696e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.10859139636158943, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4780 + }, + { + "completion_length": 248.758939743042, + "epoch": 0.8017100465233246, + "grad_norm": 0.18162921628223075, + "kl": 0.05597686767578125, + "learning_rate": 4.999614489434246e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 4782 + }, + { + "completion_length": 249.38840198516846, + "epoch": 0.8020453497631921, + "grad_norm": 0.18843137481806166, + "kl": 0.0561370849609375, + "learning_rate": 4.999612379995757e-07, + "loss": 0.0001, + "reward": 1.7928571850061417, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 4784 + }, + { + "completion_length": 253.75447750091553, + "epoch": 0.8023806530030596, + "grad_norm": 0.11907621852180425, + "kl": 0.060516357421875, + "learning_rate": 4.999610264802233e-07, + "loss": 0.0001, + "reward": 1.6660715192556381, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.6705357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4786 + }, + { + "completion_length": 261.1696586608887, + "epoch": 0.8027159562429272, + "grad_norm": 0.18805736926606073, + "kl": 0.064910888671875, + "learning_rate": 4.999608143853679e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 4788 + }, + { + "completion_length": 248.7276906967163, + "epoch": 0.8030512594827948, + "grad_norm": 0.21305119555940016, + "kl": 0.05750274658203125, + "learning_rate": 4.999606017150102e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 4790 + }, + { + "completion_length": 259.43305015563965, + "epoch": 0.8033865627226623, + "grad_norm": 0.21317110179286103, + "kl": 0.05792236328125, + "learning_rate": 4.999603884691504e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 4792 + }, + { + "completion_length": 252.7232265472412, + "epoch": 0.8037218659625298, + "grad_norm": 0.15468881031021706, + "kl": 0.0545654296875, + "learning_rate": 4.999601746477891e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464285995811224, + "rewards/format_reward_func": 1.0, + "step": 4794 + }, + { + "completion_length": 270.4776916503906, + "epoch": 0.8040571692023974, + "grad_norm": 0.18228137072443312, + "kl": 0.0901336669921875, + "learning_rate": 4.999599602509269e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321429029107094, + "rewards/format_reward_func": 1.0, + "step": 4796 + }, + { + "completion_length": 268.3571529388428, + "epoch": 0.804392472442265, + "grad_norm": 0.21559347957903086, + "kl": 0.0575103759765625, + "learning_rate": 4.999597452785641e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7553571723401546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4798 + }, + { + "completion_length": 265.1384048461914, + "epoch": 0.8047277756821325, + "grad_norm": 0.2657425103639842, + "kl": 0.158416748046875, + "learning_rate": 4.999595297307014e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 4800 + }, + { + "completion_length": 274.2276916503906, + "epoch": 0.8050630789220001, + "grad_norm": 0.1548226820890832, + "kl": 0.0614166259765625, + "learning_rate": 4.99959313607339e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4802 + }, + { + "completion_length": 279.4017963409424, + "epoch": 0.8053983821618677, + "grad_norm": 0.2341381706367825, + "kl": 0.0644378662109375, + "learning_rate": 4.999590969084777e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4804 + }, + { + "completion_length": 270.4687604904175, + "epoch": 0.8057336854017352, + "grad_norm": 0.22770723007849342, + "kl": 0.088226318359375, + "learning_rate": 4.999588796341178e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4806 + }, + { + "completion_length": 277.86162090301514, + "epoch": 0.8060689886416027, + "grad_norm": 0.3219488111395403, + "kl": 0.069610595703125, + "learning_rate": 4.999586617842599e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7696428894996643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4808 + }, + { + "completion_length": 267.16965770721436, + "epoch": 0.8064042918814703, + "grad_norm": 0.21817359942228975, + "kl": 0.0725860595703125, + "learning_rate": 4.999584433589046e-07, + "loss": 0.0001, + "reward": 1.7196428924798965, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7330357562750578, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4810 + }, + { + "completion_length": 275.1696548461914, + "epoch": 0.8067395951213379, + "grad_norm": 0.17800857048787785, + "kl": 0.075469970703125, + "learning_rate": 4.999582243580522e-07, + "loss": 0.0001, + "reward": 1.7446429431438446, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4812 + }, + { + "completion_length": 274.85268688201904, + "epoch": 0.8070748983612054, + "grad_norm": 0.24170594435818452, + "kl": 0.0898284912109375, + "learning_rate": 4.999580047817033e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7830357402563095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4814 + }, + { + "completion_length": 270.9776916503906, + "epoch": 0.807410201601073, + "grad_norm": 0.24363523164547604, + "kl": 0.1192169189453125, + "learning_rate": 4.999577846298584e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4816 + }, + { + "completion_length": 281.633939743042, + "epoch": 0.8077455048409405, + "grad_norm": 0.2132089822493617, + "kl": 0.0901947021484375, + "learning_rate": 4.999575639025179e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 4818 + }, + { + "completion_length": 272.4642972946167, + "epoch": 0.8080808080808081, + "grad_norm": 0.1495708958499177, + "kl": 0.2392578125, + "learning_rate": 4.999573425996826e-07, + "loss": 0.0002, + "reward": 1.7589286342263222, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4820 + }, + { + "completion_length": 281.1071548461914, + "epoch": 0.8084161113206756, + "grad_norm": 0.18808619582281788, + "kl": 0.0755767822265625, + "learning_rate": 4.999571207213527e-07, + "loss": 0.0001, + "reward": 1.819642886519432, + "reward_std": 0.0530330091714859, + "rewards/equation_reward_func": 0.8330357410013676, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4822 + }, + { + "completion_length": 276.33930110931396, + "epoch": 0.8087514145605432, + "grad_norm": 0.25074459221996265, + "kl": 0.1247100830078125, + "learning_rate": 4.999568982675288e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928939700127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4824 + }, + { + "completion_length": 268.6116180419922, + "epoch": 0.8090867178004107, + "grad_norm": 0.2757940434856731, + "kl": 0.079986572265625, + "learning_rate": 4.999566752382115e-07, + "loss": 0.0001, + "reward": 1.7714286670088768, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 4826 + }, + { + "completion_length": 271.4196538925171, + "epoch": 0.8094220210402783, + "grad_norm": 0.37402314358434524, + "kl": 0.088958740234375, + "learning_rate": 4.999564516334014e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4828 + }, + { + "completion_length": 282.1160821914673, + "epoch": 0.8097573242801459, + "grad_norm": 0.2814170427276152, + "kl": 0.0948486328125, + "learning_rate": 4.999562274530986e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7803571671247482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4830 + }, + { + "completion_length": 274.2634038925171, + "epoch": 0.8100926275200134, + "grad_norm": 0.45118148911042716, + "kl": 0.0821533203125, + "learning_rate": 4.999560026973041e-07, + "loss": 0.0001, + "reward": 1.7000000700354576, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 4832 + }, + { + "completion_length": 275.20537281036377, + "epoch": 0.810427930759881, + "grad_norm": 0.18307441024953733, + "kl": 0.08013916015625, + "learning_rate": 4.999557773660181e-07, + "loss": 0.0001, + "reward": 1.7339286133646965, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4834 + }, + { + "completion_length": 276.81250953674316, + "epoch": 0.8107632339997485, + "grad_norm": 0.2023363492109368, + "kl": 0.0673065185546875, + "learning_rate": 4.999555514592412e-07, + "loss": 0.0001, + "reward": 1.716071479022503, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7205357365310192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4836 + }, + { + "completion_length": 270.35715770721436, + "epoch": 0.8110985372396161, + "grad_norm": 0.21590173948082855, + "kl": 0.079620361328125, + "learning_rate": 4.999553249769741e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428816765547, + "rewards/format_reward_func": 1.0, + "step": 4838 + }, + { + "completion_length": 276.65179920196533, + "epoch": 0.8114338404794836, + "grad_norm": 0.18878714466772603, + "kl": 0.0695648193359375, + "learning_rate": 4.999550979192169e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 4840 + }, + { + "completion_length": 285.02680015563965, + "epoch": 0.8117691437193512, + "grad_norm": 0.1801698236755662, + "kl": 0.0682830810546875, + "learning_rate": 4.999548702859706e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7625000178813934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4842 + }, + { + "completion_length": 266.7946529388428, + "epoch": 0.8121044469592188, + "grad_norm": 0.29530471967628835, + "kl": 0.06537628173828125, + "learning_rate": 4.999546420772355e-07, + "loss": 0.0001, + "reward": 1.7910714745521545, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4844 + }, + { + "completion_length": 277.7946548461914, + "epoch": 0.8124397501990863, + "grad_norm": 0.2510459183216338, + "kl": 0.0666961669921875, + "learning_rate": 4.99954413293012e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214745521545, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4846 + }, + { + "completion_length": 283.46876335144043, + "epoch": 0.8127750534389538, + "grad_norm": 0.1536660170745747, + "kl": 0.0618133544921875, + "learning_rate": 4.999541839333009e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4848 + }, + { + "completion_length": 272.60715675354004, + "epoch": 0.8131103566788214, + "grad_norm": 0.2226667586865844, + "kl": 0.070068359375, + "learning_rate": 4.999539539981026e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 4850 + }, + { + "completion_length": 261.90179443359375, + "epoch": 0.813445659918689, + "grad_norm": 0.3051079389978345, + "kl": 0.0652008056640625, + "learning_rate": 4.999537234874175e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571574389935, + "rewards/format_reward_func": 1.0, + "step": 4852 + }, + { + "completion_length": 270.1562662124634, + "epoch": 0.8137809631585565, + "grad_norm": 0.1828016501904028, + "kl": 0.062744140625, + "learning_rate": 4.999534924012463e-07, + "loss": 0.0001, + "reward": 1.7589286118745804, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4854 + }, + { + "completion_length": 269.808048248291, + "epoch": 0.814116266398424, + "grad_norm": 0.320133616246475, + "kl": 0.075286865234375, + "learning_rate": 4.999532607395895e-07, + "loss": 0.0001, + "reward": 1.742857240140438, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 4856 + }, + { + "completion_length": 275.1205472946167, + "epoch": 0.8144515696382917, + "grad_norm": 0.16699386169014177, + "kl": 0.0585174560546875, + "learning_rate": 4.999530285024477e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 4858 + }, + { + "completion_length": 272.2544765472412, + "epoch": 0.8147868728781592, + "grad_norm": 0.40172270093922924, + "kl": 0.0598907470703125, + "learning_rate": 4.999527956898213e-07, + "loss": 0.0001, + "reward": 1.7196429446339607, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7330357432365417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4860 + }, + { + "completion_length": 266.4866189956665, + "epoch": 0.8151221761180267, + "grad_norm": 0.37233589266703304, + "kl": 0.065093994140625, + "learning_rate": 4.999525623017109e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8089285809546709, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4862 + }, + { + "completion_length": 267.90179347991943, + "epoch": 0.8154574793578943, + "grad_norm": 0.30506025414719584, + "kl": 0.062347412109375, + "learning_rate": 4.99952328338117e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4864 + }, + { + "completion_length": 259.2142963409424, + "epoch": 0.8157927825977619, + "grad_norm": 0.12008669658374085, + "kl": 0.062469482421875, + "learning_rate": 4.999520937990401e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.792857151478529, + "rewards/format_reward_func": 1.0, + "step": 4866 + }, + { + "completion_length": 255.54465198516846, + "epoch": 0.8161280858376294, + "grad_norm": 0.19604347972133174, + "kl": 0.0647430419921875, + "learning_rate": 4.999518586844809e-07, + "loss": 0.0001, + "reward": 1.7696429416537285, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4868 + }, + { + "completion_length": 258.7634048461914, + "epoch": 0.8164633890774969, + "grad_norm": 0.31025431871878384, + "kl": 0.06878662109375, + "learning_rate": 4.999516229944397e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7142857387661934, + "rewards/format_reward_func": 1.0, + "step": 4870 + }, + { + "completion_length": 258.1473340988159, + "epoch": 0.8167986923173646, + "grad_norm": 0.059445911244861434, + "kl": 0.0688018798828125, + "learning_rate": 4.999513867289173e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143212109804, + "rewards/format_reward_func": 1.0, + "step": 4872 + }, + { + "completion_length": 252.93304920196533, + "epoch": 0.8171339955572321, + "grad_norm": 0.0881423719531909, + "kl": 0.0708465576171875, + "learning_rate": 4.999511498879142e-07, + "loss": 0.0001, + "reward": 1.7736607939004898, + "reward_std": 0.013258251827210188, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4874 + }, + { + "completion_length": 256.0223321914673, + "epoch": 0.8174692987970996, + "grad_norm": 0.2124047604525655, + "kl": 0.071258544921875, + "learning_rate": 4.999509124714308e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.753571480512619, + "rewards/format_reward_func": 1.0, + "step": 4876 + }, + { + "completion_length": 256.4375114440918, + "epoch": 0.8178046020369671, + "grad_norm": 0.18623706897044853, + "kl": 0.0633697509765625, + "learning_rate": 4.999506744794677e-07, + "loss": 0.0001, + "reward": 1.7678571864962578, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4878 + }, + { + "completion_length": 255.67411994934082, + "epoch": 0.8181399052768348, + "grad_norm": 0.15097665520310913, + "kl": 0.0752410888671875, + "learning_rate": 4.999504359120255e-07, + "loss": 0.0001, + "reward": 1.7415179386734962, + "reward_std": 0.032198611879721284, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4880 + }, + { + "completion_length": 273.29019260406494, + "epoch": 0.8184752085167023, + "grad_norm": 0.26742352819239673, + "kl": 0.0709228515625, + "learning_rate": 4.999501967691048e-07, + "loss": 0.0001, + "reward": 1.6964286714792252, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7053571790456772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4882 + }, + { + "completion_length": 261.84376430511475, + "epoch": 0.8188105117565698, + "grad_norm": 0.25184865004850154, + "kl": 0.0738372802734375, + "learning_rate": 4.99949957050706e-07, + "loss": 0.0001, + "reward": 1.7290179282426834, + "reward_std": 0.07007933035492897, + "rewards/equation_reward_func": 0.7348214648663998, + "rewards/format_reward_func": 0.994196429848671, + "step": 4884 + }, + { + "completion_length": 253.09375953674316, + "epoch": 0.8191458149964373, + "grad_norm": 0.21782260571475182, + "kl": 0.079376220703125, + "learning_rate": 4.999497167568297e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 4886 + }, + { + "completion_length": 253.2812614440918, + "epoch": 0.819481118236305, + "grad_norm": 0.16804666737366383, + "kl": 0.0702362060546875, + "learning_rate": 4.999494758874765e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857413738966, + "rewards/format_reward_func": 1.0, + "step": 4888 + }, + { + "completion_length": 265.95983600616455, + "epoch": 0.8198164214761725, + "grad_norm": 0.09307030417656371, + "kl": 0.0602874755859375, + "learning_rate": 4.999492344426469e-07, + "loss": 0.0001, + "reward": 1.733928620815277, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4890 + }, + { + "completion_length": 259.1428699493408, + "epoch": 0.82015172471604, + "grad_norm": 0.25695249055536407, + "kl": 0.0602569580078125, + "learning_rate": 4.999489924223416e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143055647612, + "rewards/format_reward_func": 1.0, + "step": 4892 + }, + { + "completion_length": 269.8214387893677, + "epoch": 0.8204870279559077, + "grad_norm": 0.19842259243217908, + "kl": 0.0663299560546875, + "learning_rate": 4.999487498265609e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4894 + }, + { + "completion_length": 257.30804443359375, + "epoch": 0.8208223311957752, + "grad_norm": 0.2123599581687482, + "kl": 0.071533203125, + "learning_rate": 4.999485066553056e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857283353806, + "rewards/format_reward_func": 1.0, + "step": 4896 + }, + { + "completion_length": 270.36608028411865, + "epoch": 0.8211576344356427, + "grad_norm": 0.1736771649562477, + "kl": 0.060089111328125, + "learning_rate": 4.999482629085762e-07, + "loss": 0.0001, + "reward": 1.7892858013510704, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7982143089175224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4898 + }, + { + "completion_length": 257.4955463409424, + "epoch": 0.8214929376755102, + "grad_norm": 0.20787241332400688, + "kl": 0.0594024658203125, + "learning_rate": 4.99948018586373e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7803571820259094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4900 + }, + { + "completion_length": 259.64287090301514, + "epoch": 0.8218282409153779, + "grad_norm": 0.23193187383227143, + "kl": 0.0570831298828125, + "learning_rate": 4.99947773688697e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4902 + }, + { + "completion_length": 260.96429920196533, + "epoch": 0.8221635441552454, + "grad_norm": 0.17487735272584332, + "kl": 0.05246734619140625, + "learning_rate": 4.999475282155485e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428921073675, + "rewards/format_reward_func": 1.0, + "step": 4904 + }, + { + "completion_length": 261.5446557998657, + "epoch": 0.8224988473951129, + "grad_norm": 0.27319235143934706, + "kl": 0.0552215576171875, + "learning_rate": 4.999472821669281e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 4906 + }, + { + "completion_length": 271.3348379135132, + "epoch": 0.8228341506349806, + "grad_norm": 0.2630989065803111, + "kl": 0.0597381591796875, + "learning_rate": 4.999470355428364e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7678571939468384, + "rewards/format_reward_func": 1.0, + "step": 4908 + }, + { + "completion_length": 273.5089406967163, + "epoch": 0.8231694538748481, + "grad_norm": 0.17175984680368336, + "kl": 0.0602264404296875, + "learning_rate": 4.999467883432739e-07, + "loss": 0.0001, + "reward": 1.6875000819563866, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.6919643208384514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4910 + }, + { + "completion_length": 261.35715675354004, + "epoch": 0.8235047571147156, + "grad_norm": 0.2206713915711781, + "kl": 0.0681610107421875, + "learning_rate": 4.999465405682414e-07, + "loss": 0.0001, + "reward": 1.733928620815277, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7473214603960514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4912 + }, + { + "completion_length": 277.81697368621826, + "epoch": 0.8238400603545831, + "grad_norm": 0.2808354461978824, + "kl": 0.0636749267578125, + "learning_rate": 4.999462922177391e-07, + "loss": 0.0001, + "reward": 1.7464286759495735, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 4914 + }, + { + "completion_length": 270.495548248291, + "epoch": 0.8241753635944508, + "grad_norm": 0.37773820919936024, + "kl": 0.0789031982421875, + "learning_rate": 4.999460432917678e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4916 + }, + { + "completion_length": 269.8794746398926, + "epoch": 0.8245106668343183, + "grad_norm": 0.26859032997452337, + "kl": 0.0596160888671875, + "learning_rate": 4.999457937903281e-07, + "loss": 0.0001, + "reward": 1.6767858043313026, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.6901786029338837, + "rewards/format_reward_func": 0.9866071492433548, + "step": 4918 + }, + { + "completion_length": 268.6384048461914, + "epoch": 0.8248459700741858, + "grad_norm": 0.22619369001240797, + "kl": 0.0674896240234375, + "learning_rate": 4.999455437134205e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 0.9821428656578064, + "step": 4920 + }, + { + "completion_length": 273.4375162124634, + "epoch": 0.8251812733140534, + "grad_norm": 0.14572012351895092, + "kl": 0.064910888671875, + "learning_rate": 4.999452930610455e-07, + "loss": 0.0001, + "reward": 1.7107143625617027, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 1.0, + "step": 4922 + }, + { + "completion_length": 273.55804347991943, + "epoch": 0.825516576553921, + "grad_norm": 0.19885069207259493, + "kl": 0.0724029541015625, + "learning_rate": 4.999450418332038e-07, + "loss": 0.0001, + "reward": 1.737500086426735, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419642992317677, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4924 + }, + { + "completion_length": 267.4598321914673, + "epoch": 0.8258518797937885, + "grad_norm": 0.20614664708216593, + "kl": 0.1008758544921875, + "learning_rate": 4.99944790029896e-07, + "loss": 0.0001, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4926 + }, + { + "completion_length": 270.4241199493408, + "epoch": 0.826187183033656, + "grad_norm": 0.39868854843655405, + "kl": 0.0659942626953125, + "learning_rate": 4.999445376511225e-07, + "loss": 0.0001, + "reward": 1.8125000596046448, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4928 + }, + { + "completion_length": 273.05804920196533, + "epoch": 0.8265224862735236, + "grad_norm": 0.44631029880094397, + "kl": 0.047637939453125, + "learning_rate": 4.999442846968841e-07, + "loss": 0.0, + "reward": 1.7982143238186836, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4930 + }, + { + "completion_length": 264.0848321914673, + "epoch": 0.8268577895133912, + "grad_norm": 0.16727886038983347, + "kl": 0.0599212646484375, + "learning_rate": 4.999440311671812e-07, + "loss": 0.0001, + "reward": 1.8464286103844643, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8464285917580128, + "rewards/format_reward_func": 1.0, + "step": 4932 + }, + { + "completion_length": 264.13840675354004, + "epoch": 0.8271930927532587, + "grad_norm": 0.29495113787224825, + "kl": 0.0543212890625, + "learning_rate": 4.999437770620146e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 1.0, + "step": 4934 + }, + { + "completion_length": 258.9285831451416, + "epoch": 0.8275283959931263, + "grad_norm": 0.22670554901695683, + "kl": 0.0487060546875, + "learning_rate": 4.999435223813847e-07, + "loss": 0.0, + "reward": 1.8250000476837158, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.8339285813271999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4936 + }, + { + "completion_length": 267.41965770721436, + "epoch": 0.8278636992329939, + "grad_norm": 0.26918688987822686, + "kl": 0.05322265625, + "learning_rate": 4.999432671252921e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4938 + }, + { + "completion_length": 263.3214416503906, + "epoch": 0.8281990024728614, + "grad_norm": 0.21402795920358994, + "kl": 0.059417724609375, + "learning_rate": 4.999430112937374e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 4940 + }, + { + "completion_length": 271.71430110931396, + "epoch": 0.8285343057127289, + "grad_norm": 0.7274596587412323, + "kl": 0.0613861083984375, + "learning_rate": 4.999427548867214e-07, + "loss": 0.0001, + "reward": 1.689285784959793, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.689285758882761, + "rewards/format_reward_func": 1.0, + "step": 4942 + }, + { + "completion_length": 242.38840103149414, + "epoch": 0.8288696089525965, + "grad_norm": 0.196112449776157, + "kl": 0.0539093017578125, + "learning_rate": 4.999424979042443e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428626775742, + "rewards/format_reward_func": 1.0, + "step": 4944 + }, + { + "completion_length": 268.79465675354004, + "epoch": 0.8292049121924641, + "grad_norm": 0.14449989796392068, + "kl": 0.06292724609375, + "learning_rate": 4.99942240346307e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4946 + }, + { + "completion_length": 259.68751430511475, + "epoch": 0.8295402154323316, + "grad_norm": 0.19207518609316115, + "kl": 0.05278778076171875, + "learning_rate": 4.999419822129099e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4948 + }, + { + "completion_length": 269.71429538726807, + "epoch": 0.8298755186721992, + "grad_norm": 0.19097486547502238, + "kl": 0.15569305419921875, + "learning_rate": 4.999417235040538e-07, + "loss": 0.0002, + "reward": 1.7214286401867867, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7392857428640127, + "rewards/format_reward_func": 0.9821428619325161, + "step": 4950 + }, + { + "completion_length": 260.8214406967163, + "epoch": 0.8302108219120667, + "grad_norm": 0.15030844373911773, + "kl": 0.088470458984375, + "learning_rate": 4.99941464219739e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7491071838885546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4952 + }, + { + "completion_length": 257.60715770721436, + "epoch": 0.8305461251519343, + "grad_norm": 0.2758773487892338, + "kl": 0.069061279296875, + "learning_rate": 4.999412043599665e-07, + "loss": 0.0001, + "reward": 1.68571437895298, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6937500294297934, + "rewards/format_reward_func": 0.9919642880558968, + "step": 4954 + }, + { + "completion_length": 262.9910821914673, + "epoch": 0.8308814283918018, + "grad_norm": 0.21574039383575827, + "kl": 0.05950927734375, + "learning_rate": 4.999409439247366e-07, + "loss": 0.0001, + "reward": 1.7915179207921028, + "reward_std": 0.04230013699270785, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 0.9986607171595097, + "step": 4956 + }, + { + "completion_length": 257.2946548461914, + "epoch": 0.8312167316316694, + "grad_norm": 0.19333692041023248, + "kl": 0.0700531005859375, + "learning_rate": 4.999406829140499e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7517857514321804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4958 + }, + { + "completion_length": 258.95983505249023, + "epoch": 0.831552034871537, + "grad_norm": 0.28450642542702165, + "kl": 0.1026763916015625, + "learning_rate": 4.999404213279072e-07, + "loss": 0.0001, + "reward": 1.7107143551111221, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7196428775787354, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4960 + }, + { + "completion_length": 258.48662185668945, + "epoch": 0.8318873381114045, + "grad_norm": 0.11666481478020153, + "kl": 0.0623321533203125, + "learning_rate": 4.999401591663088e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285910129547, + "rewards/format_reward_func": 1.0, + "step": 4962 + }, + { + "completion_length": 261.14287090301514, + "epoch": 0.8322226413512721, + "grad_norm": 0.2595529695459257, + "kl": 0.0776519775390625, + "learning_rate": 4.999398964292556e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 4964 + }, + { + "completion_length": 255.86161422729492, + "epoch": 0.8325579445911396, + "grad_norm": 0.11477647603067251, + "kl": 0.05771636962890625, + "learning_rate": 4.999396331167481e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 4966 + }, + { + "completion_length": 251.63840866088867, + "epoch": 0.8328932478310072, + "grad_norm": 0.3354314841872632, + "kl": 0.0924072265625, + "learning_rate": 4.999393692287868e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4968 + }, + { + "completion_length": 259.92858505249023, + "epoch": 0.8332285510708747, + "grad_norm": 0.22319074901580088, + "kl": 0.05889892578125, + "learning_rate": 4.999391047653726e-07, + "loss": 0.0001, + "reward": 1.73214291036129, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428898721933, + "rewards/format_reward_func": 1.0, + "step": 4970 + }, + { + "completion_length": 253.16965579986572, + "epoch": 0.8335638543107423, + "grad_norm": 0.20776365236196978, + "kl": 0.0605621337890625, + "learning_rate": 4.999388397265057e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 4972 + }, + { + "completion_length": 258.81250953674316, + "epoch": 0.8338991575506098, + "grad_norm": 0.3116271721626666, + "kl": 0.0717926025390625, + "learning_rate": 4.999385741121871e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 4974 + }, + { + "completion_length": 257.9732246398926, + "epoch": 0.8342344607904774, + "grad_norm": 0.16783852777353048, + "kl": 0.0492401123046875, + "learning_rate": 4.999383079224171e-07, + "loss": 0.0, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 4976 + }, + { + "completion_length": 267.3125123977661, + "epoch": 0.834569764030345, + "grad_norm": 0.2939545154505324, + "kl": 0.1884918212890625, + "learning_rate": 4.999380411571965e-07, + "loss": 0.0002, + "reward": 1.7214286178350449, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7303571756929159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4978 + }, + { + "completion_length": 271.2857255935669, + "epoch": 0.8349050672702125, + "grad_norm": 0.13913934639957815, + "kl": 0.26397705078125, + "learning_rate": 4.999377738165259e-07, + "loss": 0.0003, + "reward": 1.74642863124609, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7553571723401546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4980 + }, + { + "completion_length": 258.9553680419922, + "epoch": 0.83524037051008, + "grad_norm": 0.2516402147403652, + "kl": 0.0686798095703125, + "learning_rate": 4.999375059004057e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4982 + }, + { + "completion_length": 268.2321557998657, + "epoch": 0.8355756737499476, + "grad_norm": 0.2667209451787418, + "kl": 0.1626129150390625, + "learning_rate": 4.999372374088369e-07, + "loss": 0.0002, + "reward": 1.7803571969270706, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214488476515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4984 + }, + { + "completion_length": 260.77233695983887, + "epoch": 0.8359109769898152, + "grad_norm": 0.26300885880997005, + "kl": 0.1019744873046875, + "learning_rate": 4.999369683418199e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 4986 + }, + { + "completion_length": 259.6116199493408, + "epoch": 0.8362462802296827, + "grad_norm": 0.3083753439451682, + "kl": 0.171356201171875, + "learning_rate": 4.999366986993552e-07, + "loss": 0.0002, + "reward": 1.8321429267525673, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8321428745985031, + "rewards/format_reward_func": 1.0, + "step": 4988 + }, + { + "completion_length": 278.30804920196533, + "epoch": 0.8365815834695502, + "grad_norm": 0.4166251158747486, + "kl": 0.43697357177734375, + "learning_rate": 4.999364284814435e-07, + "loss": 0.0004, + "reward": 1.753571480512619, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 0.9821428656578064, + "step": 4990 + }, + { + "completion_length": 273.8884048461914, + "epoch": 0.8369168867094179, + "grad_norm": 0.22651730612716978, + "kl": 0.0768585205078125, + "learning_rate": 4.999361576880856e-07, + "loss": 0.0001, + "reward": 1.7178572416305542, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857380211353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 4992 + }, + { + "completion_length": 265.2321548461914, + "epoch": 0.8372521899492854, + "grad_norm": 0.19169125415619304, + "kl": 0.3283843994140625, + "learning_rate": 4.99935886319282e-07, + "loss": 0.0003, + "reward": 1.8142857551574707, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857626080513, + "rewards/format_reward_func": 1.0, + "step": 4994 + }, + { + "completion_length": 264.4375123977661, + "epoch": 0.8375874931891529, + "grad_norm": 0.14860434722498145, + "kl": 0.04929351806640625, + "learning_rate": 4.999356143750332e-07, + "loss": 0.0, + "reward": 1.7089286297559738, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.713392898440361, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4996 + }, + { + "completion_length": 262.3259057998657, + "epoch": 0.8379227964290205, + "grad_norm": 0.3040842393844373, + "kl": 0.06220245361328125, + "learning_rate": 4.999353418553402e-07, + "loss": 0.0001, + "reward": 1.7946429178118706, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7991071734577417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 4998 + }, + { + "completion_length": 256.0982255935669, + "epoch": 0.8382580996688881, + "grad_norm": 0.9815141329046756, + "kl": 0.08681488037109375, + "learning_rate": 4.999350687602031e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571850061417, + "rewards/format_reward_func": 1.0, + "step": 5000 + }, + { + "completion_length": 257.6517963409424, + "epoch": 0.8385934029087556, + "grad_norm": 0.28596476806596116, + "kl": 0.1295928955078125, + "learning_rate": 4.99934795089623e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571566939354, + "rewards/format_reward_func": 1.0, + "step": 5002 + }, + { + "completion_length": 254.4642972946167, + "epoch": 0.8389287061486231, + "grad_norm": 0.2751226439728887, + "kl": 0.3716888427734375, + "learning_rate": 4.999345208436002e-07, + "loss": 0.0004, + "reward": 1.7410714998841286, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5004 + }, + { + "completion_length": 264.1964406967163, + "epoch": 0.8392640093884907, + "grad_norm": 0.17527055408946596, + "kl": 0.0655517578125, + "learning_rate": 4.999342460221355e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 5006 + }, + { + "completion_length": 268.5848340988159, + "epoch": 0.8395993126283583, + "grad_norm": 0.30189495346003126, + "kl": 0.08837127685546875, + "learning_rate": 4.999339706252295e-07, + "loss": 0.0001, + "reward": 1.741517961025238, + "reward_std": 0.0688166399486363, + "rewards/equation_reward_func": 0.7428571674972773, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5008 + }, + { + "completion_length": 271.37947940826416, + "epoch": 0.8399346158682258, + "grad_norm": 0.2656036792725558, + "kl": 0.06268310546875, + "learning_rate": 4.999336946528828e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5010 + }, + { + "completion_length": 262.29019260406494, + "epoch": 0.8402699191080933, + "grad_norm": 0.2827278859733368, + "kl": 0.06427001953125, + "learning_rate": 4.999334181050961e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 5012 + }, + { + "completion_length": 267.7767972946167, + "epoch": 0.840605222347961, + "grad_norm": 0.1459763848123801, + "kl": 0.2068939208984375, + "learning_rate": 4.9993314098187e-07, + "loss": 0.0002, + "reward": 1.7464286237955093, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 5014 + }, + { + "completion_length": 260.0580472946167, + "epoch": 0.8409405255878285, + "grad_norm": 0.11177792675682865, + "kl": 0.141387939453125, + "learning_rate": 4.999328632832052e-07, + "loss": 0.0001, + "reward": 1.8321429044008255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8321428783237934, + "rewards/format_reward_func": 1.0, + "step": 5016 + }, + { + "completion_length": 269.8392963409424, + "epoch": 0.841275828827696, + "grad_norm": 0.2513538542042273, + "kl": 0.0636138916015625, + "learning_rate": 4.999325850091022e-07, + "loss": 0.0001, + "reward": 1.7767857685685158, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7901785969734192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5018 + }, + { + "completion_length": 263.446439743042, + "epoch": 0.8416111320675635, + "grad_norm": 0.1607530545343153, + "kl": 0.3533172607421875, + "learning_rate": 4.999323061595617e-07, + "loss": 0.0004, + "reward": 1.7129465118050575, + "reward_std": 0.0625031883828342, + "rewards/equation_reward_func": 0.7232143208384514, + "rewards/format_reward_func": 0.9897321499884129, + "step": 5020 + }, + { + "completion_length": 278.18304538726807, + "epoch": 0.8419464353074312, + "grad_norm": 0.7651100930666153, + "kl": 2.2594146728515625, + "learning_rate": 4.999320267345844e-07, + "loss": 0.0023, + "reward": 1.707589365541935, + "reward_std": 0.09028238267637789, + "rewards/equation_reward_func": 0.7223214656114578, + "rewards/format_reward_func": 0.9852678664028645, + "step": 5022 + }, + { + "completion_length": 265.76786708831787, + "epoch": 0.8422817385472987, + "grad_norm": 0.5229851016677849, + "kl": 0.22228240966796875, + "learning_rate": 4.999317467341709e-07, + "loss": 0.0002, + "reward": 1.7272322103381157, + "reward_std": 0.07765547605231404, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 0.980803582817316, + "step": 5024 + }, + { + "completion_length": 265.40626335144043, + "epoch": 0.8426170417871662, + "grad_norm": 0.4540552845227721, + "kl": 0.666046142578125, + "learning_rate": 4.99931466158322e-07, + "loss": 0.0007, + "reward": 1.7071429193019867, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5026 + }, + { + "completion_length": 256.67411708831787, + "epoch": 0.8429523450270339, + "grad_norm": 0.2477658548989634, + "kl": 0.38092041015625, + "learning_rate": 4.99931185007038e-07, + "loss": 0.0004, + "reward": 1.7392857819795609, + "reward_std": 0.07576144114136696, + "rewards/equation_reward_func": 0.7482143230736256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5028 + }, + { + "completion_length": 261.7321557998657, + "epoch": 0.8432876482669014, + "grad_norm": 294.4796959677068, + "kl": 292.18275451660156, + "learning_rate": 4.999309032803199e-07, + "loss": 0.2905, + "reward": 1.7553571984171867, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7687500268220901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5030 + }, + { + "completion_length": 254.8660831451416, + "epoch": 0.8436229515067689, + "grad_norm": 0.3487734623831873, + "kl": 0.2953338623046875, + "learning_rate": 4.99930620978168e-07, + "loss": 0.0003, + "reward": 1.7625000849366188, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5032 + }, + { + "completion_length": 250.8214406967163, + "epoch": 0.8439582547466364, + "grad_norm": 0.845100666891351, + "kl": 1.3850021362304688, + "learning_rate": 4.999303381005833e-07, + "loss": 0.0014, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 5034 + }, + { + "completion_length": 260.23662090301514, + "epoch": 0.8442935579865041, + "grad_norm": 0.29851671677387165, + "kl": 0.46588134765625, + "learning_rate": 4.999300546475663e-07, + "loss": 0.0005, + "reward": 1.7897321954369545, + "reward_std": 0.09533314313739538, + "rewards/equation_reward_func": 0.8044643066823483, + "rewards/format_reward_func": 0.9852678626775742, + "step": 5036 + }, + { + "completion_length": 259.9821557998657, + "epoch": 0.8446288612263716, + "grad_norm": 0.19617775336306847, + "kl": 0.188018798828125, + "learning_rate": 4.999297706191175e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 5038 + }, + { + "completion_length": 259.6562623977661, + "epoch": 0.8449641644662391, + "grad_norm": 0.36903900589044275, + "kl": 0.31150054931640625, + "learning_rate": 4.999294860152378e-07, + "loss": 0.0003, + "reward": 1.7035715207457542, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7125000394880772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5040 + }, + { + "completion_length": 251.31697845458984, + "epoch": 0.8452994677061068, + "grad_norm": 0.34778111463721867, + "kl": 0.58111572265625, + "learning_rate": 4.999292008359277e-07, + "loss": 0.0006, + "reward": 1.7892857864499092, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 5042 + }, + { + "completion_length": 259.6517972946167, + "epoch": 0.8456347709459743, + "grad_norm": 0.22109059202099945, + "kl": 0.05115509033203125, + "learning_rate": 4.99928915081188e-07, + "loss": 0.0001, + "reward": 1.701785758137703, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.706250037997961, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5044 + }, + { + "completion_length": 255.47322463989258, + "epoch": 0.8459700741858418, + "grad_norm": 0.767684363928975, + "kl": 1.7428512573242188, + "learning_rate": 4.999286287510192e-07, + "loss": 0.0017, + "reward": 1.7303572073578835, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5046 + }, + { + "completion_length": 247.4866180419922, + "epoch": 0.8463053774257093, + "grad_norm": 0.2620184516888097, + "kl": 0.0875244140625, + "learning_rate": 4.999283418454221e-07, + "loss": 0.0001, + "reward": 1.7696429044008255, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7741071842610836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5048 + }, + { + "completion_length": 257.3928699493408, + "epoch": 0.846640680665577, + "grad_norm": 0.2051462165393263, + "kl": 0.0811309814453125, + "learning_rate": 4.999280543643973e-07, + "loss": 0.0001, + "reward": 1.7428572252392769, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 5050 + }, + { + "completion_length": 257.3348340988159, + "epoch": 0.8469759839054445, + "grad_norm": 0.23980940731317518, + "kl": 0.0730438232421875, + "learning_rate": 4.999277663079453e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 5052 + }, + { + "completion_length": 256.02233505249023, + "epoch": 0.847311287145312, + "grad_norm": 0.2466820519649015, + "kl": 0.0859832763671875, + "learning_rate": 4.99927477676067e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5054 + }, + { + "completion_length": 242.81697368621826, + "epoch": 0.8476465903851796, + "grad_norm": 0.19081753990918476, + "kl": 0.04564666748046875, + "learning_rate": 4.99927188468763e-07, + "loss": 0.0, + "reward": 1.7910714894533157, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357450991869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5056 + }, + { + "completion_length": 250.4375114440918, + "epoch": 0.8479818936250472, + "grad_norm": 0.27938780172578787, + "kl": 0.1655731201171875, + "learning_rate": 4.99926898686034e-07, + "loss": 0.0002, + "reward": 1.7464286163449287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 5058 + }, + { + "completion_length": 253.05804443359375, + "epoch": 0.8483171968649147, + "grad_norm": 0.5356787768614212, + "kl": 0.06327056884765625, + "learning_rate": 4.999266083278806e-07, + "loss": 0.0001, + "reward": 1.7446429058909416, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5060 + }, + { + "completion_length": 237.91965198516846, + "epoch": 0.8486525001047822, + "grad_norm": 0.24795477280872208, + "kl": 0.11028289794921875, + "learning_rate": 4.999263173943034e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 5062 + }, + { + "completion_length": 246.70983219146729, + "epoch": 0.8489878033446498, + "grad_norm": 0.2763103500220934, + "kl": 0.081085205078125, + "learning_rate": 4.999260258853032e-07, + "loss": 0.0001, + "reward": 1.8089286386966705, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8133928813040257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5064 + }, + { + "completion_length": 252.1830472946167, + "epoch": 0.8493231065845174, + "grad_norm": 0.10641035138334937, + "kl": 0.04872894287109375, + "learning_rate": 4.999257338008806e-07, + "loss": 0.0, + "reward": 1.7517857626080513, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5066 + }, + { + "completion_length": 255.8259038925171, + "epoch": 0.8496584098243849, + "grad_norm": 0.23307262313618435, + "kl": 0.04785919189453125, + "learning_rate": 4.999254411410363e-07, + "loss": 0.0, + "reward": 1.730357214808464, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5068 + }, + { + "completion_length": 244.45983219146729, + "epoch": 0.8499937130642525, + "grad_norm": 0.23193165760967088, + "kl": 0.2115325927734375, + "learning_rate": 4.99925147905771e-07, + "loss": 0.0002, + "reward": 1.785714328289032, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7946428805589676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5070 + }, + { + "completion_length": 244.90626049041748, + "epoch": 0.8503290163041201, + "grad_norm": 0.24690095810292512, + "kl": 0.053955078125, + "learning_rate": 4.999248540950853e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 5072 + }, + { + "completion_length": 253.98661613464355, + "epoch": 0.8506643195439876, + "grad_norm": 0.23389187074428028, + "kl": 0.1283416748046875, + "learning_rate": 4.999245597089799e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5074 + }, + { + "completion_length": 247.71875762939453, + "epoch": 0.8509996227838551, + "grad_norm": 0.8323127056861116, + "kl": 0.1199188232421875, + "learning_rate": 4.999242647474555e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7910714596509933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5076 + }, + { + "completion_length": 236.50000953674316, + "epoch": 0.8513349260237227, + "grad_norm": 0.18800976984029746, + "kl": 0.1182861328125, + "learning_rate": 4.99923969210513e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.753571467474103, + "rewards/format_reward_func": 1.0, + "step": 5078 + }, + { + "completion_length": 239.78126049041748, + "epoch": 0.8516702292635903, + "grad_norm": 0.2201142403006137, + "kl": 0.0549774169921875, + "learning_rate": 4.999236730981526e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 5080 + }, + { + "completion_length": 247.9062623977661, + "epoch": 0.8520055325034578, + "grad_norm": 4.761260457058568, + "kl": 0.383270263671875, + "learning_rate": 4.999233764103753e-07, + "loss": 0.0004, + "reward": 1.753571480512619, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7625000402331352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5082 + }, + { + "completion_length": 240.20983409881592, + "epoch": 0.8523408357433254, + "grad_norm": 0.126562800778106, + "kl": 0.554901123046875, + "learning_rate": 4.999230791471818e-07, + "loss": 0.0006, + "reward": 1.7714286521077156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 5084 + }, + { + "completion_length": 230.77233409881592, + "epoch": 0.8526761389831929, + "grad_norm": 0.29796437906708106, + "kl": 0.05060577392578125, + "learning_rate": 4.999227813085725e-07, + "loss": 0.0001, + "reward": 1.8285714834928513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 5086 + }, + { + "completion_length": 239.77679538726807, + "epoch": 0.8530114422230605, + "grad_norm": 0.30966121200378954, + "kl": 0.2183380126953125, + "learning_rate": 4.999224828945485e-07, + "loss": 0.0002, + "reward": 1.7946429029107094, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.799107164144516, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5088 + }, + { + "completion_length": 244.41519165039062, + "epoch": 0.853346745462928, + "grad_norm": 0.14491462362379912, + "kl": 0.11983489990234375, + "learning_rate": 4.999221839051102e-07, + "loss": 0.0001, + "reward": 1.7375000789761543, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5090 + }, + { + "completion_length": 234.41965293884277, + "epoch": 0.8536820487027956, + "grad_norm": 0.24775391265290617, + "kl": 0.1589813232421875, + "learning_rate": 4.999218843402584e-07, + "loss": 0.0002, + "reward": 1.718303643167019, + "reward_std": 0.0650285689625889, + "rewards/equation_reward_func": 0.724107164889574, + "rewards/format_reward_func": 0.9941964335739613, + "step": 5092 + }, + { + "completion_length": 236.27233123779297, + "epoch": 0.8540173519426631, + "grad_norm": 0.4428321684880137, + "kl": 0.21246337890625, + "learning_rate": 4.999215841999937e-07, + "loss": 0.0002, + "reward": 1.6910714730620384, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7223214618861675, + "rewards/format_reward_func": 0.9687500149011612, + "step": 5094 + }, + { + "completion_length": 236.79465293884277, + "epoch": 0.8543526551825307, + "grad_norm": 0.3418168828278188, + "kl": 0.14962005615234375, + "learning_rate": 4.999212834843169e-07, + "loss": 0.0001, + "reward": 1.798214353621006, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.8116071708500385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5096 + }, + { + "completion_length": 230.79911708831787, + "epoch": 0.8546879584223983, + "grad_norm": 0.20071501266811895, + "kl": 0.1268768310546875, + "learning_rate": 4.999209821932287e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 5098 + }, + { + "completion_length": 226.96875858306885, + "epoch": 0.8550232616622658, + "grad_norm": 0.17248835077943814, + "kl": 0.098907470703125, + "learning_rate": 4.999206803267296e-07, + "loss": 0.0001, + "reward": 1.689285822212696, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.6982143241912127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5100 + }, + { + "completion_length": 228.93751049041748, + "epoch": 0.8553585649021334, + "grad_norm": 0.30204625045537503, + "kl": 0.187408447265625, + "learning_rate": 4.999203778848206e-07, + "loss": 0.0002, + "reward": 1.7410714775323868, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357480794191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5102 + }, + { + "completion_length": 223.1071538925171, + "epoch": 0.8556938681420009, + "grad_norm": 0.783522810256943, + "kl": 2.531463623046875, + "learning_rate": 4.999200748675021e-07, + "loss": 0.0025, + "reward": 1.7250000908970833, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339285984635353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5104 + }, + { + "completion_length": 234.47768878936768, + "epoch": 0.8560291713818685, + "grad_norm": 0.24328087656232975, + "kl": 0.229888916015625, + "learning_rate": 4.99919771274775e-07, + "loss": 0.0002, + "reward": 1.7200893685221672, + "reward_std": 0.06250318791717291, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5106 + }, + { + "completion_length": 244.42858219146729, + "epoch": 0.856364474621736, + "grad_norm": 0.29232745791069964, + "kl": 1.5421829223632812, + "learning_rate": 4.999194671066398e-07, + "loss": 0.0015, + "reward": 1.721428632736206, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7303571812808514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5108 + }, + { + "completion_length": 252.26340675354004, + "epoch": 0.8566997778616036, + "grad_norm": 0.4569750985390259, + "kl": 1.1767578125, + "learning_rate": 4.999191623630974e-07, + "loss": 0.0012, + "reward": 1.6625000834465027, + "reward_std": 0.11364215891808271, + "rewards/equation_reward_func": 0.6848214641213417, + "rewards/format_reward_func": 0.977678582072258, + "step": 5110 + }, + { + "completion_length": 247.93751430511475, + "epoch": 0.8570350811014712, + "grad_norm": 0.34339815751092734, + "kl": 0.3735198974609375, + "learning_rate": 4.999188570441485e-07, + "loss": 0.0004, + "reward": 1.7035714611411095, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7214286047965288, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5112 + }, + { + "completion_length": 228.1562614440918, + "epoch": 0.8573703843413387, + "grad_norm": 0.24380516996038454, + "kl": 0.1014862060546875, + "learning_rate": 4.999185511497937e-07, + "loss": 0.0001, + "reward": 1.8160714954137802, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.8205357380211353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5114 + }, + { + "completion_length": 238.89286613464355, + "epoch": 0.8577056875812062, + "grad_norm": 0.2579426290533824, + "kl": 0.203277587890625, + "learning_rate": 4.999182446800336e-07, + "loss": 0.0002, + "reward": 1.732142947614193, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5116 + }, + { + "completion_length": 229.06251049041748, + "epoch": 0.8580409908210738, + "grad_norm": 0.39017097066306555, + "kl": 1.1303558349609375, + "learning_rate": 4.999179376348691e-07, + "loss": 0.0011, + "reward": 1.7343750596046448, + "reward_std": 0.08270624093711376, + "rewards/equation_reward_func": 0.7535714525729418, + "rewards/format_reward_func": 0.980803582817316, + "step": 5118 + }, + { + "completion_length": 226.82143878936768, + "epoch": 0.8583762940609414, + "grad_norm": 0.8587286627098762, + "kl": 0.360626220703125, + "learning_rate": 4.999176300143009e-07, + "loss": 0.0004, + "reward": 1.751785784959793, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5120 + }, + { + "completion_length": 229.28126049041748, + "epoch": 0.8587115973008089, + "grad_norm": 0.19873741773246498, + "kl": 0.1075286865234375, + "learning_rate": 4.999173218183296e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500186264515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5122 + }, + { + "completion_length": 227.12054538726807, + "epoch": 0.8590469005406765, + "grad_norm": 0.2766724367610363, + "kl": 0.2878875732421875, + "learning_rate": 4.99917013046956e-07, + "loss": 0.0003, + "reward": 1.698214367032051, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7026786021888256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5124 + }, + { + "completion_length": 225.43750858306885, + "epoch": 0.8593822037805441, + "grad_norm": 0.45199430854316097, + "kl": 0.27734375, + "learning_rate": 4.999167037001807e-07, + "loss": 0.0003, + "reward": 1.7625000849366188, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5126 + }, + { + "completion_length": 220.16965198516846, + "epoch": 0.8597175070204116, + "grad_norm": 0.1847114816856559, + "kl": 0.318359375, + "learning_rate": 4.999163937780046e-07, + "loss": 0.0003, + "reward": 1.8035714775323868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 5128 + }, + { + "completion_length": 227.0312623977661, + "epoch": 0.8600528102602791, + "grad_norm": 0.34900171200182317, + "kl": 0.3245086669921875, + "learning_rate": 4.999160832804282e-07, + "loss": 0.0003, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 5130 + }, + { + "completion_length": 218.93304634094238, + "epoch": 0.8603881135001467, + "grad_norm": 0.2367511358892103, + "kl": 0.117279052734375, + "learning_rate": 4.999157722074524e-07, + "loss": 0.0001, + "reward": 1.8022321909666061, + "reward_std": 0.027147848159074783, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5132 + }, + { + "completion_length": 219.32590198516846, + "epoch": 0.8607234167400143, + "grad_norm": 0.1688927659968083, + "kl": 0.1425018310546875, + "learning_rate": 4.999154605590778e-07, + "loss": 0.0001, + "reward": 1.8178571909666061, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 5134 + }, + { + "completion_length": 227.3794755935669, + "epoch": 0.8610587199798818, + "grad_norm": 0.19784225720299922, + "kl": 0.2835845947265625, + "learning_rate": 4.999151483353052e-07, + "loss": 0.0003, + "reward": 1.7660714834928513, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5136 + }, + { + "completion_length": 242.0803680419922, + "epoch": 0.8613940232197493, + "grad_norm": 0.26270531928407537, + "kl": 0.5782623291015625, + "learning_rate": 4.999148355361351e-07, + "loss": 0.0006, + "reward": 1.775000050663948, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 5138 + }, + { + "completion_length": 238.0357265472412, + "epoch": 0.8617293264596169, + "grad_norm": 0.22026295954848013, + "kl": 0.259002685546875, + "learning_rate": 4.999145221615685e-07, + "loss": 0.0003, + "reward": 1.7678572088479996, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7767857350409031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5140 + }, + { + "completion_length": 228.8259048461914, + "epoch": 0.8620646296994845, + "grad_norm": 0.14104751882657987, + "kl": 0.0952606201171875, + "learning_rate": 4.99914208211606e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 5142 + }, + { + "completion_length": 236.7812614440918, + "epoch": 0.862399932939352, + "grad_norm": 0.2925028960594958, + "kl": 0.1213836669921875, + "learning_rate": 4.999138936862484e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7803571708500385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5144 + }, + { + "completion_length": 242.68751049041748, + "epoch": 0.8627352361792195, + "grad_norm": 0.4819099304816873, + "kl": 0.167633056640625, + "learning_rate": 4.999135785854962e-07, + "loss": 0.0002, + "reward": 1.7214286401867867, + "reward_std": 0.12121830228716135, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5146 + }, + { + "completion_length": 249.82143878936768, + "epoch": 0.8630705394190872, + "grad_norm": 0.25499206507829786, + "kl": 0.2075958251953125, + "learning_rate": 4.999132629093503e-07, + "loss": 0.0002, + "reward": 1.7035714983940125, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7125000450760126, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5148 + }, + { + "completion_length": 238.89733409881592, + "epoch": 0.8634058426589547, + "grad_norm": 0.2327296549492035, + "kl": 0.1017913818359375, + "learning_rate": 4.999129466578116e-07, + "loss": 0.0001, + "reward": 1.6696429401636124, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.6741071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5150 + }, + { + "completion_length": 247.5759038925171, + "epoch": 0.8637411458988222, + "grad_norm": 0.20944422452788633, + "kl": 0.100006103515625, + "learning_rate": 4.999126298308805e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5152 + }, + { + "completion_length": 244.74108409881592, + "epoch": 0.8640764491386898, + "grad_norm": 0.3227991323663998, + "kl": 0.1121673583984375, + "learning_rate": 4.999123124285578e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 5154 + }, + { + "completion_length": 246.30358219146729, + "epoch": 0.8644117523785574, + "grad_norm": 0.6971793031048401, + "kl": 0.1251983642578125, + "learning_rate": 4.999119944508445e-07, + "loss": 0.0001, + "reward": 1.7071429416537285, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7160714715719223, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5156 + }, + { + "completion_length": 241.41518688201904, + "epoch": 0.8647470556184249, + "grad_norm": 0.30123485204649847, + "kl": 0.0756683349609375, + "learning_rate": 4.999116758977409e-07, + "loss": 0.0001, + "reward": 1.7232143878936768, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5158 + }, + { + "completion_length": 242.10715293884277, + "epoch": 0.8650823588582924, + "grad_norm": 0.19740171135735748, + "kl": 0.0821380615234375, + "learning_rate": 4.999113567692481e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 5160 + }, + { + "completion_length": 241.02679634094238, + "epoch": 0.8654176620981601, + "grad_norm": 0.34486952501648477, + "kl": 0.049346923828125, + "learning_rate": 4.999110370653667e-07, + "loss": 0.0, + "reward": 1.7839286476373672, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7883928846567869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5162 + }, + { + "completion_length": 235.915189743042, + "epoch": 0.8657529653380276, + "grad_norm": 0.23825360495441034, + "kl": 0.05047607421875, + "learning_rate": 4.999107167860973e-07, + "loss": 0.0001, + "reward": 1.7946429327130318, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071604192257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5164 + }, + { + "completion_length": 237.2946538925171, + "epoch": 0.8660882685778951, + "grad_norm": 0.23002121289385286, + "kl": 0.0514068603515625, + "learning_rate": 4.999103959314409e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 5166 + }, + { + "completion_length": 238.7589406967163, + "epoch": 0.8664235718177626, + "grad_norm": 0.7932820511428698, + "kl": 0.05147552490234375, + "learning_rate": 4.999100745013981e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5168 + }, + { + "completion_length": 236.88840675354004, + "epoch": 0.8667588750576303, + "grad_norm": 0.27955379277920045, + "kl": 0.0646514892578125, + "learning_rate": 4.999097524959695e-07, + "loss": 0.0001, + "reward": 1.691071517765522, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6955357603728771, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5170 + }, + { + "completion_length": 256.50893783569336, + "epoch": 0.8670941782974978, + "grad_norm": 0.29478152619315, + "kl": 0.074310302734375, + "learning_rate": 4.999094299151562e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285898953676, + "rewards/format_reward_func": 1.0, + "step": 5172 + }, + { + "completion_length": 235.45983123779297, + "epoch": 0.8674294815373653, + "grad_norm": 0.32795899856400945, + "kl": 0.0535736083984375, + "learning_rate": 4.999091067589585e-07, + "loss": 0.0001, + "reward": 1.725000061094761, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 5174 + }, + { + "completion_length": 237.6116180419922, + "epoch": 0.867764784777233, + "grad_norm": 0.19602738115025517, + "kl": 0.0596466064453125, + "learning_rate": 4.999087830273777e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 5176 + }, + { + "completion_length": 242.0625123977661, + "epoch": 0.8681000880171005, + "grad_norm": 0.5267813801735062, + "kl": 0.0671844482421875, + "learning_rate": 4.99908458720414e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7464285995811224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5178 + }, + { + "completion_length": 230.95536994934082, + "epoch": 0.868435391256968, + "grad_norm": 0.24433297853268698, + "kl": 0.064056396484375, + "learning_rate": 4.999081338380684e-07, + "loss": 0.0001, + "reward": 1.7500000968575478, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7580357417464256, + "rewards/format_reward_func": 0.9919642880558968, + "step": 5180 + }, + { + "completion_length": 246.0982255935669, + "epoch": 0.8687706944968355, + "grad_norm": 0.8587767902492196, + "kl": 0.1986541748046875, + "learning_rate": 4.999078083803416e-07, + "loss": 0.0002, + "reward": 1.7321429252624512, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7410714663565159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5182 + }, + { + "completion_length": 231.10715293884277, + "epoch": 0.8691059977367032, + "grad_norm": 0.22419756108609906, + "kl": 0.0513763427734375, + "learning_rate": 4.999074823472344e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.728571455925703, + "rewards/format_reward_func": 1.0, + "step": 5184 + }, + { + "completion_length": 234.24108219146729, + "epoch": 0.8694413009765707, + "grad_norm": 0.2572663801825886, + "kl": 0.12969970703125, + "learning_rate": 4.999071557387475e-07, + "loss": 0.0001, + "reward": 1.7267857939004898, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5186 + }, + { + "completion_length": 237.3482265472412, + "epoch": 0.8697766042164382, + "grad_norm": 0.2707232720959836, + "kl": 0.117584228515625, + "learning_rate": 4.999068285548816e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5188 + }, + { + "completion_length": 229.05804538726807, + "epoch": 0.8701119074563058, + "grad_norm": 0.1857235486891341, + "kl": 0.05426025390625, + "learning_rate": 4.999065007956377e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 5190 + }, + { + "completion_length": 238.2723331451416, + "epoch": 0.8704472106961734, + "grad_norm": 0.4293622096350722, + "kl": 0.111297607421875, + "learning_rate": 4.999061724610163e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5192 + }, + { + "completion_length": 236.36161994934082, + "epoch": 0.8707825139360409, + "grad_norm": 0.2534201839770564, + "kl": 0.2327423095703125, + "learning_rate": 4.999058435510182e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857655882835, + "rewards/format_reward_func": 1.0, + "step": 5194 + }, + { + "completion_length": 242.7053689956665, + "epoch": 0.8711178171759084, + "grad_norm": 0.4089098177812318, + "kl": 0.1260986328125, + "learning_rate": 4.999055140656442e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 5196 + }, + { + "completion_length": 242.01786708831787, + "epoch": 0.871453120415776, + "grad_norm": 0.2138423740410924, + "kl": 0.1017303466796875, + "learning_rate": 4.99905184004895e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 5198 + }, + { + "completion_length": 246.15179824829102, + "epoch": 0.8717884236556436, + "grad_norm": 0.25927983778414143, + "kl": 0.0787200927734375, + "learning_rate": 4.999048533687715e-07, + "loss": 0.0001, + "reward": 1.6803572252392769, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.6937500238418579, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5200 + }, + { + "completion_length": 241.31250953674316, + "epoch": 0.8721237268955111, + "grad_norm": 0.25433932775478646, + "kl": 0.0538330078125, + "learning_rate": 4.999045221572743e-07, + "loss": 0.0001, + "reward": 1.705357238650322, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7098214626312256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5202 + }, + { + "completion_length": 240.00447368621826, + "epoch": 0.8724590301353787, + "grad_norm": 0.1208999441426067, + "kl": 0.084716796875, + "learning_rate": 4.999041903704043e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000331550837, + "rewards/format_reward_func": 1.0, + "step": 5204 + }, + { + "completion_length": 248.1384048461914, + "epoch": 0.8727943333752463, + "grad_norm": 0.347887490986824, + "kl": 0.2652587890625, + "learning_rate": 4.999038580081621e-07, + "loss": 0.0003, + "reward": 1.721428669989109, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214286029338837, + "rewards/format_reward_func": 1.0, + "step": 5206 + }, + { + "completion_length": 241.5580472946167, + "epoch": 0.8731296366151138, + "grad_norm": 0.11248959369685928, + "kl": 0.09877777099609375, + "learning_rate": 4.999035250705486e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 5208 + }, + { + "completion_length": 246.4553680419922, + "epoch": 0.8734649398549813, + "grad_norm": 0.20355932834966817, + "kl": 0.119964599609375, + "learning_rate": 4.999031915575645e-07, + "loss": 0.0001, + "reward": 1.678571492433548, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.6785714589059353, + "rewards/format_reward_func": 1.0, + "step": 5210 + }, + { + "completion_length": 240.62501335144043, + "epoch": 0.8738002430948489, + "grad_norm": 0.3614959079050482, + "kl": 0.0868377685546875, + "learning_rate": 4.999028574692107e-07, + "loss": 0.0001, + "reward": 1.7232143729925156, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7276785932481289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5212 + }, + { + "completion_length": 232.2321548461914, + "epoch": 0.8741355463347165, + "grad_norm": 0.27345804390142686, + "kl": 0.267913818359375, + "learning_rate": 4.999025228054878e-07, + "loss": 0.0003, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 5214 + }, + { + "completion_length": 243.81250953674316, + "epoch": 0.874470849574584, + "grad_norm": 0.25686369513436424, + "kl": 0.05169677734375, + "learning_rate": 4.999021875663967e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321429010480642, + "rewards/format_reward_func": 1.0, + "step": 5216 + }, + { + "completion_length": 236.5892972946167, + "epoch": 0.8748061528144516, + "grad_norm": 0.2938128211300729, + "kl": 0.0554046630859375, + "learning_rate": 4.99901851751938e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 5218 + }, + { + "completion_length": 238.8482255935669, + "epoch": 0.8751414560543191, + "grad_norm": 0.2940552545555363, + "kl": 0.202545166015625, + "learning_rate": 4.999015153621126e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 5220 + }, + { + "completion_length": 235.82143783569336, + "epoch": 0.8754767592941867, + "grad_norm": 0.18830219720647964, + "kl": 0.124786376953125, + "learning_rate": 4.999011783969213e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 5222 + }, + { + "completion_length": 245.26786994934082, + "epoch": 0.8758120625340542, + "grad_norm": 0.38066338937574323, + "kl": 0.1001739501953125, + "learning_rate": 4.999008408563649e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7410714514553547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5224 + }, + { + "completion_length": 233.4732255935669, + "epoch": 0.8761473657739218, + "grad_norm": 0.4616713763436838, + "kl": 0.1026153564453125, + "learning_rate": 4.999005027404439e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 5226 + }, + { + "completion_length": 239.80358409881592, + "epoch": 0.8764826690137894, + "grad_norm": 0.2662263067010209, + "kl": 0.0876007080078125, + "learning_rate": 4.999001640491595e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5228 + }, + { + "completion_length": 238.1294755935669, + "epoch": 0.8768179722536569, + "grad_norm": 0.2532932673862093, + "kl": 0.064971923828125, + "learning_rate": 4.998998247825121e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 5230 + }, + { + "completion_length": 239.14733219146729, + "epoch": 0.8771532754935245, + "grad_norm": 0.22009531892447384, + "kl": 0.06134033203125, + "learning_rate": 4.998994849405027e-07, + "loss": 0.0001, + "reward": 1.780357226729393, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5232 + }, + { + "completion_length": 242.4062614440918, + "epoch": 0.877488578733392, + "grad_norm": 0.1460440667442655, + "kl": 0.0941925048828125, + "learning_rate": 4.99899144523132e-07, + "loss": 0.0001, + "reward": 1.705357201397419, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7187500353902578, + "rewards/format_reward_func": 0.9866071455180645, + "step": 5234 + }, + { + "completion_length": 239.28126049041748, + "epoch": 0.8778238819732596, + "grad_norm": 0.25190830542060527, + "kl": 0.0667724609375, + "learning_rate": 4.998988035304009e-07, + "loss": 0.0001, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5236 + }, + { + "completion_length": 244.16518783569336, + "epoch": 0.8781591852131271, + "grad_norm": 0.17071947606065108, + "kl": 0.089874267578125, + "learning_rate": 4.998984619623101e-07, + "loss": 0.0001, + "reward": 1.7339286655187607, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5238 + }, + { + "completion_length": 230.227689743042, + "epoch": 0.8784944884529947, + "grad_norm": 0.3664008933351735, + "kl": 0.0554656982421875, + "learning_rate": 4.998981198188603e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 5240 + }, + { + "completion_length": 229.21876049041748, + "epoch": 0.8788297916928622, + "grad_norm": 0.21221439980005558, + "kl": 0.06195068359375, + "learning_rate": 4.998977771000525e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.764285733923316, + "rewards/format_reward_func": 1.0, + "step": 5242 + }, + { + "completion_length": 234.59376049041748, + "epoch": 0.8791650949327298, + "grad_norm": 0.3746951037284132, + "kl": 0.07000732421875, + "learning_rate": 4.998974338058872e-07, + "loss": 0.0001, + "reward": 1.6857143566012383, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6857143249362707, + "rewards/format_reward_func": 1.0, + "step": 5244 + }, + { + "completion_length": 242.1696538925171, + "epoch": 0.8795003981725974, + "grad_norm": 0.18881294484194702, + "kl": 0.0675506591796875, + "learning_rate": 4.998970899363655e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 5246 + }, + { + "completion_length": 226.1384038925171, + "epoch": 0.8798357014124649, + "grad_norm": 0.22507211877639, + "kl": 0.07757568359375, + "learning_rate": 4.998967454914879e-07, + "loss": 0.0001, + "reward": 1.830357201397419, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8348214440047741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5248 + }, + { + "completion_length": 245.1026906967163, + "epoch": 0.8801710046523324, + "grad_norm": 0.22393823771667665, + "kl": 0.06451416015625, + "learning_rate": 4.998964004712555e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 5250 + }, + { + "completion_length": 232.92411708831787, + "epoch": 0.8805063078922, + "grad_norm": 0.2144873023449417, + "kl": 0.0620269775390625, + "learning_rate": 4.998960548756689e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5252 + }, + { + "completion_length": 232.87054347991943, + "epoch": 0.8808416111320676, + "grad_norm": 0.1989102781559067, + "kl": 0.05689239501953125, + "learning_rate": 4.998957087047288e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 5254 + }, + { + "completion_length": 237.9062614440918, + "epoch": 0.8811769143719351, + "grad_norm": 0.2512641577169618, + "kl": 0.058868408203125, + "learning_rate": 4.998953619584363e-07, + "loss": 0.0001, + "reward": 1.7803572118282318, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7848214693367481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5256 + }, + { + "completion_length": 244.99108219146729, + "epoch": 0.8815122176118027, + "grad_norm": 0.25304485784456665, + "kl": 0.0676422119140625, + "learning_rate": 4.998950146367918e-07, + "loss": 0.0001, + "reward": 1.7053572162985802, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214570432901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5258 + }, + { + "completion_length": 236.65179538726807, + "epoch": 0.8818475208516702, + "grad_norm": 0.2993844993610931, + "kl": 0.0611572265625, + "learning_rate": 4.998946667397966e-07, + "loss": 0.0001, + "reward": 1.7357143834233284, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 5260 + }, + { + "completion_length": 235.5044755935669, + "epoch": 0.8821828240915378, + "grad_norm": 0.30016998238421877, + "kl": 0.0649871826171875, + "learning_rate": 4.99894318267451e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 5262 + }, + { + "completion_length": 239.3973331451416, + "epoch": 0.8825181273314053, + "grad_norm": 0.3114488879046236, + "kl": 0.08050537109375, + "learning_rate": 4.99893969219756e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7982143089175224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5264 + }, + { + "completion_length": 230.040189743042, + "epoch": 0.8828534305712729, + "grad_norm": 0.3517487633510106, + "kl": 0.0715179443359375, + "learning_rate": 4.998936195967126e-07, + "loss": 0.0001, + "reward": 1.7236607894301414, + "reward_std": 0.06755394977517426, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5266 + }, + { + "completion_length": 235.5178680419922, + "epoch": 0.8831887338111405, + "grad_norm": 0.41806628054869266, + "kl": 0.07794189453125, + "learning_rate": 4.998932693983213e-07, + "loss": 0.0001, + "reward": 1.7053572461009026, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7098214700818062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5268 + }, + { + "completion_length": 232.10268783569336, + "epoch": 0.883524037051008, + "grad_norm": 0.2858866580292781, + "kl": 0.08392333984375, + "learning_rate": 4.998929186245832e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 5270 + }, + { + "completion_length": 233.68751049041748, + "epoch": 0.8838593402908755, + "grad_norm": 0.25485525873994247, + "kl": 0.0716400146484375, + "learning_rate": 4.998925672754987e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 5272 + }, + { + "completion_length": 226.62947463989258, + "epoch": 0.8841946435307431, + "grad_norm": 0.17777709268158565, + "kl": 0.0927581787109375, + "learning_rate": 4.998922153510691e-07, + "loss": 0.0001, + "reward": 1.807142935693264, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 5274 + }, + { + "completion_length": 240.352689743042, + "epoch": 0.8845299467706107, + "grad_norm": 0.18581282838696533, + "kl": 0.2295989990234375, + "learning_rate": 4.998918628512949e-07, + "loss": 0.0002, + "reward": 1.7089286372065544, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5276 + }, + { + "completion_length": 233.8303680419922, + "epoch": 0.8848652500104782, + "grad_norm": 0.3049793148610497, + "kl": 0.0879058837890625, + "learning_rate": 4.998915097761769e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 5278 + }, + { + "completion_length": 232.40179443359375, + "epoch": 0.8852005532503457, + "grad_norm": 0.09532660700253795, + "kl": 0.06298828125, + "learning_rate": 4.998911561257161e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.807142898440361, + "rewards/format_reward_func": 1.0, + "step": 5280 + }, + { + "completion_length": 226.49554634094238, + "epoch": 0.8855358564902134, + "grad_norm": 0.2932981971207065, + "kl": 0.06439208984375, + "learning_rate": 4.998908018999131e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428786963224, + "rewards/format_reward_func": 1.0, + "step": 5282 + }, + { + "completion_length": 238.65179634094238, + "epoch": 0.8858711597300809, + "grad_norm": 0.32438201855121496, + "kl": 0.0819091796875, + "learning_rate": 4.998904470987689e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.767857177183032, + "rewards/format_reward_func": 1.0, + "step": 5284 + }, + { + "completion_length": 238.85268878936768, + "epoch": 0.8862064629699484, + "grad_norm": 0.3012234729171621, + "kl": 0.08880615234375, + "learning_rate": 4.998900917222842e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 5286 + }, + { + "completion_length": 236.5580472946167, + "epoch": 0.886541766209816, + "grad_norm": 0.2963282908676942, + "kl": 0.07366943359375, + "learning_rate": 4.998897357704598e-07, + "loss": 0.0001, + "reward": 1.8017857819795609, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5288 + }, + { + "completion_length": 233.37054538726807, + "epoch": 0.8868770694496836, + "grad_norm": 0.3279870054415193, + "kl": 0.06890869140625, + "learning_rate": 4.998893792432966e-07, + "loss": 0.0001, + "reward": 1.7642857506871223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857413738966, + "rewards/format_reward_func": 1.0, + "step": 5290 + }, + { + "completion_length": 252.91072463989258, + "epoch": 0.8872123726895511, + "grad_norm": 0.27756577372857344, + "kl": 0.121978759765625, + "learning_rate": 4.998890221407956e-07, + "loss": 0.0001, + "reward": 1.748214341700077, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785973459482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5292 + }, + { + "completion_length": 236.54018688201904, + "epoch": 0.8875476759294186, + "grad_norm": 0.2331425793873391, + "kl": 0.1655120849609375, + "learning_rate": 4.998886644629572e-07, + "loss": 0.0002, + "reward": 1.7857143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 5294 + }, + { + "completion_length": 243.5491189956665, + "epoch": 0.8878829791692863, + "grad_norm": 0.16776429680449167, + "kl": 0.079864501953125, + "learning_rate": 4.998883062097824e-07, + "loss": 0.0001, + "reward": 1.7705357745289803, + "reward_std": 0.0416687922552228, + "rewards/equation_reward_func": 0.7767857387661934, + "rewards/format_reward_func": 0.9937500059604645, + "step": 5296 + }, + { + "completion_length": 240.41965293884277, + "epoch": 0.8882182824091538, + "grad_norm": 0.8081502933749403, + "kl": 0.257476806640625, + "learning_rate": 4.998879473812722e-07, + "loss": 0.0003, + "reward": 1.8214286267757416, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 5298 + }, + { + "completion_length": 244.4687614440918, + "epoch": 0.8885535856490213, + "grad_norm": 0.14530388506531866, + "kl": 0.0879058837890625, + "learning_rate": 4.998875879774273e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 5300 + }, + { + "completion_length": 229.28126335144043, + "epoch": 0.8888888888888888, + "grad_norm": 0.2260688976115879, + "kl": 0.09368896484375, + "learning_rate": 4.998872279982485e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 5302 + }, + { + "completion_length": 234.7812614440918, + "epoch": 0.8892241921287565, + "grad_norm": 0.1600646981940207, + "kl": 0.0696563720703125, + "learning_rate": 4.998868674437365e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571622818708, + "rewards/format_reward_func": 1.0, + "step": 5304 + }, + { + "completion_length": 245.08929824829102, + "epoch": 0.889559495368624, + "grad_norm": 0.20507006335606773, + "kl": 0.068084716796875, + "learning_rate": 4.998865063138926e-07, + "loss": 0.0001, + "reward": 1.7482143715023994, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5306 + }, + { + "completion_length": 236.44197750091553, + "epoch": 0.8898947986084915, + "grad_norm": 0.24173203157702566, + "kl": 0.0780029296875, + "learning_rate": 4.99886144608717e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7901786025613546, + "rewards/format_reward_func": 0.9919642880558968, + "step": 5308 + }, + { + "completion_length": 243.21876335144043, + "epoch": 0.8902301018483592, + "grad_norm": 0.2508191564510342, + "kl": 0.076171875, + "learning_rate": 4.99885782328211e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857596278191, + "rewards/format_reward_func": 1.0, + "step": 5310 + }, + { + "completion_length": 246.66518688201904, + "epoch": 0.8905654050882267, + "grad_norm": 0.19502896555021526, + "kl": 0.07415771484375, + "learning_rate": 4.998854194723752e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 5312 + }, + { + "completion_length": 239.36608028411865, + "epoch": 0.8909007083280942, + "grad_norm": 0.2431954955354549, + "kl": 0.072662353515625, + "learning_rate": 4.998850560412106e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 5314 + }, + { + "completion_length": 261.59376335144043, + "epoch": 0.8912360115679617, + "grad_norm": 0.23926305068072765, + "kl": 0.07647705078125, + "learning_rate": 4.998846920347178e-07, + "loss": 0.0001, + "reward": 1.7732143849134445, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776785865426064, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5316 + }, + { + "completion_length": 253.9419755935669, + "epoch": 0.8915713148078294, + "grad_norm": 0.2160253207774375, + "kl": 0.0731964111328125, + "learning_rate": 4.99884327452898e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 5318 + }, + { + "completion_length": 250.19197750091553, + "epoch": 0.8919066180476969, + "grad_norm": 0.20380295428031342, + "kl": 0.072662353515625, + "learning_rate": 4.998839622957517e-07, + "loss": 0.0001, + "reward": 1.7285715267062187, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 5320 + }, + { + "completion_length": 243.31697463989258, + "epoch": 0.8922419212875644, + "grad_norm": 0.22780821035437138, + "kl": 0.084014892578125, + "learning_rate": 4.998835965632798e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7169643081724644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5322 + }, + { + "completion_length": 252.35715293884277, + "epoch": 0.892577224527432, + "grad_norm": 0.22647910989616787, + "kl": 0.07513427734375, + "learning_rate": 4.998832302554834e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 5324 + }, + { + "completion_length": 255.96876430511475, + "epoch": 0.8929125277672996, + "grad_norm": 0.27754938674440754, + "kl": 0.08740234375, + "learning_rate": 4.99882863372363e-07, + "loss": 0.0001, + "reward": 1.7375000789761543, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5326 + }, + { + "completion_length": 259.9509048461914, + "epoch": 0.8932478310071671, + "grad_norm": 0.1811040255508447, + "kl": 0.084320068359375, + "learning_rate": 4.998824959139196e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143152505159, + "rewards/format_reward_func": 1.0, + "step": 5328 + }, + { + "completion_length": 248.4196548461914, + "epoch": 0.8935831342470346, + "grad_norm": 0.5155124794678484, + "kl": 0.07611083984375, + "learning_rate": 4.998821278801542e-07, + "loss": 0.0001, + "reward": 1.719642959535122, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5330 + }, + { + "completion_length": 250.12947940826416, + "epoch": 0.8939184374869023, + "grad_norm": 0.24714452364489936, + "kl": 0.078399658203125, + "learning_rate": 4.998817592710674e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 5332 + }, + { + "completion_length": 246.7901906967163, + "epoch": 0.8942537407267698, + "grad_norm": 0.26676101095918225, + "kl": 0.0773773193359375, + "learning_rate": 4.998813900866601e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 5334 + }, + { + "completion_length": 250.9821538925171, + "epoch": 0.8945890439666373, + "grad_norm": 0.839157664979466, + "kl": 0.082855224609375, + "learning_rate": 4.998810203269333e-07, + "loss": 0.0001, + "reward": 1.7392858117818832, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5336 + }, + { + "completion_length": 252.33483123779297, + "epoch": 0.8949243472065049, + "grad_norm": 0.12935000666624785, + "kl": 0.0791015625, + "learning_rate": 4.998806499918876e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 5338 + }, + { + "completion_length": 246.33036708831787, + "epoch": 0.8952596504463725, + "grad_norm": 0.10639964651343883, + "kl": 0.09002685546875, + "learning_rate": 4.998802790815241e-07, + "loss": 0.0001, + "reward": 1.8017857670783997, + "reward_std": 0.04798224661499262, + "rewards/equation_reward_func": 0.8151785917580128, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5340 + }, + { + "completion_length": 248.7634048461914, + "epoch": 0.89559495368624, + "grad_norm": 0.33624572209025144, + "kl": 0.0796356201171875, + "learning_rate": 4.998799075958435e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7464285995811224, + "rewards/format_reward_func": 1.0, + "step": 5342 + }, + { + "completion_length": 259.901798248291, + "epoch": 0.8959302569261075, + "grad_norm": 0.1254125349254734, + "kl": 0.082733154296875, + "learning_rate": 4.998795355348467e-07, + "loss": 0.0001, + "reward": 1.7375000566244125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5344 + }, + { + "completion_length": 263.9285840988159, + "epoch": 0.8962655601659751, + "grad_norm": 0.31235861346966837, + "kl": 0.1109619140625, + "learning_rate": 4.998791628985346e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7383928820490837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5346 + }, + { + "completion_length": 242.00447750091553, + "epoch": 0.8966008634058427, + "grad_norm": 0.1923375555144056, + "kl": 0.08599853515625, + "learning_rate": 4.99878789686908e-07, + "loss": 0.0001, + "reward": 1.7517857626080513, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5348 + }, + { + "completion_length": 253.73215293884277, + "epoch": 0.8969361666457102, + "grad_norm": 0.3144491273579619, + "kl": 0.157440185546875, + "learning_rate": 4.998784158999677e-07, + "loss": 0.0002, + "reward": 1.7482143566012383, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526785880327225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5350 + }, + { + "completion_length": 258.86608505249023, + "epoch": 0.8972714698855778, + "grad_norm": 0.3800286104807005, + "kl": 0.0876312255859375, + "learning_rate": 4.998780415377148e-07, + "loss": 0.0001, + "reward": 1.7946428880095482, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5352 + }, + { + "completion_length": 255.0178680419922, + "epoch": 0.8976067731254453, + "grad_norm": 0.5843896226137154, + "kl": 0.1268310546875, + "learning_rate": 4.998776666001499e-07, + "loss": 0.0001, + "reward": 1.6750000715255737, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.6839286163449287, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5354 + }, + { + "completion_length": 252.58929634094238, + "epoch": 0.8979420763653129, + "grad_norm": 0.3539484283441876, + "kl": 0.2587127685546875, + "learning_rate": 4.998772910872739e-07, + "loss": 0.0003, + "reward": 1.7035714909434319, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7125000301748514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5356 + }, + { + "completion_length": 254.33929538726807, + "epoch": 0.8982773796051804, + "grad_norm": 0.46943661426880745, + "kl": 0.193206787109375, + "learning_rate": 4.998769149990878e-07, + "loss": 0.0002, + "reward": 1.7928571999073029, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 5358 + }, + { + "completion_length": 244.76786994934082, + "epoch": 0.898612682845048, + "grad_norm": 0.12745735511328252, + "kl": 0.22625732421875, + "learning_rate": 4.998765383355924e-07, + "loss": 0.0002, + "reward": 1.753571517765522, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 5360 + }, + { + "completion_length": 254.78572750091553, + "epoch": 0.8989479860849156, + "grad_norm": 0.20917256499861125, + "kl": 0.3976593017578125, + "learning_rate": 4.998761610967885e-07, + "loss": 0.0004, + "reward": 1.7982143610715866, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5362 + }, + { + "completion_length": 242.13393688201904, + "epoch": 0.8992832893247831, + "grad_norm": 0.4313738380742602, + "kl": 0.1184539794921875, + "learning_rate": 4.998757832826772e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 5364 + }, + { + "completion_length": 255.21875953674316, + "epoch": 0.8996185925646507, + "grad_norm": 0.5463979836566241, + "kl": 0.6548309326171875, + "learning_rate": 4.99875404893259e-07, + "loss": 0.0007, + "reward": 1.7017857879400253, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.706250037997961, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5366 + }, + { + "completion_length": 249.6517972946167, + "epoch": 0.8999538958045182, + "grad_norm": 0.2889413935758667, + "kl": 0.3985443115234375, + "learning_rate": 4.998750259285351e-07, + "loss": 0.0004, + "reward": 1.7428572177886963, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571656346321, + "rewards/format_reward_func": 1.0, + "step": 5368 + }, + { + "completion_length": 246.79912090301514, + "epoch": 0.9002891990443858, + "grad_norm": 0.31076407243427856, + "kl": 0.1228179931640625, + "learning_rate": 4.998746463885062e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5370 + }, + { + "completion_length": 247.64733123779297, + "epoch": 0.9006245022842533, + "grad_norm": 0.4925333749051105, + "kl": 1.430389404296875, + "learning_rate": 4.998742662731732e-07, + "loss": 0.0014, + "reward": 1.79464291036129, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071566939354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5372 + }, + { + "completion_length": 244.7232265472412, + "epoch": 0.9009598055241209, + "grad_norm": 0.20763244478129034, + "kl": 1.0489349365234375, + "learning_rate": 4.998738855825371e-07, + "loss": 0.001, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 5374 + }, + { + "completion_length": 245.29911518096924, + "epoch": 0.9012951087639884, + "grad_norm": 0.7021375598228516, + "kl": 0.698516845703125, + "learning_rate": 4.998735043165986e-07, + "loss": 0.0007, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 5376 + }, + { + "completion_length": 257.50894260406494, + "epoch": 0.901630412003856, + "grad_norm": 0.7357447369391495, + "kl": 0.44970703125, + "learning_rate": 4.998731224753586e-07, + "loss": 0.0004, + "reward": 1.7321429327130318, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7410714663565159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5378 + }, + { + "completion_length": 247.5491189956665, + "epoch": 0.9019657152437235, + "grad_norm": 0.47121200401910107, + "kl": 1.12872314453125, + "learning_rate": 4.99872740058818e-07, + "loss": 0.0011, + "reward": 1.7125000804662704, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643118977547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5380 + }, + { + "completion_length": 247.50893783569336, + "epoch": 0.9023010184835911, + "grad_norm": 0.3110160210673541, + "kl": 0.1286468505859375, + "learning_rate": 4.998723570669778e-07, + "loss": 0.0001, + "reward": 1.7071429416537285, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7071428969502449, + "rewards/format_reward_func": 1.0, + "step": 5382 + }, + { + "completion_length": 255.0625123977661, + "epoch": 0.9026363217234586, + "grad_norm": 0.33694169060014484, + "kl": 0.8502655029296875, + "learning_rate": 4.998719734998387e-07, + "loss": 0.0008, + "reward": 1.6785715445876122, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.6875000335276127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5384 + }, + { + "completion_length": 247.7053680419922, + "epoch": 0.9029716249633262, + "grad_norm": 0.17383252788350392, + "kl": 0.1181640625, + "learning_rate": 4.998715893574018e-07, + "loss": 0.0001, + "reward": 1.721428669989109, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 5386 + }, + { + "completion_length": 249.31697463989258, + "epoch": 0.9033069282031938, + "grad_norm": 0.2062389071756386, + "kl": 0.0739898681640625, + "learning_rate": 4.998712046396677e-07, + "loss": 0.0001, + "reward": 1.7607143223285675, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 5388 + }, + { + "completion_length": 253.4375114440918, + "epoch": 0.9036422314430613, + "grad_norm": 0.1814458101267558, + "kl": 0.0583648681640625, + "learning_rate": 4.998708193466375e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7312500439584255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5390 + }, + { + "completion_length": 239.55804824829102, + "epoch": 0.9039775346829289, + "grad_norm": 0.29584868151822824, + "kl": 0.0521392822265625, + "learning_rate": 4.998704334783121e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857361584902, + "rewards/format_reward_func": 1.0, + "step": 5392 + }, + { + "completion_length": 250.7455472946167, + "epoch": 0.9043128379227964, + "grad_norm": 0.23987515671916398, + "kl": 0.0930328369140625, + "learning_rate": 4.998700470346923e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000264495611, + "rewards/format_reward_func": 1.0, + "step": 5394 + }, + { + "completion_length": 236.6741189956665, + "epoch": 0.904648141162664, + "grad_norm": 0.27100398738808573, + "kl": 0.0775299072265625, + "learning_rate": 4.998696600157789e-07, + "loss": 0.0001, + "reward": 1.7142858132719994, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7142857387661934, + "rewards/format_reward_func": 1.0, + "step": 5396 + }, + { + "completion_length": 249.26340007781982, + "epoch": 0.9049834444025315, + "grad_norm": 0.19531226401891985, + "kl": 0.1512298583984375, + "learning_rate": 4.998692724215731e-07, + "loss": 0.0002, + "reward": 1.7571429535746574, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 5398 + }, + { + "completion_length": 251.31250953674316, + "epoch": 0.9053187476423991, + "grad_norm": 0.18254214716236256, + "kl": 0.05153656005859375, + "learning_rate": 4.998688842520754e-07, + "loss": 0.0001, + "reward": 1.746428668498993, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 5400 + }, + { + "completion_length": 244.5714406967163, + "epoch": 0.9056540508822667, + "grad_norm": 0.19710436019518554, + "kl": 0.0549468994140625, + "learning_rate": 4.998684955072869e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 5402 + }, + { + "completion_length": 248.0982255935669, + "epoch": 0.9059893541221342, + "grad_norm": 0.2126268791462505, + "kl": 0.0541839599609375, + "learning_rate": 4.998681061872086e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 5404 + }, + { + "completion_length": 245.34822463989258, + "epoch": 0.9063246573620017, + "grad_norm": 0.28491657265214865, + "kl": 0.1408843994140625, + "learning_rate": 4.998677162918411e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500461935997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5406 + }, + { + "completion_length": 250.87947750091553, + "epoch": 0.9066599606018693, + "grad_norm": 0.28512756855223254, + "kl": 0.187103271484375, + "learning_rate": 4.998673258211857e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.755357176065445, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5408 + }, + { + "completion_length": 250.35269165039062, + "epoch": 0.9069952638417369, + "grad_norm": 0.1916385409286891, + "kl": 0.05348968505859375, + "learning_rate": 4.998669347752429e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 5410 + }, + { + "completion_length": 258.12500953674316, + "epoch": 0.9073305670816044, + "grad_norm": 3.139980507250418, + "kl": 0.21661376953125, + "learning_rate": 4.998665431540138e-07, + "loss": 0.0002, + "reward": 1.7142857909202576, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7232143245637417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5412 + }, + { + "completion_length": 258.0803689956665, + "epoch": 0.907665870321472, + "grad_norm": 0.358562731594068, + "kl": 0.0805206298828125, + "learning_rate": 4.998661509574993e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 5414 + }, + { + "completion_length": 254.50893783569336, + "epoch": 0.9080011735613396, + "grad_norm": 0.27085475348892585, + "kl": 0.0615692138671875, + "learning_rate": 4.998657581857002e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 5416 + }, + { + "completion_length": 239.04911994934082, + "epoch": 0.9083364768012071, + "grad_norm": 0.278262296727244, + "kl": 0.0838623046875, + "learning_rate": 4.998653648386175e-07, + "loss": 0.0001, + "reward": 1.703571505844593, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7035714704543352, + "rewards/format_reward_func": 1.0, + "step": 5418 + }, + { + "completion_length": 240.16965293884277, + "epoch": 0.9086717800410746, + "grad_norm": 0.2400452957091018, + "kl": 0.051971435546875, + "learning_rate": 4.998649709162522e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5420 + }, + { + "completion_length": 236.52233123779297, + "epoch": 0.9090070832809422, + "grad_norm": 0.00552479084519166, + "kl": 0.05666351318359375, + "learning_rate": 4.998645764186051e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7357143331319094, + "rewards/format_reward_func": 1.0, + "step": 5422 + }, + { + "completion_length": 242.33036994934082, + "epoch": 0.9093423865208098, + "grad_norm": 0.29723655679749067, + "kl": 0.0811309814453125, + "learning_rate": 4.99864181345677e-07, + "loss": 0.0001, + "reward": 1.7285714745521545, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 5424 + }, + { + "completion_length": 246.1607255935669, + "epoch": 0.9096776897606773, + "grad_norm": 0.13343396730306012, + "kl": 0.0643768310546875, + "learning_rate": 4.99863785697469e-07, + "loss": 0.0001, + "reward": 1.7785715237259865, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 5426 + }, + { + "completion_length": 238.2500114440918, + "epoch": 0.9100129930005448, + "grad_norm": 0.10802697735688958, + "kl": 0.0711669921875, + "learning_rate": 4.998633894739818e-07, + "loss": 0.0001, + "reward": 1.7767857536673546, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7812500447034836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5428 + }, + { + "completion_length": 255.61161518096924, + "epoch": 0.9103482962404125, + "grad_norm": 0.15751124690918902, + "kl": 0.06842041015625, + "learning_rate": 4.998629926752165e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000294297934, + "rewards/format_reward_func": 1.0, + "step": 5430 + }, + { + "completion_length": 253.02233409881592, + "epoch": 0.91068359948028, + "grad_norm": 0.07871958210206086, + "kl": 0.106658935546875, + "learning_rate": 4.998625953011739e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7517857551574707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5432 + }, + { + "completion_length": 239.16965579986572, + "epoch": 0.9110189027201475, + "grad_norm": 0.17591567198150765, + "kl": 0.04869842529296875, + "learning_rate": 4.99862197351855e-07, + "loss": 0.0, + "reward": 1.7821429371833801, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 5434 + }, + { + "completion_length": 241.68304634094238, + "epoch": 0.911354205960015, + "grad_norm": 0.24591806078737033, + "kl": 0.0575103759765625, + "learning_rate": 4.998617988272608e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 5436 + }, + { + "completion_length": 243.97768878936768, + "epoch": 0.9116895091998827, + "grad_norm": 0.2791332987456658, + "kl": 0.064788818359375, + "learning_rate": 4.998613997273919e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 5438 + }, + { + "completion_length": 248.4241189956665, + "epoch": 0.9120248124397502, + "grad_norm": 0.157219240713236, + "kl": 0.0563507080078125, + "learning_rate": 4.998610000522495e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 5440 + }, + { + "completion_length": 249.42858695983887, + "epoch": 0.9123601156796177, + "grad_norm": 0.22393359111992003, + "kl": 0.17645263671875, + "learning_rate": 4.998605998018344e-07, + "loss": 0.0002, + "reward": 1.7160715237259865, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7205357439815998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5442 + }, + { + "completion_length": 246.8526906967163, + "epoch": 0.9126954189194854, + "grad_norm": 0.22293059464193454, + "kl": 0.0731201171875, + "learning_rate": 4.998601989761477e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5444 + }, + { + "completion_length": 232.83036708831787, + "epoch": 0.9130307221593529, + "grad_norm": 0.004513281336610052, + "kl": 0.05118560791015625, + "learning_rate": 4.9985979757519e-07, + "loss": 0.0001, + "reward": 1.8250000327825546, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000365078449, + "rewards/format_reward_func": 1.0, + "step": 5446 + }, + { + "completion_length": 246.4642972946167, + "epoch": 0.9133660253992204, + "grad_norm": 0.28111220060756315, + "kl": 0.11065673828125, + "learning_rate": 4.998593955989625e-07, + "loss": 0.0001, + "reward": 1.7803571671247482, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7937500290572643, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5448 + }, + { + "completion_length": 248.61161708831787, + "epoch": 0.9137013286390879, + "grad_norm": 0.26719790440911256, + "kl": 0.0706329345703125, + "learning_rate": 4.99858993047466e-07, + "loss": 0.0001, + "reward": 1.8214286118745804, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 5450 + }, + { + "completion_length": 246.4598331451416, + "epoch": 0.9140366318789556, + "grad_norm": 0.27475256082811733, + "kl": 0.2954254150390625, + "learning_rate": 4.998585899207015e-07, + "loss": 0.0003, + "reward": 1.7500000968575478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 5452 + }, + { + "completion_length": 258.51787185668945, + "epoch": 0.9143719351188231, + "grad_norm": 0.2363268462501324, + "kl": 0.075775146484375, + "learning_rate": 4.998581862186698e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.753571467474103, + "rewards/format_reward_func": 1.0, + "step": 5454 + }, + { + "completion_length": 248.1875123977661, + "epoch": 0.9147072383586906, + "grad_norm": 0.35017889785356593, + "kl": 0.1490478515625, + "learning_rate": 4.99857781941372e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143145054579, + "rewards/format_reward_func": 1.0, + "step": 5456 + }, + { + "completion_length": 247.55804538726807, + "epoch": 0.9150425415985582, + "grad_norm": 0.2851086680739538, + "kl": 0.08409881591796875, + "learning_rate": 4.998573770888089e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 5458 + }, + { + "completion_length": 252.24108123779297, + "epoch": 0.9153778448384258, + "grad_norm": 0.12642756304879468, + "kl": 0.055938720703125, + "learning_rate": 4.998569716609815e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 5460 + }, + { + "completion_length": 248.8571548461914, + "epoch": 0.9157131480782933, + "grad_norm": 0.11845124265324933, + "kl": 0.1025543212890625, + "learning_rate": 4.998565656578907e-07, + "loss": 0.0001, + "reward": 1.7517857626080513, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5462 + }, + { + "completion_length": 245.77679824829102, + "epoch": 0.9160484513181608, + "grad_norm": 0.2892291151263095, + "kl": 0.2335357666015625, + "learning_rate": 4.998561590795375e-07, + "loss": 0.0002, + "reward": 1.8107143640518188, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 5464 + }, + { + "completion_length": 247.86608028411865, + "epoch": 0.9163837545580285, + "grad_norm": 0.14281412571223723, + "kl": 0.1459503173828125, + "learning_rate": 4.998557519259227e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 5466 + }, + { + "completion_length": 254.5312623977661, + "epoch": 0.916719057797896, + "grad_norm": 0.2005936378687353, + "kl": 0.145904541015625, + "learning_rate": 4.998553441970474e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5468 + }, + { + "completion_length": 239.5625114440918, + "epoch": 0.9170543610377635, + "grad_norm": 0.0922149462674823, + "kl": 0.0469512939453125, + "learning_rate": 4.998549358929124e-07, + "loss": 0.0, + "reward": 1.7178572341799736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 5470 + }, + { + "completion_length": 242.3080472946167, + "epoch": 0.9173896642776311, + "grad_norm": 0.28370406486244265, + "kl": 0.07550048828125, + "learning_rate": 4.998545270135187e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 5472 + }, + { + "completion_length": 255.2678680419922, + "epoch": 0.9177249675174987, + "grad_norm": 0.3061601835724448, + "kl": 0.0880889892578125, + "learning_rate": 4.998541175588672e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 5474 + }, + { + "completion_length": 258.0134038925171, + "epoch": 0.9180602707573662, + "grad_norm": 0.2438152833657782, + "kl": 0.05542755126953125, + "learning_rate": 4.998537075289589e-07, + "loss": 0.0001, + "reward": 1.8196429088711739, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8241071589291096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5476 + }, + { + "completion_length": 251.0580472946167, + "epoch": 0.9183955739972337, + "grad_norm": 0.3451355955311913, + "kl": 0.1912841796875, + "learning_rate": 4.998532969237948e-07, + "loss": 0.0002, + "reward": 1.7803572118282318, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5478 + }, + { + "completion_length": 246.39287090301514, + "epoch": 0.9187308772371013, + "grad_norm": 0.12898136469831076, + "kl": 0.3510284423828125, + "learning_rate": 4.998528857433758e-07, + "loss": 0.0004, + "reward": 1.7732143476605415, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786033064127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5480 + }, + { + "completion_length": 252.08036994934082, + "epoch": 0.9190661804769689, + "grad_norm": 0.198850014831597, + "kl": 0.5845413208007812, + "learning_rate": 4.998524739877027e-07, + "loss": 0.0006, + "reward": 1.7000000551342964, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7089286018162966, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5482 + }, + { + "completion_length": 246.92411994934082, + "epoch": 0.9194014837168364, + "grad_norm": 0.20629485243538223, + "kl": 0.1067047119140625, + "learning_rate": 4.998520616567767e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 5484 + }, + { + "completion_length": 257.71875858306885, + "epoch": 0.919736786956704, + "grad_norm": 0.27743791130919937, + "kl": 0.06072998046875, + "learning_rate": 4.998516487505985e-07, + "loss": 0.0001, + "reward": 1.7053572162985802, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.709821455180645, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5486 + }, + { + "completion_length": 251.83929634094238, + "epoch": 0.9200720901965715, + "grad_norm": 0.26182652616513147, + "kl": 0.3108367919921875, + "learning_rate": 4.998512352691692e-07, + "loss": 0.0003, + "reward": 1.762500062584877, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5488 + }, + { + "completion_length": 256.5044755935669, + "epoch": 0.9204073934364391, + "grad_norm": 0.228511123466136, + "kl": 0.05645751953125, + "learning_rate": 4.998508212124896e-07, + "loss": 0.0001, + "reward": 1.7660715132951736, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7705357223749161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5490 + }, + { + "completion_length": 249.4866189956665, + "epoch": 0.9207426966763066, + "grad_norm": 0.328227301695069, + "kl": 0.3878936767578125, + "learning_rate": 4.99850406580561e-07, + "loss": 0.0004, + "reward": 1.7857143580913544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 5492 + }, + { + "completion_length": 251.33483600616455, + "epoch": 0.9210779999161742, + "grad_norm": 0.22654887015954236, + "kl": 0.05487060546875, + "learning_rate": 4.99849991373384e-07, + "loss": 0.0001, + "reward": 1.8392857685685158, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8392857350409031, + "rewards/format_reward_func": 1.0, + "step": 5494 + }, + { + "completion_length": 259.1205463409424, + "epoch": 0.9214133031560418, + "grad_norm": 0.22116648315395465, + "kl": 0.05206298828125, + "learning_rate": 4.998495755909597e-07, + "loss": 0.0001, + "reward": 1.7732143253087997, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5496 + }, + { + "completion_length": 249.49108123779297, + "epoch": 0.9217486063959093, + "grad_norm": 0.2728720425990515, + "kl": 0.3609466552734375, + "learning_rate": 4.998491592332891e-07, + "loss": 0.0004, + "reward": 1.7607143372297287, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 5498 + }, + { + "completion_length": 253.9375114440918, + "epoch": 0.9220839096357769, + "grad_norm": 0.22240895206575162, + "kl": 0.1055145263671875, + "learning_rate": 4.99848742300373e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214507102966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5500 + }, + { + "completion_length": 255.65626049041748, + "epoch": 0.9224192128756444, + "grad_norm": 0.24399409381101786, + "kl": 0.30101776123046875, + "learning_rate": 4.998483247922125e-07, + "loss": 0.0003, + "reward": 1.7625000700354576, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5502 + }, + { + "completion_length": 251.9598331451416, + "epoch": 0.922754516115512, + "grad_norm": 0.22450025929828568, + "kl": 0.062225341796875, + "learning_rate": 4.998479067088085e-07, + "loss": 0.0001, + "reward": 1.8321429193019867, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8321428820490837, + "rewards/format_reward_func": 1.0, + "step": 5504 + }, + { + "completion_length": 248.4509038925171, + "epoch": 0.9230898193553795, + "grad_norm": 0.3051567211790164, + "kl": 0.3009185791015625, + "learning_rate": 4.99847488050162e-07, + "loss": 0.0003, + "reward": 1.771428644657135, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 5506 + }, + { + "completion_length": 258.60269260406494, + "epoch": 0.9234251225952471, + "grad_norm": 0.14467945045617023, + "kl": 0.0584869384765625, + "learning_rate": 4.998470688162739e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 5508 + }, + { + "completion_length": 244.5848331451416, + "epoch": 0.9237604258351146, + "grad_norm": 0.2562804961215929, + "kl": 0.14105224609375, + "learning_rate": 4.998466490071452e-07, + "loss": 0.0001, + "reward": 1.7875000312924385, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5510 + }, + { + "completion_length": 263.23662281036377, + "epoch": 0.9240957290749822, + "grad_norm": 0.14269246424130785, + "kl": 0.2320556640625, + "learning_rate": 4.99846228622777e-07, + "loss": 0.0002, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 5512 + }, + { + "completion_length": 253.27233219146729, + "epoch": 0.9244310323148497, + "grad_norm": 0.19061742235708712, + "kl": 0.2888031005859375, + "learning_rate": 4.9984580766317e-07, + "loss": 0.0003, + "reward": 1.764285795390606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 5514 + }, + { + "completion_length": 258.8705463409424, + "epoch": 0.9247663355547173, + "grad_norm": 0.19888817017665691, + "kl": 0.2613983154296875, + "learning_rate": 4.998453861283254e-07, + "loss": 0.0003, + "reward": 1.723214365541935, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.736607164144516, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5516 + }, + { + "completion_length": 261.30804538726807, + "epoch": 0.9251016387945848, + "grad_norm": 0.31133274406414835, + "kl": 0.060394287109375, + "learning_rate": 4.998449640182442e-07, + "loss": 0.0001, + "reward": 1.7696429416537285, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5518 + }, + { + "completion_length": 263.91519355773926, + "epoch": 0.9254369420344524, + "grad_norm": 0.21421569618201042, + "kl": 0.256683349609375, + "learning_rate": 4.998445413329271e-07, + "loss": 0.0003, + "reward": 1.6821429505944252, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.6821428909897804, + "rewards/format_reward_func": 1.0, + "step": 5520 + }, + { + "completion_length": 255.39733219146729, + "epoch": 0.92577224527432, + "grad_norm": 0.2180324591587215, + "kl": 0.2082366943359375, + "learning_rate": 4.998441180723753e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428664028645, + "rewards/format_reward_func": 1.0, + "step": 5522 + }, + { + "completion_length": 248.8437623977661, + "epoch": 0.9261075485141875, + "grad_norm": 0.12138142361987297, + "kl": 0.113922119140625, + "learning_rate": 4.998436942365896e-07, + "loss": 0.0001, + "reward": 1.8142857626080513, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 5524 + }, + { + "completion_length": 258.85715675354004, + "epoch": 0.926442851754055, + "grad_norm": 0.28283183000256845, + "kl": 0.074371337890625, + "learning_rate": 4.998432698255712e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5526 + }, + { + "completion_length": 255.95090293884277, + "epoch": 0.9267781549939226, + "grad_norm": 0.24545344039802608, + "kl": 0.1540679931640625, + "learning_rate": 4.998428448393209e-07, + "loss": 0.0002, + "reward": 1.7553572207689285, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5528 + }, + { + "completion_length": 255.63840293884277, + "epoch": 0.9271134582337902, + "grad_norm": 0.2534769583940854, + "kl": 0.1455841064453125, + "learning_rate": 4.998424192778396e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 5530 + }, + { + "completion_length": 250.9553680419922, + "epoch": 0.9274487614736577, + "grad_norm": 0.24871677747576773, + "kl": 0.070220947265625, + "learning_rate": 4.998419931411286e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 5532 + }, + { + "completion_length": 259.0491180419922, + "epoch": 0.9277840647135253, + "grad_norm": 0.1870510929710293, + "kl": 0.07379150390625, + "learning_rate": 4.998415664291887e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 5534 + }, + { + "completion_length": 258.1071529388428, + "epoch": 0.9281193679533929, + "grad_norm": 0.25610770723960113, + "kl": 0.12774658203125, + "learning_rate": 4.998411391420209e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 5536 + }, + { + "completion_length": 245.25893688201904, + "epoch": 0.9284546711932604, + "grad_norm": 0.30114561591144784, + "kl": 0.083526611328125, + "learning_rate": 4.998407112796261e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 5538 + }, + { + "completion_length": 241.30804920196533, + "epoch": 0.9287899744331279, + "grad_norm": 0.2860699239934635, + "kl": 0.094268798828125, + "learning_rate": 4.998402828420052e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5540 + }, + { + "completion_length": 253.16072750091553, + "epoch": 0.9291252776729955, + "grad_norm": 0.2555147566584149, + "kl": 0.0799713134765625, + "learning_rate": 4.998398538291596e-07, + "loss": 0.0001, + "reward": 1.7071429416537285, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071428876370192, + "rewards/format_reward_func": 1.0, + "step": 5542 + }, + { + "completion_length": 244.32590293884277, + "epoch": 0.9294605809128631, + "grad_norm": 0.1987351795108615, + "kl": 0.0678253173828125, + "learning_rate": 4.998394242410899e-07, + "loss": 0.0001, + "reward": 1.8142857626080513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 5544 + }, + { + "completion_length": 255.3080472946167, + "epoch": 0.9297958841527306, + "grad_norm": 0.31618605204352274, + "kl": 0.121063232421875, + "learning_rate": 4.998389940777972e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7410714626312256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5546 + }, + { + "completion_length": 268.62501335144043, + "epoch": 0.9301311873925981, + "grad_norm": 0.3458136837670348, + "kl": 0.0794830322265625, + "learning_rate": 4.998385633392825e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7375000230967999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5548 + }, + { + "completion_length": 265.83930015563965, + "epoch": 0.9304664906324658, + "grad_norm": 0.2956670786981416, + "kl": 0.2076568603515625, + "learning_rate": 4.998381320255468e-07, + "loss": 0.0002, + "reward": 1.7000000551342964, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7089286111295223, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5550 + }, + { + "completion_length": 263.7857265472412, + "epoch": 0.9308017938723333, + "grad_norm": 0.22854230090747885, + "kl": 0.12713623046875, + "learning_rate": 4.998377001365911e-07, + "loss": 0.0001, + "reward": 1.6910715252161026, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.6955357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5552 + }, + { + "completion_length": 265.04465675354004, + "epoch": 0.9311370971122008, + "grad_norm": 0.23869813596626355, + "kl": 0.3279571533203125, + "learning_rate": 4.998372676724164e-07, + "loss": 0.0003, + "reward": 1.7178572043776512, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7267857491970062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5554 + }, + { + "completion_length": 264.2053699493408, + "epoch": 0.9314724003520684, + "grad_norm": 0.21032106039875323, + "kl": 0.152618408203125, + "learning_rate": 4.998368346330237e-07, + "loss": 0.0002, + "reward": 1.7339286729693413, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7473214641213417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5556 + }, + { + "completion_length": 266.86608600616455, + "epoch": 0.931807703591936, + "grad_norm": 0.2956750550588605, + "kl": 0.080657958984375, + "learning_rate": 4.998364010184139e-07, + "loss": 0.0001, + "reward": 1.7375000789761543, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7419643122702837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5558 + }, + { + "completion_length": 258.6651887893677, + "epoch": 0.9321430068318035, + "grad_norm": 0.24094408593698557, + "kl": 0.07884979248046875, + "learning_rate": 4.99835966828588e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 5560 + }, + { + "completion_length": 256.8660821914673, + "epoch": 0.932478310071671, + "grad_norm": 0.19758210940866042, + "kl": 0.1866455078125, + "learning_rate": 4.998355320635473e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 5562 + }, + { + "completion_length": 251.83929920196533, + "epoch": 0.9328136133115387, + "grad_norm": 0.20512783737454493, + "kl": 0.07763671875, + "learning_rate": 4.998350967232925e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 5564 + }, + { + "completion_length": 269.28126430511475, + "epoch": 0.9331489165514062, + "grad_norm": 0.1919354926759305, + "kl": 0.0897216796875, + "learning_rate": 4.998346608078245e-07, + "loss": 0.0001, + "reward": 1.7000000774860382, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7089286148548126, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5566 + }, + { + "completion_length": 250.1607265472412, + "epoch": 0.9334842197912737, + "grad_norm": 0.10457544698349448, + "kl": 0.0626678466796875, + "learning_rate": 4.998342243171447e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.754464328289032, + "rewards/format_reward_func": 0.9919642917811871, + "step": 5568 + }, + { + "completion_length": 264.3973331451416, + "epoch": 0.9338195230311412, + "grad_norm": 0.22229885242314276, + "kl": 0.0751495361328125, + "learning_rate": 4.998337872512538e-07, + "loss": 0.0001, + "reward": 1.839285783469677, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8482143133878708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5570 + }, + { + "completion_length": 270.1384057998657, + "epoch": 0.9341548262710089, + "grad_norm": 0.18494009709638623, + "kl": 0.115081787109375, + "learning_rate": 4.998333496101529e-07, + "loss": 0.0001, + "reward": 1.6750000789761543, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6750000324100256, + "rewards/format_reward_func": 1.0, + "step": 5572 + }, + { + "completion_length": 259.0535840988159, + "epoch": 0.9344901295108764, + "grad_norm": 0.09959741588528473, + "kl": 0.0759735107421875, + "learning_rate": 4.998329113938429e-07, + "loss": 0.0001, + "reward": 1.700000062584877, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7000000271946192, + "rewards/format_reward_func": 1.0, + "step": 5574 + }, + { + "completion_length": 252.05358505249023, + "epoch": 0.9348254327507439, + "grad_norm": 0.2517501606709083, + "kl": 0.0615692138671875, + "learning_rate": 4.998324726023249e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 5576 + }, + { + "completion_length": 252.37500953674316, + "epoch": 0.9351607359906116, + "grad_norm": 0.22949889960028877, + "kl": 0.080596923828125, + "learning_rate": 4.998320332356001e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 5578 + }, + { + "completion_length": 259.05805110931396, + "epoch": 0.9354960392304791, + "grad_norm": 0.3646697098932177, + "kl": 0.4967041015625, + "learning_rate": 4.998315932936693e-07, + "loss": 0.0005, + "reward": 1.735714353621006, + "reward_std": 0.11111677903681993, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5580 + }, + { + "completion_length": 245.58036708831787, + "epoch": 0.9358313424703466, + "grad_norm": 0.25969544709589587, + "kl": 0.085906982421875, + "learning_rate": 4.998311527765334e-07, + "loss": 0.0001, + "reward": 1.7875000536441803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5582 + }, + { + "completion_length": 250.86161994934082, + "epoch": 0.9361666457102141, + "grad_norm": 0.17861812568478064, + "kl": 0.08895111083984375, + "learning_rate": 4.998307116841937e-07, + "loss": 0.0001, + "reward": 1.7714285999536514, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 5584 + }, + { + "completion_length": 262.96876335144043, + "epoch": 0.9365019489500818, + "grad_norm": 0.25962272007727116, + "kl": 0.0623779296875, + "learning_rate": 4.998302700166509e-07, + "loss": 0.0001, + "reward": 1.7232143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7267857529222965, + "rewards/format_reward_func": 0.9964285716414452, + "step": 5586 + }, + { + "completion_length": 267.6384029388428, + "epoch": 0.9368372521899493, + "grad_norm": 0.2566924669697665, + "kl": 0.137725830078125, + "learning_rate": 4.998298277739063e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7910714447498322, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5588 + }, + { + "completion_length": 270.07590198516846, + "epoch": 0.9371725554298168, + "grad_norm": 0.2571152286005929, + "kl": 0.11667633056640625, + "learning_rate": 4.998293849559608e-07, + "loss": 0.0001, + "reward": 1.7696429044008255, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071842610836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5590 + }, + { + "completion_length": 265.2142972946167, + "epoch": 0.9375078586696844, + "grad_norm": 0.2686328008496726, + "kl": 0.0756072998046875, + "learning_rate": 4.998289415628154e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7589286062866449, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5592 + }, + { + "completion_length": 262.8526916503906, + "epoch": 0.937843161909552, + "grad_norm": 0.2147661255591824, + "kl": 0.1193695068359375, + "learning_rate": 4.998284975944712e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7589286044239998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5594 + }, + { + "completion_length": 261.5044755935669, + "epoch": 0.9381784651494195, + "grad_norm": 0.31342327171802786, + "kl": 0.064178466796875, + "learning_rate": 4.998280530509291e-07, + "loss": 0.0001, + "reward": 1.7464286759495735, + "reward_std": 0.09596448857337236, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 5596 + }, + { + "completion_length": 258.48215198516846, + "epoch": 0.938513768389287, + "grad_norm": 0.24752753734293131, + "kl": 0.1938629150390625, + "learning_rate": 4.998276079321903e-07, + "loss": 0.0002, + "reward": 1.7553572207689285, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5598 + }, + { + "completion_length": 251.5401906967163, + "epoch": 0.9388490716291547, + "grad_norm": 0.14067875762090168, + "kl": 0.108612060546875, + "learning_rate": 4.998271622382556e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 5600 + }, + { + "completion_length": 243.477689743042, + "epoch": 0.9391843748690222, + "grad_norm": 0.2324244712555495, + "kl": 0.0554046630859375, + "learning_rate": 4.998267159691262e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571429014205933, + "rewards/format_reward_func": 1.0, + "step": 5602 + }, + { + "completion_length": 263.3616189956665, + "epoch": 0.9395196781088897, + "grad_norm": 0.19419565752990522, + "kl": 0.233978271484375, + "learning_rate": 4.998262691248031e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857436090708, + "rewards/format_reward_func": 1.0, + "step": 5604 + }, + { + "completion_length": 247.4642972946167, + "epoch": 0.9398549813487573, + "grad_norm": 0.1919762132952126, + "kl": 0.1018218994140625, + "learning_rate": 4.998258217052872e-07, + "loss": 0.0001, + "reward": 1.7196429148316383, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7241071797907352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5606 + }, + { + "completion_length": 252.6919765472412, + "epoch": 0.9401902845886249, + "grad_norm": 0.1919943550574376, + "kl": 0.2514801025390625, + "learning_rate": 4.998253737105797e-07, + "loss": 0.0003, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 5608 + }, + { + "completion_length": 250.2366189956665, + "epoch": 0.9405255878284924, + "grad_norm": 0.29611682678625656, + "kl": 0.1829986572265625, + "learning_rate": 4.998249251406815e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 5610 + }, + { + "completion_length": 263.4553680419922, + "epoch": 0.9408608910683599, + "grad_norm": 0.23130672291701776, + "kl": 0.094757080078125, + "learning_rate": 4.998244759955939e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5612 + }, + { + "completion_length": 268.04019355773926, + "epoch": 0.9411961943082275, + "grad_norm": 0.30392357057779124, + "kl": 0.2783203125, + "learning_rate": 4.998240262753174e-07, + "loss": 0.0003, + "reward": 1.7178572490811348, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5614 + }, + { + "completion_length": 257.6339387893677, + "epoch": 0.9415314975480951, + "grad_norm": 0.4255414977969514, + "kl": 0.2381591796875, + "learning_rate": 4.998235759798537e-07, + "loss": 0.0002, + "reward": 1.7250000536441803, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7339286021888256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5616 + }, + { + "completion_length": 257.5401906967163, + "epoch": 0.9418668007879626, + "grad_norm": 0.292460391135546, + "kl": 0.360076904296875, + "learning_rate": 4.998231251092033e-07, + "loss": 0.0004, + "reward": 1.68392863124609, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.6883928906172514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5618 + }, + { + "completion_length": 268.0848321914673, + "epoch": 0.9422021040278302, + "grad_norm": 0.2446423277783523, + "kl": 0.21514892578125, + "learning_rate": 4.998226736633675e-07, + "loss": 0.0002, + "reward": 1.726785808801651, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7312500327825546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5620 + }, + { + "completion_length": 256.3392972946167, + "epoch": 0.9425374072676977, + "grad_norm": 0.18823448189637623, + "kl": 0.12542724609375, + "learning_rate": 4.998222216423472e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7616071663796902, + "rewards/format_reward_func": 0.9919642880558968, + "step": 5622 + }, + { + "completion_length": 256.7053699493408, + "epoch": 0.9428727105075653, + "grad_norm": 0.2138723503939617, + "kl": 0.1275787353515625, + "learning_rate": 4.998217690461435e-07, + "loss": 0.0001, + "reward": 1.7410715147852898, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7544643245637417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5624 + }, + { + "completion_length": 250.16072750091553, + "epoch": 0.9432080137474328, + "grad_norm": 0.16148957638978453, + "kl": 0.0900421142578125, + "learning_rate": 4.998213158747576e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7723214477300644, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5626 + }, + { + "completion_length": 264.17411518096924, + "epoch": 0.9435433169873004, + "grad_norm": 0.30775280324654475, + "kl": 0.090179443359375, + "learning_rate": 4.998208621281903e-07, + "loss": 0.0001, + "reward": 1.7607143744826317, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7696428783237934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5628 + }, + { + "completion_length": 256.93304538726807, + "epoch": 0.943878620227168, + "grad_norm": 0.4164457824510602, + "kl": 0.1088409423828125, + "learning_rate": 4.998204078064429e-07, + "loss": 0.0001, + "reward": 1.737500086426735, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7508928924798965, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5630 + }, + { + "completion_length": 257.83483505249023, + "epoch": 0.9442139234670355, + "grad_norm": 0.18818507517460953, + "kl": 0.1334228515625, + "learning_rate": 4.998199529095162e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7910714596509933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5632 + }, + { + "completion_length": 270.9375114440918, + "epoch": 0.944549226706903, + "grad_norm": 0.2851531511191286, + "kl": 0.18560791015625, + "learning_rate": 4.998194974374113e-07, + "loss": 0.0002, + "reward": 1.7392857745289803, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5634 + }, + { + "completion_length": 264.75001525878906, + "epoch": 0.9448845299467706, + "grad_norm": 0.253065033016023, + "kl": 0.06976318359375, + "learning_rate": 4.998190413901292e-07, + "loss": 0.0001, + "reward": 1.7696429267525673, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7910714596509933, + "rewards/format_reward_func": 0.9785714372992516, + "step": 5636 + }, + { + "completion_length": 265.07144260406494, + "epoch": 0.9452198331866382, + "grad_norm": 0.23839217310733235, + "kl": 0.061492919921875, + "learning_rate": 4.998185847676712e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571794182062, + "rewards/format_reward_func": 1.0, + "step": 5638 + }, + { + "completion_length": 261.7678699493408, + "epoch": 0.9455551364265057, + "grad_norm": 0.19998156568588835, + "kl": 0.1107177734375, + "learning_rate": 4.998181275700382e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 5640 + }, + { + "completion_length": 270.2634048461914, + "epoch": 0.9458904396663733, + "grad_norm": 0.25558352437163384, + "kl": 0.1804656982421875, + "learning_rate": 4.998176697972311e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 5642 + }, + { + "completion_length": 254.98661994934082, + "epoch": 0.9462257429062408, + "grad_norm": 0.14052163514370772, + "kl": 0.221527099609375, + "learning_rate": 4.998172114492513e-07, + "loss": 0.0002, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 5644 + }, + { + "completion_length": 257.1875123977661, + "epoch": 0.9465610461461084, + "grad_norm": 0.27107773602436486, + "kl": 0.11501312255859375, + "learning_rate": 4.998167525260994e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 5646 + }, + { + "completion_length": 266.7857275009155, + "epoch": 0.9468963493859759, + "grad_norm": 0.34060283731390234, + "kl": 0.3077545166015625, + "learning_rate": 4.99816293027777e-07, + "loss": 0.0003, + "reward": 1.7089286595582962, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7133928872644901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5648 + }, + { + "completion_length": 259.46429920196533, + "epoch": 0.9472316526258435, + "grad_norm": 0.32314258962676723, + "kl": 0.0685882568359375, + "learning_rate": 4.998158329542847e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714715719223, + "rewards/format_reward_func": 1.0, + "step": 5650 + }, + { + "completion_length": 256.6651945114136, + "epoch": 0.947566955865711, + "grad_norm": 0.23348653009831258, + "kl": 0.140716552734375, + "learning_rate": 4.998153723056237e-07, + "loss": 0.0001, + "reward": 1.7339286506175995, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5652 + }, + { + "completion_length": 261.8705463409424, + "epoch": 0.9479022591055786, + "grad_norm": 0.5032103429997516, + "kl": 0.4618377685546875, + "learning_rate": 4.998149110817952e-07, + "loss": 0.0005, + "reward": 1.7785714641213417, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7875000201165676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5654 + }, + { + "completion_length": 248.64287090301514, + "epoch": 0.9482375623454462, + "grad_norm": 0.26493899916194824, + "kl": 0.0601654052734375, + "learning_rate": 4.998144492828e-07, + "loss": 0.0001, + "reward": 1.7357143834233284, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143051922321, + "rewards/format_reward_func": 1.0, + "step": 5656 + }, + { + "completion_length": 277.3750114440918, + "epoch": 0.9485728655853137, + "grad_norm": 0.1620975210345513, + "kl": 0.1508026123046875, + "learning_rate": 4.998139869086394e-07, + "loss": 0.0002, + "reward": 1.7053571864962578, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.977678582072258, + "step": 5658 + }, + { + "completion_length": 265.5803699493408, + "epoch": 0.9489081688251813, + "grad_norm": 0.14163608078600906, + "kl": 0.1465911865234375, + "learning_rate": 4.998135239593145e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500461935997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5660 + }, + { + "completion_length": 263.0134086608887, + "epoch": 0.9492434720650488, + "grad_norm": 0.19857088284391342, + "kl": 0.0649871826171875, + "learning_rate": 4.998130604348261e-07, + "loss": 0.0001, + "reward": 1.8357143327593803, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8357143104076385, + "rewards/format_reward_func": 1.0, + "step": 5662 + }, + { + "completion_length": 264.9285840988159, + "epoch": 0.9495787753049164, + "grad_norm": 0.29489920317894763, + "kl": 0.0663299560546875, + "learning_rate": 4.998125963351754e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7580357417464256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5664 + }, + { + "completion_length": 271.7410840988159, + "epoch": 0.9499140785447839, + "grad_norm": 0.2585833121153054, + "kl": 0.0933837890625, + "learning_rate": 4.998121316603635e-07, + "loss": 0.0001, + "reward": 1.7107143700122833, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7196428887546062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5666 + }, + { + "completion_length": 256.977689743042, + "epoch": 0.9502493817846515, + "grad_norm": 0.004380553101243798, + "kl": 0.069793701171875, + "learning_rate": 4.998116664103914e-07, + "loss": 0.0001, + "reward": 1.8250000327825546, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 5668 + }, + { + "completion_length": 268.883939743042, + "epoch": 0.9505846850245191, + "grad_norm": 0.405321103068457, + "kl": 0.142242431640625, + "learning_rate": 4.998112005852603e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5670 + }, + { + "completion_length": 263.4464416503906, + "epoch": 0.9509199882643866, + "grad_norm": 0.7514470251323826, + "kl": 0.2141571044921875, + "learning_rate": 4.998107341849712e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 5672 + }, + { + "completion_length": 262.5357255935669, + "epoch": 0.9512552915042541, + "grad_norm": 0.17197998736240475, + "kl": 0.0726470947265625, + "learning_rate": 4.998102672095251e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 5674 + }, + { + "completion_length": 264.42411708831787, + "epoch": 0.9515905947441217, + "grad_norm": 0.24326647158367354, + "kl": 0.080718994140625, + "learning_rate": 4.998097996589233e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5676 + }, + { + "completion_length": 266.07143783569336, + "epoch": 0.9519258979839893, + "grad_norm": 0.4137675130766855, + "kl": 0.101409912109375, + "learning_rate": 4.998093315331665e-07, + "loss": 0.0001, + "reward": 1.7950893491506577, + "reward_std": 0.057452425360679626, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5678 + }, + { + "completion_length": 256.8973331451416, + "epoch": 0.9522612012238568, + "grad_norm": 0.23364485897135776, + "kl": 0.0838165283203125, + "learning_rate": 4.998088628322562e-07, + "loss": 0.0001, + "reward": 1.8321429342031479, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8321428820490837, + "rewards/format_reward_func": 1.0, + "step": 5680 + }, + { + "completion_length": 269.2500114440918, + "epoch": 0.9525965044637243, + "grad_norm": 0.44124717033408517, + "kl": 0.0906219482421875, + "learning_rate": 4.998083935561932e-07, + "loss": 0.0001, + "reward": 1.7196429148316383, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071835160255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5682 + }, + { + "completion_length": 265.31251430511475, + "epoch": 0.952931807703592, + "grad_norm": 0.1912833560793976, + "kl": 0.07427978515625, + "learning_rate": 4.998079237049785e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714525729418, + "rewards/format_reward_func": 1.0, + "step": 5684 + }, + { + "completion_length": 268.4509086608887, + "epoch": 0.9532671109434595, + "grad_norm": 0.12658711226962915, + "kl": 0.06768798828125, + "learning_rate": 4.998074532786135e-07, + "loss": 0.0001, + "reward": 1.7071429416537285, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7071428932249546, + "rewards/format_reward_func": 1.0, + "step": 5686 + }, + { + "completion_length": 283.089298248291, + "epoch": 0.953602414183327, + "grad_norm": 0.23460052009656376, + "kl": 0.073516845703125, + "learning_rate": 4.998069822770992e-07, + "loss": 0.0001, + "reward": 1.7303571924567223, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7526785936206579, + "rewards/format_reward_func": 0.977678582072258, + "step": 5688 + }, + { + "completion_length": 281.93751430511475, + "epoch": 0.9539377174231946, + "grad_norm": 0.18191125932432262, + "kl": 0.0916290283203125, + "learning_rate": 4.998065107004365e-07, + "loss": 0.0001, + "reward": 1.6625000536441803, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.6848214641213417, + "rewards/format_reward_func": 0.977678582072258, + "step": 5690 + }, + { + "completion_length": 281.11162281036377, + "epoch": 0.9542730206630622, + "grad_norm": 0.2272203925271211, + "kl": 0.1111297607421875, + "learning_rate": 4.998060385486265e-07, + "loss": 0.0001, + "reward": 1.7339286729693413, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5692 + }, + { + "completion_length": 280.6964406967163, + "epoch": 0.9546083239029297, + "grad_norm": 0.12683642382177307, + "kl": 0.086151123046875, + "learning_rate": 4.998055658216705e-07, + "loss": 0.0001, + "reward": 1.7482143267989159, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5694 + }, + { + "completion_length": 282.22769260406494, + "epoch": 0.9549436271427972, + "grad_norm": 0.33633389268441777, + "kl": 0.1428070068359375, + "learning_rate": 4.998050925195694e-07, + "loss": 0.0001, + "reward": 1.7125000655651093, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.977678582072258, + "step": 5696 + }, + { + "completion_length": 277.4687614440918, + "epoch": 0.9552789303826649, + "grad_norm": 0.2182738670209157, + "kl": 0.1761627197265625, + "learning_rate": 4.998046186423243e-07, + "loss": 0.0002, + "reward": 1.7625000774860382, + "reward_std": 0.09343911055475473, + "rewards/equation_reward_func": 0.7758928760886192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5698 + }, + { + "completion_length": 272.46876430511475, + "epoch": 0.9556142336225324, + "grad_norm": 0.2382541500873255, + "kl": 0.0994110107421875, + "learning_rate": 4.998041441899365e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5700 + }, + { + "completion_length": 277.77680110931396, + "epoch": 0.9559495368623999, + "grad_norm": 0.17786839241166985, + "kl": 0.1782073974609375, + "learning_rate": 4.998036691624069e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 5702 + }, + { + "completion_length": 280.6607265472412, + "epoch": 0.9562848401022674, + "grad_norm": 0.44365582048278357, + "kl": 0.2714080810546875, + "learning_rate": 4.998031935597366e-07, + "loss": 0.0003, + "reward": 1.698214329779148, + "reward_std": 0.09343911055475473, + "rewards/equation_reward_func": 0.7116071823984385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5704 + }, + { + "completion_length": 276.37501335144043, + "epoch": 0.9566201433421351, + "grad_norm": 0.15799020884293222, + "kl": 0.0831298828125, + "learning_rate": 4.998027173819268e-07, + "loss": 0.0001, + "reward": 1.7767857685685158, + "reward_std": 0.05303300637751818, + "rewards/equation_reward_func": 0.7901786006987095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5706 + }, + { + "completion_length": 274.84822845458984, + "epoch": 0.9569554465820026, + "grad_norm": 0.22140325800096408, + "kl": 0.145477294921875, + "learning_rate": 4.998022406289784e-07, + "loss": 0.0001, + "reward": 1.7053572311997414, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214514553547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5708 + }, + { + "completion_length": 275.5937623977661, + "epoch": 0.9572907498218701, + "grad_norm": 0.43758406981647235, + "kl": 0.108306884765625, + "learning_rate": 4.998017633008928e-07, + "loss": 0.0001, + "reward": 1.6910715103149414, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9776785783469677, + "step": 5710 + }, + { + "completion_length": 269.6205520629883, + "epoch": 0.9576260530617378, + "grad_norm": 0.09696601262183618, + "kl": 0.07403564453125, + "learning_rate": 4.998012853976707e-07, + "loss": 0.0001, + "reward": 1.737500086426735, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5712 + }, + { + "completion_length": 271.7009057998657, + "epoch": 0.9579613563016053, + "grad_norm": 0.3847634166372032, + "kl": 0.0723419189453125, + "learning_rate": 4.998008069193136e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.11616754438728094, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5714 + }, + { + "completion_length": 253.88840293884277, + "epoch": 0.9582966595414728, + "grad_norm": 0.2766356278864402, + "kl": 0.07025146484375, + "learning_rate": 4.998003278658222e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7660714499652386, + "rewards/format_reward_func": 0.9785714447498322, + "step": 5716 + }, + { + "completion_length": 269.3750114440918, + "epoch": 0.9586319627813403, + "grad_norm": 0.31884468339624056, + "kl": 0.077911376953125, + "learning_rate": 4.997998482371981e-07, + "loss": 0.0001, + "reward": 1.6964286863803864, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6964285932481289, + "rewards/format_reward_func": 1.0, + "step": 5718 + }, + { + "completion_length": 267.9509057998657, + "epoch": 0.958967266021208, + "grad_norm": 0.15281834104888092, + "kl": 0.081817626953125, + "learning_rate": 4.99799368033442e-07, + "loss": 0.0001, + "reward": 1.700000062584877, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7089286092668772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5720 + }, + { + "completion_length": 265.84376430511475, + "epoch": 0.9593025692610755, + "grad_norm": 0.13977553928490638, + "kl": 0.0774688720703125, + "learning_rate": 4.997988872545551e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071801632643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5722 + }, + { + "completion_length": 258.8437614440918, + "epoch": 0.959637872500943, + "grad_norm": 0.0732864545892103, + "kl": 0.0887908935546875, + "learning_rate": 4.997984059005386e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 5724 + }, + { + "completion_length": 256.7321548461914, + "epoch": 0.9599731757408106, + "grad_norm": 0.1907927548631042, + "kl": 0.078094482421875, + "learning_rate": 4.997979239713935e-07, + "loss": 0.0001, + "reward": 1.8214286267757416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214286044239998, + "rewards/format_reward_func": 1.0, + "step": 5726 + }, + { + "completion_length": 252.30804443359375, + "epoch": 0.9603084789806782, + "grad_norm": 0.5063146778249406, + "kl": 0.0906982421875, + "learning_rate": 4.997974414671211e-07, + "loss": 0.0001, + "reward": 1.698214367032051, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.702678607776761, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5728 + }, + { + "completion_length": 259.276798248291, + "epoch": 0.9606437822205457, + "grad_norm": 0.3635133661797194, + "kl": 0.1004638671875, + "learning_rate": 4.997969583877223e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5730 + }, + { + "completion_length": 252.36162090301514, + "epoch": 0.9609790854604132, + "grad_norm": 0.28713514263695467, + "kl": 0.0994110107421875, + "learning_rate": 4.997964747331982e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571749478579, + "rewards/format_reward_func": 1.0, + "step": 5732 + }, + { + "completion_length": 250.84375953674316, + "epoch": 0.9613143887002809, + "grad_norm": 0.2510492779487852, + "kl": 0.250762939453125, + "learning_rate": 4.9979599050355e-07, + "loss": 0.0003, + "reward": 1.723214365541935, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5734 + }, + { + "completion_length": 249.1830472946167, + "epoch": 0.9616496919401484, + "grad_norm": 0.22425040779164032, + "kl": 0.0988311767578125, + "learning_rate": 4.99795505698779e-07, + "loss": 0.0001, + "reward": 1.7486607730388641, + "reward_std": 0.05240166233852506, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5736 + }, + { + "completion_length": 242.62947845458984, + "epoch": 0.9619849951800159, + "grad_norm": 0.23321105934828876, + "kl": 0.093170166015625, + "learning_rate": 4.997950203188859e-07, + "loss": 0.0001, + "reward": 1.825000062584877, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 5738 + }, + { + "completion_length": 252.52233600616455, + "epoch": 0.9623202984198835, + "grad_norm": 1.1336488889710032, + "kl": 0.0902557373046875, + "learning_rate": 4.997945343638721e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.771428594365716, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5740 + }, + { + "completion_length": 238.45983123779297, + "epoch": 0.9626556016597511, + "grad_norm": 0.08074561960104147, + "kl": 0.0883331298828125, + "learning_rate": 4.997940478337387e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.7660714685916901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5742 + }, + { + "completion_length": 241.31251049041748, + "epoch": 0.9629909048996186, + "grad_norm": 0.16187012849684024, + "kl": 0.0847930908203125, + "learning_rate": 4.997935607284869e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.8196428641676903, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5744 + }, + { + "completion_length": 243.7232255935669, + "epoch": 0.9633262081394861, + "grad_norm": 0.7833918591829895, + "kl": 0.11737060546875, + "learning_rate": 4.997930730481175e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7660714499652386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5746 + }, + { + "completion_length": 254.01786994934082, + "epoch": 0.9636615113793537, + "grad_norm": 0.2893503495802617, + "kl": 0.0744171142578125, + "learning_rate": 4.99792584792632e-07, + "loss": 0.0001, + "reward": 1.7214286178350449, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7303571961820126, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5748 + }, + { + "completion_length": 247.45090579986572, + "epoch": 0.9639968146192213, + "grad_norm": 0.21552202774201076, + "kl": 0.076690673828125, + "learning_rate": 4.997920959620312e-07, + "loss": 0.0001, + "reward": 1.7517857626080513, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5750 + }, + { + "completion_length": 228.21876049041748, + "epoch": 0.9643321178590888, + "grad_norm": 0.17294579698170476, + "kl": 0.0935211181640625, + "learning_rate": 4.997916065563164e-07, + "loss": 0.0001, + "reward": 1.7767857611179352, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5752 + }, + { + "completion_length": 237.68304824829102, + "epoch": 0.9646674210989564, + "grad_norm": 0.2319434648635915, + "kl": 0.148101806640625, + "learning_rate": 4.997911165754888e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357142902910709, + "rewards/format_reward_func": 1.0, + "step": 5754 + }, + { + "completion_length": 235.92858123779297, + "epoch": 0.965002724338824, + "grad_norm": 0.25800015070654836, + "kl": 0.1013336181640625, + "learning_rate": 4.997906260195495e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5756 + }, + { + "completion_length": 227.48661613464355, + "epoch": 0.9653380275786915, + "grad_norm": 0.5387041435019756, + "kl": 0.2335357666015625, + "learning_rate": 4.997901348884994e-07, + "loss": 0.0002, + "reward": 1.8125000670552254, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8169643171131611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5758 + }, + { + "completion_length": 255.6071538925171, + "epoch": 0.965673330818559, + "grad_norm": 0.9829855590547008, + "kl": 0.515045166015625, + "learning_rate": 4.997896431823398e-07, + "loss": 0.0005, + "reward": 1.6928571909666061, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7107143253087997, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5760 + }, + { + "completion_length": 249.9062623977661, + "epoch": 0.9660086340584266, + "grad_norm": 0.4302051737915668, + "kl": 0.381072998046875, + "learning_rate": 4.997891509010719e-07, + "loss": 0.0004, + "reward": 1.7964286357164383, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.8053571581840515, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5762 + }, + { + "completion_length": 251.86161708831787, + "epoch": 0.9663439372982942, + "grad_norm": 0.2003740166787335, + "kl": 0.1812744140625, + "learning_rate": 4.997886580446968e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7821428701281548, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5764 + }, + { + "completion_length": 236.9910831451416, + "epoch": 0.9666792405381617, + "grad_norm": 0.30823609277606523, + "kl": 0.79034423828125, + "learning_rate": 4.997881646132154e-07, + "loss": 0.0008, + "reward": 1.7732143551111221, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5766 + }, + { + "completion_length": 258.9464406967163, + "epoch": 0.9670145437780292, + "grad_norm": 0.4599728212819268, + "kl": 4.020721435546875, + "learning_rate": 4.997876706066293e-07, + "loss": 0.004, + "reward": 1.7410714775323868, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5768 + }, + { + "completion_length": 255.64733028411865, + "epoch": 0.9673498470178968, + "grad_norm": 1.0446450602846626, + "kl": 4.951690673828125, + "learning_rate": 4.997871760249392e-07, + "loss": 0.005, + "reward": 1.6772322282195091, + "reward_std": 0.1433153918478638, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 0.9629464447498322, + "step": 5770 + }, + { + "completion_length": 256.3928699493408, + "epoch": 0.9676851502577644, + "grad_norm": 0.652762162620934, + "kl": 1.116546630859375, + "learning_rate": 4.997866808681464e-07, + "loss": 0.0011, + "reward": 1.6901786252856255, + "reward_std": 0.14773480826988816, + "rewards/equation_reward_func": 0.7366071753203869, + "rewards/format_reward_func": 0.9535714499652386, + "step": 5772 + }, + { + "completion_length": 260.3928680419922, + "epoch": 0.9680204534976319, + "grad_norm": 0.4629856359890017, + "kl": 1.8438873291015625, + "learning_rate": 4.997861851362522e-07, + "loss": 0.0018, + "reward": 1.7361607551574707, + "reward_std": 0.12058695964515209, + "rewards/equation_reward_func": 0.7598214652389288, + "rewards/format_reward_func": 0.9763392992317677, + "step": 5774 + }, + { + "completion_length": 258.7410840988159, + "epoch": 0.9683557567374995, + "grad_norm": 0.2744137856877816, + "kl": 0.482940673828125, + "learning_rate": 4.997856888292575e-07, + "loss": 0.0005, + "reward": 1.7428571954369545, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5776 + }, + { + "completion_length": 255.73661613464355, + "epoch": 0.968691059977367, + "grad_norm": 0.1992131837536013, + "kl": 0.2537994384765625, + "learning_rate": 4.997851919471634e-07, + "loss": 0.0003, + "reward": 1.6647322177886963, + "reward_std": 0.07007933338172734, + "rewards/equation_reward_func": 0.6794643215835094, + "rewards/format_reward_func": 0.9852678664028645, + "step": 5778 + }, + { + "completion_length": 243.08929824829102, + "epoch": 0.9690263632172346, + "grad_norm": 0.2374438613904606, + "kl": 0.1378021240234375, + "learning_rate": 4.997846944899713e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.0858629671856761, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5780 + }, + { + "completion_length": 230.6116189956665, + "epoch": 0.9693616664571021, + "grad_norm": 0.19207024544444537, + "kl": 0.1244354248046875, + "learning_rate": 4.997841964576822e-07, + "loss": 0.0001, + "reward": 1.7482143267989159, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526786141097546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5782 + }, + { + "completion_length": 234.8303680419922, + "epoch": 0.9696969696969697, + "grad_norm": 0.29969401860216555, + "kl": 0.1541748046875, + "learning_rate": 4.997836978502973e-07, + "loss": 0.0002, + "reward": 1.7196429297327995, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7330357395112514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5784 + }, + { + "completion_length": 235.4687614440918, + "epoch": 0.9700322729368372, + "grad_norm": 0.0050095061490047785, + "kl": 0.1041717529296875, + "learning_rate": 4.997831986678177e-07, + "loss": 0.0001, + "reward": 1.7397321984171867, + "reward_std": 0.034723992459475994, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.994196429848671, + "step": 5786 + }, + { + "completion_length": 233.54465007781982, + "epoch": 0.9703675761767048, + "grad_norm": 0.4226043906249522, + "kl": 0.1141204833984375, + "learning_rate": 4.997826989102445e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500331550837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5788 + }, + { + "completion_length": 235.62501049041748, + "epoch": 0.9707028794165724, + "grad_norm": 0.14606378420437877, + "kl": 0.11199951171875, + "learning_rate": 4.997821985775789e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 5790 + }, + { + "completion_length": 224.12054443359375, + "epoch": 0.9710381826564399, + "grad_norm": 0.4096506818257736, + "kl": 0.1255035400390625, + "learning_rate": 4.997816976698222e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 5792 + }, + { + "completion_length": 231.3482255935669, + "epoch": 0.9713734858963075, + "grad_norm": 0.3583328866620858, + "kl": 0.137237548828125, + "learning_rate": 4.997811961869754e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7589285895228386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5794 + }, + { + "completion_length": 228.71429538726807, + "epoch": 0.971708789136175, + "grad_norm": 0.20583662808869987, + "kl": 0.106781005859375, + "learning_rate": 4.997806941290396e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857406288385, + "rewards/format_reward_func": 1.0, + "step": 5796 + }, + { + "completion_length": 222.55804634094238, + "epoch": 0.9720440923760426, + "grad_norm": 0.17532440930536886, + "kl": 0.09112548828125, + "learning_rate": 4.99780191496016e-07, + "loss": 0.0001, + "reward": 1.725000061094761, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.725000036880374, + "rewards/format_reward_func": 1.0, + "step": 5798 + }, + { + "completion_length": 235.87501049041748, + "epoch": 0.9723793956159101, + "grad_norm": 0.31365165812364126, + "kl": 0.09649658203125, + "learning_rate": 4.997796882879058e-07, + "loss": 0.0001, + "reward": 1.764285758137703, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7732143122702837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5800 + }, + { + "completion_length": 223.37500953674316, + "epoch": 0.9727146988557777, + "grad_norm": 0.3106940012934583, + "kl": 0.104827880859375, + "learning_rate": 4.997791845047102e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 5802 + }, + { + "completion_length": 218.8616180419922, + "epoch": 0.9730500020956453, + "grad_norm": 0.22578401002909218, + "kl": 0.09271240234375, + "learning_rate": 4.997786801464303e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 5804 + }, + { + "completion_length": 220.16518592834473, + "epoch": 0.9733853053355128, + "grad_norm": 0.331889824862444, + "kl": 0.08624267578125, + "learning_rate": 4.997781752130673e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 5806 + }, + { + "completion_length": 224.34822463989258, + "epoch": 0.9737206085753803, + "grad_norm": 0.21284460310280223, + "kl": 0.1049041748046875, + "learning_rate": 4.997776697046223e-07, + "loss": 0.0001, + "reward": 1.8125000521540642, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8169643059372902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5808 + }, + { + "completion_length": 221.4553680419922, + "epoch": 0.9740559118152479, + "grad_norm": 0.20490578484408217, + "kl": 0.07861328125, + "learning_rate": 4.997771636210965e-07, + "loss": 0.0001, + "reward": 1.783928632736206, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5810 + }, + { + "completion_length": 225.59822463989258, + "epoch": 0.9743912150551155, + "grad_norm": 0.33365690223263317, + "kl": 0.0732269287109375, + "learning_rate": 4.99776656962491e-07, + "loss": 0.0001, + "reward": 1.7892857566475868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857436090708, + "rewards/format_reward_func": 1.0, + "step": 5812 + }, + { + "completion_length": 225.36161422729492, + "epoch": 0.974726518294983, + "grad_norm": 0.2679487209690672, + "kl": 0.0830841064453125, + "learning_rate": 4.997761497288071e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428969502449, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5814 + }, + { + "completion_length": 218.46429634094238, + "epoch": 0.9750618215348505, + "grad_norm": 0.3324314294312522, + "kl": 0.0901641845703125, + "learning_rate": 4.997756419200458e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 5816 + }, + { + "completion_length": 229.84375858306885, + "epoch": 0.9753971247747182, + "grad_norm": 0.18266863033969682, + "kl": 0.075103759765625, + "learning_rate": 4.997751335362085e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 5818 + }, + { + "completion_length": 225.60715293884277, + "epoch": 0.9757324280145857, + "grad_norm": 0.3062540378594742, + "kl": 0.079925537109375, + "learning_rate": 4.997746245772962e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 5820 + }, + { + "completion_length": 232.3035831451416, + "epoch": 0.9760677312544532, + "grad_norm": 0.26651813698518534, + "kl": 0.07159423828125, + "learning_rate": 4.9977411504331e-07, + "loss": 0.0001, + "reward": 1.6892857998609543, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6892857421189547, + "rewards/format_reward_func": 1.0, + "step": 5822 + }, + { + "completion_length": 228.92411708831787, + "epoch": 0.9764030344943208, + "grad_norm": 0.2276887505675821, + "kl": 0.0889892578125, + "learning_rate": 4.997736049342512e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 5824 + }, + { + "completion_length": 233.46875953674316, + "epoch": 0.9767383377341884, + "grad_norm": 0.07011180696246816, + "kl": 0.0858306884765625, + "learning_rate": 4.997730942501211e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 5826 + }, + { + "completion_length": 234.75893878936768, + "epoch": 0.9770736409740559, + "grad_norm": 0.5499560677708243, + "kl": 0.0884246826171875, + "learning_rate": 4.997725829909205e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5828 + }, + { + "completion_length": 228.28125953674316, + "epoch": 0.9774089442139234, + "grad_norm": 0.2858952293699924, + "kl": 0.0711822509765625, + "learning_rate": 4.99772071156651e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143204659224, + "rewards/format_reward_func": 1.0, + "step": 5830 + }, + { + "completion_length": 233.47768878936768, + "epoch": 0.9777442474537911, + "grad_norm": 0.252328997483877, + "kl": 0.0720367431640625, + "learning_rate": 4.997715587473135e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5832 + }, + { + "completion_length": 234.97322273254395, + "epoch": 0.9780795506936586, + "grad_norm": 0.2657323747803113, + "kl": 0.0772247314453125, + "learning_rate": 4.997710457629092e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071857511997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5834 + }, + { + "completion_length": 228.12054443359375, + "epoch": 0.9784148539335261, + "grad_norm": 0.00524528276891892, + "kl": 0.0808563232421875, + "learning_rate": 4.997705322034394e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.027779195457696915, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5836 + }, + { + "completion_length": 238.71876049041748, + "epoch": 0.9787501571733936, + "grad_norm": 0.28480684344520696, + "kl": 0.076904296875, + "learning_rate": 4.997700180689053e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964285872876644, + "rewards/format_reward_func": 1.0, + "step": 5838 + }, + { + "completion_length": 227.2366180419922, + "epoch": 0.9790854604132613, + "grad_norm": 0.3515029048482449, + "kl": 0.078582763671875, + "learning_rate": 4.99769503359308e-07, + "loss": 0.0001, + "reward": 1.7357143759727478, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 5840 + }, + { + "completion_length": 234.6830472946167, + "epoch": 0.9794207636531288, + "grad_norm": 0.1499024663881233, + "kl": 0.073211669921875, + "learning_rate": 4.997689880746486e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 5842 + }, + { + "completion_length": 229.09376049041748, + "epoch": 0.9797560668929963, + "grad_norm": 0.22930478631284873, + "kl": 0.05889892578125, + "learning_rate": 4.997684722149284e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714600235224, + "rewards/format_reward_func": 1.0, + "step": 5844 + }, + { + "completion_length": 228.95983219146729, + "epoch": 0.980091370132864, + "grad_norm": 0.20276460790670736, + "kl": 0.09027099609375, + "learning_rate": 4.997679557801487e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 5846 + }, + { + "completion_length": 223.62947463989258, + "epoch": 0.9804266733727315, + "grad_norm": 0.23756297896683548, + "kl": 0.0641937255859375, + "learning_rate": 4.997674387703104e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 5848 + }, + { + "completion_length": 214.40626049041748, + "epoch": 0.980761976612599, + "grad_norm": 0.24790898194269112, + "kl": 0.0617218017578125, + "learning_rate": 4.997669211854148e-07, + "loss": 0.0001, + "reward": 1.8214286491274834, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 1.0, + "step": 5850 + }, + { + "completion_length": 229.6919755935669, + "epoch": 0.9810972798524665, + "grad_norm": 0.45248769291328733, + "kl": 0.089996337890625, + "learning_rate": 4.997664030254634e-07, + "loss": 0.0001, + "reward": 1.7446429505944252, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7482143118977547, + "rewards/format_reward_func": 0.9964285790920258, + "step": 5852 + }, + { + "completion_length": 233.15625858306885, + "epoch": 0.9814325830923342, + "grad_norm": 0.17702506587319505, + "kl": 0.0703277587890625, + "learning_rate": 4.99765884290457e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7651786021888256, + "rewards/format_reward_func": 0.9866071455180645, + "step": 5854 + }, + { + "completion_length": 231.97322463989258, + "epoch": 0.9817678863322017, + "grad_norm": 0.25418976598809995, + "kl": 0.06585693359375, + "learning_rate": 4.997653649803968e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 5856 + }, + { + "completion_length": 234.37054634094238, + "epoch": 0.9821031895720692, + "grad_norm": 0.18552869148200038, + "kl": 0.09344482421875, + "learning_rate": 4.997648450952842e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571589291096, + "rewards/format_reward_func": 1.0, + "step": 5858 + }, + { + "completion_length": 245.8839406967163, + "epoch": 0.9824384928119368, + "grad_norm": 0.2988328236976148, + "kl": 0.0713958740234375, + "learning_rate": 4.997643246351204e-07, + "loss": 0.0001, + "reward": 1.7303571999073029, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7348214760422707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5860 + }, + { + "completion_length": 236.5625123977661, + "epoch": 0.9827737960518044, + "grad_norm": 0.09087717881506024, + "kl": 0.06475830078125, + "learning_rate": 4.997638035999065e-07, + "loss": 0.0001, + "reward": 1.7285715341567993, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 5862 + }, + { + "completion_length": 240.37501335144043, + "epoch": 0.9831090992916719, + "grad_norm": 0.2330972837811373, + "kl": 0.073760986328125, + "learning_rate": 4.997632819896437e-07, + "loss": 0.0001, + "reward": 1.7928572446107864, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 5864 + }, + { + "completion_length": 242.43750953674316, + "epoch": 0.9834444025315394, + "grad_norm": 0.3031159260506758, + "kl": 0.0743408203125, + "learning_rate": 4.997627598043331e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 5866 + }, + { + "completion_length": 228.8616180419922, + "epoch": 0.983779705771407, + "grad_norm": 0.21641965125787252, + "kl": 0.082244873046875, + "learning_rate": 4.997622370439762e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 5868 + }, + { + "completion_length": 243.7634048461914, + "epoch": 0.9841150090112746, + "grad_norm": 0.17336499937615538, + "kl": 0.068206787109375, + "learning_rate": 4.99761713708574e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571656346321, + "rewards/format_reward_func": 1.0, + "step": 5870 + }, + { + "completion_length": 241.12947463989258, + "epoch": 0.9844503122511421, + "grad_norm": 0.2895336684874953, + "kl": 0.0819244384765625, + "learning_rate": 4.997611897981277e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7910714596509933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5872 + }, + { + "completion_length": 241.39733123779297, + "epoch": 0.9847856154910097, + "grad_norm": 0.2139574550323431, + "kl": 0.0740966796875, + "learning_rate": 4.997606653126385e-07, + "loss": 0.0001, + "reward": 1.7928572222590446, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 5874 + }, + { + "completion_length": 245.07143783569336, + "epoch": 0.9851209187308773, + "grad_norm": 0.24078819208661117, + "kl": 0.0866851806640625, + "learning_rate": 4.997601402521077e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 5876 + }, + { + "completion_length": 227.65179634094238, + "epoch": 0.9854562219707448, + "grad_norm": 0.3018082502223864, + "kl": 0.0716400146484375, + "learning_rate": 4.997596146165363e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 5878 + }, + { + "completion_length": 232.21429824829102, + "epoch": 0.9857915252106123, + "grad_norm": 0.20631451236813453, + "kl": 0.0742034912109375, + "learning_rate": 4.997590884059259e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 5880 + }, + { + "completion_length": 245.1116180419922, + "epoch": 0.9861268284504799, + "grad_norm": 0.14566184117270678, + "kl": 0.078857421875, + "learning_rate": 4.997585616202773e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 5882 + }, + { + "completion_length": 245.8571538925171, + "epoch": 0.9864621316903475, + "grad_norm": 0.2400514185909936, + "kl": 0.0743560791015625, + "learning_rate": 4.99758034259592e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5884 + }, + { + "completion_length": 241.3348331451416, + "epoch": 0.986797434930215, + "grad_norm": 0.10029168277200791, + "kl": 0.0720062255859375, + "learning_rate": 4.997575063238711e-07, + "loss": 0.0001, + "reward": 1.7214286252856255, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214286047965288, + "rewards/format_reward_func": 1.0, + "step": 5886 + }, + { + "completion_length": 239.9732255935669, + "epoch": 0.9871327381700825, + "grad_norm": 0.12171211237035984, + "kl": 0.077362060546875, + "learning_rate": 4.997569778131157e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5888 + }, + { + "completion_length": 251.11608600616455, + "epoch": 0.9874680414099501, + "grad_norm": 0.24615144405117734, + "kl": 0.1072845458984375, + "learning_rate": 4.997564487273272e-07, + "loss": 0.0001, + "reward": 1.7125000730156898, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7258929051458836, + "rewards/format_reward_func": 0.9866071455180645, + "step": 5890 + }, + { + "completion_length": 248.27233409881592, + "epoch": 0.9878033446498177, + "grad_norm": 0.10878122748316436, + "kl": 0.0915069580078125, + "learning_rate": 4.997559190665067e-07, + "loss": 0.0001, + "reward": 1.7714286595582962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 1.0, + "step": 5892 + }, + { + "completion_length": 260.90626335144043, + "epoch": 0.9881386478896852, + "grad_norm": 0.24469718671854498, + "kl": 0.10359954833984375, + "learning_rate": 4.997553888306556e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5894 + }, + { + "completion_length": 263.89287090301514, + "epoch": 0.9884739511295528, + "grad_norm": 0.19164770454787758, + "kl": 0.0981903076171875, + "learning_rate": 4.997548580197749e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5896 + }, + { + "completion_length": 262.14733028411865, + "epoch": 0.9888092543694204, + "grad_norm": 0.11700179725728932, + "kl": 0.09405517578125, + "learning_rate": 4.99754326633866e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.8125000223517418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5898 + }, + { + "completion_length": 249.94197463989258, + "epoch": 0.9891445576092879, + "grad_norm": 0.1817638086218183, + "kl": 0.14984130859375, + "learning_rate": 4.997537946729298e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5900 + }, + { + "completion_length": 266.526798248291, + "epoch": 0.9894798608491554, + "grad_norm": 0.3553538368445335, + "kl": 0.1319122314453125, + "learning_rate": 4.99753262136968e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 5902 + }, + { + "completion_length": 255.9375123977661, + "epoch": 0.989815164089023, + "grad_norm": 0.26831031710499303, + "kl": 0.145782470703125, + "learning_rate": 4.997527290259816e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8160714507102966, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5904 + }, + { + "completion_length": 268.3571548461914, + "epoch": 0.9901504673288906, + "grad_norm": 0.2864201561733845, + "kl": 0.1101837158203125, + "learning_rate": 4.997521953399717e-07, + "loss": 0.0001, + "reward": 1.766071505844593, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7794643118977547, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5906 + }, + { + "completion_length": 266.1250114440918, + "epoch": 0.9904857705687581, + "grad_norm": 0.3488360998148485, + "kl": 0.1395721435546875, + "learning_rate": 4.997516610789397e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7866071723401546, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5908 + }, + { + "completion_length": 260.60715103149414, + "epoch": 0.9908210738086257, + "grad_norm": 0.13988328840005468, + "kl": 0.1138763427734375, + "learning_rate": 4.997511262428867e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 5910 + }, + { + "completion_length": 273.7991189956665, + "epoch": 0.9911563770484932, + "grad_norm": 0.18774606542998765, + "kl": 0.15557861328125, + "learning_rate": 4.997505908318142e-07, + "loss": 0.0002, + "reward": 1.7803571969270706, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5912 + }, + { + "completion_length": 261.88840770721436, + "epoch": 0.9914916802883608, + "grad_norm": 0.20507424623893872, + "kl": 0.2316436767578125, + "learning_rate": 4.997500548457231e-07, + "loss": 0.0002, + "reward": 1.7325893491506577, + "reward_std": 0.044825518038123846, + "rewards/equation_reward_func": 0.7383928745985031, + "rewards/format_reward_func": 0.9941964335739613, + "step": 5914 + }, + { + "completion_length": 256.0669755935669, + "epoch": 0.9918269835282283, + "grad_norm": 0.3678386165226889, + "kl": 0.1391754150390625, + "learning_rate": 4.997495182846147e-07, + "loss": 0.0001, + "reward": 1.7348214909434319, + "reward_std": 0.08207489270716906, + "rewards/equation_reward_func": 0.741071455180645, + "rewards/format_reward_func": 0.9937500059604645, + "step": 5916 + }, + { + "completion_length": 256.1160840988159, + "epoch": 0.9921622867680959, + "grad_norm": 0.1756867333928644, + "kl": 0.1789093017578125, + "learning_rate": 4.997489811484903e-07, + "loss": 0.0002, + "reward": 1.7803572043776512, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5918 + }, + { + "completion_length": 269.5178689956665, + "epoch": 0.9924975900079634, + "grad_norm": 0.11046519379219497, + "kl": 0.1328277587890625, + "learning_rate": 4.997484434373513e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5920 + }, + { + "completion_length": 269.83929538726807, + "epoch": 0.992832893247831, + "grad_norm": 0.2843856797083342, + "kl": 0.4182891845703125, + "learning_rate": 4.997479051511988e-07, + "loss": 0.0004, + "reward": 1.696428656578064, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7053571809083223, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5922 + }, + { + "completion_length": 266.794659614563, + "epoch": 0.9931681964876986, + "grad_norm": 0.25353859287113506, + "kl": 0.2885894775390625, + "learning_rate": 4.997473662900339e-07, + "loss": 0.0003, + "reward": 1.6517857983708382, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6562500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5924 + }, + { + "completion_length": 260.80358505249023, + "epoch": 0.9935034997275661, + "grad_norm": 0.17313230378824523, + "kl": 0.297637939453125, + "learning_rate": 4.997468268538579e-07, + "loss": 0.0003, + "reward": 1.7571429312229156, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5926 + }, + { + "completion_length": 249.4107265472412, + "epoch": 0.9938388029674337, + "grad_norm": 0.1547458708021722, + "kl": 0.3666229248046875, + "learning_rate": 4.997462868426722e-07, + "loss": 0.0004, + "reward": 1.826785758137703, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8312500193715096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5928 + }, + { + "completion_length": 255.52233219146729, + "epoch": 0.9941741062073012, + "grad_norm": 0.39843122540179543, + "kl": 0.4351806640625, + "learning_rate": 4.997457462564781e-07, + "loss": 0.0004, + "reward": 1.7040179446339607, + "reward_std": 0.06124049751088023, + "rewards/equation_reward_func": 0.7098214589059353, + "rewards/format_reward_func": 0.9941964335739613, + "step": 5930 + }, + { + "completion_length": 259.84376430511475, + "epoch": 0.9945094094471688, + "grad_norm": 0.3779794869916766, + "kl": 0.6574249267578125, + "learning_rate": 4.997452050952765e-07, + "loss": 0.0007, + "reward": 1.76071435213089, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7696428894996643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5932 + }, + { + "completion_length": 251.8794765472412, + "epoch": 0.9948447126870363, + "grad_norm": 0.8563269925502356, + "kl": 1.45794677734375, + "learning_rate": 4.997446633590689e-07, + "loss": 0.0015, + "reward": 1.7553571909666061, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5934 + }, + { + "completion_length": 251.55805110931396, + "epoch": 0.9951800159269039, + "grad_norm": 0.2936187191356089, + "kl": 1.031280517578125, + "learning_rate": 4.997441210478564e-07, + "loss": 0.001, + "reward": 1.7142857611179352, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7232143320143223, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5936 + }, + { + "completion_length": 260.50447368621826, + "epoch": 0.9955153191667715, + "grad_norm": 0.28716438779501635, + "kl": 0.818145751953125, + "learning_rate": 4.997435781616405e-07, + "loss": 0.0008, + "reward": 1.7267857939004898, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7312500290572643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5938 + }, + { + "completion_length": 256.3616180419922, + "epoch": 0.995850622406639, + "grad_norm": 0.23298759856279558, + "kl": 1.7165985107421875, + "learning_rate": 4.997430347004221e-07, + "loss": 0.0017, + "reward": 1.7642857655882835, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5940 + }, + { + "completion_length": 256.57143783569336, + "epoch": 0.9961859256465065, + "grad_norm": 0.2095096831056827, + "kl": 0.1193695068359375, + "learning_rate": 4.997424906642028e-07, + "loss": 0.0001, + "reward": 1.8200893253087997, + "reward_std": 0.06250318605452776, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 0.9986607171595097, + "step": 5942 + }, + { + "completion_length": 253.92858219146729, + "epoch": 0.9965212288863741, + "grad_norm": 0.22191173301560937, + "kl": 0.5313568115234375, + "learning_rate": 4.997419460529836e-07, + "loss": 0.0005, + "reward": 1.7160715237259865, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5944 + }, + { + "completion_length": 252.2946548461914, + "epoch": 0.9968565321262417, + "grad_norm": 0.2167018970654993, + "kl": 1.0382080078125, + "learning_rate": 4.997414008667658e-07, + "loss": 0.001, + "reward": 1.7982143461704254, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026785887777805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5946 + }, + { + "completion_length": 262.90626525878906, + "epoch": 0.9971918353661092, + "grad_norm": 0.25514814703256206, + "kl": 0.0827789306640625, + "learning_rate": 4.997408551055508e-07, + "loss": 0.0001, + "reward": 1.7839286476373672, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7875000238418579, + "rewards/format_reward_func": 0.9964285716414452, + "step": 5948 + }, + { + "completion_length": 261.5714406967163, + "epoch": 0.9975271386059767, + "grad_norm": 0.30817589784765403, + "kl": 0.498565673828125, + "learning_rate": 4.997403087693398e-07, + "loss": 0.0005, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 5950 + }, + { + "completion_length": 264.19197845458984, + "epoch": 0.9978624418458444, + "grad_norm": 0.21263202583302027, + "kl": 0.1584625244140625, + "learning_rate": 4.997397618581339e-07, + "loss": 0.0002, + "reward": 1.7767857611179352, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500204890966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5952 + }, + { + "completion_length": 259.008939743042, + "epoch": 0.9981977450857119, + "grad_norm": 0.2297520903477404, + "kl": 0.1841583251953125, + "learning_rate": 4.997392143719344e-07, + "loss": 0.0002, + "reward": 1.7107143625617027, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143327593803, + "rewards/format_reward_func": 1.0, + "step": 5954 + }, + { + "completion_length": 255.6785831451416, + "epoch": 0.9985330483255794, + "grad_norm": 0.23026073378599074, + "kl": 0.18896484375, + "learning_rate": 4.997386663107428e-07, + "loss": 0.0002, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571823984385, + "rewards/format_reward_func": 1.0, + "step": 5956 + }, + { + "completion_length": 266.88394260406494, + "epoch": 0.998868351565447, + "grad_norm": 0.2934772523750861, + "kl": 0.1302490234375, + "learning_rate": 4.997381176745602e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5958 + }, + { + "completion_length": 262.446439743042, + "epoch": 0.9992036548053146, + "grad_norm": 0.26250342499542473, + "kl": 0.257049560546875, + "learning_rate": 4.997375684633878e-07, + "loss": 0.0003, + "reward": 1.782142922282219, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428682655096, + "rewards/format_reward_func": 1.0, + "step": 5960 + }, + { + "completion_length": 253.3616180419922, + "epoch": 0.9995389580451821, + "grad_norm": 0.35762220838323594, + "kl": 0.0915374755859375, + "learning_rate": 4.997370186772269e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5962 + }, + { + "completion_length": 263.8259029388428, + "epoch": 0.9998742612850496, + "grad_norm": 1.4570787798699594, + "kl": 0.1656646728515625, + "learning_rate": 4.997364683160787e-07, + "loss": 0.0002, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 5964 + }, + { + "completion_length": 265.9805304787376, + "epoch": 1.0003353032398676, + "grad_norm": 0.9384003688196095, + "kl": 0.10651189630681818, + "learning_rate": 4.997359173799447e-07, + "loss": 0.0001, + "reward": 1.6987013925205579, + "reward_std": 0.08815876665440472, + "rewards/equation_reward_func": 0.7116883397102356, + "rewards/format_reward_func": 0.9870129932056774, + "step": 5966 + }, + { + "completion_length": 259.83037090301514, + "epoch": 1.000670606479735, + "grad_norm": 1.2250441001337247, + "kl": 0.1012420654296875, + "learning_rate": 4.99735365868826e-07, + "loss": 0.0001, + "reward": 1.769642949104309, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5968 + }, + { + "completion_length": 265.8125104904175, + "epoch": 1.0010059097196027, + "grad_norm": 0.21174169445063962, + "kl": 0.10089111328125, + "learning_rate": 4.997348137827238e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7142857573926449, + "rewards/format_reward_func": 1.0, + "step": 5970 + }, + { + "completion_length": 264.96876430511475, + "epoch": 1.0013412129594703, + "grad_norm": 0.309308577606721, + "kl": 0.06874847412109375, + "learning_rate": 4.997342611216395e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5972 + }, + { + "completion_length": 258.415189743042, + "epoch": 1.0016765161993377, + "grad_norm": 0.9905876695648793, + "kl": 0.0795135498046875, + "learning_rate": 4.997337078855744e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 5974 + }, + { + "completion_length": 266.07144355773926, + "epoch": 1.0020118194392054, + "grad_norm": 0.2839674817964923, + "kl": 0.1131744384765625, + "learning_rate": 4.997331540745296e-07, + "loss": 0.0001, + "reward": 1.7303571924567223, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7437500394880772, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5976 + }, + { + "completion_length": 276.91519355773926, + "epoch": 1.0023471226790728, + "grad_norm": 0.24495313830148246, + "kl": 0.123016357421875, + "learning_rate": 4.997325996885066e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 5978 + }, + { + "completion_length": 273.0178699493408, + "epoch": 1.0026824259189404, + "grad_norm": 1.3292883047967456, + "kl": 0.136383056640625, + "learning_rate": 4.997320447275065e-07, + "loss": 0.0001, + "reward": 1.71830365806818, + "reward_std": 0.05492704384960234, + "rewards/equation_reward_func": 0.7241071686148643, + "rewards/format_reward_func": 0.9941964335739613, + "step": 5980 + }, + { + "completion_length": 271.33929538726807, + "epoch": 1.003017729158808, + "grad_norm": 1.1772382051916397, + "kl": 0.2193603515625, + "learning_rate": 4.997314891915307e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5982 + }, + { + "completion_length": 260.65179347991943, + "epoch": 1.0033530323986755, + "grad_norm": 4.171729150636543, + "kl": 0.216400146484375, + "learning_rate": 4.997309330805803e-07, + "loss": 0.0002, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 5984 + }, + { + "completion_length": 264.4196529388428, + "epoch": 1.003688335638543, + "grad_norm": 0.38616155970757654, + "kl": 0.297698974609375, + "learning_rate": 4.997303763946568e-07, + "loss": 0.0003, + "reward": 1.7160715013742447, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7294643148779869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 5986 + }, + { + "completion_length": 261.3169755935669, + "epoch": 1.0040236388784107, + "grad_norm": 0.21821546768331762, + "kl": 0.239715576171875, + "learning_rate": 4.997298191337613e-07, + "loss": 0.0002, + "reward": 1.7611608058214188, + "reward_std": 0.05492704384960234, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9941964335739613, + "step": 5988 + }, + { + "completion_length": 272.94197845458984, + "epoch": 1.0043589421182781, + "grad_norm": 0.8574313762374961, + "kl": 0.2301483154296875, + "learning_rate": 4.997292612978954e-07, + "loss": 0.0002, + "reward": 1.7214286103844643, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7303571626543999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5990 + }, + { + "completion_length": 282.87947273254395, + "epoch": 1.0046942453581458, + "grad_norm": 0.41062082585236326, + "kl": 0.18109130859375, + "learning_rate": 4.997287028870599e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.09596449136734009, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5992 + }, + { + "completion_length": 264.7321548461914, + "epoch": 1.0050295485980134, + "grad_norm": 0.22685254042735117, + "kl": 0.5179901123046875, + "learning_rate": 4.997281439012564e-07, + "loss": 0.0005, + "reward": 1.8035714849829674, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8125000335276127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 5994 + }, + { + "completion_length": 269.5223331451416, + "epoch": 1.0053648518378808, + "grad_norm": 0.22963070499995214, + "kl": 0.550018310546875, + "learning_rate": 4.997275843404861e-07, + "loss": 0.0006, + "reward": 1.7142857760190964, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 0.9821428656578064, + "step": 5996 + }, + { + "completion_length": 265.7946557998657, + "epoch": 1.0057001550777485, + "grad_norm": 0.4467582667821596, + "kl": 0.439666748046875, + "learning_rate": 4.997270242047504e-07, + "loss": 0.0004, + "reward": 1.7343750521540642, + "reward_std": 0.0675539500080049, + "rewards/equation_reward_func": 0.7535714637488127, + "rewards/format_reward_func": 0.980803582817316, + "step": 5998 + }, + { + "completion_length": 262.388409614563, + "epoch": 1.006035458317616, + "grad_norm": 0.21615161312001333, + "kl": 0.2450103759765625, + "learning_rate": 4.997264634940503e-07, + "loss": 0.0002, + "reward": 1.773214340209961, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.777678620070219, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6000 + }, + { + "completion_length": 250.29911994934082, + "epoch": 1.0063707615574835, + "grad_norm": 0.34505426050760507, + "kl": 1.3549957275390625, + "learning_rate": 4.997259022083875e-07, + "loss": 0.0014, + "reward": 1.76071435213089, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 6002 + }, + { + "completion_length": 256.5000104904175, + "epoch": 1.0067060647973511, + "grad_norm": 0.23308189879398827, + "kl": 0.4465484619140625, + "learning_rate": 4.99725340347763e-07, + "loss": 0.0004, + "reward": 1.7714286372065544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 6004 + }, + { + "completion_length": 250.5669765472412, + "epoch": 1.0070413680372186, + "grad_norm": 0.23353505975057562, + "kl": 0.1022491455078125, + "learning_rate": 4.997247779121782e-07, + "loss": 0.0001, + "reward": 1.8357143178582191, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.8446428813040257, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6006 + }, + { + "completion_length": 260.1562614440918, + "epoch": 1.0073766712770862, + "grad_norm": 0.34613575609921376, + "kl": 0.1776885986328125, + "learning_rate": 4.997242149016343e-07, + "loss": 0.0002, + "reward": 1.7477679252624512, + "reward_std": 0.08396892924793065, + "rewards/equation_reward_func": 0.7553571686148643, + "rewards/format_reward_func": 0.9924107231199741, + "step": 6008 + }, + { + "completion_length": 260.95983505249023, + "epoch": 1.0077119745169538, + "grad_norm": 0.209932605339094, + "kl": 0.2499847412109375, + "learning_rate": 4.997236513161327e-07, + "loss": 0.0003, + "reward": 1.741071492433548, + "reward_std": 0.08333758544176817, + "rewards/equation_reward_func": 0.7633928693830967, + "rewards/format_reward_func": 0.977678582072258, + "step": 6010 + }, + { + "completion_length": 268.901798248291, + "epoch": 1.0080472777568212, + "grad_norm": 0.23702142547618296, + "kl": 0.272735595703125, + "learning_rate": 4.997230871556747e-07, + "loss": 0.0003, + "reward": 1.751785770058632, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6012 + }, + { + "completion_length": 258.6875123977661, + "epoch": 1.0083825809966889, + "grad_norm": 0.2647906060959816, + "kl": 0.30157470703125, + "learning_rate": 4.997225224202616e-07, + "loss": 0.0003, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500294297934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6014 + }, + { + "completion_length": 260.34822368621826, + "epoch": 1.0087178842365565, + "grad_norm": 0.3469783106868168, + "kl": 0.2371063232421875, + "learning_rate": 4.997219571098945e-07, + "loss": 0.0002, + "reward": 1.7589286267757416, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6016 + }, + { + "completion_length": 251.0714406967163, + "epoch": 1.009053187476424, + "grad_norm": 0.2878495464486915, + "kl": 0.1557464599609375, + "learning_rate": 4.997213912245751e-07, + "loss": 0.0002, + "reward": 1.7678572461009026, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 6018 + }, + { + "completion_length": 253.09376049041748, + "epoch": 1.0093884907162916, + "grad_norm": 0.31678984513529884, + "kl": 0.5877227783203125, + "learning_rate": 4.997208247643042e-07, + "loss": 0.0006, + "reward": 1.7678571864962578, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 6020 + }, + { + "completion_length": 251.00001335144043, + "epoch": 1.0097237939561592, + "grad_norm": 0.2262559946347966, + "kl": 0.2086334228515625, + "learning_rate": 4.997202577290836e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428932249546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6022 + }, + { + "completion_length": 259.9598340988159, + "epoch": 1.0100590971960266, + "grad_norm": 0.3477392927745284, + "kl": 0.3122711181640625, + "learning_rate": 4.997196901189142e-07, + "loss": 0.0003, + "reward": 1.7696429342031479, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6024 + }, + { + "completion_length": 251.7098331451416, + "epoch": 1.0103944004358942, + "grad_norm": 0.20619517269581278, + "kl": 0.207794189453125, + "learning_rate": 4.997191219337976e-07, + "loss": 0.0002, + "reward": 1.7339286506175995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928876370192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6026 + }, + { + "completion_length": 267.39733600616455, + "epoch": 1.0107297036757619, + "grad_norm": 0.28115504056281226, + "kl": 0.31494140625, + "learning_rate": 4.99718553173735e-07, + "loss": 0.0003, + "reward": 1.7214286401867867, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.739285746589303, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6028 + }, + { + "completion_length": 250.46876049041748, + "epoch": 1.0110650069156293, + "grad_norm": 0.08975104121727806, + "kl": 0.3542633056640625, + "learning_rate": 4.997179838387276e-07, + "loss": 0.0004, + "reward": 1.7642857730388641, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 6030 + }, + { + "completion_length": 249.0714406967163, + "epoch": 1.011400310155497, + "grad_norm": 0.2578463046422513, + "kl": 0.090484619140625, + "learning_rate": 4.997174139287768e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 6032 + }, + { + "completion_length": 248.33036994934082, + "epoch": 1.0117356133953643, + "grad_norm": 0.30858155230260276, + "kl": 0.078338623046875, + "learning_rate": 4.997168434438841e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.0656599160283804, + "rewards/equation_reward_func": 0.7410714663565159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6034 + }, + { + "completion_length": 252.12054538726807, + "epoch": 1.012070916635232, + "grad_norm": 0.26137515161726865, + "kl": 0.0858306884765625, + "learning_rate": 4.997162723840505e-07, + "loss": 0.0001, + "reward": 1.7910714596509933, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6036 + }, + { + "completion_length": 246.0669765472412, + "epoch": 1.0124062198750996, + "grad_norm": 0.30419280294145973, + "kl": 0.07515716552734375, + "learning_rate": 4.997157007492775e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6038 + }, + { + "completion_length": 250.9241189956665, + "epoch": 1.012741523114967, + "grad_norm": 0.2789904085399853, + "kl": 0.1070098876953125, + "learning_rate": 4.997151285395663e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 6040 + }, + { + "completion_length": 257.3526906967163, + "epoch": 1.0130768263548346, + "grad_norm": 0.29540488870553583, + "kl": 0.211273193359375, + "learning_rate": 4.997145557549184e-07, + "loss": 0.0002, + "reward": 1.6821429580450058, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6821429021656513, + "rewards/format_reward_func": 1.0, + "step": 6042 + }, + { + "completion_length": 263.98661613464355, + "epoch": 1.0134121295947023, + "grad_norm": 0.3503033270367516, + "kl": 0.429779052734375, + "learning_rate": 4.99713982395335e-07, + "loss": 0.0004, + "reward": 1.7642857506871223, + "reward_std": 0.04040610231459141, + "rewards/equation_reward_func": 0.77321432903409, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6044 + }, + { + "completion_length": 251.9509048461914, + "epoch": 1.0137474328345697, + "grad_norm": 0.2594043398085691, + "kl": 0.1715240478515625, + "learning_rate": 4.997134084608174e-07, + "loss": 0.0002, + "reward": 1.85714291036129, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8571428693830967, + "rewards/format_reward_func": 1.0, + "step": 6046 + }, + { + "completion_length": 255.3482265472412, + "epoch": 1.0140827360744373, + "grad_norm": 0.3322364759657405, + "kl": 0.13507080078125, + "learning_rate": 4.997128339513669e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 6048 + }, + { + "completion_length": 253.30805110931396, + "epoch": 1.014418039314305, + "grad_norm": 0.5803981729663658, + "kl": 0.480499267578125, + "learning_rate": 4.997122588669849e-07, + "loss": 0.0005, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 6050 + }, + { + "completion_length": 252.18751049041748, + "epoch": 1.0147533425541724, + "grad_norm": 0.40183967799838594, + "kl": 0.190338134765625, + "learning_rate": 4.997116832076727e-07, + "loss": 0.0002, + "reward": 1.7571429535746574, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 6052 + }, + { + "completion_length": 246.7321548461914, + "epoch": 1.01508864579404, + "grad_norm": 0.26403663378360054, + "kl": 0.138153076171875, + "learning_rate": 4.997111069734316e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.8053571693599224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6054 + }, + { + "completion_length": 250.02679920196533, + "epoch": 1.0154239490339076, + "grad_norm": 0.18947696729731334, + "kl": 0.109039306640625, + "learning_rate": 4.997105301642629e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6056 + }, + { + "completion_length": 257.2767972946167, + "epoch": 1.015759252273775, + "grad_norm": 0.43595958723248723, + "kl": 0.13599395751953125, + "learning_rate": 4.99709952780168e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 6058 + }, + { + "completion_length": 258.1741189956665, + "epoch": 1.0160945555136427, + "grad_norm": 0.261448358162978, + "kl": 0.150146484375, + "learning_rate": 4.997093748211482e-07, + "loss": 0.0002, + "reward": 1.73214291036129, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7410714700818062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6060 + }, + { + "completion_length": 256.78126430511475, + "epoch": 1.01642985875351, + "grad_norm": 0.18200663054584304, + "kl": 0.103240966796875, + "learning_rate": 4.997087962872049e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 6062 + }, + { + "completion_length": 258.3571557998657, + "epoch": 1.0167651619933777, + "grad_norm": 0.1819383632703182, + "kl": 0.17974853515625, + "learning_rate": 4.997082171783393e-07, + "loss": 0.0002, + "reward": 1.7732143625617027, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6064 + }, + { + "completion_length": 259.5134057998657, + "epoch": 1.0171004652332454, + "grad_norm": 0.3211241713421729, + "kl": 0.13037109375, + "learning_rate": 4.997076374945529e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6066 + }, + { + "completion_length": 255.58036994934082, + "epoch": 1.0174357684731128, + "grad_norm": 0.2425993917665553, + "kl": 0.2032470703125, + "learning_rate": 4.997070572358468e-07, + "loss": 0.0002, + "reward": 1.814285770058632, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 6068 + }, + { + "completion_length": 264.77680110931396, + "epoch": 1.0177710717129804, + "grad_norm": 0.15083247763046945, + "kl": 0.17120361328125, + "learning_rate": 4.997064764022225e-07, + "loss": 0.0002, + "reward": 1.780357226729393, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6070 + }, + { + "completion_length": 259.620548248291, + "epoch": 1.018106374952848, + "grad_norm": 0.5226944898181175, + "kl": 0.14019775390625, + "learning_rate": 4.997058949936814e-07, + "loss": 0.0001, + "reward": 1.7482143342494965, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7616071663796902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6072 + }, + { + "completion_length": 267.4955520629883, + "epoch": 1.0184416781927155, + "grad_norm": 0.29505621677078775, + "kl": 0.1959228515625, + "learning_rate": 4.997053130102247e-07, + "loss": 0.0002, + "reward": 1.7071429267525673, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7160714529454708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6074 + }, + { + "completion_length": 270.7053699493408, + "epoch": 1.018776981432583, + "grad_norm": 0.5437437458100716, + "kl": 0.2197265625, + "learning_rate": 4.997047304518538e-07, + "loss": 0.0002, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6076 + }, + { + "completion_length": 268.59822368621826, + "epoch": 1.0191122846724507, + "grad_norm": 0.24741324747983043, + "kl": 0.14093017578125, + "learning_rate": 4.997041473185699e-07, + "loss": 0.0001, + "reward": 1.6857143640518188, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.6946428865194321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6078 + }, + { + "completion_length": 279.3928680419922, + "epoch": 1.0194475879123182, + "grad_norm": 0.16270285483118307, + "kl": 0.12310791015625, + "learning_rate": 4.997035636103746e-07, + "loss": 0.0001, + "reward": 1.7808035910129547, + "reward_std": 0.0776554774492979, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 0.980803582817316, + "step": 6080 + }, + { + "completion_length": 270.41518783569336, + "epoch": 1.0197828911521858, + "grad_norm": 0.35956905028353797, + "kl": 0.286376953125, + "learning_rate": 4.997029793272691e-07, + "loss": 0.0003, + "reward": 1.703571505844593, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7125000394880772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6082 + }, + { + "completion_length": 270.30358505249023, + "epoch": 1.0201181943920532, + "grad_norm": 0.47292226180050445, + "kl": 0.266021728515625, + "learning_rate": 4.997023944692547e-07, + "loss": 0.0003, + "reward": 1.7482143491506577, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7616071775555611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6084 + }, + { + "completion_length": 251.9107265472412, + "epoch": 1.0204534976319208, + "grad_norm": 0.2832647131892392, + "kl": 0.1192626953125, + "learning_rate": 4.997018090363329e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7866071555763483, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6086 + }, + { + "completion_length": 264.56697845458984, + "epoch": 1.0207888008717885, + "grad_norm": 0.31822589986037786, + "kl": 0.26104736328125, + "learning_rate": 4.997012230285048e-07, + "loss": 0.0003, + "reward": 1.6875000894069672, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7098214626312256, + "rewards/format_reward_func": 0.977678582072258, + "step": 6088 + }, + { + "completion_length": 265.49108695983887, + "epoch": 1.0211241041116559, + "grad_norm": 0.45547720897693517, + "kl": 0.40484619140625, + "learning_rate": 4.997006364457721e-07, + "loss": 0.0004, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 6090 + }, + { + "completion_length": 268.34376430511475, + "epoch": 1.0214594073515235, + "grad_norm": 0.2786369185101486, + "kl": 0.428558349609375, + "learning_rate": 4.997000492881358e-07, + "loss": 0.0004, + "reward": 1.7053572237491608, + "reward_std": 0.09848987311124802, + "rewards/equation_reward_func": 0.7366071715950966, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6092 + }, + { + "completion_length": 277.3214406967163, + "epoch": 1.0217947105913912, + "grad_norm": 0.18411353534994465, + "kl": 0.46099853515625, + "learning_rate": 4.996994615555975e-07, + "loss": 0.0005, + "reward": 1.7071429193019867, + "reward_std": 0.10101525392383337, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6094 + }, + { + "completion_length": 267.19197845458984, + "epoch": 1.0221300138312586, + "grad_norm": 0.3466971216842296, + "kl": 0.677581787109375, + "learning_rate": 4.996988732481584e-07, + "loss": 0.0007, + "reward": 1.694642923772335, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.977678582072258, + "step": 6096 + }, + { + "completion_length": 267.0223379135132, + "epoch": 1.0224653170711262, + "grad_norm": 0.3621796109181974, + "kl": 0.21337890625, + "learning_rate": 4.9969828436582e-07, + "loss": 0.0002, + "reward": 1.7410714849829674, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7544643245637417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6098 + }, + { + "completion_length": 273.089298248291, + "epoch": 1.0228006203109938, + "grad_norm": 0.7042681314100642, + "kl": 0.7377166748046875, + "learning_rate": 4.996976949085834e-07, + "loss": 0.0007, + "reward": 1.6803572326898575, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6937500275671482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6100 + }, + { + "completion_length": 267.9687614440918, + "epoch": 1.0231359235508612, + "grad_norm": 0.2904338249092064, + "kl": 0.147735595703125, + "learning_rate": 4.996971048764502e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.1060660183429718, + "rewards/equation_reward_func": 0.7410714644938707, + "rewards/format_reward_func": 0.9732142947614193, + "step": 6102 + }, + { + "completion_length": 254.40179443359375, + "epoch": 1.0234712267907289, + "grad_norm": 0.1715708017841068, + "kl": 0.1373291015625, + "learning_rate": 4.996965142694217e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.025253813713788986, + "rewards/equation_reward_func": 0.8125000223517418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6104 + }, + { + "completion_length": 260.66518783569336, + "epoch": 1.0238065300305965, + "grad_norm": 0.5298721752119621, + "kl": 0.3925323486328125, + "learning_rate": 4.996959230874993e-07, + "loss": 0.0004, + "reward": 1.7607143446803093, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6106 + }, + { + "completion_length": 261.8437614440918, + "epoch": 1.024141833270464, + "grad_norm": 0.2597657463578196, + "kl": 0.9878387451171875, + "learning_rate": 4.996953313306842e-07, + "loss": 0.001, + "reward": 1.7339286357164383, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7473214529454708, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6108 + }, + { + "completion_length": 258.2544765472412, + "epoch": 1.0244771365103316, + "grad_norm": 0.34126144567118943, + "kl": 0.373443603515625, + "learning_rate": 4.99694738998978e-07, + "loss": 0.0004, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 6110 + }, + { + "completion_length": 248.68304443359375, + "epoch": 1.024812439750199, + "grad_norm": 0.191507371496448, + "kl": 0.4516754150390625, + "learning_rate": 4.996941460923818e-07, + "loss": 0.0005, + "reward": 1.7660714760422707, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7794643137603998, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6112 + }, + { + "completion_length": 252.77233409881592, + "epoch": 1.0251477429900666, + "grad_norm": 0.38964610451237514, + "kl": 0.4473114013671875, + "learning_rate": 4.996935526108972e-07, + "loss": 0.0004, + "reward": 1.7482143267989159, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526786085218191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6114 + }, + { + "completion_length": 251.73215293884277, + "epoch": 1.0254830462299342, + "grad_norm": 0.2627876383548723, + "kl": 0.3182525634765625, + "learning_rate": 4.996929585545254e-07, + "loss": 0.0003, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6116 + }, + { + "completion_length": 239.15625953674316, + "epoch": 1.0258183494698017, + "grad_norm": 0.11734356018251145, + "kl": 0.720947265625, + "learning_rate": 4.996923639232678e-07, + "loss": 0.0007, + "reward": 1.7642857730388641, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 6118 + }, + { + "completion_length": 232.41518878936768, + "epoch": 1.0261536527096693, + "grad_norm": 0.3020438778673639, + "kl": 0.3007354736328125, + "learning_rate": 4.996917687171258e-07, + "loss": 0.0003, + "reward": 1.7839286476373672, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6120 + }, + { + "completion_length": 234.4821548461914, + "epoch": 1.026488955949537, + "grad_norm": 0.19764513890719873, + "kl": 0.581573486328125, + "learning_rate": 4.996911729361009e-07, + "loss": 0.0006, + "reward": 1.8357143551111221, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8357143141329288, + "rewards/format_reward_func": 1.0, + "step": 6122 + }, + { + "completion_length": 230.95983219146729, + "epoch": 1.0268242591894043, + "grad_norm": 0.1907246795303604, + "kl": 0.1136474609375, + "learning_rate": 4.996905765801942e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 6124 + }, + { + "completion_length": 233.60715198516846, + "epoch": 1.027159562429272, + "grad_norm": 0.22052651145584928, + "kl": 0.1941375732421875, + "learning_rate": 4.996899796494073e-07, + "loss": 0.0002, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 6126 + }, + { + "completion_length": 233.07590198516846, + "epoch": 1.0274948656691396, + "grad_norm": 0.28840565132762075, + "kl": 0.187591552734375, + "learning_rate": 4.996893821437414e-07, + "loss": 0.0002, + "reward": 1.6821429654955864, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6821428909897804, + "rewards/format_reward_func": 1.0, + "step": 6128 + }, + { + "completion_length": 222.7991189956665, + "epoch": 1.027830168909007, + "grad_norm": 0.22193940299452425, + "kl": 0.144378662109375, + "learning_rate": 4.99688784063198e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 6130 + }, + { + "completion_length": 230.36608219146729, + "epoch": 1.0281654721488747, + "grad_norm": 0.5075373073783743, + "kl": 0.3329925537109375, + "learning_rate": 4.996881854077785e-07, + "loss": 0.0003, + "reward": 1.7571429088711739, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 6132 + }, + { + "completion_length": 231.11608219146729, + "epoch": 1.0285007753887423, + "grad_norm": 0.20879696471969295, + "kl": 0.253082275390625, + "learning_rate": 4.996875861774842e-07, + "loss": 0.0003, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6134 + }, + { + "completion_length": 230.20090579986572, + "epoch": 1.0288360786286097, + "grad_norm": 0.38774526912973895, + "kl": 0.451629638671875, + "learning_rate": 4.996869863723165e-07, + "loss": 0.0005, + "reward": 1.7267857789993286, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7401785980910063, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6136 + }, + { + "completion_length": 220.5357265472412, + "epoch": 1.0291713818684773, + "grad_norm": 0.27093543124287117, + "kl": 0.1378173828125, + "learning_rate": 4.996863859922767e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 6138 + }, + { + "completion_length": 231.71876049041748, + "epoch": 1.0295066851083448, + "grad_norm": 0.6042941302054783, + "kl": 0.576324462890625, + "learning_rate": 4.996857850373663e-07, + "loss": 0.0006, + "reward": 1.7147322073578835, + "reward_std": 0.04987628059461713, + "rewards/equation_reward_func": 0.7205357514321804, + "rewards/format_reward_func": 0.9941964335739613, + "step": 6140 + }, + { + "completion_length": 229.93751049041748, + "epoch": 1.0298419883482124, + "grad_norm": 0.40842733005443116, + "kl": 0.823272705078125, + "learning_rate": 4.996851835075867e-07, + "loss": 0.0008, + "reward": 1.6964286342263222, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7142857573926449, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6142 + }, + { + "completion_length": 221.98215198516846, + "epoch": 1.03017729158808, + "grad_norm": 0.24154587050830953, + "kl": 0.40399169921875, + "learning_rate": 4.996845814029392e-07, + "loss": 0.0004, + "reward": 1.7535714879631996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 6144 + }, + { + "completion_length": 218.56697750091553, + "epoch": 1.0305125948279474, + "grad_norm": 0.2808529909155721, + "kl": 1.314208984375, + "learning_rate": 4.996839787234252e-07, + "loss": 0.0013, + "reward": 1.7839286476373672, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6146 + }, + { + "completion_length": 231.55804824829102, + "epoch": 1.030847898067815, + "grad_norm": 0.20719697823983504, + "kl": 0.54595947265625, + "learning_rate": 4.996833754690461e-07, + "loss": 0.0005, + "reward": 1.7214286625385284, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7303571794182062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6148 + }, + { + "completion_length": 228.35715293884277, + "epoch": 1.0311832013076827, + "grad_norm": 0.32793316359403357, + "kl": 0.9630889892578125, + "learning_rate": 4.996827716398033e-07, + "loss": 0.001, + "reward": 1.7482143566012383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6150 + }, + { + "completion_length": 223.99554538726807, + "epoch": 1.0315185045475501, + "grad_norm": 0.13363658578744522, + "kl": 0.293060302734375, + "learning_rate": 4.996821672356982e-07, + "loss": 0.0003, + "reward": 1.7303571999073029, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7348214760422707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6152 + }, + { + "completion_length": 219.50000858306885, + "epoch": 1.0318538077874178, + "grad_norm": 0.233703696734875, + "kl": 0.246307373046875, + "learning_rate": 4.996815622567322e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 6154 + }, + { + "completion_length": 222.09822368621826, + "epoch": 1.0321891110272854, + "grad_norm": 0.16851135813365517, + "kl": 1.2260894775390625, + "learning_rate": 4.996809567029067e-07, + "loss": 0.0012, + "reward": 1.766071505844593, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6156 + }, + { + "completion_length": 222.87947463989258, + "epoch": 1.0325244142671528, + "grad_norm": 0.3325081112548761, + "kl": 0.226348876953125, + "learning_rate": 4.996803505742229e-07, + "loss": 0.0002, + "reward": 1.7589286491274834, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633929029107094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6158 + }, + { + "completion_length": 215.20090293884277, + "epoch": 1.0328597175070204, + "grad_norm": 0.3898566757276845, + "kl": 0.326934814453125, + "learning_rate": 4.996797438706825e-07, + "loss": 0.0003, + "reward": 1.742857202887535, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7517857439815998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6160 + }, + { + "completion_length": 216.13840293884277, + "epoch": 1.033195020746888, + "grad_norm": 0.29160838237888526, + "kl": 0.46624755859375, + "learning_rate": 4.996791365922867e-07, + "loss": 0.0005, + "reward": 1.6821429431438446, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6821428947150707, + "rewards/format_reward_func": 1.0, + "step": 6162 + }, + { + "completion_length": 207.17411708831787, + "epoch": 1.0335303239867555, + "grad_norm": 0.29139948351310624, + "kl": 0.110260009765625, + "learning_rate": 4.99678528739037e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714715719223, + "rewards/format_reward_func": 1.0, + "step": 6164 + }, + { + "completion_length": 207.4062614440918, + "epoch": 1.0338656272266231, + "grad_norm": 0.4263314352373076, + "kl": 0.1416778564453125, + "learning_rate": 4.996779203109347e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 6166 + }, + { + "completion_length": 208.258939743042, + "epoch": 1.0342009304664905, + "grad_norm": 0.20562322052764345, + "kl": 0.4632568359375, + "learning_rate": 4.996773113079813e-07, + "loss": 0.0005, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 6168 + }, + { + "completion_length": 214.71875953674316, + "epoch": 1.0345362337063582, + "grad_norm": 0.4857726664048818, + "kl": 1.15155029296875, + "learning_rate": 4.996767017301781e-07, + "loss": 0.0012, + "reward": 1.7892857566475868, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7982143051922321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6170 + }, + { + "completion_length": 208.66965198516846, + "epoch": 1.0348715369462258, + "grad_norm": 0.8186370164924367, + "kl": 0.657196044921875, + "learning_rate": 4.996760915775266e-07, + "loss": 0.0007, + "reward": 1.7321429327130318, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.732142873108387, + "rewards/format_reward_func": 1.0, + "step": 6172 + }, + { + "completion_length": 201.98661613464355, + "epoch": 1.0352068401860932, + "grad_norm": 0.09891019549043235, + "kl": 0.2893218994140625, + "learning_rate": 4.996754808500282e-07, + "loss": 0.0003, + "reward": 1.7357143387198448, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143331319094, + "rewards/format_reward_func": 1.0, + "step": 6174 + }, + { + "completion_length": 206.6696538925171, + "epoch": 1.0355421434259608, + "grad_norm": 0.3504160565048967, + "kl": 0.156005859375, + "learning_rate": 4.996748695476843e-07, + "loss": 0.0002, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 6176 + }, + { + "completion_length": 211.74554443359375, + "epoch": 1.0358774466658285, + "grad_norm": 0.280817911357908, + "kl": 0.1046600341796875, + "learning_rate": 4.996742576704961e-07, + "loss": 0.0001, + "reward": 1.7964286506175995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 6178 + }, + { + "completion_length": 216.66072273254395, + "epoch": 1.036212749905696, + "grad_norm": 0.15394694928327, + "kl": 0.117523193359375, + "learning_rate": 4.996736452184653e-07, + "loss": 0.0001, + "reward": 1.7178572118282318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571820259094, + "rewards/format_reward_func": 1.0, + "step": 6180 + }, + { + "completion_length": 215.63393783569336, + "epoch": 1.0365480531455635, + "grad_norm": 0.1798703112423341, + "kl": 0.0845489501953125, + "learning_rate": 4.996730321915932e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143145054579, + "rewards/format_reward_func": 1.0, + "step": 6182 + }, + { + "completion_length": 211.61161518096924, + "epoch": 1.0368833563854312, + "grad_norm": 0.26618095980539785, + "kl": 0.10028076171875, + "learning_rate": 4.996724185898812e-07, + "loss": 0.0001, + "reward": 1.8160714879631996, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.820535734295845, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6184 + }, + { + "completion_length": 218.41072368621826, + "epoch": 1.0372186596252986, + "grad_norm": 0.23902131457280135, + "kl": 0.090423583984375, + "learning_rate": 4.996718044133306e-07, + "loss": 0.0001, + "reward": 1.8017857447266579, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8062500357627869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6186 + }, + { + "completion_length": 216.80357933044434, + "epoch": 1.0375539628651662, + "grad_norm": 0.6333183597495273, + "kl": 0.0997314453125, + "learning_rate": 4.996711896619432e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 6188 + }, + { + "completion_length": 218.7901906967163, + "epoch": 1.0378892661050338, + "grad_norm": 0.24219539600599282, + "kl": 0.0910186767578125, + "learning_rate": 4.9967057433572e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 6190 + }, + { + "completion_length": 225.9419765472412, + "epoch": 1.0382245693449013, + "grad_norm": 0.38545151734950256, + "kl": 0.108154296875, + "learning_rate": 4.996699584346625e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6192 + }, + { + "completion_length": 221.77679634094238, + "epoch": 1.038559872584769, + "grad_norm": 0.004252728264381877, + "kl": 0.094970703125, + "learning_rate": 4.996693419587723e-07, + "loss": 0.0001, + "reward": 1.7178572118282318, + "reward_std": 0.005050762556493282, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 6194 + }, + { + "completion_length": 222.7991180419922, + "epoch": 1.0388951758246363, + "grad_norm": 0.20910697584743013, + "kl": 0.100341796875, + "learning_rate": 4.996687249080507e-07, + "loss": 0.0001, + "reward": 1.8464286178350449, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8464286029338837, + "rewards/format_reward_func": 1.0, + "step": 6196 + }, + { + "completion_length": 236.7991180419922, + "epoch": 1.039230479064504, + "grad_norm": 0.4548612314796721, + "kl": 0.1113128662109375, + "learning_rate": 4.99668107282499e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 6198 + }, + { + "completion_length": 221.1875114440918, + "epoch": 1.0395657823043716, + "grad_norm": 0.21093026878166982, + "kl": 0.09210205078125, + "learning_rate": 4.996674890821189e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000279396772, + "rewards/format_reward_func": 1.0, + "step": 6200 + }, + { + "completion_length": 222.7991180419922, + "epoch": 1.039901085544239, + "grad_norm": 0.04538039287639752, + "kl": 0.0979766845703125, + "learning_rate": 4.996668703069115e-07, + "loss": 0.0001, + "reward": 1.7464286088943481, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7553571797907352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6202 + }, + { + "completion_length": 226.7991180419922, + "epoch": 1.0402363887841066, + "grad_norm": 0.2985622998817586, + "kl": 0.099334716796875, + "learning_rate": 4.996662509568785e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 6204 + }, + { + "completion_length": 224.3705472946167, + "epoch": 1.0405716920239743, + "grad_norm": 0.3199587341752373, + "kl": 0.118927001953125, + "learning_rate": 4.996656310320213e-07, + "loss": 0.0001, + "reward": 1.7321429550647736, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428768336773, + "rewards/format_reward_func": 1.0, + "step": 6206 + }, + { + "completion_length": 240.6160831451416, + "epoch": 1.0409069952638417, + "grad_norm": 0.1554368722368833, + "kl": 0.094329833984375, + "learning_rate": 4.996650105323411e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 6208 + }, + { + "completion_length": 233.73661518096924, + "epoch": 1.0412422985037093, + "grad_norm": 0.2616894712735538, + "kl": 0.0997772216796875, + "learning_rate": 4.996643894578396e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 6210 + }, + { + "completion_length": 233.00893783569336, + "epoch": 1.041577601743577, + "grad_norm": 0.32797188169474895, + "kl": 0.0866241455078125, + "learning_rate": 4.996637678085181e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000461935997, + "rewards/format_reward_func": 1.0, + "step": 6212 + }, + { + "completion_length": 213.83929538726807, + "epoch": 1.0419129049834444, + "grad_norm": 0.27628963395743616, + "kl": 0.085784912109375, + "learning_rate": 4.99663145584378e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 6214 + }, + { + "completion_length": 221.89733123779297, + "epoch": 1.042248208223312, + "grad_norm": 0.08516641647525623, + "kl": 0.090362548828125, + "learning_rate": 4.996625227854207e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 6216 + }, + { + "completion_length": 232.78126049041748, + "epoch": 1.0425835114631794, + "grad_norm": 0.2787046385004818, + "kl": 0.0954742431640625, + "learning_rate": 4.996618994116478e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500346451998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6218 + }, + { + "completion_length": 230.34376049041748, + "epoch": 1.042918814703047, + "grad_norm": 0.1534465803849126, + "kl": 0.095611572265625, + "learning_rate": 4.996612754630605e-07, + "loss": 0.0001, + "reward": 1.7125000730156898, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7169643342494965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6220 + }, + { + "completion_length": 241.18751049041748, + "epoch": 1.0432541179429147, + "grad_norm": 0.2836709467862103, + "kl": 0.087158203125, + "learning_rate": 4.996606509396605e-07, + "loss": 0.0001, + "reward": 1.7107143923640251, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7107143141329288, + "rewards/format_reward_func": 1.0, + "step": 6222 + }, + { + "completion_length": 237.9196548461914, + "epoch": 1.043589421182782, + "grad_norm": 0.5012585855206936, + "kl": 0.09515380859375, + "learning_rate": 4.996600258414491e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 6224 + }, + { + "completion_length": 234.18751049041748, + "epoch": 1.0439247244226497, + "grad_norm": 0.2459476407984449, + "kl": 0.0809173583984375, + "learning_rate": 4.996594001684278e-07, + "loss": 0.0001, + "reward": 1.7714286595582962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 6226 + }, + { + "completion_length": 238.39733028411865, + "epoch": 1.0442600276625174, + "grad_norm": 0.14081129195328745, + "kl": 0.07952880859375, + "learning_rate": 4.996587739205979e-07, + "loss": 0.0001, + "reward": 1.751785770058632, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500182539225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6228 + }, + { + "completion_length": 237.11608219146729, + "epoch": 1.0445953309023848, + "grad_norm": 0.1969990280030385, + "kl": 0.078887939453125, + "learning_rate": 4.996581470979609e-07, + "loss": 0.0001, + "reward": 1.6928572207689285, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.6928571742027998, + "rewards/format_reward_func": 1.0, + "step": 6230 + }, + { + "completion_length": 235.27679824829102, + "epoch": 1.0449306341422524, + "grad_norm": 0.10241176001437838, + "kl": 0.077056884765625, + "learning_rate": 4.996575197005184e-07, + "loss": 0.0001, + "reward": 1.8053571805357933, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8098214603960514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6232 + }, + { + "completion_length": 233.43304634094238, + "epoch": 1.04526593738212, + "grad_norm": 0.43947870911254105, + "kl": 0.0736846923828125, + "learning_rate": 4.996568917282718e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 6234 + }, + { + "completion_length": 244.62947750091553, + "epoch": 1.0456012406219874, + "grad_norm": 0.22402884796565656, + "kl": 0.0749969482421875, + "learning_rate": 4.996562631812222e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571857511997, + "rewards/format_reward_func": 1.0, + "step": 6236 + }, + { + "completion_length": 244.3259038925171, + "epoch": 1.045936543861855, + "grad_norm": 0.275003424803172, + "kl": 0.0874786376953125, + "learning_rate": 4.996556340593715e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 6238 + }, + { + "completion_length": 235.7321538925171, + "epoch": 1.0462718471017227, + "grad_norm": 0.22154834415764207, + "kl": 0.0706024169921875, + "learning_rate": 4.99655004362721e-07, + "loss": 0.0001, + "reward": 1.703571505844593, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.703571455553174, + "rewards/format_reward_func": 1.0, + "step": 6240 + }, + { + "completion_length": 236.3437623977661, + "epoch": 1.0466071503415901, + "grad_norm": 0.39231322722768996, + "kl": 0.07466888427734375, + "learning_rate": 4.99654374091272e-07, + "loss": 0.0001, + "reward": 1.7522322162985802, + "reward_std": 0.0473508988507092, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 0.9986607171595097, + "step": 6242 + }, + { + "completion_length": 235.2857265472412, + "epoch": 1.0469424535814578, + "grad_norm": 0.23871235936534596, + "kl": 0.067901611328125, + "learning_rate": 4.996537432450261e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 6244 + }, + { + "completion_length": 245.44197463989258, + "epoch": 1.0472777568213252, + "grad_norm": 0.13381440765275113, + "kl": 0.06072998046875, + "learning_rate": 4.996531118239848e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 6246 + }, + { + "completion_length": 249.54911518096924, + "epoch": 1.0476130600611928, + "grad_norm": 0.17442154972673185, + "kl": 0.071014404296875, + "learning_rate": 4.996524798281494e-07, + "loss": 0.0001, + "reward": 1.6750000640749931, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6750000417232513, + "rewards/format_reward_func": 1.0, + "step": 6248 + }, + { + "completion_length": 236.6116180419922, + "epoch": 1.0479483633010604, + "grad_norm": 0.1054946432492003, + "kl": 0.06951904296875, + "learning_rate": 4.996518472575214e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 6250 + }, + { + "completion_length": 242.00001049041748, + "epoch": 1.0482836665409279, + "grad_norm": 0.21478339945524438, + "kl": 0.0652008056640625, + "learning_rate": 4.996512141121023e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 6252 + }, + { + "completion_length": 236.27679538726807, + "epoch": 1.0486189697807955, + "grad_norm": 0.2525516533339607, + "kl": 0.079071044921875, + "learning_rate": 4.996505803918936e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 6254 + }, + { + "completion_length": 242.00894260406494, + "epoch": 1.0489542730206631, + "grad_norm": 0.19916546296729642, + "kl": 0.071990966796875, + "learning_rate": 4.996499460968966e-07, + "loss": 0.0001, + "reward": 1.816071480512619, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8205357417464256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6256 + }, + { + "completion_length": 247.7187614440918, + "epoch": 1.0492895762605305, + "grad_norm": 0.5188461088710983, + "kl": 0.518463134765625, + "learning_rate": 4.996493112271129e-07, + "loss": 0.0005, + "reward": 1.751785784959793, + "reward_std": 0.10859139822423458, + "rewards/equation_reward_func": 0.765178594738245, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6258 + }, + { + "completion_length": 250.15626049041748, + "epoch": 1.0496248795003982, + "grad_norm": 0.2216511665362915, + "kl": 0.06902313232421875, + "learning_rate": 4.996486757825439e-07, + "loss": 0.0001, + "reward": 1.7160715013742447, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7205357663333416, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6260 + }, + { + "completion_length": 255.28572463989258, + "epoch": 1.0499601827402658, + "grad_norm": 0.3203314600754042, + "kl": 0.109161376953125, + "learning_rate": 4.996480397631911e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7303571775555611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6262 + }, + { + "completion_length": 242.7857265472412, + "epoch": 1.0502954859801332, + "grad_norm": 0.302882520820471, + "kl": 0.0699310302734375, + "learning_rate": 4.996474031690561e-07, + "loss": 0.0001, + "reward": 1.8053572103381157, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214529454708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6264 + }, + { + "completion_length": 254.93304538726807, + "epoch": 1.0506307892200009, + "grad_norm": 0.24384354682916257, + "kl": 0.0830535888671875, + "learning_rate": 4.996467660001399e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6266 + }, + { + "completion_length": 249.7187623977661, + "epoch": 1.0509660924598685, + "grad_norm": 0.3250066916248128, + "kl": 0.0789337158203125, + "learning_rate": 4.996461282564446e-07, + "loss": 0.0001, + "reward": 1.708928644657135, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7223214507102966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6268 + }, + { + "completion_length": 246.92411994934082, + "epoch": 1.051301395699736, + "grad_norm": 0.2190682285191929, + "kl": 0.084503173828125, + "learning_rate": 4.996454899379711e-07, + "loss": 0.0001, + "reward": 1.7339286506175995, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7383928820490837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6270 + }, + { + "completion_length": 259.10715198516846, + "epoch": 1.0516366989396035, + "grad_norm": 0.27884125227950174, + "kl": 0.0814361572265625, + "learning_rate": 4.996448510447213e-07, + "loss": 0.0001, + "reward": 1.705357238650322, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.709821455180645, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6272 + }, + { + "completion_length": 242.6116180419922, + "epoch": 1.051972002179471, + "grad_norm": 0.26117811635112054, + "kl": 0.0708770751953125, + "learning_rate": 4.996442115766965e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7946428880095482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6274 + }, + { + "completion_length": 253.24555110931396, + "epoch": 1.0523073054193386, + "grad_norm": 0.20232160553432557, + "kl": 0.0785369873046875, + "learning_rate": 4.996435715338981e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7937500290572643, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6276 + }, + { + "completion_length": 250.5491180419922, + "epoch": 1.0526426086592062, + "grad_norm": 0.28832289892932167, + "kl": 0.06787109375, + "learning_rate": 4.996429309163276e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7660714611411095, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6278 + }, + { + "completion_length": 247.56697940826416, + "epoch": 1.0529779118990736, + "grad_norm": 0.23480186692840185, + "kl": 0.0709991455078125, + "learning_rate": 4.996422897239866e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.10606601741164923, + "rewards/equation_reward_func": 0.7589286006987095, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6280 + }, + { + "completion_length": 257.071439743042, + "epoch": 1.0533132151389413, + "grad_norm": 0.3109790026339082, + "kl": 0.0759124755859375, + "learning_rate": 4.996416479568763e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7517857290804386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6282 + }, + { + "completion_length": 243.58483219146729, + "epoch": 1.053648518378809, + "grad_norm": 0.21916995248392268, + "kl": 0.06939697265625, + "learning_rate": 4.996410056149986e-07, + "loss": 0.0001, + "reward": 1.7589286640286446, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6284 + }, + { + "completion_length": 251.3303680419922, + "epoch": 1.0539838216186763, + "grad_norm": 0.1085527793568352, + "kl": 0.07000732421875, + "learning_rate": 4.996403626983547e-07, + "loss": 0.0001, + "reward": 1.7107143476605415, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7196428924798965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6286 + }, + { + "completion_length": 257.58483695983887, + "epoch": 1.054319124858544, + "grad_norm": 0.27881947506588706, + "kl": 0.0749053955078125, + "learning_rate": 4.996397192069461e-07, + "loss": 0.0001, + "reward": 1.6857143566012383, + "reward_std": 0.11111677996814251, + "rewards/equation_reward_func": 0.7125000320374966, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6288 + }, + { + "completion_length": 251.14733219146729, + "epoch": 1.0546544280984116, + "grad_norm": 0.27651003259180323, + "kl": 0.0708160400390625, + "learning_rate": 4.996390751407741e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 6290 + }, + { + "completion_length": 243.77679538726807, + "epoch": 1.054989731338279, + "grad_norm": 0.21381289177800553, + "kl": 0.0681915283203125, + "learning_rate": 4.996384304998406e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7848214525729418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6292 + }, + { + "completion_length": 246.5089406967163, + "epoch": 1.0553250345781466, + "grad_norm": 0.22910722937156328, + "kl": 0.0751953125, + "learning_rate": 4.99637785284147e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7839286141097546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6294 + }, + { + "completion_length": 259.4241180419922, + "epoch": 1.0556603378180143, + "grad_norm": 0.7797186367045612, + "kl": 0.1149749755859375, + "learning_rate": 4.996371394936944e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6296 + }, + { + "completion_length": 250.4285831451416, + "epoch": 1.0559956410578817, + "grad_norm": 1.2601252607687894, + "kl": 0.0680389404296875, + "learning_rate": 4.996364931284847e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776785995811224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6298 + }, + { + "completion_length": 238.5803680419922, + "epoch": 1.0563309442977493, + "grad_norm": 0.2717918614938203, + "kl": 0.078369140625, + "learning_rate": 4.996358461885192e-07, + "loss": 0.0001, + "reward": 1.7910714969038963, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6300 + }, + { + "completion_length": 242.43304634094238, + "epoch": 1.0566662475376167, + "grad_norm": 0.09746477369521411, + "kl": 0.079925537109375, + "learning_rate": 4.996351986737993e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6302 + }, + { + "completion_length": 245.0178680419922, + "epoch": 1.0570015507774844, + "grad_norm": 0.13468086713952232, + "kl": 0.076995849609375, + "learning_rate": 4.996345505843268e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6304 + }, + { + "completion_length": 250.1875114440918, + "epoch": 1.057336854017352, + "grad_norm": 0.18329931190892818, + "kl": 0.0720367431640625, + "learning_rate": 4.996339019201029e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6306 + }, + { + "completion_length": 238.9419765472412, + "epoch": 1.0576721572572194, + "grad_norm": 0.20137418059888398, + "kl": 0.0643157958984375, + "learning_rate": 4.996332526811292e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776786163449287, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6308 + }, + { + "completion_length": 246.35715103149414, + "epoch": 1.058007460497087, + "grad_norm": 0.16433502622545273, + "kl": 0.0754547119140625, + "learning_rate": 4.996326028674073e-07, + "loss": 0.0001, + "reward": 1.716071493923664, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7205357421189547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6310 + }, + { + "completion_length": 247.68751335144043, + "epoch": 1.0583427637369547, + "grad_norm": 0.19212147064966212, + "kl": 0.0781707763671875, + "learning_rate": 4.996319524789385e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6312 + }, + { + "completion_length": 242.81697273254395, + "epoch": 1.058678066976822, + "grad_norm": 0.09429431247365397, + "kl": 0.0767822265625, + "learning_rate": 4.996313015157244e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 6314 + }, + { + "completion_length": 245.39733123779297, + "epoch": 1.0590133702166897, + "grad_norm": 0.18421668642270214, + "kl": 0.09881591796875, + "learning_rate": 4.996306499777665e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7758928686380386, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6316 + }, + { + "completion_length": 245.1116189956665, + "epoch": 1.0593486734565574, + "grad_norm": 0.3807004269832346, + "kl": 0.0757904052734375, + "learning_rate": 4.996299978650662e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 6318 + }, + { + "completion_length": 251.22322463989258, + "epoch": 1.0596839766964248, + "grad_norm": 0.2114666808769925, + "kl": 0.078765869140625, + "learning_rate": 4.996293451776252e-07, + "loss": 0.0001, + "reward": 1.7500000298023224, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7678571939468384, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6320 + }, + { + "completion_length": 246.852689743042, + "epoch": 1.0600192799362924, + "grad_norm": 0.16649081805675256, + "kl": 0.0686492919921875, + "learning_rate": 4.996286919154449e-07, + "loss": 0.0001, + "reward": 1.7625000700354576, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6322 + }, + { + "completion_length": 247.61608123779297, + "epoch": 1.06035458317616, + "grad_norm": 0.30420478683403074, + "kl": 0.0810699462890625, + "learning_rate": 4.996280380785267e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6324 + }, + { + "completion_length": 251.58483219146729, + "epoch": 1.0606898864160275, + "grad_norm": 0.23160704711198593, + "kl": 0.0971832275390625, + "learning_rate": 4.996273836668723e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7660714648663998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6326 + }, + { + "completion_length": 244.9642972946167, + "epoch": 1.061025189655895, + "grad_norm": 0.3936892033816459, + "kl": 0.0736236572265625, + "learning_rate": 4.996267286804831e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7491071652621031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6328 + }, + { + "completion_length": 241.1116180419922, + "epoch": 1.0613604928957625, + "grad_norm": 0.1383559369834235, + "kl": 0.0700225830078125, + "learning_rate": 4.996260731193606e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 6330 + }, + { + "completion_length": 245.1473331451416, + "epoch": 1.0616957961356301, + "grad_norm": 0.3744539358250854, + "kl": 0.0670928955078125, + "learning_rate": 4.996254169835063e-07, + "loss": 0.0001, + "reward": 1.8160715028643608, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.8205357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6332 + }, + { + "completion_length": 247.540189743042, + "epoch": 1.0620310993754978, + "grad_norm": 0.4886645560494357, + "kl": 0.0726165771484375, + "learning_rate": 4.996247602729216e-07, + "loss": 0.0001, + "reward": 1.7267857640981674, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6334 + }, + { + "completion_length": 251.43304538726807, + "epoch": 1.0623664026153652, + "grad_norm": 0.1689562542398881, + "kl": 0.0782623291015625, + "learning_rate": 4.996241029876084e-07, + "loss": 0.0001, + "reward": 1.7910714820027351, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6336 + }, + { + "completion_length": 263.352689743042, + "epoch": 1.0627017058552328, + "grad_norm": 0.26788844139713214, + "kl": 0.13543701171875, + "learning_rate": 4.996234451275678e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6338 + }, + { + "completion_length": 247.23661994934082, + "epoch": 1.0630370090951005, + "grad_norm": 0.19132039665083433, + "kl": 0.1452789306640625, + "learning_rate": 4.996227866928016e-07, + "loss": 0.0001, + "reward": 1.7285715267062187, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7375000230967999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6340 + }, + { + "completion_length": 258.4419746398926, + "epoch": 1.0633723123349679, + "grad_norm": 0.19874075640970054, + "kl": 0.1968841552734375, + "learning_rate": 4.996221276833111e-07, + "loss": 0.0002, + "reward": 1.7607143446803093, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7696428820490837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6342 + }, + { + "completion_length": 251.9464406967163, + "epoch": 1.0637076155748355, + "grad_norm": 0.23396514743791882, + "kl": 0.1824951171875, + "learning_rate": 4.996214680990979e-07, + "loss": 0.0002, + "reward": 1.7767857983708382, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500111758709, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6344 + }, + { + "completion_length": 238.31251049041748, + "epoch": 1.0640429188147031, + "grad_norm": 0.43415076467434777, + "kl": 0.073333740234375, + "learning_rate": 4.996208079401635e-07, + "loss": 0.0001, + "reward": 1.7839286550879478, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6346 + }, + { + "completion_length": 245.0089406967163, + "epoch": 1.0643782220545706, + "grad_norm": 0.4964206120947802, + "kl": 0.758087158203125, + "learning_rate": 4.996201472065096e-07, + "loss": 0.0008, + "reward": 1.7321429401636124, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7410714626312256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6348 + }, + { + "completion_length": 258.4107275009155, + "epoch": 1.0647135252944382, + "grad_norm": 0.2686395974838121, + "kl": 0.7075042724609375, + "learning_rate": 4.996194858981374e-07, + "loss": 0.0007, + "reward": 1.7535714954137802, + "reward_std": 0.0656599160283804, + "rewards/equation_reward_func": 0.7625000365078449, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6350 + }, + { + "completion_length": 241.78572463989258, + "epoch": 1.0650488285343056, + "grad_norm": 0.20670819187440403, + "kl": 0.2355194091796875, + "learning_rate": 4.996188240150486e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 6352 + }, + { + "completion_length": 258.058048248291, + "epoch": 1.0653841317741732, + "grad_norm": 0.24306248609979544, + "kl": 0.6288604736328125, + "learning_rate": 4.996181615572449e-07, + "loss": 0.0006, + "reward": 1.7375000938773155, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6354 + }, + { + "completion_length": 239.61608028411865, + "epoch": 1.0657194350140409, + "grad_norm": 0.28623101171516113, + "kl": 0.140655517578125, + "learning_rate": 4.996174985247275e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6356 + }, + { + "completion_length": 244.5759048461914, + "epoch": 1.0660547382539083, + "grad_norm": 0.18153326342052306, + "kl": 0.0954132080078125, + "learning_rate": 4.996168349174981e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7241071779280901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6358 + }, + { + "completion_length": 247.27679634094238, + "epoch": 1.066390041493776, + "grad_norm": 0.35221345372591045, + "kl": 0.691741943359375, + "learning_rate": 4.996161707355582e-07, + "loss": 0.0007, + "reward": 1.7500000670552254, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7589286081492901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6360 + }, + { + "completion_length": 255.2366189956665, + "epoch": 1.0667253447336436, + "grad_norm": 0.5279583818578518, + "kl": 0.78302001953125, + "learning_rate": 4.996155059789093e-07, + "loss": 0.0008, + "reward": 1.6785714775323868, + "reward_std": 0.11111677903681993, + "rewards/equation_reward_func": 0.7053571660071611, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6362 + }, + { + "completion_length": 256.92858123779297, + "epoch": 1.067060647973511, + "grad_norm": 0.2476382497789468, + "kl": 0.3397369384765625, + "learning_rate": 4.996148406475529e-07, + "loss": 0.0003, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 6364 + }, + { + "completion_length": 251.4017972946167, + "epoch": 1.0673959512133786, + "grad_norm": 0.22177943064580702, + "kl": 1.80938720703125, + "learning_rate": 4.996141747414906e-07, + "loss": 0.0018, + "reward": 1.7928571924567223, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.8017857410013676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6366 + }, + { + "completion_length": 249.25447750091553, + "epoch": 1.0677312544532462, + "grad_norm": 0.2940764790633873, + "kl": 0.3301849365234375, + "learning_rate": 4.99613508260724e-07, + "loss": 0.0003, + "reward": 1.7321429029107094, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7410714533179998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6368 + }, + { + "completion_length": 262.54465770721436, + "epoch": 1.0680665576931136, + "grad_norm": 0.36158647929313403, + "kl": 0.313018798828125, + "learning_rate": 4.996128412052546e-07, + "loss": 0.0003, + "reward": 1.7517857626080513, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7651786096394062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6370 + }, + { + "completion_length": 265.00894260406494, + "epoch": 1.0684018609329813, + "grad_norm": 0.2477564882494541, + "kl": 0.3001556396484375, + "learning_rate": 4.996121735750838e-07, + "loss": 0.0003, + "reward": 1.7071429267525673, + "reward_std": 0.1010152529925108, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6372 + }, + { + "completion_length": 253.47322463989258, + "epoch": 1.068737164172849, + "grad_norm": 0.21786370922386733, + "kl": 0.0959014892578125, + "learning_rate": 4.996115053702132e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643197208643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6374 + }, + { + "completion_length": 252.44644165039062, + "epoch": 1.0690724674127163, + "grad_norm": 0.16300399710153712, + "kl": 0.1212158203125, + "learning_rate": 4.996108365906444e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6376 + }, + { + "completion_length": 255.8482255935669, + "epoch": 1.069407770652584, + "grad_norm": 0.4202332630677491, + "kl": 0.266632080078125, + "learning_rate": 4.996101672363789e-07, + "loss": 0.0003, + "reward": 1.7464286461472511, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 1.0, + "step": 6378 + }, + { + "completion_length": 250.80358219146729, + "epoch": 1.0697430738924516, + "grad_norm": 0.21634007380193282, + "kl": 0.1393280029296875, + "learning_rate": 4.996094973074182e-07, + "loss": 0.0001, + "reward": 1.7910715118050575, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6380 + }, + { + "completion_length": 240.21429634094238, + "epoch": 1.070078377132319, + "grad_norm": 0.2353217723544025, + "kl": 0.1108551025390625, + "learning_rate": 4.996088268037641e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6382 + }, + { + "completion_length": 254.6250114440918, + "epoch": 1.0704136803721866, + "grad_norm": 0.22977953063269516, + "kl": 0.0945281982421875, + "learning_rate": 4.996081557254177e-07, + "loss": 0.0001, + "reward": 1.7696429193019867, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071600466967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6384 + }, + { + "completion_length": 244.19643592834473, + "epoch": 1.070748983612054, + "grad_norm": 0.22856891401563256, + "kl": 0.212188720703125, + "learning_rate": 4.996074840723809e-07, + "loss": 0.0002, + "reward": 1.7642858251929283, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 6386 + }, + { + "completion_length": 236.9062623977661, + "epoch": 1.0710842868519217, + "grad_norm": 0.20213076895736504, + "kl": 0.105621337890625, + "learning_rate": 4.99606811844655e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714380443096, + "rewards/format_reward_func": 1.0, + "step": 6388 + }, + { + "completion_length": 248.64733409881592, + "epoch": 1.0714195900917893, + "grad_norm": 0.16060784507836098, + "kl": 0.07293701171875, + "learning_rate": 4.996061390422418e-07, + "loss": 0.0001, + "reward": 1.8035714998841286, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8125000335276127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6390 + }, + { + "completion_length": 237.47322273254395, + "epoch": 1.0717548933316567, + "grad_norm": 0.3744916426993643, + "kl": 0.1375274658203125, + "learning_rate": 4.996054656651427e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7732143066823483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6392 + }, + { + "completion_length": 240.9419765472412, + "epoch": 1.0720901965715244, + "grad_norm": 0.5524514166335937, + "kl": 0.1202850341796875, + "learning_rate": 4.996047917133591e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7616071775555611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6394 + }, + { + "completion_length": 247.79465293884277, + "epoch": 1.072425499811392, + "grad_norm": 0.1931412468701336, + "kl": 0.1047821044921875, + "learning_rate": 4.996041171868929e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 6396 + }, + { + "completion_length": 231.8973331451416, + "epoch": 1.0727608030512594, + "grad_norm": 0.16039896848106713, + "kl": 0.07208251953125, + "learning_rate": 4.996034420857454e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6398 + }, + { + "completion_length": 247.7634038925171, + "epoch": 1.073096106291127, + "grad_norm": 0.40971345021003636, + "kl": 0.078338623046875, + "learning_rate": 4.996027664099183e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 6400 + }, + { + "completion_length": 248.1741189956665, + "epoch": 1.0734314095309947, + "grad_norm": 0.1190091501059919, + "kl": 0.104644775390625, + "learning_rate": 4.996020901594131e-07, + "loss": 0.0001, + "reward": 1.748214341700077, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6402 + }, + { + "completion_length": 244.5491180419922, + "epoch": 1.073766712770862, + "grad_norm": 0.20764049742946922, + "kl": 0.074798583984375, + "learning_rate": 4.996014133342314e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285798370838, + "rewards/format_reward_func": 1.0, + "step": 6404 + }, + { + "completion_length": 246.4017972946167, + "epoch": 1.0741020160107297, + "grad_norm": 0.21780243516392667, + "kl": 0.067718505859375, + "learning_rate": 4.996007359343745e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 6406 + }, + { + "completion_length": 246.27679920196533, + "epoch": 1.0744373192505972, + "grad_norm": 0.48129432871493416, + "kl": 0.08587646484375, + "learning_rate": 4.996000579598442e-07, + "loss": 0.0001, + "reward": 1.7071429565548897, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7160714659839869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6408 + }, + { + "completion_length": 245.6562614440918, + "epoch": 1.0747726224904648, + "grad_norm": 0.24065879335741383, + "kl": 0.0702972412109375, + "learning_rate": 4.995993794106421e-07, + "loss": 0.0001, + "reward": 1.8000000715255737, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 6410 + }, + { + "completion_length": 247.36608123779297, + "epoch": 1.0751079257303324, + "grad_norm": 0.26497871394053646, + "kl": 0.0687713623046875, + "learning_rate": 4.995987002867698e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6412 + }, + { + "completion_length": 248.27679824829102, + "epoch": 1.0754432289701998, + "grad_norm": 0.41136003473797084, + "kl": 0.0745391845703125, + "learning_rate": 4.995980205882286e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 6414 + }, + { + "completion_length": 247.2678689956665, + "epoch": 1.0757785322100675, + "grad_norm": 0.21398005997336908, + "kl": 0.064544677734375, + "learning_rate": 4.995973403150202e-07, + "loss": 0.0001, + "reward": 1.719642959535122, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6416 + }, + { + "completion_length": 234.8259038925171, + "epoch": 1.076113835449935, + "grad_norm": 0.25680062865685876, + "kl": 0.0615081787109375, + "learning_rate": 4.995966594671462e-07, + "loss": 0.0001, + "reward": 1.8071429207921028, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 6418 + }, + { + "completion_length": 239.17858409881592, + "epoch": 1.0764491386898025, + "grad_norm": 0.3022971347656876, + "kl": 0.05865478515625, + "learning_rate": 4.995959780446081e-07, + "loss": 0.0001, + "reward": 1.7687500417232513, + "reward_std": 0.04419417306780815, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9982142895460129, + "step": 6420 + }, + { + "completion_length": 240.0357255935669, + "epoch": 1.0767844419296702, + "grad_norm": 0.3100097102368012, + "kl": 0.065399169921875, + "learning_rate": 4.995952960474076e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 6422 + }, + { + "completion_length": 250.1116189956665, + "epoch": 1.0771197451695378, + "grad_norm": 0.12420128973486144, + "kl": 0.0625457763671875, + "learning_rate": 4.995946134755462e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 6424 + }, + { + "completion_length": 252.18751335144043, + "epoch": 1.0774550484094052, + "grad_norm": 0.16978968685993892, + "kl": 0.067596435546875, + "learning_rate": 4.995939303290253e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 6426 + }, + { + "completion_length": 254.5044755935669, + "epoch": 1.0777903516492728, + "grad_norm": 0.33574965100851467, + "kl": 0.064117431640625, + "learning_rate": 4.995932466078468e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7669643275439739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6428 + }, + { + "completion_length": 249.9241180419922, + "epoch": 1.0781256548891403, + "grad_norm": 0.22794782342785525, + "kl": 0.06793212890625, + "learning_rate": 4.995925623120121e-07, + "loss": 0.0001, + "reward": 1.6964286491274834, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7053571809083223, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6430 + }, + { + "completion_length": 251.34376335144043, + "epoch": 1.0784609581290079, + "grad_norm": 0.21343348359286007, + "kl": 0.064117431640625, + "learning_rate": 4.995918774415226e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 6432 + }, + { + "completion_length": 250.5178680419922, + "epoch": 1.0787962613688755, + "grad_norm": 0.2154335777824777, + "kl": 0.073577880859375, + "learning_rate": 4.995911919963802e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 6434 + }, + { + "completion_length": 256.2455520629883, + "epoch": 1.079131564608743, + "grad_norm": 0.22964136065354926, + "kl": 0.0595703125, + "learning_rate": 4.995905059765863e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.8080357350409031, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6436 + }, + { + "completion_length": 255.1294765472412, + "epoch": 1.0794668678486106, + "grad_norm": 0.30185869514240066, + "kl": 0.069580078125, + "learning_rate": 4.995898193821426e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 6438 + }, + { + "completion_length": 254.9241180419922, + "epoch": 1.0798021710884782, + "grad_norm": 0.22874675324827642, + "kl": 0.060516357421875, + "learning_rate": 4.995891322130505e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643014669418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6440 + }, + { + "completion_length": 245.75447845458984, + "epoch": 1.0801374743283456, + "grad_norm": 0.15348513336418879, + "kl": 0.058837890625, + "learning_rate": 4.995884444693117e-07, + "loss": 0.0001, + "reward": 1.835714340209961, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8357143104076385, + "rewards/format_reward_func": 1.0, + "step": 6442 + }, + { + "completion_length": 247.2366189956665, + "epoch": 1.0804727775682132, + "grad_norm": 0.17123096708578153, + "kl": 0.0656585693359375, + "learning_rate": 4.995877561509278e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286170899868, + "rewards/format_reward_func": 1.0, + "step": 6444 + }, + { + "completion_length": 257.4598340988159, + "epoch": 1.0808080808080809, + "grad_norm": 0.28229072762750085, + "kl": 0.069366455078125, + "learning_rate": 4.995870672579003e-07, + "loss": 0.0001, + "reward": 1.7361607998609543, + "reward_std": 0.10038390709087253, + "rewards/equation_reward_func": 0.7419643122702837, + "rewards/format_reward_func": 0.9941964335739613, + "step": 6446 + }, + { + "completion_length": 249.0089406967163, + "epoch": 1.0811433840479483, + "grad_norm": 0.2814686950997999, + "kl": 0.08123779296875, + "learning_rate": 4.995863777902308e-07, + "loss": 0.0001, + "reward": 1.8196429088711739, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8241071663796902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6448 + }, + { + "completion_length": 251.23215293884277, + "epoch": 1.081478687287816, + "grad_norm": 0.18263971276236252, + "kl": 0.0647430419921875, + "learning_rate": 4.99585687747921e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6450 + }, + { + "completion_length": 246.1071548461914, + "epoch": 1.0818139905276836, + "grad_norm": 0.3340344630893179, + "kl": 0.07000732421875, + "learning_rate": 4.995849971309723e-07, + "loss": 0.0001, + "reward": 1.7785715088248253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 6452 + }, + { + "completion_length": 245.1875114440918, + "epoch": 1.082149293767551, + "grad_norm": 0.23800132416609462, + "kl": 0.0767669677734375, + "learning_rate": 4.995843059393865e-07, + "loss": 0.0001, + "reward": 1.7410714775323868, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6454 + }, + { + "completion_length": 247.5000114440918, + "epoch": 1.0824845970074186, + "grad_norm": 0.19794952988984094, + "kl": 0.07769775390625, + "learning_rate": 4.995836141731651e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 1.0, + "step": 6456 + }, + { + "completion_length": 244.90179634094238, + "epoch": 1.0828199002472862, + "grad_norm": 0.2173504470847555, + "kl": 0.0712127685546875, + "learning_rate": 4.995829218323096e-07, + "loss": 0.0001, + "reward": 1.7053572162985802, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7098214700818062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6458 + }, + { + "completion_length": 258.9375123977661, + "epoch": 1.0831552034871537, + "grad_norm": 0.24258856009385474, + "kl": 0.0692596435546875, + "learning_rate": 4.995822289168216e-07, + "loss": 0.0001, + "reward": 1.7035715207457542, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.703571442514658, + "rewards/format_reward_func": 1.0, + "step": 6460 + }, + { + "completion_length": 254.15626525878906, + "epoch": 1.0834905067270213, + "grad_norm": 0.21434048158714838, + "kl": 0.0685882568359375, + "learning_rate": 4.99581535426703e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071909666061, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6462 + }, + { + "completion_length": 255.51786708831787, + "epoch": 1.0838258099668887, + "grad_norm": 0.322625348089809, + "kl": 0.066741943359375, + "learning_rate": 4.99580841361955e-07, + "loss": 0.0001, + "reward": 1.7303572371602058, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6464 + }, + { + "completion_length": 262.401798248291, + "epoch": 1.0841611132067563, + "grad_norm": 0.18824960966371346, + "kl": 0.0705108642578125, + "learning_rate": 4.995801467225794e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7508928924798965, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6466 + }, + { + "completion_length": 253.3259048461914, + "epoch": 1.084496416446624, + "grad_norm": 0.32009667898891864, + "kl": 0.074462890625, + "learning_rate": 4.995794515085778e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6468 + }, + { + "completion_length": 252.25001525878906, + "epoch": 1.0848317196864914, + "grad_norm": 0.23876316643511034, + "kl": 0.069305419921875, + "learning_rate": 4.995787557199518e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7732143066823483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6470 + }, + { + "completion_length": 255.30804920196533, + "epoch": 1.085167022926359, + "grad_norm": 0.26642096787601005, + "kl": 0.069610595703125, + "learning_rate": 4.995780593567028e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 6472 + }, + { + "completion_length": 252.88394260406494, + "epoch": 1.0855023261662267, + "grad_norm": 0.24414306302956112, + "kl": 0.0724029541015625, + "learning_rate": 4.995773624188328e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7312500216066837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6474 + }, + { + "completion_length": 255.62054538726807, + "epoch": 1.085837629406094, + "grad_norm": 0.250032728392339, + "kl": 0.0707550048828125, + "learning_rate": 4.995766649063429e-07, + "loss": 0.0001, + "reward": 1.7964286133646965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 6476 + }, + { + "completion_length": 256.508939743042, + "epoch": 1.0861729326459617, + "grad_norm": 0.1228238932830406, + "kl": 0.0816497802734375, + "learning_rate": 4.995759668192352e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143107801676, + "rewards/format_reward_func": 1.0, + "step": 6478 + }, + { + "completion_length": 253.83483409881592, + "epoch": 1.0865082358858293, + "grad_norm": 0.2071801861333888, + "kl": 0.0679779052734375, + "learning_rate": 4.995752681575111e-07, + "loss": 0.0001, + "reward": 1.8160714730620384, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8205357268452644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6480 + }, + { + "completion_length": 241.84822368621826, + "epoch": 1.0868435391256968, + "grad_norm": 0.17132538782288873, + "kl": 0.0664825439453125, + "learning_rate": 4.995745689211721e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000134110451, + "rewards/format_reward_func": 1.0, + "step": 6482 + }, + { + "completion_length": 251.61608123779297, + "epoch": 1.0871788423655644, + "grad_norm": 0.33335513174173054, + "kl": 0.077301025390625, + "learning_rate": 4.9957386911022e-07, + "loss": 0.0001, + "reward": 1.7107143849134445, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7196428813040257, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6484 + }, + { + "completion_length": 249.34375953674316, + "epoch": 1.0875141456054318, + "grad_norm": 0.1509868126855817, + "kl": 0.0727386474609375, + "learning_rate": 4.995731687246562e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 6486 + }, + { + "completion_length": 263.23662090301514, + "epoch": 1.0878494488452994, + "grad_norm": 0.14210317501246147, + "kl": 0.07989501953125, + "learning_rate": 4.995724677644824e-07, + "loss": 0.0001, + "reward": 1.7107143625617027, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7196428924798965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6488 + }, + { + "completion_length": 258.1696548461914, + "epoch": 1.088184752085167, + "grad_norm": 0.22951210067149835, + "kl": 0.075286865234375, + "learning_rate": 4.995717662297003e-07, + "loss": 0.0001, + "reward": 1.7178572565317154, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7178571857511997, + "rewards/format_reward_func": 1.0, + "step": 6490 + }, + { + "completion_length": 256.8303689956665, + "epoch": 1.0885200553250345, + "grad_norm": 0.21521536442832762, + "kl": 0.0674591064453125, + "learning_rate": 4.995710641203115e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6492 + }, + { + "completion_length": 252.008939743042, + "epoch": 1.0888553585649021, + "grad_norm": 0.17696140360889953, + "kl": 0.073944091796875, + "learning_rate": 4.995703614363176e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357406288385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6494 + }, + { + "completion_length": 276.0759048461914, + "epoch": 1.0891906618047698, + "grad_norm": 0.29492164732292375, + "kl": 0.0858001708984375, + "learning_rate": 4.995696581777201e-07, + "loss": 0.0001, + "reward": 1.7196429371833801, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6496 + }, + { + "completion_length": 265.07144260406494, + "epoch": 1.0895259650446372, + "grad_norm": 0.20700246572065972, + "kl": 0.098724365234375, + "learning_rate": 4.995689543445209e-07, + "loss": 0.0001, + "reward": 1.7785714641213417, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7875000238418579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6498 + }, + { + "completion_length": 267.6875104904175, + "epoch": 1.0898612682845048, + "grad_norm": 0.1563277942876201, + "kl": 0.0779876708984375, + "learning_rate": 4.995682499367212e-07, + "loss": 0.0001, + "reward": 1.7892857491970062, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.807142885401845, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6500 + }, + { + "completion_length": 260.4017972946167, + "epoch": 1.0901965715243724, + "grad_norm": 0.16573974603677033, + "kl": 0.0820465087890625, + "learning_rate": 4.99567544954323e-07, + "loss": 0.0001, + "reward": 1.812500037252903, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6502 + }, + { + "completion_length": 270.37054920196533, + "epoch": 1.0905318747642399, + "grad_norm": 0.20563901893599482, + "kl": 0.08184814453125, + "learning_rate": 4.995668393973277e-07, + "loss": 0.0001, + "reward": 1.7089286521077156, + "reward_std": 0.10859140008687973, + "rewards/equation_reward_func": 0.7401786111295223, + "rewards/format_reward_func": 0.9687500111758709, + "step": 6504 + }, + { + "completion_length": 276.2589387893677, + "epoch": 1.0908671780041075, + "grad_norm": 0.7781279067739908, + "kl": 0.1058349609375, + "learning_rate": 4.99566133265737e-07, + "loss": 0.0001, + "reward": 1.712500050663948, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7348214518278837, + "rewards/format_reward_func": 0.977678582072258, + "step": 6506 + }, + { + "completion_length": 284.8482255935669, + "epoch": 1.0912024812439751, + "grad_norm": 0.30134766414761066, + "kl": 0.1042633056640625, + "learning_rate": 4.995654265595526e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6508 + }, + { + "completion_length": 289.0535879135132, + "epoch": 1.0915377844838425, + "grad_norm": 0.25376529695446304, + "kl": 0.114471435546875, + "learning_rate": 4.99564719278776e-07, + "loss": 0.0001, + "reward": 1.7160714715719223, + "reward_std": 0.07828682288527489, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.977678582072258, + "step": 6510 + }, + { + "completion_length": 284.9821557998657, + "epoch": 1.0918730877237102, + "grad_norm": 0.42895058427759786, + "kl": 0.13214111328125, + "learning_rate": 4.99564011423409e-07, + "loss": 0.0001, + "reward": 1.6714286506175995, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.6892857495695353, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6512 + }, + { + "completion_length": 283.6562614440918, + "epoch": 1.0922083909635778, + "grad_norm": 0.6980135663023137, + "kl": 0.156036376953125, + "learning_rate": 4.99563302993453e-07, + "loss": 0.0002, + "reward": 1.6982143893837929, + "reward_std": 0.12374368775635958, + "rewards/equation_reward_func": 0.7294643074274063, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6514 + }, + { + "completion_length": 284.52233600616455, + "epoch": 1.0925436942034452, + "grad_norm": 0.44354741258299424, + "kl": 0.1544189453125, + "learning_rate": 4.995625939889097e-07, + "loss": 0.0002, + "reward": 1.7008929252624512, + "reward_std": 0.059346460737288, + "rewards/equation_reward_func": 0.7116071954369545, + "rewards/format_reward_func": 0.9892857223749161, + "step": 6516 + }, + { + "completion_length": 278.3214406967163, + "epoch": 1.0928789974433128, + "grad_norm": 0.3003540847071273, + "kl": 0.144378662109375, + "learning_rate": 4.995618844097808e-07, + "loss": 0.0001, + "reward": 1.7767857909202576, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.7901785857975483, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6518 + }, + { + "completion_length": 300.0803680419922, + "epoch": 1.0932143006831803, + "grad_norm": 0.6405706414392106, + "kl": 0.28515625, + "learning_rate": 4.99561174256068e-07, + "loss": 0.0003, + "reward": 1.687500074505806, + "reward_std": 0.08838834799826145, + "rewards/equation_reward_func": 0.7098214626312256, + "rewards/format_reward_func": 0.977678582072258, + "step": 6520 + }, + { + "completion_length": 290.151798248291, + "epoch": 1.093549603923048, + "grad_norm": 0.21560716139636138, + "kl": 0.2230987548828125, + "learning_rate": 4.995604635277728e-07, + "loss": 0.0002, + "reward": 1.707142949104309, + "reward_std": 0.11616754159331322, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 0.9642857313156128, + "step": 6522 + }, + { + "completion_length": 288.7901945114136, + "epoch": 1.0938849071629155, + "grad_norm": 0.6662672959830765, + "kl": 0.8878631591796875, + "learning_rate": 4.995597522248968e-07, + "loss": 0.0009, + "reward": 1.7214286401867867, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6524 + }, + { + "completion_length": 289.4553689956665, + "epoch": 1.094220210402783, + "grad_norm": 0.1884437199547336, + "kl": 0.4522247314453125, + "learning_rate": 4.995590403474419e-07, + "loss": 0.0005, + "reward": 1.707142896950245, + "reward_std": 0.08081220462918282, + "rewards/equation_reward_func": 0.7339286208152771, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6526 + }, + { + "completion_length": 268.06251335144043, + "epoch": 1.0945555136426506, + "grad_norm": 0.29136324362216554, + "kl": 0.8277587890625, + "learning_rate": 4.995583278954095e-07, + "loss": 0.0008, + "reward": 1.782142922282219, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7910714484751225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6528 + }, + { + "completion_length": 282.5134105682373, + "epoch": 1.0948908168825182, + "grad_norm": 0.13064029811812, + "kl": 1.827606201171875, + "learning_rate": 4.995576148688012e-07, + "loss": 0.0018, + "reward": 1.771428607404232, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7982143238186836, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6530 + }, + { + "completion_length": 274.4553699493408, + "epoch": 1.0952261201223856, + "grad_norm": 0.2776119596095246, + "kl": 0.695587158203125, + "learning_rate": 4.995569012676189e-07, + "loss": 0.0007, + "reward": 1.7375000640749931, + "reward_std": 0.08838834799826145, + "rewards/equation_reward_func": 0.7508928775787354, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6532 + }, + { + "completion_length": 283.30358695983887, + "epoch": 1.0955614233622533, + "grad_norm": 0.2657775533041802, + "kl": 0.5340576171875, + "learning_rate": 4.995561870918639e-07, + "loss": 0.0005, + "reward": 1.7482143715023994, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6534 + }, + { + "completion_length": 290.11162281036377, + "epoch": 1.095896726602121, + "grad_norm": 0.3284617239203021, + "kl": 0.455780029296875, + "learning_rate": 4.995554723415382e-07, + "loss": 0.0005, + "reward": 1.7071429342031479, + "reward_std": 0.10606601741164923, + "rewards/equation_reward_func": 0.733928594738245, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6536 + }, + { + "completion_length": 285.56251525878906, + "epoch": 1.0962320298419883, + "grad_norm": 0.4737519246446728, + "kl": 0.477142333984375, + "learning_rate": 4.995547570166432e-07, + "loss": 0.0005, + "reward": 1.7482143715023994, + "reward_std": 0.10859139822423458, + "rewards/equation_reward_func": 0.779464315623045, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6538 + }, + { + "completion_length": 281.23662281036377, + "epoch": 1.096567333081856, + "grad_norm": 0.28880568251512756, + "kl": 0.977142333984375, + "learning_rate": 4.995540411171805e-07, + "loss": 0.001, + "reward": 1.767857201397419, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6540 + }, + { + "completion_length": 289.55358505249023, + "epoch": 1.0969026363217234, + "grad_norm": 0.022334262039546134, + "kl": 0.261566162109375, + "learning_rate": 4.99553324643152e-07, + "loss": 0.0003, + "reward": 1.757142923772335, + "reward_std": 0.04040610231459141, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6542 + }, + { + "completion_length": 285.9241199493408, + "epoch": 1.097237939561591, + "grad_norm": 0.30625968284332616, + "kl": 0.1597900390625, + "learning_rate": 4.995526075945593e-07, + "loss": 0.0002, + "reward": 1.7553571835160255, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.977678582072258, + "step": 6544 + }, + { + "completion_length": 279.4241199493408, + "epoch": 1.0975732428014586, + "grad_norm": 0.4419334505325685, + "kl": 0.6954498291015625, + "learning_rate": 4.995518899714039e-07, + "loss": 0.0007, + "reward": 1.685714341700077, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.7035714499652386, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6546 + }, + { + "completion_length": 286.0134086608887, + "epoch": 1.097908546041326, + "grad_norm": 0.33027778382925066, + "kl": 0.388275146484375, + "learning_rate": 4.995511717736875e-07, + "loss": 0.0004, + "reward": 1.73214291036129, + "reward_std": 0.11616754066199064, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6548 + }, + { + "completion_length": 283.43750953674316, + "epoch": 1.0982438492811937, + "grad_norm": 0.21767882315718848, + "kl": 0.23004150390625, + "learning_rate": 4.995504530014117e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7589286006987095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6550 + }, + { + "completion_length": 281.70983123779297, + "epoch": 1.0985791525210613, + "grad_norm": 0.3029158393739477, + "kl": 0.1859130859375, + "learning_rate": 4.995497336545783e-07, + "loss": 0.0002, + "reward": 1.7410715073347092, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357611179352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6552 + }, + { + "completion_length": 270.4107246398926, + "epoch": 1.0989144557609287, + "grad_norm": 0.37787958558581103, + "kl": 0.186492919921875, + "learning_rate": 4.995490137331889e-07, + "loss": 0.0002, + "reward": 1.7946429178118706, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.8080357275903225, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6554 + }, + { + "completion_length": 288.4419765472412, + "epoch": 1.0992497590007964, + "grad_norm": 0.26048348704828717, + "kl": 0.204254150390625, + "learning_rate": 4.995482932372451e-07, + "loss": 0.0002, + "reward": 1.698214367032051, + "reward_std": 0.12879444938153028, + "rewards/equation_reward_func": 0.7294643111526966, + "rewards/format_reward_func": 0.9687500111758709, + "step": 6556 + }, + { + "completion_length": 261.23662090301514, + "epoch": 1.099585062240664, + "grad_norm": 0.551212163479299, + "kl": 0.3712005615234375, + "learning_rate": 4.995475721667486e-07, + "loss": 0.0004, + "reward": 1.7089286148548126, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7223214562982321, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6558 + }, + { + "completion_length": 249.2544755935669, + "epoch": 1.0999203654805314, + "grad_norm": 0.20253441295934366, + "kl": 0.2565765380859375, + "learning_rate": 4.995468505217011e-07, + "loss": 0.0003, + "reward": 1.769642911851406, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6560 + }, + { + "completion_length": 258.9598340988159, + "epoch": 1.100255668720399, + "grad_norm": 0.21418411741040597, + "kl": 0.1841278076171875, + "learning_rate": 4.995461283021043e-07, + "loss": 0.0002, + "reward": 1.6607143729925156, + "reward_std": 0.0656599160283804, + "rewards/equation_reward_func": 0.6785714607685804, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6562 + }, + { + "completion_length": 253.91519165039062, + "epoch": 1.1005909719602665, + "grad_norm": 0.2070990208135219, + "kl": 0.144287109375, + "learning_rate": 4.995454055079597e-07, + "loss": 0.0001, + "reward": 1.721428632736206, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214286178350449, + "rewards/format_reward_func": 1.0, + "step": 6564 + }, + { + "completion_length": 246.5803689956665, + "epoch": 1.100926275200134, + "grad_norm": 0.13879373930903302, + "kl": 0.237457275390625, + "learning_rate": 4.995446821392691e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 6566 + }, + { + "completion_length": 242.7142972946167, + "epoch": 1.1012615784400017, + "grad_norm": 0.26167857128278654, + "kl": 0.146392822265625, + "learning_rate": 4.99543958196034e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.728571455925703, + "rewards/format_reward_func": 1.0, + "step": 6568 + }, + { + "completion_length": 248.7500123977661, + "epoch": 1.1015968816798691, + "grad_norm": 0.15531658303917262, + "kl": 0.11407470703125, + "learning_rate": 4.995432336782563e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6570 + }, + { + "completion_length": 235.06251049041748, + "epoch": 1.1019321849197368, + "grad_norm": 0.8186170595241872, + "kl": 0.1658935546875, + "learning_rate": 4.995425085859375e-07, + "loss": 0.0002, + "reward": 1.8142857775092125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 6572 + }, + { + "completion_length": 239.82590293884277, + "epoch": 1.1022674881596044, + "grad_norm": 0.06946096262257632, + "kl": 0.1508331298828125, + "learning_rate": 4.995417829190793e-07, + "loss": 0.0002, + "reward": 1.8464286103844643, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8464285880327225, + "rewards/format_reward_func": 1.0, + "step": 6574 + }, + { + "completion_length": 241.59375953674316, + "epoch": 1.1026027913994718, + "grad_norm": 0.3313378435357141, + "kl": 0.213287353515625, + "learning_rate": 4.995410566776835e-07, + "loss": 0.0002, + "reward": 1.703571505844593, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7035714667290449, + "rewards/format_reward_func": 1.0, + "step": 6576 + }, + { + "completion_length": 233.88394165039062, + "epoch": 1.1029380946393395, + "grad_norm": 0.16843392243853206, + "kl": 0.14697265625, + "learning_rate": 4.995403298617516e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 6578 + }, + { + "completion_length": 239.76786994934082, + "epoch": 1.103273397879207, + "grad_norm": 0.3046427178111775, + "kl": 0.140228271484375, + "learning_rate": 4.995396024712853e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7250000331550837, + "rewards/format_reward_func": 1.0, + "step": 6580 + }, + { + "completion_length": 243.0803680419922, + "epoch": 1.1036087011190745, + "grad_norm": 0.21699272384070514, + "kl": 0.19512939453125, + "learning_rate": 4.995388745062864e-07, + "loss": 0.0002, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 6582 + }, + { + "completion_length": 245.22769165039062, + "epoch": 1.1039440043589421, + "grad_norm": 0.1718854405192041, + "kl": 0.1443328857421875, + "learning_rate": 4.995381459667564e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 6584 + }, + { + "completion_length": 234.7098331451416, + "epoch": 1.1042793075988098, + "grad_norm": 0.22912566502299336, + "kl": 0.168731689453125, + "learning_rate": 4.995374168526971e-07, + "loss": 0.0002, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 6586 + }, + { + "completion_length": 235.71429634094238, + "epoch": 1.1046146108386772, + "grad_norm": 0.1733917636861512, + "kl": 0.141448974609375, + "learning_rate": 4.995366871641101e-07, + "loss": 0.0001, + "reward": 1.7982143312692642, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6588 + }, + { + "completion_length": 242.3392972946167, + "epoch": 1.1049499140785448, + "grad_norm": 0.3394077171058364, + "kl": 0.211273193359375, + "learning_rate": 4.995359569009972e-07, + "loss": 0.0002, + "reward": 1.7125000953674316, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7169643230736256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6590 + }, + { + "completion_length": 252.6875123977661, + "epoch": 1.1052852173184124, + "grad_norm": 0.43081293111981867, + "kl": 0.11590576171875, + "learning_rate": 4.995352260633601e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714484751225, + "rewards/format_reward_func": 1.0, + "step": 6592 + }, + { + "completion_length": 258.4330472946167, + "epoch": 1.1056205205582799, + "grad_norm": 0.21628723638974706, + "kl": 0.13763427734375, + "learning_rate": 4.995344946512002e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.05555839091539383, + "rewards/equation_reward_func": 0.7482143007218838, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6594 + }, + { + "completion_length": 248.11608123779297, + "epoch": 1.1059558237981475, + "grad_norm": 0.5966538770183953, + "kl": 0.261688232421875, + "learning_rate": 4.995337626645195e-07, + "loss": 0.0003, + "reward": 1.7875000685453415, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.791964303702116, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6596 + }, + { + "completion_length": 245.98215103149414, + "epoch": 1.106291127038015, + "grad_norm": 0.157717130031815, + "kl": 0.097625732421875, + "learning_rate": 4.995330301033195e-07, + "loss": 0.0001, + "reward": 1.7625000700354576, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7669643107801676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6598 + }, + { + "completion_length": 248.59376335144043, + "epoch": 1.1066264302778825, + "grad_norm": 0.13235538653295403, + "kl": 0.130096435546875, + "learning_rate": 4.995322969676019e-07, + "loss": 0.0001, + "reward": 1.719642922282219, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6600 + }, + { + "completion_length": 255.5089406967163, + "epoch": 1.1069617335177502, + "grad_norm": 0.3280297518641213, + "kl": 0.145782470703125, + "learning_rate": 4.995315632573684e-07, + "loss": 0.0001, + "reward": 1.6982143893837929, + "reward_std": 0.11364216171205044, + "rewards/equation_reward_func": 0.7205357421189547, + "rewards/format_reward_func": 0.977678582072258, + "step": 6602 + }, + { + "completion_length": 270.5000104904175, + "epoch": 1.1072970367576176, + "grad_norm": 0.9859893448344605, + "kl": 0.204925537109375, + "learning_rate": 4.995308289726209e-07, + "loss": 0.0002, + "reward": 1.7214286476373672, + "reward_std": 0.09091373067349195, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6604 + }, + { + "completion_length": 254.0848331451416, + "epoch": 1.1076323399974852, + "grad_norm": 0.6059061751603121, + "kl": 0.307647705078125, + "learning_rate": 4.995300941133608e-07, + "loss": 0.0003, + "reward": 1.7660714983940125, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6606 + }, + { + "completion_length": 274.86608600616455, + "epoch": 1.1079676432373529, + "grad_norm": 0.2292386723821532, + "kl": 0.272247314453125, + "learning_rate": 4.995293586795899e-07, + "loss": 0.0003, + "reward": 1.7553572282195091, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6608 + }, + { + "completion_length": 261.2946557998657, + "epoch": 1.1083029464772203, + "grad_norm": 0.2272666943279464, + "kl": 0.20709228515625, + "learning_rate": 4.995286226713099e-07, + "loss": 0.0002, + "reward": 1.7071429565548897, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7160714566707611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6610 + }, + { + "completion_length": 253.86608123779297, + "epoch": 1.108638249717088, + "grad_norm": 0.4846628000520213, + "kl": 0.3332977294921875, + "learning_rate": 4.995278860885225e-07, + "loss": 0.0003, + "reward": 1.6946429386734962, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.6991071794182062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6612 + }, + { + "completion_length": 254.2321538925171, + "epoch": 1.1089735529569555, + "grad_norm": 0.16874754553226004, + "kl": 0.1689453125, + "learning_rate": 4.995271489312294e-07, + "loss": 0.0002, + "reward": 1.7933036237955093, + "reward_std": 0.05997780850157142, + "rewards/equation_reward_func": 0.808035746216774, + "rewards/format_reward_func": 0.9852678664028645, + "step": 6614 + }, + { + "completion_length": 246.46429538726807, + "epoch": 1.109308856196823, + "grad_norm": 0.2300480690557061, + "kl": 0.7288818359375, + "learning_rate": 4.995264111994322e-07, + "loss": 0.0007, + "reward": 1.7285714969038963, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7375000342726707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6616 + }, + { + "completion_length": 255.94197463989258, + "epoch": 1.1096441594366906, + "grad_norm": 0.2096849116943943, + "kl": 1.5742645263671875, + "learning_rate": 4.995256728931328e-07, + "loss": 0.0016, + "reward": 1.7254464998841286, + "reward_std": 0.054927044780924916, + "rewards/equation_reward_func": 0.7401785962283611, + "rewards/format_reward_func": 0.9852678664028645, + "step": 6618 + }, + { + "completion_length": 253.44197273254395, + "epoch": 1.109979462676558, + "grad_norm": 0.24599453467498117, + "kl": 0.540283203125, + "learning_rate": 4.995249340123328e-07, + "loss": 0.0005, + "reward": 1.801785759627819, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500283122063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6620 + }, + { + "completion_length": 254.8750114440918, + "epoch": 1.1103147659164256, + "grad_norm": 0.3624082535916845, + "kl": 0.9485015869140625, + "learning_rate": 4.995241945570339e-07, + "loss": 0.001, + "reward": 1.7517857775092125, + "reward_std": 0.1287944484502077, + "rewards/equation_reward_func": 0.7830357439815998, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6622 + }, + { + "completion_length": 249.16072463989258, + "epoch": 1.1106500691562933, + "grad_norm": 0.22190804563509325, + "kl": 0.233489990234375, + "learning_rate": 4.995234545272377e-07, + "loss": 0.0002, + "reward": 1.7892858013510704, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7982143126428127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6624 + }, + { + "completion_length": 266.75894355773926, + "epoch": 1.1109853723961607, + "grad_norm": 0.2184835683267264, + "kl": 0.2601318359375, + "learning_rate": 4.99522713922946e-07, + "loss": 0.0003, + "reward": 1.6803572326898575, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.684821480885148, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6626 + }, + { + "completion_length": 258.70536518096924, + "epoch": 1.1113206756360283, + "grad_norm": 0.23099437396258055, + "kl": 0.1270751953125, + "learning_rate": 4.995219727441605e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.07576144114136696, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6628 + }, + { + "completion_length": 249.26340579986572, + "epoch": 1.111655978875896, + "grad_norm": 0.15913812225492693, + "kl": 0.1038665771484375, + "learning_rate": 4.995212309908829e-07, + "loss": 0.0001, + "reward": 1.7321429029107094, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7500000242143869, + "rewards/format_reward_func": 0.9821428619325161, + "step": 6630 + }, + { + "completion_length": 255.02233505249023, + "epoch": 1.1119912821157634, + "grad_norm": 0.1314757051272552, + "kl": 0.1993408203125, + "learning_rate": 4.995204886631149e-07, + "loss": 0.0002, + "reward": 1.753571517765522, + "reward_std": 0.05555839091539383, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6632 + }, + { + "completion_length": 251.0848331451416, + "epoch": 1.112326585355631, + "grad_norm": 0.32806559142892566, + "kl": 0.133941650390625, + "learning_rate": 4.995197457608582e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.10101525485515594, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6634 + }, + { + "completion_length": 255.01787281036377, + "epoch": 1.1126618885954986, + "grad_norm": 0.16793208241146898, + "kl": 0.108489990234375, + "learning_rate": 4.995190022841146e-07, + "loss": 0.0001, + "reward": 1.7500000447034836, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7589286044239998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6636 + }, + { + "completion_length": 262.2142972946167, + "epoch": 1.112997191835366, + "grad_norm": 0.08044673225781292, + "kl": 0.131439208984375, + "learning_rate": 4.995182582328858e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7169643100351095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6638 + }, + { + "completion_length": 256.00001335144043, + "epoch": 1.1133324950752337, + "grad_norm": 0.20721738332140918, + "kl": 0.11676025390625, + "learning_rate": 4.995175136071733e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6640 + }, + { + "completion_length": 246.9419765472412, + "epoch": 1.1136677983151013, + "grad_norm": 0.24787722237396217, + "kl": 0.095733642578125, + "learning_rate": 4.99516768406979e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6642 + }, + { + "completion_length": 241.3437614440918, + "epoch": 1.1140031015549687, + "grad_norm": 0.4045079891732805, + "kl": 0.08056640625, + "learning_rate": 4.995160226323046e-07, + "loss": 0.0001, + "reward": 1.7803572192788124, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6644 + }, + { + "completion_length": 245.88840198516846, + "epoch": 1.1143384047948364, + "grad_norm": 0.28886896563636494, + "kl": 0.080047607421875, + "learning_rate": 4.99515276283152e-07, + "loss": 0.0001, + "reward": 1.7580358013510704, + "reward_std": 0.08965103607624769, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9982142895460129, + "step": 6646 + }, + { + "completion_length": 260.1384038925171, + "epoch": 1.114673708034704, + "grad_norm": 0.18604455411456783, + "kl": 0.103424072265625, + "learning_rate": 4.995145293595224e-07, + "loss": 0.0001, + "reward": 1.7375000938773155, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6648 + }, + { + "completion_length": 244.8169765472412, + "epoch": 1.1150090112745714, + "grad_norm": 0.13081924970963807, + "kl": 0.073028564453125, + "learning_rate": 4.995137818614181e-07, + "loss": 0.0001, + "reward": 1.8642857670783997, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8642857372760773, + "rewards/format_reward_func": 1.0, + "step": 6650 + }, + { + "completion_length": 256.6294765472412, + "epoch": 1.115344314514439, + "grad_norm": 0.2613255092581983, + "kl": 0.088592529296875, + "learning_rate": 4.995130337888404e-07, + "loss": 0.0001, + "reward": 1.7678571864962578, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6652 + }, + { + "completion_length": 253.9241189956665, + "epoch": 1.1156796177543065, + "grad_norm": 0.29768289237812334, + "kl": 0.07568359375, + "learning_rate": 4.995122851417912e-07, + "loss": 0.0001, + "reward": 1.708928681910038, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7223214525729418, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6654 + }, + { + "completion_length": 242.65625858306885, + "epoch": 1.116014920994174, + "grad_norm": 0.2506214712263386, + "kl": 0.1123504638671875, + "learning_rate": 4.995115359202722e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.72142861969769, + "rewards/format_reward_func": 1.0, + "step": 6656 + }, + { + "completion_length": 253.31251049041748, + "epoch": 1.1163502242340417, + "grad_norm": 0.2593847018803885, + "kl": 0.1129302978515625, + "learning_rate": 4.995107861242852e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6658 + }, + { + "completion_length": 255.15179824829102, + "epoch": 1.1166855274739091, + "grad_norm": 0.27640892602323935, + "kl": 0.07427978515625, + "learning_rate": 4.995100357538319e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 6660 + }, + { + "completion_length": 258.34376335144043, + "epoch": 1.1170208307137768, + "grad_norm": 0.2795999240184237, + "kl": 0.152252197265625, + "learning_rate": 4.995092848089139e-07, + "loss": 0.0002, + "reward": 1.7357143759727478, + "reward_std": 0.1010152529925108, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6662 + }, + { + "completion_length": 248.71876430511475, + "epoch": 1.1173561339536444, + "grad_norm": 0.2088474635895226, + "kl": 0.067962646484375, + "learning_rate": 4.99508533289533e-07, + "loss": 0.0001, + "reward": 1.6928572282195091, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7017857655882835, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6664 + }, + { + "completion_length": 260.60715675354004, + "epoch": 1.1176914371935118, + "grad_norm": 0.23495409011200794, + "kl": 0.107940673828125, + "learning_rate": 4.99507781195691e-07, + "loss": 0.0001, + "reward": 1.7410714775323868, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7544643208384514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6666 + }, + { + "completion_length": 258.20537185668945, + "epoch": 1.1180267404333795, + "grad_norm": 0.3012901568012722, + "kl": 0.18408203125, + "learning_rate": 4.995070285273895e-07, + "loss": 0.0002, + "reward": 1.7660714834928513, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.779464315623045, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6668 + }, + { + "completion_length": 267.14287090301514, + "epoch": 1.118362043673247, + "grad_norm": 0.12807288181741017, + "kl": 0.08056640625, + "learning_rate": 4.995062752846304e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.743750024586916, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6670 + }, + { + "completion_length": 254.95983219146729, + "epoch": 1.1186973469131145, + "grad_norm": 0.27471962269330896, + "kl": 0.365966796875, + "learning_rate": 4.995055214674153e-07, + "loss": 0.0004, + "reward": 1.7339286282658577, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7473214566707611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6672 + }, + { + "completion_length": 245.0714406967163, + "epoch": 1.1190326501529821, + "grad_norm": 0.445086464766825, + "kl": 0.2374267578125, + "learning_rate": 4.99504767075746e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428701281548, + "rewards/format_reward_func": 1.0, + "step": 6674 + }, + { + "completion_length": 258.39286613464355, + "epoch": 1.1193679533928496, + "grad_norm": 0.41685696367688935, + "kl": 0.120513916015625, + "learning_rate": 4.995040121096243e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6676 + }, + { + "completion_length": 242.99554538726807, + "epoch": 1.1197032566327172, + "grad_norm": 0.40514511717107204, + "kl": 0.22406005859375, + "learning_rate": 4.995032565690517e-07, + "loss": 0.0002, + "reward": 1.7160715088248253, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7205357495695353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6678 + }, + { + "completion_length": 240.9821538925171, + "epoch": 1.1200385598725848, + "grad_norm": 0.46798781534682443, + "kl": 0.207855224609375, + "learning_rate": 4.995025004540301e-07, + "loss": 0.0002, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 6680 + }, + { + "completion_length": 256.11161708831787, + "epoch": 1.1203738631124522, + "grad_norm": 0.6230530844540287, + "kl": 0.28582763671875, + "learning_rate": 4.995017437645613e-07, + "loss": 0.0003, + "reward": 1.770089365541935, + "reward_std": 0.09280776605010033, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 0.9808035790920258, + "step": 6682 + }, + { + "completion_length": 236.52679634094238, + "epoch": 1.1207091663523199, + "grad_norm": 2.168458827730386, + "kl": 0.2948150634765625, + "learning_rate": 4.99500986500647e-07, + "loss": 0.0003, + "reward": 1.7080358043313026, + "reward_std": 0.04924493608996272, + "rewards/equation_reward_func": 0.7107143364846706, + "rewards/format_reward_func": 0.9973214343190193, + "step": 6684 + }, + { + "completion_length": 235.4553680419922, + "epoch": 1.1210444695921875, + "grad_norm": 0.5230753506181978, + "kl": 0.3092041015625, + "learning_rate": 4.995002286622888e-07, + "loss": 0.0003, + "reward": 1.7428572252392769, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6686 + }, + { + "completion_length": 263.477689743042, + "epoch": 1.121379772832055, + "grad_norm": 2.2929950926167426, + "kl": 1.7777099609375, + "learning_rate": 4.994994702494887e-07, + "loss": 0.0018, + "reward": 1.7089285999536514, + "reward_std": 0.1439467379823327, + "rewards/equation_reward_func": 0.7580357454717159, + "rewards/format_reward_func": 0.9508928805589676, + "step": 6688 + }, + { + "completion_length": 332.70983505249023, + "epoch": 1.1217150760719226, + "grad_norm": 2.489929363328956, + "kl": 2.840087890625, + "learning_rate": 4.994987112622483e-07, + "loss": 0.0028, + "reward": 1.1937500424683094, + "reward_std": 0.4482551934197545, + "rewards/equation_reward_func": 0.535714304074645, + "rewards/format_reward_func": 0.6580357477068901, + "step": 6690 + }, + { + "completion_length": 297.50001430511475, + "epoch": 1.1220503793117902, + "grad_norm": 2.5227119840513015, + "kl": 1.386444091796875, + "learning_rate": 4.994979517005693e-07, + "loss": 0.0014, + "reward": 1.365625061094761, + "reward_std": 0.3276682370342314, + "rewards/equation_reward_func": 0.6062500178813934, + "rewards/format_reward_func": 0.7593750320374966, + "step": 6692 + }, + { + "completion_length": 292.82590770721436, + "epoch": 1.1223856825516576, + "grad_norm": 1.663920193770367, + "kl": 1.7261810302734375, + "learning_rate": 4.994971915644536e-07, + "loss": 0.0017, + "reward": 1.4058036357164383, + "reward_std": 0.3415578296408057, + "rewards/equation_reward_func": 0.6482142992317677, + "rewards/format_reward_func": 0.7575893234461546, + "step": 6694 + }, + { + "completion_length": 314.8214454650879, + "epoch": 1.1227209857915252, + "grad_norm": 1.2900474738091785, + "kl": 1.621734619140625, + "learning_rate": 4.994964308539027e-07, + "loss": 0.0016, + "reward": 1.278571479022503, + "reward_std": 0.3838579710572958, + "rewards/equation_reward_func": 0.5732143018394709, + "rewards/format_reward_func": 0.7053571790456772, + "step": 6696 + }, + { + "completion_length": 272.94197368621826, + "epoch": 1.1230562890313927, + "grad_norm": 1.9977503024608492, + "kl": 10.36822509765625, + "learning_rate": 4.994956695689187e-07, + "loss": 0.0104, + "reward": 1.4410714656114578, + "reward_std": 0.3055711481720209, + "rewards/equation_reward_func": 0.6151786055415869, + "rewards/format_reward_func": 0.8258929029107094, + "step": 6698 + }, + { + "completion_length": 259.0178699493408, + "epoch": 1.1233915922712603, + "grad_norm": 1.0655966422208405, + "kl": 8.722640991210938, + "learning_rate": 4.99494907709503e-07, + "loss": 0.0087, + "reward": 1.5750000774860382, + "reward_std": 0.2676904257386923, + "rewards/equation_reward_func": 0.7178571633994579, + "rewards/format_reward_func": 0.8571428880095482, + "step": 6700 + }, + { + "completion_length": 249.94197750091553, + "epoch": 1.123726895511128, + "grad_norm": 1.1186796278129825, + "kl": 7.51885986328125, + "learning_rate": 4.994941452756575e-07, + "loss": 0.0075, + "reward": 1.6553571745753288, + "reward_std": 0.18940360005944967, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.906250037252903, + "step": 6702 + }, + { + "completion_length": 241.90626335144043, + "epoch": 1.1240621987509953, + "grad_norm": 0.26440510902701225, + "kl": 0.41455078125, + "learning_rate": 4.994933822673841e-07, + "loss": 0.0004, + "reward": 1.6441965103149414, + "reward_std": 0.0839689327403903, + "rewards/equation_reward_func": 0.7053571715950966, + "rewards/format_reward_func": 0.9388393126428127, + "step": 6704 + }, + { + "completion_length": 233.44197463989258, + "epoch": 1.124397501990863, + "grad_norm": 0.40772070413726497, + "kl": 0.08148193359375, + "learning_rate": 4.994926186846844e-07, + "loss": 0.0001, + "reward": 1.6924107670783997, + "reward_std": 0.08270624093711376, + "rewards/equation_reward_func": 0.7321428768336773, + "rewards/format_reward_func": 0.9602678790688515, + "step": 6706 + }, + { + "completion_length": 224.9196548461914, + "epoch": 1.1247328052307306, + "grad_norm": 0.2948453419333735, + "kl": 0.0688323974609375, + "learning_rate": 4.994918545275601e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.812500037252903, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6708 + }, + { + "completion_length": 232.6696538925171, + "epoch": 1.125068108470598, + "grad_norm": 0.23710215383380753, + "kl": 0.06982421875, + "learning_rate": 4.994910897960132e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 0.9642857238650322, + "step": 6710 + }, + { + "completion_length": 229.37947463989258, + "epoch": 1.1254034117104657, + "grad_norm": 0.28880507537327116, + "kl": 0.083526611328125, + "learning_rate": 4.994903244900453e-07, + "loss": 0.0001, + "reward": 1.7000000774860382, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 0.9642857313156128, + "step": 6712 + }, + { + "completion_length": 234.78572368621826, + "epoch": 1.1257387149503333, + "grad_norm": 0.49477930676249415, + "kl": 0.129913330078125, + "learning_rate": 4.994895586096581e-07, + "loss": 0.0001, + "reward": 1.6857143267989159, + "reward_std": 0.12626906856894493, + "rewards/equation_reward_func": 0.7392857354134321, + "rewards/format_reward_func": 0.9464285969734192, + "step": 6714 + }, + { + "completion_length": 234.7366180419922, + "epoch": 1.1260740181902007, + "grad_norm": 0.2093629175985944, + "kl": 0.0768280029296875, + "learning_rate": 4.994887921548533e-07, + "loss": 0.0001, + "reward": 1.7267857566475868, + "reward_std": 0.10354063659906387, + "rewards/equation_reward_func": 0.7758928947150707, + "rewards/format_reward_func": 0.950892873108387, + "step": 6716 + }, + { + "completion_length": 226.94643878936768, + "epoch": 1.1264093214300683, + "grad_norm": 0.27764295574443226, + "kl": 0.0676422119140625, + "learning_rate": 4.994880251256329e-07, + "loss": 0.0001, + "reward": 1.6910715103149414, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.713392885401845, + "rewards/format_reward_func": 0.9776785783469677, + "step": 6718 + }, + { + "completion_length": 252.6562614440918, + "epoch": 1.126744624669936, + "grad_norm": 0.7163660640986659, + "kl": 0.150604248046875, + "learning_rate": 4.994872575219986e-07, + "loss": 0.0002, + "reward": 1.6642857491970062, + "reward_std": 0.10606601648032665, + "rewards/equation_reward_func": 0.7267857491970062, + "rewards/format_reward_func": 0.9375000186264515, + "step": 6720 + }, + { + "completion_length": 240.53572368621826, + "epoch": 1.1270799279098034, + "grad_norm": 0.23950711514360806, + "kl": 0.1202392578125, + "learning_rate": 4.994864893439521e-07, + "loss": 0.0001, + "reward": 1.7178572043776512, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6722 + }, + { + "completion_length": 229.9866189956665, + "epoch": 1.127415231149671, + "grad_norm": 0.7546838926343592, + "kl": 0.2407379150390625, + "learning_rate": 4.994857205914952e-07, + "loss": 0.0002, + "reward": 1.6946429088711739, + "reward_std": 0.13384521193802357, + "rewards/equation_reward_func": 0.7437500320374966, + "rewards/format_reward_func": 0.9508928768336773, + "step": 6724 + }, + { + "completion_length": 234.6026906967163, + "epoch": 1.1277505343895387, + "grad_norm": 0.2525886081067382, + "kl": 0.0670013427734375, + "learning_rate": 4.994849512646297e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.1060660183429718, + "rewards/equation_reward_func": 0.7875000350177288, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6726 + }, + { + "completion_length": 227.79911708831787, + "epoch": 1.128085837629406, + "grad_norm": 0.3302151167549251, + "kl": 0.068359375, + "learning_rate": 4.994841813633574e-07, + "loss": 0.0001, + "reward": 1.73214291036129, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6728 + }, + { + "completion_length": 232.83036708831787, + "epoch": 1.1284211408692737, + "grad_norm": 0.27840527237917856, + "kl": 0.071990966796875, + "learning_rate": 4.994834108876798e-07, + "loss": 0.0001, + "reward": 1.734375037252903, + "reward_std": 0.0738674052990973, + "rewards/equation_reward_func": 0.7625000439584255, + "rewards/format_reward_func": 0.9718750081956387, + "step": 6730 + }, + { + "completion_length": 229.79911708831787, + "epoch": 1.128756444109141, + "grad_norm": 0.37449317940935545, + "kl": 0.0745849609375, + "learning_rate": 4.994826398375991e-07, + "loss": 0.0001, + "reward": 1.786160796880722, + "reward_std": 0.09028238197788596, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9941964335739613, + "step": 6732 + }, + { + "completion_length": 231.52233123779297, + "epoch": 1.1290917473490087, + "grad_norm": 0.4020724354337358, + "kl": 0.074310302734375, + "learning_rate": 4.994818682131168e-07, + "loss": 0.0001, + "reward": 1.8138393461704254, + "reward_std": 0.061240497045218945, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 0.995982151478529, + "step": 6734 + }, + { + "completion_length": 226.665189743042, + "epoch": 1.1294270505888764, + "grad_norm": 0.21480784890133833, + "kl": 0.0719451904296875, + "learning_rate": 4.994810960142347e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7625000327825546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6736 + }, + { + "completion_length": 237.665189743042, + "epoch": 1.1297623538287438, + "grad_norm": 0.42019404209191996, + "kl": 0.0841827392578125, + "learning_rate": 4.994803232409546e-07, + "loss": 0.0001, + "reward": 1.6464286521077156, + "reward_std": 0.1414213553071022, + "rewards/equation_reward_func": 0.6821428798139095, + "rewards/format_reward_func": 0.9642857313156128, + "step": 6738 + }, + { + "completion_length": 235.20536708831787, + "epoch": 1.1300976570686114, + "grad_norm": 0.22061191284016354, + "kl": 0.0779876708984375, + "learning_rate": 4.994795498932784e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7392857298254967, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6740 + }, + { + "completion_length": 236.31251049041748, + "epoch": 1.130432960308479, + "grad_norm": 0.26718414308599536, + "kl": 1.054351806640625, + "learning_rate": 4.994787759712077e-07, + "loss": 0.0011, + "reward": 1.7089286297559738, + "reward_std": 0.108591397292912, + "rewards/equation_reward_func": 0.7312500327825546, + "rewards/format_reward_func": 0.977678582072258, + "step": 6742 + }, + { + "completion_length": 230.5357255935669, + "epoch": 1.1307682635483465, + "grad_norm": 0.30451959283162633, + "kl": 0.08184814453125, + "learning_rate": 4.994780014747444e-07, + "loss": 0.0001, + "reward": 1.764285758137703, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7821429036557674, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6744 + }, + { + "completion_length": 240.34822845458984, + "epoch": 1.131103566788214, + "grad_norm": 0.2806823903013915, + "kl": 0.736236572265625, + "learning_rate": 4.994772264038901e-07, + "loss": 0.0007, + "reward": 1.7303572371602058, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7616071701049805, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6746 + }, + { + "completion_length": 242.71429538726807, + "epoch": 1.1314388700280817, + "grad_norm": 0.30587482030970475, + "kl": 0.078155517578125, + "learning_rate": 4.994764507586468e-07, + "loss": 0.0001, + "reward": 1.7696429267525673, + "reward_std": 0.09343911148607731, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9776785783469677, + "step": 6748 + }, + { + "completion_length": 220.5759038925171, + "epoch": 1.1317741732679492, + "grad_norm": 0.12420241190982315, + "kl": 0.0816192626953125, + "learning_rate": 4.994756745390163e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7464285995811224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6750 + }, + { + "completion_length": 229.2544755935669, + "epoch": 1.1321094765078168, + "grad_norm": 0.24521204153039233, + "kl": 0.078582763671875, + "learning_rate": 4.994748977450002e-07, + "loss": 0.0001, + "reward": 1.7053572088479996, + "reward_std": 0.10354063659906387, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9598214440047741, + "step": 6752 + }, + { + "completion_length": 236.79465293884277, + "epoch": 1.1324447797476842, + "grad_norm": 0.2621989103061339, + "kl": 0.086334228515625, + "learning_rate": 4.994741203766006e-07, + "loss": 0.0001, + "reward": 1.7267857417464256, + "reward_std": 0.11364216171205044, + "rewards/equation_reward_func": 0.7580357305705547, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6754 + }, + { + "completion_length": 230.37054538726807, + "epoch": 1.1327800829875518, + "grad_norm": 0.35612925150187297, + "kl": 0.126983642578125, + "learning_rate": 4.994733424338187e-07, + "loss": 0.0001, + "reward": 1.7160715237259865, + "reward_std": 0.11869292240589857, + "rewards/equation_reward_func": 0.7294643297791481, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6756 + }, + { + "completion_length": 232.8303680419922, + "epoch": 1.1331153862274195, + "grad_norm": 0.24637191100536926, + "kl": 0.08123779296875, + "learning_rate": 4.99472563916657e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.753571467474103, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6758 + }, + { + "completion_length": 218.29911708831787, + "epoch": 1.1334506894672869, + "grad_norm": 0.1348445612235299, + "kl": 0.06597900390625, + "learning_rate": 4.994717848251168e-07, + "loss": 0.0001, + "reward": 1.8089286237955093, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6760 + }, + { + "completion_length": 235.64286708831787, + "epoch": 1.1337859927071545, + "grad_norm": 0.25379818975244123, + "kl": 0.0818634033203125, + "learning_rate": 4.994710051592001e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.7758928909897804, + "rewards/format_reward_func": 0.9687500149011612, + "step": 6762 + }, + { + "completion_length": 225.30804824829102, + "epoch": 1.1341212959470222, + "grad_norm": 0.29357611013427315, + "kl": 0.6669158935546875, + "learning_rate": 4.994702249189087e-07, + "loss": 0.0007, + "reward": 1.7607143595814705, + "reward_std": 0.10606601554900408, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6764 + }, + { + "completion_length": 226.20982933044434, + "epoch": 1.1344565991868896, + "grad_norm": 0.22086866513243852, + "kl": 0.0677032470703125, + "learning_rate": 4.994694441042442e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7482143230736256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6766 + }, + { + "completion_length": 231.54018783569336, + "epoch": 1.1347919024267572, + "grad_norm": 0.2603920909323302, + "kl": 0.0783538818359375, + "learning_rate": 4.994686627152085e-07, + "loss": 0.0001, + "reward": 1.7500000447034836, + "reward_std": 0.09596449043601751, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6768 + }, + { + "completion_length": 233.05804634094238, + "epoch": 1.1351272056666248, + "grad_norm": 0.15957707276356545, + "kl": 0.07037353515625, + "learning_rate": 4.994678807518037e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.8008928820490837, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6770 + }, + { + "completion_length": 226.86161708831787, + "epoch": 1.1354625089064923, + "grad_norm": 0.2632334048762584, + "kl": 0.074432373046875, + "learning_rate": 4.994670982140312e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7803571633994579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6772 + }, + { + "completion_length": 232.81250953674316, + "epoch": 1.1357978121463599, + "grad_norm": 0.16245051836075658, + "kl": 0.0787353515625, + "learning_rate": 4.994663151018929e-07, + "loss": 0.0001, + "reward": 1.7589286118745804, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.977678582072258, + "step": 6774 + }, + { + "completion_length": 237.65179920196533, + "epoch": 1.1361331153862273, + "grad_norm": 0.3432961730562157, + "kl": 1.7099456787109375, + "learning_rate": 4.994655314153907e-07, + "loss": 0.0017, + "reward": 1.6964286267757416, + "reward_std": 0.12121830601245165, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 0.9642857313156128, + "step": 6776 + }, + { + "completion_length": 229.12947273254395, + "epoch": 1.136468418626095, + "grad_norm": 0.19895687920441474, + "kl": 0.0804595947265625, + "learning_rate": 4.994647471545265e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 6778 + }, + { + "completion_length": 226.87947463989258, + "epoch": 1.1368037218659626, + "grad_norm": 0.20275366249162494, + "kl": 0.0865631103515625, + "learning_rate": 4.994639623193017e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6780 + }, + { + "completion_length": 229.3080472946167, + "epoch": 1.1371390251058302, + "grad_norm": 0.25465394761532567, + "kl": 0.08404541015625, + "learning_rate": 4.994631769097186e-07, + "loss": 0.0001, + "reward": 1.6696429252624512, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.6830357611179352, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6782 + }, + { + "completion_length": 235.73215198516846, + "epoch": 1.1374743283456976, + "grad_norm": 0.2737771638741082, + "kl": 1.16455078125, + "learning_rate": 4.994623909257786e-07, + "loss": 0.0012, + "reward": 1.7500000447034836, + "reward_std": 0.08586296811699867, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 0.9642857313156128, + "step": 6784 + }, + { + "completion_length": 220.9509048461914, + "epoch": 1.1378096315855653, + "grad_norm": 0.19918002938971385, + "kl": 0.0744476318359375, + "learning_rate": 4.994616043674837e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6786 + }, + { + "completion_length": 217.20536518096924, + "epoch": 1.1381449348254327, + "grad_norm": 0.17679503465918528, + "kl": 0.079254150390625, + "learning_rate": 4.994608172348356e-07, + "loss": 0.0001, + "reward": 1.798214353621006, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6788 + }, + { + "completion_length": 221.59822368621826, + "epoch": 1.1384802380653003, + "grad_norm": 0.33131053910015773, + "kl": 0.076934814453125, + "learning_rate": 4.994600295278364e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000227242708, + "rewards/format_reward_func": 1.0, + "step": 6790 + }, + { + "completion_length": 213.90626049041748, + "epoch": 1.138815541305168, + "grad_norm": 0.3653836980839601, + "kl": 1.16046142578125, + "learning_rate": 4.994592412464876e-07, + "loss": 0.0012, + "reward": 1.7500000670552254, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7589286155998707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6792 + }, + { + "completion_length": 233.4017972946167, + "epoch": 1.1391508445450353, + "grad_norm": 0.27747762072907645, + "kl": 0.084259033203125, + "learning_rate": 4.994584523907914e-07, + "loss": 0.0001, + "reward": 1.655357226729393, + "reward_std": 0.09848987404257059, + "rewards/equation_reward_func": 0.6866071820259094, + "rewards/format_reward_func": 0.9687500111758709, + "step": 6794 + }, + { + "completion_length": 226.19643783569336, + "epoch": 1.139486147784903, + "grad_norm": 1.1883478179099713, + "kl": 3.068939208984375, + "learning_rate": 4.99457662960749e-07, + "loss": 0.0031, + "reward": 1.7125000581145287, + "reward_std": 0.1035406356677413, + "rewards/equation_reward_func": 0.7526785898953676, + "rewards/format_reward_func": 0.9598214440047741, + "step": 6796 + }, + { + "completion_length": 221.78572750091553, + "epoch": 1.1398214510247706, + "grad_norm": 0.2720848237059936, + "kl": 0.0783538818359375, + "learning_rate": 4.994568729563628e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7937500402331352, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6798 + }, + { + "completion_length": 215.40625953674316, + "epoch": 1.140156754264638, + "grad_norm": 0.17525951997006495, + "kl": 0.084381103515625, + "learning_rate": 4.994560823776342e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6800 + }, + { + "completion_length": 231.19643783569336, + "epoch": 1.1404920575045057, + "grad_norm": 0.5392629404213098, + "kl": 0.669952392578125, + "learning_rate": 4.994552912245653e-07, + "loss": 0.0007, + "reward": 1.7107143625617027, + "reward_std": 0.11616754252463579, + "rewards/equation_reward_func": 0.737500037997961, + "rewards/format_reward_func": 0.9732142984867096, + "step": 6802 + }, + { + "completion_length": 220.7991180419922, + "epoch": 1.1408273607443733, + "grad_norm": 0.38399115209840423, + "kl": 0.09869384765625, + "learning_rate": 4.994544994971579e-07, + "loss": 0.0001, + "reward": 1.7017858177423477, + "reward_std": 0.11869292240589857, + "rewards/equation_reward_func": 0.7151786051690578, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6804 + }, + { + "completion_length": 210.77679538726807, + "epoch": 1.1411626639842407, + "grad_norm": 0.3414302520044539, + "kl": 0.08026123046875, + "learning_rate": 4.994537071954136e-07, + "loss": 0.0001, + "reward": 1.7946428954601288, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7991071697324514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6806 + }, + { + "completion_length": 216.79018688201904, + "epoch": 1.1414979672241083, + "grad_norm": 0.35112093270640127, + "kl": 0.0818939208984375, + "learning_rate": 4.994529143193344e-07, + "loss": 0.0001, + "reward": 1.7053571790456772, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7187500260770321, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6808 + }, + { + "completion_length": 216.69197368621826, + "epoch": 1.1418332704639758, + "grad_norm": 0.27308493165841274, + "kl": 0.084320068359375, + "learning_rate": 4.994521208689222e-07, + "loss": 0.0001, + "reward": 1.7107143551111221, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7196428962051868, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6810 + }, + { + "completion_length": 229.5982255935669, + "epoch": 1.1421685737038434, + "grad_norm": 0.33048177298301895, + "kl": 0.0850372314453125, + "learning_rate": 4.994513268441786e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.09343911055475473, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.977678582072258, + "step": 6812 + }, + { + "completion_length": 216.61608219146729, + "epoch": 1.142503876943711, + "grad_norm": 0.16062072058112123, + "kl": 0.0807952880859375, + "learning_rate": 4.994505322451057e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.74910718947649, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6814 + }, + { + "completion_length": 212.56697273254395, + "epoch": 1.1428391801835784, + "grad_norm": 0.2508972626651724, + "kl": 0.085968017578125, + "learning_rate": 4.994497370717051e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6816 + }, + { + "completion_length": 221.79465293884277, + "epoch": 1.143174483423446, + "grad_norm": 0.28485351061728537, + "kl": 0.0821075439453125, + "learning_rate": 4.994489413239788e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6818 + }, + { + "completion_length": 217.49554538726807, + "epoch": 1.1435097866633137, + "grad_norm": 0.16398489059121946, + "kl": 0.0821380615234375, + "learning_rate": 4.994481450019285e-07, + "loss": 0.0001, + "reward": 1.817857176065445, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 6820 + }, + { + "completion_length": 217.56250858306885, + "epoch": 1.1438450899031811, + "grad_norm": 0.3405546953923567, + "kl": 0.085357666015625, + "learning_rate": 4.99447348105556e-07, + "loss": 0.0001, + "reward": 1.7325893640518188, + "reward_std": 0.08018085593357682, + "rewards/equation_reward_func": 0.7473214641213417, + "rewards/format_reward_func": 0.9852678664028645, + "step": 6822 + }, + { + "completion_length": 224.90179634094238, + "epoch": 1.1441803931430488, + "grad_norm": 0.2212434884477253, + "kl": 0.072967529296875, + "learning_rate": 4.994465506348633e-07, + "loss": 0.0001, + "reward": 1.717857226729393, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857324331999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6824 + }, + { + "completion_length": 216.48661613464355, + "epoch": 1.1445156963829164, + "grad_norm": 0.26409679570024003, + "kl": 0.088470458984375, + "learning_rate": 4.994457525898521e-07, + "loss": 0.0001, + "reward": 1.7482143640518188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786010712385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6826 + }, + { + "completion_length": 223.00000858306885, + "epoch": 1.1448509996227838, + "grad_norm": 0.1694539692321961, + "kl": 0.0811614990234375, + "learning_rate": 4.994449539705244e-07, + "loss": 0.0001, + "reward": 1.8147321939468384, + "reward_std": 0.03977475711144507, + "rewards/equation_reward_func": 0.8205357473343611, + "rewards/format_reward_func": 0.9941964335739613, + "step": 6828 + }, + { + "completion_length": 217.86608123779297, + "epoch": 1.1451863028626514, + "grad_norm": 0.2320165700751102, + "kl": 0.0828704833984375, + "learning_rate": 4.994441547768818e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.8062500283122063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6830 + }, + { + "completion_length": 219.41965103149414, + "epoch": 1.1455216061025189, + "grad_norm": 0.3201717426420188, + "kl": 0.08636474609375, + "learning_rate": 4.994433550089263e-07, + "loss": 0.0001, + "reward": 1.7446429431438446, + "reward_std": 0.08838834799826145, + "rewards/equation_reward_func": 0.7580357529222965, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6832 + }, + { + "completion_length": 215.90179538726807, + "epoch": 1.1458569093423865, + "grad_norm": 0.1761618860877667, + "kl": 0.084991455078125, + "learning_rate": 4.994425546666597e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8125000260770321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6834 + }, + { + "completion_length": 222.77679538726807, + "epoch": 1.1461922125822541, + "grad_norm": 0.23635481786912677, + "kl": 0.087860107421875, + "learning_rate": 4.99441753750084e-07, + "loss": 0.0001, + "reward": 1.7879464998841286, + "reward_std": 0.06755395047366619, + "rewards/equation_reward_func": 0.7982143200933933, + "rewards/format_reward_func": 0.9897321462631226, + "step": 6836 + }, + { + "completion_length": 225.16518878936768, + "epoch": 1.1465275158221215, + "grad_norm": 0.21894307591906984, + "kl": 0.09490966796875, + "learning_rate": 4.994409522592008e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7303571663796902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6838 + }, + { + "completion_length": 222.02679538726807, + "epoch": 1.1468628190619892, + "grad_norm": 0.24222570084093759, + "kl": 0.08447265625, + "learning_rate": 4.994401501940119e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7089286111295223, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6840 + }, + { + "completion_length": 233.45090579986572, + "epoch": 1.1471981223018568, + "grad_norm": 0.3205876539023307, + "kl": 0.08941650390625, + "learning_rate": 4.994393475545195e-07, + "loss": 0.0001, + "reward": 1.6843750774860382, + "reward_std": 0.08775700209662318, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 0.980803582817316, + "step": 6842 + }, + { + "completion_length": 225.56250858306885, + "epoch": 1.1475334255417242, + "grad_norm": 0.23515632938836778, + "kl": 0.0845794677734375, + "learning_rate": 4.994385443407252e-07, + "loss": 0.0001, + "reward": 1.7696428894996643, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7830357439815998, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6844 + }, + { + "completion_length": 227.61161708831787, + "epoch": 1.1478687287815919, + "grad_norm": 0.2374903179827161, + "kl": 0.1004638671875, + "learning_rate": 4.994377405526308e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7544643059372902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6846 + }, + { + "completion_length": 222.64733123779297, + "epoch": 1.1482040320214595, + "grad_norm": 0.40396699261744706, + "kl": 0.49755859375, + "learning_rate": 4.994369361902383e-07, + "loss": 0.0005, + "reward": 1.7196429371833801, + "reward_std": 0.10354063380509615, + "rewards/equation_reward_func": 0.733035746961832, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6848 + }, + { + "completion_length": 222.08036613464355, + "epoch": 1.148539335261327, + "grad_norm": 0.31380228580436215, + "kl": 0.088623046875, + "learning_rate": 4.994361312535495e-07, + "loss": 0.0001, + "reward": 1.6897322162985802, + "reward_std": 0.09154507238417864, + "rewards/equation_reward_func": 0.7223214562982321, + "rewards/format_reward_func": 0.9674107320606709, + "step": 6850 + }, + { + "completion_length": 211.91965293884277, + "epoch": 1.1488746385011945, + "grad_norm": 0.2694868807371034, + "kl": 0.081146240234375, + "learning_rate": 4.994353257425661e-07, + "loss": 0.0001, + "reward": 1.7839286103844643, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7973214536905289, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6852 + }, + { + "completion_length": 223.18304538726807, + "epoch": 1.1492099417410622, + "grad_norm": 0.24600340041907806, + "kl": 0.798309326171875, + "learning_rate": 4.994345196572903e-07, + "loss": 0.0008, + "reward": 1.7232143580913544, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7366071790456772, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6854 + }, + { + "completion_length": 230.9419755935669, + "epoch": 1.1495452449809296, + "grad_norm": 0.15237309305910776, + "kl": 0.0922698974609375, + "learning_rate": 4.994337129977235e-07, + "loss": 0.0001, + "reward": 1.7254465147852898, + "reward_std": 0.09154507424682379, + "rewards/equation_reward_func": 0.7401786036789417, + "rewards/format_reward_func": 0.9852678626775742, + "step": 6856 + }, + { + "completion_length": 228.14733219146729, + "epoch": 1.1498805482207972, + "grad_norm": 0.22325111909912654, + "kl": 0.09490966796875, + "learning_rate": 4.99432905763868e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7767857350409031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6858 + }, + { + "completion_length": 227.51340293884277, + "epoch": 1.1502158514606649, + "grad_norm": 0.2710958581186891, + "kl": 0.7169189453125, + "learning_rate": 4.994320979557255e-07, + "loss": 0.0007, + "reward": 1.7540179193019867, + "reward_std": 0.11553619476035237, + "rewards/equation_reward_func": 0.7687500305473804, + "rewards/format_reward_func": 0.9852678664028645, + "step": 6860 + }, + { + "completion_length": 222.80804634094238, + "epoch": 1.1505511547005323, + "grad_norm": 0.26345604424103974, + "kl": 0.079986572265625, + "learning_rate": 4.994312895732978e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7839285917580128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6862 + }, + { + "completion_length": 227.67858123779297, + "epoch": 1.1508864579404, + "grad_norm": 0.26812911857919164, + "kl": 0.0786590576171875, + "learning_rate": 4.994304806165867e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.8017857484519482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6864 + }, + { + "completion_length": 235.8616189956665, + "epoch": 1.1512217611802673, + "grad_norm": 0.2522975419476787, + "kl": 0.08685302734375, + "learning_rate": 4.994296710855942e-07, + "loss": 0.0001, + "reward": 1.6357143446803093, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.6803571656346321, + "rewards/format_reward_func": 0.955357164144516, + "step": 6866 + }, + { + "completion_length": 234.99554538726807, + "epoch": 1.151557064420135, + "grad_norm": 0.25997920012377895, + "kl": 0.07647705078125, + "learning_rate": 4.994288609803221e-07, + "loss": 0.0001, + "reward": 1.7142858058214188, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 0.9821428656578064, + "step": 6868 + }, + { + "completion_length": 234.01340579986572, + "epoch": 1.1518923676600026, + "grad_norm": 0.547764225858209, + "kl": 0.92034912109375, + "learning_rate": 4.994280503007723e-07, + "loss": 0.0009, + "reward": 1.7598215192556381, + "reward_std": 0.11743023293092847, + "rewards/equation_reward_func": 0.7848214469850063, + "rewards/format_reward_func": 0.975000012665987, + "step": 6870 + }, + { + "completion_length": 223.48215293884277, + "epoch": 1.15222767089987, + "grad_norm": 0.004337909642485559, + "kl": 0.0759124755859375, + "learning_rate": 4.994272390469467e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 6872 + }, + { + "completion_length": 227.67411994934082, + "epoch": 1.1525629741397376, + "grad_norm": 0.26786762702478173, + "kl": 0.093414306640625, + "learning_rate": 4.99426427218847e-07, + "loss": 0.0001, + "reward": 1.745089367032051, + "reward_std": 0.0675539500080049, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 0.9986607171595097, + "step": 6874 + }, + { + "completion_length": 223.37947368621826, + "epoch": 1.1528982773796053, + "grad_norm": 0.14117511452087392, + "kl": 0.0843505859375, + "learning_rate": 4.994256148164752e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7803571801632643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6876 + }, + { + "completion_length": 227.93304634094238, + "epoch": 1.1532335806194727, + "grad_norm": 0.1684037263529093, + "kl": 0.08544921875, + "learning_rate": 4.994248018398332e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 6878 + }, + { + "completion_length": 223.60268783569336, + "epoch": 1.1535688838593403, + "grad_norm": 0.23348290820565865, + "kl": 0.0734100341796875, + "learning_rate": 4.994239882889229e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 6880 + }, + { + "completion_length": 229.4553680419922, + "epoch": 1.153904187099208, + "grad_norm": 0.21225142595897906, + "kl": 0.092926025390625, + "learning_rate": 4.99423174163746e-07, + "loss": 0.0001, + "reward": 1.8196429163217545, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8241071589291096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6882 + }, + { + "completion_length": 226.08036994934082, + "epoch": 1.1542394903390754, + "grad_norm": 0.3118696688418131, + "kl": 0.1126251220703125, + "learning_rate": 4.994223594643044e-07, + "loss": 0.0001, + "reward": 1.756250061094761, + "reward_std": 0.06818529823794961, + "rewards/equation_reward_func": 0.7723214607685804, + "rewards/format_reward_func": 0.9839285798370838, + "step": 6884 + }, + { + "completion_length": 220.00000762939453, + "epoch": 1.154574793578943, + "grad_norm": 0.20714372614678384, + "kl": 0.1049652099609375, + "learning_rate": 4.994215441906003e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 6886 + }, + { + "completion_length": 227.08036994934082, + "epoch": 1.1549100968188104, + "grad_norm": 0.26264781214443184, + "kl": 0.074127197265625, + "learning_rate": 4.99420728342635e-07, + "loss": 0.0001, + "reward": 1.6964286342263222, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6964285932481289, + "rewards/format_reward_func": 1.0, + "step": 6888 + }, + { + "completion_length": 221.78572273254395, + "epoch": 1.155245400058678, + "grad_norm": 0.18060283117654174, + "kl": 0.096527099609375, + "learning_rate": 4.994199119204109e-07, + "loss": 0.0001, + "reward": 1.6803572326898575, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.6848214734345675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6890 + }, + { + "completion_length": 224.6384048461914, + "epoch": 1.1555807032985457, + "grad_norm": 0.31072415977608814, + "kl": 0.2027130126953125, + "learning_rate": 4.994190949239297e-07, + "loss": 0.0002, + "reward": 1.7375000640749931, + "reward_std": 0.06818529404699802, + "rewards/equation_reward_func": 0.74196432903409, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6892 + }, + { + "completion_length": 223.92411708831787, + "epoch": 1.155916006538413, + "grad_norm": 0.42115750864099694, + "kl": 0.1059722900390625, + "learning_rate": 4.994182773531932e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6894 + }, + { + "completion_length": 223.92411613464355, + "epoch": 1.1562513097782807, + "grad_norm": 0.1894086306269926, + "kl": 0.0964202880859375, + "learning_rate": 4.994174592082034e-07, + "loss": 0.0001, + "reward": 1.7857143729925156, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.7946428842842579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6896 + }, + { + "completion_length": 226.8035831451416, + "epoch": 1.1565866130181484, + "grad_norm": 0.19003955983787937, + "kl": 0.0852813720703125, + "learning_rate": 4.994166404889621e-07, + "loss": 0.0001, + "reward": 1.7178572043776512, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857417464256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6898 + }, + { + "completion_length": 226.31697368621826, + "epoch": 1.1569219162580158, + "grad_norm": 0.23749390714239918, + "kl": 0.1407928466796875, + "learning_rate": 4.994158211954712e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7910714484751225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6900 + }, + { + "completion_length": 213.43750762939453, + "epoch": 1.1572572194978834, + "grad_norm": 0.16679608761011747, + "kl": 0.100677490234375, + "learning_rate": 4.994150013277326e-07, + "loss": 0.0001, + "reward": 1.8392857536673546, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8392857387661934, + "rewards/format_reward_func": 1.0, + "step": 6902 + }, + { + "completion_length": 216.508939743042, + "epoch": 1.157592522737751, + "grad_norm": 0.22082425907055644, + "kl": 0.10552978515625, + "learning_rate": 4.994141808857483e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 6904 + }, + { + "completion_length": 220.37054538726807, + "epoch": 1.1579278259776185, + "grad_norm": 0.5021244800347835, + "kl": 0.0793609619140625, + "learning_rate": 4.9941335986952e-07, + "loss": 0.0001, + "reward": 1.7160715013742447, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7205357514321804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6906 + }, + { + "completion_length": 217.09822368621826, + "epoch": 1.158263129217486, + "grad_norm": 0.20157959080101762, + "kl": 0.08599853515625, + "learning_rate": 4.994125382790496e-07, + "loss": 0.0001, + "reward": 1.7482143864035606, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6908 + }, + { + "completion_length": 214.39733028411865, + "epoch": 1.1585984324573535, + "grad_norm": 0.17372353002627805, + "kl": 0.075927734375, + "learning_rate": 4.994117161143392e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 6910 + }, + { + "completion_length": 217.69197177886963, + "epoch": 1.1589337356972211, + "grad_norm": 0.18357542006548058, + "kl": 0.0815887451171875, + "learning_rate": 4.994108933753905e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 6912 + }, + { + "completion_length": 216.56250953674316, + "epoch": 1.1592690389370888, + "grad_norm": 0.14700847457999847, + "kl": 0.0750274658203125, + "learning_rate": 4.994100700622054e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 6914 + }, + { + "completion_length": 227.56251049041748, + "epoch": 1.1596043421769564, + "grad_norm": 0.24436108533912068, + "kl": 0.0693511962890625, + "learning_rate": 4.994092461747859e-07, + "loss": 0.0001, + "reward": 1.7303572371602058, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6916 + }, + { + "completion_length": 212.84375762939453, + "epoch": 1.1599396454168238, + "grad_norm": 0.24133830306304005, + "kl": 0.0664825439453125, + "learning_rate": 4.994084217131338e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 6918 + }, + { + "completion_length": 212.49554443359375, + "epoch": 1.1602749486566915, + "grad_norm": 0.2615600831239175, + "kl": 0.07427978515625, + "learning_rate": 4.994075966772511e-07, + "loss": 0.0001, + "reward": 1.707142911851406, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7071428894996643, + "rewards/format_reward_func": 1.0, + "step": 6920 + }, + { + "completion_length": 224.66965293884277, + "epoch": 1.1606102518965589, + "grad_norm": 0.15686356558490766, + "kl": 0.4253997802734375, + "learning_rate": 4.994067710671396e-07, + "loss": 0.0004, + "reward": 1.7267857640981674, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7312500439584255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6922 + }, + { + "completion_length": 215.59822273254395, + "epoch": 1.1609455551364265, + "grad_norm": 0.22075119949291844, + "kl": 0.071746826171875, + "learning_rate": 4.994059448828011e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 6924 + }, + { + "completion_length": 224.16518878936768, + "epoch": 1.1612808583762941, + "grad_norm": 0.2866122016574234, + "kl": 0.0772247314453125, + "learning_rate": 4.994051181242379e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 6926 + }, + { + "completion_length": 199.56250858306885, + "epoch": 1.1616161616161615, + "grad_norm": 0.2835153067499442, + "kl": 0.07659912109375, + "learning_rate": 4.994042907914514e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 6928 + }, + { + "completion_length": 220.72768878936768, + "epoch": 1.1619514648560292, + "grad_norm": 0.22777164421377802, + "kl": 0.6842498779296875, + "learning_rate": 4.99403462884444e-07, + "loss": 0.0007, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571589291096, + "rewards/format_reward_func": 1.0, + "step": 6930 + }, + { + "completion_length": 219.58482933044434, + "epoch": 1.1622867680958968, + "grad_norm": 0.17939756568933712, + "kl": 0.073150634765625, + "learning_rate": 4.994026344032172e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7696428932249546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6932 + }, + { + "completion_length": 210.61608028411865, + "epoch": 1.1626220713357642, + "grad_norm": 0.3478303221101954, + "kl": 0.0870361328125, + "learning_rate": 4.994018053477731e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 6934 + }, + { + "completion_length": 210.92858219146729, + "epoch": 1.1629573745756319, + "grad_norm": 0.1651791347237627, + "kl": 0.0748748779296875, + "learning_rate": 4.994009757181136e-07, + "loss": 0.0001, + "reward": 1.758928656578064, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6936 + }, + { + "completion_length": 215.88393783569336, + "epoch": 1.1632926778154995, + "grad_norm": 0.3029528530372732, + "kl": 0.0755157470703125, + "learning_rate": 4.994001455142405e-07, + "loss": 0.0001, + "reward": 1.7017857730388641, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7062500566244125, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6938 + }, + { + "completion_length": 211.4419755935669, + "epoch": 1.163627981055367, + "grad_norm": 0.26122991877460666, + "kl": 0.087005615234375, + "learning_rate": 4.993993147361558e-07, + "loss": 0.0001, + "reward": 1.7000000700354576, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7089286148548126, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6940 + }, + { + "completion_length": 208.23661613464355, + "epoch": 1.1639632842952345, + "grad_norm": 0.2747568610792723, + "kl": 0.0793609619140625, + "learning_rate": 4.993984833838615e-07, + "loss": 0.0001, + "reward": 1.785714328289032, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 6942 + }, + { + "completion_length": 213.88393688201904, + "epoch": 1.164298587535102, + "grad_norm": 0.2901435145294046, + "kl": 0.07421875, + "learning_rate": 4.993976514573593e-07, + "loss": 0.0001, + "reward": 1.692857213318348, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.692857176065445, + "rewards/format_reward_func": 1.0, + "step": 6944 + }, + { + "completion_length": 214.64286708831787, + "epoch": 1.1646338907749696, + "grad_norm": 0.2537096914951688, + "kl": 0.07794189453125, + "learning_rate": 4.993968189566513e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 6946 + }, + { + "completion_length": 223.53572273254395, + "epoch": 1.1649691940148372, + "grad_norm": 0.2668001365446884, + "kl": 0.0701446533203125, + "learning_rate": 4.993959858817395e-07, + "loss": 0.0001, + "reward": 1.7732143253087997, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7776786088943481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6948 + }, + { + "completion_length": 218.05358123779297, + "epoch": 1.1653044972547046, + "grad_norm": 0.21744701461798605, + "kl": 0.0718994140625, + "learning_rate": 4.993951522326255e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6950 + }, + { + "completion_length": 229.17858123779297, + "epoch": 1.1656398004945723, + "grad_norm": 0.21091420914175668, + "kl": 0.073822021484375, + "learning_rate": 4.993943180093114e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 6952 + }, + { + "completion_length": 229.37500953674316, + "epoch": 1.16597510373444, + "grad_norm": 0.28567924027709884, + "kl": 0.0760650634765625, + "learning_rate": 4.993934832117991e-07, + "loss": 0.0001, + "reward": 1.7125000730156898, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.725892897695303, + "rewards/format_reward_func": 0.9866071492433548, + "step": 6954 + }, + { + "completion_length": 226.1384038925171, + "epoch": 1.1663104069743073, + "grad_norm": 0.30841935931803693, + "kl": 0.0717315673828125, + "learning_rate": 4.993926478400906e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 6956 + }, + { + "completion_length": 235.94643783569336, + "epoch": 1.166645710214175, + "grad_norm": 0.23208670864060132, + "kl": 0.3082733154296875, + "learning_rate": 4.993918118941877e-07, + "loss": 0.0003, + "reward": 1.7410715222358704, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6958 + }, + { + "completion_length": 224.87054443359375, + "epoch": 1.1669810134540426, + "grad_norm": 0.14985084192465364, + "kl": 0.0709381103515625, + "learning_rate": 4.993909753740924e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 6960 + }, + { + "completion_length": 235.30358123779297, + "epoch": 1.16731631669391, + "grad_norm": 0.24284178348934188, + "kl": 0.543304443359375, + "learning_rate": 4.993901382798067e-07, + "loss": 0.0005, + "reward": 1.766071505844593, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6962 + }, + { + "completion_length": 234.59822463989258, + "epoch": 1.1676516199337776, + "grad_norm": 0.15701413152998836, + "kl": 0.3557891845703125, + "learning_rate": 4.993893006113323e-07, + "loss": 0.0004, + "reward": 1.7339285984635353, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.977678582072258, + "step": 6964 + }, + { + "completion_length": 234.5134048461914, + "epoch": 1.167986923173645, + "grad_norm": 0.22042202431841593, + "kl": 0.0718994140625, + "learning_rate": 4.993884623686713e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857320606709, + "rewards/format_reward_func": 1.0, + "step": 6966 + }, + { + "completion_length": 234.33929443359375, + "epoch": 1.1683222264135127, + "grad_norm": 0.2321463814591959, + "kl": 0.083343505859375, + "learning_rate": 4.993876235518256e-07, + "loss": 0.0001, + "reward": 1.7160714715719223, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7205357551574707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6968 + }, + { + "completion_length": 237.26340198516846, + "epoch": 1.1686575296533803, + "grad_norm": 0.23375655605151902, + "kl": 0.0672149658203125, + "learning_rate": 4.993867841607972e-07, + "loss": 0.0001, + "reward": 1.7535714656114578, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714842379093, + "rewards/format_reward_func": 1.0, + "step": 6970 + }, + { + "completion_length": 236.7321538925171, + "epoch": 1.1689928328932477, + "grad_norm": 0.16916676139529008, + "kl": 0.067230224609375, + "learning_rate": 4.993859441955879e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7348214723169804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6972 + }, + { + "completion_length": 227.88393878936768, + "epoch": 1.1693281361331154, + "grad_norm": 0.21767153944629555, + "kl": 0.068359375, + "learning_rate": 4.993851036561996e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8026785887777805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6974 + }, + { + "completion_length": 238.67858219146729, + "epoch": 1.169663439372983, + "grad_norm": 0.2871662999160085, + "kl": 0.0770721435546875, + "learning_rate": 4.993842625426344e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214577883482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6976 + }, + { + "completion_length": 243.97322750091553, + "epoch": 1.1699987426128504, + "grad_norm": 0.2003109959117877, + "kl": 0.0980682373046875, + "learning_rate": 4.993834208548942e-07, + "loss": 0.0001, + "reward": 1.7178572341799736, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7267857436090708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 6978 + }, + { + "completion_length": 243.05358409881592, + "epoch": 1.170334045852718, + "grad_norm": 0.4412118535614136, + "kl": 0.1660919189453125, + "learning_rate": 4.993825785929808e-07, + "loss": 0.0002, + "reward": 1.7375000640749931, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6980 + }, + { + "completion_length": 238.2009038925171, + "epoch": 1.1706693490925857, + "grad_norm": 0.2531765682553049, + "kl": 0.070648193359375, + "learning_rate": 4.993817357568963e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6982 + }, + { + "completion_length": 237.69197463989258, + "epoch": 1.171004652332453, + "grad_norm": 0.1307315755554436, + "kl": 0.07025146484375, + "learning_rate": 4.993808923466427e-07, + "loss": 0.0001, + "reward": 1.8071429207921028, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071429021656513, + "rewards/format_reward_func": 1.0, + "step": 6984 + }, + { + "completion_length": 229.9241180419922, + "epoch": 1.1713399555723207, + "grad_norm": 0.16282934602909968, + "kl": 0.0719757080078125, + "learning_rate": 4.993800483622217e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285872876644, + "rewards/format_reward_func": 1.0, + "step": 6986 + }, + { + "completion_length": 235.06250858306885, + "epoch": 1.1716752588121884, + "grad_norm": 0.15396331685512085, + "kl": 0.061370849609375, + "learning_rate": 4.993792038036353e-07, + "loss": 0.0001, + "reward": 1.8785714581608772, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8785714544355869, + "rewards/format_reward_func": 1.0, + "step": 6988 + }, + { + "completion_length": 230.7321538925171, + "epoch": 1.1720105620520558, + "grad_norm": 0.20821104135962984, + "kl": 0.1810455322265625, + "learning_rate": 4.993783586708857e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 6990 + }, + { + "completion_length": 239.67411994934082, + "epoch": 1.1723458652919234, + "grad_norm": 0.13161959537442824, + "kl": 0.0686798095703125, + "learning_rate": 4.993775129639744e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 6992 + }, + { + "completion_length": 240.89286994934082, + "epoch": 1.172681168531791, + "grad_norm": 0.20917233948066855, + "kl": 0.2086334228515625, + "learning_rate": 4.993766666829038e-07, + "loss": 0.0002, + "reward": 1.7125000655651093, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7169643230736256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 6994 + }, + { + "completion_length": 242.56697273254395, + "epoch": 1.1730164717716585, + "grad_norm": 0.14468665687786536, + "kl": 0.064697265625, + "learning_rate": 4.993758198276756e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571529686451, + "rewards/format_reward_func": 1.0, + "step": 6996 + }, + { + "completion_length": 246.53126049041748, + "epoch": 1.173351775011526, + "grad_norm": 0.24789853835463235, + "kl": 0.076202392578125, + "learning_rate": 4.993749723982918e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.817857176065445, + "rewards/format_reward_func": 1.0, + "step": 6998 + }, + { + "completion_length": 237.70983219146729, + "epoch": 1.1736870782513935, + "grad_norm": 0.22220896735230375, + "kl": 0.0637969970703125, + "learning_rate": 4.993741243947544e-07, + "loss": 0.0001, + "reward": 1.8357143327593803, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8357143066823483, + "rewards/format_reward_func": 1.0, + "step": 7000 + }, + { + "completion_length": 243.8616180419922, + "epoch": 1.1740223814912611, + "grad_norm": 0.24457270120564578, + "kl": 0.060516357421875, + "learning_rate": 4.993732758170652e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 7002 + }, + { + "completion_length": 234.6116189956665, + "epoch": 1.1743576847311288, + "grad_norm": 0.25798999402321, + "kl": 0.06378173828125, + "learning_rate": 4.993724266652263e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 7004 + }, + { + "completion_length": 243.3660831451416, + "epoch": 1.1746929879709962, + "grad_norm": 0.12745495764158657, + "kl": 0.0615386962890625, + "learning_rate": 4.993715769392397e-07, + "loss": 0.0001, + "reward": 1.7053572088479996, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.709821468219161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7006 + }, + { + "completion_length": 241.02679443359375, + "epoch": 1.1750282912108638, + "grad_norm": 0.2753649872008471, + "kl": 0.11676025390625, + "learning_rate": 4.993707266391072e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7767857611179352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7008 + }, + { + "completion_length": 236.1473331451416, + "epoch": 1.1753635944507315, + "grad_norm": 0.21138871590711736, + "kl": 0.42535400390625, + "learning_rate": 4.993698757648308e-07, + "loss": 0.0004, + "reward": 1.8035714700818062, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 7010 + }, + { + "completion_length": 236.2946538925171, + "epoch": 1.1756988976905989, + "grad_norm": 0.20702311310531854, + "kl": 0.150177001953125, + "learning_rate": 4.993690243164125e-07, + "loss": 0.0002, + "reward": 1.7178572192788124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571633994579, + "rewards/format_reward_func": 1.0, + "step": 7012 + }, + { + "completion_length": 246.5000123977661, + "epoch": 1.1760342009304665, + "grad_norm": 0.24995814190921872, + "kl": 0.3637237548828125, + "learning_rate": 4.993681722938542e-07, + "loss": 0.0004, + "reward": 1.7696429342031479, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7014 + }, + { + "completion_length": 241.29912090301514, + "epoch": 1.1763695041703341, + "grad_norm": 0.1647114277394096, + "kl": 0.104705810546875, + "learning_rate": 4.993673196971581e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 7016 + }, + { + "completion_length": 244.20983409881592, + "epoch": 1.1767048074102016, + "grad_norm": 0.2431811099491955, + "kl": 0.0694427490234375, + "learning_rate": 4.993664665263258e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285928755999, + "rewards/format_reward_func": 1.0, + "step": 7018 + }, + { + "completion_length": 250.6562623977661, + "epoch": 1.1770401106500692, + "grad_norm": 0.2823400285980501, + "kl": 0.07000732421875, + "learning_rate": 4.993656127813594e-07, + "loss": 0.0001, + "reward": 1.691071517765522, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.6955357547849417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7020 + }, + { + "completion_length": 243.43304634094238, + "epoch": 1.1773754138899366, + "grad_norm": 0.2006690204806412, + "kl": 0.0662994384765625, + "learning_rate": 4.99364758462261e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 7022 + }, + { + "completion_length": 239.68304538726807, + "epoch": 1.1777107171298042, + "grad_norm": 0.25763110318507637, + "kl": 0.735565185546875, + "learning_rate": 4.993639035690325e-07, + "loss": 0.0007, + "reward": 1.7750000804662704, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 7024 + }, + { + "completion_length": 243.7634038925171, + "epoch": 1.1780460203696719, + "grad_norm": 0.2525095936361976, + "kl": 0.0626220703125, + "learning_rate": 4.993630481016758e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8214286044239998, + "rewards/format_reward_func": 1.0, + "step": 7026 + }, + { + "completion_length": 238.9330472946167, + "epoch": 1.1783813236095393, + "grad_norm": 0.32584782130502027, + "kl": 0.08795166015625, + "learning_rate": 4.993621920601928e-07, + "loss": 0.0001, + "reward": 1.7750001102685928, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 7028 + }, + { + "completion_length": 238.77233219146729, + "epoch": 1.178716626849407, + "grad_norm": 0.18152868420415122, + "kl": 0.1728973388671875, + "learning_rate": 4.993613354445857e-07, + "loss": 0.0002, + "reward": 1.7464286237955093, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.755357176065445, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7030 + }, + { + "completion_length": 234.59822463989258, + "epoch": 1.1790519300892746, + "grad_norm": 0.21606357065813922, + "kl": 0.0672607421875, + "learning_rate": 4.993604782548563e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.06313453521579504, + "rewards/equation_reward_func": 0.7544643208384514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7032 + }, + { + "completion_length": 245.071439743042, + "epoch": 1.179387233329142, + "grad_norm": 0.11731433901397256, + "kl": 0.0707244873046875, + "learning_rate": 4.993596204910067e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7034 + }, + { + "completion_length": 234.6741189956665, + "epoch": 1.1797225365690096, + "grad_norm": 0.15904667048164609, + "kl": 0.066925048828125, + "learning_rate": 4.993587621530386e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7036 + }, + { + "completion_length": 244.46875953674316, + "epoch": 1.1800578398088772, + "grad_norm": 0.21918559336727544, + "kl": 0.068328857421875, + "learning_rate": 4.993579032409544e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 7038 + }, + { + "completion_length": 241.32144165039062, + "epoch": 1.1803931430487447, + "grad_norm": 0.28183054748017666, + "kl": 0.1956024169921875, + "learning_rate": 4.993570437547558e-07, + "loss": 0.0002, + "reward": 1.8089286535978317, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7040 + }, + { + "completion_length": 240.8571538925171, + "epoch": 1.1807284462886123, + "grad_norm": 0.17991330132898947, + "kl": 0.0831451416015625, + "learning_rate": 4.993561836944447e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 7042 + }, + { + "completion_length": 244.83929920196533, + "epoch": 1.1810637495284797, + "grad_norm": 0.15519989350192553, + "kl": 0.0687103271484375, + "learning_rate": 4.993553230600233e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 7044 + }, + { + "completion_length": 244.7812614440918, + "epoch": 1.1813990527683473, + "grad_norm": 0.2493715816928291, + "kl": 0.0808258056640625, + "learning_rate": 4.993544618514935e-07, + "loss": 0.0001, + "reward": 1.7040179297327995, + "reward_std": 0.06502856919541955, + "rewards/equation_reward_func": 0.7098214626312256, + "rewards/format_reward_func": 0.9941964335739613, + "step": 7046 + }, + { + "completion_length": 250.14287090301514, + "epoch": 1.181734356008215, + "grad_norm": 0.22729720223898298, + "kl": 0.0684661865234375, + "learning_rate": 4.993536000688573e-07, + "loss": 0.0001, + "reward": 1.6785715147852898, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.6785714477300644, + "rewards/format_reward_func": 1.0, + "step": 7048 + }, + { + "completion_length": 248.52679538726807, + "epoch": 1.1820696592480826, + "grad_norm": 0.2324995386469855, + "kl": 0.06976318359375, + "learning_rate": 4.993527377121166e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7050 + }, + { + "completion_length": 248.4419765472412, + "epoch": 1.18240496248795, + "grad_norm": 0.18319454840783975, + "kl": 0.0713653564453125, + "learning_rate": 4.993518747812735e-07, + "loss": 0.0001, + "reward": 1.7107143551111221, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9830357283353806, + "step": 7052 + }, + { + "completion_length": 238.18751049041748, + "epoch": 1.1827402657278177, + "grad_norm": 0.23620609642510929, + "kl": 0.100982666015625, + "learning_rate": 4.993510112763299e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500201165676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7054 + }, + { + "completion_length": 253.98661994934082, + "epoch": 1.183075568967685, + "grad_norm": 0.2541844366706011, + "kl": 0.0661773681640625, + "learning_rate": 4.993501471972879e-07, + "loss": 0.0001, + "reward": 1.7250000983476639, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7339286096394062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7056 + }, + { + "completion_length": 243.8214406967163, + "epoch": 1.1834108722075527, + "grad_norm": 0.19942797787478084, + "kl": 0.07354736328125, + "learning_rate": 4.993492825441493e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857320606709, + "rewards/format_reward_func": 1.0, + "step": 7058 + }, + { + "completion_length": 236.7232265472412, + "epoch": 1.1837461754474203, + "grad_norm": 0.21271859218155764, + "kl": 0.065704345703125, + "learning_rate": 4.993484173169162e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 7060 + }, + { + "completion_length": 239.19197368621826, + "epoch": 1.1840814786872877, + "grad_norm": 0.28199565232331336, + "kl": 0.12799072265625, + "learning_rate": 4.993475515155906e-07, + "loss": 0.0001, + "reward": 1.758928656578064, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7062 + }, + { + "completion_length": 246.8928680419922, + "epoch": 1.1844167819271554, + "grad_norm": 0.25315393129614683, + "kl": 0.5302581787109375, + "learning_rate": 4.993466851401745e-07, + "loss": 0.0005, + "reward": 1.7571429163217545, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7064 + }, + { + "completion_length": 235.8125123977661, + "epoch": 1.184752085167023, + "grad_norm": 0.2698316473301265, + "kl": 0.0690765380859375, + "learning_rate": 4.993458181906699e-07, + "loss": 0.0001, + "reward": 1.8089286237955093, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7066 + }, + { + "completion_length": 244.77679538726807, + "epoch": 1.1850873884068904, + "grad_norm": 0.2320733825817402, + "kl": 0.0703582763671875, + "learning_rate": 4.993449506670788e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7068 + }, + { + "completion_length": 259.99554920196533, + "epoch": 1.185422691646758, + "grad_norm": 0.24226402232737604, + "kl": 0.0719451904296875, + "learning_rate": 4.99344082569403e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.08081220369786024, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 0.9821428656578064, + "step": 7070 + }, + { + "completion_length": 247.50447940826416, + "epoch": 1.1857579948866257, + "grad_norm": 0.18774264494588802, + "kl": 0.0692596435546875, + "learning_rate": 4.993432138976448e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000324100256, + "rewards/format_reward_func": 1.0, + "step": 7072 + }, + { + "completion_length": 254.71876049041748, + "epoch": 1.1860932981264931, + "grad_norm": 0.2447953619963093, + "kl": 0.2768707275390625, + "learning_rate": 4.993423446518063e-07, + "loss": 0.0003, + "reward": 1.7392857894301414, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7074 + }, + { + "completion_length": 252.3259038925171, + "epoch": 1.1864286013663607, + "grad_norm": 0.2295580947248307, + "kl": 0.15301513671875, + "learning_rate": 4.99341474831889e-07, + "loss": 0.0002, + "reward": 1.721428632736206, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7303571682423353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7076 + }, + { + "completion_length": 255.95536422729492, + "epoch": 1.1867639046062282, + "grad_norm": 0.2032927882840729, + "kl": 0.0631103515625, + "learning_rate": 4.993406044378951e-07, + "loss": 0.0001, + "reward": 1.7196429371833801, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071779280901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7078 + }, + { + "completion_length": 253.1696548461914, + "epoch": 1.1870992078460958, + "grad_norm": 0.2855426504789579, + "kl": 0.1221771240234375, + "learning_rate": 4.993397334698269e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 7080 + }, + { + "completion_length": 259.46429538726807, + "epoch": 1.1874345110859634, + "grad_norm": 0.3224579946182638, + "kl": 0.1277923583984375, + "learning_rate": 4.993388619276861e-07, + "loss": 0.0001, + "reward": 1.7053572162985802, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214700818062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7082 + }, + { + "completion_length": 247.4151906967163, + "epoch": 1.1877698143258308, + "grad_norm": 0.18945706403060317, + "kl": 0.0734100341796875, + "learning_rate": 4.993379898114748e-07, + "loss": 0.0001, + "reward": 1.7250000536441803, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7339286059141159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7084 + }, + { + "completion_length": 253.5312614440918, + "epoch": 1.1881051175656985, + "grad_norm": 0.2508063849897521, + "kl": 0.0714569091796875, + "learning_rate": 4.99337117121195e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928768336773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7086 + }, + { + "completion_length": 242.6830472946167, + "epoch": 1.1884404208055661, + "grad_norm": 0.21219012626209383, + "kl": 0.0686187744140625, + "learning_rate": 4.993362438568487e-07, + "loss": 0.0001, + "reward": 1.8035714998841286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714402794838, + "rewards/format_reward_func": 1.0, + "step": 7088 + }, + { + "completion_length": 237.0357265472412, + "epoch": 1.1887757240454335, + "grad_norm": 0.19441894647892927, + "kl": 0.092987060546875, + "learning_rate": 4.993353700184379e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7090 + }, + { + "completion_length": 257.4017972946167, + "epoch": 1.1891110272853012, + "grad_norm": 0.2658164508523097, + "kl": 0.063018798828125, + "learning_rate": 4.993344956059646e-07, + "loss": 0.0001, + "reward": 1.7125001028180122, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7258928902447224, + "rewards/format_reward_func": 0.9866071455180645, + "step": 7092 + }, + { + "completion_length": 254.77679920196533, + "epoch": 1.1894463305251688, + "grad_norm": 0.13371165163675044, + "kl": 0.062957763671875, + "learning_rate": 4.99333620619431e-07, + "loss": 0.0001, + "reward": 1.769642896950245, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7741071879863739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7094 + }, + { + "completion_length": 250.5982265472412, + "epoch": 1.1897816337650362, + "grad_norm": 0.24803401131551336, + "kl": 0.0692291259765625, + "learning_rate": 4.993327450588388e-07, + "loss": 0.0001, + "reward": 1.7214286252856255, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7214286141097546, + "rewards/format_reward_func": 1.0, + "step": 7096 + }, + { + "completion_length": 250.93304538726807, + "epoch": 1.1901169370049038, + "grad_norm": 0.2041804890215718, + "kl": 0.0629425048828125, + "learning_rate": 4.993318689241902e-07, + "loss": 0.0001, + "reward": 1.8410714864730835, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8455357365310192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7098 + }, + { + "completion_length": 253.7589406967163, + "epoch": 1.1904522402447713, + "grad_norm": 0.27051242913768825, + "kl": 0.101287841796875, + "learning_rate": 4.993309922154872e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7100 + }, + { + "completion_length": 264.7232275009155, + "epoch": 1.1907875434846389, + "grad_norm": 0.22080903137693467, + "kl": 0.06524658203125, + "learning_rate": 4.993301149327319e-07, + "loss": 0.0001, + "reward": 1.6964286640286446, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.705357164144516, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7102 + }, + { + "completion_length": 256.1785831451416, + "epoch": 1.1911228467245065, + "grad_norm": 0.125236473476325, + "kl": 0.0610809326171875, + "learning_rate": 4.993292370759261e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357298254967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7104 + }, + { + "completion_length": 245.0491180419922, + "epoch": 1.191458149964374, + "grad_norm": 0.1692324014060084, + "kl": 0.0619354248046875, + "learning_rate": 4.99328358645072e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 7106 + }, + { + "completion_length": 261.42858600616455, + "epoch": 1.1917934532042416, + "grad_norm": 0.3721622893705267, + "kl": 0.1228485107421875, + "learning_rate": 4.993274796401716e-07, + "loss": 0.0001, + "reward": 1.6660715118050575, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.6794643178582191, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7108 + }, + { + "completion_length": 259.6696548461914, + "epoch": 1.1921287564441092, + "grad_norm": 0.0981046329887637, + "kl": 0.069854736328125, + "learning_rate": 4.99326600061227e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 7110 + }, + { + "completion_length": 264.2544775009155, + "epoch": 1.1924640596839766, + "grad_norm": 0.17182040481196187, + "kl": 0.0637969970703125, + "learning_rate": 4.993257199082399e-07, + "loss": 0.0001, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785876601934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7112 + }, + { + "completion_length": 263.9687614440918, + "epoch": 1.1927993629238443, + "grad_norm": 0.20245448175755204, + "kl": 0.0615386962890625, + "learning_rate": 4.993248391812127e-07, + "loss": 0.0001, + "reward": 1.7946429029107094, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7114 + }, + { + "completion_length": 245.4375114440918, + "epoch": 1.1931346661637119, + "grad_norm": 0.18806544126057884, + "kl": 0.0662994384765625, + "learning_rate": 4.993239578801473e-07, + "loss": 0.0001, + "reward": 1.7428572252392769, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 7116 + }, + { + "completion_length": 253.40179634094238, + "epoch": 1.1934699694035793, + "grad_norm": 0.16657846626696407, + "kl": 0.1014556884765625, + "learning_rate": 4.993230760050456e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 7118 + }, + { + "completion_length": 257.8794765472412, + "epoch": 1.193805272643447, + "grad_norm": 0.2076503664785633, + "kl": 0.1319122314453125, + "learning_rate": 4.993221935559098e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 7120 + }, + { + "completion_length": 267.1250123977661, + "epoch": 1.1941405758833143, + "grad_norm": 0.27828751029224, + "kl": 0.0604400634765625, + "learning_rate": 4.993213105327418e-07, + "loss": 0.0001, + "reward": 1.7160715088248253, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7294643111526966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7122 + }, + { + "completion_length": 252.13840579986572, + "epoch": 1.194475879123182, + "grad_norm": 0.25794649504968276, + "kl": 0.0631561279296875, + "learning_rate": 4.993204269355438e-07, + "loss": 0.0001, + "reward": 1.7071429342031479, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7160714827477932, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7124 + }, + { + "completion_length": 252.7142972946167, + "epoch": 1.1948111823630496, + "grad_norm": 0.13134844037671142, + "kl": 0.0653228759765625, + "learning_rate": 4.993195427643176e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7482143137603998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7126 + }, + { + "completion_length": 248.602689743042, + "epoch": 1.1951464856029173, + "grad_norm": 0.2285108171615533, + "kl": 0.06787109375, + "learning_rate": 4.993186580190655e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 7128 + }, + { + "completion_length": 250.45537090301514, + "epoch": 1.1954817888427847, + "grad_norm": 0.1980039645739993, + "kl": 0.06549072265625, + "learning_rate": 4.993177726997894e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321429066359997, + "rewards/format_reward_func": 1.0, + "step": 7130 + }, + { + "completion_length": 260.33036518096924, + "epoch": 1.1958170920826523, + "grad_norm": 0.22274091348978325, + "kl": 0.0615234375, + "learning_rate": 4.993168868064913e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7132 + }, + { + "completion_length": 245.57144165039062, + "epoch": 1.1961523953225197, + "grad_norm": 0.19250171428754337, + "kl": 0.0677642822265625, + "learning_rate": 4.993160003391733e-07, + "loss": 0.0001, + "reward": 1.7553571909666061, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7687500268220901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7134 + }, + { + "completion_length": 248.99554538726807, + "epoch": 1.1964876985623873, + "grad_norm": 0.29356235790356494, + "kl": 0.069305419921875, + "learning_rate": 4.993151132978374e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 7136 + }, + { + "completion_length": 253.4062614440918, + "epoch": 1.196823001802255, + "grad_norm": 0.19904524202169394, + "kl": 0.114044189453125, + "learning_rate": 4.993142256824857e-07, + "loss": 0.0001, + "reward": 1.719642922282219, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7241071835160255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7138 + }, + { + "completion_length": 251.6071538925171, + "epoch": 1.1971583050421224, + "grad_norm": 0.19771127345401926, + "kl": 0.06500244140625, + "learning_rate": 4.993133374931203e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286185801029, + "rewards/format_reward_func": 1.0, + "step": 7140 + }, + { + "completion_length": 246.4955472946167, + "epoch": 1.19749360828199, + "grad_norm": 0.29051666126122133, + "kl": 0.1295928955078125, + "learning_rate": 4.99312448729743e-07, + "loss": 0.0001, + "reward": 1.7625000849366188, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7142 + }, + { + "completion_length": 242.80358505249023, + "epoch": 1.1978289115218577, + "grad_norm": 0.2147715894666388, + "kl": 0.0679779052734375, + "learning_rate": 4.993115593923561e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7144 + }, + { + "completion_length": 245.52679920196533, + "epoch": 1.198164214761725, + "grad_norm": 0.2806676529569814, + "kl": 0.0613861083984375, + "learning_rate": 4.993106694809615e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 7146 + }, + { + "completion_length": 243.39733219146729, + "epoch": 1.1984995180015927, + "grad_norm": 0.1163241116677545, + "kl": 0.06005859375, + "learning_rate": 4.993097789955614e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 7148 + }, + { + "completion_length": 242.4241189956665, + "epoch": 1.1988348212414603, + "grad_norm": 0.2207702446446991, + "kl": 0.064971923828125, + "learning_rate": 4.993088879361576e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7366071753203869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7150 + }, + { + "completion_length": 243.4285831451416, + "epoch": 1.1991701244813278, + "grad_norm": 0.17856647941258746, + "kl": 0.06219482421875, + "learning_rate": 4.993079963027525e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 7152 + }, + { + "completion_length": 231.9821538925171, + "epoch": 1.1995054277211954, + "grad_norm": 0.31929378908916767, + "kl": 0.077178955078125, + "learning_rate": 4.993071040953477e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 7154 + }, + { + "completion_length": 245.165189743042, + "epoch": 1.1998407309610628, + "grad_norm": 0.23651149317372253, + "kl": 0.1234130859375, + "learning_rate": 4.993062113139457e-07, + "loss": 0.0001, + "reward": 1.7482143715023994, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7156 + }, + { + "completion_length": 241.07590293884277, + "epoch": 1.2001760342009304, + "grad_norm": 0.18797884413854196, + "kl": 0.0643157958984375, + "learning_rate": 4.993053179585484e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714600235224, + "rewards/format_reward_func": 1.0, + "step": 7158 + }, + { + "completion_length": 245.80804347991943, + "epoch": 1.200511337440798, + "grad_norm": 0.19977985178626378, + "kl": 0.0759735107421875, + "learning_rate": 4.993044240291576e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 7160 + }, + { + "completion_length": 244.26786994934082, + "epoch": 1.2008466406806655, + "grad_norm": 0.27002438185556893, + "kl": 0.070404052734375, + "learning_rate": 4.993035295257758e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.727678619325161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7162 + }, + { + "completion_length": 246.47769165039062, + "epoch": 1.2011819439205331, + "grad_norm": 0.14532020676846388, + "kl": 0.063720703125, + "learning_rate": 4.993026344484047e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7164 + }, + { + "completion_length": 243.21429824829102, + "epoch": 1.2015172471604008, + "grad_norm": 0.32398441312856185, + "kl": 0.1230926513671875, + "learning_rate": 4.993017387970467e-07, + "loss": 0.0001, + "reward": 1.7071429565548897, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071429006755352, + "rewards/format_reward_func": 1.0, + "step": 7166 + }, + { + "completion_length": 249.39733600616455, + "epoch": 1.2018525504002682, + "grad_norm": 0.16493126908134675, + "kl": 0.0857696533203125, + "learning_rate": 4.993008425717034e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.750892885029316, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7168 + }, + { + "completion_length": 246.2009048461914, + "epoch": 1.2021878536401358, + "grad_norm": 0.17881583185915126, + "kl": 0.06036376953125, + "learning_rate": 4.992999457723773e-07, + "loss": 0.0001, + "reward": 1.8107143566012383, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8196428716182709, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7170 + }, + { + "completion_length": 234.98661708831787, + "epoch": 1.2025231568800034, + "grad_norm": 0.20858932423195833, + "kl": 0.067535400390625, + "learning_rate": 4.992990483990702e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 7172 + }, + { + "completion_length": 254.27233123779297, + "epoch": 1.2028584601198709, + "grad_norm": 0.3299755816245421, + "kl": 0.110748291015625, + "learning_rate": 4.992981504517843e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7174 + }, + { + "completion_length": 249.93305110931396, + "epoch": 1.2031937633597385, + "grad_norm": 0.1657871907998651, + "kl": 0.1009368896484375, + "learning_rate": 4.992972519305216e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7176 + }, + { + "completion_length": 254.86608219146729, + "epoch": 1.203529066599606, + "grad_norm": 0.19653879742103245, + "kl": 0.065093994140625, + "learning_rate": 4.992963528352843e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7758928798139095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7178 + }, + { + "completion_length": 242.56251049041748, + "epoch": 1.2038643698394735, + "grad_norm": 0.22354122095251158, + "kl": 0.1406707763671875, + "learning_rate": 4.992954531660742e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143297791481, + "rewards/format_reward_func": 1.0, + "step": 7180 + }, + { + "completion_length": 245.2544765472412, + "epoch": 1.2041996730793412, + "grad_norm": 0.2367578009193638, + "kl": 0.0679779052734375, + "learning_rate": 4.992945529228937e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321429029107094, + "rewards/format_reward_func": 1.0, + "step": 7182 + }, + { + "completion_length": 257.9464416503906, + "epoch": 1.2045349763192088, + "grad_norm": 0.19217014082094047, + "kl": 0.1060028076171875, + "learning_rate": 4.992936521057446e-07, + "loss": 0.0001, + "reward": 1.7642858102917671, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 7184 + }, + { + "completion_length": 257.2276906967163, + "epoch": 1.2048702795590762, + "grad_norm": 0.2761550441553349, + "kl": 0.1991729736328125, + "learning_rate": 4.992927507146291e-07, + "loss": 0.0002, + "reward": 1.7785715088248253, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7785714659839869, + "rewards/format_reward_func": 1.0, + "step": 7186 + }, + { + "completion_length": 251.54911708831787, + "epoch": 1.2052055827989439, + "grad_norm": 0.307690065103978, + "kl": 0.085052490234375, + "learning_rate": 4.992918487495492e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7188 + }, + { + "completion_length": 255.63840579986572, + "epoch": 1.2055408860388113, + "grad_norm": 0.15497750976828262, + "kl": 0.0872802734375, + "learning_rate": 4.992909462105072e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.74196432903409, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7190 + }, + { + "completion_length": 256.4151916503906, + "epoch": 1.205876189278679, + "grad_norm": 0.22940595595717467, + "kl": 0.0673065185546875, + "learning_rate": 4.992900430975048e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7192 + }, + { + "completion_length": 254.82590675354004, + "epoch": 1.2062114925185465, + "grad_norm": 0.2030428580858518, + "kl": 0.0630340576171875, + "learning_rate": 4.992891394105445e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 7194 + }, + { + "completion_length": 249.6160831451416, + "epoch": 1.206546795758414, + "grad_norm": 0.22482483113253907, + "kl": 0.1019287109375, + "learning_rate": 4.992882351496281e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7776786275207996, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7196 + }, + { + "completion_length": 254.1160831451416, + "epoch": 1.2068820989982816, + "grad_norm": 0.12312991086334027, + "kl": 0.086456298828125, + "learning_rate": 4.992873303147577e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.025253813713788986, + "rewards/equation_reward_func": 0.8053571507334709, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7198 + }, + { + "completion_length": 263.8794775009155, + "epoch": 1.2072174022381492, + "grad_norm": 0.10108203752975906, + "kl": 0.229705810546875, + "learning_rate": 4.992864249059354e-07, + "loss": 0.0002, + "reward": 1.7053572088479996, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7098214738070965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7200 + }, + { + "completion_length": 249.06697750091553, + "epoch": 1.2075527054780166, + "grad_norm": 0.305712170662913, + "kl": 0.169677734375, + "learning_rate": 4.992855189231634e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 7202 + }, + { + "completion_length": 260.4241199493408, + "epoch": 1.2078880087178843, + "grad_norm": 0.11947172731424033, + "kl": 0.0626373291015625, + "learning_rate": 4.992846123664437e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 7204 + }, + { + "completion_length": 260.3839445114136, + "epoch": 1.208223311957752, + "grad_norm": 0.17124817043019805, + "kl": 0.0750732421875, + "learning_rate": 4.992837052357783e-07, + "loss": 0.0001, + "reward": 1.7625000700354576, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643145054579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7206 + }, + { + "completion_length": 258.6205463409424, + "epoch": 1.2085586151976193, + "grad_norm": 0.20993607262866407, + "kl": 0.0743255615234375, + "learning_rate": 4.992827975311695e-07, + "loss": 0.0001, + "reward": 1.7625000700354576, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7208 + }, + { + "completion_length": 249.5803689956665, + "epoch": 1.208893918437487, + "grad_norm": 0.18415055786807938, + "kl": 0.0663604736328125, + "learning_rate": 4.992818892526193e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214507102966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7210 + }, + { + "completion_length": 252.821439743042, + "epoch": 1.2092292216773544, + "grad_norm": 0.1304870095299773, + "kl": 0.0946197509765625, + "learning_rate": 4.992809804001296e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 7212 + }, + { + "completion_length": 262.9776906967163, + "epoch": 1.209564524917222, + "grad_norm": 0.23406135216288343, + "kl": 0.0801544189453125, + "learning_rate": 4.992800709737029e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 7214 + }, + { + "completion_length": 260.1160840988159, + "epoch": 1.2098998281570896, + "grad_norm": 0.1897578285746444, + "kl": 0.129669189453125, + "learning_rate": 4.992791609733408e-07, + "loss": 0.0001, + "reward": 1.785714328289032, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7946428954601288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7216 + }, + { + "completion_length": 256.3482275009155, + "epoch": 1.210235131396957, + "grad_norm": 0.23263252449293698, + "kl": 0.1983489990234375, + "learning_rate": 4.992782503990458e-07, + "loss": 0.0002, + "reward": 1.769642911851406, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7218 + }, + { + "completion_length": 257.5759048461914, + "epoch": 1.2105704346368247, + "grad_norm": 0.20205381240686107, + "kl": 0.0634307861328125, + "learning_rate": 4.992773392508198e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7220 + }, + { + "completion_length": 266.7098388671875, + "epoch": 1.2109057378766923, + "grad_norm": 0.2924996226040406, + "kl": 0.0713043212890625, + "learning_rate": 4.99276427528665e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 7222 + }, + { + "completion_length": 254.8035831451416, + "epoch": 1.2112410411165597, + "grad_norm": 0.1814333949957524, + "kl": 0.100006103515625, + "learning_rate": 4.992755152325833e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071578115225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7224 + }, + { + "completion_length": 257.64287185668945, + "epoch": 1.2115763443564274, + "grad_norm": 0.24674960296846465, + "kl": 0.1389617919921875, + "learning_rate": 4.99274602362577e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7705357596278191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7226 + }, + { + "completion_length": 249.9107265472412, + "epoch": 1.211911647596295, + "grad_norm": 0.17770874745401008, + "kl": 0.068023681640625, + "learning_rate": 4.992736889186482e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 7228 + }, + { + "completion_length": 256.6250123977661, + "epoch": 1.2122469508361624, + "grad_norm": 0.2509594504708243, + "kl": 0.1199951171875, + "learning_rate": 4.992727749007988e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7230 + }, + { + "completion_length": 267.089298248291, + "epoch": 1.21258225407603, + "grad_norm": 0.22356860342543525, + "kl": 0.0696563720703125, + "learning_rate": 4.992718603090312e-07, + "loss": 0.0001, + "reward": 1.7178572043776512, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571987897158, + "rewards/format_reward_func": 1.0, + "step": 7232 + }, + { + "completion_length": 263.86608123779297, + "epoch": 1.2129175573158975, + "grad_norm": 0.4141292974356211, + "kl": 0.0949554443359375, + "learning_rate": 4.992709451433473e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.801785733550787, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7234 + }, + { + "completion_length": 259.0000123977661, + "epoch": 1.213252860555765, + "grad_norm": 0.2666180229355076, + "kl": 0.1144866943359375, + "learning_rate": 4.992700294037493e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 7236 + }, + { + "completion_length": 252.33929538726807, + "epoch": 1.2135881637956327, + "grad_norm": 0.08668010444846727, + "kl": 0.1309967041015625, + "learning_rate": 4.992691130902392e-07, + "loss": 0.0001, + "reward": 1.7875000834465027, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7238 + }, + { + "completion_length": 255.44197368621826, + "epoch": 1.2139234670355001, + "grad_norm": 0.09888979720455138, + "kl": 0.122528076171875, + "learning_rate": 4.992681962028193e-07, + "loss": 0.0001, + "reward": 1.860714316368103, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8607143089175224, + "rewards/format_reward_func": 1.0, + "step": 7240 + }, + { + "completion_length": 248.6830472946167, + "epoch": 1.2142587702753678, + "grad_norm": 0.18925414301509572, + "kl": 0.1031341552734375, + "learning_rate": 4.992672787414914e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 7242 + }, + { + "completion_length": 258.4464416503906, + "epoch": 1.2145940735152354, + "grad_norm": 0.2640904385696533, + "kl": 0.074462890625, + "learning_rate": 4.99266360706258e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 7244 + }, + { + "completion_length": 251.86162185668945, + "epoch": 1.2149293767551028, + "grad_norm": 0.31212532650075375, + "kl": 0.06591796875, + "learning_rate": 4.992654420971209e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 7246 + }, + { + "completion_length": 261.6875114440918, + "epoch": 1.2152646799949705, + "grad_norm": 0.08009381221104957, + "kl": 0.072265625, + "learning_rate": 4.992645229140824e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 7248 + }, + { + "completion_length": 255.63394260406494, + "epoch": 1.215599983234838, + "grad_norm": 0.16946388110095856, + "kl": 0.06854248046875, + "learning_rate": 4.992636031571444e-07, + "loss": 0.0001, + "reward": 1.6964286640286446, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.696428619325161, + "rewards/format_reward_func": 1.0, + "step": 7250 + }, + { + "completion_length": 254.97769165039062, + "epoch": 1.2159352864747055, + "grad_norm": 0.23666184664199402, + "kl": 0.078399658203125, + "learning_rate": 4.992626828263093e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 7252 + }, + { + "completion_length": 253.17858028411865, + "epoch": 1.2162705897145731, + "grad_norm": 0.008383486510034467, + "kl": 0.1063690185546875, + "learning_rate": 4.992617619215791e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714600235224, + "rewards/format_reward_func": 1.0, + "step": 7254 + }, + { + "completion_length": 262.36608505249023, + "epoch": 1.2166058929544405, + "grad_norm": 0.23928214815805107, + "kl": 0.1030426025390625, + "learning_rate": 4.992608404429558e-07, + "loss": 0.0001, + "reward": 1.7330357879400253, + "reward_std": 0.044194172602146864, + "rewards/equation_reward_func": 0.7348214592784643, + "rewards/format_reward_func": 0.9982142895460129, + "step": 7256 + }, + { + "completion_length": 254.133939743042, + "epoch": 1.2169411961943082, + "grad_norm": 0.16791549556043, + "kl": 0.0996551513671875, + "learning_rate": 4.992599183904417e-07, + "loss": 0.0001, + "reward": 1.7232143580913544, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7258 + }, + { + "completion_length": 255.5178689956665, + "epoch": 1.2172764994341758, + "grad_norm": 0.25849978845251953, + "kl": 0.084625244140625, + "learning_rate": 4.992589957640388e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7260 + }, + { + "completion_length": 254.0759048461914, + "epoch": 1.2176118026740435, + "grad_norm": 0.18565102559484725, + "kl": 0.154052734375, + "learning_rate": 4.992580725637494e-07, + "loss": 0.0002, + "reward": 1.7370536476373672, + "reward_std": 0.0587151157669723, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9924107193946838, + "step": 7262 + }, + { + "completion_length": 259.30358505249023, + "epoch": 1.2179471059139109, + "grad_norm": 0.21800854720790241, + "kl": 0.1430206298828125, + "learning_rate": 4.992571487895753e-07, + "loss": 0.0001, + "reward": 1.7232143431901932, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7264 + }, + { + "completion_length": 250.0759048461914, + "epoch": 1.2182824091537785, + "grad_norm": 0.1164286611566817, + "kl": 0.0711212158203125, + "learning_rate": 4.99256224441519e-07, + "loss": 0.0001, + "reward": 1.7392857521772385, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857633531094, + "rewards/format_reward_func": 1.0, + "step": 7266 + }, + { + "completion_length": 264.60715198516846, + "epoch": 1.218617712393646, + "grad_norm": 0.24969613066518914, + "kl": 0.062957763671875, + "learning_rate": 4.992552995195825e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000279396772, + "rewards/format_reward_func": 1.0, + "step": 7268 + }, + { + "completion_length": 255.99554920196533, + "epoch": 1.2189530156335135, + "grad_norm": 0.19538401377886458, + "kl": 0.0625152587890625, + "learning_rate": 4.992543740237677e-07, + "loss": 0.0001, + "reward": 1.8339286297559738, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.8383928760886192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7270 + }, + { + "completion_length": 250.9642972946167, + "epoch": 1.2192883188733812, + "grad_norm": 0.23034703579469626, + "kl": 0.0625762939453125, + "learning_rate": 4.99253447954077e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714752972126, + "rewards/format_reward_func": 1.0, + "step": 7272 + }, + { + "completion_length": 263.1696557998657, + "epoch": 1.2196236221132486, + "grad_norm": 0.19438148140689746, + "kl": 0.109344482421875, + "learning_rate": 4.992525213105124e-07, + "loss": 0.0001, + "reward": 1.7178572416305542, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7267857454717159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7274 + }, + { + "completion_length": 266.7678680419922, + "epoch": 1.2199589253531162, + "grad_norm": 0.20735329597287194, + "kl": 0.077850341796875, + "learning_rate": 4.992515940930762e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 7276 + }, + { + "completion_length": 258.3035840988159, + "epoch": 1.2202942285929839, + "grad_norm": 0.14736460091258483, + "kl": 0.1506500244140625, + "learning_rate": 4.992506663017702e-07, + "loss": 0.0002, + "reward": 1.742857240140438, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 7278 + }, + { + "completion_length": 248.61608409881592, + "epoch": 1.2206295318328513, + "grad_norm": 0.2677453481686041, + "kl": 0.126617431640625, + "learning_rate": 4.99249737936597e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7280 + }, + { + "completion_length": 259.7901887893677, + "epoch": 1.220964835072719, + "grad_norm": 0.492668665701258, + "kl": 0.084564208984375, + "learning_rate": 4.992488089975583e-07, + "loss": 0.0001, + "reward": 1.7160715237259865, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7282 + }, + { + "completion_length": 264.9196548461914, + "epoch": 1.2213001383125865, + "grad_norm": 0.181095995568355, + "kl": 0.087677001953125, + "learning_rate": 4.992478794846565e-07, + "loss": 0.0001, + "reward": 1.723214328289032, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7366071715950966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7284 + }, + { + "completion_length": 267.1651906967163, + "epoch": 1.221635441552454, + "grad_norm": 0.2169639788945554, + "kl": 0.0669708251953125, + "learning_rate": 4.992469493978937e-07, + "loss": 0.0001, + "reward": 1.7946429178118706, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7286 + }, + { + "completion_length": 256.1875123977661, + "epoch": 1.2219707447923216, + "grad_norm": 0.6186112811288419, + "kl": 0.09869384765625, + "learning_rate": 4.992460187372719e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7178571857511997, + "rewards/format_reward_func": 1.0, + "step": 7288 + }, + { + "completion_length": 260.57590103149414, + "epoch": 1.222306048032189, + "grad_norm": 0.16121117399200693, + "kl": 0.0945587158203125, + "learning_rate": 4.992450875027935e-07, + "loss": 0.0001, + "reward": 1.7160715088248253, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7205357514321804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7290 + }, + { + "completion_length": 260.7410879135132, + "epoch": 1.2226413512720566, + "grad_norm": 0.22727721872779744, + "kl": 0.06622314453125, + "learning_rate": 4.992441556944604e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143331319094, + "rewards/format_reward_func": 1.0, + "step": 7292 + }, + { + "completion_length": 259.4107265472412, + "epoch": 1.2229766545119243, + "grad_norm": 0.24415005335576428, + "kl": 0.1076812744140625, + "learning_rate": 4.992432233122749e-07, + "loss": 0.0001, + "reward": 1.7607143744826317, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143204659224, + "rewards/format_reward_func": 1.0, + "step": 7294 + }, + { + "completion_length": 257.56697845458984, + "epoch": 1.2233119577517917, + "grad_norm": 0.31739442791663675, + "kl": 0.0947113037109375, + "learning_rate": 4.99242290356239e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 7296 + }, + { + "completion_length": 270.28126430511475, + "epoch": 1.2236472609916593, + "grad_norm": 0.1385651283554177, + "kl": 0.0988616943359375, + "learning_rate": 4.99241356826355e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 7298 + }, + { + "completion_length": 266.92857933044434, + "epoch": 1.223982564231527, + "grad_norm": 0.2780453894697533, + "kl": 0.0828094482421875, + "learning_rate": 4.992404227226249e-07, + "loss": 0.0001, + "reward": 1.7553571984171867, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7300 + }, + { + "completion_length": 266.64733123779297, + "epoch": 1.2243178674713944, + "grad_norm": 0.1710191122109286, + "kl": 0.0935516357421875, + "learning_rate": 4.99239488045051e-07, + "loss": 0.0001, + "reward": 1.7446429505944252, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7302 + }, + { + "completion_length": 269.4509057998657, + "epoch": 1.224653170711262, + "grad_norm": 0.24567936420836164, + "kl": 0.071990966796875, + "learning_rate": 4.992385527936354e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7803571820259094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7304 + }, + { + "completion_length": 267.95090770721436, + "epoch": 1.2249884739511296, + "grad_norm": 0.2317099030341503, + "kl": 0.098785400390625, + "learning_rate": 4.992376169683803e-07, + "loss": 0.0001, + "reward": 1.8071429207921028, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 7306 + }, + { + "completion_length": 262.07590675354004, + "epoch": 1.225323777190997, + "grad_norm": 0.21304988210294232, + "kl": 0.065704345703125, + "learning_rate": 4.992366805692877e-07, + "loss": 0.0001, + "reward": 1.7696429193019867, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7308 + }, + { + "completion_length": 267.8169775009155, + "epoch": 1.2256590804308647, + "grad_norm": 0.226278049448281, + "kl": 0.0792388916015625, + "learning_rate": 4.992357435963599e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428820490837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7310 + }, + { + "completion_length": 261.1964454650879, + "epoch": 1.225994383670732, + "grad_norm": 0.24893292964449734, + "kl": 0.0957183837890625, + "learning_rate": 4.992348060495989e-07, + "loss": 0.0001, + "reward": 1.783928632736206, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928716182709, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7312 + }, + { + "completion_length": 266.98662185668945, + "epoch": 1.2263296869105997, + "grad_norm": 0.24435019680281253, + "kl": 0.075714111328125, + "learning_rate": 4.99233867929007e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 7314 + }, + { + "completion_length": 245.55358219146729, + "epoch": 1.2266649901504674, + "grad_norm": 0.174986762818187, + "kl": 0.0735015869140625, + "learning_rate": 4.992329292345865e-07, + "loss": 0.0001, + "reward": 1.7428572252392769, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571656346321, + "rewards/format_reward_func": 1.0, + "step": 7316 + }, + { + "completion_length": 262.0759029388428, + "epoch": 1.227000293390335, + "grad_norm": 0.05003726187541759, + "kl": 0.06793212890625, + "learning_rate": 4.992319899663391e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.05555839091539383, + "rewards/equation_reward_func": 0.7839285992085934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7318 + }, + { + "completion_length": 249.6071538925171, + "epoch": 1.2273355966302024, + "grad_norm": 0.0020244057203651937, + "kl": 0.06622314453125, + "learning_rate": 4.992310501242676e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 7320 + }, + { + "completion_length": 260.71430110931396, + "epoch": 1.22767089987007, + "grad_norm": 0.20053090389361175, + "kl": 0.091400146484375, + "learning_rate": 4.992301097083735e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 7322 + }, + { + "completion_length": 261.8348331451416, + "epoch": 1.2280062031099375, + "grad_norm": 0.2737092773881806, + "kl": 0.0717315673828125, + "learning_rate": 4.992291687186595e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 7324 + }, + { + "completion_length": 256.0134057998657, + "epoch": 1.228341506349805, + "grad_norm": 0.33311744361146584, + "kl": 0.130706787109375, + "learning_rate": 4.992282271551274e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 7326 + }, + { + "completion_length": 249.45536994934082, + "epoch": 1.2286768095896727, + "grad_norm": 0.3027524607523363, + "kl": 0.1055755615234375, + "learning_rate": 4.992272850177795e-07, + "loss": 0.0001, + "reward": 1.8089286237955093, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8133928813040257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7328 + }, + { + "completion_length": 257.183048248291, + "epoch": 1.2290121128295401, + "grad_norm": 0.2134971880773326, + "kl": 0.068511962890625, + "learning_rate": 4.992263423066182e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7455357573926449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7330 + }, + { + "completion_length": 265.4241189956665, + "epoch": 1.2293474160694078, + "grad_norm": 0.09134287624357415, + "kl": 0.0912628173828125, + "learning_rate": 4.992253990216453e-07, + "loss": 0.0001, + "reward": 1.7839286550879478, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7883928641676903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7332 + }, + { + "completion_length": 255.87054634094238, + "epoch": 1.2296827193092754, + "grad_norm": 0.2729861269599726, + "kl": 0.0726165771484375, + "learning_rate": 4.992244551628631e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 7334 + }, + { + "completion_length": 243.59375858306885, + "epoch": 1.2300180225491428, + "grad_norm": 0.12997354741489184, + "kl": 0.0819549560546875, + "learning_rate": 4.992235107302738e-07, + "loss": 0.0001, + "reward": 1.7071429044008255, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7071428969502449, + "rewards/format_reward_func": 1.0, + "step": 7336 + }, + { + "completion_length": 253.1562623977661, + "epoch": 1.2303533257890105, + "grad_norm": 0.13434226070764227, + "kl": 0.089324951171875, + "learning_rate": 4.992225657238797e-07, + "loss": 0.0001, + "reward": 1.7000000923871994, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7000000346451998, + "rewards/format_reward_func": 1.0, + "step": 7338 + }, + { + "completion_length": 242.4419765472412, + "epoch": 1.230688629028878, + "grad_norm": 0.1688100720296217, + "kl": 0.079833984375, + "learning_rate": 4.992216201436827e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857428640127, + "rewards/format_reward_func": 1.0, + "step": 7340 + }, + { + "completion_length": 240.30358123779297, + "epoch": 1.2310239322687455, + "grad_norm": 0.22757167938869166, + "kl": 0.0682830810546875, + "learning_rate": 4.992206739896851e-07, + "loss": 0.0001, + "reward": 1.8214286416769028, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 7342 + }, + { + "completion_length": 256.2857265472412, + "epoch": 1.2313592355086131, + "grad_norm": 0.24103032576905367, + "kl": 0.0616302490234375, + "learning_rate": 4.992197272618893e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 7344 + }, + { + "completion_length": 249.7500114440918, + "epoch": 1.2316945387484806, + "grad_norm": 0.22890032159343376, + "kl": 0.084991455078125, + "learning_rate": 4.992187799602972e-07, + "loss": 0.0001, + "reward": 1.6910715252161026, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.6955357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7346 + }, + { + "completion_length": 239.84376335144043, + "epoch": 1.2320298419883482, + "grad_norm": 0.11993653416671786, + "kl": 0.0696258544921875, + "learning_rate": 4.992178320849109e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857611179352, + "rewards/format_reward_func": 1.0, + "step": 7348 + }, + { + "completion_length": 229.56697463989258, + "epoch": 1.2323651452282158, + "grad_norm": 0.283880895706985, + "kl": 0.1734161376953125, + "learning_rate": 4.992168836357329e-07, + "loss": 0.0002, + "reward": 1.8250000551342964, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 7350 + }, + { + "completion_length": 243.64733219146729, + "epoch": 1.2327004484680832, + "grad_norm": 0.2535059072968625, + "kl": 0.0676727294921875, + "learning_rate": 4.992159346127652e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 7352 + }, + { + "completion_length": 237.9464406967163, + "epoch": 1.2330357517079509, + "grad_norm": 0.3366521986910057, + "kl": 0.0709991455078125, + "learning_rate": 4.9921498501601e-07, + "loss": 0.0001, + "reward": 1.7035715356469154, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7035714499652386, + "rewards/format_reward_func": 1.0, + "step": 7354 + }, + { + "completion_length": 230.95983219146729, + "epoch": 1.2333710549478185, + "grad_norm": 0.3199696882096981, + "kl": 0.098846435546875, + "learning_rate": 4.992140348454695e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7285714503377676, + "rewards/format_reward_func": 1.0, + "step": 7356 + }, + { + "completion_length": 232.62054824829102, + "epoch": 1.233706358187686, + "grad_norm": 0.20618256158180728, + "kl": 0.0677337646484375, + "learning_rate": 4.992130841011461e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000257045031, + "rewards/format_reward_func": 1.0, + "step": 7358 + }, + { + "completion_length": 226.79018878936768, + "epoch": 1.2340416614275536, + "grad_norm": 0.24268664351870567, + "kl": 0.0743560791015625, + "learning_rate": 4.992121327830415e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 1.0, + "step": 7360 + }, + { + "completion_length": 229.5535831451416, + "epoch": 1.2343769646674212, + "grad_norm": 0.21594958900650493, + "kl": 0.066436767578125, + "learning_rate": 4.992111808911583e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000543892384, + "rewards/format_reward_func": 1.0, + "step": 7362 + }, + { + "completion_length": 237.1830472946167, + "epoch": 1.2347122679072886, + "grad_norm": 0.2437006081039398, + "kl": 0.0667266845703125, + "learning_rate": 4.992102284254985e-07, + "loss": 0.0001, + "reward": 1.717857226729393, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 7364 + }, + { + "completion_length": 232.06697463989258, + "epoch": 1.2350475711471562, + "grad_norm": 0.21956220909973093, + "kl": 0.06585693359375, + "learning_rate": 4.992092753860644e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 7366 + }, + { + "completion_length": 231.67858123779297, + "epoch": 1.2353828743870237, + "grad_norm": 0.22599447075235282, + "kl": 0.0912017822265625, + "learning_rate": 4.992083217728581e-07, + "loss": 0.0001, + "reward": 1.775000087916851, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 7368 + }, + { + "completion_length": 221.7634038925171, + "epoch": 1.2357181776268913, + "grad_norm": 0.2089522879086932, + "kl": 0.0640411376953125, + "learning_rate": 4.992073675858818e-07, + "loss": 0.0001, + "reward": 1.7678572311997414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 7370 + }, + { + "completion_length": 225.31251049041748, + "epoch": 1.236053480866759, + "grad_norm": 0.27154299086942546, + "kl": 0.0687255859375, + "learning_rate": 4.992064128251379e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7392857633531094, + "rewards/format_reward_func": 1.0, + "step": 7372 + }, + { + "completion_length": 231.61608219146729, + "epoch": 1.2363887841066263, + "grad_norm": 0.2652996298189875, + "kl": 0.061309814453125, + "learning_rate": 4.992054574906284e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7374 + }, + { + "completion_length": 231.45983409881592, + "epoch": 1.236724087346494, + "grad_norm": 0.1997123223499637, + "kl": 0.067901611328125, + "learning_rate": 4.992045015823555e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143312692642, + "rewards/format_reward_func": 1.0, + "step": 7376 + }, + { + "completion_length": 226.44643878936768, + "epoch": 1.2370593905863616, + "grad_norm": 0.22936699167594957, + "kl": 0.0925445556640625, + "learning_rate": 4.992035451003214e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7378 + }, + { + "completion_length": 226.43304538726807, + "epoch": 1.237394693826229, + "grad_norm": 0.3561236632263674, + "kl": 0.072845458984375, + "learning_rate": 4.992025880445284e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 7380 + }, + { + "completion_length": 225.39733123779297, + "epoch": 1.2377299970660967, + "grad_norm": 0.13552781415865434, + "kl": 0.06146240234375, + "learning_rate": 4.992016304149786e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 7382 + }, + { + "completion_length": 225.352689743042, + "epoch": 1.2380653003059643, + "grad_norm": 0.16034323470596654, + "kl": 0.0637054443359375, + "learning_rate": 4.992006722116743e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428594365716, + "rewards/format_reward_func": 1.0, + "step": 7384 + }, + { + "completion_length": 234.2009038925171, + "epoch": 1.2384006035458317, + "grad_norm": 0.11643361950936694, + "kl": 0.06671142578125, + "learning_rate": 4.991997134346176e-07, + "loss": 0.0001, + "reward": 1.725000061094761, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 7386 + }, + { + "completion_length": 237.43304538726807, + "epoch": 1.2387359067856993, + "grad_norm": 0.34066513169603546, + "kl": 0.0608978271484375, + "learning_rate": 4.991987540838108e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 7388 + }, + { + "completion_length": 227.40626049041748, + "epoch": 1.2390712100255667, + "grad_norm": 0.14493342284575672, + "kl": 0.064849853515625, + "learning_rate": 4.99197794159256e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 7390 + }, + { + "completion_length": 232.2991189956665, + "epoch": 1.2394065132654344, + "grad_norm": 0.27793165057230773, + "kl": 0.067626953125, + "learning_rate": 4.991968336609556e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 7392 + }, + { + "completion_length": 225.19643878936768, + "epoch": 1.239741816505302, + "grad_norm": 0.2876380698092266, + "kl": 0.07135009765625, + "learning_rate": 4.991958725889116e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 7394 + }, + { + "completion_length": 229.30358123779297, + "epoch": 1.2400771197451697, + "grad_norm": 0.21288203957645194, + "kl": 0.064697265625, + "learning_rate": 4.991949109431264e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143089175224, + "rewards/format_reward_func": 1.0, + "step": 7396 + }, + { + "completion_length": 221.82143783569336, + "epoch": 1.240412422985037, + "grad_norm": 0.10024101777540191, + "kl": 0.0673828125, + "learning_rate": 4.99193948723602e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428883820772, + "rewards/format_reward_func": 1.0, + "step": 7398 + }, + { + "completion_length": 226.58036708831787, + "epoch": 1.2407477262249047, + "grad_norm": 0.22284009957725762, + "kl": 0.0872955322265625, + "learning_rate": 4.991929859303408e-07, + "loss": 0.0001, + "reward": 1.710714377462864, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143141329288, + "rewards/format_reward_func": 1.0, + "step": 7400 + }, + { + "completion_length": 223.60268783569336, + "epoch": 1.2410830294647721, + "grad_norm": 0.2600750417552333, + "kl": 0.06585693359375, + "learning_rate": 4.99192022563345e-07, + "loss": 0.0001, + "reward": 1.79642865806818, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 7402 + }, + { + "completion_length": 227.6919755935669, + "epoch": 1.2414183327046397, + "grad_norm": 0.26431094508941017, + "kl": 0.0618896484375, + "learning_rate": 4.991910586226166e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 7404 + }, + { + "completion_length": 221.63840198516846, + "epoch": 1.2417536359445074, + "grad_norm": 0.34389640598822085, + "kl": 0.0604095458984375, + "learning_rate": 4.991900941081583e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7406 + }, + { + "completion_length": 225.4419755935669, + "epoch": 1.2420889391843748, + "grad_norm": 0.38325653075172783, + "kl": 0.096160888671875, + "learning_rate": 4.991891290199717e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 7408 + }, + { + "completion_length": 223.94197463989258, + "epoch": 1.2424242424242424, + "grad_norm": 0.16090653148401493, + "kl": 0.065399169921875, + "learning_rate": 4.991881633580594e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 7410 + }, + { + "completion_length": 228.5000123977661, + "epoch": 1.24275954566411, + "grad_norm": 0.28773662489496515, + "kl": 0.074371337890625, + "learning_rate": 4.991871971224237e-07, + "loss": 0.0001, + "reward": 1.7665179297327995, + "reward_std": 0.0776554741896689, + "rewards/equation_reward_func": 0.7678571846336126, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7412 + }, + { + "completion_length": 230.33929634094238, + "epoch": 1.2430948489039775, + "grad_norm": 0.16326829694279263, + "kl": 0.0712127685546875, + "learning_rate": 4.991862303130666e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 7414 + }, + { + "completion_length": 225.54018783569336, + "epoch": 1.2434301521438451, + "grad_norm": 0.14390607940358557, + "kl": 0.0926513671875, + "learning_rate": 4.991852629299904e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 7416 + }, + { + "completion_length": 222.42858123779297, + "epoch": 1.2437654553837127, + "grad_norm": 0.22509298933939004, + "kl": 0.0763702392578125, + "learning_rate": 4.991842949731974e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 7418 + }, + { + "completion_length": 221.15179538726807, + "epoch": 1.2441007586235802, + "grad_norm": 0.28116774174842096, + "kl": 0.0690460205078125, + "learning_rate": 4.991833264426896e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 7420 + }, + { + "completion_length": 218.59376049041748, + "epoch": 1.2444360618634478, + "grad_norm": 0.2843754473417763, + "kl": 0.06597900390625, + "learning_rate": 4.991823573384695e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 7422 + }, + { + "completion_length": 222.20983219146729, + "epoch": 1.2447713651033152, + "grad_norm": 0.20690636300290868, + "kl": 0.0728607177734375, + "learning_rate": 4.991813876605392e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 7424 + }, + { + "completion_length": 218.0178680419922, + "epoch": 1.2451066683431828, + "grad_norm": 0.13231542882177258, + "kl": 0.06787109375, + "learning_rate": 4.99180417408901e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 7426 + }, + { + "completion_length": 221.59375667572021, + "epoch": 1.2454419715830505, + "grad_norm": 0.33948420671007135, + "kl": 0.0762939453125, + "learning_rate": 4.99179446583557e-07, + "loss": 0.0001, + "reward": 1.807142935693264, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 1.0, + "step": 7428 + }, + { + "completion_length": 222.44197463989258, + "epoch": 1.245777274822918, + "grad_norm": 0.18968566266064457, + "kl": 0.0654754638671875, + "learning_rate": 4.991784751845096e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286208152771, + "rewards/format_reward_func": 1.0, + "step": 7430 + }, + { + "completion_length": 221.1741180419922, + "epoch": 1.2461125780627855, + "grad_norm": 0.13362021561969276, + "kl": 0.0767059326171875, + "learning_rate": 4.99177503211761e-07, + "loss": 0.0001, + "reward": 1.82857146859169, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8285714574158192, + "rewards/format_reward_func": 1.0, + "step": 7432 + }, + { + "completion_length": 220.6696538925171, + "epoch": 1.2464478813026532, + "grad_norm": 0.40983168744085197, + "kl": 0.0979766845703125, + "learning_rate": 4.991765306653133e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7434 + }, + { + "completion_length": 217.53572273254395, + "epoch": 1.2467831845425206, + "grad_norm": 0.24499121073236454, + "kl": 0.073272705078125, + "learning_rate": 4.991755575451689e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 7436 + }, + { + "completion_length": 215.0267972946167, + "epoch": 1.2471184877823882, + "grad_norm": 0.246875958035262, + "kl": 0.086761474609375, + "learning_rate": 4.991745838513299e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8250000178813934, + "rewards/format_reward_func": 1.0, + "step": 7438 + }, + { + "completion_length": 212.84822463989258, + "epoch": 1.2474537910222558, + "grad_norm": 0.20762540243109817, + "kl": 0.0724029541015625, + "learning_rate": 4.991736095837987e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 7440 + }, + { + "completion_length": 221.09822463989258, + "epoch": 1.2477890942621233, + "grad_norm": 0.1869219659646194, + "kl": 0.0802001953125, + "learning_rate": 4.991726347425774e-07, + "loss": 0.0001, + "reward": 1.7392857521772385, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 1.0, + "step": 7442 + }, + { + "completion_length": 232.01340293884277, + "epoch": 1.248124397501991, + "grad_norm": 0.27557571789481994, + "kl": 0.072235107421875, + "learning_rate": 4.991716593276684e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7444 + }, + { + "completion_length": 220.5937614440918, + "epoch": 1.2484597007418583, + "grad_norm": 0.18519122568768503, + "kl": 0.07379150390625, + "learning_rate": 4.991706833390738e-07, + "loss": 0.0001, + "reward": 1.8392857536673546, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.839285746216774, + "rewards/format_reward_func": 1.0, + "step": 7446 + }, + { + "completion_length": 232.02679443359375, + "epoch": 1.248795003981726, + "grad_norm": 0.24044948180768533, + "kl": 0.0777587890625, + "learning_rate": 4.99169706776796e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7448 + }, + { + "completion_length": 228.03125858306885, + "epoch": 1.2491303072215936, + "grad_norm": 0.11412891327814846, + "kl": 0.0772857666015625, + "learning_rate": 4.991687296408371e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 7450 + }, + { + "completion_length": 232.45090293884277, + "epoch": 1.2494656104614612, + "grad_norm": 0.22571443973421887, + "kl": 0.069427490234375, + "learning_rate": 4.991677519311993e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 7452 + }, + { + "completion_length": 234.48215293884277, + "epoch": 1.2498009137013286, + "grad_norm": 0.21476238687167007, + "kl": 0.0712890625, + "learning_rate": 4.99166773647885e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 7454 + }, + { + "completion_length": 230.04018783569336, + "epoch": 1.2501362169411963, + "grad_norm": 0.25465075300402357, + "kl": 0.0692291259765625, + "learning_rate": 4.991657947908964e-07, + "loss": 0.0001, + "reward": 1.7357143387198448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 7456 + }, + { + "completion_length": 229.64286613464355, + "epoch": 1.2504715201810637, + "grad_norm": 0.6685778048694985, + "kl": 0.0740203857421875, + "learning_rate": 4.991648153602358e-07, + "loss": 0.0001, + "reward": 1.748214341700077, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526786141097546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7458 + }, + { + "completion_length": 238.12054920196533, + "epoch": 1.2508068234209313, + "grad_norm": 0.27757295188385095, + "kl": 0.0650787353515625, + "learning_rate": 4.991638353559054e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 7460 + }, + { + "completion_length": 231.2500114440918, + "epoch": 1.251142126660799, + "grad_norm": 0.25223749992442296, + "kl": 0.0772247314453125, + "learning_rate": 4.991628547779074e-07, + "loss": 0.0001, + "reward": 1.775000087916851, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 7462 + }, + { + "completion_length": 232.54465579986572, + "epoch": 1.2514774299006663, + "grad_norm": 0.20840119266884935, + "kl": 0.0738983154296875, + "learning_rate": 4.991618736262442e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 7464 + }, + { + "completion_length": 235.50447845458984, + "epoch": 1.251812733140534, + "grad_norm": 0.24398954293589886, + "kl": 0.07470703125, + "learning_rate": 4.991608919009179e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 7466 + }, + { + "completion_length": 242.83929443359375, + "epoch": 1.2521480363804014, + "grad_norm": 0.19045043384066485, + "kl": 0.0765533447265625, + "learning_rate": 4.991599096019309e-07, + "loss": 0.0001, + "reward": 1.6910714954137802, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6955357603728771, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7468 + }, + { + "completion_length": 229.02233219146729, + "epoch": 1.252483339620269, + "grad_norm": 0.2739781670065189, + "kl": 0.090850830078125, + "learning_rate": 4.991589267292854e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143145054579, + "rewards/format_reward_func": 1.0, + "step": 7470 + }, + { + "completion_length": 234.63393688201904, + "epoch": 1.2528186428601367, + "grad_norm": 0.5638522629412447, + "kl": 0.0961456298828125, + "learning_rate": 4.991579432829837e-07, + "loss": 0.0001, + "reward": 1.8160714879631996, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8205357380211353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7472 + }, + { + "completion_length": 227.69644165039062, + "epoch": 1.2531539461000043, + "grad_norm": 0.24793558166991733, + "kl": 0.0794677734375, + "learning_rate": 4.99156959263028e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 7474 + }, + { + "completion_length": 231.8973331451416, + "epoch": 1.2534892493398717, + "grad_norm": 0.10141429069354206, + "kl": 0.0692138671875, + "learning_rate": 4.991559746694206e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 7476 + }, + { + "completion_length": 231.5446538925171, + "epoch": 1.2538245525797393, + "grad_norm": 0.2815565446267239, + "kl": 0.0777587890625, + "learning_rate": 4.991549895021638e-07, + "loss": 0.0001, + "reward": 1.721428669989109, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 7478 + }, + { + "completion_length": 230.41518878936768, + "epoch": 1.2541598558196068, + "grad_norm": 0.19892908493645406, + "kl": 0.0809478759765625, + "learning_rate": 4.991540037612598e-07, + "loss": 0.0001, + "reward": 1.8107143491506577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 7480 + }, + { + "completion_length": 237.64733028411865, + "epoch": 1.2544951590594744, + "grad_norm": 0.21884502968045427, + "kl": 0.0768890380859375, + "learning_rate": 4.991530174467109e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 7482 + }, + { + "completion_length": 236.6919755935669, + "epoch": 1.254830462299342, + "grad_norm": 0.16027566375000496, + "kl": 0.0768585205078125, + "learning_rate": 4.991520305585194e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 7484 + }, + { + "completion_length": 251.01340293884277, + "epoch": 1.2551657655392094, + "grad_norm": 0.2896072683566364, + "kl": 0.082977294921875, + "learning_rate": 4.991510430966875e-07, + "loss": 0.0001, + "reward": 1.7232143804430962, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.727678619325161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7486 + }, + { + "completion_length": 236.20537090301514, + "epoch": 1.255501068779077, + "grad_norm": 0.24092087177097757, + "kl": 0.0662078857421875, + "learning_rate": 4.991500550612176e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8142857551574707, + "rewards/format_reward_func": 1.0, + "step": 7488 + }, + { + "completion_length": 239.6919765472412, + "epoch": 1.2558363720189447, + "grad_norm": 0.27121315571398896, + "kl": 0.0826873779296875, + "learning_rate": 4.991490664521119e-07, + "loss": 0.0001, + "reward": 1.7178572118282318, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 7490 + }, + { + "completion_length": 239.20983028411865, + "epoch": 1.2561716752588121, + "grad_norm": 0.16039280148087817, + "kl": 0.0776519775390625, + "learning_rate": 4.991480772693726e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 7492 + }, + { + "completion_length": 231.55358219146729, + "epoch": 1.2565069784986798, + "grad_norm": 0.25401639410075905, + "kl": 0.0705413818359375, + "learning_rate": 4.99147087513002e-07, + "loss": 0.0001, + "reward": 1.7553571984171867, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7494 + }, + { + "completion_length": 236.26340293884277, + "epoch": 1.2568422817385474, + "grad_norm": 0.31600402330566846, + "kl": 0.071044921875, + "learning_rate": 4.991460971830026e-07, + "loss": 0.0001, + "reward": 1.7303571999073029, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7496 + }, + { + "completion_length": 236.80358123779297, + "epoch": 1.2571775849784148, + "grad_norm": 0.38116439183130013, + "kl": 0.0737762451171875, + "learning_rate": 4.991451062793763e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286148548126, + "rewards/format_reward_func": 1.0, + "step": 7498 + }, + { + "completion_length": 237.85715198516846, + "epoch": 1.2575128882182824, + "grad_norm": 0.2152823007031557, + "kl": 0.0875701904296875, + "learning_rate": 4.991441148021258e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143055647612, + "rewards/format_reward_func": 1.0, + "step": 7500 + }, + { + "completion_length": 239.27233219146729, + "epoch": 1.2578481914581499, + "grad_norm": 0.23772054865269418, + "kl": 0.0738372802734375, + "learning_rate": 4.991431227512531e-07, + "loss": 0.0001, + "reward": 1.7857143133878708, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143320143223, + "rewards/format_reward_func": 1.0, + "step": 7502 + }, + { + "completion_length": 233.86607933044434, + "epoch": 1.2581834946980175, + "grad_norm": 0.16101616603817884, + "kl": 0.11566162109375, + "learning_rate": 4.991421301267604e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 7504 + }, + { + "completion_length": 234.1071548461914, + "epoch": 1.2585187979378851, + "grad_norm": 0.2618933580358117, + "kl": 0.07000732421875, + "learning_rate": 4.991411369286503e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 7506 + }, + { + "completion_length": 238.22768783569336, + "epoch": 1.2588541011777528, + "grad_norm": 0.2759809096611727, + "kl": 0.073516845703125, + "learning_rate": 4.99140143156925e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7508 + }, + { + "completion_length": 229.99107933044434, + "epoch": 1.2591894044176202, + "grad_norm": 0.13394452526505557, + "kl": 0.0980377197265625, + "learning_rate": 4.991391488115866e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 7510 + }, + { + "completion_length": 234.29018688201904, + "epoch": 1.2595247076574878, + "grad_norm": 0.6323557898045636, + "kl": 0.08587646484375, + "learning_rate": 4.991381538926374e-07, + "loss": 0.0001, + "reward": 1.7589286118745804, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928898721933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7512 + }, + { + "completion_length": 229.91965293884277, + "epoch": 1.2598600108973552, + "grad_norm": 0.40135163456317896, + "kl": 0.07708740234375, + "learning_rate": 4.991371584000799e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357354134321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7514 + }, + { + "completion_length": 227.17858028411865, + "epoch": 1.2601953141372229, + "grad_norm": 0.9782226270408743, + "kl": 0.07330322265625, + "learning_rate": 4.991361623339164e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 7516 + }, + { + "completion_length": 232.57143878936768, + "epoch": 1.2605306173770905, + "grad_norm": 0.2562746459472891, + "kl": 0.0767974853515625, + "learning_rate": 4.991351656941489e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 7518 + }, + { + "completion_length": 241.55358219146729, + "epoch": 1.260865920616958, + "grad_norm": 0.2472584451345171, + "kl": 0.0744781494140625, + "learning_rate": 4.9913416848078e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 7520 + }, + { + "completion_length": 234.4107255935669, + "epoch": 1.2612012238568255, + "grad_norm": 0.36177741112164086, + "kl": 0.0704498291015625, + "learning_rate": 4.991331706938118e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 7522 + }, + { + "completion_length": 242.14286708831787, + "epoch": 1.261536527096693, + "grad_norm": 0.2512713998995826, + "kl": 0.073394775390625, + "learning_rate": 4.991321723332467e-07, + "loss": 0.0001, + "reward": 1.73392865806818, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928820490837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7524 + }, + { + "completion_length": 238.54911994934082, + "epoch": 1.2618718303365606, + "grad_norm": 0.23311703994054409, + "kl": 0.072662353515625, + "learning_rate": 4.99131173399087e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7526 + }, + { + "completion_length": 240.94197463989258, + "epoch": 1.2622071335764282, + "grad_norm": 0.23393040111609653, + "kl": 0.07373046875, + "learning_rate": 4.99130173891335e-07, + "loss": 0.0001, + "reward": 1.7928571850061417, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 7528 + }, + { + "completion_length": 238.2544755935669, + "epoch": 1.2625424368162959, + "grad_norm": 0.2596660312043446, + "kl": 0.07232666015625, + "learning_rate": 4.991291738099929e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7530 + }, + { + "completion_length": 227.68304538726807, + "epoch": 1.2628777400561633, + "grad_norm": 0.1490256935867386, + "kl": 0.0784912109375, + "learning_rate": 4.991281731550631e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 7532 + }, + { + "completion_length": 226.21876049041748, + "epoch": 1.263213043296031, + "grad_norm": 0.33889375367672325, + "kl": 0.081512451171875, + "learning_rate": 4.991271719265477e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000204890966, + "rewards/format_reward_func": 1.0, + "step": 7534 + }, + { + "completion_length": 229.75893592834473, + "epoch": 1.2635483465358983, + "grad_norm": 0.19253892513658385, + "kl": 0.07342529296875, + "learning_rate": 4.991261701244494e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7536 + }, + { + "completion_length": 232.67411994934082, + "epoch": 1.263883649775766, + "grad_norm": 0.1851524664373545, + "kl": 0.0771484375, + "learning_rate": 4.991251677487702e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143335044384, + "rewards/format_reward_func": 1.0, + "step": 7538 + }, + { + "completion_length": 228.49108123779297, + "epoch": 1.2642189530156336, + "grad_norm": 0.3447227639574531, + "kl": 0.0768890380859375, + "learning_rate": 4.991241647995126e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428809314966, + "rewards/format_reward_func": 1.0, + "step": 7540 + }, + { + "completion_length": 223.60715293884277, + "epoch": 1.264554256255501, + "grad_norm": 0.21475977305995408, + "kl": 0.079498291015625, + "learning_rate": 4.991231612766786e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 7542 + }, + { + "completion_length": 228.71429538726807, + "epoch": 1.2648895594953686, + "grad_norm": 0.26691773585830647, + "kl": 0.1363372802734375, + "learning_rate": 4.99122157180271e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 7544 + }, + { + "completion_length": 220.34822368621826, + "epoch": 1.265224862735236, + "grad_norm": 0.20614781368504556, + "kl": 0.094970703125, + "learning_rate": 4.991211525102916e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7546 + }, + { + "completion_length": 224.1696538925171, + "epoch": 1.2655601659751037, + "grad_norm": 0.23147580973292636, + "kl": 0.0751190185546875, + "learning_rate": 4.99120147266743e-07, + "loss": 0.0001, + "reward": 1.7125000730156898, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7169643249362707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7548 + }, + { + "completion_length": 221.75447273254395, + "epoch": 1.2658954692149713, + "grad_norm": 0.6172895430411022, + "kl": 0.0866851806640625, + "learning_rate": 4.991191414496274e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 7550 + }, + { + "completion_length": 218.9553680419922, + "epoch": 1.266230772454839, + "grad_norm": 0.24688172383365634, + "kl": 0.11077880859375, + "learning_rate": 4.991181350589473e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 7552 + }, + { + "completion_length": 233.47768878936768, + "epoch": 1.2665660756947064, + "grad_norm": 0.36008920665302907, + "kl": 0.0836181640625, + "learning_rate": 4.991171280947047e-07, + "loss": 0.0001, + "reward": 1.7160715013742447, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7554 + }, + { + "completion_length": 221.57590103149414, + "epoch": 1.266901378934574, + "grad_norm": 0.2618971345873311, + "kl": 0.087188720703125, + "learning_rate": 4.991161205569023e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7419643122702837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7556 + }, + { + "completion_length": 224.26340293884277, + "epoch": 1.2672366821744414, + "grad_norm": 0.331582401421873, + "kl": 0.1146087646484375, + "learning_rate": 4.991151124455422e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 7558 + }, + { + "completion_length": 220.55358219146729, + "epoch": 1.267571985414309, + "grad_norm": 0.25120591195005054, + "kl": 0.08917236328125, + "learning_rate": 4.991141037606266e-07, + "loss": 0.0001, + "reward": 1.7375000938773155, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7560 + }, + { + "completion_length": 212.33483123779297, + "epoch": 1.2679072886541767, + "grad_norm": 0.40357051105146413, + "kl": 0.08935546875, + "learning_rate": 4.991130945021581e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 7562 + }, + { + "completion_length": 219.60268878936768, + "epoch": 1.2682425918940443, + "grad_norm": 0.40444888759563763, + "kl": 0.0786590576171875, + "learning_rate": 4.991120846701388e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7741071581840515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7564 + }, + { + "completion_length": 226.77679538726807, + "epoch": 1.2685778951339117, + "grad_norm": 0.5012331277544384, + "kl": 0.18408203125, + "learning_rate": 4.991110742645712e-07, + "loss": 0.0002, + "reward": 1.75178574770689, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7566 + }, + { + "completion_length": 227.01786708831787, + "epoch": 1.2689131983737794, + "grad_norm": 0.41045920652982976, + "kl": 0.0800323486328125, + "learning_rate": 4.991100632854575e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7169643137603998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7568 + }, + { + "completion_length": 228.75000953674316, + "epoch": 1.2692485016136468, + "grad_norm": 0.41643280522481385, + "kl": 0.0793914794921875, + "learning_rate": 4.991090517328001e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 7570 + }, + { + "completion_length": 214.71875953674316, + "epoch": 1.2695838048535144, + "grad_norm": 1.6521458727305902, + "kl": 0.091827392578125, + "learning_rate": 4.991080396066013e-07, + "loss": 0.0001, + "reward": 1.7375001013278961, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7572 + }, + { + "completion_length": 211.85715103149414, + "epoch": 1.269919108093382, + "grad_norm": 0.7937714569865104, + "kl": 0.311981201171875, + "learning_rate": 4.991070269068633e-07, + "loss": 0.0003, + "reward": 1.7678572088479996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 7574 + }, + { + "completion_length": 220.1384038925171, + "epoch": 1.2702544113332495, + "grad_norm": 2.7509181354526198, + "kl": 0.102203369140625, + "learning_rate": 4.991060136335887e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 7576 + }, + { + "completion_length": 223.99554443359375, + "epoch": 1.270589714573117, + "grad_norm": 0.6848668616946066, + "kl": 0.173553466796875, + "learning_rate": 4.991049997867797e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 7578 + }, + { + "completion_length": 222.53572368621826, + "epoch": 1.2709250178129845, + "grad_norm": 0.6383890615748599, + "kl": 0.078521728515625, + "learning_rate": 4.991039853664386e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 7580 + }, + { + "completion_length": 222.75447463989258, + "epoch": 1.2712603210528521, + "grad_norm": 0.6796629798388669, + "kl": 0.0955047607421875, + "learning_rate": 4.991029703725677e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 7582 + }, + { + "completion_length": 224.05358219146729, + "epoch": 1.2715956242927198, + "grad_norm": 0.6087448935727924, + "kl": 0.0820770263671875, + "learning_rate": 4.991019548051696e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 7584 + }, + { + "completion_length": 222.65179347991943, + "epoch": 1.2719309275325874, + "grad_norm": 0.2702490964335112, + "kl": 0.08392333984375, + "learning_rate": 4.991009386642463e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 7586 + }, + { + "completion_length": 220.91072463989258, + "epoch": 1.2722662307724548, + "grad_norm": 0.9077446820456878, + "kl": 0.0844268798828125, + "learning_rate": 4.990999219498002e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 7588 + }, + { + "completion_length": 222.08929347991943, + "epoch": 1.2726015340123225, + "grad_norm": 0.39540866250941975, + "kl": 0.0833282470703125, + "learning_rate": 4.990989046618338e-07, + "loss": 0.0001, + "reward": 1.803571455180645, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714663565159, + "rewards/format_reward_func": 1.0, + "step": 7590 + }, + { + "completion_length": 220.80804538726807, + "epoch": 1.2729368372521899, + "grad_norm": 1.953295584845841, + "kl": 0.0895843505859375, + "learning_rate": 4.990978868003494e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7592 + }, + { + "completion_length": 219.47768878936768, + "epoch": 1.2732721404920575, + "grad_norm": 0.25660438766797417, + "kl": 0.0877685546875, + "learning_rate": 4.990968683653492e-07, + "loss": 0.0001, + "reward": 1.7678572311997414, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 7594 + }, + { + "completion_length": 221.82590293884277, + "epoch": 1.2736074437319251, + "grad_norm": 0.4354199758652535, + "kl": 0.088165283203125, + "learning_rate": 4.990958493568358e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 7596 + }, + { + "completion_length": 211.41518783569336, + "epoch": 1.2739427469717926, + "grad_norm": 0.20289326916956163, + "kl": 0.0782318115234375, + "learning_rate": 4.990948297748113e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 7598 + }, + { + "completion_length": 218.65179538726807, + "epoch": 1.2742780502116602, + "grad_norm": 0.48082271814419353, + "kl": 0.08978271484375, + "learning_rate": 4.990938096192782e-07, + "loss": 0.0001, + "reward": 1.7165179327130318, + "reward_std": 0.06755394907668233, + "rewards/equation_reward_func": 0.7178571857511997, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7600 + }, + { + "completion_length": 213.29018688201904, + "epoch": 1.2746133534515276, + "grad_norm": 0.44731461791586646, + "kl": 0.09454345703125, + "learning_rate": 4.990927888902387e-07, + "loss": 0.0001, + "reward": 1.767410784959793, + "reward_std": 0.0460882093757391, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9968750029802322, + "step": 7602 + }, + { + "completion_length": 218.48661708831787, + "epoch": 1.2749486566913952, + "grad_norm": 0.4131093057482022, + "kl": 0.0911865234375, + "learning_rate": 4.990917675876953e-07, + "loss": 0.0001, + "reward": 1.6982143893837929, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7026785910129547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7604 + }, + { + "completion_length": 209.18750762939453, + "epoch": 1.2752839599312629, + "grad_norm": 0.5033205739515128, + "kl": 0.077178955078125, + "learning_rate": 4.990907457116503e-07, + "loss": 0.0001, + "reward": 1.7285715341567993, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714522004128, + "rewards/format_reward_func": 1.0, + "step": 7606 + }, + { + "completion_length": 212.22768783569336, + "epoch": 1.2756192631711305, + "grad_norm": 0.4563452985721969, + "kl": 0.07989501953125, + "learning_rate": 4.990897232621061e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571537137032, + "rewards/format_reward_func": 1.0, + "step": 7608 + }, + { + "completion_length": 213.19643878936768, + "epoch": 1.275954566410998, + "grad_norm": 0.5585952840788717, + "kl": 0.07147216796875, + "learning_rate": 4.990887002390649e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7610 + }, + { + "completion_length": 213.9285831451416, + "epoch": 1.2762898696508655, + "grad_norm": 0.32565894244767235, + "kl": 0.0990753173828125, + "learning_rate": 4.990876766425292e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7612 + }, + { + "completion_length": 216.44197368621826, + "epoch": 1.276625172890733, + "grad_norm": 0.2877972666853063, + "kl": 0.0857696533203125, + "learning_rate": 4.990866524725013e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286148548126, + "rewards/format_reward_func": 1.0, + "step": 7614 + }, + { + "completion_length": 208.59822273254395, + "epoch": 1.2769604761306006, + "grad_norm": 3.039888470742808, + "kl": 0.1195831298828125, + "learning_rate": 4.990856277289836e-07, + "loss": 0.0001, + "reward": 1.787500038743019, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7616 + }, + { + "completion_length": 198.33483028411865, + "epoch": 1.2772957793704682, + "grad_norm": 0.004197014084676649, + "kl": 0.0860595703125, + "learning_rate": 4.990846024119785e-07, + "loss": 0.0001, + "reward": 1.8000000789761543, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 7618 + }, + { + "completion_length": 197.69643878936768, + "epoch": 1.2776310826103356, + "grad_norm": 0.23318182594139808, + "kl": 0.092559814453125, + "learning_rate": 4.990835765214882e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7620 + }, + { + "completion_length": 202.15625953674316, + "epoch": 1.2779663858502033, + "grad_norm": 1.2937995948418413, + "kl": 0.087005615234375, + "learning_rate": 4.990825500575152e-07, + "loss": 0.0001, + "reward": 1.7785715088248253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 7622 + }, + { + "completion_length": 196.40626049041748, + "epoch": 1.278301689090071, + "grad_norm": 0.3271859286057177, + "kl": 0.100067138671875, + "learning_rate": 4.990815230200618e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 7624 + }, + { + "completion_length": 192.56697368621826, + "epoch": 1.2786369923299383, + "grad_norm": 0.3072899399621595, + "kl": 0.098114013671875, + "learning_rate": 4.990804954091302e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 7626 + }, + { + "completion_length": 194.65625858306885, + "epoch": 1.278972295569806, + "grad_norm": 0.315932345161278, + "kl": 0.0919036865234375, + "learning_rate": 4.990794672247232e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857596278191, + "rewards/format_reward_func": 1.0, + "step": 7628 + }, + { + "completion_length": 198.43750953674316, + "epoch": 1.2793075988096736, + "grad_norm": 0.0880141059924044, + "kl": 0.0974578857421875, + "learning_rate": 4.990784384668428e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 7630 + }, + { + "completion_length": 188.29465293884277, + "epoch": 1.279642902049541, + "grad_norm": 0.25524708052962836, + "kl": 0.106048583984375, + "learning_rate": 4.990774091354915e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286055415869, + "rewards/format_reward_func": 1.0, + "step": 7632 + }, + { + "completion_length": 187.6160774230957, + "epoch": 1.2799782052894086, + "grad_norm": 0.27881454543754314, + "kl": 0.1068115234375, + "learning_rate": 4.990763792306717e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714525729418, + "rewards/format_reward_func": 1.0, + "step": 7634 + }, + { + "completion_length": 188.61161518096924, + "epoch": 1.280313508529276, + "grad_norm": 0.31344919963315515, + "kl": 0.11016845703125, + "learning_rate": 4.990753487523857e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 7636 + }, + { + "completion_length": 193.87054538726807, + "epoch": 1.2806488117691437, + "grad_norm": 0.2240318405017893, + "kl": 0.0923614501953125, + "learning_rate": 4.990743177006358e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 7638 + }, + { + "completion_length": 194.4017915725708, + "epoch": 1.2809841150090113, + "grad_norm": 0.22470951217114793, + "kl": 0.105865478515625, + "learning_rate": 4.990732860754246e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714697092772, + "rewards/format_reward_func": 1.0, + "step": 7640 + }, + { + "completion_length": 188.56250953674316, + "epoch": 1.281319418248879, + "grad_norm": 0.37580742577286996, + "kl": 0.088165283203125, + "learning_rate": 4.990722538767543e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 7642 + }, + { + "completion_length": 188.35715103149414, + "epoch": 1.2816547214887464, + "grad_norm": 0.1599668296641991, + "kl": 0.10662841796875, + "learning_rate": 4.990712211046273e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 7644 + }, + { + "completion_length": 192.56697368621826, + "epoch": 1.281990024728614, + "grad_norm": 0.31148905908911567, + "kl": 0.083526611328125, + "learning_rate": 4.990701877590461e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 7646 + }, + { + "completion_length": 194.48215293884277, + "epoch": 1.2823253279684814, + "grad_norm": 0.20670813668367558, + "kl": 0.090240478515625, + "learning_rate": 4.990691538400128e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 7648 + }, + { + "completion_length": 188.84822177886963, + "epoch": 1.282660631208349, + "grad_norm": 0.14393064848231346, + "kl": 0.095794677734375, + "learning_rate": 4.990681193475301e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 7650 + }, + { + "completion_length": 190.66072177886963, + "epoch": 1.2829959344482167, + "grad_norm": 0.17232390997388092, + "kl": 0.104400634765625, + "learning_rate": 4.990670842816002e-07, + "loss": 0.0001, + "reward": 1.7486608028411865, + "reward_std": 0.03219861118122935, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7652 + }, + { + "completion_length": 192.68304443359375, + "epoch": 1.283331237688084, + "grad_norm": 0.2523200705211646, + "kl": 0.0912628173828125, + "learning_rate": 4.990660486422256e-07, + "loss": 0.0001, + "reward": 1.8214286118745804, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 7654 + }, + { + "completion_length": 192.41965007781982, + "epoch": 1.2836665409279517, + "grad_norm": 0.09130844397646869, + "kl": 0.083648681640625, + "learning_rate": 4.990650124294085e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 7656 + }, + { + "completion_length": 198.56250953674316, + "epoch": 1.2840018441678192, + "grad_norm": 0.2058536185574342, + "kl": 0.0894012451171875, + "learning_rate": 4.990639756431514e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714663565159, + "rewards/format_reward_func": 1.0, + "step": 7658 + }, + { + "completion_length": 192.67411613464355, + "epoch": 1.2843371474076868, + "grad_norm": 0.4357492926407639, + "kl": 0.0915069580078125, + "learning_rate": 4.990629382834567e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143182307482, + "rewards/format_reward_func": 1.0, + "step": 7660 + }, + { + "completion_length": 199.83482837677002, + "epoch": 1.2846724506475544, + "grad_norm": 0.31513147134974884, + "kl": 0.083404541015625, + "learning_rate": 4.990619003503268e-07, + "loss": 0.0001, + "reward": 1.7250000908970833, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 7662 + }, + { + "completion_length": 194.86161613464355, + "epoch": 1.285007753887422, + "grad_norm": 0.19215721495269542, + "kl": 0.096099853515625, + "learning_rate": 4.990608618437641e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 7664 + }, + { + "completion_length": 198.66072273254395, + "epoch": 1.2853430571272895, + "grad_norm": 0.2876697309370327, + "kl": 0.0840911865234375, + "learning_rate": 4.990598227637708e-07, + "loss": 0.0001, + "reward": 1.6950893551111221, + "reward_std": 0.02714784862473607, + "rewards/equation_reward_func": 0.6964286249130964, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7666 + }, + { + "completion_length": 206.60268688201904, + "epoch": 1.285678360367157, + "grad_norm": 0.2944065370264894, + "kl": 0.0862579345703125, + "learning_rate": 4.990587831103495e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 7668 + }, + { + "completion_length": 210.10715293884277, + "epoch": 1.2860136636070245, + "grad_norm": 0.23531668996266744, + "kl": 0.0872650146484375, + "learning_rate": 4.990577428835026e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7670 + }, + { + "completion_length": 209.81697177886963, + "epoch": 1.2863489668468922, + "grad_norm": 0.31108048300835284, + "kl": 0.0871734619140625, + "learning_rate": 4.990567020832325e-07, + "loss": 0.0001, + "reward": 1.7035715132951736, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7035714574158192, + "rewards/format_reward_func": 1.0, + "step": 7672 + }, + { + "completion_length": 200.08482933044434, + "epoch": 1.2866842700867598, + "grad_norm": 0.30616690330063095, + "kl": 0.07989501953125, + "learning_rate": 4.990556607095414e-07, + "loss": 0.0001, + "reward": 1.8142858073115349, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 7674 + }, + { + "completion_length": 200.36161518096924, + "epoch": 1.2870195733266272, + "grad_norm": 0.31553270795768634, + "kl": 0.0896759033203125, + "learning_rate": 4.990546187624319e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428701281548, + "rewards/format_reward_func": 1.0, + "step": 7676 + }, + { + "completion_length": 207.80357837677002, + "epoch": 1.2873548765664948, + "grad_norm": 0.4448366525953187, + "kl": 0.09698486328125, + "learning_rate": 4.990535762419062e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7678 + }, + { + "completion_length": 207.01340198516846, + "epoch": 1.2876901798063622, + "grad_norm": 0.3822823166526979, + "kl": 0.075531005859375, + "learning_rate": 4.990525331479669e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286092668772, + "rewards/format_reward_func": 1.0, + "step": 7680 + }, + { + "completion_length": 213.92858123779297, + "epoch": 1.2880254830462299, + "grad_norm": 0.1842616328582102, + "kl": 0.0823974609375, + "learning_rate": 4.990514894806164e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 7682 + }, + { + "completion_length": 213.35268783569336, + "epoch": 1.2883607862860975, + "grad_norm": 0.3047119390670931, + "kl": 0.0763092041015625, + "learning_rate": 4.99050445239857e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 7684 + }, + { + "completion_length": 201.67858123779297, + "epoch": 1.2886960895259651, + "grad_norm": 0.24882400891042786, + "kl": 0.0864715576171875, + "learning_rate": 4.990494004256911e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 7686 + }, + { + "completion_length": 210.66518783569336, + "epoch": 1.2890313927658326, + "grad_norm": 0.23940484799827028, + "kl": 0.0796661376953125, + "learning_rate": 4.990483550381211e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 7688 + }, + { + "completion_length": 215.89733123779297, + "epoch": 1.2893666960057002, + "grad_norm": 0.30294980556103956, + "kl": 0.0738983154296875, + "learning_rate": 4.990473090771494e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 7690 + }, + { + "completion_length": 215.60715103149414, + "epoch": 1.2897019992455676, + "grad_norm": 0.07984567628802022, + "kl": 0.0729522705078125, + "learning_rate": 4.990462625427786e-07, + "loss": 0.0001, + "reward": 1.7392857670783997, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 7692 + }, + { + "completion_length": 209.93304538726807, + "epoch": 1.2900373024854352, + "grad_norm": 0.2513657048792789, + "kl": 0.0680999755859375, + "learning_rate": 4.990452154350109e-07, + "loss": 0.0001, + "reward": 1.7843750640749931, + "reward_std": 0.05240166140720248, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7694 + }, + { + "completion_length": 220.35268878936768, + "epoch": 1.2903726057253029, + "grad_norm": 0.23234477772818157, + "kl": 0.0713348388671875, + "learning_rate": 4.990441677538489e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 7696 + }, + { + "completion_length": 209.14733028411865, + "epoch": 1.2907079089651705, + "grad_norm": 0.2776459759810098, + "kl": 0.0708160400390625, + "learning_rate": 4.990431194992946e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 7698 + }, + { + "completion_length": 220.24554443359375, + "epoch": 1.291043212205038, + "grad_norm": 0.3385624563063057, + "kl": 0.0813751220703125, + "learning_rate": 4.99042070671351e-07, + "loss": 0.0001, + "reward": 1.692857250571251, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.6928571835160255, + "rewards/format_reward_func": 1.0, + "step": 7700 + }, + { + "completion_length": 218.83483028411865, + "epoch": 1.2913785154449056, + "grad_norm": 0.22457947375299586, + "kl": 0.0742645263671875, + "learning_rate": 4.9904102127002e-07, + "loss": 0.0001, + "reward": 1.8357143551111221, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.835714302957058, + "rewards/format_reward_func": 1.0, + "step": 7702 + }, + { + "completion_length": 218.10268783569336, + "epoch": 1.291713818684773, + "grad_norm": 0.4024637053056976, + "kl": 0.079010009765625, + "learning_rate": 4.990399712953044e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143349945545, + "rewards/format_reward_func": 1.0, + "step": 7704 + }, + { + "completion_length": 217.05804824829102, + "epoch": 1.2920491219246406, + "grad_norm": 0.15498067836171614, + "kl": 0.0782928466796875, + "learning_rate": 4.990389207472064e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 7706 + }, + { + "completion_length": 222.46875953674316, + "epoch": 1.2923844251645082, + "grad_norm": 0.3098561962327767, + "kl": 0.0785369873046875, + "learning_rate": 4.990378696257284e-07, + "loss": 0.0001, + "reward": 1.7700893506407738, + "reward_std": 0.06250318721868098, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7708 + }, + { + "completion_length": 226.8259038925171, + "epoch": 1.2927197284043757, + "grad_norm": 0.23701639769112526, + "kl": 0.0764312744140625, + "learning_rate": 4.990368179308728e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7710 + }, + { + "completion_length": 217.90625858306885, + "epoch": 1.2930550316442433, + "grad_norm": 0.1698667963121498, + "kl": 0.0675811767578125, + "learning_rate": 4.990357656626423e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 7712 + }, + { + "completion_length": 224.6205472946167, + "epoch": 1.2933903348841107, + "grad_norm": 0.239622530564761, + "kl": 0.0841827392578125, + "learning_rate": 4.990347128210391e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 7714 + }, + { + "completion_length": 233.29911518096924, + "epoch": 1.2937256381239783, + "grad_norm": 0.2877922547255133, + "kl": 0.076263427734375, + "learning_rate": 4.990336594060656e-07, + "loss": 0.0001, + "reward": 1.7160715088248253, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7205357383936644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7716 + }, + { + "completion_length": 237.74108219146729, + "epoch": 1.294060941363846, + "grad_norm": 0.17369064662715267, + "kl": 0.07440185546875, + "learning_rate": 4.990326054177243e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 7718 + }, + { + "completion_length": 234.40625953674316, + "epoch": 1.2943962446037136, + "grad_norm": 0.1456112782303269, + "kl": 0.0776824951171875, + "learning_rate": 4.990315508560176e-07, + "loss": 0.0001, + "reward": 1.696428656578064, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6964286025613546, + "rewards/format_reward_func": 1.0, + "step": 7720 + }, + { + "completion_length": 227.63840293884277, + "epoch": 1.294731547843581, + "grad_norm": 0.2851940854186735, + "kl": 0.07769775390625, + "learning_rate": 4.99030495720948e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 7722 + }, + { + "completion_length": 225.8571548461914, + "epoch": 1.2950668510834487, + "grad_norm": 0.16598813182764588, + "kl": 0.081024169921875, + "learning_rate": 4.990294400125177e-07, + "loss": 0.0001, + "reward": 1.7553571909666061, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7598214577883482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7724 + }, + { + "completion_length": 238.36608219146729, + "epoch": 1.295402154323316, + "grad_norm": 0.29466927347255356, + "kl": 0.079254150390625, + "learning_rate": 4.990283837307294e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 7726 + }, + { + "completion_length": 237.75447463989258, + "epoch": 1.2957374575631837, + "grad_norm": 0.2723778547154022, + "kl": 0.09173583984375, + "learning_rate": 4.990273268755855e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214286159723997, + "rewards/format_reward_func": 1.0, + "step": 7728 + }, + { + "completion_length": 229.79465293884277, + "epoch": 1.2960727608030513, + "grad_norm": 0.23247535891431542, + "kl": 0.0806121826171875, + "learning_rate": 4.990262694470882e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 7730 + }, + { + "completion_length": 233.72768878936768, + "epoch": 1.2964080640429188, + "grad_norm": 0.23696849547676943, + "kl": 0.0730438232421875, + "learning_rate": 4.990252114452403e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 7732 + }, + { + "completion_length": 232.67411708831787, + "epoch": 1.2967433672827864, + "grad_norm": 0.26712344526497833, + "kl": 0.0766448974609375, + "learning_rate": 4.990241528700439e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 7734 + }, + { + "completion_length": 239.3169755935669, + "epoch": 1.2970786705226538, + "grad_norm": 0.3596598026506631, + "kl": 0.077606201171875, + "learning_rate": 4.990230937215016e-07, + "loss": 0.0001, + "reward": 1.6946429312229156, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.6991071812808514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7736 + }, + { + "completion_length": 228.24554634094238, + "epoch": 1.2974139737625214, + "grad_norm": 0.20678090185850923, + "kl": 0.0703277587890625, + "learning_rate": 4.990220339996158e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857428640127, + "rewards/format_reward_func": 1.0, + "step": 7738 + }, + { + "completion_length": 234.41965293884277, + "epoch": 1.297749277002389, + "grad_norm": 0.15463451116909518, + "kl": 0.075775146484375, + "learning_rate": 4.99020973704389e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7455357480794191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7740 + }, + { + "completion_length": 231.96429634094238, + "epoch": 1.2980845802422567, + "grad_norm": 0.2654924669106703, + "kl": 0.0730743408203125, + "learning_rate": 4.990199128358234e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 7742 + }, + { + "completion_length": 229.4196538925171, + "epoch": 1.2984198834821241, + "grad_norm": 0.20582707966686384, + "kl": 0.081146240234375, + "learning_rate": 4.990188513939219e-07, + "loss": 0.0001, + "reward": 1.707142911851406, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071428894996643, + "rewards/format_reward_func": 1.0, + "step": 7744 + }, + { + "completion_length": 219.16518878936768, + "epoch": 1.2987551867219918, + "grad_norm": 0.13035483487149235, + "kl": 0.0706787109375, + "learning_rate": 4.990177893786865e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 7746 + }, + { + "completion_length": 227.17858123779297, + "epoch": 1.2990904899618592, + "grad_norm": 0.2269454033186533, + "kl": 0.0691375732421875, + "learning_rate": 4.990167267901199e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 7748 + }, + { + "completion_length": 229.66965198516846, + "epoch": 1.2994257932017268, + "grad_norm": 0.30071007329818733, + "kl": 0.086517333984375, + "learning_rate": 4.990156636282243e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 7750 + }, + { + "completion_length": 231.19643878936768, + "epoch": 1.2997610964415944, + "grad_norm": 0.2517132251564651, + "kl": 0.1706085205078125, + "learning_rate": 4.990145998930025e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 7752 + }, + { + "completion_length": 223.51786708831787, + "epoch": 1.3000963996814618, + "grad_norm": 0.11299277167844926, + "kl": 0.0755767822265625, + "learning_rate": 4.990135355844567e-07, + "loss": 0.0001, + "reward": 1.8285714760422707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 7754 + }, + { + "completion_length": 225.43304443359375, + "epoch": 1.3004317029213295, + "grad_norm": 0.2308531874026641, + "kl": 0.0781707763671875, + "learning_rate": 4.990124707025894e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 7756 + }, + { + "completion_length": 229.23661613464355, + "epoch": 1.3007670061611971, + "grad_norm": 0.21235278465773202, + "kl": 0.084625244140625, + "learning_rate": 4.990114052474031e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 7758 + }, + { + "completion_length": 230.1116189956665, + "epoch": 1.3011023094010645, + "grad_norm": 0.21202217971475054, + "kl": 0.0723724365234375, + "learning_rate": 4.990103392189002e-07, + "loss": 0.0001, + "reward": 1.807142935693264, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 7760 + }, + { + "completion_length": 226.2678689956665, + "epoch": 1.3014376126409322, + "grad_norm": 0.3171202168947194, + "kl": 0.081817626953125, + "learning_rate": 4.990092726170832e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 7762 + }, + { + "completion_length": 231.81697463989258, + "epoch": 1.3017729158807998, + "grad_norm": 0.07547602806092736, + "kl": 0.079559326171875, + "learning_rate": 4.990082054419544e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.7875000312924385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7764 + }, + { + "completion_length": 221.40179538726807, + "epoch": 1.3021082191206672, + "grad_norm": 0.19411017636005276, + "kl": 0.075775146484375, + "learning_rate": 4.990071376935165e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7766 + }, + { + "completion_length": 222.508939743042, + "epoch": 1.3024435223605348, + "grad_norm": 0.2174186617837793, + "kl": 0.0889739990234375, + "learning_rate": 4.99006069371772e-07, + "loss": 0.0001, + "reward": 1.708928644657135, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7133928798139095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7768 + }, + { + "completion_length": 218.52233123779297, + "epoch": 1.3027788256004023, + "grad_norm": 0.30772843925684945, + "kl": 0.07672119140625, + "learning_rate": 4.990050004767229e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 7770 + }, + { + "completion_length": 224.69197463989258, + "epoch": 1.30311412884027, + "grad_norm": 0.3108382816493444, + "kl": 0.084442138671875, + "learning_rate": 4.990039310083722e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714615136385, + "rewards/format_reward_func": 1.0, + "step": 7772 + }, + { + "completion_length": 228.41965103149414, + "epoch": 1.3034494320801375, + "grad_norm": 0.14404724983058562, + "kl": 0.0753021240234375, + "learning_rate": 4.99002860966722e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 7774 + }, + { + "completion_length": 216.98661518096924, + "epoch": 1.3037847353200052, + "grad_norm": 0.2621768758533317, + "kl": 0.0835113525390625, + "learning_rate": 4.990017903517748e-07, + "loss": 0.0001, + "reward": 1.746428668498993, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 7776 + }, + { + "completion_length": 212.20090103149414, + "epoch": 1.3041200385598726, + "grad_norm": 0.3576231235033641, + "kl": 0.077362060546875, + "learning_rate": 4.990007191635333e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 7778 + }, + { + "completion_length": 217.42858028411865, + "epoch": 1.3044553417997402, + "grad_norm": 0.33123491228549057, + "kl": 0.0748291015625, + "learning_rate": 4.989996474019998e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 7780 + }, + { + "completion_length": 223.04018878936768, + "epoch": 1.3047906450396076, + "grad_norm": 0.2896579109829537, + "kl": 0.0826416015625, + "learning_rate": 4.989985750671768e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000469386578, + "rewards/format_reward_func": 1.0, + "step": 7782 + }, + { + "completion_length": 224.85268878936768, + "epoch": 1.3051259482794753, + "grad_norm": 0.38177411239092823, + "kl": 0.0991973876953125, + "learning_rate": 4.989975021590668e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7784 + }, + { + "completion_length": 217.80358123779297, + "epoch": 1.305461251519343, + "grad_norm": 0.391786171910137, + "kl": 0.09320068359375, + "learning_rate": 4.989964286776721e-07, + "loss": 0.0001, + "reward": 1.7486607655882835, + "reward_std": 0.06250318652018905, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7786 + }, + { + "completion_length": 219.24108219146729, + "epoch": 1.3057965547592103, + "grad_norm": 0.29750057352726367, + "kl": 0.0730133056640625, + "learning_rate": 4.989953546229954e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7788 + }, + { + "completion_length": 228.39733123779297, + "epoch": 1.306131857999078, + "grad_norm": 0.339059597344952, + "kl": 0.06829833984375, + "learning_rate": 4.98994279995039e-07, + "loss": 0.0001, + "reward": 1.8035715073347092, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 7790 + }, + { + "completion_length": 224.32143783569336, + "epoch": 1.3064671612389454, + "grad_norm": 0.17052533501531209, + "kl": 0.0645294189453125, + "learning_rate": 4.989932047938055e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 7792 + }, + { + "completion_length": 232.91965579986572, + "epoch": 1.306802464478813, + "grad_norm": 0.3939521238618866, + "kl": 0.0809478759765625, + "learning_rate": 4.989921290192974e-07, + "loss": 0.0001, + "reward": 1.7982143312692642, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7794 + }, + { + "completion_length": 225.34375953674316, + "epoch": 1.3071377677186806, + "grad_norm": 0.14989733463861954, + "kl": 0.069671630859375, + "learning_rate": 4.98991052671517e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 7796 + }, + { + "completion_length": 224.74107933044434, + "epoch": 1.3074730709585483, + "grad_norm": 0.22708778689946796, + "kl": 0.081268310546875, + "learning_rate": 4.989899757504669e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 7798 + }, + { + "completion_length": 237.43751049041748, + "epoch": 1.3078083741984157, + "grad_norm": 0.2108386163132425, + "kl": 0.1372833251953125, + "learning_rate": 4.989888982561495e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7553571779280901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7800 + }, + { + "completion_length": 234.86161708831787, + "epoch": 1.3081436774382833, + "grad_norm": 0.1809852379315868, + "kl": 0.1102294921875, + "learning_rate": 4.989878201885674e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7375000342726707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7802 + }, + { + "completion_length": 234.46875858306885, + "epoch": 1.3084789806781507, + "grad_norm": 0.2641798629492874, + "kl": 0.0806427001953125, + "learning_rate": 4.98986741547723e-07, + "loss": 0.0001, + "reward": 1.7504464983940125, + "reward_std": 0.09028238197788596, + "rewards/equation_reward_func": 0.7562500443309546, + "rewards/format_reward_func": 0.9941964335739613, + "step": 7804 + }, + { + "completion_length": 229.58483219146729, + "epoch": 1.3088142839180184, + "grad_norm": 0.1348543022563862, + "kl": 0.0857391357421875, + "learning_rate": 4.989856623336188e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 7806 + }, + { + "completion_length": 236.95090103149414, + "epoch": 1.309149587157886, + "grad_norm": 0.2442230067538243, + "kl": 0.08929443359375, + "learning_rate": 4.989845825462574e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7808 + }, + { + "completion_length": 239.0134048461914, + "epoch": 1.3094848903977534, + "grad_norm": 0.11169604253691609, + "kl": 0.089385986328125, + "learning_rate": 4.98983502185641e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964285872876644, + "rewards/format_reward_func": 1.0, + "step": 7810 + }, + { + "completion_length": 237.43304634094238, + "epoch": 1.309820193637621, + "grad_norm": 0.33155649606336196, + "kl": 0.0821075439453125, + "learning_rate": 4.989824212517724e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7580357417464256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7812 + }, + { + "completion_length": 242.7500123977661, + "epoch": 1.3101554968774884, + "grad_norm": 0.2148534984579552, + "kl": 0.0830078125, + "learning_rate": 4.989813397446538e-07, + "loss": 0.0001, + "reward": 1.782589353621006, + "reward_std": 0.039774756878614426, + "rewards/equation_reward_func": 0.797321442514658, + "rewards/format_reward_func": 0.9852678626775742, + "step": 7814 + }, + { + "completion_length": 236.7142972946167, + "epoch": 1.310490800117356, + "grad_norm": 0.2801228926143477, + "kl": 0.090118408203125, + "learning_rate": 4.98980257664288e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7816 + }, + { + "completion_length": 237.13394165039062, + "epoch": 1.3108261033572237, + "grad_norm": 0.21059345642327687, + "kl": 0.1118316650390625, + "learning_rate": 4.989791750106772e-07, + "loss": 0.0001, + "reward": 1.7325893640518188, + "reward_std": 0.05492704175412655, + "rewards/equation_reward_func": 0.7383928783237934, + "rewards/format_reward_func": 0.994196429848671, + "step": 7818 + }, + { + "completion_length": 240.52679634094238, + "epoch": 1.3111614065970914, + "grad_norm": 0.22267002073074174, + "kl": 0.0712890625, + "learning_rate": 4.989780917838241e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786126196384, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7820 + }, + { + "completion_length": 231.9375114440918, + "epoch": 1.3114967098369588, + "grad_norm": 0.29483440744225925, + "kl": 0.0724029541015625, + "learning_rate": 4.989770079837311e-07, + "loss": 0.0001, + "reward": 1.7767857536673546, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7822 + }, + { + "completion_length": 228.0134048461914, + "epoch": 1.3118320130768264, + "grad_norm": 0.18827096899708526, + "kl": 0.111663818359375, + "learning_rate": 4.989759236104007e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 7824 + }, + { + "completion_length": 228.86608219146729, + "epoch": 1.3121673163166938, + "grad_norm": 0.2393960966488759, + "kl": 0.08990478515625, + "learning_rate": 4.989748386638354e-07, + "loss": 0.0001, + "reward": 1.7339286357164383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928969502449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7826 + }, + { + "completion_length": 234.2901906967163, + "epoch": 1.3125026195565614, + "grad_norm": 0.3691026151200785, + "kl": 0.083038330078125, + "learning_rate": 4.989737531440378e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.138895976357162, + "rewards/equation_reward_func": 0.8044643178582191, + "rewards/format_reward_func": 0.9687500149011612, + "step": 7828 + }, + { + "completion_length": 225.30804538726807, + "epoch": 1.312837922796429, + "grad_norm": 0.17235853000671505, + "kl": 0.159576416015625, + "learning_rate": 4.989726670510102e-07, + "loss": 0.0002, + "reward": 1.7660714834928513, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7830 + }, + { + "completion_length": 223.16518783569336, + "epoch": 1.3131732260362965, + "grad_norm": 0.20867123219482434, + "kl": 0.0762176513671875, + "learning_rate": 4.989715803847553e-07, + "loss": 0.0001, + "reward": 1.828571505844593, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8285714462399483, + "rewards/format_reward_func": 1.0, + "step": 7832 + }, + { + "completion_length": 233.1919755935669, + "epoch": 1.3135085292761641, + "grad_norm": 0.24279714655234397, + "kl": 0.09161376953125, + "learning_rate": 4.989704931452754e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071429058909416, + "rewards/format_reward_func": 1.0, + "step": 7834 + }, + { + "completion_length": 231.41072368621826, + "epoch": 1.3138438325160318, + "grad_norm": 0.16989311883932803, + "kl": 0.1334381103515625, + "learning_rate": 4.989694053325732e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714540630579, + "rewards/format_reward_func": 1.0, + "step": 7836 + }, + { + "completion_length": 230.12054634094238, + "epoch": 1.3141791357558992, + "grad_norm": 0.26715548521941174, + "kl": 0.0696868896484375, + "learning_rate": 4.989683169466511e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.8116071671247482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7838 + }, + { + "completion_length": 231.8928689956665, + "epoch": 1.3145144389957668, + "grad_norm": 0.269292185061531, + "kl": 0.0734405517578125, + "learning_rate": 4.989672279875116e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7866071686148643, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7840 + }, + { + "completion_length": 235.4910831451416, + "epoch": 1.3148497422356344, + "grad_norm": 0.20359318194734094, + "kl": 0.06927490234375, + "learning_rate": 4.989661384551573e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 7842 + }, + { + "completion_length": 236.55358505249023, + "epoch": 1.3151850454755019, + "grad_norm": 0.5545695180093116, + "kl": 0.2422637939453125, + "learning_rate": 4.989650483495906e-07, + "loss": 0.0002, + "reward": 1.7660715132951736, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7844 + }, + { + "completion_length": 223.20983123779297, + "epoch": 1.3155203487153695, + "grad_norm": 0.5333929645774035, + "kl": 0.2826690673828125, + "learning_rate": 4.98963957670814e-07, + "loss": 0.0003, + "reward": 1.751785770058632, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500424683094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7846 + }, + { + "completion_length": 225.40625953674316, + "epoch": 1.315855651955237, + "grad_norm": 0.20162221324994153, + "kl": 0.5379486083984375, + "learning_rate": 4.9896286641883e-07, + "loss": 0.0005, + "reward": 1.7571429386734962, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7660714499652386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7848 + }, + { + "completion_length": 240.8705472946167, + "epoch": 1.3161909551951045, + "grad_norm": 0.5158776214210989, + "kl": 0.2147674560546875, + "learning_rate": 4.989617745936413e-07, + "loss": 0.0002, + "reward": 1.6625000685453415, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.6848214585334063, + "rewards/format_reward_func": 0.977678582072258, + "step": 7850 + }, + { + "completion_length": 229.63840293884277, + "epoch": 1.3165262584349722, + "grad_norm": 0.3106276145668767, + "kl": 0.5753936767578125, + "learning_rate": 4.989606821952503e-07, + "loss": 0.0006, + "reward": 1.7892857566475868, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.798214316368103, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7852 + }, + { + "completion_length": 231.4866180419922, + "epoch": 1.3168615616748398, + "grad_norm": 0.20367040132891345, + "kl": 0.349151611328125, + "learning_rate": 4.989595892236594e-07, + "loss": 0.0003, + "reward": 1.7500000670552254, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7589286006987095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7854 + }, + { + "completion_length": 246.17411518096924, + "epoch": 1.3171968649147072, + "grad_norm": 0.36064056384011967, + "kl": 0.149871826171875, + "learning_rate": 4.989584956788713e-07, + "loss": 0.0001, + "reward": 1.6415179520845413, + "reward_std": 0.10796005232259631, + "rewards/equation_reward_func": 0.6785714626312256, + "rewards/format_reward_func": 0.9629464484751225, + "step": 7856 + }, + { + "completion_length": 232.4509038925171, + "epoch": 1.3175321681545749, + "grad_norm": 0.27023069215816137, + "kl": 0.454803466796875, + "learning_rate": 4.989574015608883e-07, + "loss": 0.0005, + "reward": 1.6946429386734962, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7169643137603998, + "rewards/format_reward_func": 0.9776785783469677, + "step": 7858 + }, + { + "completion_length": 233.8303680419922, + "epoch": 1.3178674713944423, + "grad_norm": 0.3203622502088371, + "kl": 0.268585205078125, + "learning_rate": 4.989563068697133e-07, + "loss": 0.0003, + "reward": 1.7535715028643608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 7860 + }, + { + "completion_length": 230.17411613464355, + "epoch": 1.31820277463431, + "grad_norm": 0.29306740052637414, + "kl": 0.418975830078125, + "learning_rate": 4.989552116053485e-07, + "loss": 0.0004, + "reward": 1.7821429297327995, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7862 + }, + { + "completion_length": 230.42858123779297, + "epoch": 1.3185380778741775, + "grad_norm": 0.2370092201280241, + "kl": 0.0861053466796875, + "learning_rate": 4.989541157677964e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7723214514553547, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7864 + }, + { + "completion_length": 227.3750114440918, + "epoch": 1.318873381114045, + "grad_norm": 0.26860458742969134, + "kl": 0.0887451171875, + "learning_rate": 4.989530193570596e-07, + "loss": 0.0001, + "reward": 1.7214286774396896, + "reward_std": 0.1010152529925108, + "rewards/equation_reward_func": 0.7303571738302708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7866 + }, + { + "completion_length": 216.43304634094238, + "epoch": 1.3192086843539126, + "grad_norm": 0.2723609670431003, + "kl": 0.082366943359375, + "learning_rate": 4.989519223731408e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 7868 + }, + { + "completion_length": 218.71876049041748, + "epoch": 1.31954398759378, + "grad_norm": 0.38728502112057167, + "kl": 0.2922515869140625, + "learning_rate": 4.989508248160423e-07, + "loss": 0.0003, + "reward": 1.78035718947649, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7870 + }, + { + "completion_length": 219.24108123779297, + "epoch": 1.3198792908336476, + "grad_norm": 0.27127451546841486, + "kl": 0.170257568359375, + "learning_rate": 4.989497266857666e-07, + "loss": 0.0002, + "reward": 1.730357214808464, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214574158192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7872 + }, + { + "completion_length": 219.01340198516846, + "epoch": 1.3202145940735153, + "grad_norm": 0.283198873620819, + "kl": 0.092987060546875, + "learning_rate": 4.989486279823164e-07, + "loss": 0.0001, + "reward": 1.7946429327130318, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7874 + }, + { + "completion_length": 222.977689743042, + "epoch": 1.320549897313383, + "grad_norm": 0.29053446930371934, + "kl": 0.107940673828125, + "learning_rate": 4.989475287056941e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 7876 + }, + { + "completion_length": 221.5089406967163, + "epoch": 1.3208852005532503, + "grad_norm": 0.31560908109020885, + "kl": 0.1349945068359375, + "learning_rate": 4.989464288559024e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 7878 + }, + { + "completion_length": 208.75000953674316, + "epoch": 1.321220503793118, + "grad_norm": 0.23600630732348715, + "kl": 0.0809326171875, + "learning_rate": 4.989453284329436e-07, + "loss": 0.0001, + "reward": 1.7379464954137802, + "reward_std": 0.03724937466904521, + "rewards/equation_reward_func": 0.7392857316881418, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7880 + }, + { + "completion_length": 216.7232265472412, + "epoch": 1.3215558070329854, + "grad_norm": 0.3341347334253455, + "kl": 0.1131591796875, + "learning_rate": 4.989442274368204e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214693367481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7882 + }, + { + "completion_length": 215.28125953674316, + "epoch": 1.321891110272853, + "grad_norm": 0.1194486987207903, + "kl": 0.1649322509765625, + "learning_rate": 4.989431258675353e-07, + "loss": 0.0002, + "reward": 1.7571429088711739, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 7884 + }, + { + "completion_length": 213.87054538726807, + "epoch": 1.3222264135127206, + "grad_norm": 0.16692747952837317, + "kl": 0.9911956787109375, + "learning_rate": 4.989420237250909e-07, + "loss": 0.001, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857398837805, + "rewards/format_reward_func": 1.0, + "step": 7886 + }, + { + "completion_length": 208.29911613464355, + "epoch": 1.322561716752588, + "grad_norm": 0.43172098055364655, + "kl": 0.154571533203125, + "learning_rate": 4.989409210094895e-07, + "loss": 0.0002, + "reward": 1.798214353621006, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7888 + }, + { + "completion_length": 211.33036518096924, + "epoch": 1.3228970199924557, + "grad_norm": 0.33694847774059483, + "kl": 0.096923828125, + "learning_rate": 4.989398177207339e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7321428749710321, + "rewards/format_reward_func": 1.0, + "step": 7890 + }, + { + "completion_length": 211.00893783569336, + "epoch": 1.3232323232323233, + "grad_norm": 0.29598731413356466, + "kl": 0.2599945068359375, + "learning_rate": 4.989387138588265e-07, + "loss": 0.0003, + "reward": 1.8017857521772385, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.806250024586916, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7892 + }, + { + "completion_length": 220.1741180419922, + "epoch": 1.3235676264721907, + "grad_norm": 0.2904979857112134, + "kl": 0.0965576171875, + "learning_rate": 4.989376094237699e-07, + "loss": 0.0001, + "reward": 1.717857226729393, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857305705547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7894 + }, + { + "completion_length": 220.26786613464355, + "epoch": 1.3239029297120584, + "grad_norm": 0.27770757884590075, + "kl": 0.145721435546875, + "learning_rate": 4.989365044155667e-07, + "loss": 0.0001, + "reward": 1.6714286506175995, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.6803571786731482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7896 + }, + { + "completion_length": 217.25447368621826, + "epoch": 1.324238232951926, + "grad_norm": 0.32922269503566337, + "kl": 0.196502685546875, + "learning_rate": 4.989353988342192e-07, + "loss": 0.0002, + "reward": 1.7392857894301414, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7898 + }, + { + "completion_length": 223.9107265472412, + "epoch": 1.3245735361917934, + "grad_norm": 0.998183898998436, + "kl": 0.1063232421875, + "learning_rate": 4.989342926797303e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7794643230736256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7900 + }, + { + "completion_length": 226.12054538726807, + "epoch": 1.324908839431661, + "grad_norm": 0.1907288726707059, + "kl": 0.097930908203125, + "learning_rate": 4.989331859521022e-07, + "loss": 0.0001, + "reward": 1.7178572043776512, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7267857529222965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7902 + }, + { + "completion_length": 224.27233219146729, + "epoch": 1.3252441426715285, + "grad_norm": 2.421709001433735, + "kl": 0.112579345703125, + "learning_rate": 4.989320786513376e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.740178607404232, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7904 + }, + { + "completion_length": 226.09822463989258, + "epoch": 1.325579445911396, + "grad_norm": 1.1546552639652499, + "kl": 0.12115478515625, + "learning_rate": 4.989309707774391e-07, + "loss": 0.0001, + "reward": 1.769642896950245, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7830357477068901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7906 + }, + { + "completion_length": 221.54018878936768, + "epoch": 1.3259147491512637, + "grad_norm": 0.2796337690765062, + "kl": 0.14959716796875, + "learning_rate": 4.989298623304094e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 7908 + }, + { + "completion_length": 210.87501049041748, + "epoch": 1.3262500523911314, + "grad_norm": 0.3019066657629454, + "kl": 0.1559600830078125, + "learning_rate": 4.989287533102506e-07, + "loss": 0.0002, + "reward": 1.8178572058677673, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 7910 + }, + { + "completion_length": 220.26340103149414, + "epoch": 1.3265853556309988, + "grad_norm": 0.34559487640089015, + "kl": 0.151519775390625, + "learning_rate": 4.989276437169656e-07, + "loss": 0.0002, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 7912 + }, + { + "completion_length": 213.41518688201904, + "epoch": 1.3269206588708664, + "grad_norm": 0.292567252173144, + "kl": 0.08819580078125, + "learning_rate": 4.989265335505569e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 7914 + }, + { + "completion_length": 208.04465103149414, + "epoch": 1.3272559621107338, + "grad_norm": 0.3393569470550142, + "kl": 0.0819091796875, + "learning_rate": 4.989254228110271e-07, + "loss": 0.0001, + "reward": 1.7714286595582962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 7916 + }, + { + "completion_length": 214.92858028411865, + "epoch": 1.3275912653506015, + "grad_norm": 0.0855442589689345, + "kl": 0.0947113037109375, + "learning_rate": 4.989243114983785e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857551574707, + "rewards/format_reward_func": 1.0, + "step": 7918 + }, + { + "completion_length": 214.98661708831787, + "epoch": 1.327926568590469, + "grad_norm": 0.23092207006843854, + "kl": 0.0896759033203125, + "learning_rate": 4.98923199612614e-07, + "loss": 0.0001, + "reward": 1.7267857939004898, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7401785831898451, + "rewards/format_reward_func": 0.9866071492433548, + "step": 7920 + }, + { + "completion_length": 211.44197273254395, + "epoch": 1.3282618718303365, + "grad_norm": 0.2068555217674292, + "kl": 0.0815582275390625, + "learning_rate": 4.989220871537359e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 7922 + }, + { + "completion_length": 212.82590198516846, + "epoch": 1.3285971750702041, + "grad_norm": 0.28652744944235065, + "kl": 0.0796966552734375, + "learning_rate": 4.98920974121747e-07, + "loss": 0.0001, + "reward": 1.8321429044008255, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8321428708732128, + "rewards/format_reward_func": 1.0, + "step": 7924 + }, + { + "completion_length": 206.47768688201904, + "epoch": 1.3289324783100716, + "grad_norm": 0.29701924863495033, + "kl": 0.09295654296875, + "learning_rate": 4.989198605166495e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 7926 + }, + { + "completion_length": 210.23215103149414, + "epoch": 1.3292677815499392, + "grad_norm": 0.39862120982234434, + "kl": 0.09503173828125, + "learning_rate": 4.989187463384463e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 7928 + }, + { + "completion_length": 223.2232255935669, + "epoch": 1.3296030847898068, + "grad_norm": 0.3389688200507532, + "kl": 0.0911407470703125, + "learning_rate": 4.989176315871398e-07, + "loss": 0.0001, + "reward": 1.74821437895298, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7930 + }, + { + "completion_length": 211.71429538726807, + "epoch": 1.3299383880296745, + "grad_norm": 0.30581841187049746, + "kl": 0.0901336669921875, + "learning_rate": 4.989165162627328e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 7932 + }, + { + "completion_length": 223.56251049041748, + "epoch": 1.3302736912695419, + "grad_norm": 0.3911307845060985, + "kl": 0.0928497314453125, + "learning_rate": 4.989154003652275e-07, + "loss": 0.0001, + "reward": 1.7928572222590446, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 7934 + }, + { + "completion_length": 214.97322463989258, + "epoch": 1.3306089945094095, + "grad_norm": 0.3465375626222913, + "kl": 0.102874755859375, + "learning_rate": 4.989142838946267e-07, + "loss": 0.0001, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 7936 + }, + { + "completion_length": 215.17411708831787, + "epoch": 1.330944297749277, + "grad_norm": 0.2738688404966579, + "kl": 0.0842132568359375, + "learning_rate": 4.98913166850933e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571589291096, + "rewards/format_reward_func": 1.0, + "step": 7938 + }, + { + "completion_length": 220.07590293884277, + "epoch": 1.3312796009891446, + "grad_norm": 0.20492354543887023, + "kl": 0.379638671875, + "learning_rate": 4.989120492341489e-07, + "loss": 0.0004, + "reward": 1.76071435213089, + "reward_std": 0.05555838719010353, + "rewards/equation_reward_func": 0.7607142999768257, + "rewards/format_reward_func": 1.0, + "step": 7940 + }, + { + "completion_length": 215.39733123779297, + "epoch": 1.3316149042290122, + "grad_norm": 0.30510096492084604, + "kl": 0.0955047607421875, + "learning_rate": 4.989109310442769e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 7942 + }, + { + "completion_length": 218.67858028411865, + "epoch": 1.3319502074688796, + "grad_norm": 0.34981667176896825, + "kl": 6.00408935546875, + "learning_rate": 4.989098122813197e-07, + "loss": 0.006, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 7944 + }, + { + "completion_length": 206.76786708831787, + "epoch": 1.3322855107087472, + "grad_norm": 0.5514828094624111, + "kl": 0.338348388671875, + "learning_rate": 4.989086929452797e-07, + "loss": 0.0003, + "reward": 1.7678572088479996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.767857177183032, + "rewards/format_reward_func": 1.0, + "step": 7946 + }, + { + "completion_length": 218.44197273254395, + "epoch": 1.3326208139486146, + "grad_norm": 0.21252407205890636, + "kl": 0.2454071044921875, + "learning_rate": 4.989075730361598e-07, + "loss": 0.0002, + "reward": 1.7785714715719223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 7948 + }, + { + "completion_length": 212.97768688201904, + "epoch": 1.3329561171884823, + "grad_norm": 0.24461102951428326, + "kl": 0.1788330078125, + "learning_rate": 4.989064525539622e-07, + "loss": 0.0002, + "reward": 1.7928571850061417, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571719676256, + "rewards/format_reward_func": 1.0, + "step": 7950 + }, + { + "completion_length": 213.28126049041748, + "epoch": 1.33329142042835, + "grad_norm": 0.25165262044320724, + "kl": 0.157684326171875, + "learning_rate": 4.989053314986898e-07, + "loss": 0.0002, + "reward": 1.7964286357164383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 7952 + }, + { + "completion_length": 218.63840007781982, + "epoch": 1.3336267236682176, + "grad_norm": 0.3567360154219791, + "kl": 0.103912353515625, + "learning_rate": 4.989042098703449e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 7954 + }, + { + "completion_length": 222.22768783569336, + "epoch": 1.333962026908085, + "grad_norm": 0.251345285427576, + "kl": 0.21942138671875, + "learning_rate": 4.989030876689303e-07, + "loss": 0.0002, + "reward": 1.7892857864499092, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 7956 + }, + { + "completion_length": 233.54465293884277, + "epoch": 1.3342973301479526, + "grad_norm": 0.3240709202095466, + "kl": 0.26336669921875, + "learning_rate": 4.989019648944486e-07, + "loss": 0.0003, + "reward": 1.739285796880722, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 7958 + }, + { + "completion_length": 227.4107255935669, + "epoch": 1.33463263338782, + "grad_norm": 0.3387491696363639, + "kl": 0.09735107421875, + "learning_rate": 4.989008415469022e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8178571630269289, + "rewards/format_reward_func": 1.0, + "step": 7960 + }, + { + "completion_length": 239.0803689956665, + "epoch": 1.3349679366276876, + "grad_norm": 0.3606421949515841, + "kl": 0.183502197265625, + "learning_rate": 4.988997176262937e-07, + "loss": 0.0002, + "reward": 1.7642858028411865, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857320606709, + "rewards/format_reward_func": 1.0, + "step": 7962 + }, + { + "completion_length": 229.78126049041748, + "epoch": 1.3353032398675553, + "grad_norm": 0.16454299451021592, + "kl": 0.39007568359375, + "learning_rate": 4.988985931326259e-07, + "loss": 0.0004, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 7964 + }, + { + "completion_length": 231.1250114440918, + "epoch": 1.3356385431074227, + "grad_norm": 0.4889025325932433, + "kl": 0.591400146484375, + "learning_rate": 4.988974680659012e-07, + "loss": 0.0006, + "reward": 1.7964286357164383, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 7966 + }, + { + "completion_length": 232.08483409881592, + "epoch": 1.3359738463472903, + "grad_norm": 0.40254854105558957, + "kl": 0.818389892578125, + "learning_rate": 4.988963424261221e-07, + "loss": 0.0008, + "reward": 1.772321529686451, + "reward_std": 0.06944798585027456, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 0.9973214343190193, + "step": 7968 + }, + { + "completion_length": 230.89733600616455, + "epoch": 1.336309149587158, + "grad_norm": 0.43390298939426425, + "kl": 1.7117919921875, + "learning_rate": 4.988952162132916e-07, + "loss": 0.0017, + "reward": 1.779464341700077, + "reward_std": 0.06944798701442778, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 0.9973214343190193, + "step": 7970 + }, + { + "completion_length": 222.60715293884277, + "epoch": 1.3366444528270254, + "grad_norm": 0.2671454665293245, + "kl": 0.3524322509765625, + "learning_rate": 4.988940894274118e-07, + "loss": 0.0004, + "reward": 1.81428574770689, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 7972 + }, + { + "completion_length": 231.09375953674316, + "epoch": 1.336979756066893, + "grad_norm": 0.20695948130143144, + "kl": 0.834014892578125, + "learning_rate": 4.988929620684857e-07, + "loss": 0.0008, + "reward": 1.7558036223053932, + "reward_std": 0.032198611879721284, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7974 + }, + { + "completion_length": 231.2187623977661, + "epoch": 1.3373150593067606, + "grad_norm": 0.25623959963115334, + "kl": 0.179534912109375, + "learning_rate": 4.988918341365156e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7553571574389935, + "rewards/format_reward_func": 0.9910714328289032, + "step": 7976 + }, + { + "completion_length": 232.86161708831787, + "epoch": 1.337650362546628, + "grad_norm": 0.07860704228809769, + "kl": 0.182830810546875, + "learning_rate": 4.988907056315043e-07, + "loss": 0.0002, + "reward": 1.7272322252392769, + "reward_std": 0.03219861118122935, + "rewards/equation_reward_func": 0.7285714615136385, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7978 + }, + { + "completion_length": 241.88394165039062, + "epoch": 1.3379856657864957, + "grad_norm": 0.3020735349351519, + "kl": 0.331207275390625, + "learning_rate": 4.988895765534542e-07, + "loss": 0.0003, + "reward": 1.7017857879400253, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7062500528991222, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7980 + }, + { + "completion_length": 235.10268878936768, + "epoch": 1.338320969026363, + "grad_norm": 1.859077809209031, + "kl": 0.191925048828125, + "learning_rate": 4.98888446902368e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 7982 + }, + { + "completion_length": 242.65180015563965, + "epoch": 1.3386562722662307, + "grad_norm": 0.3395116480501293, + "kl": 0.2183990478515625, + "learning_rate": 4.988873166782484e-07, + "loss": 0.0002, + "reward": 1.7700893580913544, + "reward_std": 0.07260471256449819, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 0.9986607171595097, + "step": 7984 + }, + { + "completion_length": 244.06697463989258, + "epoch": 1.3389915755060984, + "grad_norm": 0.30610268249495964, + "kl": 0.224273681640625, + "learning_rate": 4.98886185881098e-07, + "loss": 0.0002, + "reward": 1.7392858117818832, + "reward_std": 0.09596448857337236, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 7986 + }, + { + "completion_length": 248.25001049041748, + "epoch": 1.339326878745966, + "grad_norm": 0.23843987852462686, + "kl": 0.1561737060546875, + "learning_rate": 4.988850545109193e-07, + "loss": 0.0002, + "reward": 1.7357143461704254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 7988 + }, + { + "completion_length": 247.65180015563965, + "epoch": 1.3396621819858334, + "grad_norm": 0.2668127447982816, + "kl": 0.113250732421875, + "learning_rate": 4.988839225677147e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7990 + }, + { + "completion_length": 243.83483123779297, + "epoch": 1.339997485225701, + "grad_norm": 0.17925726284968613, + "kl": 0.08819580078125, + "learning_rate": 4.988827900514873e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7992 + }, + { + "completion_length": 238.2321548461914, + "epoch": 1.3403327884655685, + "grad_norm": 0.17101341252068045, + "kl": 0.1177520751953125, + "learning_rate": 4.988816569622392e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 7994 + }, + { + "completion_length": 232.06697463989258, + "epoch": 1.340668091705436, + "grad_norm": 0.22431370698058403, + "kl": 0.083343505859375, + "learning_rate": 4.988805232999735e-07, + "loss": 0.0001, + "reward": 1.8160714730620384, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8205357417464256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 7996 + }, + { + "completion_length": 240.4375123977661, + "epoch": 1.3410033949453037, + "grad_norm": 0.2647862614895852, + "kl": 0.082122802734375, + "learning_rate": 4.988793890646924e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857640981674, + "rewards/format_reward_func": 1.0, + "step": 7998 + }, + { + "completion_length": 235.17858123779297, + "epoch": 1.3413386981851712, + "grad_norm": 0.18667197312641198, + "kl": 0.0843658447265625, + "learning_rate": 4.988782542563988e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 8000 + }, + { + "completion_length": 243.3928689956665, + "epoch": 1.3416740014250388, + "grad_norm": 0.30499551404800335, + "kl": 0.139678955078125, + "learning_rate": 4.988771188750949e-07, + "loss": 0.0001, + "reward": 1.835714377462864, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8357142992317677, + "rewards/format_reward_func": 1.0, + "step": 8002 + }, + { + "completion_length": 248.0000123977661, + "epoch": 1.3420093046649062, + "grad_norm": 0.2519452088794451, + "kl": 0.1127777099609375, + "learning_rate": 4.988759829207839e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 8004 + }, + { + "completion_length": 234.8169755935669, + "epoch": 1.3423446079047738, + "grad_norm": 0.2282132101143147, + "kl": 0.1110076904296875, + "learning_rate": 4.98874846393468e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 8006 + }, + { + "completion_length": 240.383939743042, + "epoch": 1.3426799111446415, + "grad_norm": 0.3600678021417168, + "kl": 0.17425537109375, + "learning_rate": 4.988737092931499e-07, + "loss": 0.0002, + "reward": 1.7232143729925156, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8008 + }, + { + "completion_length": 247.48662090301514, + "epoch": 1.343015214384509, + "grad_norm": 0.3601856091328208, + "kl": 0.108642578125, + "learning_rate": 4.988725716198322e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.09091372601687908, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 8010 + }, + { + "completion_length": 234.50447368621826, + "epoch": 1.3433505176243765, + "grad_norm": 0.30609978360331364, + "kl": 0.09759521484375, + "learning_rate": 4.988714333735176e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 8012 + }, + { + "completion_length": 229.9419765472412, + "epoch": 1.3436858208642442, + "grad_norm": 0.12760156152712052, + "kl": 0.085052490234375, + "learning_rate": 4.988702945542088e-07, + "loss": 0.0001, + "reward": 1.8321429193019867, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8321428783237934, + "rewards/format_reward_func": 1.0, + "step": 8014 + }, + { + "completion_length": 243.6428680419922, + "epoch": 1.3440211241041116, + "grad_norm": 0.39674180978928336, + "kl": 0.156829833984375, + "learning_rate": 4.988691551619081e-07, + "loss": 0.0002, + "reward": 1.7517857775092125, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7562500461935997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8016 + }, + { + "completion_length": 233.2544755935669, + "epoch": 1.3443564273439792, + "grad_norm": 0.16498579602363317, + "kl": 0.084625244140625, + "learning_rate": 4.988680151966186e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 8018 + }, + { + "completion_length": 240.07590293884277, + "epoch": 1.3446917305838468, + "grad_norm": 0.26198383378203854, + "kl": 0.0823822021484375, + "learning_rate": 4.988668746583424e-07, + "loss": 0.0001, + "reward": 1.7517857626080513, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8020 + }, + { + "completion_length": 243.4553680419922, + "epoch": 1.3450270338237142, + "grad_norm": 0.27333683560494554, + "kl": 0.0854644775390625, + "learning_rate": 4.988657335470826e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 8022 + }, + { + "completion_length": 246.2901906967163, + "epoch": 1.3453623370635819, + "grad_norm": 0.43851686807388973, + "kl": 0.092041015625, + "learning_rate": 4.988645918628414e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 8024 + }, + { + "completion_length": 235.3928680419922, + "epoch": 1.3456976403034493, + "grad_norm": 0.30842295436356104, + "kl": 0.08282470703125, + "learning_rate": 4.988634496056218e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8026 + }, + { + "completion_length": 238.07590293884277, + "epoch": 1.346032943543317, + "grad_norm": 0.12391568944034573, + "kl": 0.0708465576171875, + "learning_rate": 4.988623067754262e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 8028 + }, + { + "completion_length": 243.37947273254395, + "epoch": 1.3463682467831846, + "grad_norm": 0.18952970799683763, + "kl": 0.0840911865234375, + "learning_rate": 4.988611633722573e-07, + "loss": 0.0001, + "reward": 1.7357143834233284, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 8030 + }, + { + "completion_length": 247.5401906967163, + "epoch": 1.3467035500230522, + "grad_norm": 0.22351549042268146, + "kl": 0.08148193359375, + "learning_rate": 4.988600193961177e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 8032 + }, + { + "completion_length": 243.38393688201904, + "epoch": 1.3470388532629196, + "grad_norm": 0.37835809624396216, + "kl": 0.085357666015625, + "learning_rate": 4.988588748470101e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 8034 + }, + { + "completion_length": 238.62054634094238, + "epoch": 1.3473741565027872, + "grad_norm": 0.23108849325790354, + "kl": 0.08673095703125, + "learning_rate": 4.988577297249371e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428772062063, + "rewards/format_reward_func": 1.0, + "step": 8036 + }, + { + "completion_length": 245.7053680419922, + "epoch": 1.3477094597426547, + "grad_norm": 0.0852437054563733, + "kl": 0.0871124267578125, + "learning_rate": 4.988565840299014e-07, + "loss": 0.0001, + "reward": 1.6982143744826317, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7026786133646965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8038 + }, + { + "completion_length": 256.2857246398926, + "epoch": 1.3480447629825223, + "grad_norm": 0.27334831001457666, + "kl": 0.09234619140625, + "learning_rate": 4.988554377619054e-07, + "loss": 0.0001, + "reward": 1.7214286252856255, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 8040 + }, + { + "completion_length": 240.62500953674316, + "epoch": 1.34838006622239, + "grad_norm": 0.3848916028092399, + "kl": 0.1457366943359375, + "learning_rate": 4.988542909209521e-07, + "loss": 0.0001, + "reward": 1.8000000789761543, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 8042 + }, + { + "completion_length": 237.4375123977661, + "epoch": 1.3487153694622576, + "grad_norm": 0.24599279522305645, + "kl": 0.0876922607421875, + "learning_rate": 4.988531435070438e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 8044 + }, + { + "completion_length": 258.0759038925171, + "epoch": 1.349050672702125, + "grad_norm": 0.4030775726183148, + "kl": 0.0973968505859375, + "learning_rate": 4.988519955201834e-07, + "loss": 0.0001, + "reward": 1.710714377462864, + "reward_std": 0.1060660146176815, + "rewards/equation_reward_func": 0.7196428924798965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8046 + }, + { + "completion_length": 248.3214406967163, + "epoch": 1.3493859759419926, + "grad_norm": 0.30576816556802056, + "kl": 0.12847900390625, + "learning_rate": 4.988508469603735e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 8048 + }, + { + "completion_length": 241.42858123779297, + "epoch": 1.34972127918186, + "grad_norm": 0.26936985666206786, + "kl": 0.1039581298828125, + "learning_rate": 4.988496978276166e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571850061417, + "rewards/format_reward_func": 1.0, + "step": 8050 + }, + { + "completion_length": 255.56251525878906, + "epoch": 1.3500565824217277, + "grad_norm": 0.32989257568241814, + "kl": 0.133758544921875, + "learning_rate": 4.988485481219154e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857354134321, + "rewards/format_reward_func": 1.0, + "step": 8052 + }, + { + "completion_length": 242.9866189956665, + "epoch": 1.3503918856615953, + "grad_norm": 0.44298681376559773, + "kl": 0.0977783203125, + "learning_rate": 4.988473978432725e-07, + "loss": 0.0001, + "reward": 1.8000000789761543, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000081956387, + "rewards/format_reward_func": 1.0, + "step": 8054 + }, + { + "completion_length": 257.57590198516846, + "epoch": 1.3507271889014627, + "grad_norm": 0.380273329373798, + "kl": 0.085113525390625, + "learning_rate": 4.988462469916908e-07, + "loss": 0.0001, + "reward": 1.7982143312692642, + "reward_std": 0.0732360603287816, + "rewards/equation_reward_func": 0.8116071745753288, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8056 + }, + { + "completion_length": 251.74108505249023, + "epoch": 1.3510624921413303, + "grad_norm": 0.23128181541135234, + "kl": 0.1066436767578125, + "learning_rate": 4.988450955671727e-07, + "loss": 0.0001, + "reward": 1.7696429044008255, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8058 + }, + { + "completion_length": 239.02679824829102, + "epoch": 1.3513977953811978, + "grad_norm": 0.3129928101399441, + "kl": 0.1588897705078125, + "learning_rate": 4.988439435697209e-07, + "loss": 0.0002, + "reward": 1.7500000819563866, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7589286118745804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8060 + }, + { + "completion_length": 244.73215293884277, + "epoch": 1.3517330986210654, + "grad_norm": 0.31500260650227535, + "kl": 0.1183929443359375, + "learning_rate": 4.98842790999338e-07, + "loss": 0.0001, + "reward": 1.7964286133646965, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 8062 + }, + { + "completion_length": 266.3928699493408, + "epoch": 1.352068401860933, + "grad_norm": 0.19326811694448875, + "kl": 0.0916748046875, + "learning_rate": 4.98841637856027e-07, + "loss": 0.0001, + "reward": 1.7910714820027351, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8064 + }, + { + "completion_length": 251.99554824829102, + "epoch": 1.3524037051008007, + "grad_norm": 0.28242915098566646, + "kl": 0.0901641845703125, + "learning_rate": 4.9884048413979e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 8066 + }, + { + "completion_length": 254.74108409881592, + "epoch": 1.352739008340668, + "grad_norm": 0.21990591678680338, + "kl": 0.1272125244140625, + "learning_rate": 4.988393298506301e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.0732360603287816, + "rewards/equation_reward_func": 0.7401785962283611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8068 + }, + { + "completion_length": 261.4910831451416, + "epoch": 1.3530743115805357, + "grad_norm": 0.19448140488390295, + "kl": 0.101318359375, + "learning_rate": 4.988381749885498e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 8070 + }, + { + "completion_length": 260.87501335144043, + "epoch": 1.3534096148204031, + "grad_norm": 0.3282465087207066, + "kl": 0.15740966796875, + "learning_rate": 4.988370195535517e-07, + "loss": 0.0002, + "reward": 1.7375000640749931, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7508928887546062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8072 + }, + { + "completion_length": 268.11162185668945, + "epoch": 1.3537449180602708, + "grad_norm": 1.0868446090043193, + "kl": 0.2178192138671875, + "learning_rate": 4.988358635456385e-07, + "loss": 0.0002, + "reward": 1.721428632736206, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7303571570664644, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8074 + }, + { + "completion_length": 260.8928680419922, + "epoch": 1.3540802213001384, + "grad_norm": 0.16292994704342872, + "kl": 0.0915069580078125, + "learning_rate": 4.988347069648129e-07, + "loss": 0.0001, + "reward": 1.787500075995922, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8076 + }, + { + "completion_length": 269.1830472946167, + "epoch": 1.3544155245400058, + "grad_norm": 0.2766759911148509, + "kl": 0.10125732421875, + "learning_rate": 4.988335498110776e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7410714700818062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8078 + }, + { + "completion_length": 270.62947940826416, + "epoch": 1.3547508277798734, + "grad_norm": 0.24982035488239168, + "kl": 0.084320068359375, + "learning_rate": 4.988323920844352e-07, + "loss": 0.0001, + "reward": 1.7964286506175995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 8080 + }, + { + "completion_length": 262.83037090301514, + "epoch": 1.3550861310197408, + "grad_norm": 0.12586729994994952, + "kl": 0.0821075439453125, + "learning_rate": 4.988312337848883e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7169643081724644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8082 + }, + { + "completion_length": 277.20983695983887, + "epoch": 1.3554214342596085, + "grad_norm": 0.35417812775222424, + "kl": 0.243377685546875, + "learning_rate": 4.988300749124397e-07, + "loss": 0.0002, + "reward": 1.6589286550879478, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.6723214648663998, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8084 + }, + { + "completion_length": 268.1919755935669, + "epoch": 1.3557567374994761, + "grad_norm": 0.35492565059691283, + "kl": 0.2265167236328125, + "learning_rate": 4.988289154670919e-07, + "loss": 0.0002, + "reward": 1.6982143893837929, + "reward_std": 0.11364216171205044, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.977678582072258, + "step": 8086 + }, + { + "completion_length": 275.0803689956665, + "epoch": 1.3560920407393438, + "grad_norm": 0.12388342800816557, + "kl": 0.2826995849609375, + "learning_rate": 4.988277554488478e-07, + "loss": 0.0003, + "reward": 1.7517857998609543, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8088 + }, + { + "completion_length": 258.0134038925171, + "epoch": 1.3564273439792112, + "grad_norm": 0.3590281267788517, + "kl": 0.2235107421875, + "learning_rate": 4.988265948577099e-07, + "loss": 0.0002, + "reward": 1.7767857760190964, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7901785932481289, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8090 + }, + { + "completion_length": 263.65625953674316, + "epoch": 1.3567626472190788, + "grad_norm": 0.2747411302913466, + "kl": 0.1129913330078125, + "learning_rate": 4.98825433693681e-07, + "loss": 0.0001, + "reward": 1.7250000461935997, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7339285984635353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8092 + }, + { + "completion_length": 279.2901906967163, + "epoch": 1.3570979504589462, + "grad_norm": 0.33908673465572003, + "kl": 0.1176605224609375, + "learning_rate": 4.988242719567636e-07, + "loss": 0.0001, + "reward": 1.6571429520845413, + "reward_std": 0.10606601648032665, + "rewards/equation_reward_func": 0.6839285977184772, + "rewards/format_reward_func": 0.9732142984867096, + "step": 8094 + }, + { + "completion_length": 262.75894260406494, + "epoch": 1.3574332536988138, + "grad_norm": 0.245226238939219, + "kl": 0.1348724365234375, + "learning_rate": 4.988231096469606e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.7508928775787354, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8096 + }, + { + "completion_length": 273.8214416503906, + "epoch": 1.3577685569386815, + "grad_norm": 0.21272801489659302, + "kl": 0.1975250244140625, + "learning_rate": 4.988219467642743e-07, + "loss": 0.0002, + "reward": 1.7857143357396126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 8098 + }, + { + "completion_length": 266.92412090301514, + "epoch": 1.358103860178549, + "grad_norm": 0.1923993590207927, + "kl": 0.10992431640625, + "learning_rate": 4.988207833087078e-07, + "loss": 0.0001, + "reward": 1.7178572341799736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 8100 + }, + { + "completion_length": 273.35715198516846, + "epoch": 1.3584391634184165, + "grad_norm": 0.2306174121200835, + "kl": 0.5870361328125, + "learning_rate": 4.988196192802636e-07, + "loss": 0.0006, + "reward": 1.7642857730388641, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7732143066823483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8102 + }, + { + "completion_length": 253.38393878936768, + "epoch": 1.3587744666582842, + "grad_norm": 0.2780585896084185, + "kl": 0.104766845703125, + "learning_rate": 4.988184546789444e-07, + "loss": 0.0001, + "reward": 1.7665179446339607, + "reward_std": 0.037249374436214566, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 0.9986607171595097, + "step": 8104 + }, + { + "completion_length": 264.9151887893677, + "epoch": 1.3591097698981516, + "grad_norm": 0.21535603932789843, + "kl": 0.2724151611328125, + "learning_rate": 4.988172895047528e-07, + "loss": 0.0003, + "reward": 1.7214286103844643, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8106 + }, + { + "completion_length": 257.81697940826416, + "epoch": 1.3594450731380192, + "grad_norm": 0.1236021672041698, + "kl": 0.0832977294921875, + "learning_rate": 4.988161237576915e-07, + "loss": 0.0001, + "reward": 1.7625000476837158, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8108 + }, + { + "completion_length": 266.21429920196533, + "epoch": 1.3597803763778868, + "grad_norm": 0.28031837613588595, + "kl": 0.12738037109375, + "learning_rate": 4.988149574377633e-07, + "loss": 0.0001, + "reward": 1.6785715073347092, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.6875000447034836, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8110 + }, + { + "completion_length": 260.03572940826416, + "epoch": 1.3601156796177543, + "grad_norm": 0.24458932541296974, + "kl": 0.3977203369140625, + "learning_rate": 4.988137905449708e-07, + "loss": 0.0004, + "reward": 1.7678571939468384, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8112 + }, + { + "completion_length": 261.80804920196533, + "epoch": 1.360450982857622, + "grad_norm": 0.249895327320769, + "kl": 0.093292236328125, + "learning_rate": 4.988126230793167e-07, + "loss": 0.0001, + "reward": 1.7339286282658577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928969502449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8114 + }, + { + "completion_length": 255.57590293884277, + "epoch": 1.3607862860974893, + "grad_norm": 0.3183229100046573, + "kl": 0.235931396484375, + "learning_rate": 4.988114550408037e-07, + "loss": 0.0002, + "reward": 1.7607143372297287, + "reward_std": 0.05555839091539383, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8116 + }, + { + "completion_length": 257.883939743042, + "epoch": 1.361121589337357, + "grad_norm": 0.215454386421447, + "kl": 0.0789642333984375, + "learning_rate": 4.988102864294344e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8118 + }, + { + "completion_length": 252.44644165039062, + "epoch": 1.3614568925772246, + "grad_norm": 0.12456854677300534, + "kl": 0.2279205322265625, + "learning_rate": 4.988091172452117e-07, + "loss": 0.0002, + "reward": 1.730357214808464, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8120 + }, + { + "completion_length": 245.48215579986572, + "epoch": 1.3617921958170922, + "grad_norm": 0.32630944614624346, + "kl": 0.2685546875, + "learning_rate": 4.988079474881381e-07, + "loss": 0.0003, + "reward": 1.7839286476373672, + "reward_std": 0.06313453521579504, + "rewards/equation_reward_func": 0.7973214536905289, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8122 + }, + { + "completion_length": 252.8169755935669, + "epoch": 1.3621274990569596, + "grad_norm": 0.24195474875844628, + "kl": 0.3311767578125, + "learning_rate": 4.988067771582163e-07, + "loss": 0.0003, + "reward": 1.7696429267525673, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7830357439815998, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8124 + }, + { + "completion_length": 259.477689743042, + "epoch": 1.3624628022968273, + "grad_norm": 0.7045995087371681, + "kl": 1.045379638671875, + "learning_rate": 4.98805606255449e-07, + "loss": 0.001, + "reward": 1.7071429267525673, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8126 + }, + { + "completion_length": 275.4866199493408, + "epoch": 1.3627981055366947, + "grad_norm": 0.31304647795741647, + "kl": 1.1280517578125, + "learning_rate": 4.988044347798392e-07, + "loss": 0.0011, + "reward": 1.708928644657135, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7133928816765547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8128 + }, + { + "completion_length": 248.54912090301514, + "epoch": 1.3631334087765623, + "grad_norm": 0.23159962235353818, + "kl": 0.232452392578125, + "learning_rate": 4.988032627313892e-07, + "loss": 0.0002, + "reward": 1.769642911851406, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8130 + }, + { + "completion_length": 267.6607255935669, + "epoch": 1.36346871201643, + "grad_norm": 0.20785342484097794, + "kl": 0.69158935546875, + "learning_rate": 4.988020901101017e-07, + "loss": 0.0007, + "reward": 1.719642922282219, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7330357357859612, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8132 + }, + { + "completion_length": 255.8348331451416, + "epoch": 1.3638040152562974, + "grad_norm": 0.5485224272178171, + "kl": 0.561676025390625, + "learning_rate": 4.988009169159798e-07, + "loss": 0.0006, + "reward": 1.7339286357164383, + "reward_std": 0.08333758544176817, + "rewards/equation_reward_func": 0.7473214641213417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8134 + }, + { + "completion_length": 251.4196548461914, + "epoch": 1.364139318496165, + "grad_norm": 0.37869015687501495, + "kl": 0.3833465576171875, + "learning_rate": 4.987997431490257e-07, + "loss": 0.0004, + "reward": 1.7428571805357933, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7517857514321804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8136 + }, + { + "completion_length": 246.7589349746704, + "epoch": 1.3644746217360324, + "grad_norm": 0.29526207045490876, + "kl": 0.158782958984375, + "learning_rate": 4.987985688092426e-07, + "loss": 0.0002, + "reward": 1.7982143387198448, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.820535734295845, + "rewards/format_reward_func": 0.977678582072258, + "step": 8138 + }, + { + "completion_length": 261.4687614440918, + "epoch": 1.3648099249759, + "grad_norm": 0.33114232485365375, + "kl": 0.33135986328125, + "learning_rate": 4.987973938966326e-07, + "loss": 0.0003, + "reward": 1.798214353621006, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.802678607404232, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8140 + }, + { + "completion_length": 251.94197463989258, + "epoch": 1.3651452282157677, + "grad_norm": 0.4965083376527316, + "kl": 0.1101531982421875, + "learning_rate": 4.98796218411199e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 8142 + }, + { + "completion_length": 242.477689743042, + "epoch": 1.3654805314556353, + "grad_norm": 0.32604095360515667, + "kl": 0.1768951416015625, + "learning_rate": 4.987950423529442e-07, + "loss": 0.0002, + "reward": 1.8142857775092125, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 8144 + }, + { + "completion_length": 250.02679538726807, + "epoch": 1.3658158346955027, + "grad_norm": 0.28201790087899403, + "kl": 0.1674957275390625, + "learning_rate": 4.98793865721871e-07, + "loss": 0.0002, + "reward": 1.7303572222590446, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.7437500394880772, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8146 + }, + { + "completion_length": 237.24554538726807, + "epoch": 1.3661511379353704, + "grad_norm": 0.5227818679067141, + "kl": 0.1568145751953125, + "learning_rate": 4.987926885179821e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 8148 + }, + { + "completion_length": 239.01786708831787, + "epoch": 1.3664864411752378, + "grad_norm": 0.20125727672007632, + "kl": 0.110504150390625, + "learning_rate": 4.987915107412801e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8150 + }, + { + "completion_length": 231.80358409881592, + "epoch": 1.3668217444151054, + "grad_norm": 0.26540877315998423, + "kl": 0.13861083984375, + "learning_rate": 4.987903323917678e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.8035714477300644, + "rewards/format_reward_func": 1.0, + "step": 8152 + }, + { + "completion_length": 234.2232265472412, + "epoch": 1.367157047654973, + "grad_norm": 2.742825030886559, + "kl": 0.16217041015625, + "learning_rate": 4.987891534694479e-07, + "loss": 0.0002, + "reward": 1.7303571999073029, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214704543352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8154 + }, + { + "completion_length": 237.60268783569336, + "epoch": 1.3674923508948404, + "grad_norm": 0.8197647544954363, + "kl": 0.338653564453125, + "learning_rate": 4.987879739743232e-07, + "loss": 0.0003, + "reward": 1.7214286401867867, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 8156 + }, + { + "completion_length": 238.80358123779297, + "epoch": 1.367827654134708, + "grad_norm": 0.17306658774659567, + "kl": 0.164398193359375, + "learning_rate": 4.987867939063963e-07, + "loss": 0.0002, + "reward": 1.821428619325161, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 8158 + }, + { + "completion_length": 240.7366189956665, + "epoch": 1.3681629573745755, + "grad_norm": 0.2755680868704616, + "kl": 0.3265380859375, + "learning_rate": 4.987856132656701e-07, + "loss": 0.0003, + "reward": 1.7767857909202576, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8160 + }, + { + "completion_length": 242.43750953674316, + "epoch": 1.3684982606144431, + "grad_norm": 0.34220703415178927, + "kl": 0.2494964599609375, + "learning_rate": 4.987844320521469e-07, + "loss": 0.0002, + "reward": 1.7392857745289803, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 8162 + }, + { + "completion_length": 244.96875858306885, + "epoch": 1.3688335638543108, + "grad_norm": 0.3656314574025403, + "kl": 0.13720703125, + "learning_rate": 4.987832502658299e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.8267857357859612, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8164 + }, + { + "completion_length": 248.82143878936768, + "epoch": 1.3691688670941784, + "grad_norm": 0.5701026727666527, + "kl": 0.2044219970703125, + "learning_rate": 4.987820679067215e-07, + "loss": 0.0002, + "reward": 1.7839286401867867, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8166 + }, + { + "completion_length": 248.11608219146729, + "epoch": 1.3695041703340458, + "grad_norm": 0.1886594138713511, + "kl": 0.4144134521484375, + "learning_rate": 4.987808849748246e-07, + "loss": 0.0004, + "reward": 1.7839286476373672, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7883928678929806, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8168 + }, + { + "completion_length": 244.99108219146729, + "epoch": 1.3698394735739134, + "grad_norm": 0.21902865172820737, + "kl": 0.2199249267578125, + "learning_rate": 4.987797014701418e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7625000327825546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8170 + }, + { + "completion_length": 245.6517972946167, + "epoch": 1.3701747768137809, + "grad_norm": 0.2876812085996151, + "kl": 0.0823822021484375, + "learning_rate": 4.98778517392676e-07, + "loss": 0.0001, + "reward": 1.7625000700354576, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8172 + }, + { + "completion_length": 251.2901906967163, + "epoch": 1.3705100800536485, + "grad_norm": 0.23891560103695314, + "kl": 0.3800506591796875, + "learning_rate": 4.987773327424297e-07, + "loss": 0.0004, + "reward": 1.7535714879631996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 8174 + }, + { + "completion_length": 239.9553680419922, + "epoch": 1.3708453832935161, + "grad_norm": 0.16681507824174535, + "kl": 0.07891845703125, + "learning_rate": 4.987761475194058e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.755357176065445, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8176 + }, + { + "completion_length": 246.50001049041748, + "epoch": 1.3711806865333838, + "grad_norm": 0.1460767895717453, + "kl": 0.605316162109375, + "learning_rate": 4.98774961723607e-07, + "loss": 0.0006, + "reward": 1.728571504354477, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7375000230967999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8178 + }, + { + "completion_length": 240.90626049041748, + "epoch": 1.3715159897732512, + "grad_norm": 0.27316633410956814, + "kl": 0.10211181640625, + "learning_rate": 4.987737753550359e-07, + "loss": 0.0001, + "reward": 1.7125000432133675, + "reward_std": 0.061871841782703996, + "rewards/equation_reward_func": 0.7258928939700127, + "rewards/format_reward_func": 0.9866071529686451, + "step": 8180 + }, + { + "completion_length": 247.2544765472412, + "epoch": 1.3718512930131188, + "grad_norm": 0.27321212078962503, + "kl": 0.32537841796875, + "learning_rate": 4.987725884136954e-07, + "loss": 0.0003, + "reward": 1.787500075995922, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7919643223285675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8182 + }, + { + "completion_length": 235.66072273254395, + "epoch": 1.3721865962529862, + "grad_norm": 0.26702454806950976, + "kl": 0.393646240234375, + "learning_rate": 4.987714008995882e-07, + "loss": 0.0004, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 8184 + }, + { + "completion_length": 236.3928689956665, + "epoch": 1.3725218994928539, + "grad_norm": 0.2859289731323632, + "kl": 0.1675567626953125, + "learning_rate": 4.987702128127169e-07, + "loss": 0.0002, + "reward": 1.7428572326898575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 8186 + }, + { + "completion_length": 235.7053680419922, + "epoch": 1.3728572027327215, + "grad_norm": 0.1957172849424329, + "kl": 0.1398468017578125, + "learning_rate": 4.987690241530844e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 8188 + }, + { + "completion_length": 238.45983028411865, + "epoch": 1.373192505972589, + "grad_norm": 0.15376641020518758, + "kl": 0.145050048828125, + "learning_rate": 4.987678349206933e-07, + "loss": 0.0001, + "reward": 1.751785770058632, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8190 + }, + { + "completion_length": 248.5044765472412, + "epoch": 1.3735278092124565, + "grad_norm": 0.27968755909602633, + "kl": 0.077972412109375, + "learning_rate": 4.987666451155465e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7946428842842579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8192 + }, + { + "completion_length": 242.2500123977661, + "epoch": 1.373863112452324, + "grad_norm": 0.2436045067533531, + "kl": 0.108734130859375, + "learning_rate": 4.987654547376466e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 8194 + }, + { + "completion_length": 240.0937614440918, + "epoch": 1.3741984156921916, + "grad_norm": 0.3174198813775443, + "kl": 0.0972442626953125, + "learning_rate": 4.987642637869965e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8196 + }, + { + "completion_length": 241.19643878936768, + "epoch": 1.3745337189320592, + "grad_norm": 0.32104686249339653, + "kl": 0.245880126953125, + "learning_rate": 4.987630722635986e-07, + "loss": 0.0002, + "reward": 1.7839286550879478, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8198 + }, + { + "completion_length": 238.9062614440918, + "epoch": 1.3748690221719269, + "grad_norm": 0.22119044107872407, + "kl": 0.101409912109375, + "learning_rate": 4.98761880167456e-07, + "loss": 0.0001, + "reward": 1.721428632736206, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214286010712385, + "rewards/format_reward_func": 1.0, + "step": 8200 + }, + { + "completion_length": 242.1294765472412, + "epoch": 1.3752043254117943, + "grad_norm": 0.19621369458051252, + "kl": 0.0926971435546875, + "learning_rate": 4.987606874985713e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 8202 + }, + { + "completion_length": 252.14287185668945, + "epoch": 1.375539628651662, + "grad_norm": 0.20777406668459847, + "kl": 0.213043212890625, + "learning_rate": 4.987594942569473e-07, + "loss": 0.0002, + "reward": 1.800000049173832, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 8204 + }, + { + "completion_length": 246.38840198516846, + "epoch": 1.3758749318915293, + "grad_norm": 0.25030524166954016, + "kl": 0.1303863525390625, + "learning_rate": 4.987583004425867e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 8206 + }, + { + "completion_length": 256.933048248291, + "epoch": 1.376210235131397, + "grad_norm": 0.21811990945534226, + "kl": 0.104339599609375, + "learning_rate": 4.987571060554922e-07, + "loss": 0.0001, + "reward": 1.7553572282195091, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8208 + }, + { + "completion_length": 247.2410831451416, + "epoch": 1.3765455383712646, + "grad_norm": 0.30362258829253413, + "kl": 0.1866455078125, + "learning_rate": 4.987559110956667e-07, + "loss": 0.0002, + "reward": 1.7946429029107094, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.808035746216774, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8210 + }, + { + "completion_length": 244.8705472946167, + "epoch": 1.376880841611132, + "grad_norm": 0.14472449444708704, + "kl": 0.1186981201171875, + "learning_rate": 4.987547155631128e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8212 + }, + { + "completion_length": 250.7812623977661, + "epoch": 1.3772161448509996, + "grad_norm": 0.21972446651081873, + "kl": 0.090972900390625, + "learning_rate": 4.987535194578333e-07, + "loss": 0.0001, + "reward": 1.7732143923640251, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8214 + }, + { + "completion_length": 243.51786708831787, + "epoch": 1.377551448090867, + "grad_norm": 0.23292783449066065, + "kl": 0.0885467529296875, + "learning_rate": 4.98752322779831e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 8216 + }, + { + "completion_length": 252.18751049041748, + "epoch": 1.3778867513307347, + "grad_norm": 0.20195243651174724, + "kl": 0.0833892822265625, + "learning_rate": 4.987511255291087e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7946428954601288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8218 + }, + { + "completion_length": 240.0446538925171, + "epoch": 1.3782220545706023, + "grad_norm": 0.17906265132759394, + "kl": 0.0956573486328125, + "learning_rate": 4.987499277056689e-07, + "loss": 0.0001, + "reward": 1.798214316368103, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8220 + }, + { + "completion_length": 254.49108505249023, + "epoch": 1.37855735781047, + "grad_norm": 0.3025207003237902, + "kl": 0.0892486572265625, + "learning_rate": 4.987487293095148e-07, + "loss": 0.0001, + "reward": 1.7732143700122833, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7776785995811224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8222 + }, + { + "completion_length": 254.6741180419922, + "epoch": 1.3788926610503374, + "grad_norm": 0.17416299428135643, + "kl": 0.0872802734375, + "learning_rate": 4.987475303406486e-07, + "loss": 0.0001, + "reward": 1.7053572237491608, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7098214644938707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8224 + }, + { + "completion_length": 243.1026906967163, + "epoch": 1.379227964290205, + "grad_norm": 0.007376292518863739, + "kl": 0.0882415771484375, + "learning_rate": 4.987463307990735e-07, + "loss": 0.0001, + "reward": 1.7875000908970833, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8226 + }, + { + "completion_length": 243.65179634094238, + "epoch": 1.3795632675300724, + "grad_norm": 0.3491150701987759, + "kl": 0.29400634765625, + "learning_rate": 4.987451306847922e-07, + "loss": 0.0003, + "reward": 1.666071504354477, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.6705357562750578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8228 + }, + { + "completion_length": 259.85268783569336, + "epoch": 1.37989857076994, + "grad_norm": 0.2254974501549908, + "kl": 0.18572998046875, + "learning_rate": 4.987439299978072e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 8230 + }, + { + "completion_length": 248.71876335144043, + "epoch": 1.3802338740098077, + "grad_norm": 0.005612214152327968, + "kl": 0.0982513427734375, + "learning_rate": 4.987427287381215e-07, + "loss": 0.0001, + "reward": 1.7446429058909416, + "reward_std": 0.007576144300401211, + "rewards/equation_reward_func": 0.74910718947649, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8232 + }, + { + "completion_length": 242.35268783569336, + "epoch": 1.380569177249675, + "grad_norm": 0.15117705551384866, + "kl": 0.1826324462890625, + "learning_rate": 4.987415269057379e-07, + "loss": 0.0002, + "reward": 1.8178571984171867, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.8267857395112514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8234 + }, + { + "completion_length": 248.83036994934082, + "epoch": 1.3809044804895427, + "grad_norm": 0.16819016833733927, + "kl": 0.1221923828125, + "learning_rate": 4.987403245006591e-07, + "loss": 0.0001, + "reward": 1.7553571984171867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8236 + }, + { + "completion_length": 257.6607303619385, + "epoch": 1.3812397837294104, + "grad_norm": 0.20559730867779458, + "kl": 0.1548309326171875, + "learning_rate": 4.987391215228878e-07, + "loss": 0.0002, + "reward": 1.7482143566012383, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8238 + }, + { + "completion_length": 244.97769260406494, + "epoch": 1.3815750869692778, + "grad_norm": 0.14968388550478384, + "kl": 0.2997283935546875, + "learning_rate": 4.987379179724267e-07, + "loss": 0.0003, + "reward": 1.778571493923664, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 8240 + }, + { + "completion_length": 253.08037090301514, + "epoch": 1.3819103902091454, + "grad_norm": 0.16791201111642334, + "kl": 0.1606903076171875, + "learning_rate": 4.987367138492787e-07, + "loss": 0.0002, + "reward": 1.7660715207457542, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8242 + }, + { + "completion_length": 250.44644260406494, + "epoch": 1.382245693449013, + "grad_norm": 0.2818567106401476, + "kl": 0.3727569580078125, + "learning_rate": 4.987355091534467e-07, + "loss": 0.0004, + "reward": 1.7714286521077156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285776019096, + "rewards/format_reward_func": 1.0, + "step": 8244 + }, + { + "completion_length": 241.80358219146729, + "epoch": 1.3825809966888805, + "grad_norm": 0.21332612942021542, + "kl": 0.193572998046875, + "learning_rate": 4.987343038849333e-07, + "loss": 0.0002, + "reward": 1.7571429461240768, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 8246 + }, + { + "completion_length": 244.90179443359375, + "epoch": 1.382916299928748, + "grad_norm": 0.29629913715379597, + "kl": 0.24725341796875, + "learning_rate": 4.987330980437413e-07, + "loss": 0.0002, + "reward": 1.7750000432133675, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 8248 + }, + { + "completion_length": 258.2053689956665, + "epoch": 1.3832516031686155, + "grad_norm": 0.3395190955566177, + "kl": 0.3449859619140625, + "learning_rate": 4.987318916298734e-07, + "loss": 0.0003, + "reward": 1.7517857775092125, + "reward_std": 0.09848987217992544, + "rewards/equation_reward_func": 0.765178594738245, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8250 + }, + { + "completion_length": 249.21429634094238, + "epoch": 1.3835869064084831, + "grad_norm": 0.44822066564718804, + "kl": 0.234832763671875, + "learning_rate": 4.987306846433325e-07, + "loss": 0.0002, + "reward": 1.7446429282426834, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8252 + }, + { + "completion_length": 242.1071548461914, + "epoch": 1.3839222096483508, + "grad_norm": 0.3920342374420616, + "kl": 0.464263916015625, + "learning_rate": 4.987294770841214e-07, + "loss": 0.0005, + "reward": 1.733928643167019, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7473214566707611, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8254 + }, + { + "completion_length": 240.27233028411865, + "epoch": 1.3842575128882184, + "grad_norm": 0.1406635263351389, + "kl": 0.174072265625, + "learning_rate": 4.987282689522428e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464285790920258, + "rewards/format_reward_func": 1.0, + "step": 8256 + }, + { + "completion_length": 239.23215198516846, + "epoch": 1.3845928161280858, + "grad_norm": 0.179425887440818, + "kl": 0.1930084228515625, + "learning_rate": 4.987270602476995e-07, + "loss": 0.0002, + "reward": 1.7750000804662704, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 8258 + }, + { + "completion_length": 242.12947273254395, + "epoch": 1.3849281193679535, + "grad_norm": 0.3154012365668129, + "kl": 0.10003662109375, + "learning_rate": 4.987258509704942e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 1.0, + "step": 8260 + }, + { + "completion_length": 234.19643783569336, + "epoch": 1.3852634226078209, + "grad_norm": 0.16562049076361046, + "kl": 0.114166259765625, + "learning_rate": 4.9872464112063e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 8262 + }, + { + "completion_length": 234.68750953674316, + "epoch": 1.3855987258476885, + "grad_norm": 0.7369731607621153, + "kl": 0.1848907470703125, + "learning_rate": 4.987234306981093e-07, + "loss": 0.0002, + "reward": 1.7428572326898575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 8264 + }, + { + "completion_length": 233.227689743042, + "epoch": 1.3859340290875561, + "grad_norm": 0.02821810625385667, + "kl": 0.226776123046875, + "learning_rate": 4.98722219702935e-07, + "loss": 0.0002, + "reward": 1.7732143253087997, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7776786126196384, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8266 + }, + { + "completion_length": 228.66072463989258, + "epoch": 1.3862693323274236, + "grad_norm": 0.19036897284281554, + "kl": 0.1602020263671875, + "learning_rate": 4.9872100813511e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 8268 + }, + { + "completion_length": 240.06251049041748, + "epoch": 1.3866046355672912, + "grad_norm": 0.28445586166487996, + "kl": 0.0912628173828125, + "learning_rate": 4.98719795994637e-07, + "loss": 0.0001, + "reward": 1.7321429550647736, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.732142873108387, + "rewards/format_reward_func": 1.0, + "step": 8270 + }, + { + "completion_length": 228.25001049041748, + "epoch": 1.3869399388071586, + "grad_norm": 0.36868253449781974, + "kl": 0.4998779296875, + "learning_rate": 4.987185832815188e-07, + "loss": 0.0005, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 8272 + }, + { + "completion_length": 231.93304634094238, + "epoch": 1.3872752420470262, + "grad_norm": 0.2135789845077452, + "kl": 0.1878509521484375, + "learning_rate": 4.987173699957582e-07, + "loss": 0.0002, + "reward": 1.7803571820259094, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8274 + }, + { + "completion_length": 233.14733219146729, + "epoch": 1.3876105452868939, + "grad_norm": 0.09741457644301535, + "kl": 0.0953216552734375, + "learning_rate": 4.98716156137358e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 8276 + }, + { + "completion_length": 231.3259038925171, + "epoch": 1.3879458485267615, + "grad_norm": 0.3273387632249645, + "kl": 0.5626983642578125, + "learning_rate": 4.98714941706321e-07, + "loss": 0.0006, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 8278 + }, + { + "completion_length": 228.06697463989258, + "epoch": 1.388281151766629, + "grad_norm": 0.4202954088324719, + "kl": 0.4383392333984375, + "learning_rate": 4.9871372670265e-07, + "loss": 0.0004, + "reward": 1.7750000432133675, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.775000037625432, + "rewards/format_reward_func": 1.0, + "step": 8280 + }, + { + "completion_length": 228.60715293884277, + "epoch": 1.3886164550064966, + "grad_norm": 0.28889603554498616, + "kl": 0.09014892578125, + "learning_rate": 4.987125111263477e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143241912127, + "rewards/format_reward_func": 1.0, + "step": 8282 + }, + { + "completion_length": 225.5535831451416, + "epoch": 1.388951758246364, + "grad_norm": 0.33761219684242044, + "kl": 0.2540435791015625, + "learning_rate": 4.987112949774171e-07, + "loss": 0.0003, + "reward": 1.7214286550879478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214286141097546, + "rewards/format_reward_func": 1.0, + "step": 8284 + }, + { + "completion_length": 226.7634048461914, + "epoch": 1.3892870614862316, + "grad_norm": 0.19009764938273654, + "kl": 0.253997802734375, + "learning_rate": 4.987100782558608e-07, + "loss": 0.0003, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 8286 + }, + { + "completion_length": 235.06251049041748, + "epoch": 1.3896223647260992, + "grad_norm": 0.25891486225934435, + "kl": 0.2195892333984375, + "learning_rate": 4.987088609616818e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 8288 + }, + { + "completion_length": 226.99554538726807, + "epoch": 1.3899576679659666, + "grad_norm": 0.1975587721492033, + "kl": 0.310394287109375, + "learning_rate": 4.987076430948827e-07, + "loss": 0.0003, + "reward": 1.717857226729393, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 8290 + }, + { + "completion_length": 238.50447463989258, + "epoch": 1.3902929712058343, + "grad_norm": 0.2642575662903022, + "kl": 0.089019775390625, + "learning_rate": 4.987064246554664e-07, + "loss": 0.0001, + "reward": 1.7696429416537285, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071842610836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8292 + }, + { + "completion_length": 230.88840293884277, + "epoch": 1.3906282744457017, + "grad_norm": 0.1691942612744324, + "kl": 0.2000579833984375, + "learning_rate": 4.987052056434357e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 8294 + }, + { + "completion_length": 237.10268783569336, + "epoch": 1.3909635776855693, + "grad_norm": 0.330917499427202, + "kl": 0.1829986572265625, + "learning_rate": 4.987039860587933e-07, + "loss": 0.0002, + "reward": 1.7375000715255737, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643048197031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8296 + }, + { + "completion_length": 244.22322463989258, + "epoch": 1.391298880925437, + "grad_norm": 0.17135511845815518, + "kl": 0.373077392578125, + "learning_rate": 4.987027659015423e-07, + "loss": 0.0004, + "reward": 1.735714353621006, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8298 + }, + { + "completion_length": 252.75001335144043, + "epoch": 1.3916341841653046, + "grad_norm": 0.33960622184873085, + "kl": 0.1415557861328125, + "learning_rate": 4.987015451716853e-07, + "loss": 0.0001, + "reward": 1.6946429386734962, + "reward_std": 0.09848987031728029, + "rewards/equation_reward_func": 0.6991071663796902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8300 + }, + { + "completion_length": 245.071439743042, + "epoch": 1.391969487405172, + "grad_norm": 0.2723528900900101, + "kl": 0.122833251953125, + "learning_rate": 4.987003238692251e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 8302 + }, + { + "completion_length": 241.62054824829102, + "epoch": 1.3923047906450396, + "grad_norm": 0.2372067895679325, + "kl": 0.095794677734375, + "learning_rate": 4.986991019941644e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8304 + }, + { + "completion_length": 236.24108409881592, + "epoch": 1.392640093884907, + "grad_norm": 0.27936471533894636, + "kl": 0.105682373046875, + "learning_rate": 4.986978795465063e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 8306 + }, + { + "completion_length": 236.5089406967163, + "epoch": 1.3929753971247747, + "grad_norm": 0.23005213349580997, + "kl": 0.1139984130859375, + "learning_rate": 4.986966565262534e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 8308 + }, + { + "completion_length": 232.9285831451416, + "epoch": 1.3933107003646423, + "grad_norm": 0.3397062470054958, + "kl": 0.1307373046875, + "learning_rate": 4.986954329334087e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857268452644, + "rewards/format_reward_func": 1.0, + "step": 8310 + }, + { + "completion_length": 248.77679634094238, + "epoch": 1.39364600360451, + "grad_norm": 0.1851405173069177, + "kl": 0.1040496826171875, + "learning_rate": 4.986942087679749e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714286148548126, + "rewards/format_reward_func": 1.0, + "step": 8312 + }, + { + "completion_length": 254.46429824829102, + "epoch": 1.3939813068443774, + "grad_norm": 0.28245811292805495, + "kl": 0.1212158203125, + "learning_rate": 4.986929840299547e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8314 + }, + { + "completion_length": 233.6875114440918, + "epoch": 1.394316610084245, + "grad_norm": 0.22962909738151074, + "kl": 0.161712646484375, + "learning_rate": 4.986917587193511e-07, + "loss": 0.0002, + "reward": 1.725000061094761, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 8316 + }, + { + "completion_length": 232.30358409881592, + "epoch": 1.3946519133241124, + "grad_norm": 0.3743388791517971, + "kl": 0.281494140625, + "learning_rate": 4.986905328361668e-07, + "loss": 0.0003, + "reward": 1.7785715013742447, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 8318 + }, + { + "completion_length": 236.19197463989258, + "epoch": 1.39498721656398, + "grad_norm": 0.18783598559833184, + "kl": 0.1347198486328125, + "learning_rate": 4.986893063804048e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 1.0, + "step": 8320 + }, + { + "completion_length": 230.33483219146729, + "epoch": 1.3953225198038477, + "grad_norm": 0.21152856279262738, + "kl": 0.2019805908203125, + "learning_rate": 4.986880793520677e-07, + "loss": 0.0002, + "reward": 1.7571429461240768, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 8322 + }, + { + "completion_length": 230.64733219146729, + "epoch": 1.395657823043715, + "grad_norm": 0.1390615445266548, + "kl": 0.3101959228515625, + "learning_rate": 4.986868517511585e-07, + "loss": 0.0003, + "reward": 1.8000000566244125, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 8324 + }, + { + "completion_length": 248.91518878936768, + "epoch": 1.3959931262835827, + "grad_norm": 0.24380398211878274, + "kl": 0.0861358642578125, + "learning_rate": 4.9868562357768e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 8326 + }, + { + "completion_length": 237.41965293884277, + "epoch": 1.3963284295234502, + "grad_norm": 0.2780829880264992, + "kl": 0.08013916015625, + "learning_rate": 4.98684394831635e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 8328 + }, + { + "completion_length": 233.1473331451416, + "epoch": 1.3966637327633178, + "grad_norm": 0.25010869340527275, + "kl": 0.0888214111328125, + "learning_rate": 4.986831655130262e-07, + "loss": 0.0001, + "reward": 1.766071505844593, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8330 + }, + { + "completion_length": 228.9866189956665, + "epoch": 1.3969990360031854, + "grad_norm": 0.2859377036608154, + "kl": 0.0903472900390625, + "learning_rate": 4.986819356218565e-07, + "loss": 0.0001, + "reward": 1.8000000715255737, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 8332 + }, + { + "completion_length": 236.1384048461914, + "epoch": 1.397334339243053, + "grad_norm": 0.2847175423965275, + "kl": 0.1934661865234375, + "learning_rate": 4.98680705158129e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 8334 + }, + { + "completion_length": 239.97769260406494, + "epoch": 1.3976696424829205, + "grad_norm": 0.24022825278721455, + "kl": 0.14080810546875, + "learning_rate": 4.986794741218462e-07, + "loss": 0.0001, + "reward": 1.7196429520845413, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8336 + }, + { + "completion_length": 244.83037090301514, + "epoch": 1.398004945722788, + "grad_norm": 0.20960399495324336, + "kl": 0.2378387451171875, + "learning_rate": 4.986782425130111e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 8338 + }, + { + "completion_length": 238.55804443359375, + "epoch": 1.3983402489626555, + "grad_norm": 0.1458231825218131, + "kl": 0.10693359375, + "learning_rate": 4.986770103316263e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 8340 + }, + { + "completion_length": 245.87947750091553, + "epoch": 1.3986755522025232, + "grad_norm": 0.4447577806999814, + "kl": 0.1026153564453125, + "learning_rate": 4.98675777577695e-07, + "loss": 0.0001, + "reward": 1.6696429327130318, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.6830357573926449, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8342 + }, + { + "completion_length": 246.63840198516846, + "epoch": 1.3990108554423908, + "grad_norm": 0.5346485602738086, + "kl": 0.145660400390625, + "learning_rate": 4.986745442512198e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8344 + }, + { + "completion_length": 245.3125123977661, + "epoch": 1.3993461586822582, + "grad_norm": 0.2319599019557805, + "kl": 0.086669921875, + "learning_rate": 4.986733103522037e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857450991869, + "rewards/format_reward_func": 1.0, + "step": 8346 + }, + { + "completion_length": 258.2634038925171, + "epoch": 1.3996814619221258, + "grad_norm": 0.42665677837153787, + "kl": 0.156463623046875, + "learning_rate": 4.986720758806493e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8348 + }, + { + "completion_length": 239.5178680419922, + "epoch": 1.4000167651619932, + "grad_norm": 0.23925510003351777, + "kl": 0.1253509521484375, + "learning_rate": 4.986708408365596e-07, + "loss": 0.0001, + "reward": 1.7928572297096252, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 8350 + }, + { + "completion_length": 256.7232255935669, + "epoch": 1.4003520684018609, + "grad_norm": 0.29109935293740163, + "kl": 0.097412109375, + "learning_rate": 4.986696052199373e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.0858629634603858, + "rewards/equation_reward_func": 0.7553571909666061, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8352 + }, + { + "completion_length": 257.20537090301514, + "epoch": 1.4006873716417285, + "grad_norm": 0.17636591360078244, + "kl": 0.255035400390625, + "learning_rate": 4.986683690307856e-07, + "loss": 0.0003, + "reward": 1.6839286461472511, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6883928999304771, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8354 + }, + { + "completion_length": 260.008939743042, + "epoch": 1.4010226748815962, + "grad_norm": 0.20055391467891595, + "kl": 0.430389404296875, + "learning_rate": 4.986671322691071e-07, + "loss": 0.0004, + "reward": 1.6892857998609543, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.6982143372297287, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8356 + }, + { + "completion_length": 259.1562604904175, + "epoch": 1.4013579781214636, + "grad_norm": 0.5636506182839675, + "kl": 0.3524169921875, + "learning_rate": 4.986658949349046e-07, + "loss": 0.0004, + "reward": 1.7232143580913544, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.7366071753203869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8358 + }, + { + "completion_length": 255.43304634094238, + "epoch": 1.4016932813613312, + "grad_norm": 0.5249611321278694, + "kl": 0.516448974609375, + "learning_rate": 4.98664657028181e-07, + "loss": 0.0005, + "reward": 1.7500000596046448, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7589286118745804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8360 + }, + { + "completion_length": 260.5312623977661, + "epoch": 1.4020285846011986, + "grad_norm": 0.36025818419775324, + "kl": 0.0961761474609375, + "learning_rate": 4.986634185489391e-07, + "loss": 0.0001, + "reward": 1.7218750789761543, + "reward_std": 0.09028238290920854, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9941964335739613, + "step": 8362 + }, + { + "completion_length": 256.7142963409424, + "epoch": 1.4023638878410662, + "grad_norm": 0.19437306007039692, + "kl": 0.405792236328125, + "learning_rate": 4.986621794971819e-07, + "loss": 0.0004, + "reward": 1.7428572326898575, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8364 + }, + { + "completion_length": 257.4955463409424, + "epoch": 1.4026991910809339, + "grad_norm": 0.5732167804087518, + "kl": 0.6250762939453125, + "learning_rate": 4.986609398729121e-07, + "loss": 0.0006, + "reward": 1.7714286670088768, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 8366 + }, + { + "completion_length": 262.8080472946167, + "epoch": 1.4030344943208013, + "grad_norm": 0.33261515236998473, + "kl": 0.738128662109375, + "learning_rate": 4.986596996761327e-07, + "loss": 0.0007, + "reward": 1.7375000715255737, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8368 + }, + { + "completion_length": 258.48662281036377, + "epoch": 1.403369797560669, + "grad_norm": 0.4386239574155109, + "kl": 1.22271728515625, + "learning_rate": 4.986584589068465e-07, + "loss": 0.0012, + "reward": 1.7758929282426834, + "reward_std": 0.04419417306780815, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 0.9973214343190193, + "step": 8370 + }, + { + "completion_length": 256.2455472946167, + "epoch": 1.4037051008005366, + "grad_norm": 0.27849866199909995, + "kl": 1.667144775390625, + "learning_rate": 4.986572175650562e-07, + "loss": 0.0017, + "reward": 1.7357143610715866, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7446428872644901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8372 + }, + { + "completion_length": 243.52233028411865, + "epoch": 1.404040404040404, + "grad_norm": 0.25705270920538403, + "kl": 0.59259033203125, + "learning_rate": 4.986559756507649e-07, + "loss": 0.0006, + "reward": 1.7625000774860382, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7669643107801676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8374 + }, + { + "completion_length": 251.6294755935669, + "epoch": 1.4043757072802716, + "grad_norm": 0.15817553115601674, + "kl": 1.261627197265625, + "learning_rate": 4.986547331639753e-07, + "loss": 0.0013, + "reward": 1.7321429550647736, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7410714533179998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8376 + }, + { + "completion_length": 253.94644165039062, + "epoch": 1.4047110105201392, + "grad_norm": 0.14483387736198614, + "kl": 1.48773193359375, + "learning_rate": 4.986534901046903e-07, + "loss": 0.0015, + "reward": 1.814285770058632, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 8378 + }, + { + "completion_length": 250.39733505249023, + "epoch": 1.4050463137600067, + "grad_norm": 0.20242507143435626, + "kl": 0.5549163818359375, + "learning_rate": 4.986522464729129e-07, + "loss": 0.0006, + "reward": 1.7616072297096252, + "reward_std": 0.07071067788638175, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9883928671479225, + "step": 8380 + }, + { + "completion_length": 261.4821548461914, + "epoch": 1.4053816169998743, + "grad_norm": 0.3677156368563927, + "kl": 0.290008544921875, + "learning_rate": 4.986510022686456e-07, + "loss": 0.0003, + "reward": 1.8062500581145287, + "reward_std": 0.07197336968965828, + "rewards/equation_reward_func": 0.8223214484751225, + "rewards/format_reward_func": 0.9839285798370838, + "step": 8382 + }, + { + "completion_length": 269.2812623977661, + "epoch": 1.4057169202397417, + "grad_norm": 0.5339161928148323, + "kl": 0.60052490234375, + "learning_rate": 4.986497574918917e-07, + "loss": 0.0006, + "reward": 1.7718750163912773, + "reward_std": 0.10669736191630363, + "rewards/equation_reward_func": 0.8053571619093418, + "rewards/format_reward_func": 0.9665178768336773, + "step": 8384 + }, + { + "completion_length": 261.0803689956665, + "epoch": 1.4060522234796093, + "grad_norm": 0.21607394771384367, + "kl": 0.5733642578125, + "learning_rate": 4.986485121426538e-07, + "loss": 0.0006, + "reward": 1.728571467101574, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.746428582817316, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8386 + }, + { + "completion_length": 281.7589416503906, + "epoch": 1.406387526719477, + "grad_norm": 0.23092201842398968, + "kl": 0.65911865234375, + "learning_rate": 4.986472662209348e-07, + "loss": 0.0007, + "reward": 1.6799107789993286, + "reward_std": 0.11932427063584328, + "rewards/equation_reward_func": 0.715178593993187, + "rewards/format_reward_func": 0.9647321589291096, + "step": 8388 + }, + { + "completion_length": 263.16519355773926, + "epoch": 1.4067228299593446, + "grad_norm": 0.30303730808498497, + "kl": 0.55340576171875, + "learning_rate": 4.986460197267376e-07, + "loss": 0.0006, + "reward": 1.7633929401636124, + "reward_std": 0.07197336852550507, + "rewards/equation_reward_func": 0.779464315623045, + "rewards/format_reward_func": 0.9839285835623741, + "step": 8390 + }, + { + "completion_length": 287.9866180419922, + "epoch": 1.407058133199212, + "grad_norm": 0.3113564380005515, + "kl": 0.73895263671875, + "learning_rate": 4.986447726600651e-07, + "loss": 0.0007, + "reward": 1.6491071954369545, + "reward_std": 0.128794448915869, + "rewards/equation_reward_func": 0.6919643133878708, + "rewards/format_reward_func": 0.9571428783237934, + "step": 8392 + }, + { + "completion_length": 260.9285821914673, + "epoch": 1.4073934364390797, + "grad_norm": 0.5877392569400088, + "kl": 0.58837890625, + "learning_rate": 4.986435250209201e-07, + "loss": 0.0006, + "reward": 1.7267857939004898, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8394 + }, + { + "completion_length": 270.6071538925171, + "epoch": 1.407728739678947, + "grad_norm": 0.37487510306904775, + "kl": 1.393402099609375, + "learning_rate": 4.986422768093056e-07, + "loss": 0.0014, + "reward": 1.771875038743019, + "reward_std": 0.05997780826874077, + "rewards/equation_reward_func": 0.786607176065445, + "rewards/format_reward_func": 0.9852678664028645, + "step": 8396 + }, + { + "completion_length": 257.44197273254395, + "epoch": 1.4080640429188147, + "grad_norm": 0.1787303931438916, + "kl": 0.350555419921875, + "learning_rate": 4.986410280252244e-07, + "loss": 0.0004, + "reward": 1.7986607998609543, + "reward_std": 0.05240166233852506, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 0.9986607171595097, + "step": 8398 + }, + { + "completion_length": 261.2142972946167, + "epoch": 1.4083993461586823, + "grad_norm": 0.3143881061853852, + "kl": 0.587188720703125, + "learning_rate": 4.986397786686793e-07, + "loss": 0.0006, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8400 + }, + { + "completion_length": 263.7767972946167, + "epoch": 1.4087346493985498, + "grad_norm": 0.608426074512046, + "kl": 0.341552734375, + "learning_rate": 4.986385287396733e-07, + "loss": 0.0003, + "reward": 1.7392857894301414, + "reward_std": 0.10606601554900408, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8402 + }, + { + "completion_length": 274.3080472946167, + "epoch": 1.4090699526384174, + "grad_norm": 0.6285536610000946, + "kl": 0.3597412109375, + "learning_rate": 4.986372782382092e-07, + "loss": 0.0004, + "reward": 1.692857213318348, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.701785746961832, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8404 + }, + { + "completion_length": 260.9910831451416, + "epoch": 1.4094052558782848, + "grad_norm": 0.10934691631824027, + "kl": 0.264923095703125, + "learning_rate": 4.986360271642898e-07, + "loss": 0.0003, + "reward": 1.7553571984171867, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7687500305473804, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8406 + }, + { + "completion_length": 262.0044755935669, + "epoch": 1.4097405591181524, + "grad_norm": 0.14820756319286144, + "kl": 0.15960693359375, + "learning_rate": 4.986347755179181e-07, + "loss": 0.0002, + "reward": 1.8107143267989159, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107142932713032, + "rewards/format_reward_func": 1.0, + "step": 8408 + }, + { + "completion_length": 260.5714406967163, + "epoch": 1.41007586235802, + "grad_norm": 0.21250860071479305, + "kl": 0.140960693359375, + "learning_rate": 4.98633523299097e-07, + "loss": 0.0001, + "reward": 1.7303572222590446, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7348214592784643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8410 + }, + { + "completion_length": 258.4509029388428, + "epoch": 1.4104111655978877, + "grad_norm": 0.355366517952804, + "kl": 0.192138671875, + "learning_rate": 4.986322705078294e-07, + "loss": 0.0002, + "reward": 1.7053572237491608, + "reward_std": 0.12374368496239185, + "rewards/equation_reward_func": 0.7187500242143869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8412 + }, + { + "completion_length": 248.47322845458984, + "epoch": 1.4107464688377551, + "grad_norm": 0.24897194525629096, + "kl": 0.14501953125, + "learning_rate": 4.98631017144118e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 8414 + }, + { + "completion_length": 233.80804634094238, + "epoch": 1.4110817720776228, + "grad_norm": 0.22718979397807326, + "kl": 0.15289306640625, + "learning_rate": 4.986297632079659e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 8416 + }, + { + "completion_length": 247.35268783569336, + "epoch": 1.4114170753174902, + "grad_norm": 0.1969943143254704, + "kl": 0.1800537109375, + "learning_rate": 4.986285086993759e-07, + "loss": 0.0002, + "reward": 1.7500000819563866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 8418 + }, + { + "completion_length": 248.67412090301514, + "epoch": 1.4117523785573578, + "grad_norm": 0.3579731369561279, + "kl": 0.17572021484375, + "learning_rate": 4.986272536183509e-07, + "loss": 0.0002, + "reward": 1.771428607404232, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 8420 + }, + { + "completion_length": 245.44197463989258, + "epoch": 1.4120876817972254, + "grad_norm": 0.19579481260834086, + "kl": 0.20904541015625, + "learning_rate": 4.986259979648938e-07, + "loss": 0.0002, + "reward": 1.6642857864499092, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.6732143200933933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8422 + }, + { + "completion_length": 250.2053680419922, + "epoch": 1.4124229850370928, + "grad_norm": 0.4279326961042182, + "kl": 0.214385986328125, + "learning_rate": 4.986247417390074e-07, + "loss": 0.0002, + "reward": 1.7553572058677673, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8424 + }, + { + "completion_length": 239.1562623977661, + "epoch": 1.4127582882769605, + "grad_norm": 0.3513471847587535, + "kl": 0.18414306640625, + "learning_rate": 4.986234849406947e-07, + "loss": 0.0002, + "reward": 1.7629465088248253, + "reward_std": 0.06250318652018905, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 0.9986607171595097, + "step": 8426 + }, + { + "completion_length": 245.0714406967163, + "epoch": 1.413093591516828, + "grad_norm": 0.3502444509464687, + "kl": 0.243865966796875, + "learning_rate": 4.986222275699585e-07, + "loss": 0.0002, + "reward": 1.8375000655651093, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8419643118977547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8428 + }, + { + "completion_length": 245.3125114440918, + "epoch": 1.4134288947566955, + "grad_norm": 0.437842987118793, + "kl": 0.224090576171875, + "learning_rate": 4.986209696268018e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 1.0, + "step": 8430 + }, + { + "completion_length": 247.5312623977661, + "epoch": 1.4137641979965632, + "grad_norm": 0.19685596006311468, + "kl": 0.1822509765625, + "learning_rate": 4.986197111112275e-07, + "loss": 0.0002, + "reward": 1.7392857819795609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 8432 + }, + { + "completion_length": 244.63393783569336, + "epoch": 1.4140995012364308, + "grad_norm": 0.16053489041924518, + "kl": 0.25244140625, + "learning_rate": 4.986184520232383e-07, + "loss": 0.0003, + "reward": 1.7696429044008255, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7741071842610836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8434 + }, + { + "completion_length": 242.8973331451416, + "epoch": 1.4144348044762982, + "grad_norm": 0.11632207449364086, + "kl": 0.202850341796875, + "learning_rate": 4.986171923628373e-07, + "loss": 0.0002, + "reward": 1.8035715073347092, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 8436 + }, + { + "completion_length": 248.72769165039062, + "epoch": 1.4147701077161658, + "grad_norm": 0.3687579739976894, + "kl": 0.3116455078125, + "learning_rate": 4.986159321300274e-07, + "loss": 0.0003, + "reward": 1.7767857685685158, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8438 + }, + { + "completion_length": 246.8928680419922, + "epoch": 1.4151054109560333, + "grad_norm": 0.25257908439965243, + "kl": 0.21636962890625, + "learning_rate": 4.986146713248115e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 8440 + }, + { + "completion_length": 254.77679538726807, + "epoch": 1.415440714195901, + "grad_norm": 0.3316500270046741, + "kl": 0.380126953125, + "learning_rate": 4.986134099471923e-07, + "loss": 0.0004, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428846567869, + "rewards/format_reward_func": 1.0, + "step": 8442 + }, + { + "completion_length": 244.6116189956665, + "epoch": 1.4157760174357685, + "grad_norm": 0.13862563902299166, + "kl": 0.159881591796875, + "learning_rate": 4.986121479971729e-07, + "loss": 0.0002, + "reward": 1.7357143312692642, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357142996042967, + "rewards/format_reward_func": 1.0, + "step": 8444 + }, + { + "completion_length": 241.01340770721436, + "epoch": 1.4161113206756362, + "grad_norm": 0.18927875554614454, + "kl": 0.397674560546875, + "learning_rate": 4.986108854747561e-07, + "loss": 0.0004, + "reward": 1.7620536461472511, + "reward_std": 0.05366435460746288, + "rewards/equation_reward_func": 0.7696428932249546, + "rewards/format_reward_func": 0.9924107193946838, + "step": 8446 + }, + { + "completion_length": 251.79465579986572, + "epoch": 1.4164466239155036, + "grad_norm": 0.1946390969329135, + "kl": 0.354339599609375, + "learning_rate": 4.986096223799449e-07, + "loss": 0.0004, + "reward": 1.805357202887535, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.8187500312924385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8448 + }, + { + "completion_length": 262.28126525878906, + "epoch": 1.4167819271553712, + "grad_norm": 0.15546482453488383, + "kl": 0.262237548828125, + "learning_rate": 4.986083587127423e-07, + "loss": 0.0003, + "reward": 1.7732143476605415, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8450 + }, + { + "completion_length": 250.01786994934082, + "epoch": 1.4171172303952386, + "grad_norm": 0.25181161802683366, + "kl": 0.218536376953125, + "learning_rate": 4.98607094473151e-07, + "loss": 0.0002, + "reward": 1.796428643167019, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.8053571656346321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8452 + }, + { + "completion_length": 255.62500858306885, + "epoch": 1.4174525336351063, + "grad_norm": 0.3271275806533319, + "kl": 0.414764404296875, + "learning_rate": 4.98605829661174e-07, + "loss": 0.0004, + "reward": 1.7678571939468384, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8454 + }, + { + "completion_length": 256.46875858306885, + "epoch": 1.417787836874974, + "grad_norm": 0.47129680996280954, + "kl": 0.260772705078125, + "learning_rate": 4.986045642768141e-07, + "loss": 0.0003, + "reward": 1.764285758137703, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 8456 + }, + { + "completion_length": 259.5714387893677, + "epoch": 1.4181231401148413, + "grad_norm": 0.09692164782946838, + "kl": 0.162628173828125, + "learning_rate": 4.986032983200745e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7982143089175224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8458 + }, + { + "completion_length": 256.38840198516846, + "epoch": 1.418458443354709, + "grad_norm": 0.23052555459158877, + "kl": 0.2418212890625, + "learning_rate": 4.986020317909577e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8460 + }, + { + "completion_length": 259.8259048461914, + "epoch": 1.4187937465945764, + "grad_norm": 0.21238298783978266, + "kl": 0.22589111328125, + "learning_rate": 4.98600764689467e-07, + "loss": 0.0002, + "reward": 1.7500000894069672, + "reward_std": 0.04040610231459141, + "rewards/equation_reward_func": 0.7589285895228386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8462 + }, + { + "completion_length": 255.8125123977661, + "epoch": 1.419129049834444, + "grad_norm": 0.10720861997222418, + "kl": 0.178863525390625, + "learning_rate": 4.985994970156052e-07, + "loss": 0.0002, + "reward": 1.7410714775323868, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357313156128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8464 + }, + { + "completion_length": 263.75001335144043, + "epoch": 1.4194643530743116, + "grad_norm": 0.2796565830384278, + "kl": 0.134246826171875, + "learning_rate": 4.985982287693751e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7848214693367481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8466 + }, + { + "completion_length": 269.77679920196533, + "epoch": 1.4197996563141793, + "grad_norm": 0.1424100872454886, + "kl": 0.282989501953125, + "learning_rate": 4.985969599507797e-07, + "loss": 0.0003, + "reward": 1.741071492433548, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8468 + }, + { + "completion_length": 255.5759038925171, + "epoch": 1.4201349595540467, + "grad_norm": 0.10532280309336838, + "kl": 0.143096923828125, + "learning_rate": 4.98595690559822e-07, + "loss": 0.0001, + "reward": 1.7446429505944252, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8470 + }, + { + "completion_length": 252.64733505249023, + "epoch": 1.4204702627939143, + "grad_norm": 0.4641752137066568, + "kl": 0.17669677734375, + "learning_rate": 4.985944205965048e-07, + "loss": 0.0002, + "reward": 1.723214365541935, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8472 + }, + { + "completion_length": 248.41965198516846, + "epoch": 1.4208055660337817, + "grad_norm": 0.20406659597155857, + "kl": 0.146026611328125, + "learning_rate": 4.985931500608311e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285980910063, + "rewards/format_reward_func": 1.0, + "step": 8474 + }, + { + "completion_length": 246.37500953674316, + "epoch": 1.4211408692736494, + "grad_norm": 0.25091824175709215, + "kl": 0.225311279296875, + "learning_rate": 4.985918789528037e-07, + "loss": 0.0002, + "reward": 1.732142947614193, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 8476 + }, + { + "completion_length": 252.33929920196533, + "epoch": 1.421476172513517, + "grad_norm": 0.3030296561947238, + "kl": 0.157745361328125, + "learning_rate": 4.985906072724257e-07, + "loss": 0.0002, + "reward": 1.735714353621006, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143219560385, + "rewards/format_reward_func": 1.0, + "step": 8478 + }, + { + "completion_length": 263.6250114440918, + "epoch": 1.4218114757533844, + "grad_norm": 0.3695755041875649, + "kl": 0.115570068359375, + "learning_rate": 4.985893350196999e-07, + "loss": 0.0001, + "reward": 1.7125000804662704, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643212109804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8480 + }, + { + "completion_length": 256.9732255935669, + "epoch": 1.422146778993252, + "grad_norm": 0.1439424708583325, + "kl": 0.1053314208984375, + "learning_rate": 4.985880621946294e-07, + "loss": 0.0001, + "reward": 1.7392858043313026, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8482 + }, + { + "completion_length": 263.15626335144043, + "epoch": 1.4224820822331194, + "grad_norm": 0.281594071406156, + "kl": 0.112548828125, + "learning_rate": 4.985867887972169e-07, + "loss": 0.0001, + "reward": 1.6857143715023994, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7035714685916901, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8484 + }, + { + "completion_length": 253.27679920196533, + "epoch": 1.422817385472987, + "grad_norm": 0.20893025538573007, + "kl": 0.092681884765625, + "learning_rate": 4.985855148274655e-07, + "loss": 0.0001, + "reward": 1.7767857611179352, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8486 + }, + { + "completion_length": 250.0714406967163, + "epoch": 1.4231526887128547, + "grad_norm": 0.16251693933127415, + "kl": 0.122161865234375, + "learning_rate": 4.985842402853781e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 8488 + }, + { + "completion_length": 253.47769260406494, + "epoch": 1.4234879919527224, + "grad_norm": 0.2043458697797928, + "kl": 0.140777587890625, + "learning_rate": 4.985829651709575e-07, + "loss": 0.0001, + "reward": 1.8303571939468384, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8348214440047741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8490 + }, + { + "completion_length": 246.39733123779297, + "epoch": 1.4238232951925898, + "grad_norm": 0.20184167414718476, + "kl": 0.106292724609375, + "learning_rate": 4.985816894842069e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 8492 + }, + { + "completion_length": 245.11608219146729, + "epoch": 1.4241585984324574, + "grad_norm": 0.1438843833872152, + "kl": 0.119415283203125, + "learning_rate": 4.985804132251289e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7741071749478579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8494 + }, + { + "completion_length": 258.9151887893677, + "epoch": 1.4244939016723248, + "grad_norm": 0.26308992695053623, + "kl": 0.12542724609375, + "learning_rate": 4.985791363937267e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 8496 + }, + { + "completion_length": 251.2410831451416, + "epoch": 1.4248292049121924, + "grad_norm": 0.1744191837380716, + "kl": 0.1085205078125, + "learning_rate": 4.985778589900032e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 8498 + }, + { + "completion_length": 242.3348331451416, + "epoch": 1.42516450815206, + "grad_norm": 0.25796914063104437, + "kl": 0.091278076171875, + "learning_rate": 4.985765810139614e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8500 + }, + { + "completion_length": 243.51340198516846, + "epoch": 1.4254998113919275, + "grad_norm": 0.27267829180547887, + "kl": 0.1027984619140625, + "learning_rate": 4.98575302465604e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 8502 + }, + { + "completion_length": 253.4553689956665, + "epoch": 1.4258351146317951, + "grad_norm": 0.25539325970142795, + "kl": 0.093658447265625, + "learning_rate": 4.985740233449341e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.7616071701049805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8504 + }, + { + "completion_length": 227.38393688201904, + "epoch": 1.4261704178716628, + "grad_norm": 0.2182021184194296, + "kl": 0.0866546630859375, + "learning_rate": 4.985727436519547e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 8506 + }, + { + "completion_length": 233.50447368621826, + "epoch": 1.4265057211115302, + "grad_norm": 0.12847280880524686, + "kl": 0.1015472412109375, + "learning_rate": 4.985714633866685e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714484751225, + "rewards/format_reward_func": 1.0, + "step": 8508 + }, + { + "completion_length": 247.74108219146729, + "epoch": 1.4268410243513978, + "grad_norm": 0.17515847834387244, + "kl": 0.113861083984375, + "learning_rate": 4.985701825490789e-07, + "loss": 0.0001, + "reward": 1.7125000953674316, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.716964328661561, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8510 + }, + { + "completion_length": 241.46429824829102, + "epoch": 1.4271763275912654, + "grad_norm": 0.18851509196918953, + "kl": 0.089111328125, + "learning_rate": 4.985689011391884e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 8512 + }, + { + "completion_length": 236.56697463989258, + "epoch": 1.4275116308311329, + "grad_norm": 0.1949627540989246, + "kl": 0.0927734375, + "learning_rate": 4.985676191570001e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 8514 + }, + { + "completion_length": 228.66518878936768, + "epoch": 1.4278469340710005, + "grad_norm": 0.2164410413970927, + "kl": 0.0898284912109375, + "learning_rate": 4.985663366025171e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 8516 + }, + { + "completion_length": 230.446439743042, + "epoch": 1.428182237310868, + "grad_norm": 0.10347510631654368, + "kl": 0.11016845703125, + "learning_rate": 4.985650534757421e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 8518 + }, + { + "completion_length": 238.00447368621826, + "epoch": 1.4285175405507355, + "grad_norm": 0.1783372713511173, + "kl": 0.0781707763671875, + "learning_rate": 4.985637697766783e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.767857177183032, + "rewards/format_reward_func": 1.0, + "step": 8520 + }, + { + "completion_length": 246.2767972946167, + "epoch": 1.4288528437906032, + "grad_norm": 0.2876657858107366, + "kl": 0.0819549560546875, + "learning_rate": 4.985624855053286e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428935974836, + "rewards/format_reward_func": 1.0, + "step": 8522 + }, + { + "completion_length": 230.7544755935669, + "epoch": 1.4291881470304708, + "grad_norm": 0.22398410695846024, + "kl": 0.0896148681640625, + "learning_rate": 4.985612006616957e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 8524 + }, + { + "completion_length": 241.4241189956665, + "epoch": 1.4295234502703382, + "grad_norm": 0.5155656556852308, + "kl": 0.111846923828125, + "learning_rate": 4.985599152457829e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 8526 + }, + { + "completion_length": 237.36608219146729, + "epoch": 1.4298587535102059, + "grad_norm": 0.12106024005112531, + "kl": 0.085052490234375, + "learning_rate": 4.985586292575929e-07, + "loss": 0.0001, + "reward": 1.7678571790456772, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571697324514, + "rewards/format_reward_func": 1.0, + "step": 8528 + }, + { + "completion_length": 227.28572845458984, + "epoch": 1.4301940567500733, + "grad_norm": 0.15222247755437548, + "kl": 0.0845947265625, + "learning_rate": 4.985573426971289e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 8530 + }, + { + "completion_length": 240.3705472946167, + "epoch": 1.430529359989941, + "grad_norm": 0.23859971475872568, + "kl": 0.091400146484375, + "learning_rate": 4.985560555643937e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8532 + }, + { + "completion_length": 243.41072463989258, + "epoch": 1.4308646632298085, + "grad_norm": 0.22536579496061887, + "kl": 0.081878662109375, + "learning_rate": 4.985547678593903e-07, + "loss": 0.0001, + "reward": 1.7232143357396126, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8534 + }, + { + "completion_length": 247.3750123977661, + "epoch": 1.431199966469676, + "grad_norm": 0.2802477726314655, + "kl": 0.079833984375, + "learning_rate": 4.985534795821217e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 8536 + }, + { + "completion_length": 244.95983123779297, + "epoch": 1.4315352697095436, + "grad_norm": 0.22365417759999096, + "kl": 0.0858612060546875, + "learning_rate": 4.985521907325907e-07, + "loss": 0.0001, + "reward": 1.7160715386271477, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7205357328057289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8538 + }, + { + "completion_length": 247.64286708831787, + "epoch": 1.431870572949411, + "grad_norm": 0.29738940177670603, + "kl": 0.0897216796875, + "learning_rate": 4.985509013108005e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 8540 + }, + { + "completion_length": 242.60269260406494, + "epoch": 1.4322058761892786, + "grad_norm": 0.12314884860826378, + "kl": 0.092803955078125, + "learning_rate": 4.985496113167539e-07, + "loss": 0.0001, + "reward": 1.8321429044008255, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321428932249546, + "rewards/format_reward_func": 1.0, + "step": 8542 + }, + { + "completion_length": 236.54018878936768, + "epoch": 1.4325411794291463, + "grad_norm": 0.28216533234311264, + "kl": 0.093536376953125, + "learning_rate": 4.985483207504541e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 8544 + }, + { + "completion_length": 234.81250953674316, + "epoch": 1.432876482669014, + "grad_norm": 0.22984926963704252, + "kl": 0.110687255859375, + "learning_rate": 4.985470296119038e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 8546 + }, + { + "completion_length": 232.24554634094238, + "epoch": 1.4332117859088813, + "grad_norm": 0.11927480159230068, + "kl": 0.120819091796875, + "learning_rate": 4.985457379011061e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 8548 + }, + { + "completion_length": 230.32143783569336, + "epoch": 1.433547089148749, + "grad_norm": 0.20098209710952783, + "kl": 0.1114654541015625, + "learning_rate": 4.98544445618064e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428719907999, + "rewards/format_reward_func": 1.0, + "step": 8550 + }, + { + "completion_length": 237.6294755935669, + "epoch": 1.4338823923886164, + "grad_norm": 0.12974919344947425, + "kl": 0.113372802734375, + "learning_rate": 4.985431527627804e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.8089285865426064, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8552 + }, + { + "completion_length": 242.09375953674316, + "epoch": 1.434217695628484, + "grad_norm": 0.1779069025850947, + "kl": 0.11090087890625, + "learning_rate": 4.985418593352583e-07, + "loss": 0.0001, + "reward": 1.801785796880722, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8554 + }, + { + "completion_length": 253.00894165039062, + "epoch": 1.4345529988683516, + "grad_norm": 0.39904585725146463, + "kl": 0.11761474609375, + "learning_rate": 4.985405653355006e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7732143327593803, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8556 + }, + { + "completion_length": 241.35268783569336, + "epoch": 1.434888302108219, + "grad_norm": 0.20989425840657888, + "kl": 0.116424560546875, + "learning_rate": 4.985392707635104e-07, + "loss": 0.0001, + "reward": 1.7535714656114578, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714600235224, + "rewards/format_reward_func": 1.0, + "step": 8558 + }, + { + "completion_length": 254.6339406967163, + "epoch": 1.4352236053480867, + "grad_norm": 0.5080392816810477, + "kl": 0.169158935546875, + "learning_rate": 4.985379756192908e-07, + "loss": 0.0002, + "reward": 1.7428572177886963, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7517857514321804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8560 + }, + { + "completion_length": 250.40625953674316, + "epoch": 1.435558908587954, + "grad_norm": 0.40974130131501063, + "kl": 0.1434326171875, + "learning_rate": 4.985366799028445e-07, + "loss": 0.0001, + "reward": 1.7361607775092125, + "reward_std": 0.05997780663892627, + "rewards/equation_reward_func": 0.741964302957058, + "rewards/format_reward_func": 0.9941964335739613, + "step": 8562 + }, + { + "completion_length": 238.5759038925171, + "epoch": 1.4358942118278217, + "grad_norm": 0.3081459571029257, + "kl": 0.13323974609375, + "learning_rate": 4.985353836141746e-07, + "loss": 0.0001, + "reward": 1.742857187986374, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7517857477068901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8564 + }, + { + "completion_length": 245.1116189956665, + "epoch": 1.4362295150676894, + "grad_norm": 0.17198720339414358, + "kl": 0.1402435302734375, + "learning_rate": 4.985340867532841e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7285714708268642, + "rewards/format_reward_func": 1.0, + "step": 8566 + }, + { + "completion_length": 245.3437614440918, + "epoch": 1.436564818307557, + "grad_norm": 0.19688696870277422, + "kl": 0.13861083984375, + "learning_rate": 4.98532789320176e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 1.0, + "step": 8568 + }, + { + "completion_length": 236.9910831451416, + "epoch": 1.4369001215474244, + "grad_norm": 0.1986506605536204, + "kl": 0.130035400390625, + "learning_rate": 4.985314913148534e-07, + "loss": 0.0001, + "reward": 1.7910715118050575, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8570 + }, + { + "completion_length": 236.37054634094238, + "epoch": 1.437235424787292, + "grad_norm": 0.1790702839307691, + "kl": 0.146270751953125, + "learning_rate": 4.98530192737319e-07, + "loss": 0.0001, + "reward": 1.7964286133646965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 8572 + }, + { + "completion_length": 252.28126049041748, + "epoch": 1.4375707280271595, + "grad_norm": 0.224827230192582, + "kl": 0.1280517578125, + "learning_rate": 4.98528893587576e-07, + "loss": 0.0001, + "reward": 1.7526786476373672, + "reward_std": 0.05682107945904136, + "rewards/equation_reward_func": 0.7589285969734192, + "rewards/format_reward_func": 0.9937500059604645, + "step": 8574 + }, + { + "completion_length": 242.94643878936768, + "epoch": 1.437906031267027, + "grad_norm": 0.40589939211072706, + "kl": 0.128082275390625, + "learning_rate": 4.985275938656273e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.11111677810549736, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8576 + }, + { + "completion_length": 240.07590293884277, + "epoch": 1.4382413345068947, + "grad_norm": 0.2912371749687889, + "kl": 0.328521728515625, + "learning_rate": 4.98526293571476e-07, + "loss": 0.0003, + "reward": 1.705357238650322, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7098214663565159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8578 + }, + { + "completion_length": 242.60715293884277, + "epoch": 1.4385766377467624, + "grad_norm": 0.21713702252679135, + "kl": 0.2657470703125, + "learning_rate": 4.98524992705125e-07, + "loss": 0.0003, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 8580 + }, + { + "completion_length": 248.51340293884277, + "epoch": 1.4389119409866298, + "grad_norm": 0.34041115823987345, + "kl": 0.33697509765625, + "learning_rate": 4.985236912665773e-07, + "loss": 0.0003, + "reward": 1.7482143640518188, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526785917580128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8582 + }, + { + "completion_length": 252.89733219146729, + "epoch": 1.4392472442264974, + "grad_norm": 0.3167723579300486, + "kl": 0.2979736328125, + "learning_rate": 4.985223892558359e-07, + "loss": 0.0003, + "reward": 1.757142923772335, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 8584 + }, + { + "completion_length": 232.28125953674316, + "epoch": 1.4395825474663648, + "grad_norm": 0.14006470671982402, + "kl": 0.265777587890625, + "learning_rate": 4.985210866729038e-07, + "loss": 0.0003, + "reward": 1.7053572237491608, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.709821455180645, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8586 + }, + { + "completion_length": 240.96429347991943, + "epoch": 1.4399178507062325, + "grad_norm": 0.5398088833976759, + "kl": 0.422210693359375, + "learning_rate": 4.985197835177841e-07, + "loss": 0.0004, + "reward": 1.7107143625617027, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.719642885029316, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8588 + }, + { + "completion_length": 252.01340293884277, + "epoch": 1.4402531539461, + "grad_norm": 0.19388137471372102, + "kl": 0.404083251953125, + "learning_rate": 4.985184797904797e-07, + "loss": 0.0004, + "reward": 1.7017858028411865, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.706250037997961, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8590 + }, + { + "completion_length": 249.28572750091553, + "epoch": 1.4405884571859675, + "grad_norm": 0.1371797585529847, + "kl": 0.428375244140625, + "learning_rate": 4.985171754909936e-07, + "loss": 0.0004, + "reward": 1.7428572252392769, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7517857365310192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8592 + }, + { + "completion_length": 244.2500114440918, + "epoch": 1.4409237604258351, + "grad_norm": 0.23078455389106672, + "kl": 0.218597412109375, + "learning_rate": 4.985158706193287e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7839285936206579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8594 + }, + { + "completion_length": 251.49108219146729, + "epoch": 1.4412590636657026, + "grad_norm": 0.2085646350562671, + "kl": 0.250274658203125, + "learning_rate": 4.985145651754883e-07, + "loss": 0.0003, + "reward": 1.7625000551342964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8596 + }, + { + "completion_length": 253.1517972946167, + "epoch": 1.4415943669055702, + "grad_norm": 0.19498147177236294, + "kl": 0.826019287109375, + "learning_rate": 4.985132591594751e-07, + "loss": 0.0008, + "reward": 1.7857143357396126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143022119999, + "rewards/format_reward_func": 1.0, + "step": 8598 + }, + { + "completion_length": 234.71876049041748, + "epoch": 1.4419296701454378, + "grad_norm": 0.544497343527771, + "kl": 0.1741943359375, + "learning_rate": 4.985119525712924e-07, + "loss": 0.0002, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143297791481, + "rewards/format_reward_func": 1.0, + "step": 8600 + }, + { + "completion_length": 230.98661708831787, + "epoch": 1.4422649733853055, + "grad_norm": 0.2627565818988088, + "kl": 0.214813232421875, + "learning_rate": 4.98510645410943e-07, + "loss": 0.0002, + "reward": 1.716071479022503, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7205357514321804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8602 + }, + { + "completion_length": 233.70090103149414, + "epoch": 1.4426002766251729, + "grad_norm": 0.6021051909045801, + "kl": 0.244903564453125, + "learning_rate": 4.985093376784298e-07, + "loss": 0.0002, + "reward": 1.7660715132951736, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8604 + }, + { + "completion_length": 235.54911518096924, + "epoch": 1.4429355798650405, + "grad_norm": 0.2665148810564702, + "kl": 0.25433349609375, + "learning_rate": 4.98508029373756e-07, + "loss": 0.0003, + "reward": 1.7142858058214188, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 8606 + }, + { + "completion_length": 232.60268688201904, + "epoch": 1.443270883104908, + "grad_norm": 0.34224759146463907, + "kl": 0.1395263671875, + "learning_rate": 4.985067204969247e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 8608 + }, + { + "completion_length": 233.10715579986572, + "epoch": 1.4436061863447756, + "grad_norm": 0.19504296285660103, + "kl": 0.133758544921875, + "learning_rate": 4.985054110479387e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 8610 + }, + { + "completion_length": 239.98215293884277, + "epoch": 1.4439414895846432, + "grad_norm": 0.17531776073423436, + "kl": 0.121978759765625, + "learning_rate": 4.985041010268011e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 8612 + }, + { + "completion_length": 219.0044755935669, + "epoch": 1.4442767928245106, + "grad_norm": 0.5596495225303464, + "kl": 0.151397705078125, + "learning_rate": 4.98502790433515e-07, + "loss": 0.0002, + "reward": 1.8142857402563095, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 8614 + }, + { + "completion_length": 229.80358219146729, + "epoch": 1.4446120960643782, + "grad_norm": 0.15586386632307309, + "kl": 0.11224365234375, + "learning_rate": 4.985014792680833e-07, + "loss": 0.0001, + "reward": 1.7035714834928513, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.703571455553174, + "rewards/format_reward_func": 1.0, + "step": 8616 + }, + { + "completion_length": 245.8526906967163, + "epoch": 1.4449473993042456, + "grad_norm": 0.13640709963299535, + "kl": 0.118865966796875, + "learning_rate": 4.98500167530509e-07, + "loss": 0.0001, + "reward": 1.6883929371833801, + "reward_std": 0.06692260596901178, + "rewards/equation_reward_func": 0.6946429014205933, + "rewards/format_reward_func": 0.9937500059604645, + "step": 8618 + }, + { + "completion_length": 227.4687614440918, + "epoch": 1.4452827025441133, + "grad_norm": 0.2478581313144655, + "kl": 0.0968017578125, + "learning_rate": 4.984988552207952e-07, + "loss": 0.0001, + "reward": 1.7535715252161026, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 8620 + }, + { + "completion_length": 242.321439743042, + "epoch": 1.445618005783981, + "grad_norm": 0.4948357753835724, + "kl": 0.118927001953125, + "learning_rate": 4.984975423389449e-07, + "loss": 0.0001, + "reward": 1.7375000789761543, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8622 + }, + { + "completion_length": 230.17858123779297, + "epoch": 1.4459533090238486, + "grad_norm": 0.13151233430921153, + "kl": 0.130401611328125, + "learning_rate": 4.984962288849611e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 8624 + }, + { + "completion_length": 234.3616189956665, + "epoch": 1.446288612263716, + "grad_norm": 0.3987436135276987, + "kl": 0.14691162109375, + "learning_rate": 4.98494914858847e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7553571704775095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8626 + }, + { + "completion_length": 239.37054920196533, + "epoch": 1.4466239155035836, + "grad_norm": 0.5486163983262353, + "kl": 0.307373046875, + "learning_rate": 4.984936002606053e-07, + "loss": 0.0003, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 8628 + }, + { + "completion_length": 230.57590675354004, + "epoch": 1.446959218743451, + "grad_norm": 0.12681436673323587, + "kl": 0.11016845703125, + "learning_rate": 4.984922850902392e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643197208643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8630 + }, + { + "completion_length": 241.89286994934082, + "epoch": 1.4472945219833186, + "grad_norm": 0.18086561065557644, + "kl": 0.263641357421875, + "learning_rate": 4.984909693477518e-07, + "loss": 0.0003, + "reward": 1.7285714894533157, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 8632 + }, + { + "completion_length": 242.1562623977661, + "epoch": 1.4476298252231863, + "grad_norm": 0.2787489245249078, + "kl": 0.3875732421875, + "learning_rate": 4.98489653033146e-07, + "loss": 0.0004, + "reward": 1.7125000804662704, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7169643305242062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8634 + }, + { + "completion_length": 245.22322463989258, + "epoch": 1.4479651284630537, + "grad_norm": 0.2814785129957571, + "kl": 0.5368499755859375, + "learning_rate": 4.98488336146425e-07, + "loss": 0.0005, + "reward": 1.7464286237955093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 8636 + }, + { + "completion_length": 240.5000114440918, + "epoch": 1.4483004317029213, + "grad_norm": 0.24754685535957013, + "kl": 0.8140869140625, + "learning_rate": 4.984870186875916e-07, + "loss": 0.0008, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 8638 + }, + { + "completion_length": 232.0669755935669, + "epoch": 1.448635734942789, + "grad_norm": 0.24005853574977867, + "kl": 0.136199951171875, + "learning_rate": 4.984857006566489e-07, + "loss": 0.0001, + "reward": 1.7071429565548897, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7071428876370192, + "rewards/format_reward_func": 1.0, + "step": 8640 + }, + { + "completion_length": 239.9776906967163, + "epoch": 1.4489710381826564, + "grad_norm": 0.3229400482830985, + "kl": 0.5002593994140625, + "learning_rate": 4.984843820536002e-07, + "loss": 0.0005, + "reward": 1.7500000596046448, + "reward_std": 0.08081220369786024, + "rewards/equation_reward_func": 0.7589285913854837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8642 + }, + { + "completion_length": 246.5714406967163, + "epoch": 1.449306341422524, + "grad_norm": 0.19913141938149045, + "kl": 0.5619049072265625, + "learning_rate": 4.984830628784482e-07, + "loss": 0.0006, + "reward": 1.74642863124609, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7553571835160255, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8644 + }, + { + "completion_length": 256.48215675354004, + "epoch": 1.4496416446623916, + "grad_norm": 0.1987641534881554, + "kl": 0.1080780029296875, + "learning_rate": 4.984817431311961e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7767857350409031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8646 + }, + { + "completion_length": 243.52679634094238, + "epoch": 1.449976947902259, + "grad_norm": 0.388932344482841, + "kl": 0.1192169189453125, + "learning_rate": 4.984804228118468e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 0.987500011920929, + "step": 8648 + }, + { + "completion_length": 253.69643688201904, + "epoch": 1.4503122511421267, + "grad_norm": 0.12055179889101425, + "kl": 0.193603515625, + "learning_rate": 4.984791019204034e-07, + "loss": 0.0002, + "reward": 1.7267857640981674, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7401786055415869, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8650 + }, + { + "completion_length": 256.64287185668945, + "epoch": 1.450647554381994, + "grad_norm": 0.45740297022280063, + "kl": 0.192108154296875, + "learning_rate": 4.984777804568692e-07, + "loss": 0.0002, + "reward": 1.785714328289032, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 8652 + }, + { + "completion_length": 255.66519165039062, + "epoch": 1.4509828576218617, + "grad_norm": 0.17801379402730066, + "kl": 0.08984375, + "learning_rate": 4.984764584212469e-07, + "loss": 0.0001, + "reward": 1.7803572192788124, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8654 + }, + { + "completion_length": 267.5491199493408, + "epoch": 1.4513181608617294, + "grad_norm": 0.11665353846037967, + "kl": 0.127777099609375, + "learning_rate": 4.984751358135396e-07, + "loss": 0.0001, + "reward": 1.714285783469677, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7232143133878708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8656 + }, + { + "completion_length": 263.14287090301514, + "epoch": 1.451653464101597, + "grad_norm": 0.25408979673938825, + "kl": 0.13909912109375, + "learning_rate": 4.984738126337504e-07, + "loss": 0.0001, + "reward": 1.6875000521540642, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7008928880095482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8658 + }, + { + "completion_length": 272.9107255935669, + "epoch": 1.4519887673414644, + "grad_norm": 0.18260584338052813, + "kl": 0.2103271484375, + "learning_rate": 4.984724888818824e-07, + "loss": 0.0002, + "reward": 1.7982143387198448, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8660 + }, + { + "completion_length": 264.65625953674316, + "epoch": 1.452324070581332, + "grad_norm": 0.06142023819391897, + "kl": 0.0996246337890625, + "learning_rate": 4.984711645579387e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714685916901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8662 + }, + { + "completion_length": 263.11162090301514, + "epoch": 1.4526593738211995, + "grad_norm": 0.1881334188221849, + "kl": 0.0886993408203125, + "learning_rate": 4.984698396619221e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8664 + }, + { + "completion_length": 257.37500953674316, + "epoch": 1.452994677061067, + "grad_norm": 0.18317443366305186, + "kl": 0.09564208984375, + "learning_rate": 4.98468514193836e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8666 + }, + { + "completion_length": 268.60715770721436, + "epoch": 1.4533299803009347, + "grad_norm": 0.4507416161216136, + "kl": 0.1070556640625, + "learning_rate": 4.984671881536831e-07, + "loss": 0.0001, + "reward": 1.6857143715023994, + "reward_std": 0.11111677810549736, + "rewards/equation_reward_func": 0.7035714704543352, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8668 + }, + { + "completion_length": 267.8214387893677, + "epoch": 1.4536652835408022, + "grad_norm": 0.1606135561413552, + "kl": 0.0889129638671875, + "learning_rate": 4.984658615414666e-07, + "loss": 0.0001, + "reward": 1.812500074505806, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8169643022119999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8670 + }, + { + "completion_length": 254.62947463989258, + "epoch": 1.4540005867806698, + "grad_norm": 0.13900528444515145, + "kl": 0.0863189697265625, + "learning_rate": 4.984645343571896e-07, + "loss": 0.0001, + "reward": 1.7875000685453415, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8672 + }, + { + "completion_length": 272.78126525878906, + "epoch": 1.4543358900205372, + "grad_norm": 0.20889585300332983, + "kl": 0.0862579345703125, + "learning_rate": 4.984632066008551e-07, + "loss": 0.0001, + "reward": 1.7571428939700127, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 0.9821428656578064, + "step": 8674 + }, + { + "completion_length": 270.0491189956665, + "epoch": 1.4546711932604048, + "grad_norm": 0.23130697896746683, + "kl": 0.1115264892578125, + "learning_rate": 4.984618782724662e-07, + "loss": 0.0001, + "reward": 1.7473215013742447, + "reward_std": 0.06439722282812, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9982142895460129, + "step": 8676 + }, + { + "completion_length": 267.8303699493408, + "epoch": 1.4550064965002725, + "grad_norm": 0.32041604349225783, + "kl": 0.11407470703125, + "learning_rate": 4.98460549372026e-07, + "loss": 0.0001, + "reward": 1.7392858117818832, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7482143118977547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8678 + }, + { + "completion_length": 265.58037185668945, + "epoch": 1.45534179974014, + "grad_norm": 0.14919159431849408, + "kl": 0.1207122802734375, + "learning_rate": 4.984592198995373e-07, + "loss": 0.0001, + "reward": 1.7125000581145287, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.725892897695303, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8680 + }, + { + "completion_length": 265.38394260406494, + "epoch": 1.4556771029800075, + "grad_norm": 0.3035788406369713, + "kl": 0.1330718994140625, + "learning_rate": 4.984578898550035e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 8682 + }, + { + "completion_length": 254.71875953674316, + "epoch": 1.4560124062198752, + "grad_norm": 0.19167303109979877, + "kl": 0.13134765625, + "learning_rate": 4.984565592384275e-07, + "loss": 0.0001, + "reward": 1.6964286416769028, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7053571902215481, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8684 + }, + { + "completion_length": 265.8303689956665, + "epoch": 1.4563477094597426, + "grad_norm": 0.18865444325673764, + "kl": 0.0995635986328125, + "learning_rate": 4.984552280498124e-07, + "loss": 0.0001, + "reward": 1.8321429044008255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.832142885774374, + "rewards/format_reward_func": 1.0, + "step": 8686 + }, + { + "completion_length": 256.9241247177124, + "epoch": 1.4566830126996102, + "grad_norm": 0.8027363666604862, + "kl": 0.3691558837890625, + "learning_rate": 4.984538962891612e-07, + "loss": 0.0004, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 8688 + }, + { + "completion_length": 261.95536708831787, + "epoch": 1.4570183159394778, + "grad_norm": 0.19079471714756488, + "kl": 0.086334228515625, + "learning_rate": 4.984525639564771e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7875000350177288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8690 + }, + { + "completion_length": 242.62947463989258, + "epoch": 1.4573536191793452, + "grad_norm": 0.16115868086484084, + "kl": 0.103118896484375, + "learning_rate": 4.984512310517629e-07, + "loss": 0.0001, + "reward": 1.764732226729393, + "reward_std": 0.056189734023064375, + "rewards/equation_reward_func": 0.770535733550787, + "rewards/format_reward_func": 0.9941964335739613, + "step": 8692 + }, + { + "completion_length": 257.6428699493408, + "epoch": 1.4576889224192129, + "grad_norm": 0.18748173456319275, + "kl": 0.13623046875, + "learning_rate": 4.98449897575022e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 8694 + }, + { + "completion_length": 253.9732255935669, + "epoch": 1.4580242256590803, + "grad_norm": 0.0670366887196531, + "kl": 0.087493896484375, + "learning_rate": 4.984485635262573e-07, + "loss": 0.0001, + "reward": 1.7625000476837158, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7669643275439739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8696 + }, + { + "completion_length": 255.08483123779297, + "epoch": 1.458359528898948, + "grad_norm": 0.46439941711117166, + "kl": 0.146087646484375, + "learning_rate": 4.984472289054718e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8698 + }, + { + "completion_length": 258.120548248291, + "epoch": 1.4586948321388156, + "grad_norm": 0.13445293436482703, + "kl": 0.2231903076171875, + "learning_rate": 4.984458937126687e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 8700 + }, + { + "completion_length": 253.5178689956665, + "epoch": 1.4590301353786832, + "grad_norm": 0.2825612021039923, + "kl": 0.203125, + "learning_rate": 4.984445579478512e-07, + "loss": 0.0002, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 8702 + }, + { + "completion_length": 255.8616180419922, + "epoch": 1.4593654386185506, + "grad_norm": 0.0662004696664487, + "kl": 0.160614013671875, + "learning_rate": 4.98443221611022e-07, + "loss": 0.0002, + "reward": 1.7928571701049805, + "reward_std": 0.030304577201604843, + "rewards/equation_reward_func": 0.8017857428640127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8704 + }, + { + "completion_length": 247.165189743042, + "epoch": 1.4597007418584182, + "grad_norm": 0.14048445465509524, + "kl": 0.146881103515625, + "learning_rate": 4.984418847021845e-07, + "loss": 0.0001, + "reward": 1.8321429044008255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.832142885774374, + "rewards/format_reward_func": 1.0, + "step": 8706 + }, + { + "completion_length": 253.55804538726807, + "epoch": 1.4600360450982857, + "grad_norm": 0.13718770727586044, + "kl": 0.219635009765625, + "learning_rate": 4.984405472213415e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 8708 + }, + { + "completion_length": 253.86161613464355, + "epoch": 1.4603713483381533, + "grad_norm": 0.17726014798555442, + "kl": 0.115692138671875, + "learning_rate": 4.984392091684965e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 8710 + }, + { + "completion_length": 258.8660840988159, + "epoch": 1.460706651578021, + "grad_norm": 0.10583912091027659, + "kl": 0.15997314453125, + "learning_rate": 4.984378705436521e-07, + "loss": 0.0002, + "reward": 1.7464286610484123, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 8712 + }, + { + "completion_length": 246.37054347991943, + "epoch": 1.4610419548178886, + "grad_norm": 0.19567685530098944, + "kl": 0.1878814697265625, + "learning_rate": 4.984365313468117e-07, + "loss": 0.0002, + "reward": 1.7928571626543999, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.8017857372760773, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8714 + }, + { + "completion_length": 252.37054824829102, + "epoch": 1.461377258057756, + "grad_norm": 0.2787780610294813, + "kl": 0.104888916015625, + "learning_rate": 4.984351915779783e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7303571738302708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8716 + }, + { + "completion_length": 252.12947750091553, + "epoch": 1.4617125612976236, + "grad_norm": 0.3540296381410723, + "kl": 1.019378662109375, + "learning_rate": 4.98433851237155e-07, + "loss": 0.001, + "reward": 1.7285714894533157, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.7375000361353159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8718 + }, + { + "completion_length": 251.0625114440918, + "epoch": 1.462047864537491, + "grad_norm": 0.29658882871831993, + "kl": 0.2493743896484375, + "learning_rate": 4.984325103243448e-07, + "loss": 0.0002, + "reward": 1.7125000730156898, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8720 + }, + { + "completion_length": 255.5446538925171, + "epoch": 1.4623831677773587, + "grad_norm": 0.15793259635685353, + "kl": 0.204254150390625, + "learning_rate": 4.984311688395508e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7982143051922321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8722 + }, + { + "completion_length": 245.4553689956665, + "epoch": 1.4627184710172263, + "grad_norm": 0.20221390732780958, + "kl": 0.2862548828125, + "learning_rate": 4.984298267827763e-07, + "loss": 0.0003, + "reward": 1.735714353621006, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143256813288, + "rewards/format_reward_func": 1.0, + "step": 8724 + }, + { + "completion_length": 256.0401906967163, + "epoch": 1.4630537742570937, + "grad_norm": 0.18251496359166494, + "kl": 0.3800048828125, + "learning_rate": 4.984284841540242e-07, + "loss": 0.0004, + "reward": 1.7053572162985802, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214589059353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8726 + }, + { + "completion_length": 239.9241189956665, + "epoch": 1.4633890774969613, + "grad_norm": 0.16568506817127004, + "kl": 0.2138671875, + "learning_rate": 4.984271409532975e-07, + "loss": 0.0002, + "reward": 1.6928572282195091, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.6928571704775095, + "rewards/format_reward_func": 1.0, + "step": 8728 + }, + { + "completion_length": 242.13840198516846, + "epoch": 1.4637243807368288, + "grad_norm": 0.1784867004860529, + "kl": 0.3534393310546875, + "learning_rate": 4.984257971805995e-07, + "loss": 0.0004, + "reward": 1.7678571939468384, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 8730 + }, + { + "completion_length": 248.75447368621826, + "epoch": 1.4640596839766964, + "grad_norm": 0.1337191378943348, + "kl": 0.240386962890625, + "learning_rate": 4.984244528359332e-07, + "loss": 0.0002, + "reward": 1.7339286357164383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383929006755352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8732 + }, + { + "completion_length": 245.96429824829102, + "epoch": 1.464394987216564, + "grad_norm": 0.2954717061176378, + "kl": 0.18841552734375, + "learning_rate": 4.984231079193016e-07, + "loss": 0.0002, + "reward": 1.7285715341567993, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 8734 + }, + { + "completion_length": 241.88393783569336, + "epoch": 1.4647302904564317, + "grad_norm": 0.15578577592582596, + "kl": 0.1372833251953125, + "learning_rate": 4.98421762430708e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8736 + }, + { + "completion_length": 246.76340293884277, + "epoch": 1.465065593696299, + "grad_norm": 0.18416701370666422, + "kl": 0.2396087646484375, + "learning_rate": 4.984204163701554e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 8738 + }, + { + "completion_length": 235.80358028411865, + "epoch": 1.4654008969361667, + "grad_norm": 0.27698587198850594, + "kl": 0.2167816162109375, + "learning_rate": 4.984190697376469e-07, + "loss": 0.0002, + "reward": 1.7446429207921028, + "reward_std": 0.09848987311124802, + "rewards/equation_reward_func": 0.7580357454717159, + "rewards/format_reward_func": 0.9866071492433548, + "step": 8740 + }, + { + "completion_length": 236.6919755935669, + "epoch": 1.4657362001760341, + "grad_norm": 0.41206796353337627, + "kl": 0.5396728515625, + "learning_rate": 4.984177225331856e-07, + "loss": 0.0005, + "reward": 1.733928643167019, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928820490837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8742 + }, + { + "completion_length": 242.49108505249023, + "epoch": 1.4660715034159018, + "grad_norm": 0.7406925620636272, + "kl": 0.42578125, + "learning_rate": 4.984163747567745e-07, + "loss": 0.0004, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 8744 + }, + { + "completion_length": 230.26786613464355, + "epoch": 1.4664068066557694, + "grad_norm": 0.15986850405720687, + "kl": 0.114776611328125, + "learning_rate": 4.984150264084169e-07, + "loss": 0.0001, + "reward": 1.798214353621006, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8017857447266579, + "rewards/format_reward_func": 0.9964285716414452, + "step": 8746 + }, + { + "completion_length": 246.60715579986572, + "epoch": 1.4667421098956368, + "grad_norm": 0.21336797543590305, + "kl": 0.48760986328125, + "learning_rate": 4.984136774881158e-07, + "loss": 0.0005, + "reward": 1.7857143506407738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 8748 + }, + { + "completion_length": 232.2544755935669, + "epoch": 1.4670774131355044, + "grad_norm": 0.2915242898738858, + "kl": 0.111358642578125, + "learning_rate": 4.984123279958742e-07, + "loss": 0.0001, + "reward": 1.7160715013742447, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.720535745844245, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8750 + }, + { + "completion_length": 236.7544755935669, + "epoch": 1.4674127163753719, + "grad_norm": 0.43406847290546435, + "kl": 0.15557861328125, + "learning_rate": 4.984109779316955e-07, + "loss": 0.0002, + "reward": 1.750000074505806, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 8752 + }, + { + "completion_length": 233.5937623977661, + "epoch": 1.4677480196152395, + "grad_norm": 0.1601888555299674, + "kl": 0.16204833984375, + "learning_rate": 4.984096272955825e-07, + "loss": 0.0002, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857398837805, + "rewards/format_reward_func": 1.0, + "step": 8754 + }, + { + "completion_length": 237.0669755935669, + "epoch": 1.4680833228551071, + "grad_norm": 0.1986376468777605, + "kl": 0.101776123046875, + "learning_rate": 4.984082760875383e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 8756 + }, + { + "completion_length": 230.04465198516846, + "epoch": 1.4684186260949748, + "grad_norm": 0.11074341297077231, + "kl": 0.114471435546875, + "learning_rate": 4.984069243075663e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 8758 + }, + { + "completion_length": 236.65179824829102, + "epoch": 1.4687539293348422, + "grad_norm": 0.15408371005615754, + "kl": 0.10626220703125, + "learning_rate": 4.984055719556695e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571704775095, + "rewards/format_reward_func": 1.0, + "step": 8760 + }, + { + "completion_length": 239.63840198516846, + "epoch": 1.4690892325747098, + "grad_norm": 0.21080123821184735, + "kl": 0.1241455078125, + "learning_rate": 4.984042190318509e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 8762 + }, + { + "completion_length": 237.87054443359375, + "epoch": 1.4694245358145772, + "grad_norm": 0.2850582997015366, + "kl": 0.1470184326171875, + "learning_rate": 4.984028655361138e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 8764 + }, + { + "completion_length": 240.77679824829102, + "epoch": 1.4697598390544448, + "grad_norm": 0.2321030600462851, + "kl": 0.164703369140625, + "learning_rate": 4.98401511468461e-07, + "loss": 0.0002, + "reward": 1.7357143461704254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 8766 + }, + { + "completion_length": 236.86608123779297, + "epoch": 1.4700951422943125, + "grad_norm": 0.3982264017600752, + "kl": 0.1109619140625, + "learning_rate": 4.984001568288961e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 8768 + }, + { + "completion_length": 239.758939743042, + "epoch": 1.47043044553418, + "grad_norm": 0.5475701173802654, + "kl": 0.11822509765625, + "learning_rate": 4.983988016174216e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 8770 + }, + { + "completion_length": 238.2901906967163, + "epoch": 1.4707657487740475, + "grad_norm": 0.13873530815564392, + "kl": 0.13739013671875, + "learning_rate": 4.983974458340412e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 8772 + }, + { + "completion_length": 240.0044765472412, + "epoch": 1.4711010520139152, + "grad_norm": 0.20762954442612053, + "kl": 0.152435302734375, + "learning_rate": 4.983960894787577e-07, + "loss": 0.0002, + "reward": 1.73392865806818, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928913623095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8774 + }, + { + "completion_length": 240.7009048461914, + "epoch": 1.4714363552537826, + "grad_norm": 0.24434572005939045, + "kl": 0.20208740234375, + "learning_rate": 4.983947325515743e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 8776 + }, + { + "completion_length": 237.5937623977661, + "epoch": 1.4717716584936502, + "grad_norm": 0.14532140615552122, + "kl": 0.213592529296875, + "learning_rate": 4.983933750524941e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 8778 + }, + { + "completion_length": 237.67858219146729, + "epoch": 1.4721069617335178, + "grad_norm": 0.21694863813568013, + "kl": 0.153778076171875, + "learning_rate": 4.983920169815203e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 8780 + }, + { + "completion_length": 237.61608505249023, + "epoch": 1.4724422649733853, + "grad_norm": 0.1438045442306162, + "kl": 0.290191650390625, + "learning_rate": 4.98390658338656e-07, + "loss": 0.0003, + "reward": 1.7607143819332123, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 8782 + }, + { + "completion_length": 238.18751049041748, + "epoch": 1.472777568213253, + "grad_norm": 0.21501601267495266, + "kl": 0.1873779296875, + "learning_rate": 4.983892991239043e-07, + "loss": 0.0002, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 8784 + }, + { + "completion_length": 237.9419755935669, + "epoch": 1.4731128714531203, + "grad_norm": 0.22531915179161718, + "kl": 0.19091796875, + "learning_rate": 4.983879393372683e-07, + "loss": 0.0002, + "reward": 1.8071429133415222, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 8786 + }, + { + "completion_length": 240.008939743042, + "epoch": 1.473448174692988, + "grad_norm": 0.26067253479642094, + "kl": 0.281646728515625, + "learning_rate": 4.983865789787512e-07, + "loss": 0.0003, + "reward": 1.7196429371833801, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071835160255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8788 + }, + { + "completion_length": 238.67858123779297, + "epoch": 1.4737834779328556, + "grad_norm": 0.08043931843097116, + "kl": 0.195220947265625, + "learning_rate": 4.983852180483561e-07, + "loss": 0.0002, + "reward": 1.7964286133646965, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.805357163771987, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8790 + }, + { + "completion_length": 239.61161708831787, + "epoch": 1.4741187811727232, + "grad_norm": 0.13354989199160783, + "kl": 0.142486572265625, + "learning_rate": 4.983838565460861e-07, + "loss": 0.0001, + "reward": 1.8071429282426834, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 8792 + }, + { + "completion_length": 224.60715293884277, + "epoch": 1.4744540844125906, + "grad_norm": 0.08819330980406609, + "kl": 0.14404296875, + "learning_rate": 4.983824944719445e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857142984867096, + "rewards/format_reward_func": 1.0, + "step": 8794 + }, + { + "completion_length": 235.8928680419922, + "epoch": 1.4747893876524583, + "grad_norm": 0.1696130647851362, + "kl": 0.256622314453125, + "learning_rate": 4.983811318259341e-07, + "loss": 0.0003, + "reward": 1.782142922282219, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 8796 + }, + { + "completion_length": 251.71429538726807, + "epoch": 1.4751246908923257, + "grad_norm": 0.24888454972454868, + "kl": 0.257476806640625, + "learning_rate": 4.983797686080584e-07, + "loss": 0.0003, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464285995811224, + "rewards/format_reward_func": 1.0, + "step": 8798 + }, + { + "completion_length": 242.61161518096924, + "epoch": 1.4754599941321933, + "grad_norm": 0.24204832950998906, + "kl": 0.138885498046875, + "learning_rate": 4.983784048183203e-07, + "loss": 0.0001, + "reward": 1.7357143834233284, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 8800 + }, + { + "completion_length": 247.0357265472412, + "epoch": 1.475795297372061, + "grad_norm": 0.2644117340159772, + "kl": 0.148284912109375, + "learning_rate": 4.98377040456723e-07, + "loss": 0.0001, + "reward": 1.7910714820027351, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8802 + }, + { + "completion_length": 244.20983219146729, + "epoch": 1.4761306006119284, + "grad_norm": 0.10373343324288181, + "kl": 0.2305908203125, + "learning_rate": 4.983756755232698e-07, + "loss": 0.0002, + "reward": 1.7196429520845413, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8804 + }, + { + "completion_length": 245.05804634094238, + "epoch": 1.476465903851796, + "grad_norm": 0.2235878076098335, + "kl": 0.140228271484375, + "learning_rate": 4.983743100179635e-07, + "loss": 0.0001, + "reward": 1.7553572282195091, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8806 + }, + { + "completion_length": 237.69644165039062, + "epoch": 1.4768012070916634, + "grad_norm": 0.40044982762353, + "kl": 0.15948486328125, + "learning_rate": 4.983729439408076e-07, + "loss": 0.0002, + "reward": 1.84285718947649, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8428571783006191, + "rewards/format_reward_func": 1.0, + "step": 8808 + }, + { + "completion_length": 237.32590675354004, + "epoch": 1.477136510331531, + "grad_norm": 0.3943765992765898, + "kl": 0.201751708984375, + "learning_rate": 4.983715772918051e-07, + "loss": 0.0002, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 8810 + }, + { + "completion_length": 228.9642972946167, + "epoch": 1.4774718135713987, + "grad_norm": 0.257684754189661, + "kl": 0.1728515625, + "learning_rate": 4.98370210070959e-07, + "loss": 0.0002, + "reward": 1.77946437895298, + "reward_std": 0.049244935158640146, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9982142895460129, + "step": 8812 + }, + { + "completion_length": 240.70983219146729, + "epoch": 1.4778071168112663, + "grad_norm": 0.2669589546020435, + "kl": 0.183319091796875, + "learning_rate": 4.983688422782727e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 8814 + }, + { + "completion_length": 237.05358219146729, + "epoch": 1.4781424200511337, + "grad_norm": 0.18092645813421443, + "kl": 0.289398193359375, + "learning_rate": 4.983674739137492e-07, + "loss": 0.0003, + "reward": 1.7142857909202576, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 8816 + }, + { + "completion_length": 242.8750114440918, + "epoch": 1.4784777232910014, + "grad_norm": 0.12930487883190817, + "kl": 0.164398193359375, + "learning_rate": 4.983661049773918e-07, + "loss": 0.0002, + "reward": 1.7821428999304771, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 8818 + }, + { + "completion_length": 226.96429824829102, + "epoch": 1.4788130265308688, + "grad_norm": 0.4132107732816038, + "kl": 0.1771240234375, + "learning_rate": 4.983647354692034e-07, + "loss": 0.0002, + "reward": 1.81428574770689, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857551574707, + "rewards/format_reward_func": 1.0, + "step": 8820 + }, + { + "completion_length": 243.04018783569336, + "epoch": 1.4791483297707364, + "grad_norm": 0.39982853624464665, + "kl": 0.329010009765625, + "learning_rate": 4.983633653891872e-07, + "loss": 0.0003, + "reward": 1.7446429207921028, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8822 + }, + { + "completion_length": 252.696439743042, + "epoch": 1.479483633010604, + "grad_norm": 0.5636208493610942, + "kl": 0.17095947265625, + "learning_rate": 4.983619947373467e-07, + "loss": 0.0002, + "reward": 1.7678572461009026, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 8824 + }, + { + "completion_length": 242.7857255935669, + "epoch": 1.4798189362504715, + "grad_norm": 0.22129015934686244, + "kl": 0.28814697265625, + "learning_rate": 4.983606235136847e-07, + "loss": 0.0003, + "reward": 1.7571429088711739, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 8826 + }, + { + "completion_length": 237.4687614440918, + "epoch": 1.480154239490339, + "grad_norm": 0.2303722888334993, + "kl": 0.1688385009765625, + "learning_rate": 4.983592517182044e-07, + "loss": 0.0002, + "reward": 1.7089286521077156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928760886192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8828 + }, + { + "completion_length": 235.96429634094238, + "epoch": 1.4804895427302065, + "grad_norm": 0.2179046244872161, + "kl": 0.308685302734375, + "learning_rate": 4.98357879350909e-07, + "loss": 0.0003, + "reward": 1.7357143685221672, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8830 + }, + { + "completion_length": 221.7366189956665, + "epoch": 1.4808248459700741, + "grad_norm": 0.12126952172765075, + "kl": 0.15643310546875, + "learning_rate": 4.983565064118017e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 8832 + }, + { + "completion_length": 234.1428689956665, + "epoch": 1.4811601492099418, + "grad_norm": 0.31648037804345736, + "kl": 0.201019287109375, + "learning_rate": 4.983551329008858e-07, + "loss": 0.0002, + "reward": 1.7232143506407738, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7276786137372255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8834 + }, + { + "completion_length": 233.80804538726807, + "epoch": 1.4814954524498094, + "grad_norm": 0.22828808628135586, + "kl": 0.19964599609375, + "learning_rate": 4.983537588181641e-07, + "loss": 0.0002, + "reward": 1.678571529686451, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.678571455180645, + "rewards/format_reward_func": 1.0, + "step": 8836 + }, + { + "completion_length": 223.83036708831787, + "epoch": 1.4818307556896768, + "grad_norm": 0.25069049217082295, + "kl": 0.148529052734375, + "learning_rate": 4.9835238416364e-07, + "loss": 0.0001, + "reward": 1.8178571909666061, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857176065445, + "rewards/format_reward_func": 1.0, + "step": 8838 + }, + { + "completion_length": 228.49554634094238, + "epoch": 1.4821660589295444, + "grad_norm": 0.18364389650314558, + "kl": 0.13134765625, + "learning_rate": 4.983510089373167e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8840 + }, + { + "completion_length": 227.50447368621826, + "epoch": 1.4825013621694119, + "grad_norm": 0.3162348652446139, + "kl": 0.1231842041015625, + "learning_rate": 4.983496331391973e-07, + "loss": 0.0001, + "reward": 1.7250000908970833, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 8842 + }, + { + "completion_length": 225.16072463989258, + "epoch": 1.4828366654092795, + "grad_norm": 0.1262953317962862, + "kl": 0.11572265625, + "learning_rate": 4.983482567692848e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 8844 + }, + { + "completion_length": 236.4375114440918, + "epoch": 1.4831719686491471, + "grad_norm": 0.348430568533721, + "kl": 0.1246185302734375, + "learning_rate": 4.983468798275827e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.714285746216774, + "rewards/format_reward_func": 1.0, + "step": 8846 + }, + { + "completion_length": 222.82143688201904, + "epoch": 1.4835072718890145, + "grad_norm": 0.10965358167212523, + "kl": 0.1109466552734375, + "learning_rate": 4.98345502314094e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 8848 + }, + { + "completion_length": 237.7366180419922, + "epoch": 1.4838425751288822, + "grad_norm": 0.2539963287004196, + "kl": 0.1054534912109375, + "learning_rate": 4.983441242288219e-07, + "loss": 0.0001, + "reward": 1.7464286088943481, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464285902678967, + "rewards/format_reward_func": 1.0, + "step": 8850 + }, + { + "completion_length": 236.00447750091553, + "epoch": 1.4841778783687498, + "grad_norm": 0.2132569693177203, + "kl": 0.104949951171875, + "learning_rate": 4.983427455717694e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7000000309199095, + "rewards/format_reward_func": 1.0, + "step": 8852 + }, + { + "completion_length": 235.29019355773926, + "epoch": 1.4845131816086172, + "grad_norm": 0.178193772294305, + "kl": 0.125152587890625, + "learning_rate": 4.983413663429399e-07, + "loss": 0.0001, + "reward": 1.7482143715023994, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8854 + }, + { + "completion_length": 231.33929538726807, + "epoch": 1.4848484848484849, + "grad_norm": 0.37231971986180934, + "kl": 0.13079833984375, + "learning_rate": 4.983399865423365e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 8856 + }, + { + "completion_length": 230.4821538925171, + "epoch": 1.4851837880883525, + "grad_norm": 0.23070799188104596, + "kl": 0.146514892578125, + "learning_rate": 4.983386061699624e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8858 + }, + { + "completion_length": 249.3571538925171, + "epoch": 1.48551909132822, + "grad_norm": 0.4236600611044629, + "kl": 0.24822998046875, + "learning_rate": 4.983372252258207e-07, + "loss": 0.0002, + "reward": 1.6915179640054703, + "reward_std": 0.0625031883828342, + "rewards/equation_reward_func": 0.7017857395112514, + "rewards/format_reward_func": 0.9897321499884129, + "step": 8860 + }, + { + "completion_length": 248.9151906967163, + "epoch": 1.4858543945680875, + "grad_norm": 0.39799241693952037, + "kl": 0.156219482421875, + "learning_rate": 4.983358437099147e-07, + "loss": 0.0002, + "reward": 1.7125000730156898, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7169643193483353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8862 + }, + { + "completion_length": 245.5714406967163, + "epoch": 1.486189697807955, + "grad_norm": 0.17593234268182617, + "kl": 0.192962646484375, + "learning_rate": 4.983344616222475e-07, + "loss": 0.0002, + "reward": 1.6982143595814705, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7026786133646965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8864 + }, + { + "completion_length": 234.52233123779297, + "epoch": 1.4865250010478226, + "grad_norm": 0.13123695463945143, + "kl": 0.311370849609375, + "learning_rate": 4.983330789628221e-07, + "loss": 0.0003, + "reward": 1.8107143491506577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 8866 + }, + { + "completion_length": 239.7678680419922, + "epoch": 1.4868603042876902, + "grad_norm": 0.12358910005630577, + "kl": 0.30950927734375, + "learning_rate": 4.983316957316421e-07, + "loss": 0.0003, + "reward": 1.7535714879631996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 8868 + }, + { + "completion_length": 232.96875953674316, + "epoch": 1.4871956075275579, + "grad_norm": 0.17331399232615474, + "kl": 0.1680908203125, + "learning_rate": 4.983303119287104e-07, + "loss": 0.0002, + "reward": 1.76607146859169, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357633531094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8870 + }, + { + "completion_length": 232.16072368621826, + "epoch": 1.4875309107674253, + "grad_norm": 0.26597933576708865, + "kl": 0.197265625, + "learning_rate": 4.983289275540302e-07, + "loss": 0.0002, + "reward": 1.732142947614193, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 8872 + }, + { + "completion_length": 233.13840579986572, + "epoch": 1.487866214007293, + "grad_norm": 0.1892458028270941, + "kl": 0.406097412109375, + "learning_rate": 4.983275426076048e-07, + "loss": 0.0004, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 8874 + }, + { + "completion_length": 235.5982255935669, + "epoch": 1.4882015172471603, + "grad_norm": 0.23906095397175636, + "kl": 0.17352294921875, + "learning_rate": 4.983261570894373e-07, + "loss": 0.0002, + "reward": 1.7660714983940125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357298254967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8876 + }, + { + "completion_length": 230.30358123779297, + "epoch": 1.488536820487028, + "grad_norm": 0.2635429378102701, + "kl": 0.173614501953125, + "learning_rate": 4.98324770999531e-07, + "loss": 0.0002, + "reward": 1.7392858117818832, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 8878 + }, + { + "completion_length": 244.73215866088867, + "epoch": 1.4888721237268956, + "grad_norm": 0.20412065409466093, + "kl": 0.16748046875, + "learning_rate": 4.983233843378889e-07, + "loss": 0.0002, + "reward": 1.7553571984171867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8880 + }, + { + "completion_length": 231.58036994934082, + "epoch": 1.489207426966763, + "grad_norm": 0.39266692298473066, + "kl": 0.111083984375, + "learning_rate": 4.983219971045143e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 8882 + }, + { + "completion_length": 222.5982255935669, + "epoch": 1.4895427302066306, + "grad_norm": 0.15750322995007576, + "kl": 0.106170654296875, + "learning_rate": 4.983206092994104e-07, + "loss": 0.0001, + "reward": 1.841071456670761, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8455357328057289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8884 + }, + { + "completion_length": 234.8125123977661, + "epoch": 1.489878033446498, + "grad_norm": 0.20708568340897027, + "kl": 0.2044830322265625, + "learning_rate": 4.983192209225805e-07, + "loss": 0.0002, + "reward": 1.7428572326898575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571674972773, + "rewards/format_reward_func": 1.0, + "step": 8886 + }, + { + "completion_length": 238.50447368621826, + "epoch": 1.4902133366863657, + "grad_norm": 0.3230453112977611, + "kl": 0.141143798828125, + "learning_rate": 4.983178319740276e-07, + "loss": 0.0001, + "reward": 1.7625000700354576, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8888 + }, + { + "completion_length": 231.5401906967163, + "epoch": 1.4905486399262333, + "grad_norm": 0.23606147243547548, + "kl": 0.140899658203125, + "learning_rate": 4.983164424537549e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 8890 + }, + { + "completion_length": 232.88840293884277, + "epoch": 1.490883943166101, + "grad_norm": 0.21897116798986696, + "kl": 0.1290740966796875, + "learning_rate": 4.983150523617658e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 8892 + }, + { + "completion_length": 237.89287185668945, + "epoch": 1.4912192464059684, + "grad_norm": 0.21563554948529737, + "kl": 0.155853271484375, + "learning_rate": 4.983136616980633e-07, + "loss": 0.0002, + "reward": 1.730357214808464, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214499652386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8894 + }, + { + "completion_length": 246.1384048461914, + "epoch": 1.491554549645836, + "grad_norm": 0.303244848945181, + "kl": 0.187408447265625, + "learning_rate": 4.983122704626507e-07, + "loss": 0.0002, + "reward": 1.6964286342263222, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6964286155998707, + "rewards/format_reward_func": 1.0, + "step": 8896 + }, + { + "completion_length": 242.7634048461914, + "epoch": 1.4918898528857034, + "grad_norm": 0.3079019603547445, + "kl": 0.136077880859375, + "learning_rate": 4.983108786555312e-07, + "loss": 0.0001, + "reward": 1.8160714730620384, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8205357454717159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8898 + }, + { + "completion_length": 242.7500114440918, + "epoch": 1.492225156125571, + "grad_norm": 0.37232604700225147, + "kl": 0.154022216796875, + "learning_rate": 4.98309486276708e-07, + "loss": 0.0002, + "reward": 1.760714367032051, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143297791481, + "rewards/format_reward_func": 1.0, + "step": 8900 + }, + { + "completion_length": 242.83483123779297, + "epoch": 1.4925604593654387, + "grad_norm": 0.1844494338084748, + "kl": 0.201995849609375, + "learning_rate": 4.983080933261842e-07, + "loss": 0.0002, + "reward": 1.7375000715255737, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8902 + }, + { + "completion_length": 236.4776906967163, + "epoch": 1.492895762605306, + "grad_norm": 0.25817305870319135, + "kl": 0.271881103515625, + "learning_rate": 4.983066998039632e-07, + "loss": 0.0003, + "reward": 1.7357143759727478, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143349945545, + "rewards/format_reward_func": 1.0, + "step": 8904 + }, + { + "completion_length": 243.10715293884277, + "epoch": 1.4932310658451737, + "grad_norm": 0.20426186955697542, + "kl": 0.264862060546875, + "learning_rate": 4.983053057100482e-07, + "loss": 0.0003, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 8906 + }, + { + "completion_length": 247.6919755935669, + "epoch": 1.4935663690850411, + "grad_norm": 0.1531023894494314, + "kl": 0.41204833984375, + "learning_rate": 4.983039110444422e-07, + "loss": 0.0004, + "reward": 1.6678572371602058, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.6767857410013676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 8908 + }, + { + "completion_length": 231.68304538726807, + "epoch": 1.4939016723249088, + "grad_norm": 0.2368336569619978, + "kl": 0.259857177734375, + "learning_rate": 4.983025158071485e-07, + "loss": 0.0003, + "reward": 1.775000087916851, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 8910 + }, + { + "completion_length": 249.6250114440918, + "epoch": 1.4942369755647764, + "grad_norm": 0.1959684567937748, + "kl": 0.44342041015625, + "learning_rate": 4.983011199981704e-07, + "loss": 0.0004, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 8912 + }, + { + "completion_length": 249.63393878936768, + "epoch": 1.494572278804644, + "grad_norm": 0.20096044788713058, + "kl": 0.470428466796875, + "learning_rate": 4.982997236175111e-07, + "loss": 0.0005, + "reward": 1.7053572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7089286148548126, + "rewards/format_reward_func": 0.9964285716414452, + "step": 8914 + }, + { + "completion_length": 250.508939743042, + "epoch": 1.4949075820445115, + "grad_norm": 0.37251539691184704, + "kl": 0.377716064453125, + "learning_rate": 4.982983266651737e-07, + "loss": 0.0004, + "reward": 1.7589286416769028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8916 + }, + { + "completion_length": 235.14733219146729, + "epoch": 1.495242885284379, + "grad_norm": 0.11145203648541949, + "kl": 0.121337890625, + "learning_rate": 4.982969291411615e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8062500171363354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8918 + }, + { + "completion_length": 231.8884048461914, + "epoch": 1.4955781885242465, + "grad_norm": 0.19749947364265377, + "kl": 0.156341552734375, + "learning_rate": 4.982955310454777e-07, + "loss": 0.0002, + "reward": 1.7714286148548126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286223053932, + "rewards/format_reward_func": 1.0, + "step": 8920 + }, + { + "completion_length": 233.74108219146729, + "epoch": 1.4959134917641141, + "grad_norm": 0.10809478877179468, + "kl": 0.143829345703125, + "learning_rate": 4.982941323781255e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 8922 + }, + { + "completion_length": 224.13393878936768, + "epoch": 1.4962487950039818, + "grad_norm": 0.25690059581071395, + "kl": 0.108489990234375, + "learning_rate": 4.982927331391083e-07, + "loss": 0.0001, + "reward": 1.8178571611642838, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8178571704775095, + "rewards/format_reward_func": 1.0, + "step": 8924 + }, + { + "completion_length": 225.33483219146729, + "epoch": 1.4965840982438494, + "grad_norm": 0.21596429702799877, + "kl": 0.1488037109375, + "learning_rate": 4.98291333328429e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 8926 + }, + { + "completion_length": 225.4509038925171, + "epoch": 1.4969194014837168, + "grad_norm": 0.3113412682431212, + "kl": 0.1666259765625, + "learning_rate": 4.982899329460911e-07, + "loss": 0.0002, + "reward": 1.7535715028643608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 8928 + }, + { + "completion_length": 228.73215198516846, + "epoch": 1.4972547047235845, + "grad_norm": 0.10233896625503296, + "kl": 0.1457366943359375, + "learning_rate": 4.982885319920977e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 8930 + }, + { + "completion_length": 236.79465293884277, + "epoch": 1.4975900079634519, + "grad_norm": 0.24018858070649615, + "kl": 0.164825439453125, + "learning_rate": 4.98287130466452e-07, + "loss": 0.0002, + "reward": 1.7625000700354576, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8932 + }, + { + "completion_length": 220.77679538726807, + "epoch": 1.4979253112033195, + "grad_norm": 0.15697397655397416, + "kl": 0.1427001953125, + "learning_rate": 4.982857283691572e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714488476515, + "rewards/format_reward_func": 1.0, + "step": 8934 + }, + { + "completion_length": 236.42858028411865, + "epoch": 1.4982606144431871, + "grad_norm": 0.12975874262251802, + "kl": 0.12359619140625, + "learning_rate": 4.982843257002167e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571749478579, + "rewards/format_reward_func": 1.0, + "step": 8936 + }, + { + "completion_length": 226.4241180419922, + "epoch": 1.4985959176830546, + "grad_norm": 0.33443616062865283, + "kl": 0.098114013671875, + "learning_rate": 4.982829224596337e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8938 + }, + { + "completion_length": 219.69197273254395, + "epoch": 1.4989312209229222, + "grad_norm": 0.6156013334847034, + "kl": 0.187744140625, + "learning_rate": 4.982815186474111e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 8940 + }, + { + "completion_length": 227.27233219146729, + "epoch": 1.4992665241627896, + "grad_norm": 0.11772230548243262, + "kl": 0.121368408203125, + "learning_rate": 4.982801142635526e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8107142969965935, + "rewards/format_reward_func": 1.0, + "step": 8942 + }, + { + "completion_length": 223.13393783569336, + "epoch": 1.4996018274026572, + "grad_norm": 0.21252309938680947, + "kl": 0.1238250732421875, + "learning_rate": 4.982787093080612e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 8944 + }, + { + "completion_length": 227.83036422729492, + "epoch": 1.4999371306425249, + "grad_norm": 0.26532050624733566, + "kl": 0.21636962890625, + "learning_rate": 4.982773037809402e-07, + "loss": 0.0002, + "reward": 1.796428643167019, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285910129547, + "rewards/format_reward_func": 1.0, + "step": 8946 + }, + { + "completion_length": 218.97322273254395, + "epoch": 1.5002724338823925, + "grad_norm": 0.26540863359360023, + "kl": 0.182037353515625, + "learning_rate": 4.982758976821927e-07, + "loss": 0.0002, + "reward": 1.730357214808464, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7348214723169804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 8948 + }, + { + "completion_length": 221.95983123779297, + "epoch": 1.50060773712226, + "grad_norm": 0.20609867460584388, + "kl": 0.11181640625, + "learning_rate": 4.982744910118221e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 8950 + }, + { + "completion_length": 216.87947463989258, + "epoch": 1.5009430403621273, + "grad_norm": 0.2493354640229781, + "kl": 0.1196746826171875, + "learning_rate": 4.982730837698314e-07, + "loss": 0.0001, + "reward": 1.7535715252161026, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714562982321, + "rewards/format_reward_func": 1.0, + "step": 8952 + }, + { + "completion_length": 215.49108219146729, + "epoch": 1.501278343601995, + "grad_norm": 0.2773946950947669, + "kl": 0.125640869140625, + "learning_rate": 4.982716759562242e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 1.0, + "step": 8954 + }, + { + "completion_length": 220.76340293884277, + "epoch": 1.5016136468418626, + "grad_norm": 0.21977779618397725, + "kl": 0.131927490234375, + "learning_rate": 4.982702675710034e-07, + "loss": 0.0001, + "reward": 1.7000000923871994, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7000000383704901, + "rewards/format_reward_func": 1.0, + "step": 8956 + }, + { + "completion_length": 218.67858123779297, + "epoch": 1.5019489500817302, + "grad_norm": 0.23496077922526748, + "kl": 0.100067138671875, + "learning_rate": 4.982688586141725e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 8958 + }, + { + "completion_length": 211.27233219146729, + "epoch": 1.5022842533215979, + "grad_norm": 0.24280331098890967, + "kl": 0.1251220703125, + "learning_rate": 4.982674490857346e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 8960 + }, + { + "completion_length": 225.0759038925171, + "epoch": 1.5026195565614653, + "grad_norm": 0.16089475492518973, + "kl": 0.10797119140625, + "learning_rate": 4.98266038985693e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 8962 + }, + { + "completion_length": 212.95090007781982, + "epoch": 1.5029548598013327, + "grad_norm": 0.17764220536121444, + "kl": 0.0988922119140625, + "learning_rate": 4.982646283140509e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 8964 + }, + { + "completion_length": 213.56697368621826, + "epoch": 1.5032901630412003, + "grad_norm": 0.17043077666424078, + "kl": 0.0966796875, + "learning_rate": 4.982632170708117e-07, + "loss": 0.0001, + "reward": 1.8607143312692642, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8607143089175224, + "rewards/format_reward_func": 1.0, + "step": 8966 + }, + { + "completion_length": 221.25893592834473, + "epoch": 1.503625466281068, + "grad_norm": 0.11552382456613353, + "kl": 0.10955810546875, + "learning_rate": 4.982618052559783e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 8968 + }, + { + "completion_length": 218.31251049041748, + "epoch": 1.5039607695209356, + "grad_norm": 0.24386668329689457, + "kl": 0.12744140625, + "learning_rate": 4.982603928695543e-07, + "loss": 0.0001, + "reward": 1.8357143178582191, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8357143066823483, + "rewards/format_reward_func": 1.0, + "step": 8970 + }, + { + "completion_length": 210.50000858306885, + "epoch": 1.504296072760803, + "grad_norm": 0.24801821338106403, + "kl": 0.089141845703125, + "learning_rate": 4.982589799115429e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 8972 + }, + { + "completion_length": 212.86608123779297, + "epoch": 1.5046313760006707, + "grad_norm": 0.11105424263655118, + "kl": 0.097076416015625, + "learning_rate": 4.982575663819471e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 8974 + }, + { + "completion_length": 216.56251049041748, + "epoch": 1.504966679240538, + "grad_norm": 0.18365734597731484, + "kl": 0.113494873046875, + "learning_rate": 4.982561522807705e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 8976 + }, + { + "completion_length": 222.071439743042, + "epoch": 1.5053019824804057, + "grad_norm": 0.19870798357054514, + "kl": 0.116790771484375, + "learning_rate": 4.982547376080161e-07, + "loss": 0.0001, + "reward": 1.8107143566012383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 8978 + }, + { + "completion_length": 218.92411708831787, + "epoch": 1.5056372857202733, + "grad_norm": 0.28292647884657346, + "kl": 0.10693359375, + "learning_rate": 4.982533223636872e-07, + "loss": 0.0001, + "reward": 1.8535714820027351, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8535714522004128, + "rewards/format_reward_func": 1.0, + "step": 8980 + }, + { + "completion_length": 230.99554538726807, + "epoch": 1.505972588960141, + "grad_norm": 0.24335191206106543, + "kl": 0.146392822265625, + "learning_rate": 4.982519065477873e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571864962578, + "rewards/format_reward_func": 1.0, + "step": 8982 + }, + { + "completion_length": 231.1205472946167, + "epoch": 1.5063078922000084, + "grad_norm": 0.24953749066660705, + "kl": 0.19580078125, + "learning_rate": 4.982504901603192e-07, + "loss": 0.0002, + "reward": 1.753571517765522, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 8984 + }, + { + "completion_length": 217.58482933044434, + "epoch": 1.5066431954398758, + "grad_norm": 0.31805716184155647, + "kl": 0.17828369140625, + "learning_rate": 4.982490732012867e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 8986 + }, + { + "completion_length": 227.37501049041748, + "epoch": 1.5069784986797434, + "grad_norm": 0.0028478271534510976, + "kl": 0.110992431640625, + "learning_rate": 4.982476556706926e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 8988 + }, + { + "completion_length": 229.70090198516846, + "epoch": 1.507313801919611, + "grad_norm": 0.3544715140069262, + "kl": 0.20654296875, + "learning_rate": 4.982462375685404e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 8990 + }, + { + "completion_length": 215.92858123779297, + "epoch": 1.5076491051594787, + "grad_norm": 0.23203895801518692, + "kl": 0.116851806640625, + "learning_rate": 4.982448188948333e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 8992 + }, + { + "completion_length": 227.571439743042, + "epoch": 1.507984408399346, + "grad_norm": 0.2609133394304276, + "kl": 0.152923583984375, + "learning_rate": 4.982433996495747e-07, + "loss": 0.0002, + "reward": 1.7071429267525673, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7071428913623095, + "rewards/format_reward_func": 1.0, + "step": 8994 + }, + { + "completion_length": 229.30804634094238, + "epoch": 1.5083197116392137, + "grad_norm": 0.1709798535313269, + "kl": 0.137298583984375, + "learning_rate": 4.982419798327676e-07, + "loss": 0.0001, + "reward": 1.6964286640286446, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6964286174625158, + "rewards/format_reward_func": 1.0, + "step": 8996 + }, + { + "completion_length": 227.90626049041748, + "epoch": 1.5086550148790812, + "grad_norm": 0.24870973516291606, + "kl": 0.157684326171875, + "learning_rate": 4.982405594444155e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 8998 + }, + { + "completion_length": 222.56250762939453, + "epoch": 1.5089903181189488, + "grad_norm": 0.5034636051894814, + "kl": 0.1951904296875, + "learning_rate": 4.982391384845216e-07, + "loss": 0.0002, + "reward": 1.7392858117818832, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 9000 + }, + { + "completion_length": 227.89286613464355, + "epoch": 1.5093256213588164, + "grad_norm": 0.19870862664022154, + "kl": 0.1785888671875, + "learning_rate": 4.982377169530892e-07, + "loss": 0.0002, + "reward": 1.7107143551111221, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.71071432903409, + "rewards/format_reward_func": 1.0, + "step": 9002 + }, + { + "completion_length": 227.68750953674316, + "epoch": 1.509660924598684, + "grad_norm": 0.2525564638952568, + "kl": 0.20538330078125, + "learning_rate": 4.982362948501214e-07, + "loss": 0.0002, + "reward": 1.7035715207457542, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714536905289, + "rewards/format_reward_func": 1.0, + "step": 9004 + }, + { + "completion_length": 233.28125953674316, + "epoch": 1.5099962278385515, + "grad_norm": 0.23294504953174144, + "kl": 0.242095947265625, + "learning_rate": 4.982348721756217e-07, + "loss": 0.0002, + "reward": 1.726785771548748, + "reward_std": 0.04293148126453161, + "rewards/equation_reward_func": 0.7312500271946192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9006 + }, + { + "completion_length": 233.90179634094238, + "epoch": 1.5103315310784189, + "grad_norm": 0.4282919691411335, + "kl": 0.20703125, + "learning_rate": 4.982334489295933e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 9008 + }, + { + "completion_length": 236.61607933044434, + "epoch": 1.5106668343182865, + "grad_norm": 0.492780322237796, + "kl": 0.288299560546875, + "learning_rate": 4.982320251120395e-07, + "loss": 0.0003, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 9010 + }, + { + "completion_length": 242.3750114440918, + "epoch": 1.5110021375581542, + "grad_norm": 0.233727013707272, + "kl": 0.2298583984375, + "learning_rate": 4.982306007229634e-07, + "loss": 0.0002, + "reward": 1.7089286521077156, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9012 + }, + { + "completion_length": 234.3080472946167, + "epoch": 1.5113374407980218, + "grad_norm": 0.1943769671417021, + "kl": 0.2056884765625, + "learning_rate": 4.982291757623685e-07, + "loss": 0.0002, + "reward": 1.7357143610715866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 1.0, + "step": 9014 + }, + { + "completion_length": 244.28572463989258, + "epoch": 1.5116727440378894, + "grad_norm": 0.592650509147069, + "kl": 0.294830322265625, + "learning_rate": 4.98227750230258e-07, + "loss": 0.0003, + "reward": 1.6875000819563866, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.6919643245637417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9016 + }, + { + "completion_length": 233.68304824829102, + "epoch": 1.5120080472777568, + "grad_norm": 0.19249539570632418, + "kl": 0.17291259765625, + "learning_rate": 4.982263241266353e-07, + "loss": 0.0002, + "reward": 1.8250000476837158, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 9018 + }, + { + "completion_length": 221.12947463989258, + "epoch": 1.5123433505176243, + "grad_norm": 0.23756867356380112, + "kl": 0.20294189453125, + "learning_rate": 4.982248974515033e-07, + "loss": 0.0002, + "reward": 1.8178571835160255, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 9020 + }, + { + "completion_length": 232.18751049041748, + "epoch": 1.5126786537574919, + "grad_norm": 0.16497460287344504, + "kl": 0.15875244140625, + "learning_rate": 4.982234702048658e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 9022 + }, + { + "completion_length": 223.31251049041748, + "epoch": 1.5130139569973595, + "grad_norm": 0.20123506275968653, + "kl": 0.142669677734375, + "learning_rate": 4.982220423867257e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 9024 + }, + { + "completion_length": 242.10268878936768, + "epoch": 1.5133492602372272, + "grad_norm": 0.20078758437765387, + "kl": 0.135955810546875, + "learning_rate": 4.982206139970865e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571823984385, + "rewards/format_reward_func": 1.0, + "step": 9026 + }, + { + "completion_length": 230.040189743042, + "epoch": 1.5136845634770946, + "grad_norm": 0.2897700898573048, + "kl": 0.130035400390625, + "learning_rate": 4.982191850359514e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 9028 + }, + { + "completion_length": 245.99108123779297, + "epoch": 1.5140198667169622, + "grad_norm": 0.23874503537560982, + "kl": 0.152069091796875, + "learning_rate": 4.982177555033236e-07, + "loss": 0.0002, + "reward": 1.7522322237491608, + "reward_std": 0.0473508988507092, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 0.9986607171595097, + "step": 9030 + }, + { + "completion_length": 237.39733219146729, + "epoch": 1.5143551699568296, + "grad_norm": 0.10178434262825313, + "kl": 0.1427001953125, + "learning_rate": 4.982163253992066e-07, + "loss": 0.0001, + "reward": 1.7035715132951736, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714574158192, + "rewards/format_reward_func": 1.0, + "step": 9032 + }, + { + "completion_length": 244.5625123977661, + "epoch": 1.5146904731966973, + "grad_norm": 0.23837435651216773, + "kl": 0.157928466796875, + "learning_rate": 4.982148947236036e-07, + "loss": 0.0002, + "reward": 1.7464286237955093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 9034 + }, + { + "completion_length": 237.78126335144043, + "epoch": 1.5150257764365649, + "grad_norm": 0.41831335496057137, + "kl": 0.15301513671875, + "learning_rate": 4.982134634765178e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 9036 + }, + { + "completion_length": 243.93304824829102, + "epoch": 1.5153610796764325, + "grad_norm": 0.19669736268949578, + "kl": 0.146759033203125, + "learning_rate": 4.982120316579527e-07, + "loss": 0.0001, + "reward": 1.6750001087784767, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6750000193715096, + "rewards/format_reward_func": 1.0, + "step": 9038 + }, + { + "completion_length": 249.227689743042, + "epoch": 1.5156963829163, + "grad_norm": 0.11420041596317355, + "kl": 0.165985107421875, + "learning_rate": 4.982105992679113e-07, + "loss": 0.0002, + "reward": 1.7678571939468384, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 9040 + }, + { + "completion_length": 253.27233219146729, + "epoch": 1.5160316861561673, + "grad_norm": 0.5580972834206843, + "kl": 0.196990966796875, + "learning_rate": 4.982091663063972e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 9042 + }, + { + "completion_length": 249.9330472946167, + "epoch": 1.516366989396035, + "grad_norm": 0.3001462222001796, + "kl": 0.154052734375, + "learning_rate": 4.982077327734135e-07, + "loss": 0.0002, + "reward": 1.7803572192788124, + "reward_std": 0.09848987124860287, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9044 + }, + { + "completion_length": 249.7946548461914, + "epoch": 1.5167022926359026, + "grad_norm": 0.2560114961097288, + "kl": 0.13873291015625, + "learning_rate": 4.982062986689637e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 9046 + }, + { + "completion_length": 249.946439743042, + "epoch": 1.5170375958757703, + "grad_norm": 0.22287832163678353, + "kl": 0.1337890625, + "learning_rate": 4.982048639930509e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714484751225, + "rewards/format_reward_func": 1.0, + "step": 9048 + }, + { + "completion_length": 241.42858409881592, + "epoch": 1.5173728991156377, + "grad_norm": 0.17462550405638447, + "kl": 0.124725341796875, + "learning_rate": 4.982034287456784e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7339286059141159, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9050 + }, + { + "completion_length": 241.64287090301514, + "epoch": 1.5177082023555053, + "grad_norm": 0.31667186359080757, + "kl": 0.12017822265625, + "learning_rate": 4.982019929268496e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 9052 + }, + { + "completion_length": 244.20983219146729, + "epoch": 1.5180435055953727, + "grad_norm": 0.7155592391967859, + "kl": 0.233062744140625, + "learning_rate": 4.982005565365678e-07, + "loss": 0.0002, + "reward": 1.7428571954369545, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 9054 + }, + { + "completion_length": 234.5044755935669, + "epoch": 1.5183788088352403, + "grad_norm": 0.19988312655943996, + "kl": 0.1508026123046875, + "learning_rate": 4.981991195748363e-07, + "loss": 0.0002, + "reward": 1.741071492433548, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357573926449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9056 + }, + { + "completion_length": 238.2053680419922, + "epoch": 1.518714112075108, + "grad_norm": 0.2720902850609489, + "kl": 0.113433837890625, + "learning_rate": 4.981976820416584e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 9058 + }, + { + "completion_length": 232.69197750091553, + "epoch": 1.5190494153149756, + "grad_norm": 0.20592691107488253, + "kl": 0.165008544921875, + "learning_rate": 4.981962439370374e-07, + "loss": 0.0002, + "reward": 1.8250000476837158, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000327825546, + "rewards/format_reward_func": 1.0, + "step": 9060 + }, + { + "completion_length": 236.633939743042, + "epoch": 1.519384718554843, + "grad_norm": 0.31761312113051826, + "kl": 0.1402587890625, + "learning_rate": 4.981948052609767e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 9062 + }, + { + "completion_length": 235.4241189956665, + "epoch": 1.5197200217947104, + "grad_norm": 0.1862511774783427, + "kl": 0.136566162109375, + "learning_rate": 4.981933660134795e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9064 + }, + { + "completion_length": 234.3928689956665, + "epoch": 1.520055325034578, + "grad_norm": 0.22103160635442895, + "kl": 0.165435791015625, + "learning_rate": 4.981919261945491e-07, + "loss": 0.0002, + "reward": 1.7767857685685158, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7812500204890966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9066 + }, + { + "completion_length": 234.696439743042, + "epoch": 1.5203906282744457, + "grad_norm": 0.5237816922040125, + "kl": 0.127655029296875, + "learning_rate": 4.981904858041889e-07, + "loss": 0.0001, + "reward": 1.6714286729693413, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6714286096394062, + "rewards/format_reward_func": 1.0, + "step": 9068 + }, + { + "completion_length": 233.1116180419922, + "epoch": 1.5207259315143133, + "grad_norm": 0.2222210056755971, + "kl": 0.108612060546875, + "learning_rate": 4.981890448424021e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 9070 + }, + { + "completion_length": 227.2232265472412, + "epoch": 1.5210612347541808, + "grad_norm": 0.3268976894652414, + "kl": 0.09259033203125, + "learning_rate": 4.981876033091922e-07, + "loss": 0.0001, + "reward": 1.766071505844593, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7696428820490837, + "rewards/format_reward_func": 0.9964285790920258, + "step": 9072 + }, + { + "completion_length": 221.62947368621826, + "epoch": 1.5213965379940484, + "grad_norm": 0.11221103222961556, + "kl": 0.10101318359375, + "learning_rate": 4.981861612045624e-07, + "loss": 0.0001, + "reward": 1.8357143253087997, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8357143104076385, + "rewards/format_reward_func": 1.0, + "step": 9074 + }, + { + "completion_length": 229.33483123779297, + "epoch": 1.5217318412339158, + "grad_norm": 0.16956317672941165, + "kl": 0.0871429443359375, + "learning_rate": 4.98184718528516e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 9076 + }, + { + "completion_length": 228.6741180419922, + "epoch": 1.5220671444737834, + "grad_norm": 0.11080000675820717, + "kl": 0.113494873046875, + "learning_rate": 4.981832752810564e-07, + "loss": 0.0001, + "reward": 1.7642857506871223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 9078 + }, + { + "completion_length": 231.3660831451416, + "epoch": 1.522402447713651, + "grad_norm": 0.074742168877599, + "kl": 0.09735107421875, + "learning_rate": 4.981818314621868e-07, + "loss": 0.0001, + "reward": 1.787500075995922, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9080 + }, + { + "completion_length": 230.1785831451416, + "epoch": 1.5227377509535187, + "grad_norm": 0.2248487524696226, + "kl": 0.111541748046875, + "learning_rate": 4.981803870719107e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 9082 + }, + { + "completion_length": 240.22322273254395, + "epoch": 1.5230730541933861, + "grad_norm": 0.2183842155755358, + "kl": 0.0999603271484375, + "learning_rate": 4.981789421102313e-07, + "loss": 0.0001, + "reward": 1.8250000327825546, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 9084 + }, + { + "completion_length": 231.1384048461914, + "epoch": 1.5234083574332535, + "grad_norm": 0.20518936982415142, + "kl": 0.0991363525390625, + "learning_rate": 4.98177496577152e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714767873287, + "rewards/format_reward_func": 1.0, + "step": 9086 + }, + { + "completion_length": 229.571439743042, + "epoch": 1.5237436606731212, + "grad_norm": 0.255223191177945, + "kl": 0.094573974609375, + "learning_rate": 4.981760504726759e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000402331352, + "rewards/format_reward_func": 1.0, + "step": 9088 + }, + { + "completion_length": 244.84822463989258, + "epoch": 1.5240789639129888, + "grad_norm": 0.24607461370798528, + "kl": 0.1124420166015625, + "learning_rate": 4.981746037968068e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 9090 + }, + { + "completion_length": 242.04911708831787, + "epoch": 1.5244142671528564, + "grad_norm": 0.09837423239495768, + "kl": 0.1160888671875, + "learning_rate": 4.981731565495475e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 9092 + }, + { + "completion_length": 240.37947463989258, + "epoch": 1.524749570392724, + "grad_norm": 0.19651923643307115, + "kl": 0.097412109375, + "learning_rate": 4.981717087309018e-07, + "loss": 0.0001, + "reward": 1.79642865806818, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 9094 + }, + { + "completion_length": 237.21875858306885, + "epoch": 1.5250848736325915, + "grad_norm": 0.14018062890921712, + "kl": 0.09429931640625, + "learning_rate": 4.981702603408726e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 9096 + }, + { + "completion_length": 230.48215198516846, + "epoch": 1.525420176872459, + "grad_norm": 0.21898535817201434, + "kl": 0.1555023193359375, + "learning_rate": 4.981688113794636e-07, + "loss": 0.0002, + "reward": 1.803571492433548, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714477300644, + "rewards/format_reward_func": 1.0, + "step": 9098 + }, + { + "completion_length": 244.7991180419922, + "epoch": 1.5257554801123265, + "grad_norm": 0.16935366185685477, + "kl": 0.0987396240234375, + "learning_rate": 4.981673618466779e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 9100 + }, + { + "completion_length": 241.27233123779297, + "epoch": 1.5260907833521942, + "grad_norm": 0.33333039079707383, + "kl": 0.0966796875, + "learning_rate": 4.981659117425189e-07, + "loss": 0.0001, + "reward": 1.6571429520845413, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.6571428943425417, + "rewards/format_reward_func": 1.0, + "step": 9102 + }, + { + "completion_length": 235.34376049041748, + "epoch": 1.5264260865920618, + "grad_norm": 0.2157871403374037, + "kl": 0.09100341796875, + "learning_rate": 4.9816446106699e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 9104 + }, + { + "completion_length": 235.5714406967163, + "epoch": 1.5267613898319292, + "grad_norm": 0.21741295384543297, + "kl": 0.0951690673828125, + "learning_rate": 4.981630098200946e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 9106 + }, + { + "completion_length": 239.915189743042, + "epoch": 1.5270966930717969, + "grad_norm": 0.3244411047147427, + "kl": 0.1490478515625, + "learning_rate": 4.981615580018357e-07, + "loss": 0.0001, + "reward": 1.7910714820027351, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9108 + }, + { + "completion_length": 240.0937614440918, + "epoch": 1.5274319963116643, + "grad_norm": 0.17487077329756923, + "kl": 0.125091552734375, + "learning_rate": 4.98160105612217e-07, + "loss": 0.0001, + "reward": 1.8000000342726707, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 9110 + }, + { + "completion_length": 251.28125953674316, + "epoch": 1.527767299551532, + "grad_norm": 0.26637816520531216, + "kl": 0.140777587890625, + "learning_rate": 4.981586526512418e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 9112 + }, + { + "completion_length": 251.94197463989258, + "epoch": 1.5281026027913995, + "grad_norm": 0.20245928613813574, + "kl": 0.1270904541015625, + "learning_rate": 4.981571991189133e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 9114 + }, + { + "completion_length": 242.29018878936768, + "epoch": 1.5284379060312672, + "grad_norm": 0.17394760960328057, + "kl": 0.153533935546875, + "learning_rate": 4.98155745015235e-07, + "loss": 0.0002, + "reward": 1.8214286118745804, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 9116 + }, + { + "completion_length": 247.1160831451416, + "epoch": 1.5287732092711346, + "grad_norm": 0.2279073006313851, + "kl": 0.10235595703125, + "learning_rate": 4.9815429034021e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357536673546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9118 + }, + { + "completion_length": 242.0178689956665, + "epoch": 1.529108512511002, + "grad_norm": 0.2639212009108935, + "kl": 0.1070709228515625, + "learning_rate": 4.981528350938419e-07, + "loss": 0.0001, + "reward": 1.7017857879400253, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7062500435858965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9120 + }, + { + "completion_length": 241.91965293884277, + "epoch": 1.5294438157508696, + "grad_norm": 0.22109535656033716, + "kl": 0.105133056640625, + "learning_rate": 4.98151379276134e-07, + "loss": 0.0001, + "reward": 1.796428605914116, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 9122 + }, + { + "completion_length": 249.31697463989258, + "epoch": 1.5297791189907373, + "grad_norm": 0.15664011211006298, + "kl": 0.098358154296875, + "learning_rate": 4.981499228870895e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.785714304074645, + "rewards/format_reward_func": 1.0, + "step": 9124 + }, + { + "completion_length": 242.37054634094238, + "epoch": 1.530114422230605, + "grad_norm": 0.18290893825178986, + "kl": 0.1169281005859375, + "learning_rate": 4.981484659267121e-07, + "loss": 0.0001, + "reward": 1.775000087916851, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 9126 + }, + { + "completion_length": 251.2991189956665, + "epoch": 1.5304497254704723, + "grad_norm": 0.25195483521072903, + "kl": 0.1036376953125, + "learning_rate": 4.981470083950047e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571674972773, + "rewards/format_reward_func": 1.0, + "step": 9128 + }, + { + "completion_length": 239.30358219146729, + "epoch": 1.53078502871034, + "grad_norm": 0.12518929441684795, + "kl": 0.095672607421875, + "learning_rate": 4.98145550291971e-07, + "loss": 0.0001, + "reward": 1.7642857432365417, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857600003481, + "rewards/format_reward_func": 1.0, + "step": 9130 + }, + { + "completion_length": 237.91072463989258, + "epoch": 1.5311203319502074, + "grad_norm": 0.20083257203715613, + "kl": 0.092926025390625, + "learning_rate": 4.981440916176142e-07, + "loss": 0.0001, + "reward": 1.7250000983476639, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000238418579, + "rewards/format_reward_func": 1.0, + "step": 9132 + }, + { + "completion_length": 247.0937614440918, + "epoch": 1.531455635190075, + "grad_norm": 0.22650875766293518, + "kl": 0.103668212890625, + "learning_rate": 4.981426323719377e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714708268642, + "rewards/format_reward_func": 1.0, + "step": 9134 + }, + { + "completion_length": 243.86162090301514, + "epoch": 1.5317909384299426, + "grad_norm": 0.08014615946695847, + "kl": 0.09442138671875, + "learning_rate": 4.981411725549449e-07, + "loss": 0.0001, + "reward": 1.710714377462864, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 1.0, + "step": 9136 + }, + { + "completion_length": 237.8616180419922, + "epoch": 1.5321262416698103, + "grad_norm": 0.20283591817892682, + "kl": 0.085479736328125, + "learning_rate": 4.98139712166639e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9138 + }, + { + "completion_length": 243.31250953674316, + "epoch": 1.5324615449096777, + "grad_norm": 0.2042566525715254, + "kl": 0.0798492431640625, + "learning_rate": 4.981382512070235e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 9140 + }, + { + "completion_length": 237.3437614440918, + "epoch": 1.532796848149545, + "grad_norm": 0.2004883026580934, + "kl": 0.088653564453125, + "learning_rate": 4.981367896761019e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 9142 + }, + { + "completion_length": 238.80358219146729, + "epoch": 1.5331321513894127, + "grad_norm": 0.19496765967296867, + "kl": 0.090240478515625, + "learning_rate": 4.981353275738772e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714447498322, + "rewards/format_reward_func": 1.0, + "step": 9144 + }, + { + "completion_length": 231.03125953674316, + "epoch": 1.5334674546292804, + "grad_norm": 0.17878598708437798, + "kl": 0.0771331787109375, + "learning_rate": 4.981338649003531e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 9146 + }, + { + "completion_length": 241.41965579986572, + "epoch": 1.533802757869148, + "grad_norm": 0.1694416729522096, + "kl": 0.092437744140625, + "learning_rate": 4.981324016555328e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7142857573926449, + "rewards/format_reward_func": 1.0, + "step": 9148 + }, + { + "completion_length": 241.62947368621826, + "epoch": 1.5341380611090156, + "grad_norm": 0.3325622452794708, + "kl": 0.0935211181640625, + "learning_rate": 4.981309378394197e-07, + "loss": 0.0001, + "reward": 1.6928572282195091, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.6928571686148643, + "rewards/format_reward_func": 1.0, + "step": 9150 + }, + { + "completion_length": 233.10268878936768, + "epoch": 1.534473364348883, + "grad_norm": 0.23292539109008156, + "kl": 0.086761474609375, + "learning_rate": 4.981294734520172e-07, + "loss": 0.0001, + "reward": 1.767857238650322, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 9152 + }, + { + "completion_length": 237.7544765472412, + "epoch": 1.5348086675887505, + "grad_norm": 0.20881513610441563, + "kl": 0.0846099853515625, + "learning_rate": 4.981280084933287e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214730620384, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9154 + }, + { + "completion_length": 233.49554824829102, + "epoch": 1.535143970828618, + "grad_norm": 0.2662071000635271, + "kl": 0.089324951171875, + "learning_rate": 4.981265429633575e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 9156 + }, + { + "completion_length": 238.50001049041748, + "epoch": 1.5354792740684857, + "grad_norm": 0.22534214293420757, + "kl": 0.102813720703125, + "learning_rate": 4.98125076862107e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 9158 + }, + { + "completion_length": 226.07590293884277, + "epoch": 1.5358145773083534, + "grad_norm": 0.2740503946863582, + "kl": 0.1274871826171875, + "learning_rate": 4.981236101895806e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 9160 + }, + { + "completion_length": 219.53572463989258, + "epoch": 1.5361498805482208, + "grad_norm": 0.206061237532187, + "kl": 0.0881195068359375, + "learning_rate": 4.981221429457815e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714488476515, + "rewards/format_reward_func": 1.0, + "step": 9162 + }, + { + "completion_length": 222.23661708831787, + "epoch": 1.5364851837880884, + "grad_norm": 0.1439280155706265, + "kl": 0.093841552734375, + "learning_rate": 4.981206751307135e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 9164 + }, + { + "completion_length": 228.29018878936768, + "epoch": 1.5368204870279558, + "grad_norm": 0.24340325547414882, + "kl": 0.092681884765625, + "learning_rate": 4.981192067443795e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 1.0, + "step": 9166 + }, + { + "completion_length": 223.34376049041748, + "epoch": 1.5371557902678235, + "grad_norm": 0.18917360158958488, + "kl": 0.10345458984375, + "learning_rate": 4.981177377867831e-07, + "loss": 0.0001, + "reward": 1.7892857566475868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857361584902, + "rewards/format_reward_func": 1.0, + "step": 9168 + }, + { + "completion_length": 238.33483600616455, + "epoch": 1.537491093507691, + "grad_norm": 0.273360201970595, + "kl": 0.10321044921875, + "learning_rate": 4.981162682579278e-07, + "loss": 0.0001, + "reward": 1.7071429267525673, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7071428801864386, + "rewards/format_reward_func": 1.0, + "step": 9170 + }, + { + "completion_length": 226.3482255935669, + "epoch": 1.5378263967475587, + "grad_norm": 0.32470792042468705, + "kl": 0.098785400390625, + "learning_rate": 4.981147981578168e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8178571574389935, + "rewards/format_reward_func": 1.0, + "step": 9172 + }, + { + "completion_length": 234.78572273254395, + "epoch": 1.5381616999874261, + "grad_norm": 0.2515031157446506, + "kl": 0.1017303466796875, + "learning_rate": 4.981133274864535e-07, + "loss": 0.0001, + "reward": 1.7665178999304771, + "reward_std": 0.027147849323228, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 0.9986607171595097, + "step": 9174 + }, + { + "completion_length": 229.47768688201904, + "epoch": 1.5384970032272935, + "grad_norm": 0.16447082251076406, + "kl": 0.092315673828125, + "learning_rate": 4.981118562438414e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 9176 + }, + { + "completion_length": 234.4107255935669, + "epoch": 1.5388323064671612, + "grad_norm": 0.21182922552358208, + "kl": 0.107757568359375, + "learning_rate": 4.981103844299837e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 9178 + }, + { + "completion_length": 232.08483123779297, + "epoch": 1.5391676097070288, + "grad_norm": 0.1271282836822565, + "kl": 0.09710693359375, + "learning_rate": 4.981089120448839e-07, + "loss": 0.0001, + "reward": 1.8000000268220901, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 9180 + }, + { + "completion_length": 251.49554920196533, + "epoch": 1.5395029129468965, + "grad_norm": 0.08852330164511354, + "kl": 0.103973388671875, + "learning_rate": 4.981074390885455e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 9182 + }, + { + "completion_length": 247.26787090301514, + "epoch": 1.5398382161867639, + "grad_norm": 0.18896351681271314, + "kl": 0.09765625, + "learning_rate": 4.981059655609717e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 9184 + }, + { + "completion_length": 245.33036708831787, + "epoch": 1.5401735194266315, + "grad_norm": 0.16901570124704474, + "kl": 0.0955810546875, + "learning_rate": 4.98104491462166e-07, + "loss": 0.0001, + "reward": 1.8035714626312256, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 9186 + }, + { + "completion_length": 244.08482933044434, + "epoch": 1.540508822666499, + "grad_norm": 0.21763164806003021, + "kl": 0.0872344970703125, + "learning_rate": 4.981030167921317e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9188 + }, + { + "completion_length": 244.71876049041748, + "epoch": 1.5408441259063665, + "grad_norm": 0.26243133709106686, + "kl": 0.0949249267578125, + "learning_rate": 4.981015415508725e-07, + "loss": 0.0001, + "reward": 1.7107143625617027, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143141329288, + "rewards/format_reward_func": 1.0, + "step": 9190 + }, + { + "completion_length": 242.61608028411865, + "epoch": 1.5411794291462342, + "grad_norm": 0.17520268835161978, + "kl": 0.10418701171875, + "learning_rate": 4.981000657383914e-07, + "loss": 0.0001, + "reward": 1.8000000268220901, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000417232513, + "rewards/format_reward_func": 1.0, + "step": 9192 + }, + { + "completion_length": 239.6473331451416, + "epoch": 1.5415147323861018, + "grad_norm": 0.20087686035129643, + "kl": 0.109222412109375, + "learning_rate": 4.980985893546919e-07, + "loss": 0.0001, + "reward": 1.7428572475910187, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 9194 + }, + { + "completion_length": 244.56697463989258, + "epoch": 1.5418500356259692, + "grad_norm": 0.20778828417353182, + "kl": 0.10235595703125, + "learning_rate": 4.980971123997776e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214574158192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9196 + }, + { + "completion_length": 241.5357265472412, + "epoch": 1.5421853388658366, + "grad_norm": 0.28343013163751585, + "kl": 0.094146728515625, + "learning_rate": 4.980956348736516e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9198 + }, + { + "completion_length": 236.90179920196533, + "epoch": 1.5425206421057043, + "grad_norm": 0.19766081655551698, + "kl": 0.102569580078125, + "learning_rate": 4.980941567763176e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 9200 + }, + { + "completion_length": 236.74554538726807, + "epoch": 1.542855945345572, + "grad_norm": 0.22589139544291423, + "kl": 0.1072540283203125, + "learning_rate": 4.980926781077788e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143182307482, + "rewards/format_reward_func": 1.0, + "step": 9202 + }, + { + "completion_length": 238.62500858306885, + "epoch": 1.5431912485854395, + "grad_norm": 0.2167072922018885, + "kl": 0.0994873046875, + "learning_rate": 4.980911988680386e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 9204 + }, + { + "completion_length": 237.29018783569336, + "epoch": 1.543526551825307, + "grad_norm": 0.23633367374109066, + "kl": 0.095428466796875, + "learning_rate": 4.980897190571006e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 9206 + }, + { + "completion_length": 243.02233219146729, + "epoch": 1.5438618550651746, + "grad_norm": 0.2340838662723435, + "kl": 0.11846923828125, + "learning_rate": 4.980882386749681e-07, + "loss": 0.0001, + "reward": 1.7339286655187607, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9208 + }, + { + "completion_length": 236.89733219146729, + "epoch": 1.544197158305042, + "grad_norm": 0.2706127020814316, + "kl": 0.097137451171875, + "learning_rate": 4.980867577216444e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285880327225, + "rewards/format_reward_func": 1.0, + "step": 9210 + }, + { + "completion_length": 233.38393878936768, + "epoch": 1.5445324615449096, + "grad_norm": 0.20047158656273134, + "kl": 0.105621337890625, + "learning_rate": 4.98085276197133e-07, + "loss": 0.0001, + "reward": 1.707142911851406, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7071428932249546, + "rewards/format_reward_func": 1.0, + "step": 9212 + }, + { + "completion_length": 244.8303680419922, + "epoch": 1.5448677647847773, + "grad_norm": 0.134787846932301, + "kl": 0.119598388671875, + "learning_rate": 4.980837941014374e-07, + "loss": 0.0001, + "reward": 1.6964286714792252, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6964286044239998, + "rewards/format_reward_func": 1.0, + "step": 9214 + }, + { + "completion_length": 238.9776906967163, + "epoch": 1.545203068024645, + "grad_norm": 0.2149803153413945, + "kl": 0.110198974609375, + "learning_rate": 4.980823114345608e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 9216 + }, + { + "completion_length": 224.63393783569336, + "epoch": 1.5455383712645123, + "grad_norm": 0.24007473653044104, + "kl": 0.105682373046875, + "learning_rate": 4.980808281965068e-07, + "loss": 0.0001, + "reward": 1.7357143759727478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 1.0, + "step": 9218 + }, + { + "completion_length": 235.7634038925171, + "epoch": 1.5458736745043797, + "grad_norm": 0.1669212593409537, + "kl": 0.110992431640625, + "learning_rate": 4.980793443872788e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 9220 + }, + { + "completion_length": 226.36161613464355, + "epoch": 1.5462089777442474, + "grad_norm": 0.16524317745228337, + "kl": 0.111541748046875, + "learning_rate": 4.980778600068801e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 9222 + }, + { + "completion_length": 219.8169765472412, + "epoch": 1.546544280984115, + "grad_norm": 0.2477187774885109, + "kl": 0.091156005859375, + "learning_rate": 4.980763750553142e-07, + "loss": 0.0001, + "reward": 1.8071429207921028, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 9224 + }, + { + "completion_length": 235.0000114440918, + "epoch": 1.5468795842239826, + "grad_norm": 0.11273552378741418, + "kl": 0.117584228515625, + "learning_rate": 4.980748895325845e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 9226 + }, + { + "completion_length": 224.48215293884277, + "epoch": 1.5472148874638503, + "grad_norm": 0.2502290193023174, + "kl": 0.1129608154296875, + "learning_rate": 4.980734034386944e-07, + "loss": 0.0001, + "reward": 1.7964286133646965, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 9228 + }, + { + "completion_length": 222.09375953674316, + "epoch": 1.5475501907037177, + "grad_norm": 0.1613240611923338, + "kl": 0.113433837890625, + "learning_rate": 4.980719167736474e-07, + "loss": 0.0001, + "reward": 1.7839286476373672, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.788392897695303, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9230 + }, + { + "completion_length": 226.97768688201904, + "epoch": 1.547885493943585, + "grad_norm": 0.6493610709705243, + "kl": 0.1061248779296875, + "learning_rate": 4.980704295374469e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 9232 + }, + { + "completion_length": 222.69197463989258, + "epoch": 1.5482207971834527, + "grad_norm": 0.25523525723368407, + "kl": 0.114501953125, + "learning_rate": 4.980689417300963e-07, + "loss": 0.0001, + "reward": 1.76250009983778, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9234 + }, + { + "completion_length": 223.65625858306885, + "epoch": 1.5485561004233204, + "grad_norm": 0.15555612635792165, + "kl": 0.1201171875, + "learning_rate": 4.980674533515989e-07, + "loss": 0.0001, + "reward": 1.8285714462399483, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8285714536905289, + "rewards/format_reward_func": 1.0, + "step": 9236 + }, + { + "completion_length": 216.60268783569336, + "epoch": 1.548891403663188, + "grad_norm": 0.0735118992899377, + "kl": 0.10296630859375, + "learning_rate": 4.980659644019584e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 9238 + }, + { + "completion_length": 217.47322368621826, + "epoch": 1.5492267069030554, + "grad_norm": 0.25828737432664267, + "kl": 0.104248046875, + "learning_rate": 4.980644748811778e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 9240 + }, + { + "completion_length": 214.54911518096924, + "epoch": 1.549562010142923, + "grad_norm": 0.21234573043333244, + "kl": 0.132171630859375, + "learning_rate": 4.980629847892611e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571809083223, + "rewards/format_reward_func": 1.0, + "step": 9242 + }, + { + "completion_length": 218.95982933044434, + "epoch": 1.5498973133827905, + "grad_norm": 0.15823434184499677, + "kl": 0.100311279296875, + "learning_rate": 4.980614941262113e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 9244 + }, + { + "completion_length": 212.95536708831787, + "epoch": 1.550232616622658, + "grad_norm": 0.21603125601891104, + "kl": 0.103302001953125, + "learning_rate": 4.98060002892032e-07, + "loss": 0.0001, + "reward": 1.82857146859169, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8285714648663998, + "rewards/format_reward_func": 1.0, + "step": 9246 + }, + { + "completion_length": 217.14733123779297, + "epoch": 1.5505679198625257, + "grad_norm": 0.2851890565606586, + "kl": 0.114898681640625, + "learning_rate": 4.980585110867265e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 9248 + }, + { + "completion_length": 213.21875858306885, + "epoch": 1.5509032231023934, + "grad_norm": 0.40561699254233524, + "kl": 0.11175537109375, + "learning_rate": 4.980570187102985e-07, + "loss": 0.0001, + "reward": 1.7267857864499092, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9250 + }, + { + "completion_length": 214.15179634094238, + "epoch": 1.5512385263422608, + "grad_norm": 0.2224340282727774, + "kl": 0.097808837890625, + "learning_rate": 4.980555257627511e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 9252 + }, + { + "completion_length": 210.20090198516846, + "epoch": 1.5515738295821282, + "grad_norm": 0.1431330448665486, + "kl": 0.11029052734375, + "learning_rate": 4.980540322440881e-07, + "loss": 0.0001, + "reward": 1.832142896950245, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8321428745985031, + "rewards/format_reward_func": 1.0, + "step": 9254 + }, + { + "completion_length": 215.96429443359375, + "epoch": 1.5519091328219958, + "grad_norm": 0.42489945876210056, + "kl": 0.1027374267578125, + "learning_rate": 4.980525381543126e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 9256 + }, + { + "completion_length": 219.38840198516846, + "epoch": 1.5522444360618635, + "grad_norm": 0.24199594531058644, + "kl": 0.11187744140625, + "learning_rate": 4.980510434934283e-07, + "loss": 0.0001, + "reward": 1.7035714983940125, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714685916901, + "rewards/format_reward_func": 1.0, + "step": 9258 + }, + { + "completion_length": 205.69197463989258, + "epoch": 1.552579739301731, + "grad_norm": 0.2722525309108444, + "kl": 0.092529296875, + "learning_rate": 4.980495482614384e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 9260 + }, + { + "completion_length": 218.0134038925171, + "epoch": 1.5529150425415985, + "grad_norm": 0.16731072750023818, + "kl": 0.0987548828125, + "learning_rate": 4.980480524583465e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9262 + }, + { + "completion_length": 210.11608123779297, + "epoch": 1.5532503457814661, + "grad_norm": 0.20944642746491426, + "kl": 0.098785400390625, + "learning_rate": 4.98046556084156e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857264727354, + "rewards/format_reward_func": 1.0, + "step": 9264 + }, + { + "completion_length": 220.58483123779297, + "epoch": 1.5535856490213336, + "grad_norm": 0.39444762467548544, + "kl": 0.1048736572265625, + "learning_rate": 4.980450591388705e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7267857398837805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9266 + }, + { + "completion_length": 219.38393688201904, + "epoch": 1.5539209522612012, + "grad_norm": 0.23693520520013314, + "kl": 0.102447509765625, + "learning_rate": 4.980435616224932e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 9268 + }, + { + "completion_length": 218.48661613464355, + "epoch": 1.5542562555010688, + "grad_norm": 0.653163923404132, + "kl": 0.10009765625, + "learning_rate": 4.980420635350277e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071566939354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9270 + }, + { + "completion_length": 212.60715293884277, + "epoch": 1.5545915587409365, + "grad_norm": 0.6311999205666676, + "kl": 0.097198486328125, + "learning_rate": 4.980405648764773e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.785714304074645, + "rewards/format_reward_func": 1.0, + "step": 9272 + }, + { + "completion_length": 221.80358219146729, + "epoch": 1.5549268619808039, + "grad_norm": 0.7841224772161673, + "kl": 0.11199951171875, + "learning_rate": 4.980390656468456e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9274 + }, + { + "completion_length": 212.97322368621826, + "epoch": 1.5552621652206713, + "grad_norm": 0.4696213408353257, + "kl": 0.111358642578125, + "learning_rate": 4.980375658461361e-07, + "loss": 0.0001, + "reward": 1.7696429044008255, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9276 + }, + { + "completion_length": 213.40179443359375, + "epoch": 1.555597468460539, + "grad_norm": 0.18546115989892328, + "kl": 0.10791015625, + "learning_rate": 4.980360654743521e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 9278 + }, + { + "completion_length": 210.76786708831787, + "epoch": 1.5559327717004066, + "grad_norm": 0.25145691695231076, + "kl": 0.109344482421875, + "learning_rate": 4.98034564531497e-07, + "loss": 0.0001, + "reward": 1.737500049173832, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643234461546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9280 + }, + { + "completion_length": 213.77679443359375, + "epoch": 1.5562680749402742, + "grad_norm": 0.20233206188903105, + "kl": 0.125946044921875, + "learning_rate": 4.980330630175746e-07, + "loss": 0.0001, + "reward": 1.7526786401867867, + "reward_std": 0.056821079924702644, + "rewards/equation_reward_func": 0.7589286062866449, + "rewards/format_reward_func": 0.9937500059604645, + "step": 9282 + }, + { + "completion_length": 215.31251049041748, + "epoch": 1.5566033781801418, + "grad_norm": 0.2149426775367761, + "kl": 0.14520263671875, + "learning_rate": 4.980315609325879e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.10101525206118822, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9284 + }, + { + "completion_length": 222.44643878936768, + "epoch": 1.5569386814200092, + "grad_norm": 0.3231430975015793, + "kl": 0.1226806640625, + "learning_rate": 4.980300582765406e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9286 + }, + { + "completion_length": 218.65179634094238, + "epoch": 1.5572739846598767, + "grad_norm": 0.1417486913493435, + "kl": 0.122344970703125, + "learning_rate": 4.980285550494362e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9288 + }, + { + "completion_length": 223.09376049041748, + "epoch": 1.5576092878997443, + "grad_norm": 0.4358009612428243, + "kl": 0.121795654296875, + "learning_rate": 4.980270512512782e-07, + "loss": 0.0001, + "reward": 1.7535714656114578, + "reward_std": 0.10606601648032665, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9290 + }, + { + "completion_length": 208.5178680419922, + "epoch": 1.557944591139612, + "grad_norm": 0.276077930647653, + "kl": 0.1182861328125, + "learning_rate": 4.980255468820699e-07, + "loss": 0.0001, + "reward": 1.7750000357627869, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 9292 + }, + { + "completion_length": 227.321439743042, + "epoch": 1.5582798943794796, + "grad_norm": 0.2746660299952982, + "kl": 0.126617431640625, + "learning_rate": 4.980240419418148e-07, + "loss": 0.0001, + "reward": 1.7285714745521545, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714577883482, + "rewards/format_reward_func": 1.0, + "step": 9294 + }, + { + "completion_length": 205.32590103149414, + "epoch": 1.558615197619347, + "grad_norm": 0.251892460012717, + "kl": 0.115447998046875, + "learning_rate": 4.980225364305164e-07, + "loss": 0.0001, + "reward": 1.8071428760886192, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428947150707, + "rewards/format_reward_func": 1.0, + "step": 9296 + }, + { + "completion_length": 221.91965293884277, + "epoch": 1.5589505008592146, + "grad_norm": 0.5319414903169392, + "kl": 0.1319580078125, + "learning_rate": 4.980210303481782e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9298 + }, + { + "completion_length": 227.07590293884277, + "epoch": 1.559285804099082, + "grad_norm": 0.436605329446181, + "kl": 0.13720703125, + "learning_rate": 4.980195236948036e-07, + "loss": 0.0001, + "reward": 1.7053572311997414, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7098214644938707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9300 + }, + { + "completion_length": 225.92858123779297, + "epoch": 1.5596211073389497, + "grad_norm": 0.2605545821872648, + "kl": 0.1298828125, + "learning_rate": 4.98018016470396e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 9302 + }, + { + "completion_length": 227.6607265472412, + "epoch": 1.5599564105788173, + "grad_norm": 0.4288816041377394, + "kl": 0.13018798828125, + "learning_rate": 4.980165086749592e-07, + "loss": 0.0001, + "reward": 1.7763393446803093, + "reward_std": 0.07386740390211344, + "rewards/equation_reward_func": 0.7839286029338837, + "rewards/format_reward_func": 0.9924107193946838, + "step": 9304 + }, + { + "completion_length": 225.93750953674316, + "epoch": 1.560291713818685, + "grad_norm": 0.19834380017707282, + "kl": 0.1331787109375, + "learning_rate": 4.980150003084962e-07, + "loss": 0.0001, + "reward": 1.7553571984171867, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9306 + }, + { + "completion_length": 225.0312614440918, + "epoch": 1.5606270170585523, + "grad_norm": 0.1229449240722239, + "kl": 0.127410888671875, + "learning_rate": 4.980134913710108e-07, + "loss": 0.0001, + "reward": 1.7232143580913544, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7276786081492901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9308 + }, + { + "completion_length": 226.49554634094238, + "epoch": 1.5609623202984197, + "grad_norm": 0.5843174232684581, + "kl": 0.125244140625, + "learning_rate": 4.980119818625064e-07, + "loss": 0.0001, + "reward": 1.7660715132951736, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9310 + }, + { + "completion_length": 233.28125858306885, + "epoch": 1.5612976235382874, + "grad_norm": 0.12572644726899307, + "kl": 0.1334228515625, + "learning_rate": 4.980104717829865e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9312 + }, + { + "completion_length": 228.2098331451416, + "epoch": 1.561632926778155, + "grad_norm": 0.20672609168897751, + "kl": 0.13775634765625, + "learning_rate": 4.980089611324545e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714652389288, + "rewards/format_reward_func": 1.0, + "step": 9314 + }, + { + "completion_length": 232.10268878936768, + "epoch": 1.5619682300180227, + "grad_norm": 0.27711563932045935, + "kl": 0.13140869140625, + "learning_rate": 4.980074499109139e-07, + "loss": 0.0001, + "reward": 1.7071429342031479, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7071428894996643, + "rewards/format_reward_func": 1.0, + "step": 9316 + }, + { + "completion_length": 235.50000953674316, + "epoch": 1.56230353325789, + "grad_norm": 0.2513881781273045, + "kl": 0.143341064453125, + "learning_rate": 4.980059381183682e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428749710321, + "rewards/format_reward_func": 1.0, + "step": 9318 + }, + { + "completion_length": 222.54018688201904, + "epoch": 1.5626388364977577, + "grad_norm": 0.20648914774796778, + "kl": 0.136016845703125, + "learning_rate": 4.980044257548209e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 9320 + }, + { + "completion_length": 235.7634048461914, + "epoch": 1.5629741397376251, + "grad_norm": 0.26204188469958983, + "kl": 0.159210205078125, + "learning_rate": 4.980029128202755e-07, + "loss": 0.0002, + "reward": 1.7553572058677673, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7687500342726707, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9322 + }, + { + "completion_length": 215.79465293884277, + "epoch": 1.5633094429774927, + "grad_norm": 0.31719240838874485, + "kl": 0.118438720703125, + "learning_rate": 4.980013993147353e-07, + "loss": 0.0001, + "reward": 1.7928571626543999, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 9324 + }, + { + "completion_length": 211.04911518096924, + "epoch": 1.5636447462173604, + "grad_norm": 0.14979368350307748, + "kl": 0.1125640869140625, + "learning_rate": 4.97999885238204e-07, + "loss": 0.0001, + "reward": 1.8232143446803093, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8276785835623741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9326 + }, + { + "completion_length": 223.22768878936768, + "epoch": 1.563980049457228, + "grad_norm": 0.31624646214512, + "kl": 0.12713623046875, + "learning_rate": 4.979983705906852e-07, + "loss": 0.0001, + "reward": 1.7500000968575478, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 9328 + }, + { + "completion_length": 218.57590293884277, + "epoch": 1.5643153526970954, + "grad_norm": 0.395515407485876, + "kl": 0.124420166015625, + "learning_rate": 4.979968553721819e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9330 + }, + { + "completion_length": 211.87054634094238, + "epoch": 1.5646506559369628, + "grad_norm": 0.2938857964215618, + "kl": 0.10455322265625, + "learning_rate": 4.97995339582698e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 9332 + }, + { + "completion_length": 212.06250953674316, + "epoch": 1.5649859591768305, + "grad_norm": 0.2958914012499181, + "kl": 0.135986328125, + "learning_rate": 4.97993823222237e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 9334 + }, + { + "completion_length": 216.01786708831787, + "epoch": 1.5653212624166981, + "grad_norm": 0.22586410780715277, + "kl": 0.13189697265625, + "learning_rate": 4.979923062908022e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9336 + }, + { + "completion_length": 210.58036422729492, + "epoch": 1.5656565656565657, + "grad_norm": 0.31286413540600805, + "kl": 0.1175537109375, + "learning_rate": 4.979907887883971e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 9338 + }, + { + "completion_length": 209.39286613464355, + "epoch": 1.5659918688964332, + "grad_norm": 0.004725622950303789, + "kl": 0.134185791015625, + "learning_rate": 4.979892707150253e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 9340 + }, + { + "completion_length": 210.72322368621826, + "epoch": 1.5663271721363008, + "grad_norm": 0.22215252473600736, + "kl": 0.126220703125, + "learning_rate": 4.979877520706902e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714510828257, + "rewards/format_reward_func": 1.0, + "step": 9342 + }, + { + "completion_length": 211.70982837677002, + "epoch": 1.5666624753761682, + "grad_norm": 0.28259642807042173, + "kl": 0.126251220703125, + "learning_rate": 4.979862328553954e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571697324514, + "rewards/format_reward_func": 1.0, + "step": 9344 + }, + { + "completion_length": 203.66072463989258, + "epoch": 1.5669977786160358, + "grad_norm": 0.11420190732138948, + "kl": 0.1182861328125, + "learning_rate": 4.979847130691442e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000081956387, + "rewards/format_reward_func": 1.0, + "step": 9346 + }, + { + "completion_length": 204.75893878936768, + "epoch": 1.5673330818559035, + "grad_norm": 0.2543779500102239, + "kl": 0.117279052734375, + "learning_rate": 4.979831927119405e-07, + "loss": 0.0001, + "reward": 1.7035715132951736, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7035714611411095, + "rewards/format_reward_func": 1.0, + "step": 9348 + }, + { + "completion_length": 214.9107265472412, + "epoch": 1.567668385095771, + "grad_norm": 0.2692869245101748, + "kl": 0.16180419921875, + "learning_rate": 4.979816717837874e-07, + "loss": 0.0002, + "reward": 1.7339286357164383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928950875998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9350 + }, + { + "completion_length": 200.07143783569336, + "epoch": 1.5680036883356385, + "grad_norm": 0.3329726441021534, + "kl": 0.144622802734375, + "learning_rate": 4.979801502846885e-07, + "loss": 0.0001, + "reward": 1.785714328289032, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 9352 + }, + { + "completion_length": 206.977689743042, + "epoch": 1.568338991575506, + "grad_norm": 0.18730600752968107, + "kl": 0.13232421875, + "learning_rate": 4.979786282146474e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143022119999, + "rewards/format_reward_func": 1.0, + "step": 9354 + }, + { + "completion_length": 208.29018783569336, + "epoch": 1.5686742948153736, + "grad_norm": 0.13967090602322763, + "kl": 0.225189208984375, + "learning_rate": 4.979771055736677e-07, + "loss": 0.0002, + "reward": 1.807142898440361, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 9356 + }, + { + "completion_length": 207.01340293884277, + "epoch": 1.5690095980552412, + "grad_norm": 0.27378668060925143, + "kl": 0.123443603515625, + "learning_rate": 4.979755823617525e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 9358 + }, + { + "completion_length": 204.61607933044434, + "epoch": 1.5693449012951088, + "grad_norm": 0.1799442544954095, + "kl": 0.1424560546875, + "learning_rate": 4.979740585789057e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428701281548, + "rewards/format_reward_func": 1.0, + "step": 9360 + }, + { + "completion_length": 218.508939743042, + "epoch": 1.5696802045349765, + "grad_norm": 0.10917451343746819, + "kl": 0.1553955078125, + "learning_rate": 4.979725342251307e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857640981674, + "rewards/format_reward_func": 1.0, + "step": 9362 + }, + { + "completion_length": 199.85268878936768, + "epoch": 1.5700155077748439, + "grad_norm": 0.34883051529480286, + "kl": 0.137542724609375, + "learning_rate": 4.979710093004311e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 9364 + }, + { + "completion_length": 213.64286613464355, + "epoch": 1.5703508110147113, + "grad_norm": 0.23714127105446917, + "kl": 0.195068359375, + "learning_rate": 4.9796948380481e-07, + "loss": 0.0002, + "reward": 1.7000000923871994, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7000000439584255, + "rewards/format_reward_func": 1.0, + "step": 9366 + }, + { + "completion_length": 204.15179634094238, + "epoch": 1.570686114254579, + "grad_norm": 0.31596825075507784, + "kl": 0.106353759765625, + "learning_rate": 4.979679577382714e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 9368 + }, + { + "completion_length": 200.46875858306885, + "epoch": 1.5710214174944466, + "grad_norm": 0.25781774042292127, + "kl": 0.127532958984375, + "learning_rate": 4.979664311008185e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 9370 + }, + { + "completion_length": 207.22768783569336, + "epoch": 1.5713567207343142, + "grad_norm": 0.2883900615043515, + "kl": 0.1649169921875, + "learning_rate": 4.979649038924551e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 9372 + }, + { + "completion_length": 208.43304443359375, + "epoch": 1.5716920239741816, + "grad_norm": 0.26372455175352644, + "kl": 0.18939208984375, + "learning_rate": 4.979633761131845e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 9374 + }, + { + "completion_length": 209.62947273254395, + "epoch": 1.5720273272140493, + "grad_norm": 0.3049719918557979, + "kl": 0.19256591796875, + "learning_rate": 4.979618477630102e-07, + "loss": 0.0002, + "reward": 1.7714286297559738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 9376 + }, + { + "completion_length": 201.66072273254395, + "epoch": 1.5723626304539167, + "grad_norm": 0.16845659994063056, + "kl": 0.12188720703125, + "learning_rate": 4.979603188419358e-07, + "loss": 0.0001, + "reward": 1.7392858117818832, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 9378 + }, + { + "completion_length": 205.45536518096924, + "epoch": 1.5726979336937843, + "grad_norm": 0.22718930926429257, + "kl": 0.132293701171875, + "learning_rate": 4.979587893499649e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 9380 + }, + { + "completion_length": 209.20983028411865, + "epoch": 1.573033236933652, + "grad_norm": 0.2509327116323553, + "kl": 0.1441650390625, + "learning_rate": 4.979572592871009e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 9382 + }, + { + "completion_length": 211.25000858306885, + "epoch": 1.5733685401735196, + "grad_norm": 0.08130233488984963, + "kl": 0.2010498046875, + "learning_rate": 4.979557286533473e-07, + "loss": 0.0002, + "reward": 1.800000049173832, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 9384 + }, + { + "completion_length": 200.88840007781982, + "epoch": 1.573703843413387, + "grad_norm": 0.08435947022938288, + "kl": 0.12811279296875, + "learning_rate": 4.979541974487077e-07, + "loss": 0.0001, + "reward": 1.7450893595814705, + "reward_std": 0.033461302518844604, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 0.9986607171595097, + "step": 9386 + }, + { + "completion_length": 216.1919755935669, + "epoch": 1.5740391466532544, + "grad_norm": 0.3632452957424689, + "kl": 0.2205810546875, + "learning_rate": 4.979526656731856e-07, + "loss": 0.0002, + "reward": 1.7303572222590446, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214574158192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9388 + }, + { + "completion_length": 219.18304538726807, + "epoch": 1.574374449893122, + "grad_norm": 0.30401658940048676, + "kl": 0.13153076171875, + "learning_rate": 4.979511333267845e-07, + "loss": 0.0001, + "reward": 1.7267857939004898, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9390 + }, + { + "completion_length": 215.1741180419922, + "epoch": 1.5747097531329897, + "grad_norm": 0.24728973081644434, + "kl": 0.20745849609375, + "learning_rate": 4.979496004095081e-07, + "loss": 0.0002, + "reward": 1.7375001013278961, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643066823483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9392 + }, + { + "completion_length": 207.53572273254395, + "epoch": 1.5750450563728573, + "grad_norm": 0.21061804157548694, + "kl": 0.24725341796875, + "learning_rate": 4.979480669213596e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 1.0, + "step": 9394 + }, + { + "completion_length": 197.40179443359375, + "epoch": 1.5753803596127247, + "grad_norm": 0.26878445750989804, + "kl": 0.260589599609375, + "learning_rate": 4.979465328623428e-07, + "loss": 0.0003, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 9396 + }, + { + "completion_length": 191.18750953674316, + "epoch": 1.5757156628525923, + "grad_norm": 0.26487623591318504, + "kl": 0.123138427734375, + "learning_rate": 4.979449982324612e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 9398 + }, + { + "completion_length": 193.94643878936768, + "epoch": 1.5760509660924598, + "grad_norm": 0.3455811579684974, + "kl": 0.138336181640625, + "learning_rate": 4.979434630317181e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 9400 + }, + { + "completion_length": 200.52233123779297, + "epoch": 1.5763862693323274, + "grad_norm": 0.1336782326343173, + "kl": 0.133148193359375, + "learning_rate": 4.979419272601174e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 9402 + }, + { + "completion_length": 192.94643783569336, + "epoch": 1.576721572572195, + "grad_norm": 0.23079795904077532, + "kl": 0.14453125, + "learning_rate": 4.979403909176625e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428868919611, + "rewards/format_reward_func": 1.0, + "step": 9404 + }, + { + "completion_length": 203.18750858306885, + "epoch": 1.5770568758120627, + "grad_norm": 0.20095549925177159, + "kl": 0.288726806640625, + "learning_rate": 4.979388540043568e-07, + "loss": 0.0003, + "reward": 1.7750000730156898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 9406 + }, + { + "completion_length": 191.08036613464355, + "epoch": 1.57739217905193, + "grad_norm": 0.20087382205830567, + "kl": 0.220916748046875, + "learning_rate": 4.979373165202039e-07, + "loss": 0.0002, + "reward": 1.7535715103149414, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 9408 + }, + { + "completion_length": 191.23661708831787, + "epoch": 1.5777274822917975, + "grad_norm": 0.13166586092674154, + "kl": 0.138763427734375, + "learning_rate": 4.979357784652073e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.753571467474103, + "rewards/format_reward_func": 1.0, + "step": 9410 + }, + { + "completion_length": 188.56697177886963, + "epoch": 1.5780627855316651, + "grad_norm": 0.13549451743155633, + "kl": 0.115631103515625, + "learning_rate": 4.979342398393707e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 9412 + }, + { + "completion_length": 181.10268592834473, + "epoch": 1.5783980887715328, + "grad_norm": 0.20587388166420736, + "kl": 0.23712158203125, + "learning_rate": 4.979327006426975e-07, + "loss": 0.0002, + "reward": 1.7053572311997414, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7098214700818062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9414 + }, + { + "completion_length": 179.82143688201904, + "epoch": 1.5787333920114004, + "grad_norm": 0.324524725201001, + "kl": 0.12750244140625, + "learning_rate": 4.979311608751915e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321429029107094, + "rewards/format_reward_func": 1.0, + "step": 9416 + }, + { + "completion_length": 181.51340103149414, + "epoch": 1.579068695251268, + "grad_norm": 0.363719970992371, + "kl": 0.156158447265625, + "learning_rate": 4.979296205368558e-07, + "loss": 0.0002, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821429036557674, + "rewards/format_reward_func": 1.0, + "step": 9418 + }, + { + "completion_length": 184.34375953674316, + "epoch": 1.5794039984911354, + "grad_norm": 0.2968390327039545, + "kl": 0.14080810546875, + "learning_rate": 4.979280796276943e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428607031703, + "rewards/format_reward_func": 1.0, + "step": 9420 + }, + { + "completion_length": 184.85715103149414, + "epoch": 1.5797393017310029, + "grad_norm": 0.2681062341314168, + "kl": 0.1266632080078125, + "learning_rate": 4.979265381477104e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 9422 + }, + { + "completion_length": 181.8125057220459, + "epoch": 1.5800746049708705, + "grad_norm": 0.18636219767722761, + "kl": 0.105621337890625, + "learning_rate": 4.979249960969077e-07, + "loss": 0.0001, + "reward": 1.7410715222358704, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9424 + }, + { + "completion_length": 192.44643592834473, + "epoch": 1.5804099082107381, + "grad_norm": 0.2592943293661732, + "kl": 0.173614501953125, + "learning_rate": 4.979234534752898e-07, + "loss": 0.0002, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 9426 + }, + { + "completion_length": 180.08929347991943, + "epoch": 1.5807452114506058, + "grad_norm": 0.018409488738189025, + "kl": 0.177093505859375, + "learning_rate": 4.979219102828601e-07, + "loss": 0.0002, + "reward": 1.7678571939468384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571902215481, + "rewards/format_reward_func": 1.0, + "step": 9428 + }, + { + "completion_length": 188.42857933044434, + "epoch": 1.5810805146904732, + "grad_norm": 0.19724579011986249, + "kl": 0.108642578125, + "learning_rate": 4.979203665196222e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 9430 + }, + { + "completion_length": 188.49554347991943, + "epoch": 1.5814158179303408, + "grad_norm": 0.37446507304918436, + "kl": 0.10906982421875, + "learning_rate": 4.979188221855797e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 9432 + }, + { + "completion_length": 188.35268688201904, + "epoch": 1.5817511211702082, + "grad_norm": 0.28298486688243313, + "kl": 0.1163330078125, + "learning_rate": 4.979172772807363e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 9434 + }, + { + "completion_length": 180.41518592834473, + "epoch": 1.5820864244100759, + "grad_norm": 0.19784500966747084, + "kl": 0.117919921875, + "learning_rate": 4.979157318050953e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 9436 + }, + { + "completion_length": 182.25893783569336, + "epoch": 1.5824217276499435, + "grad_norm": 0.18481772827255594, + "kl": 0.114227294921875, + "learning_rate": 4.979141857586604e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 9438 + }, + { + "completion_length": 181.86607933044434, + "epoch": 1.5827570308898111, + "grad_norm": 0.40152965965342363, + "kl": 0.295135498046875, + "learning_rate": 4.979126391414352e-07, + "loss": 0.0003, + "reward": 1.7428572326898575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 9440 + }, + { + "completion_length": 188.20090293884277, + "epoch": 1.5830923341296785, + "grad_norm": 0.3072119809838924, + "kl": 0.13543701171875, + "learning_rate": 4.97911091953423e-07, + "loss": 0.0001, + "reward": 1.72857154160738, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7285714540630579, + "rewards/format_reward_func": 1.0, + "step": 9442 + }, + { + "completion_length": 184.23661518096924, + "epoch": 1.583427637369546, + "grad_norm": 0.27136970912830893, + "kl": 0.186431884765625, + "learning_rate": 4.979095441946276e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 9444 + }, + { + "completion_length": 184.88393688201904, + "epoch": 1.5837629406094136, + "grad_norm": 0.2625038055494905, + "kl": 0.111175537109375, + "learning_rate": 4.979079958650525e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 9446 + }, + { + "completion_length": 188.89733028411865, + "epoch": 1.5840982438492812, + "grad_norm": 0.20991182672532305, + "kl": 0.110870361328125, + "learning_rate": 4.979064469647014e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 9448 + }, + { + "completion_length": 187.84822273254395, + "epoch": 1.5844335470891489, + "grad_norm": 0.24590372324776322, + "kl": 0.131500244140625, + "learning_rate": 4.979048974935776e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 9450 + }, + { + "completion_length": 185.66072368621826, + "epoch": 1.5847688503290163, + "grad_norm": 0.15685101508432459, + "kl": 0.204193115234375, + "learning_rate": 4.97903347451685e-07, + "loss": 0.0002, + "reward": 1.7357143387198448, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 9452 + }, + { + "completion_length": 204.43750762939453, + "epoch": 1.585104153568884, + "grad_norm": 0.28556903563308955, + "kl": 0.4090576171875, + "learning_rate": 4.979017968390268e-07, + "loss": 0.0004, + "reward": 1.7250000983476639, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7250000182539225, + "rewards/format_reward_func": 1.0, + "step": 9454 + }, + { + "completion_length": 192.05804538726807, + "epoch": 1.5854394568087513, + "grad_norm": 0.268773106820556, + "kl": 0.1308441162109375, + "learning_rate": 4.979002456556068e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9456 + }, + { + "completion_length": 199.78125858306885, + "epoch": 1.585774760048619, + "grad_norm": 0.24722104897199124, + "kl": 0.10369873046875, + "learning_rate": 4.978986939014285e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 9458 + }, + { + "completion_length": 198.80804347991943, + "epoch": 1.5861100632884866, + "grad_norm": 0.20960824627267807, + "kl": 0.140289306640625, + "learning_rate": 4.978971415764955e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 9460 + }, + { + "completion_length": 203.90179443359375, + "epoch": 1.5864453665283542, + "grad_norm": 0.291625399876519, + "kl": 0.148956298828125, + "learning_rate": 4.978955886808114e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 9462 + }, + { + "completion_length": 196.58483123779297, + "epoch": 1.5867806697682216, + "grad_norm": 0.3246287484114839, + "kl": 0.113006591796875, + "learning_rate": 4.978940352143797e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 9464 + }, + { + "completion_length": 186.13840198516846, + "epoch": 1.587115973008089, + "grad_norm": 0.4854994353952275, + "kl": 0.113677978515625, + "learning_rate": 4.97892481177204e-07, + "loss": 0.0001, + "reward": 1.721428669989109, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 9466 + }, + { + "completion_length": 197.19197368621826, + "epoch": 1.5874512762479567, + "grad_norm": 0.31571928329953813, + "kl": 0.101104736328125, + "learning_rate": 4.97890926569288e-07, + "loss": 0.0001, + "reward": 1.8000000715255737, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 9468 + }, + { + "completion_length": 198.43750953674316, + "epoch": 1.5877865794878243, + "grad_norm": 0.18825773566152013, + "kl": 0.14898681640625, + "learning_rate": 4.978893713906351e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571864962578, + "rewards/format_reward_func": 1.0, + "step": 9470 + }, + { + "completion_length": 206.13840198516846, + "epoch": 1.588121882727692, + "grad_norm": 0.1629431048033093, + "kl": 0.111083984375, + "learning_rate": 4.97887815641249e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 9472 + }, + { + "completion_length": 203.19197273254395, + "epoch": 1.5884571859675594, + "grad_norm": 0.33415035155337125, + "kl": 0.104461669921875, + "learning_rate": 4.978862593211331e-07, + "loss": 0.0001, + "reward": 1.7508929297327995, + "reward_std": 0.08965103607624769, + "rewards/equation_reward_func": 0.7526786141097546, + "rewards/format_reward_func": 0.9982142895460129, + "step": 9474 + }, + { + "completion_length": 195.89286708831787, + "epoch": 1.588792489207427, + "grad_norm": 0.3242104161132054, + "kl": 0.09625244140625, + "learning_rate": 4.978847024302914e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000417232513, + "rewards/format_reward_func": 1.0, + "step": 9476 + }, + { + "completion_length": 194.82590293884277, + "epoch": 1.5891277924472944, + "grad_norm": 0.2486906112229366, + "kl": 0.0967254638671875, + "learning_rate": 4.97883144968727e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857540398836, + "rewards/format_reward_func": 1.0, + "step": 9478 + }, + { + "completion_length": 196.92411422729492, + "epoch": 1.589463095687162, + "grad_norm": 0.19468741497020042, + "kl": 0.10626220703125, + "learning_rate": 4.978815869364437e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286237955093, + "rewards/format_reward_func": 1.0, + "step": 9480 + }, + { + "completion_length": 205.51786613464355, + "epoch": 1.5897983989270297, + "grad_norm": 0.21640894935551291, + "kl": 0.12603759765625, + "learning_rate": 4.978800283334451e-07, + "loss": 0.0001, + "reward": 1.7272322252392769, + "reward_std": 0.04230013629421592, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 0.9986607171595097, + "step": 9482 + }, + { + "completion_length": 193.09375762939453, + "epoch": 1.5901337021668973, + "grad_norm": 0.15703601361285324, + "kl": 0.100189208984375, + "learning_rate": 4.978784691597347e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857361584902, + "rewards/format_reward_func": 1.0, + "step": 9484 + }, + { + "completion_length": 188.65179347991943, + "epoch": 1.5904690054067647, + "grad_norm": 0.2601317163988536, + "kl": 0.136810302734375, + "learning_rate": 4.978769094153163e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 9486 + }, + { + "completion_length": 196.0580472946167, + "epoch": 1.5908043086466321, + "grad_norm": 0.2600653373287085, + "kl": 0.117095947265625, + "learning_rate": 4.978753491001933e-07, + "loss": 0.0001, + "reward": 1.7857143133878708, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 9488 + }, + { + "completion_length": 200.73215198516846, + "epoch": 1.5911396118864998, + "grad_norm": 0.27887034524746496, + "kl": 0.18231201171875, + "learning_rate": 4.978737882143693e-07, + "loss": 0.0002, + "reward": 1.8178572058677673, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 9490 + }, + { + "completion_length": 202.33036708831787, + "epoch": 1.5914749151263674, + "grad_norm": 0.3410612068150405, + "kl": 0.125335693359375, + "learning_rate": 4.97872226757848e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571734577417, + "rewards/format_reward_func": 1.0, + "step": 9492 + }, + { + "completion_length": 203.17858123779297, + "epoch": 1.591810218366235, + "grad_norm": 0.18929719663093758, + "kl": 0.266021728515625, + "learning_rate": 4.978706647306329e-07, + "loss": 0.0003, + "reward": 1.717857226729393, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571727126837, + "rewards/format_reward_func": 1.0, + "step": 9494 + }, + { + "completion_length": 194.48661708831787, + "epoch": 1.5921455216061027, + "grad_norm": 0.44137712767320597, + "kl": 0.210601806640625, + "learning_rate": 4.978691021327276e-07, + "loss": 0.0002, + "reward": 1.7553572058677673, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9496 + }, + { + "completion_length": 208.91965007781982, + "epoch": 1.59248082484597, + "grad_norm": 0.3316997283825163, + "kl": 0.11297607421875, + "learning_rate": 4.978675389641357e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 9498 + }, + { + "completion_length": 200.73215103149414, + "epoch": 1.5928161280858375, + "grad_norm": 0.27617819723690057, + "kl": 0.2326507568359375, + "learning_rate": 4.978659752248608e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 9500 + }, + { + "completion_length": 210.42411613464355, + "epoch": 1.5931514313257051, + "grad_norm": 0.23732779196931864, + "kl": 0.19146728515625, + "learning_rate": 4.978644109149066e-07, + "loss": 0.0002, + "reward": 1.778571479022503, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 9502 + }, + { + "completion_length": 206.71875858306885, + "epoch": 1.5934867345655728, + "grad_norm": 0.1889520164049944, + "kl": 0.13079833984375, + "learning_rate": 4.978628460342766e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 9504 + }, + { + "completion_length": 209.49107837677002, + "epoch": 1.5938220378054404, + "grad_norm": 0.28097707245087605, + "kl": 0.125396728515625, + "learning_rate": 4.978612805829745e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428846567869, + "rewards/format_reward_func": 1.0, + "step": 9506 + }, + { + "completion_length": 219.37947368621826, + "epoch": 1.5941573410453078, + "grad_norm": 0.1472375894145825, + "kl": 0.1482086181640625, + "learning_rate": 4.978597145610037e-07, + "loss": 0.0001, + "reward": 1.7035714983940125, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7125000394880772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9508 + }, + { + "completion_length": 194.54465103149414, + "epoch": 1.5944926442851755, + "grad_norm": 0.30787715412318145, + "kl": 0.1240234375, + "learning_rate": 4.97858147968368e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9510 + }, + { + "completion_length": 205.63840198516846, + "epoch": 1.5948279475250429, + "grad_norm": 0.22203363715006483, + "kl": 0.12939453125, + "learning_rate": 4.97856580805071e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 9512 + }, + { + "completion_length": 218.54465293884277, + "epoch": 1.5951632507649105, + "grad_norm": 0.3115238153228958, + "kl": 0.197662353515625, + "learning_rate": 4.978550130711163e-07, + "loss": 0.0002, + "reward": 1.7607143819332123, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.760714303702116, + "rewards/format_reward_func": 1.0, + "step": 9514 + }, + { + "completion_length": 210.4062614440918, + "epoch": 1.5954985540047781, + "grad_norm": 0.33087151315019003, + "kl": 0.162689208984375, + "learning_rate": 4.978534447665072e-07, + "loss": 0.0002, + "reward": 1.76071435213089, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 9516 + }, + { + "completion_length": 219.63840198516846, + "epoch": 1.5958338572446458, + "grad_norm": 0.3291601561579799, + "kl": 0.1533203125, + "learning_rate": 4.978518758912478e-07, + "loss": 0.0002, + "reward": 1.712500087916851, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.716964328661561, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9518 + }, + { + "completion_length": 217.66518688201904, + "epoch": 1.5961691604845132, + "grad_norm": 0.16251913228443396, + "kl": 0.175567626953125, + "learning_rate": 4.978503064453414e-07, + "loss": 0.0002, + "reward": 1.7875000536441803, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9520 + }, + { + "completion_length": 214.29911708831787, + "epoch": 1.5965044637243806, + "grad_norm": 0.275822424940639, + "kl": 0.133331298828125, + "learning_rate": 4.978487364287918e-07, + "loss": 0.0001, + "reward": 1.7589286491274834, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9522 + }, + { + "completion_length": 222.16965293884277, + "epoch": 1.5968397669642482, + "grad_norm": 0.28340261424616403, + "kl": 0.2544708251953125, + "learning_rate": 4.978471658416024e-07, + "loss": 0.0003, + "reward": 1.7392857819795609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7482143118977547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9524 + }, + { + "completion_length": 219.68304634094238, + "epoch": 1.5971750702041159, + "grad_norm": 0.3308031864705601, + "kl": 0.1259765625, + "learning_rate": 4.978455946837769e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 9526 + }, + { + "completion_length": 228.6205472946167, + "epoch": 1.5975103734439835, + "grad_norm": 0.23764646592147184, + "kl": 0.1822509765625, + "learning_rate": 4.978440229553191e-07, + "loss": 0.0002, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 9528 + }, + { + "completion_length": 221.06697273254395, + "epoch": 1.597845676683851, + "grad_norm": 0.1900498694957338, + "kl": 0.152618408203125, + "learning_rate": 4.978424506562324e-07, + "loss": 0.0002, + "reward": 1.780357226729393, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214469850063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9530 + }, + { + "completion_length": 238.60268783569336, + "epoch": 1.5981809799237185, + "grad_norm": 0.21449225533502828, + "kl": 0.18243408203125, + "learning_rate": 4.978408777865204e-07, + "loss": 0.0002, + "reward": 1.71428582072258, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857350409031, + "rewards/format_reward_func": 1.0, + "step": 9532 + }, + { + "completion_length": 228.45536708831787, + "epoch": 1.598516283163586, + "grad_norm": 0.25287001539221815, + "kl": 0.203948974609375, + "learning_rate": 4.978393043461869e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 9534 + }, + { + "completion_length": 251.9598331451416, + "epoch": 1.5988515864034536, + "grad_norm": 0.35490299267432196, + "kl": 0.30194091796875, + "learning_rate": 4.978377303352353e-07, + "loss": 0.0003, + "reward": 1.705357238650322, + "reward_std": 0.13384521286934614, + "rewards/equation_reward_func": 0.7276785857975483, + "rewards/format_reward_func": 0.977678582072258, + "step": 9536 + }, + { + "completion_length": 235.15179824829102, + "epoch": 1.5991868896433212, + "grad_norm": 0.2435142863699395, + "kl": 0.14727783203125, + "learning_rate": 4.978361557536696e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9538 + }, + { + "completion_length": 230.4866180419922, + "epoch": 1.5995221928831889, + "grad_norm": 0.26605692272202913, + "kl": 0.159637451171875, + "learning_rate": 4.97834580601493e-07, + "loss": 0.0002, + "reward": 1.7589286342263222, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9540 + }, + { + "completion_length": 235.8571548461914, + "epoch": 1.5998574961230563, + "grad_norm": 0.18506222865561817, + "kl": 0.141998291015625, + "learning_rate": 4.978330048787092e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.8089286088943481, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9542 + }, + { + "completion_length": 225.2857255935669, + "epoch": 1.6001927993629237, + "grad_norm": 0.24497391090326853, + "kl": 0.1688232421875, + "learning_rate": 4.978314285853222e-07, + "loss": 0.0002, + "reward": 1.8107143267989159, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 9544 + }, + { + "completion_length": 229.04465293884277, + "epoch": 1.6005281026027913, + "grad_norm": 0.16757262472399967, + "kl": 0.142120361328125, + "learning_rate": 4.978298517213352e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7232143171131611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9546 + }, + { + "completion_length": 230.67858219146729, + "epoch": 1.600863405842659, + "grad_norm": 0.32121427829377835, + "kl": 0.168212890625, + "learning_rate": 4.97828274286752e-07, + "loss": 0.0002, + "reward": 1.7035714983940125, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 1.0, + "step": 9548 + }, + { + "completion_length": 242.65626049041748, + "epoch": 1.6011987090825266, + "grad_norm": 0.3786336398381342, + "kl": 0.359466552734375, + "learning_rate": 4.978266962815763e-07, + "loss": 0.0004, + "reward": 1.6892857998609543, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.6982143260538578, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9550 + }, + { + "completion_length": 218.7142972946167, + "epoch": 1.6015340123223942, + "grad_norm": 0.1542904347763142, + "kl": 0.2550048828125, + "learning_rate": 4.978251177058116e-07, + "loss": 0.0003, + "reward": 1.766071505844593, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9552 + }, + { + "completion_length": 220.08483219146729, + "epoch": 1.6018693155622616, + "grad_norm": 0.2862551303649256, + "kl": 0.214569091796875, + "learning_rate": 4.978235385594616e-07, + "loss": 0.0002, + "reward": 1.8107143640518188, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8107142969965935, + "rewards/format_reward_func": 1.0, + "step": 9554 + }, + { + "completion_length": 225.383939743042, + "epoch": 1.602204618802129, + "grad_norm": 0.24500226164542638, + "kl": 0.258697509765625, + "learning_rate": 4.9782195884253e-07, + "loss": 0.0003, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 9556 + }, + { + "completion_length": 211.54018783569336, + "epoch": 1.6025399220419967, + "grad_norm": 0.2018758611186633, + "kl": 0.230255126953125, + "learning_rate": 4.978203785550203e-07, + "loss": 0.0002, + "reward": 1.8357143476605415, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8357143104076385, + "rewards/format_reward_func": 1.0, + "step": 9558 + }, + { + "completion_length": 226.4553680419922, + "epoch": 1.6028752252818643, + "grad_norm": 0.22041226567845768, + "kl": 0.21649169921875, + "learning_rate": 4.978187976969361e-07, + "loss": 0.0002, + "reward": 1.7392857745289803, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7482143305242062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9560 + }, + { + "completion_length": 203.51340198516846, + "epoch": 1.603210528521732, + "grad_norm": 0.6848401584259823, + "kl": 0.314666748046875, + "learning_rate": 4.978172162682812e-07, + "loss": 0.0003, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 9562 + }, + { + "completion_length": 208.80804443359375, + "epoch": 1.6035458317615994, + "grad_norm": 0.23038082418106143, + "kl": 0.174072265625, + "learning_rate": 4.978156342690593e-07, + "loss": 0.0002, + "reward": 1.7946429252624512, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9564 + }, + { + "completion_length": 210.25893783569336, + "epoch": 1.603881135001467, + "grad_norm": 0.26692083769371217, + "kl": 0.13800048828125, + "learning_rate": 4.978140516992738e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 9566 + }, + { + "completion_length": 209.26786613464355, + "epoch": 1.6042164382413344, + "grad_norm": 0.6875469688497401, + "kl": 0.182403564453125, + "learning_rate": 4.978124685589286e-07, + "loss": 0.0002, + "reward": 1.7714286595582962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 9568 + }, + { + "completion_length": 211.85268688201904, + "epoch": 1.604551741481202, + "grad_norm": 0.10928835035140934, + "kl": 0.168670654296875, + "learning_rate": 4.978108848480271e-07, + "loss": 0.0002, + "reward": 1.716071479022503, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7205357421189547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9570 + }, + { + "completion_length": 206.12500953674316, + "epoch": 1.6048870447210697, + "grad_norm": 0.35867449197804246, + "kl": 0.158905029296875, + "learning_rate": 4.97809300566573e-07, + "loss": 0.0002, + "reward": 1.825000062584877, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000327825546, + "rewards/format_reward_func": 1.0, + "step": 9572 + }, + { + "completion_length": 208.10268878936768, + "epoch": 1.6052223479609373, + "grad_norm": 0.3136847225169866, + "kl": 0.259307861328125, + "learning_rate": 4.978077157145702e-07, + "loss": 0.0003, + "reward": 1.7446429133415222, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9574 + }, + { + "completion_length": 194.73661422729492, + "epoch": 1.6055576512008047, + "grad_norm": 0.1617781791123457, + "kl": 0.131927490234375, + "learning_rate": 4.97806130292022e-07, + "loss": 0.0001, + "reward": 1.7089286595582962, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9576 + }, + { + "completion_length": 200.80358028411865, + "epoch": 1.6058929544406721, + "grad_norm": 0.5766524950768712, + "kl": 0.162017822265625, + "learning_rate": 4.978045442989323e-07, + "loss": 0.0002, + "reward": 1.775000087916851, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 9578 + }, + { + "completion_length": 193.70090198516846, + "epoch": 1.6062282576805398, + "grad_norm": 0.33900780695246113, + "kl": 0.133544921875, + "learning_rate": 4.978029577353047e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 9580 + }, + { + "completion_length": 198.50893688201904, + "epoch": 1.6065635609204074, + "grad_norm": 0.31181010593599673, + "kl": 0.1903076171875, + "learning_rate": 4.978013706011427e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000279396772, + "rewards/format_reward_func": 1.0, + "step": 9582 + }, + { + "completion_length": 201.63840293884277, + "epoch": 1.606898864160275, + "grad_norm": 0.2769393664102841, + "kl": 0.160369873046875, + "learning_rate": 4.977997828964501e-07, + "loss": 0.0002, + "reward": 1.7928571850061417, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 9584 + }, + { + "completion_length": 192.71875858306885, + "epoch": 1.6072341674001425, + "grad_norm": 0.3967558757175834, + "kl": 0.1456298828125, + "learning_rate": 4.977981946212305e-07, + "loss": 0.0001, + "reward": 1.8035715073347092, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 9586 + }, + { + "completion_length": 209.50893878936768, + "epoch": 1.60756947064001, + "grad_norm": 0.08294927589433111, + "kl": 0.18817138671875, + "learning_rate": 4.977966057754877e-07, + "loss": 0.0002, + "reward": 1.733928643167019, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928876370192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9588 + }, + { + "completion_length": 208.34375953674316, + "epoch": 1.6079047738798775, + "grad_norm": 0.21126790987528263, + "kl": 0.141693115234375, + "learning_rate": 4.977950163592251e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286215603352, + "rewards/format_reward_func": 1.0, + "step": 9590 + }, + { + "completion_length": 222.32590198516846, + "epoch": 1.6082400771197451, + "grad_norm": 0.2547810669547315, + "kl": 0.19598388671875, + "learning_rate": 4.977934263724466e-07, + "loss": 0.0002, + "reward": 1.7285715118050575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 9592 + }, + { + "completion_length": 208.25000858306885, + "epoch": 1.6085753803596128, + "grad_norm": 0.1419821972171379, + "kl": 0.1297607421875, + "learning_rate": 4.977918358151557e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 9594 + }, + { + "completion_length": 199.52233123779297, + "epoch": 1.6089106835994804, + "grad_norm": 0.17384380850416808, + "kl": 0.115386962890625, + "learning_rate": 4.977902446873561e-07, + "loss": 0.0001, + "reward": 1.776785746216774, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9596 + }, + { + "completion_length": 201.48661613464355, + "epoch": 1.6092459868393478, + "grad_norm": 0.36774919060599237, + "kl": 0.175018310546875, + "learning_rate": 4.977886529890515e-07, + "loss": 0.0002, + "reward": 1.8000000417232513, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 9598 + }, + { + "completion_length": 213.88393878936768, + "epoch": 1.6095812900792152, + "grad_norm": 0.20051143266263496, + "kl": 0.131591796875, + "learning_rate": 4.977870607202456e-07, + "loss": 0.0001, + "reward": 1.7357143759727478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143089175224, + "rewards/format_reward_func": 1.0, + "step": 9600 + }, + { + "completion_length": 216.59822368621826, + "epoch": 1.6099165933190829, + "grad_norm": 0.29833393306666617, + "kl": 0.210479736328125, + "learning_rate": 4.977854678809419e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 9602 + }, + { + "completion_length": 225.0178689956665, + "epoch": 1.6102518965589505, + "grad_norm": 0.2855325199172955, + "kl": 0.134735107421875, + "learning_rate": 4.977838744711443e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 9604 + }, + { + "completion_length": 219.96429538726807, + "epoch": 1.6105871997988181, + "grad_norm": 0.1848297056822976, + "kl": 0.1536865234375, + "learning_rate": 4.977822804908562e-07, + "loss": 0.0002, + "reward": 1.7950893342494965, + "reward_std": 0.02714784862473607, + "rewards/equation_reward_func": 0.7964285928755999, + "rewards/format_reward_func": 0.9986607171595097, + "step": 9606 + }, + { + "completion_length": 218.5178680419922, + "epoch": 1.6109225030386856, + "grad_norm": 0.12919435157945253, + "kl": 0.184478759765625, + "learning_rate": 4.977806859400816e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 9608 + }, + { + "completion_length": 238.8705472946167, + "epoch": 1.6112578062785532, + "grad_norm": 0.6828934653371714, + "kl": 0.240020751953125, + "learning_rate": 4.977790908188239e-07, + "loss": 0.0002, + "reward": 1.7178572118282318, + "reward_std": 0.09596449136734009, + "rewards/equation_reward_func": 0.7357143089175224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9610 + }, + { + "completion_length": 222.44197463989258, + "epoch": 1.6115931095184206, + "grad_norm": 0.3959098496736238, + "kl": 0.192840576171875, + "learning_rate": 4.977774951270869e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 1.0, + "step": 9612 + }, + { + "completion_length": 225.5491180419922, + "epoch": 1.6119284127582882, + "grad_norm": 0.13520841983062826, + "kl": 0.23187255859375, + "learning_rate": 4.977758988648742e-07, + "loss": 0.0002, + "reward": 1.7321429178118706, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428786963224, + "rewards/format_reward_func": 1.0, + "step": 9614 + }, + { + "completion_length": 219.13840293884277, + "epoch": 1.6122637159981559, + "grad_norm": 0.8712622346968198, + "kl": 0.259063720703125, + "learning_rate": 4.977743020321896e-07, + "loss": 0.0003, + "reward": 1.789285771548748, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 9616 + }, + { + "completion_length": 221.70983219146729, + "epoch": 1.6125990192380235, + "grad_norm": 0.3392027587240292, + "kl": 0.131103515625, + "learning_rate": 4.977727046290365e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9618 + }, + { + "completion_length": 222.52679634094238, + "epoch": 1.612934322477891, + "grad_norm": 0.29242676314966715, + "kl": 0.13372802734375, + "learning_rate": 4.977711066554189e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 9620 + }, + { + "completion_length": 231.80804634094238, + "epoch": 1.6132696257177583, + "grad_norm": 0.22772919308178813, + "kl": 0.1455078125, + "learning_rate": 4.977695081113403e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9622 + }, + { + "completion_length": 215.53125953674316, + "epoch": 1.613604928957626, + "grad_norm": 0.24217956537165358, + "kl": 0.1041412353515625, + "learning_rate": 4.977679089968044e-07, + "loss": 0.0001, + "reward": 1.7678571864962578, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 9624 + }, + { + "completion_length": 227.67411613464355, + "epoch": 1.6139402321974936, + "grad_norm": 0.2160774633134719, + "kl": 0.239593505859375, + "learning_rate": 4.977663093118151e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 9626 + }, + { + "completion_length": 230.6919755935669, + "epoch": 1.6142755354373612, + "grad_norm": 0.1998150112790896, + "kl": 0.160888671875, + "learning_rate": 4.977647090563757e-07, + "loss": 0.0002, + "reward": 1.7517857775092125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500070780516, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9628 + }, + { + "completion_length": 228.01786613464355, + "epoch": 1.6146108386772289, + "grad_norm": 0.22424239037497734, + "kl": 0.160491943359375, + "learning_rate": 4.977631082304901e-07, + "loss": 0.0002, + "reward": 1.7714286372065544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 9630 + }, + { + "completion_length": 232.89733409881592, + "epoch": 1.6149461419170963, + "grad_norm": 0.3348732755377922, + "kl": 0.160919189453125, + "learning_rate": 4.97761506834162e-07, + "loss": 0.0002, + "reward": 1.7446429058909416, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.7580357380211353, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9632 + }, + { + "completion_length": 234.50447368621826, + "epoch": 1.6152814451569637, + "grad_norm": 0.1766412362968214, + "kl": 0.18402099609375, + "learning_rate": 4.97759904867395e-07, + "loss": 0.0002, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 9634 + }, + { + "completion_length": 230.77233123779297, + "epoch": 1.6156167483968313, + "grad_norm": 0.2087278214446919, + "kl": 0.19451904296875, + "learning_rate": 4.977583023301929e-07, + "loss": 0.0002, + "reward": 1.7035715132951736, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 1.0, + "step": 9636 + }, + { + "completion_length": 227.9375123977661, + "epoch": 1.615952051636699, + "grad_norm": 0.7674398765859399, + "kl": 0.294891357421875, + "learning_rate": 4.977566992225594e-07, + "loss": 0.0003, + "reward": 1.816071480512619, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.820535734295845, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9638 + }, + { + "completion_length": 231.5312614440918, + "epoch": 1.6162873548765666, + "grad_norm": 0.24075346358740746, + "kl": 0.1507568359375, + "learning_rate": 4.97755095544498e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 9640 + }, + { + "completion_length": 237.66072463989258, + "epoch": 1.616622658116434, + "grad_norm": 0.2443153487966591, + "kl": 0.151885986328125, + "learning_rate": 4.977534912960124e-07, + "loss": 0.0002, + "reward": 1.698214367032051, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7116071619093418, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9642 + }, + { + "completion_length": 236.14733028411865, + "epoch": 1.6169579613563017, + "grad_norm": 0.33337972372874886, + "kl": 0.476715087890625, + "learning_rate": 4.977518864771065e-07, + "loss": 0.0005, + "reward": 1.7375000938773155, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.741964302957058, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9644 + }, + { + "completion_length": 229.7812614440918, + "epoch": 1.617293264596169, + "grad_norm": 0.16053063780435226, + "kl": 0.09991455078125, + "learning_rate": 4.97750281087784e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 9646 + }, + { + "completion_length": 234.77679920196533, + "epoch": 1.6176285678360367, + "grad_norm": 0.21157833209685314, + "kl": 0.130218505859375, + "learning_rate": 4.977486751280484e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 9648 + }, + { + "completion_length": 234.44643878936768, + "epoch": 1.6179638710759043, + "grad_norm": 0.3738183027599362, + "kl": 0.736419677734375, + "learning_rate": 4.977470685979035e-07, + "loss": 0.0007, + "reward": 1.8035714700818062, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.8125000223517418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9650 + }, + { + "completion_length": 231.8482265472412, + "epoch": 1.618299174315772, + "grad_norm": 0.1965934925777346, + "kl": 0.094573974609375, + "learning_rate": 4.97745461497353e-07, + "loss": 0.0001, + "reward": 1.841071493923664, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8455357328057289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9652 + }, + { + "completion_length": 231.94643783569336, + "epoch": 1.6186344775556394, + "grad_norm": 0.3680231256765474, + "kl": 0.1709747314453125, + "learning_rate": 4.977438538264006e-07, + "loss": 0.0002, + "reward": 1.8571428954601288, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8571428693830967, + "rewards/format_reward_func": 1.0, + "step": 9654 + }, + { + "completion_length": 224.1741180419922, + "epoch": 1.6189697807955068, + "grad_norm": 0.32522419030659977, + "kl": 0.321319580078125, + "learning_rate": 4.9774224558505e-07, + "loss": 0.0003, + "reward": 1.7875000461935997, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643297791481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9656 + }, + { + "completion_length": 227.69643783569336, + "epoch": 1.6193050840353744, + "grad_norm": 0.27102926902278984, + "kl": 0.31005859375, + "learning_rate": 4.977406367733049e-07, + "loss": 0.0003, + "reward": 1.7321429327130318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.732142886146903, + "rewards/format_reward_func": 1.0, + "step": 9658 + }, + { + "completion_length": 236.2634048461914, + "epoch": 1.619640387275242, + "grad_norm": 0.16385341685718283, + "kl": 1.131195068359375, + "learning_rate": 4.977390273911689e-07, + "loss": 0.0011, + "reward": 1.7357143759727478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 9660 + }, + { + "completion_length": 234.70983123779297, + "epoch": 1.6199756905151097, + "grad_norm": 0.28091750128202797, + "kl": 1.291107177734375, + "learning_rate": 4.97737417438646e-07, + "loss": 0.0013, + "reward": 1.6535715013742447, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.6625000461935997, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9662 + }, + { + "completion_length": 220.4017972946167, + "epoch": 1.6203109937549771, + "grad_norm": 0.33978137394630953, + "kl": 0.361083984375, + "learning_rate": 4.977358069157395e-07, + "loss": 0.0004, + "reward": 1.7714286372065544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 9664 + }, + { + "completion_length": 229.77233219146729, + "epoch": 1.6206462969948447, + "grad_norm": 0.6532627671050593, + "kl": 0.758575439453125, + "learning_rate": 4.977341958224535e-07, + "loss": 0.0008, + "reward": 1.725000075995922, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339286003261805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9666 + }, + { + "completion_length": 228.43751049041748, + "epoch": 1.6209816002347122, + "grad_norm": 0.18385989770126193, + "kl": 0.35162353515625, + "learning_rate": 4.977325841587914e-07, + "loss": 0.0004, + "reward": 1.7808036357164383, + "reward_std": 0.037249373737722635, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 0.9986607171595097, + "step": 9668 + }, + { + "completion_length": 237.67411518096924, + "epoch": 1.6213169034745798, + "grad_norm": 0.24482785614371022, + "kl": 0.98797607421875, + "learning_rate": 4.97730971924757e-07, + "loss": 0.001, + "reward": 1.7500000447034836, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7589286230504513, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9670 + }, + { + "completion_length": 224.8750114440918, + "epoch": 1.6216522067144474, + "grad_norm": 0.4871330431732284, + "kl": 0.494415283203125, + "learning_rate": 4.977293591203542e-07, + "loss": 0.0005, + "reward": 1.80892863124609, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9672 + }, + { + "completion_length": 222.71875858306885, + "epoch": 1.621987509954315, + "grad_norm": 0.14262834413088796, + "kl": 0.99700927734375, + "learning_rate": 4.977277457455865e-07, + "loss": 0.001, + "reward": 1.725000075995922, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 9674 + }, + { + "completion_length": 223.46876049041748, + "epoch": 1.6223228131941825, + "grad_norm": 0.34430265186672526, + "kl": 0.10821533203125, + "learning_rate": 4.977261318004576e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 9676 + }, + { + "completion_length": 232.05804634094238, + "epoch": 1.62265811643405, + "grad_norm": 0.20829203027319398, + "kl": 0.208831787109375, + "learning_rate": 4.977245172849714e-07, + "loss": 0.0002, + "reward": 1.764285795390606, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 9678 + }, + { + "completion_length": 227.8616180419922, + "epoch": 1.6229934196739175, + "grad_norm": 0.2671142585139483, + "kl": 0.1011962890625, + "learning_rate": 4.977229021991315e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9680 + }, + { + "completion_length": 230.9955472946167, + "epoch": 1.6233287229137852, + "grad_norm": 0.2495018380150175, + "kl": 0.2869873046875, + "learning_rate": 4.977212865429416e-07, + "loss": 0.0003, + "reward": 1.778571493923664, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 9682 + }, + { + "completion_length": 219.17411613464355, + "epoch": 1.6236640261536528, + "grad_norm": 0.11882449677412509, + "kl": 0.189483642578125, + "learning_rate": 4.977196703164055e-07, + "loss": 0.0002, + "reward": 1.7446429282426834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071633994579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9684 + }, + { + "completion_length": 220.46429443359375, + "epoch": 1.6239993293935204, + "grad_norm": 0.1870383835898375, + "kl": 0.11187744140625, + "learning_rate": 4.977180535195268e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 9686 + }, + { + "completion_length": 225.55358219146729, + "epoch": 1.6243346326333878, + "grad_norm": 0.19740042234481134, + "kl": 0.11981201171875, + "learning_rate": 4.977164361523093e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 9688 + }, + { + "completion_length": 224.74554538726807, + "epoch": 1.6246699358732553, + "grad_norm": 0.2529255736949469, + "kl": 0.167327880859375, + "learning_rate": 4.977148182147567e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857268452644, + "rewards/format_reward_func": 1.0, + "step": 9690 + }, + { + "completion_length": 231.6696538925171, + "epoch": 1.625005239113123, + "grad_norm": 0.1329825256542032, + "kl": 0.29461669921875, + "learning_rate": 4.977131997068729e-07, + "loss": 0.0003, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 9692 + }, + { + "completion_length": 218.3125114440918, + "epoch": 1.6253405423529905, + "grad_norm": 0.0972662980687718, + "kl": 0.149688720703125, + "learning_rate": 4.977115806286613e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 9694 + }, + { + "completion_length": 217.821439743042, + "epoch": 1.6256758455928582, + "grad_norm": 0.3084129310617149, + "kl": 0.125244140625, + "learning_rate": 4.977099609801259e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.781250037252903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9696 + }, + { + "completion_length": 221.05358123779297, + "epoch": 1.6260111488327256, + "grad_norm": 0.2154034072950012, + "kl": 0.092742919921875, + "learning_rate": 4.977083407612702e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 9698 + }, + { + "completion_length": 222.59822368621826, + "epoch": 1.626346452072593, + "grad_norm": 0.1480951218735697, + "kl": 0.094696044921875, + "learning_rate": 4.977067199720981e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 9700 + }, + { + "completion_length": 225.29465293884277, + "epoch": 1.6266817553124606, + "grad_norm": 0.25106638423487115, + "kl": 0.103851318359375, + "learning_rate": 4.977050986126134e-07, + "loss": 0.0001, + "reward": 1.764285795390606, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 9702 + }, + { + "completion_length": 220.85268783569336, + "epoch": 1.6270170585523283, + "grad_norm": 0.2751909383121962, + "kl": 0.1004638671875, + "learning_rate": 4.977034766828195e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 9704 + }, + { + "completion_length": 222.95983028411865, + "epoch": 1.627352361792196, + "grad_norm": 0.34442896985833393, + "kl": 0.0963592529296875, + "learning_rate": 4.977018541827206e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 9706 + }, + { + "completion_length": 232.1071538925171, + "epoch": 1.6276876650320635, + "grad_norm": 0.1972980182956902, + "kl": 0.1473541259765625, + "learning_rate": 4.977002311123201e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 9708 + }, + { + "completion_length": 232.39286613464355, + "epoch": 1.628022968271931, + "grad_norm": 0.18802488475716975, + "kl": 0.1038055419921875, + "learning_rate": 4.976986074716217e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 9710 + }, + { + "completion_length": 227.821439743042, + "epoch": 1.6283582715117983, + "grad_norm": 0.14353477227332534, + "kl": 0.1986083984375, + "learning_rate": 4.976969832606295e-07, + "loss": 0.0002, + "reward": 1.776785783469677, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500186264515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9712 + }, + { + "completion_length": 223.3616180419922, + "epoch": 1.628693574751666, + "grad_norm": 0.002946186467856951, + "kl": 0.0805816650390625, + "learning_rate": 4.976953584793469e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 9714 + }, + { + "completion_length": 221.93750858306885, + "epoch": 1.6290288779915336, + "grad_norm": 0.2969509883521366, + "kl": 0.1015625, + "learning_rate": 4.976937331277777e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143007218838, + "rewards/format_reward_func": 1.0, + "step": 9716 + }, + { + "completion_length": 228.8884038925171, + "epoch": 1.6293641812314013, + "grad_norm": 0.18499841231582298, + "kl": 0.1175689697265625, + "learning_rate": 4.976921072059256e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.800000037997961, + "rewards/format_reward_func": 1.0, + "step": 9718 + }, + { + "completion_length": 235.00001049041748, + "epoch": 1.6296994844712687, + "grad_norm": 0.14542382371891566, + "kl": 0.104278564453125, + "learning_rate": 4.976904807137947e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 9720 + }, + { + "completion_length": 235.32143878936768, + "epoch": 1.6300347877111363, + "grad_norm": 0.23501990256504454, + "kl": 0.1071014404296875, + "learning_rate": 4.976888536513883e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 9722 + }, + { + "completion_length": 230.77232933044434, + "epoch": 1.6303700909510037, + "grad_norm": 0.22814759709441632, + "kl": 0.094635009765625, + "learning_rate": 4.976872260187104e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 9724 + }, + { + "completion_length": 228.00447463989258, + "epoch": 1.6307053941908713, + "grad_norm": 0.31272582817111516, + "kl": 0.10003662109375, + "learning_rate": 4.976855978157646e-07, + "loss": 0.0001, + "reward": 1.741964340209961, + "reward_std": 0.09217641782015562, + "rewards/equation_reward_func": 0.7482143230736256, + "rewards/format_reward_func": 0.9937500059604645, + "step": 9726 + }, + { + "completion_length": 225.5000114440918, + "epoch": 1.631040697430739, + "grad_norm": 0.23990043104025616, + "kl": 0.333709716796875, + "learning_rate": 4.976839690425547e-07, + "loss": 0.0003, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 9728 + }, + { + "completion_length": 226.41072368621826, + "epoch": 1.6313760006706066, + "grad_norm": 0.20449317919171378, + "kl": 0.1821441650390625, + "learning_rate": 4.976823396990845e-07, + "loss": 0.0002, + "reward": 1.7839286178350449, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7883928790688515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9730 + }, + { + "completion_length": 224.50447273254395, + "epoch": 1.631711303910474, + "grad_norm": 0.13556935221852442, + "kl": 0.160125732421875, + "learning_rate": 4.976807097853577e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 9732 + }, + { + "completion_length": 229.99554634094238, + "epoch": 1.6320466071503414, + "grad_norm": 0.24666051030834293, + "kl": 0.097808837890625, + "learning_rate": 4.976790793013781e-07, + "loss": 0.0001, + "reward": 1.7339286357164383, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9734 + }, + { + "completion_length": 221.28572463989258, + "epoch": 1.632381910390209, + "grad_norm": 0.2695799750218758, + "kl": 0.0812835693359375, + "learning_rate": 4.976774482471494e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9736 + }, + { + "completion_length": 226.5357255935669, + "epoch": 1.6327172136300767, + "grad_norm": 0.301846569929245, + "kl": 0.2186279296875, + "learning_rate": 4.976758166226755e-07, + "loss": 0.0002, + "reward": 1.741071492433548, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9738 + }, + { + "completion_length": 222.57590198516846, + "epoch": 1.6330525168699443, + "grad_norm": 0.17720910829776573, + "kl": 0.168731689453125, + "learning_rate": 4.9767418442796e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 9740 + }, + { + "completion_length": 227.6696548461914, + "epoch": 1.6333878201098118, + "grad_norm": 0.23504752245686839, + "kl": 0.344696044921875, + "learning_rate": 4.976725516630065e-07, + "loss": 0.0003, + "reward": 1.8250000551342964, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 9742 + }, + { + "completion_length": 219.54465293884277, + "epoch": 1.6337231233496794, + "grad_norm": 0.26755522942864296, + "kl": 0.0870819091796875, + "learning_rate": 4.976709183278192e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 9744 + }, + { + "completion_length": 223.22768878936768, + "epoch": 1.6340584265895468, + "grad_norm": 0.3647382535741721, + "kl": 0.13238525390625, + "learning_rate": 4.976692844224014e-07, + "loss": 0.0001, + "reward": 1.7178572416305542, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7178571820259094, + "rewards/format_reward_func": 1.0, + "step": 9746 + }, + { + "completion_length": 222.90179538726807, + "epoch": 1.6343937298294144, + "grad_norm": 0.23603635934814576, + "kl": 0.084197998046875, + "learning_rate": 4.976676499467573e-07, + "loss": 0.0001, + "reward": 1.7125000730156898, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7169642969965935, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9748 + }, + { + "completion_length": 223.4776906967163, + "epoch": 1.634729033069282, + "grad_norm": 0.17505501443511048, + "kl": 0.0858306884765625, + "learning_rate": 4.976660149008903e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 9750 + }, + { + "completion_length": 226.9910831451416, + "epoch": 1.6350643363091497, + "grad_norm": 0.16053011603624132, + "kl": 0.1113739013671875, + "learning_rate": 4.976643792848043e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 9752 + }, + { + "completion_length": 229.87054634094238, + "epoch": 1.6353996395490171, + "grad_norm": 0.20286619977289153, + "kl": 0.1265869140625, + "learning_rate": 4.976627430985031e-07, + "loss": 0.0001, + "reward": 1.7232143357396126, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7276786062866449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9754 + }, + { + "completion_length": 229.2366180419922, + "epoch": 1.6357349427888845, + "grad_norm": 0.2432832648509416, + "kl": 0.539398193359375, + "learning_rate": 4.976611063419906e-07, + "loss": 0.0005, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 9756 + }, + { + "completion_length": 225.43304538726807, + "epoch": 1.6360702460287522, + "grad_norm": 0.23562147245432455, + "kl": 0.093414306640625, + "learning_rate": 4.976594690152702e-07, + "loss": 0.0001, + "reward": 1.742857187986374, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7517857514321804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9758 + }, + { + "completion_length": 226.06697368621826, + "epoch": 1.6364055492686198, + "grad_norm": 0.2776689340708705, + "kl": 0.14593505859375, + "learning_rate": 4.976578311183459e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 9760 + }, + { + "completion_length": 229.14732933044434, + "epoch": 1.6367408525084874, + "grad_norm": 0.2487283953788348, + "kl": 0.4114837646484375, + "learning_rate": 4.976561926512215e-07, + "loss": 0.0004, + "reward": 1.7625000476837158, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.7669643219560385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9762 + }, + { + "completion_length": 231.17411708831787, + "epoch": 1.637076155748355, + "grad_norm": 0.2730592489396775, + "kl": 0.090362548828125, + "learning_rate": 4.976545536139007e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 9764 + }, + { + "completion_length": 224.81250858306885, + "epoch": 1.6374114589882225, + "grad_norm": 0.2095179396768771, + "kl": 0.0867919921875, + "learning_rate": 4.976529140063874e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 9766 + }, + { + "completion_length": 239.69197463989258, + "epoch": 1.63774676222809, + "grad_norm": 0.4736853723651829, + "kl": 0.27679443359375, + "learning_rate": 4.976512738286851e-07, + "loss": 0.0003, + "reward": 1.7535715028643608, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7625000402331352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9768 + }, + { + "completion_length": 231.71429634094238, + "epoch": 1.6380820654679575, + "grad_norm": 0.12591280961738255, + "kl": 0.321502685546875, + "learning_rate": 4.976496330807978e-07, + "loss": 0.0003, + "reward": 1.8214286118745804, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.8303571566939354, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9770 + }, + { + "completion_length": 231.53126049041748, + "epoch": 1.6384173687078252, + "grad_norm": 0.23487225655647087, + "kl": 0.55450439453125, + "learning_rate": 4.976479917627292e-07, + "loss": 0.0006, + "reward": 1.7683036401867867, + "reward_std": 0.05492704268544912, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9941964335739613, + "step": 9772 + }, + { + "completion_length": 226.21429634094238, + "epoch": 1.6387526719476928, + "grad_norm": 0.26357185990527765, + "kl": 0.21807861328125, + "learning_rate": 4.976463498744832e-07, + "loss": 0.0002, + "reward": 1.7232143729925156, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7276786081492901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9774 + }, + { + "completion_length": 231.37501335144043, + "epoch": 1.6390879751875602, + "grad_norm": 0.2148319851533725, + "kl": 0.1714019775390625, + "learning_rate": 4.976447074160634e-07, + "loss": 0.0002, + "reward": 1.7946429327130318, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7991071604192257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9776 + }, + { + "completion_length": 234.99108123779297, + "epoch": 1.6394232784274279, + "grad_norm": 0.0844546691876832, + "kl": 0.27728271484375, + "learning_rate": 4.976430643874737e-07, + "loss": 0.0003, + "reward": 1.7178572192788124, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 9778 + }, + { + "completion_length": 239.3303689956665, + "epoch": 1.6397585816672953, + "grad_norm": 0.35980099021948647, + "kl": 0.845733642578125, + "learning_rate": 4.976414207887178e-07, + "loss": 0.0008, + "reward": 1.7589286416769028, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7633928991854191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9780 + }, + { + "completion_length": 231.68304634094238, + "epoch": 1.640093884907163, + "grad_norm": 0.16832908265496788, + "kl": 0.0909423828125, + "learning_rate": 4.976397766197996e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 9782 + }, + { + "completion_length": 231.8839406967163, + "epoch": 1.6404291881470305, + "grad_norm": 0.2077517323854166, + "kl": 0.097900390625, + "learning_rate": 4.976381318807228e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 9784 + }, + { + "completion_length": 235.29019165039062, + "epoch": 1.6407644913868982, + "grad_norm": 0.1573480194081336, + "kl": 0.0998077392578125, + "learning_rate": 4.976364865714911e-07, + "loss": 0.0001, + "reward": 1.8017857670783997, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500394880772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9786 + }, + { + "completion_length": 247.6250114440918, + "epoch": 1.6410997946267656, + "grad_norm": 0.24239820048925825, + "kl": 0.5035400390625, + "learning_rate": 4.976348406921085e-07, + "loss": 0.0005, + "reward": 1.7767857685685158, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9788 + }, + { + "completion_length": 246.48215579986572, + "epoch": 1.641435097866633, + "grad_norm": 0.2469812162663503, + "kl": 0.15966796875, + "learning_rate": 4.976331942425786e-07, + "loss": 0.0002, + "reward": 1.7553572207689285, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9790 + }, + { + "completion_length": 236.31251049041748, + "epoch": 1.6417704011065006, + "grad_norm": 0.18019076628164507, + "kl": 0.4852447509765625, + "learning_rate": 4.976315472229054e-07, + "loss": 0.0005, + "reward": 1.7321429327130318, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 9792 + }, + { + "completion_length": 232.93304634094238, + "epoch": 1.6421057043463683, + "grad_norm": 0.17304451156624187, + "kl": 0.187408447265625, + "learning_rate": 4.976298996330925e-07, + "loss": 0.0002, + "reward": 1.7767858058214188, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9794 + }, + { + "completion_length": 238.790189743042, + "epoch": 1.642441007586236, + "grad_norm": 0.7010478912700674, + "kl": 1.2734222412109375, + "learning_rate": 4.976282514731437e-07, + "loss": 0.0013, + "reward": 1.6839286386966705, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.6883929129689932, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9796 + }, + { + "completion_length": 254.2901906967163, + "epoch": 1.6427763108261033, + "grad_norm": 0.16518823323916085, + "kl": 0.342803955078125, + "learning_rate": 4.976266027430629e-07, + "loss": 0.0003, + "reward": 1.7464286386966705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.746428607031703, + "rewards/format_reward_func": 1.0, + "step": 9798 + }, + { + "completion_length": 248.65625953674316, + "epoch": 1.643111614065971, + "grad_norm": 0.2258722497635101, + "kl": 0.151885986328125, + "learning_rate": 4.976249534428539e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7732143141329288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9800 + }, + { + "completion_length": 241.86161994934082, + "epoch": 1.6434469173058384, + "grad_norm": 0.21150592179344968, + "kl": 0.124969482421875, + "learning_rate": 4.976233035725203e-07, + "loss": 0.0001, + "reward": 1.7767857983708382, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500186264515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9802 + }, + { + "completion_length": 239.51786613464355, + "epoch": 1.643782220545706, + "grad_norm": 0.23342208363613637, + "kl": 0.11627197265625, + "learning_rate": 4.976216531320662e-07, + "loss": 0.0001, + "reward": 1.7035715207457542, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.712500024586916, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9804 + }, + { + "completion_length": 243.0803689956665, + "epoch": 1.6441175237855736, + "grad_norm": 0.14752362469685548, + "kl": 0.207550048828125, + "learning_rate": 4.976200021214952e-07, + "loss": 0.0002, + "reward": 1.7678572088479996, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9806 + }, + { + "completion_length": 238.94197368621826, + "epoch": 1.6444528270254413, + "grad_norm": 0.23430527663124873, + "kl": 0.1428680419921875, + "learning_rate": 4.97618350540811e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9808 + }, + { + "completion_length": 239.4062614440918, + "epoch": 1.6447881302653087, + "grad_norm": 2.2863623227606498, + "kl": 0.110076904296875, + "learning_rate": 4.976166983900177e-07, + "loss": 0.0001, + "reward": 1.7339286655187607, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928783237934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9810 + }, + { + "completion_length": 236.11608123779297, + "epoch": 1.645123433505176, + "grad_norm": 0.1577864150084241, + "kl": 0.1080169677734375, + "learning_rate": 4.976150456691189e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355339758098125, + "rewards/equation_reward_func": 0.7696428932249546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9812 + }, + { + "completion_length": 229.29465293884277, + "epoch": 1.6454587367450437, + "grad_norm": 0.23221787032821553, + "kl": 0.15777587890625, + "learning_rate": 4.976133923781186e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7875000163912773, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9814 + }, + { + "completion_length": 246.70090579986572, + "epoch": 1.6457940399849114, + "grad_norm": 0.3261517870491476, + "kl": 0.225372314453125, + "learning_rate": 4.976117385170204e-07, + "loss": 0.0002, + "reward": 1.7928571924567223, + "reward_std": 0.06060915347188711, + "rewards/equation_reward_func": 0.8017857521772385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9816 + }, + { + "completion_length": 224.31250858306885, + "epoch": 1.646129343224779, + "grad_norm": 0.4089467330222451, + "kl": 0.14581298828125, + "learning_rate": 4.97610084085828e-07, + "loss": 0.0001, + "reward": 1.8500000834465027, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8500000052154064, + "rewards/format_reward_func": 1.0, + "step": 9818 + }, + { + "completion_length": 231.48661994934082, + "epoch": 1.6464646464646466, + "grad_norm": 0.24769592529913995, + "kl": 0.213653564453125, + "learning_rate": 4.976084290845455e-07, + "loss": 0.0002, + "reward": 1.7982143461704254, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8116071671247482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9820 + }, + { + "completion_length": 229.10715198516846, + "epoch": 1.646799949704514, + "grad_norm": 0.23369410087790052, + "kl": 0.16387939453125, + "learning_rate": 4.976067735131766e-07, + "loss": 0.0002, + "reward": 1.7517857626080513, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9822 + }, + { + "completion_length": 237.2142972946167, + "epoch": 1.6471352529443815, + "grad_norm": 0.16716198794533854, + "kl": 0.13714599609375, + "learning_rate": 4.976051173717251e-07, + "loss": 0.0001, + "reward": 1.7696429193019867, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9824 + }, + { + "completion_length": 235.5580472946167, + "epoch": 1.647470556184249, + "grad_norm": 0.26337107233516976, + "kl": 0.1463623046875, + "learning_rate": 4.976034606601948e-07, + "loss": 0.0001, + "reward": 1.8160714581608772, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.8294643089175224, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9826 + }, + { + "completion_length": 233.915189743042, + "epoch": 1.6478058594241167, + "grad_norm": 0.5915121442371221, + "kl": 0.158599853515625, + "learning_rate": 4.976018033785895e-07, + "loss": 0.0002, + "reward": 1.7642857879400253, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 9828 + }, + { + "completion_length": 244.946439743042, + "epoch": 1.6481411626639844, + "grad_norm": 0.22913292433146903, + "kl": 0.179351806640625, + "learning_rate": 4.976001455269129e-07, + "loss": 0.0002, + "reward": 1.7357143685221672, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7446429021656513, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9830 + }, + { + "completion_length": 234.47322368621826, + "epoch": 1.6484764659038518, + "grad_norm": 0.20752086023371275, + "kl": 0.12939453125, + "learning_rate": 4.975984871051692e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214469850063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9832 + }, + { + "completion_length": 224.13393688201904, + "epoch": 1.6488117691437192, + "grad_norm": 0.19172764728090289, + "kl": 0.1249542236328125, + "learning_rate": 4.97596828113362e-07, + "loss": 0.0001, + "reward": 1.764285758137703, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7821428868919611, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9834 + }, + { + "completion_length": 231.31697463989258, + "epoch": 1.6491470723835868, + "grad_norm": 0.2749263834024471, + "kl": 0.183258056640625, + "learning_rate": 4.97595168551495e-07, + "loss": 0.0002, + "reward": 1.7928572073578835, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.8017857410013676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9836 + }, + { + "completion_length": 238.1696548461914, + "epoch": 1.6494823756234545, + "grad_norm": 0.1818458948457541, + "kl": 0.223663330078125, + "learning_rate": 4.975935084195721e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9838 + }, + { + "completion_length": 240.446439743042, + "epoch": 1.649817678863322, + "grad_norm": 0.2215274701890657, + "kl": 0.308563232421875, + "learning_rate": 4.975918477175972e-07, + "loss": 0.0003, + "reward": 1.7482143342494965, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.977678582072258, + "step": 9840 + }, + { + "completion_length": 229.79911613464355, + "epoch": 1.6501529821031897, + "grad_norm": 0.162230038132875, + "kl": 0.106109619140625, + "learning_rate": 4.975901864455739e-07, + "loss": 0.0001, + "reward": 1.7946429178118706, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9842 + }, + { + "completion_length": 248.53126430511475, + "epoch": 1.6504882853430571, + "grad_norm": 0.32201021323163814, + "kl": 0.21990966796875, + "learning_rate": 4.975885246035064e-07, + "loss": 0.0002, + "reward": 1.7267857640981674, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.977678582072258, + "step": 9844 + }, + { + "completion_length": 246.05804824829102, + "epoch": 1.6508235885829246, + "grad_norm": 0.3693541503025461, + "kl": 0.27056884765625, + "learning_rate": 4.975868621913983e-07, + "loss": 0.0003, + "reward": 1.7410714775323868, + "reward_std": 0.08333758544176817, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.977678582072258, + "step": 9846 + }, + { + "completion_length": 246.50447845458984, + "epoch": 1.6511588918227922, + "grad_norm": 0.7884187970758376, + "kl": 0.19110107421875, + "learning_rate": 4.975851992092533e-07, + "loss": 0.0002, + "reward": 1.6910715103149414, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.7223214618861675, + "rewards/format_reward_func": 0.9687500074505806, + "step": 9848 + }, + { + "completion_length": 238.7410831451416, + "epoch": 1.6514941950626598, + "grad_norm": 0.256588565457127, + "kl": 0.242462158203125, + "learning_rate": 4.975835356570755e-07, + "loss": 0.0002, + "reward": 1.78035718947649, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.977678582072258, + "step": 9850 + }, + { + "completion_length": 237.477689743042, + "epoch": 1.6518294983025275, + "grad_norm": 0.22375889051073486, + "kl": 0.1295928955078125, + "learning_rate": 4.975818715348686e-07, + "loss": 0.0001, + "reward": 1.71785718947649, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7357143145054579, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9852 + }, + { + "completion_length": 221.727689743042, + "epoch": 1.6521648015423949, + "grad_norm": 0.2291120045991947, + "kl": 0.1092376708984375, + "learning_rate": 4.975802068426364e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643312692642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9854 + }, + { + "completion_length": 223.55804443359375, + "epoch": 1.6525001047822625, + "grad_norm": 0.26357104191843117, + "kl": 0.112152099609375, + "learning_rate": 4.975785415803828e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.7758928947150707, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9856 + }, + { + "completion_length": 221.83483219146729, + "epoch": 1.65283540802213, + "grad_norm": 0.5731160641229195, + "kl": 0.19775390625, + "learning_rate": 4.975768757481115e-07, + "loss": 0.0002, + "reward": 1.7892857491970062, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.798214316368103, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9858 + }, + { + "completion_length": 229.1562623977661, + "epoch": 1.6531707112619975, + "grad_norm": 0.2752841675439393, + "kl": 0.270050048828125, + "learning_rate": 4.975752093458266e-07, + "loss": 0.0003, + "reward": 1.7553572058677673, + "reward_std": 0.10354063659906387, + "rewards/equation_reward_func": 0.7687500193715096, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9860 + }, + { + "completion_length": 213.55358123779297, + "epoch": 1.6535060145018652, + "grad_norm": 0.3404765282706294, + "kl": 0.1291351318359375, + "learning_rate": 4.975735423735316e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9862 + }, + { + "completion_length": 221.33483028411865, + "epoch": 1.6538413177417328, + "grad_norm": 0.19012431000183802, + "kl": 0.151336669921875, + "learning_rate": 4.975718748312306e-07, + "loss": 0.0002, + "reward": 1.7892857268452644, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7982143238186836, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9864 + }, + { + "completion_length": 222.90179634094238, + "epoch": 1.6541766209816002, + "grad_norm": 0.004469077198342189, + "kl": 0.235565185546875, + "learning_rate": 4.975702067189274e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 9866 + }, + { + "completion_length": 221.87054252624512, + "epoch": 1.6545119242214676, + "grad_norm": 0.5435280581221478, + "kl": 0.3485107421875, + "learning_rate": 4.975685380366257e-07, + "loss": 0.0003, + "reward": 1.739285796880722, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 9868 + }, + { + "completion_length": 224.46429538726807, + "epoch": 1.6548472274613353, + "grad_norm": 0.20865656624230186, + "kl": 0.588104248046875, + "learning_rate": 4.975668687843295e-07, + "loss": 0.0006, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9870 + }, + { + "completion_length": 219.540189743042, + "epoch": 1.655182530701203, + "grad_norm": 0.308684592760428, + "kl": 0.479217529296875, + "learning_rate": 4.975651989620425e-07, + "loss": 0.0005, + "reward": 1.7250000461935997, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 9872 + }, + { + "completion_length": 219.31697463989258, + "epoch": 1.6555178339410705, + "grad_norm": 0.21206860929059224, + "kl": 0.5569000244140625, + "learning_rate": 4.975635285697687e-07, + "loss": 0.0006, + "reward": 1.853571467101574, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8535714522004128, + "rewards/format_reward_func": 1.0, + "step": 9874 + }, + { + "completion_length": 227.2321538925171, + "epoch": 1.655853137180938, + "grad_norm": 0.24630201716195602, + "kl": 0.1265411376953125, + "learning_rate": 4.975618576075119e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7312500216066837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9876 + }, + { + "completion_length": 219.62947368621826, + "epoch": 1.6561884404208056, + "grad_norm": 0.23565190769001568, + "kl": 0.22161865234375, + "learning_rate": 4.975601860752758e-07, + "loss": 0.0002, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.778571454808116, + "rewards/format_reward_func": 1.0, + "step": 9878 + }, + { + "completion_length": 220.07590198516846, + "epoch": 1.656523743660673, + "grad_norm": 0.20121385950706266, + "kl": 0.21588134765625, + "learning_rate": 4.975585139730644e-07, + "loss": 0.0002, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 9880 + }, + { + "completion_length": 229.6696538925171, + "epoch": 1.6568590469005406, + "grad_norm": 0.30524107494236097, + "kl": 0.0986175537109375, + "learning_rate": 4.975568413008816e-07, + "loss": 0.0001, + "reward": 1.7892857566475868, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 9882 + }, + { + "completion_length": 224.22322273254395, + "epoch": 1.6571943501404083, + "grad_norm": 0.3024153219313286, + "kl": 0.0926361083984375, + "learning_rate": 4.97555168058731e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7419643364846706, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9884 + }, + { + "completion_length": 218.5178689956665, + "epoch": 1.657529653380276, + "grad_norm": 0.2697534224150527, + "kl": 0.094818115234375, + "learning_rate": 4.975534942466168e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 9886 + }, + { + "completion_length": 223.04911613464355, + "epoch": 1.6578649566201433, + "grad_norm": 0.1835474696257643, + "kl": 0.1168365478515625, + "learning_rate": 4.975518198645425e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 9888 + }, + { + "completion_length": 222.79018878936768, + "epoch": 1.6582002598600107, + "grad_norm": 0.1563539322514873, + "kl": 0.186553955078125, + "learning_rate": 4.975501449125122e-07, + "loss": 0.0002, + "reward": 1.8107143193483353, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 9890 + }, + { + "completion_length": 224.33483219146729, + "epoch": 1.6585355630998784, + "grad_norm": 0.29091891567336037, + "kl": 0.1243438720703125, + "learning_rate": 4.975484693905298e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 9892 + }, + { + "completion_length": 225.27233123779297, + "epoch": 1.658870866339746, + "grad_norm": 0.37451253496480214, + "kl": 0.140167236328125, + "learning_rate": 4.975467932985989e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 9894 + }, + { + "completion_length": 237.70536708831787, + "epoch": 1.6592061695796136, + "grad_norm": 0.268947938153285, + "kl": 0.9615478515625, + "learning_rate": 4.975451166367235e-07, + "loss": 0.001, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 9896 + }, + { + "completion_length": 233.5982255935669, + "epoch": 1.6595414728194813, + "grad_norm": 0.2700866753770015, + "kl": 0.112548828125, + "learning_rate": 4.975434394049075e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 9898 + }, + { + "completion_length": 233.76340198516846, + "epoch": 1.6598767760593487, + "grad_norm": 0.2781299469560648, + "kl": 0.2715911865234375, + "learning_rate": 4.975417616031547e-07, + "loss": 0.0003, + "reward": 1.7678572088479996, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 9900 + }, + { + "completion_length": 245.7500114440918, + "epoch": 1.660212079299216, + "grad_norm": 0.2604433695384531, + "kl": 0.42327880859375, + "learning_rate": 4.97540083231469e-07, + "loss": 0.0004, + "reward": 1.8285714760422707, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8285714574158192, + "rewards/format_reward_func": 1.0, + "step": 9902 + }, + { + "completion_length": 246.0937623977661, + "epoch": 1.6605473825390837, + "grad_norm": 0.523587038459449, + "kl": 0.1160888671875, + "learning_rate": 4.975384042898542e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9904 + }, + { + "completion_length": 240.57590579986572, + "epoch": 1.6608826857789514, + "grad_norm": 0.2091887597400727, + "kl": 0.1318206787109375, + "learning_rate": 4.975367247783144e-07, + "loss": 0.0001, + "reward": 1.81428574770689, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 9906 + }, + { + "completion_length": 251.88840579986572, + "epoch": 1.661217989018819, + "grad_norm": 0.38583211577731064, + "kl": 0.101470947265625, + "learning_rate": 4.97535044696853e-07, + "loss": 0.0001, + "reward": 1.7196429446339607, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7241071797907352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9908 + }, + { + "completion_length": 251.37947750091553, + "epoch": 1.6615532922586864, + "grad_norm": 0.3002960218342867, + "kl": 0.4559783935546875, + "learning_rate": 4.975333640454743e-07, + "loss": 0.0005, + "reward": 1.7357143610715866, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 9910 + }, + { + "completion_length": 240.1785831451416, + "epoch": 1.661888595498554, + "grad_norm": 0.23452481171864847, + "kl": 0.275421142578125, + "learning_rate": 4.975316828241821e-07, + "loss": 0.0003, + "reward": 1.8107143267989159, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 9912 + }, + { + "completion_length": 238.3839406967163, + "epoch": 1.6622238987384215, + "grad_norm": 0.2104834790867371, + "kl": 0.110626220703125, + "learning_rate": 4.9753000103298e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643275439739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9914 + }, + { + "completion_length": 252.7946548461914, + "epoch": 1.662559201978289, + "grad_norm": 0.2198794519534018, + "kl": 0.5018157958984375, + "learning_rate": 4.975283186718722e-07, + "loss": 0.0005, + "reward": 1.7125000730156898, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7169643174856901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9916 + }, + { + "completion_length": 253.2544755935669, + "epoch": 1.6628945052181567, + "grad_norm": 0.6512800667382872, + "kl": 1.954864501953125, + "learning_rate": 4.975266357408623e-07, + "loss": 0.002, + "reward": 1.7464286163449287, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7553571872413158, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9918 + }, + { + "completion_length": 264.151798248291, + "epoch": 1.6632298084580244, + "grad_norm": 0.19630697068622924, + "kl": 0.124298095703125, + "learning_rate": 4.975249522399544e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714786499739, + "rewards/format_reward_func": 1.0, + "step": 9920 + }, + { + "completion_length": 243.2455472946167, + "epoch": 1.6635651116978918, + "grad_norm": 0.3169098579351197, + "kl": 0.1141357421875, + "learning_rate": 4.975232681691523e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9922 + }, + { + "completion_length": 247.95983219146729, + "epoch": 1.6639004149377592, + "grad_norm": 0.24182742790034367, + "kl": 0.21661376953125, + "learning_rate": 4.975215835284598e-07, + "loss": 0.0002, + "reward": 1.8000000640749931, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 9924 + }, + { + "completion_length": 239.7009048461914, + "epoch": 1.6642357181776268, + "grad_norm": 0.007444658415270113, + "kl": 0.1461181640625, + "learning_rate": 4.975198983178808e-07, + "loss": 0.0001, + "reward": 1.7339286133646965, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9926 + }, + { + "completion_length": 245.08036708831787, + "epoch": 1.6645710214174945, + "grad_norm": 0.18782957922663335, + "kl": 0.115631103515625, + "learning_rate": 4.975182125374193e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 9928 + }, + { + "completion_length": 257.1517972946167, + "epoch": 1.664906324657362, + "grad_norm": 0.3034523369429072, + "kl": 0.4335174560546875, + "learning_rate": 4.975165261870791e-07, + "loss": 0.0004, + "reward": 1.778571479022503, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7785714417695999, + "rewards/format_reward_func": 1.0, + "step": 9930 + }, + { + "completion_length": 257.20537281036377, + "epoch": 1.6652416278972295, + "grad_norm": 0.2824380825578585, + "kl": 0.2369384765625, + "learning_rate": 4.975148392668641e-07, + "loss": 0.0002, + "reward": 1.750000074505806, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.758928619325161, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9932 + }, + { + "completion_length": 245.7232265472412, + "epoch": 1.6655769311370971, + "grad_norm": 0.20963993568996167, + "kl": 0.15716552734375, + "learning_rate": 4.975131517767782e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 9934 + }, + { + "completion_length": 254.8437623977661, + "epoch": 1.6659122343769646, + "grad_norm": 0.1708738271349833, + "kl": 0.327911376953125, + "learning_rate": 4.975114637168252e-07, + "loss": 0.0003, + "reward": 1.7428572103381157, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 9936 + }, + { + "completion_length": 251.62054634094238, + "epoch": 1.6662475376168322, + "grad_norm": 0.15899310756923962, + "kl": 0.11993408203125, + "learning_rate": 4.97509775087009e-07, + "loss": 0.0001, + "reward": 1.742857187986374, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9938 + }, + { + "completion_length": 259.2500114440918, + "epoch": 1.6665828408566998, + "grad_norm": 0.25007766641668766, + "kl": 0.113739013671875, + "learning_rate": 4.975080858873336e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9940 + }, + { + "completion_length": 253.57144260406494, + "epoch": 1.6669181440965675, + "grad_norm": 0.29124021885748613, + "kl": 0.125030517578125, + "learning_rate": 4.975063961178027e-07, + "loss": 0.0001, + "reward": 1.7982143685221672, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8026785887777805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9942 + }, + { + "completion_length": 250.4375114440918, + "epoch": 1.6672534473364349, + "grad_norm": 0.36358150917938814, + "kl": 0.12762451171875, + "learning_rate": 4.975047057784204e-07, + "loss": 0.0001, + "reward": 1.7303572222590446, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9944 + }, + { + "completion_length": 264.16965675354004, + "epoch": 1.6675887505763023, + "grad_norm": 0.27750091888726475, + "kl": 0.11468505859375, + "learning_rate": 4.975030148691905e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9946 + }, + { + "completion_length": 264.3125104904175, + "epoch": 1.66792405381617, + "grad_norm": 0.26333116652019245, + "kl": 0.13232421875, + "learning_rate": 4.975013233901169e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 9948 + }, + { + "completion_length": 251.47769260406494, + "epoch": 1.6682593570560376, + "grad_norm": 0.21635846864664374, + "kl": 0.112518310546875, + "learning_rate": 4.974996313412034e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 9950 + }, + { + "completion_length": 252.16072368621826, + "epoch": 1.6685946602959052, + "grad_norm": 0.1487337396106638, + "kl": 0.1123046875, + "learning_rate": 4.974979387224541e-07, + "loss": 0.0001, + "reward": 1.7178571969270706, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9952 + }, + { + "completion_length": 262.29465675354004, + "epoch": 1.6689299635357726, + "grad_norm": 0.13238518388734732, + "kl": 0.10601806640625, + "learning_rate": 4.974962455338728e-07, + "loss": 0.0001, + "reward": 1.7946429029107094, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9954 + }, + { + "completion_length": 266.8928689956665, + "epoch": 1.6692652667756402, + "grad_norm": 0.1476093870816, + "kl": 0.1008758544921875, + "learning_rate": 4.974945517754633e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7508928906172514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 9956 + }, + { + "completion_length": 252.73661994934082, + "epoch": 1.6696005700155077, + "grad_norm": 0.6375593922999553, + "kl": 0.1016998291015625, + "learning_rate": 4.974928574472296e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928753435612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9958 + }, + { + "completion_length": 262.15626335144043, + "epoch": 1.6699358732553753, + "grad_norm": 0.1283348204099573, + "kl": 0.1146392822265625, + "learning_rate": 4.974911625491755e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357629805803, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9960 + }, + { + "completion_length": 264.4419775009155, + "epoch": 1.670271176495243, + "grad_norm": 0.15103147121960364, + "kl": 0.107666015625, + "learning_rate": 4.974894670813051e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 9962 + }, + { + "completion_length": 255.56697273254395, + "epoch": 1.6706064797351106, + "grad_norm": 0.4432214758021507, + "kl": 0.110931396484375, + "learning_rate": 4.974877710436222e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 0.9821428656578064, + "step": 9964 + }, + { + "completion_length": 253.93751049041748, + "epoch": 1.670941782974978, + "grad_norm": 0.1251880983482693, + "kl": 0.114776611328125, + "learning_rate": 4.974860744361306e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 9966 + }, + { + "completion_length": 252.72769451141357, + "epoch": 1.6712770862148454, + "grad_norm": 0.21358964680499068, + "kl": 0.11474609375, + "learning_rate": 4.974843772588343e-07, + "loss": 0.0001, + "reward": 1.7910714894533157, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9968 + }, + { + "completion_length": 250.03572177886963, + "epoch": 1.671612389454713, + "grad_norm": 0.17397521385257672, + "kl": 0.102630615234375, + "learning_rate": 4.974826795117371e-07, + "loss": 0.0001, + "reward": 1.733928605914116, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7383928969502449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9970 + }, + { + "completion_length": 239.4910831451416, + "epoch": 1.6719476926945807, + "grad_norm": 0.29886153992095205, + "kl": 0.106719970703125, + "learning_rate": 4.974809811948432e-07, + "loss": 0.0001, + "reward": 1.7107143700122833, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143215835094, + "rewards/format_reward_func": 1.0, + "step": 9972 + }, + { + "completion_length": 244.91072463989258, + "epoch": 1.6722829959344483, + "grad_norm": 0.2621951499898351, + "kl": 0.11273193359375, + "learning_rate": 4.974792823081563e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 9974 + }, + { + "completion_length": 235.9375123977661, + "epoch": 1.672618299174316, + "grad_norm": 0.1754438312521652, + "kl": 0.0966033935546875, + "learning_rate": 4.974775828516803e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 9976 + }, + { + "completion_length": 252.69644355773926, + "epoch": 1.6729536024141833, + "grad_norm": 0.19395800186925502, + "kl": 0.168914794921875, + "learning_rate": 4.974758828254192e-07, + "loss": 0.0002, + "reward": 1.8214286118745804, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 1.0, + "step": 9978 + }, + { + "completion_length": 246.8973331451416, + "epoch": 1.6732889056540508, + "grad_norm": 0.2813224848583, + "kl": 0.138519287109375, + "learning_rate": 4.974741822293768e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7839285880327225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 9980 + }, + { + "completion_length": 253.12054824829102, + "epoch": 1.6736242088939184, + "grad_norm": 0.14022046326230286, + "kl": 0.106109619140625, + "learning_rate": 4.97472481063557e-07, + "loss": 0.0001, + "reward": 1.746428668498993, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 9982 + }, + { + "completion_length": 247.61608409881592, + "epoch": 1.673959512133786, + "grad_norm": 0.1908507372333823, + "kl": 0.1064453125, + "learning_rate": 4.974707793279638e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 9984 + }, + { + "completion_length": 258.3660831451416, + "epoch": 1.6742948153736537, + "grad_norm": 0.19304693534597053, + "kl": 0.1163177490234375, + "learning_rate": 4.974690770226012e-07, + "loss": 0.0001, + "reward": 1.6821429282426834, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6821428928524256, + "rewards/format_reward_func": 1.0, + "step": 9986 + }, + { + "completion_length": 246.45090579986572, + "epoch": 1.674630118613521, + "grad_norm": 0.1695020044930438, + "kl": 0.12274169921875, + "learning_rate": 4.974673741474731e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9988 + }, + { + "completion_length": 241.7544765472412, + "epoch": 1.6749654218533887, + "grad_norm": 0.00708512969713439, + "kl": 0.116058349609375, + "learning_rate": 4.974656707025832e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 9990 + }, + { + "completion_length": 252.5491189956665, + "epoch": 1.6753007250932561, + "grad_norm": 0.09447739873956, + "kl": 0.15313720703125, + "learning_rate": 4.974639666879356e-07, + "loss": 0.0002, + "reward": 1.7267857789993286, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7312500439584255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9992 + }, + { + "completion_length": 239.4419765472412, + "epoch": 1.6756360283331238, + "grad_norm": 0.3231878362488016, + "kl": 0.1316986083984375, + "learning_rate": 4.974622621035342e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 9994 + }, + { + "completion_length": 241.5759038925171, + "epoch": 1.6759713315729914, + "grad_norm": 0.14058138538907927, + "kl": 0.0970916748046875, + "learning_rate": 4.97460556949383e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286163449287, + "rewards/format_reward_func": 1.0, + "step": 9996 + }, + { + "completion_length": 253.74108505249023, + "epoch": 1.676306634812859, + "grad_norm": 0.22176562412261744, + "kl": 0.138458251953125, + "learning_rate": 4.974588512254858e-07, + "loss": 0.0001, + "reward": 1.8107143715023994, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 9998 + }, + { + "completion_length": 248.26340675354004, + "epoch": 1.6766419380527264, + "grad_norm": 0.2757893418813273, + "kl": 0.1244964599609375, + "learning_rate": 4.974571449318465e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 10000 + }, + { + "completion_length": 257.0134029388428, + "epoch": 1.6769772412925938, + "grad_norm": 0.686820231753818, + "kl": 0.2299652099609375, + "learning_rate": 4.974554380684692e-07, + "loss": 0.0002, + "reward": 1.7446429431438446, + "reward_std": 0.09848987217992544, + "rewards/equation_reward_func": 0.7580357417464256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10002 + }, + { + "completion_length": 254.4687623977661, + "epoch": 1.6773125445324615, + "grad_norm": 0.3286740146450279, + "kl": 0.186676025390625, + "learning_rate": 4.974537306353577e-07, + "loss": 0.0002, + "reward": 1.798214353621006, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026785887777805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10004 + }, + { + "completion_length": 251.5714406967163, + "epoch": 1.6776478477723291, + "grad_norm": 0.13520624126222636, + "kl": 0.3016204833984375, + "learning_rate": 4.97452022632516e-07, + "loss": 0.0003, + "reward": 1.7482143491506577, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10006 + }, + { + "completion_length": 258.53125953674316, + "epoch": 1.6779831510121967, + "grad_norm": 0.15517144788447346, + "kl": 0.12713623046875, + "learning_rate": 4.97450314059948e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 10008 + }, + { + "completion_length": 249.79465579986572, + "epoch": 1.6783184542520642, + "grad_norm": 0.25555380520299875, + "kl": 0.12371826171875, + "learning_rate": 4.974486049176575e-07, + "loss": 0.0001, + "reward": 1.7071429342031479, + "reward_std": 0.06060915347188711, + "rewards/equation_reward_func": 0.7160714641213417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10010 + }, + { + "completion_length": 256.10268688201904, + "epoch": 1.6786537574919318, + "grad_norm": 0.11627685203612302, + "kl": 0.11810302734375, + "learning_rate": 4.974468952056487e-07, + "loss": 0.0001, + "reward": 1.7196429446339607, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071686148643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10012 + }, + { + "completion_length": 253.6428680419922, + "epoch": 1.6789890607317992, + "grad_norm": 0.20586557717180662, + "kl": 0.1288604736328125, + "learning_rate": 4.974451849239253e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7589285895228386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10014 + }, + { + "completion_length": 250.83483123779297, + "epoch": 1.6793243639716668, + "grad_norm": 0.1705164777603785, + "kl": 0.10638427734375, + "learning_rate": 4.974434740724915e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500331550837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10016 + }, + { + "completion_length": 258.96875953674316, + "epoch": 1.6796596672115345, + "grad_norm": 0.12622978400779727, + "kl": 0.180267333984375, + "learning_rate": 4.974417626513509e-07, + "loss": 0.0002, + "reward": 1.8214286118745804, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 10018 + }, + { + "completion_length": 262.65626525878906, + "epoch": 1.6799949704514021, + "grad_norm": 0.260643559753684, + "kl": 0.163970947265625, + "learning_rate": 4.974400506605077e-07, + "loss": 0.0002, + "reward": 1.7375000938773155, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10020 + }, + { + "completion_length": 264.2321538925171, + "epoch": 1.6803302736912695, + "grad_norm": 0.22700129756442697, + "kl": 0.22357177734375, + "learning_rate": 4.974383380999657e-07, + "loss": 0.0002, + "reward": 1.7482143640518188, + "reward_std": 0.042931484058499336, + "rewards/equation_reward_func": 0.7616071701049805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10022 + }, + { + "completion_length": 260.0178699493408, + "epoch": 1.680665576931137, + "grad_norm": 0.6076334386049436, + "kl": 0.208984375, + "learning_rate": 4.97436624969729e-07, + "loss": 0.0002, + "reward": 1.7589286342263222, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7723214589059353, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10024 + }, + { + "completion_length": 272.1696557998657, + "epoch": 1.6810008801710046, + "grad_norm": 0.1626425044654749, + "kl": 0.129058837890625, + "learning_rate": 4.974349112698014e-07, + "loss": 0.0001, + "reward": 1.7910714745521545, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8044643066823483, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10026 + }, + { + "completion_length": 258.67412090301514, + "epoch": 1.6813361834108722, + "grad_norm": 0.2425983207701506, + "kl": 0.142913818359375, + "learning_rate": 4.974331970001869e-07, + "loss": 0.0001, + "reward": 1.791071504354477, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10028 + }, + { + "completion_length": 257.92412090301514, + "epoch": 1.6816714866507398, + "grad_norm": 0.21702457725067278, + "kl": 0.1026611328125, + "learning_rate": 4.974314821608894e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7660714462399483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10030 + }, + { + "completion_length": 248.46429634094238, + "epoch": 1.6820067898906075, + "grad_norm": 0.30085577116212653, + "kl": 0.1477813720703125, + "learning_rate": 4.974297667519129e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7625000290572643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10032 + }, + { + "completion_length": 257.2589416503906, + "epoch": 1.682342093130475, + "grad_norm": 0.06563080695637653, + "kl": 0.148345947265625, + "learning_rate": 4.974280507732613e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10034 + }, + { + "completion_length": 261.8884048461914, + "epoch": 1.6826773963703423, + "grad_norm": 0.3108712427672666, + "kl": 0.203338623046875, + "learning_rate": 4.974263342249387e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7625000160187483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10036 + }, + { + "completion_length": 266.7187623977661, + "epoch": 1.68301269961021, + "grad_norm": 0.39533845320926486, + "kl": 0.125885009765625, + "learning_rate": 4.974246171069489e-07, + "loss": 0.0001, + "reward": 1.7000000700354576, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7089286036789417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10038 + }, + { + "completion_length": 246.85715198516846, + "epoch": 1.6833480028500776, + "grad_norm": 0.24514446508009619, + "kl": 0.11993408203125, + "learning_rate": 4.974228994192959e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10040 + }, + { + "completion_length": 266.3794765472412, + "epoch": 1.6836833060899452, + "grad_norm": 0.21864629994207177, + "kl": 0.13507080078125, + "learning_rate": 4.974211811619836e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 10042 + }, + { + "completion_length": 262.80358123779297, + "epoch": 1.6840186093298126, + "grad_norm": 0.17710018125664703, + "kl": 0.136474609375, + "learning_rate": 4.97419462335016e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10044 + }, + { + "completion_length": 249.85715293884277, + "epoch": 1.6843539125696803, + "grad_norm": 0.32386004346591574, + "kl": 0.110443115234375, + "learning_rate": 4.974177429383971e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 10046 + }, + { + "completion_length": 260.33483505249023, + "epoch": 1.6846892158095477, + "grad_norm": 0.2952786587131419, + "kl": 0.142913818359375, + "learning_rate": 4.974160229721308e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.09848987124860287, + "rewards/equation_reward_func": 0.7437500320374966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10048 + }, + { + "completion_length": 264.16519260406494, + "epoch": 1.6850245190494153, + "grad_norm": 0.4340353985467339, + "kl": 0.12249755859375, + "learning_rate": 4.974143024362211e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 10050 + }, + { + "completion_length": 245.21875953674316, + "epoch": 1.685359822289283, + "grad_norm": 0.41833085967800654, + "kl": 0.1136474609375, + "learning_rate": 4.974125813306719e-07, + "loss": 0.0001, + "reward": 1.6964286416769028, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10052 + }, + { + "completion_length": 252.24554634094238, + "epoch": 1.6856951255291506, + "grad_norm": 0.3466336130381244, + "kl": 0.116119384765625, + "learning_rate": 4.974108596554872e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857514321804, + "rewards/format_reward_func": 1.0, + "step": 10054 + }, + { + "completion_length": 252.09822368621826, + "epoch": 1.686030428769018, + "grad_norm": 0.2296487564634485, + "kl": 0.110504150390625, + "learning_rate": 4.97409137410671e-07, + "loss": 0.0001, + "reward": 1.7446429207921028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071727126837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10056 + }, + { + "completion_length": 244.94643783569336, + "epoch": 1.6863657320088854, + "grad_norm": 0.15632409057849708, + "kl": 0.120025634765625, + "learning_rate": 4.974074145962272e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 10058 + }, + { + "completion_length": 243.4330472946167, + "epoch": 1.686701035248753, + "grad_norm": 0.2494711951314461, + "kl": 0.131256103515625, + "learning_rate": 4.974056912121599e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 10060 + }, + { + "completion_length": 251.37054634094238, + "epoch": 1.6870363384886207, + "grad_norm": 0.23723346999630718, + "kl": 0.111907958984375, + "learning_rate": 4.974039672584729e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 10062 + }, + { + "completion_length": 263.2991189956665, + "epoch": 1.6873716417284883, + "grad_norm": 0.24109290815217851, + "kl": 0.156463623046875, + "learning_rate": 4.974022427351703e-07, + "loss": 0.0002, + "reward": 1.7267857864499092, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7312500402331352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10064 + }, + { + "completion_length": 248.97322463989258, + "epoch": 1.6877069449683557, + "grad_norm": 0.24947769782804471, + "kl": 0.12640380859375, + "learning_rate": 4.974005176422559e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 10066 + }, + { + "completion_length": 245.2946538925171, + "epoch": 1.6880422482082234, + "grad_norm": 0.40610948081296316, + "kl": 0.125946044921875, + "learning_rate": 4.973987919797337e-07, + "loss": 0.0001, + "reward": 1.8107143640518188, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.8196428716182709, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10068 + }, + { + "completion_length": 257.37947273254395, + "epoch": 1.6883775514480908, + "grad_norm": 0.12776086661506092, + "kl": 0.151214599609375, + "learning_rate": 4.973970657476079e-07, + "loss": 0.0002, + "reward": 1.7875000685453415, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10070 + }, + { + "completion_length": 231.50893878936768, + "epoch": 1.6887128546879584, + "grad_norm": 0.1613835712573395, + "kl": 0.0987548828125, + "learning_rate": 4.973953389458824e-07, + "loss": 0.0001, + "reward": 1.8392857760190964, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.839285746216774, + "rewards/format_reward_func": 1.0, + "step": 10072 + }, + { + "completion_length": 250.8750114440918, + "epoch": 1.689048157927826, + "grad_norm": 0.21526186107713036, + "kl": 0.144287109375, + "learning_rate": 4.97393611574561e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 10074 + }, + { + "completion_length": 242.64733505249023, + "epoch": 1.6893834611676937, + "grad_norm": 0.14668393885929962, + "kl": 0.16693115234375, + "learning_rate": 4.973918836336478e-07, + "loss": 0.0002, + "reward": 1.7892857566475868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 10076 + }, + { + "completion_length": 251.165189743042, + "epoch": 1.689718764407561, + "grad_norm": 0.6243773833866528, + "kl": 0.200439453125, + "learning_rate": 4.973901551231467e-07, + "loss": 0.0002, + "reward": 1.7625000774860382, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10078 + }, + { + "completion_length": 243.98215293884277, + "epoch": 1.6900540676474285, + "grad_norm": 0.158549076583998, + "kl": 0.1374359130859375, + "learning_rate": 4.973884260430617e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10080 + }, + { + "completion_length": 250.665189743042, + "epoch": 1.6903893708872961, + "grad_norm": 0.17691845376375445, + "kl": 0.2376861572265625, + "learning_rate": 4.97386696393397e-07, + "loss": 0.0002, + "reward": 1.8000000715255737, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 10082 + }, + { + "completion_length": 248.63840293884277, + "epoch": 1.6907246741271638, + "grad_norm": 0.20282061468046836, + "kl": 0.1807861328125, + "learning_rate": 4.973849661741563e-07, + "loss": 0.0002, + "reward": 1.7732143700122833, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10084 + }, + { + "completion_length": 244.196439743042, + "epoch": 1.6910599773670314, + "grad_norm": 0.28045689010751607, + "kl": 0.71612548828125, + "learning_rate": 4.973832353853436e-07, + "loss": 0.0007, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143003493547, + "rewards/format_reward_func": 1.0, + "step": 10086 + }, + { + "completion_length": 242.47322750091553, + "epoch": 1.6913952806068988, + "grad_norm": 0.3193025602899345, + "kl": 0.498504638671875, + "learning_rate": 4.973815040269631e-07, + "loss": 0.0005, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 10088 + }, + { + "completion_length": 246.7142972946167, + "epoch": 1.6917305838467664, + "grad_norm": 0.23974640616671034, + "kl": 0.428253173828125, + "learning_rate": 4.973797720990186e-07, + "loss": 0.0004, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 10090 + }, + { + "completion_length": 264.4419775009155, + "epoch": 1.6920658870866339, + "grad_norm": 0.4536671668732646, + "kl": 0.81256103515625, + "learning_rate": 4.973780396015142e-07, + "loss": 0.0008, + "reward": 1.7321429178118706, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7321429029107094, + "rewards/format_reward_func": 1.0, + "step": 10092 + }, + { + "completion_length": 245.4776906967163, + "epoch": 1.6924011903265015, + "grad_norm": 0.17562989852246272, + "kl": 0.906951904296875, + "learning_rate": 4.973763065344538e-07, + "loss": 0.0009, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786081492901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10094 + }, + { + "completion_length": 254.79018688201904, + "epoch": 1.6927364935663691, + "grad_norm": 0.07665879961083437, + "kl": 0.70660400390625, + "learning_rate": 4.973745728978413e-07, + "loss": 0.0007, + "reward": 1.7892857566475868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 10096 + }, + { + "completion_length": 245.40626049041748, + "epoch": 1.6930717968062368, + "grad_norm": 0.2503366730466342, + "kl": 0.56756591796875, + "learning_rate": 4.97372838691681e-07, + "loss": 0.0006, + "reward": 1.7589286491274834, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10098 + }, + { + "completion_length": 253.65626049041748, + "epoch": 1.6934071000461042, + "grad_norm": 0.6320229532136868, + "kl": 1.1220703125, + "learning_rate": 4.973711039159765e-07, + "loss": 0.0011, + "reward": 1.7776786237955093, + "reward_std": 0.05177031829953194, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 0.9848214387893677, + "step": 10100 + }, + { + "completion_length": 245.08483219146729, + "epoch": 1.6937424032859716, + "grad_norm": 0.1919645569093751, + "kl": 0.560791015625, + "learning_rate": 4.973693685707322e-07, + "loss": 0.0006, + "reward": 1.7500000521540642, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 10102 + }, + { + "completion_length": 240.29911518096924, + "epoch": 1.6940777065258392, + "grad_norm": 0.1526633101158587, + "kl": 0.2593841552734375, + "learning_rate": 4.973676326559518e-07, + "loss": 0.0003, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285872876644, + "rewards/format_reward_func": 1.0, + "step": 10104 + }, + { + "completion_length": 243.66518878936768, + "epoch": 1.6944130097657069, + "grad_norm": 0.19763174283560406, + "kl": 0.2373809814453125, + "learning_rate": 4.973658961716394e-07, + "loss": 0.0002, + "reward": 1.7839286476373672, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928753435612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10106 + }, + { + "completion_length": 242.6651906967163, + "epoch": 1.6947483130055745, + "grad_norm": 0.18538824341398405, + "kl": 0.131744384765625, + "learning_rate": 4.973641591177991e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 10108 + }, + { + "completion_length": 242.34822273254395, + "epoch": 1.6950836162454421, + "grad_norm": 0.21636593330112164, + "kl": 0.161346435546875, + "learning_rate": 4.973624214944347e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143167406321, + "rewards/format_reward_func": 1.0, + "step": 10110 + }, + { + "completion_length": 243.72322273254395, + "epoch": 1.6954189194853095, + "grad_norm": 0.4051968109433958, + "kl": 0.2559814453125, + "learning_rate": 4.973606833015503e-07, + "loss": 0.0003, + "reward": 1.7660714983940125, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357391387224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10112 + }, + { + "completion_length": 250.1696548461914, + "epoch": 1.695754222725177, + "grad_norm": 0.1942327326972562, + "kl": 0.23480224609375, + "learning_rate": 4.973589445391497e-07, + "loss": 0.0002, + "reward": 1.778571493923664, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 10114 + }, + { + "completion_length": 231.977689743042, + "epoch": 1.6960895259650446, + "grad_norm": 0.2900110471204609, + "kl": 0.150970458984375, + "learning_rate": 4.973572052072374e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857525497675, + "rewards/format_reward_func": 1.0, + "step": 10116 + }, + { + "completion_length": 244.3303680419922, + "epoch": 1.6964248292049122, + "grad_norm": 0.23325186627488984, + "kl": 0.206512451171875, + "learning_rate": 4.973554653058169e-07, + "loss": 0.0002, + "reward": 1.7250000908970833, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 10118 + }, + { + "completion_length": 239.58483219146729, + "epoch": 1.6967601324447799, + "grad_norm": 0.1860929377205112, + "kl": 0.1572265625, + "learning_rate": 4.973537248348925e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 10120 + }, + { + "completion_length": 240.98661613464355, + "epoch": 1.6970954356846473, + "grad_norm": 0.8941528688423361, + "kl": 0.22747802734375, + "learning_rate": 4.973519837944681e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821429036557674, + "rewards/format_reward_func": 1.0, + "step": 10122 + }, + { + "completion_length": 253.20983123779297, + "epoch": 1.697430738924515, + "grad_norm": 0.46380905705084785, + "kl": 0.25750732421875, + "learning_rate": 4.973502421845476e-07, + "loss": 0.0003, + "reward": 1.710714377462864, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7196428775787354, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10124 + }, + { + "completion_length": 249.33037185668945, + "epoch": 1.6977660421643823, + "grad_norm": 0.15391532324689755, + "kl": 0.18988037109375, + "learning_rate": 4.973485000051354e-07, + "loss": 0.0002, + "reward": 1.769642911851406, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10126 + }, + { + "completion_length": 252.89287090301514, + "epoch": 1.69810134540425, + "grad_norm": 0.49648863518351344, + "kl": 0.2744140625, + "learning_rate": 4.973467572562351e-07, + "loss": 0.0003, + "reward": 1.7339286282658577, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7383929006755352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10128 + }, + { + "completion_length": 260.56251335144043, + "epoch": 1.6984366486441176, + "grad_norm": 0.23758751631952763, + "kl": 0.212677001953125, + "learning_rate": 4.973450139378508e-07, + "loss": 0.0002, + "reward": 1.7392857819795609, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7482143230736256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10130 + }, + { + "completion_length": 246.54465675354004, + "epoch": 1.6987719518839852, + "grad_norm": 0.008302879011098514, + "kl": 0.142120361328125, + "learning_rate": 4.973432700499866e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 10132 + }, + { + "completion_length": 239.49107837677002, + "epoch": 1.6991072551238526, + "grad_norm": 0.1169948662184085, + "kl": 0.385284423828125, + "learning_rate": 4.973415255926466e-07, + "loss": 0.0004, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143115252256, + "rewards/format_reward_func": 1.0, + "step": 10134 + }, + { + "completion_length": 250.99108505249023, + "epoch": 1.69944255836372, + "grad_norm": 0.43755620630028064, + "kl": 0.3654937744140625, + "learning_rate": 4.973397805658345e-07, + "loss": 0.0004, + "reward": 1.7625000849366188, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669642977416515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10136 + }, + { + "completion_length": 241.7544755935669, + "epoch": 1.6997778616035877, + "grad_norm": 0.1611913201639629, + "kl": 0.258697509765625, + "learning_rate": 4.973380349695547e-07, + "loss": 0.0003, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428824216127, + "rewards/format_reward_func": 1.0, + "step": 10138 + }, + { + "completion_length": 248.00447845458984, + "epoch": 1.7001131648434553, + "grad_norm": 0.3664824885281984, + "kl": 0.603790283203125, + "learning_rate": 4.973362888038109e-07, + "loss": 0.0006, + "reward": 1.7535714879631996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714488476515, + "rewards/format_reward_func": 1.0, + "step": 10140 + }, + { + "completion_length": 251.26787090301514, + "epoch": 1.700448468083323, + "grad_norm": 0.40636458209251985, + "kl": 0.312957763671875, + "learning_rate": 4.973345420686073e-07, + "loss": 0.0003, + "reward": 1.6928572207689285, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.6928571797907352, + "rewards/format_reward_func": 1.0, + "step": 10142 + }, + { + "completion_length": 248.34376049041748, + "epoch": 1.7007837713231904, + "grad_norm": 0.33955868228708397, + "kl": 1.279510498046875, + "learning_rate": 4.973327947639478e-07, + "loss": 0.0013, + "reward": 1.7035714983940125, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 1.0, + "step": 10144 + }, + { + "completion_length": 250.28572845458984, + "epoch": 1.701119074563058, + "grad_norm": 0.16777483507884186, + "kl": 0.157379150390625, + "learning_rate": 4.973310468898366e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 10146 + }, + { + "completion_length": 253.16965675354004, + "epoch": 1.7014543778029254, + "grad_norm": 0.19270567991823492, + "kl": 0.84698486328125, + "learning_rate": 4.973292984462777e-07, + "loss": 0.0008, + "reward": 1.7446429133415222, + "reward_std": 0.06060915347188711, + "rewards/equation_reward_func": 0.7571428958326578, + "rewards/format_reward_func": 0.987500011920929, + "step": 10148 + }, + { + "completion_length": 246.55804443359375, + "epoch": 1.701789681042793, + "grad_norm": 0.24648732219059558, + "kl": 0.365814208984375, + "learning_rate": 4.973275494332749e-07, + "loss": 0.0004, + "reward": 1.7750000730156898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 10150 + }, + { + "completion_length": 262.4017972946167, + "epoch": 1.7021249842826607, + "grad_norm": 0.36717976358266147, + "kl": 0.6148681640625, + "learning_rate": 4.973257998508325e-07, + "loss": 0.0006, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 10152 + }, + { + "completion_length": 241.30804634094238, + "epoch": 1.7024602875225283, + "grad_norm": 0.13942221880283598, + "kl": 0.142333984375, + "learning_rate": 4.973240496989543e-07, + "loss": 0.0001, + "reward": 1.7803571820259094, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10154 + }, + { + "completion_length": 239.52232933044434, + "epoch": 1.7027955907623957, + "grad_norm": 0.505066602090983, + "kl": 0.241119384765625, + "learning_rate": 4.973222989776446e-07, + "loss": 0.0002, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571864962578, + "rewards/format_reward_func": 1.0, + "step": 10156 + }, + { + "completion_length": 239.7009038925171, + "epoch": 1.7031308940022631, + "grad_norm": 0.2516769436746728, + "kl": 0.176483154296875, + "learning_rate": 4.973205476869072e-07, + "loss": 0.0002, + "reward": 1.7625000551342964, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10158 + }, + { + "completion_length": 249.70983028411865, + "epoch": 1.7034661972421308, + "grad_norm": 0.1202381463499803, + "kl": 0.1517333984375, + "learning_rate": 4.973187958267461e-07, + "loss": 0.0002, + "reward": 1.76071435213089, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143335044384, + "rewards/format_reward_func": 1.0, + "step": 10160 + }, + { + "completion_length": 237.6026906967163, + "epoch": 1.7038015004819984, + "grad_norm": 0.15176271405999694, + "kl": 0.1248779296875, + "learning_rate": 4.973170433971655e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 10162 + }, + { + "completion_length": 246.08483219146729, + "epoch": 1.704136803721866, + "grad_norm": 0.3686108597554467, + "kl": 0.121490478515625, + "learning_rate": 4.973152903981693e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 10164 + }, + { + "completion_length": 245.67858219146729, + "epoch": 1.7044721069617337, + "grad_norm": 0.22525795074098146, + "kl": 0.11285400390625, + "learning_rate": 4.973135368297617e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.8035714477300644, + "rewards/format_reward_func": 1.0, + "step": 10166 + }, + { + "completion_length": 238.79911613464355, + "epoch": 1.704807410201601, + "grad_norm": 0.1792667048858632, + "kl": 0.13275146484375, + "learning_rate": 4.973117826919467e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 10168 + }, + { + "completion_length": 237.5134038925171, + "epoch": 1.7051427134414685, + "grad_norm": 0.002530607373799404, + "kl": 0.0972442626953125, + "learning_rate": 4.973100279847281e-07, + "loss": 0.0001, + "reward": 1.8071428909897804, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.807142898440361, + "rewards/format_reward_func": 1.0, + "step": 10170 + }, + { + "completion_length": 246.03126335144043, + "epoch": 1.7054780166813361, + "grad_norm": 0.3430750903754556, + "kl": 0.111419677734375, + "learning_rate": 4.973082727081103e-07, + "loss": 0.0001, + "reward": 1.7053572237491608, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7098214738070965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10172 + }, + { + "completion_length": 245.3437623977661, + "epoch": 1.7058133199212038, + "grad_norm": 0.30599076791001545, + "kl": 0.1102294921875, + "learning_rate": 4.97306516862097e-07, + "loss": 0.0001, + "reward": 1.832142911851406, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8321428783237934, + "rewards/format_reward_func": 1.0, + "step": 10174 + }, + { + "completion_length": 243.66518878936768, + "epoch": 1.7061486231610714, + "grad_norm": 0.3329256720540618, + "kl": 0.143310546875, + "learning_rate": 4.973047604466925e-07, + "loss": 0.0001, + "reward": 1.7482143640518188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10176 + }, + { + "completion_length": 233.24554443359375, + "epoch": 1.7064839264009388, + "grad_norm": 0.2505718550948968, + "kl": 0.121307373046875, + "learning_rate": 4.973030034619007e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214286122471094, + "rewards/format_reward_func": 1.0, + "step": 10178 + }, + { + "completion_length": 240.8259048461914, + "epoch": 1.7068192296408065, + "grad_norm": 0.3175999733034826, + "kl": 0.1077423095703125, + "learning_rate": 4.973012459077257e-07, + "loss": 0.0001, + "reward": 1.70000009983778, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7000000346451998, + "rewards/format_reward_func": 1.0, + "step": 10180 + }, + { + "completion_length": 238.7589406967163, + "epoch": 1.7071545328806739, + "grad_norm": 0.8609321651175816, + "kl": 0.1385498046875, + "learning_rate": 4.972994877841715e-07, + "loss": 0.0001, + "reward": 1.7767857611179352, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500409781933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10182 + }, + { + "completion_length": 242.0625114440918, + "epoch": 1.7074898361205415, + "grad_norm": 0.3140538824392083, + "kl": 0.120025634765625, + "learning_rate": 4.972977290912423e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 10184 + }, + { + "completion_length": 240.6696538925171, + "epoch": 1.7078251393604091, + "grad_norm": 0.16024782471908663, + "kl": 0.1128692626953125, + "learning_rate": 4.97295969828942e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 10186 + }, + { + "completion_length": 237.9241189956665, + "epoch": 1.7081604426002768, + "grad_norm": 0.20369444105406395, + "kl": 0.1204833984375, + "learning_rate": 4.972942099972746e-07, + "loss": 0.0001, + "reward": 1.825000062584877, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 10188 + }, + { + "completion_length": 237.3214406967163, + "epoch": 1.7084957458401442, + "grad_norm": 0.46196819682538787, + "kl": 0.1105804443359375, + "learning_rate": 4.972924495962443e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 10190 + }, + { + "completion_length": 247.79465293884277, + "epoch": 1.7088310490800116, + "grad_norm": 0.17530326999013046, + "kl": 0.09844970703125, + "learning_rate": 4.97290688625855e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10192 + }, + { + "completion_length": 239.915189743042, + "epoch": 1.7091663523198792, + "grad_norm": 0.17669363274091412, + "kl": 0.10247802734375, + "learning_rate": 4.97288927086111e-07, + "loss": 0.0001, + "reward": 1.8000000268220901, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 10194 + }, + { + "completion_length": 251.86608219146729, + "epoch": 1.7095016555597469, + "grad_norm": 0.1184864862329785, + "kl": 0.111236572265625, + "learning_rate": 4.972871649770162e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 10196 + }, + { + "completion_length": 249.87947845458984, + "epoch": 1.7098369587996145, + "grad_norm": 0.3240525754200581, + "kl": 0.1515350341796875, + "learning_rate": 4.972854022985746e-07, + "loss": 0.0002, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 10198 + }, + { + "completion_length": 258.5848331451416, + "epoch": 1.710172262039482, + "grad_norm": 0.13925645977761708, + "kl": 0.135101318359375, + "learning_rate": 4.972836390507902e-07, + "loss": 0.0001, + "reward": 1.7285715267062187, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7375000268220901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10200 + }, + { + "completion_length": 255.58483219146729, + "epoch": 1.7105075652793496, + "grad_norm": 0.3032562982546393, + "kl": 0.102783203125, + "learning_rate": 4.972818752336674e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 10202 + }, + { + "completion_length": 257.7321557998657, + "epoch": 1.710842868519217, + "grad_norm": 0.3941826817499621, + "kl": 0.14019775390625, + "learning_rate": 4.972801108472099e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7410714626312256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10204 + }, + { + "completion_length": 265.51340675354004, + "epoch": 1.7111781717590846, + "grad_norm": 0.25116486602831917, + "kl": 0.133148193359375, + "learning_rate": 4.97278345891422e-07, + "loss": 0.0001, + "reward": 1.798214316368103, + "reward_std": 0.06313453149050474, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10206 + }, + { + "completion_length": 265.3839416503906, + "epoch": 1.7115134749989522, + "grad_norm": 0.14768036461804573, + "kl": 0.12890625, + "learning_rate": 4.972765803663076e-07, + "loss": 0.0001, + "reward": 1.7267857939004898, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7312500290572643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10208 + }, + { + "completion_length": 254.28572845458984, + "epoch": 1.7118487782388199, + "grad_norm": 0.18517748549085836, + "kl": 0.101348876953125, + "learning_rate": 4.972748142718708e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 10210 + }, + { + "completion_length": 248.27233219146729, + "epoch": 1.7121840814786873, + "grad_norm": 0.16112355740952278, + "kl": 0.117645263671875, + "learning_rate": 4.972730476081157e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 10212 + }, + { + "completion_length": 249.39733600616455, + "epoch": 1.7125193847185547, + "grad_norm": 0.27793530860374993, + "kl": 0.2154541015625, + "learning_rate": 4.972712803750464e-07, + "loss": 0.0002, + "reward": 1.8196429163217545, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071775555611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10214 + }, + { + "completion_length": 255.77233409881592, + "epoch": 1.7128546879584223, + "grad_norm": 0.17317650979675972, + "kl": 0.1054840087890625, + "learning_rate": 4.972695125726669e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 10216 + }, + { + "completion_length": 253.52233123779297, + "epoch": 1.71318999119829, + "grad_norm": 0.22910347888793958, + "kl": 0.133819580078125, + "learning_rate": 4.972677442009813e-07, + "loss": 0.0001, + "reward": 1.7785715088248253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 10218 + }, + { + "completion_length": 256.03126335144043, + "epoch": 1.7135252944381576, + "grad_norm": 0.1251190478169731, + "kl": 0.1388702392578125, + "learning_rate": 4.972659752599937e-07, + "loss": 0.0001, + "reward": 1.8482143357396126, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.852678582072258, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10220 + }, + { + "completion_length": 263.6428699493408, + "epoch": 1.713860597678025, + "grad_norm": 0.1527141001062361, + "kl": 0.1422882080078125, + "learning_rate": 4.972642057497082e-07, + "loss": 0.0001, + "reward": 1.7696429193019867, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10222 + }, + { + "completion_length": 267.70983123779297, + "epoch": 1.7141959009178926, + "grad_norm": 0.2617393189987282, + "kl": 0.106292724609375, + "learning_rate": 4.972624356701287e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10224 + }, + { + "completion_length": 259.7410831451416, + "epoch": 1.71453120415776, + "grad_norm": 0.2660914466620459, + "kl": 0.13519287109375, + "learning_rate": 4.972606650212595e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500149011612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10226 + }, + { + "completion_length": 262.5357255935669, + "epoch": 1.7148665073976277, + "grad_norm": 0.09373059801058264, + "kl": 0.147003173828125, + "learning_rate": 4.972588938031045e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 10228 + }, + { + "completion_length": 269.6651906967163, + "epoch": 1.7152018106374953, + "grad_norm": 0.09178558498879472, + "kl": 0.278045654296875, + "learning_rate": 4.972571220156679e-07, + "loss": 0.0003, + "reward": 1.79464291036129, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7991071529686451, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10230 + }, + { + "completion_length": 273.5357255935669, + "epoch": 1.715537113877363, + "grad_norm": 0.20699127352252952, + "kl": 0.10955810546875, + "learning_rate": 4.972553496589536e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000242143869, + "rewards/format_reward_func": 1.0, + "step": 10232 + }, + { + "completion_length": 255.79911994934082, + "epoch": 1.7158724171172304, + "grad_norm": 0.16328593761963217, + "kl": 0.16827392578125, + "learning_rate": 4.97253576732966e-07, + "loss": 0.0002, + "reward": 1.7714286521077156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 10234 + }, + { + "completion_length": 251.2812623977661, + "epoch": 1.7162077203570978, + "grad_norm": 0.15051227767423914, + "kl": 0.2393951416015625, + "learning_rate": 4.972518032377088e-07, + "loss": 0.0002, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 10236 + }, + { + "completion_length": 266.82143783569336, + "epoch": 1.7165430235969654, + "grad_norm": 0.5221781685923255, + "kl": 0.2209320068359375, + "learning_rate": 4.972500291731865e-07, + "loss": 0.0002, + "reward": 1.7973214834928513, + "reward_std": 0.07449874933809042, + "rewards/equation_reward_func": 0.8080357369035482, + "rewards/format_reward_func": 0.9892857223749161, + "step": 10238 + }, + { + "completion_length": 256.92858505249023, + "epoch": 1.716878326836833, + "grad_norm": 0.1457588564956292, + "kl": 0.137115478515625, + "learning_rate": 4.972482545394028e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 10240 + }, + { + "completion_length": 261.8526916503906, + "epoch": 1.7172136300767007, + "grad_norm": 0.1387693804521752, + "kl": 0.1348876953125, + "learning_rate": 4.972464793363619e-07, + "loss": 0.0001, + "reward": 1.760714314877987, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.760714340955019, + "rewards/format_reward_func": 1.0, + "step": 10242 + }, + { + "completion_length": 267.48662281036377, + "epoch": 1.7175489333165683, + "grad_norm": 0.45388286887329304, + "kl": 0.276153564453125, + "learning_rate": 4.972447035640681e-07, + "loss": 0.0003, + "reward": 1.7482143864035606, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10244 + }, + { + "completion_length": 256.8303689956665, + "epoch": 1.7178842365564357, + "grad_norm": 0.26309543690645204, + "kl": 0.109283447265625, + "learning_rate": 4.972429272225252e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 10246 + }, + { + "completion_length": 256.8884048461914, + "epoch": 1.7182195397963032, + "grad_norm": 0.5253921470745806, + "kl": 0.2630615234375, + "learning_rate": 4.972411503117374e-07, + "loss": 0.0003, + "reward": 1.7035714983940125, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7125000301748514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10248 + }, + { + "completion_length": 261.7857255935669, + "epoch": 1.7185548430361708, + "grad_norm": 0.1953770623697014, + "kl": 0.12847900390625, + "learning_rate": 4.972393728317089e-07, + "loss": 0.0001, + "reward": 1.7410714998841286, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357518047094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10250 + }, + { + "completion_length": 259.8348321914673, + "epoch": 1.7188901462760384, + "grad_norm": 0.1585139958072636, + "kl": 0.1113128662109375, + "learning_rate": 4.972375947824437e-07, + "loss": 0.0001, + "reward": 1.7642857506871223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857600003481, + "rewards/format_reward_func": 1.0, + "step": 10252 + }, + { + "completion_length": 251.85269260406494, + "epoch": 1.719225449515906, + "grad_norm": 0.21025823635972254, + "kl": 0.142669677734375, + "learning_rate": 4.972358161639458e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 10254 + }, + { + "completion_length": 246.71429538726807, + "epoch": 1.7195607527557735, + "grad_norm": 0.21935861333026402, + "kl": 0.0960845947265625, + "learning_rate": 4.972340369762193e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 10256 + }, + { + "completion_length": 249.4509038925171, + "epoch": 1.719896055995641, + "grad_norm": 0.19696143606941816, + "kl": 0.1159515380859375, + "learning_rate": 4.972322572192686e-07, + "loss": 0.0001, + "reward": 1.785714328289032, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 10258 + }, + { + "completion_length": 253.4955472946167, + "epoch": 1.7202313592355085, + "grad_norm": 0.2697506218673313, + "kl": 0.1621856689453125, + "learning_rate": 4.972304768930973e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 10260 + }, + { + "completion_length": 248.9821548461914, + "epoch": 1.7205666624753762, + "grad_norm": 0.15170022565279304, + "kl": 0.102203369140625, + "learning_rate": 4.9722869599771e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 10262 + }, + { + "completion_length": 241.9017972946167, + "epoch": 1.7209019657152438, + "grad_norm": 0.19211303932289808, + "kl": 0.105621337890625, + "learning_rate": 4.972269145331106e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10264 + }, + { + "completion_length": 254.39733505249023, + "epoch": 1.7212372689551114, + "grad_norm": 0.013160530116678611, + "kl": 0.108795166015625, + "learning_rate": 4.972251324993031e-07, + "loss": 0.0001, + "reward": 1.7946429252624512, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10266 + }, + { + "completion_length": 242.98661708831787, + "epoch": 1.7215725721949788, + "grad_norm": 0.15062515796873527, + "kl": 0.085784912109375, + "learning_rate": 4.972233498962917e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 10268 + }, + { + "completion_length": 252.28572463989258, + "epoch": 1.7219078754348462, + "grad_norm": 0.3019032739127625, + "kl": 0.0868988037109375, + "learning_rate": 4.972215667240805e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 10270 + }, + { + "completion_length": 248.76340675354004, + "epoch": 1.7222431786747139, + "grad_norm": 0.17839336544905793, + "kl": 0.0845184326171875, + "learning_rate": 4.972197829826735e-07, + "loss": 0.0001, + "reward": 1.8053571581840515, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8098214566707611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10272 + }, + { + "completion_length": 250.40179824829102, + "epoch": 1.7225784819145815, + "grad_norm": 0.4455969725900186, + "kl": 0.11224365234375, + "learning_rate": 4.97217998672075e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928935974836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10274 + }, + { + "completion_length": 251.383939743042, + "epoch": 1.7229137851544492, + "grad_norm": 0.16785836779638325, + "kl": 0.0938873291015625, + "learning_rate": 4.972162137922888e-07, + "loss": 0.0001, + "reward": 1.7946429029107094, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071660071611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10276 + }, + { + "completion_length": 243.18304538726807, + "epoch": 1.7232490883943166, + "grad_norm": 0.23542771845399904, + "kl": 0.0963134765625, + "learning_rate": 4.972144283433194e-07, + "loss": 0.0001, + "reward": 1.7785715162754059, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 10278 + }, + { + "completion_length": 248.352689743042, + "epoch": 1.7235843916341842, + "grad_norm": 0.23453639466499993, + "kl": 0.09661865234375, + "learning_rate": 4.972126423251708e-07, + "loss": 0.0001, + "reward": 1.7825893387198448, + "reward_std": 0.06502856989391148, + "rewards/equation_reward_func": 0.7883929014205933, + "rewards/format_reward_func": 0.9941964335739613, + "step": 10280 + }, + { + "completion_length": 249.73215675354004, + "epoch": 1.7239196948740516, + "grad_norm": 0.1695213510329949, + "kl": 0.114288330078125, + "learning_rate": 4.972108557378469e-07, + "loss": 0.0001, + "reward": 1.7901786267757416, + "reward_std": 0.044194172602146864, + "rewards/equation_reward_func": 0.7919643223285675, + "rewards/format_reward_func": 0.9982142895460129, + "step": 10282 + }, + { + "completion_length": 247.0312623977661, + "epoch": 1.7242549981139192, + "grad_norm": 0.20108784450563666, + "kl": 0.107269287109375, + "learning_rate": 4.972090685813519e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 10284 + }, + { + "completion_length": 248.40626525878906, + "epoch": 1.7245903013537869, + "grad_norm": 0.17376718166315555, + "kl": 0.098052978515625, + "learning_rate": 4.972072808556901e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 10286 + }, + { + "completion_length": 253.43304920196533, + "epoch": 1.7249256045936545, + "grad_norm": 0.20889097778803672, + "kl": 0.1026458740234375, + "learning_rate": 4.972054925608654e-07, + "loss": 0.0001, + "reward": 1.7678571864962578, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 10288 + }, + { + "completion_length": 251.602689743042, + "epoch": 1.725260907833522, + "grad_norm": 0.1601895959215239, + "kl": 0.0942535400390625, + "learning_rate": 4.972037036968821e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 10290 + }, + { + "completion_length": 252.18304634094238, + "epoch": 1.7255962110733893, + "grad_norm": 0.18190231757421382, + "kl": 0.1245269775390625, + "learning_rate": 4.972019142637442e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 10292 + }, + { + "completion_length": 249.75894260406494, + "epoch": 1.725931514313257, + "grad_norm": 0.17078338524343523, + "kl": 0.0865631103515625, + "learning_rate": 4.972001242614558e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7776785884052515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10294 + }, + { + "completion_length": 244.34375953674316, + "epoch": 1.7262668175531246, + "grad_norm": 0.2378874275442031, + "kl": 0.1673583984375, + "learning_rate": 4.97198333690021e-07, + "loss": 0.0002, + "reward": 1.769642896950245, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071879863739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10296 + }, + { + "completion_length": 249.7098331451416, + "epoch": 1.7266021207929922, + "grad_norm": 0.21160423128606037, + "kl": 0.120391845703125, + "learning_rate": 4.971965425494439e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 10298 + }, + { + "completion_length": 253.62947177886963, + "epoch": 1.7269374240328599, + "grad_norm": 0.20583500077418201, + "kl": 0.18536376953125, + "learning_rate": 4.97194750839729e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 10300 + }, + { + "completion_length": 235.7901906967163, + "epoch": 1.7272727272727273, + "grad_norm": 0.3967885953207802, + "kl": 0.1246337890625, + "learning_rate": 4.971929585608799e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 10302 + }, + { + "completion_length": 240.97768688201904, + "epoch": 1.7276080305125947, + "grad_norm": 0.46283659163232105, + "kl": 0.155731201171875, + "learning_rate": 4.97191165712901e-07, + "loss": 0.0002, + "reward": 1.742857240140438, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 10304 + }, + { + "completion_length": 241.07590675354004, + "epoch": 1.7279433337524623, + "grad_norm": 0.2614746911479949, + "kl": 0.0865325927734375, + "learning_rate": 4.971893722957964e-07, + "loss": 0.0001, + "reward": 1.8017857521772385, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8062500171363354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10306 + }, + { + "completion_length": 230.19197463989258, + "epoch": 1.72827863699233, + "grad_norm": 0.18608370494522042, + "kl": 0.1041259765625, + "learning_rate": 4.971875783095702e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 10308 + }, + { + "completion_length": 240.24108505249023, + "epoch": 1.7286139402321976, + "grad_norm": 0.26414739512639407, + "kl": 0.112152099609375, + "learning_rate": 4.971857837542266e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 10310 + }, + { + "completion_length": 241.75893878936768, + "epoch": 1.728949243472065, + "grad_norm": 0.2030729173493549, + "kl": 0.12939453125, + "learning_rate": 4.971839886297697e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.778571454808116, + "rewards/format_reward_func": 1.0, + "step": 10312 + }, + { + "completion_length": 229.4955472946167, + "epoch": 1.7292845467119327, + "grad_norm": 0.2002807067146816, + "kl": 0.106903076171875, + "learning_rate": 4.971821929362035e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 10314 + }, + { + "completion_length": 241.39733409881592, + "epoch": 1.7296198499518, + "grad_norm": 0.22584604105987863, + "kl": 0.0989837646484375, + "learning_rate": 4.971803966735322e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 10316 + }, + { + "completion_length": 245.0357265472412, + "epoch": 1.7299551531916677, + "grad_norm": 0.16363201642424538, + "kl": 0.09918212890625, + "learning_rate": 4.971785998417601e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10318 + }, + { + "completion_length": 241.36608219146729, + "epoch": 1.7302904564315353, + "grad_norm": 0.11868823441642207, + "kl": 0.119659423828125, + "learning_rate": 4.971768024408912e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10320 + }, + { + "completion_length": 246.12947368621826, + "epoch": 1.730625759671403, + "grad_norm": 0.4974318341941413, + "kl": 0.11785888671875, + "learning_rate": 4.971750044709296e-07, + "loss": 0.0001, + "reward": 1.7357143312692642, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 10322 + }, + { + "completion_length": 243.7142972946167, + "epoch": 1.7309610629112704, + "grad_norm": 0.22373018764174843, + "kl": 0.10406494140625, + "learning_rate": 4.971732059318796e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 10324 + }, + { + "completion_length": 244.84822750091553, + "epoch": 1.7312963661511378, + "grad_norm": 0.3080666523543961, + "kl": 0.157073974609375, + "learning_rate": 4.971714068237452e-07, + "loss": 0.0002, + "reward": 1.7892857566475868, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 10326 + }, + { + "completion_length": 247.28572750091553, + "epoch": 1.7316316693910054, + "grad_norm": 0.194548889979958, + "kl": 0.18048095703125, + "learning_rate": 4.971696071465305e-07, + "loss": 0.0002, + "reward": 1.7517857775092125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10328 + }, + { + "completion_length": 255.8348331451416, + "epoch": 1.731966972630873, + "grad_norm": 0.10122060964439665, + "kl": 0.1008148193359375, + "learning_rate": 4.971678069002398e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 10330 + }, + { + "completion_length": 258.7410821914673, + "epoch": 1.7323022758707407, + "grad_norm": 0.6716880283698804, + "kl": 0.2233734130859375, + "learning_rate": 4.971660060848772e-07, + "loss": 0.0002, + "reward": 1.8125000521540642, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8169643171131611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10332 + }, + { + "completion_length": 260.14733505249023, + "epoch": 1.7326375791106081, + "grad_norm": 0.10600836730768504, + "kl": 0.189788818359375, + "learning_rate": 4.971642047004466e-07, + "loss": 0.0002, + "reward": 1.7250000685453415, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000294297934, + "rewards/format_reward_func": 1.0, + "step": 10334 + }, + { + "completion_length": 253.477689743042, + "epoch": 1.7329728823504758, + "grad_norm": 0.13879610996945366, + "kl": 0.160552978515625, + "learning_rate": 4.971624027469526e-07, + "loss": 0.0002, + "reward": 1.7696429193019867, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10336 + }, + { + "completion_length": 257.62055015563965, + "epoch": 1.7333081855903432, + "grad_norm": 0.21084404479090876, + "kl": 0.20538330078125, + "learning_rate": 4.97160600224399e-07, + "loss": 0.0002, + "reward": 1.7928571999073029, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 10338 + }, + { + "completion_length": 250.45090675354004, + "epoch": 1.7336434888302108, + "grad_norm": 0.26688576816393683, + "kl": 0.146881103515625, + "learning_rate": 4.971587971327901e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142873853445, + "rewards/format_reward_func": 1.0, + "step": 10340 + }, + { + "completion_length": 256.0759038925171, + "epoch": 1.7339787920700784, + "grad_norm": 0.113620714880813, + "kl": 0.16156005859375, + "learning_rate": 4.9715699347213e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 10342 + }, + { + "completion_length": 257.41072273254395, + "epoch": 1.734314095309946, + "grad_norm": 0.2530098783662599, + "kl": 0.1668701171875, + "learning_rate": 4.971551892424228e-07, + "loss": 0.0002, + "reward": 1.7357143387198448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 10344 + }, + { + "completion_length": 258.1651916503906, + "epoch": 1.7346493985498135, + "grad_norm": 0.13072198172795174, + "kl": 0.095489501953125, + "learning_rate": 4.971533844436728e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 10346 + }, + { + "completion_length": 265.5535840988159, + "epoch": 1.734984701789681, + "grad_norm": 0.13941893460031515, + "kl": 0.114532470703125, + "learning_rate": 4.97151579075884e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7312500383704901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10348 + }, + { + "completion_length": 268.7009029388428, + "epoch": 1.7353200050295485, + "grad_norm": 0.28658059038787587, + "kl": 0.103271484375, + "learning_rate": 4.971497731390607e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 10350 + }, + { + "completion_length": 259.558048248291, + "epoch": 1.7356553082694162, + "grad_norm": 0.1752486314212871, + "kl": 0.12921142578125, + "learning_rate": 4.971479666332069e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10352 + }, + { + "completion_length": 275.13393783569336, + "epoch": 1.7359906115092838, + "grad_norm": 0.1740133493170833, + "kl": 0.183502197265625, + "learning_rate": 4.971461595583269e-07, + "loss": 0.0002, + "reward": 1.7375000566244125, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7419643104076385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10354 + }, + { + "completion_length": 276.25447845458984, + "epoch": 1.7363259147491512, + "grad_norm": 0.4997503302559109, + "kl": 0.151702880859375, + "learning_rate": 4.971443519144248e-07, + "loss": 0.0002, + "reward": 1.758928619325161, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.763392886146903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10356 + }, + { + "completion_length": 268.64287090301514, + "epoch": 1.7366612179890188, + "grad_norm": 0.20603458803696528, + "kl": 0.166961669921875, + "learning_rate": 4.971425437015048e-07, + "loss": 0.0002, + "reward": 1.8267857804894447, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8312500268220901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10358 + }, + { + "completion_length": 269.7678680419922, + "epoch": 1.7369965212288863, + "grad_norm": 0.1892884256617599, + "kl": 0.1112518310546875, + "learning_rate": 4.97140734919571e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7910714484751225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10360 + }, + { + "completion_length": 267.5357265472412, + "epoch": 1.737331824468754, + "grad_norm": 0.21339591862795737, + "kl": 0.096923828125, + "learning_rate": 4.971389255686275e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.08586296625435352, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10362 + }, + { + "completion_length": 266.5714406967163, + "epoch": 1.7376671277086215, + "grad_norm": 0.15824648300184477, + "kl": 0.1436004638671875, + "learning_rate": 4.971371156486786e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000365078449, + "rewards/format_reward_func": 1.0, + "step": 10364 + }, + { + "completion_length": 266.89733600616455, + "epoch": 1.7380024309484892, + "grad_norm": 0.3613629986828252, + "kl": 0.1967926025390625, + "learning_rate": 4.971353051597285e-07, + "loss": 0.0002, + "reward": 1.791071504354477, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7955357320606709, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10366 + }, + { + "completion_length": 260.7232265472412, + "epoch": 1.7383377341883566, + "grad_norm": 0.002761877959319243, + "kl": 0.1133575439453125, + "learning_rate": 4.971334941017813e-07, + "loss": 0.0001, + "reward": 1.7589285969734192, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633929047733545, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10368 + }, + { + "completion_length": 267.3794765472412, + "epoch": 1.738673037428224, + "grad_norm": 0.12438441265646877, + "kl": 0.15130615234375, + "learning_rate": 4.971316824748412e-07, + "loss": 0.0002, + "reward": 1.7464286535978317, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 10370 + }, + { + "completion_length": 271.1116189956665, + "epoch": 1.7390083406680916, + "grad_norm": 0.25002979116110813, + "kl": 0.125457763671875, + "learning_rate": 4.971298702789123e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7883928790688515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10372 + }, + { + "completion_length": 275.9107246398926, + "epoch": 1.7393436439079593, + "grad_norm": 0.20122847648749992, + "kl": 0.1316986083984375, + "learning_rate": 4.971280575139988e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10374 + }, + { + "completion_length": 271.5223340988159, + "epoch": 1.739678947147827, + "grad_norm": 0.14811595712044243, + "kl": 0.1340484619140625, + "learning_rate": 4.971262441801048e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10376 + }, + { + "completion_length": 284.4330425262451, + "epoch": 1.7400142503876945, + "grad_norm": 0.22086490716191035, + "kl": 0.118499755859375, + "learning_rate": 4.971244302772346e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.8008928783237934, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10378 + }, + { + "completion_length": 267.0089416503906, + "epoch": 1.740349553627562, + "grad_norm": 0.057972474967720546, + "kl": 0.1278839111328125, + "learning_rate": 4.971226158053923e-07, + "loss": 0.0001, + "reward": 1.7946429252624512, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10380 + }, + { + "completion_length": 282.9732275009155, + "epoch": 1.7406848568674294, + "grad_norm": 0.295008429979625, + "kl": 0.219482421875, + "learning_rate": 4.971208007645823e-07, + "loss": 0.0002, + "reward": 1.728571504354477, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7375000305473804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10382 + }, + { + "completion_length": 270.80804920196533, + "epoch": 1.741020160107297, + "grad_norm": 0.22125537829038144, + "kl": 0.129486083984375, + "learning_rate": 4.971189851548084e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7732143215835094, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10384 + }, + { + "completion_length": 285.18304538726807, + "epoch": 1.7413554633471646, + "grad_norm": 0.5638466687686062, + "kl": 0.160888671875, + "learning_rate": 4.971171689760751e-07, + "loss": 0.0002, + "reward": 1.7625000402331352, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10386 + }, + { + "completion_length": 275.16965103149414, + "epoch": 1.7416907665870323, + "grad_norm": 0.2000542526716108, + "kl": 0.141265869140625, + "learning_rate": 4.971153522283864e-07, + "loss": 0.0001, + "reward": 1.7714286521077156, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7803571708500385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10388 + }, + { + "completion_length": 273.29465675354004, + "epoch": 1.7420260698268997, + "grad_norm": 0.19184259176887247, + "kl": 0.140655517578125, + "learning_rate": 4.971135349117465e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7517857402563095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10390 + }, + { + "completion_length": 267.9821538925171, + "epoch": 1.7423613730667673, + "grad_norm": 0.11815547024679297, + "kl": 0.15802001953125, + "learning_rate": 4.971117170261596e-07, + "loss": 0.0002, + "reward": 1.7714286372065544, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 10392 + }, + { + "completion_length": 272.5803699493408, + "epoch": 1.7426966763066347, + "grad_norm": 0.24361023479353522, + "kl": 0.267608642578125, + "learning_rate": 4.9710989857163e-07, + "loss": 0.0003, + "reward": 1.7142858058214188, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 10394 + }, + { + "completion_length": 258.5982275009155, + "epoch": 1.7430319795465024, + "grad_norm": 0.24585877829799133, + "kl": 0.13165283203125, + "learning_rate": 4.971080795481618e-07, + "loss": 0.0001, + "reward": 1.8285714909434319, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8285714462399483, + "rewards/format_reward_func": 1.0, + "step": 10396 + }, + { + "completion_length": 266.8125123977661, + "epoch": 1.74336728278637, + "grad_norm": 0.2220689272461111, + "kl": 0.150238037109375, + "learning_rate": 4.971062599557591e-07, + "loss": 0.0002, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 10398 + }, + { + "completion_length": 260.52679920196533, + "epoch": 1.7437025860262376, + "grad_norm": 0.19939037132635878, + "kl": 0.109222412109375, + "learning_rate": 4.971044397944261e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 10400 + }, + { + "completion_length": 272.54911613464355, + "epoch": 1.744037889266105, + "grad_norm": 0.19888458583738894, + "kl": 0.2293853759765625, + "learning_rate": 4.971026190641672e-07, + "loss": 0.0002, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 1.0, + "step": 10402 + }, + { + "completion_length": 273.9732275009155, + "epoch": 1.7443731925059724, + "grad_norm": 0.27151938888768473, + "kl": 0.178863525390625, + "learning_rate": 4.971007977649864e-07, + "loss": 0.0002, + "reward": 1.785714365541935, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 10404 + }, + { + "completion_length": 266.08930110931396, + "epoch": 1.74470849574584, + "grad_norm": 0.31683103034099785, + "kl": 0.1129150390625, + "learning_rate": 4.97098975896888e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 10406 + }, + { + "completion_length": 267.5089406967163, + "epoch": 1.7450437989857077, + "grad_norm": 0.13888860982697956, + "kl": 0.1087799072265625, + "learning_rate": 4.970971534598761e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10408 + }, + { + "completion_length": 263.92411708831787, + "epoch": 1.7453791022255754, + "grad_norm": 0.27825671143278985, + "kl": 0.143402099609375, + "learning_rate": 4.970953304539549e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 10410 + }, + { + "completion_length": 273.5178699493408, + "epoch": 1.7457144054654428, + "grad_norm": 0.08538777497485277, + "kl": 0.151153564453125, + "learning_rate": 4.970935068791286e-07, + "loss": 0.0002, + "reward": 1.7410714775323868, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7455357443541288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10412 + }, + { + "completion_length": 270.0982275009155, + "epoch": 1.7460497087053104, + "grad_norm": 0.5390369677057072, + "kl": 0.193695068359375, + "learning_rate": 4.970916827354016e-07, + "loss": 0.0002, + "reward": 1.7732143476605415, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7866071835160255, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10414 + }, + { + "completion_length": 261.7366199493408, + "epoch": 1.7463850119451778, + "grad_norm": 0.18971974736151406, + "kl": 0.1143035888671875, + "learning_rate": 4.970898580227778e-07, + "loss": 0.0001, + "reward": 1.8285714611411095, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714648663998, + "rewards/format_reward_func": 1.0, + "step": 10416 + }, + { + "completion_length": 285.3214406967163, + "epoch": 1.7467203151850454, + "grad_norm": 0.3431656777052588, + "kl": 0.220428466796875, + "learning_rate": 4.970880327412616e-07, + "loss": 0.0002, + "reward": 1.7875000685453415, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10418 + }, + { + "completion_length": 276.5134029388428, + "epoch": 1.747055618424913, + "grad_norm": 0.14949047704968912, + "kl": 0.2712860107421875, + "learning_rate": 4.97086206890857e-07, + "loss": 0.0003, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10420 + }, + { + "completion_length": 265.5669755935669, + "epoch": 1.7473909216647807, + "grad_norm": 0.20053087897408742, + "kl": 0.1323699951171875, + "learning_rate": 4.970843804715684e-07, + "loss": 0.0001, + "reward": 1.7910714969038963, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10422 + }, + { + "completion_length": 281.4196557998657, + "epoch": 1.7477262249046481, + "grad_norm": 0.34421321644086805, + "kl": 0.195068359375, + "learning_rate": 4.970825534834e-07, + "loss": 0.0002, + "reward": 1.7285715118050575, + "reward_std": 0.12121830135583878, + "rewards/equation_reward_func": 0.728571455925703, + "rewards/format_reward_func": 1.0, + "step": 10424 + }, + { + "completion_length": 272.6919765472412, + "epoch": 1.7480615281445155, + "grad_norm": 0.3231745079674326, + "kl": 0.154296875, + "learning_rate": 4.970807259263559e-07, + "loss": 0.0002, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 10426 + }, + { + "completion_length": 279.2589464187622, + "epoch": 1.7483968313843832, + "grad_norm": 0.17148280698992707, + "kl": 0.165374755859375, + "learning_rate": 4.970788978004404e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 10428 + }, + { + "completion_length": 275.589298248291, + "epoch": 1.7487321346242508, + "grad_norm": 0.25112206633806494, + "kl": 0.146392822265625, + "learning_rate": 4.970770691056577e-07, + "loss": 0.0001, + "reward": 1.7053572237491608, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7098214644938707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10430 + }, + { + "completion_length": 265.3437614440918, + "epoch": 1.7490674378641184, + "grad_norm": 0.17369490573789476, + "kl": 0.12432861328125, + "learning_rate": 4.97075239842012e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 10432 + }, + { + "completion_length": 273.96429443359375, + "epoch": 1.749402741103986, + "grad_norm": 0.24760782391253822, + "kl": 0.1802978515625, + "learning_rate": 4.970734100095073e-07, + "loss": 0.0002, + "reward": 1.7446429207921028, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7580357417464256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10434 + }, + { + "completion_length": 272.47322845458984, + "epoch": 1.7497380443438535, + "grad_norm": 0.20643982792458254, + "kl": 0.1326141357421875, + "learning_rate": 4.970715796081482e-07, + "loss": 0.0001, + "reward": 1.817857176065445, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 10436 + }, + { + "completion_length": 276.3794746398926, + "epoch": 1.750073347583721, + "grad_norm": 0.10986143055110638, + "kl": 0.2333984375, + "learning_rate": 4.970697486379386e-07, + "loss": 0.0002, + "reward": 1.7446429207921028, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10438 + }, + { + "completion_length": 257.4509029388428, + "epoch": 1.7504086508235885, + "grad_norm": 0.19597224889941217, + "kl": 0.110137939453125, + "learning_rate": 4.970679170988829e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285850524902, + "rewards/format_reward_func": 1.0, + "step": 10440 + }, + { + "completion_length": 261.3794746398926, + "epoch": 1.7507439540634562, + "grad_norm": 0.32525695998592424, + "kl": 0.164581298828125, + "learning_rate": 4.970660849909852e-07, + "loss": 0.0002, + "reward": 1.7642857879400253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 10442 + }, + { + "completion_length": 264.62500858306885, + "epoch": 1.7510792573033238, + "grad_norm": 0.2660366832229654, + "kl": 0.09619140625, + "learning_rate": 4.970642523142498e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 10444 + }, + { + "completion_length": 271.4866180419922, + "epoch": 1.7514145605431912, + "grad_norm": 0.2229115237298831, + "kl": 0.159698486328125, + "learning_rate": 4.970624190686808e-07, + "loss": 0.0002, + "reward": 1.773214340209961, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7776786126196384, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10446 + }, + { + "completion_length": 255.22322750091553, + "epoch": 1.7517498637830589, + "grad_norm": 0.34719989862558487, + "kl": 0.1978912353515625, + "learning_rate": 4.970605852542826e-07, + "loss": 0.0002, + "reward": 1.8125000447034836, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8169643171131611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10448 + }, + { + "completion_length": 264.40626335144043, + "epoch": 1.7520851670229263, + "grad_norm": 0.19374510192625774, + "kl": 0.1017913818359375, + "learning_rate": 4.970587508710593e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285995811224, + "rewards/format_reward_func": 1.0, + "step": 10450 + }, + { + "completion_length": 267.5491199493408, + "epoch": 1.752420470262794, + "grad_norm": 0.18141824950577376, + "kl": 0.22296142578125, + "learning_rate": 4.970569159190152e-07, + "loss": 0.0002, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 10452 + }, + { + "completion_length": 257.7500123977661, + "epoch": 1.7527557735026615, + "grad_norm": 0.00973250312477692, + "kl": 0.120849609375, + "learning_rate": 4.970550803981544e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.005050762556493282, + "rewards/equation_reward_func": 0.7250000536441803, + "rewards/format_reward_func": 1.0, + "step": 10454 + }, + { + "completion_length": 264.75447845458984, + "epoch": 1.7530910767425292, + "grad_norm": 0.16646004320135122, + "kl": 0.0950927734375, + "learning_rate": 4.970532443084812e-07, + "loss": 0.0001, + "reward": 1.7553571984171867, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7589286118745804, + "rewards/format_reward_func": 0.9964285716414452, + "step": 10456 + }, + { + "completion_length": 272.44197368621826, + "epoch": 1.7534263799823966, + "grad_norm": 0.2769682286353471, + "kl": 0.12298583984375, + "learning_rate": 4.970514076499999e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10458 + }, + { + "completion_length": 259.90626430511475, + "epoch": 1.753761683222264, + "grad_norm": 0.20317105516113643, + "kl": 0.0985107421875, + "learning_rate": 4.970495704227146e-07, + "loss": 0.0001, + "reward": 1.7928572222590446, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 10460 + }, + { + "completion_length": 267.4732275009155, + "epoch": 1.7540969864621316, + "grad_norm": 0.28731729589505295, + "kl": 0.119537353515625, + "learning_rate": 4.970477326266297e-07, + "loss": 0.0001, + "reward": 1.7178572490811348, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571820259094, + "rewards/format_reward_func": 1.0, + "step": 10462 + }, + { + "completion_length": 258.66072940826416, + "epoch": 1.7544322897019993, + "grad_norm": 0.37070153761318986, + "kl": 0.100830078125, + "learning_rate": 4.970458942617493e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 10464 + }, + { + "completion_length": 271.5000123977661, + "epoch": 1.754767592941867, + "grad_norm": 0.20976854165769124, + "kl": 0.156036376953125, + "learning_rate": 4.970440553280776e-07, + "loss": 0.0002, + "reward": 1.7732143551111221, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776785865426064, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10466 + }, + { + "completion_length": 261.25893783569336, + "epoch": 1.7551028961817343, + "grad_norm": 0.3999970736225111, + "kl": 0.1058502197265625, + "learning_rate": 4.970422158256188e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.09343910776078701, + "rewards/equation_reward_func": 0.7669643014669418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10468 + }, + { + "completion_length": 270.8616189956665, + "epoch": 1.755438199421602, + "grad_norm": 0.2932392086982285, + "kl": 0.102630615234375, + "learning_rate": 4.970403757543773e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 10470 + }, + { + "completion_length": 258.6919755935669, + "epoch": 1.7557735026614694, + "grad_norm": 0.7825140351594014, + "kl": 0.227874755859375, + "learning_rate": 4.970385351143573e-07, + "loss": 0.0002, + "reward": 1.8178571835160255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571574389935, + "rewards/format_reward_func": 1.0, + "step": 10472 + }, + { + "completion_length": 265.7009048461914, + "epoch": 1.756108805901337, + "grad_norm": 0.23065535913563237, + "kl": 0.1201171875, + "learning_rate": 4.97036693905563e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 10474 + }, + { + "completion_length": 255.95536708831787, + "epoch": 1.7564441091412046, + "grad_norm": 0.35248789588895424, + "kl": 0.3164215087890625, + "learning_rate": 4.970348521279986e-07, + "loss": 0.0003, + "reward": 1.8142857626080513, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.823214303702116, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10476 + }, + { + "completion_length": 261.8125123977661, + "epoch": 1.7567794123810723, + "grad_norm": 0.19963963290351514, + "kl": 0.1441192626953125, + "learning_rate": 4.970330097816683e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10478 + }, + { + "completion_length": 270.3884057998657, + "epoch": 1.7571147156209397, + "grad_norm": 0.23056821333387442, + "kl": 0.110260009765625, + "learning_rate": 4.970311668665766e-07, + "loss": 0.0001, + "reward": 1.764285758137703, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 10480 + }, + { + "completion_length": 266.4776916503906, + "epoch": 1.757450018860807, + "grad_norm": 0.1639649340695387, + "kl": 0.137908935546875, + "learning_rate": 4.970293233827274e-07, + "loss": 0.0001, + "reward": 1.6678572073578835, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.676785746589303, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10482 + }, + { + "completion_length": 257.2053699493408, + "epoch": 1.7577853221006747, + "grad_norm": 0.20303344762139539, + "kl": 0.107940673828125, + "learning_rate": 4.970274793301252e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7767857424914837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10484 + }, + { + "completion_length": 274.4018020629883, + "epoch": 1.7581206253405424, + "grad_norm": 0.261607313743246, + "kl": 0.113311767578125, + "learning_rate": 4.970256347087741e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 10486 + }, + { + "completion_length": 258.8750114440918, + "epoch": 1.75845592858041, + "grad_norm": 0.48522185192261696, + "kl": 0.1284637451171875, + "learning_rate": 4.970237895186784e-07, + "loss": 0.0001, + "reward": 1.8339286223053932, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8383928686380386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10488 + }, + { + "completion_length": 273.1071529388428, + "epoch": 1.7587912318202774, + "grad_norm": 0.18348579583019425, + "kl": 0.12353515625, + "learning_rate": 4.970219437598423e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.751785721629858, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10490 + }, + { + "completion_length": 270.8884057998657, + "epoch": 1.759126535060145, + "grad_norm": 0.3575486729727749, + "kl": 0.1207275390625, + "learning_rate": 4.970200974322702e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7767857499420643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10492 + }, + { + "completion_length": 269.8750114440918, + "epoch": 1.7594618383000125, + "grad_norm": 0.2593350581842222, + "kl": 0.160858154296875, + "learning_rate": 4.970182505359662e-07, + "loss": 0.0002, + "reward": 1.7571429088711739, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 10494 + }, + { + "completion_length": 266.58929920196533, + "epoch": 1.75979714153988, + "grad_norm": 0.2131207192247197, + "kl": 0.10052490234375, + "learning_rate": 4.970164030709346e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7758928798139095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10496 + }, + { + "completion_length": 262.8928680419922, + "epoch": 1.7601324447797477, + "grad_norm": 0.23990261996700582, + "kl": 0.1071319580078125, + "learning_rate": 4.970145550371797e-07, + "loss": 0.0001, + "reward": 1.7303572222590446, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10498 + }, + { + "completion_length": 265.95090770721436, + "epoch": 1.7604677480196154, + "grad_norm": 0.28258834064510424, + "kl": 0.177490234375, + "learning_rate": 4.970127064347056e-07, + "loss": 0.0002, + "reward": 1.7017857879400253, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7062500268220901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10500 + }, + { + "completion_length": 255.3259038925171, + "epoch": 1.7608030512594828, + "grad_norm": 0.25810737442998816, + "kl": 0.099578857421875, + "learning_rate": 4.970108572635168e-07, + "loss": 0.0001, + "reward": 1.825000062584877, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000141561031, + "rewards/format_reward_func": 1.0, + "step": 10502 + }, + { + "completion_length": 255.51787185668945, + "epoch": 1.7611383544993502, + "grad_norm": 0.309821970825553, + "kl": 0.150054931640625, + "learning_rate": 4.970090075236173e-07, + "loss": 0.0002, + "reward": 1.7303572073578835, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7437500394880772, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10504 + }, + { + "completion_length": 260.08929920196533, + "epoch": 1.7614736577392178, + "grad_norm": 0.24711570040427921, + "kl": 0.1146240234375, + "learning_rate": 4.970071572150116e-07, + "loss": 0.0001, + "reward": 1.7982143238186836, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10506 + }, + { + "completion_length": 260.5982265472412, + "epoch": 1.7618089609790855, + "grad_norm": 0.18349956303515857, + "kl": 0.11907958984375, + "learning_rate": 4.970053063377037e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 10508 + }, + { + "completion_length": 245.54911994934082, + "epoch": 1.762144264218953, + "grad_norm": 0.07936488872801638, + "kl": 0.09381103515625, + "learning_rate": 4.97003454891698e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 10510 + }, + { + "completion_length": 255.3884048461914, + "epoch": 1.7624795674588207, + "grad_norm": 0.09023603463700747, + "kl": 0.0997314453125, + "learning_rate": 4.970016028769989e-07, + "loss": 0.0001, + "reward": 1.7625000476837158, + "reward_std": 0.022728432901203632, + "rewards/equation_reward_func": 0.7669643312692642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10512 + }, + { + "completion_length": 251.54465675354004, + "epoch": 1.7628148706986881, + "grad_norm": 0.14167783274190454, + "kl": 0.120635986328125, + "learning_rate": 4.969997502936105e-07, + "loss": 0.0001, + "reward": 1.7375000789761543, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10514 + }, + { + "completion_length": 238.62054634094238, + "epoch": 1.7631501739385556, + "grad_norm": 0.21653483508274973, + "kl": 0.100006103515625, + "learning_rate": 4.96997897141537e-07, + "loss": 0.0001, + "reward": 1.8071429282426834, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428947150707, + "rewards/format_reward_func": 1.0, + "step": 10516 + }, + { + "completion_length": 244.4732265472412, + "epoch": 1.7634854771784232, + "grad_norm": 0.21315796766163686, + "kl": 0.09234619140625, + "learning_rate": 4.969960434207828e-07, + "loss": 0.0001, + "reward": 1.7383929342031479, + "reward_std": 0.056821079924702644, + "rewards/equation_reward_func": 0.7446428798139095, + "rewards/format_reward_func": 0.9937500059604645, + "step": 10518 + }, + { + "completion_length": 247.1517972946167, + "epoch": 1.7638207804182908, + "grad_norm": 0.18540650627396144, + "kl": 0.097076416015625, + "learning_rate": 4.969941891313522e-07, + "loss": 0.0001, + "reward": 1.848214328289032, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8526785857975483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10520 + }, + { + "completion_length": 245.6741180419922, + "epoch": 1.7641560836581585, + "grad_norm": 0.16238503100743001, + "kl": 0.101806640625, + "learning_rate": 4.969923342732493e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10522 + }, + { + "completion_length": 251.41072750091553, + "epoch": 1.7644913868980259, + "grad_norm": 0.435907843732035, + "kl": 0.09576416015625, + "learning_rate": 4.969904788464786e-07, + "loss": 0.0001, + "reward": 1.7392858266830444, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 10524 + }, + { + "completion_length": 253.42411708831787, + "epoch": 1.7648266901378935, + "grad_norm": 0.18620437703841827, + "kl": 0.0916748046875, + "learning_rate": 4.969886228510442e-07, + "loss": 0.0001, + "reward": 1.7285714894533157, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7375000268220901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10526 + }, + { + "completion_length": 251.94197845458984, + "epoch": 1.765161993377761, + "grad_norm": 0.2205986553207691, + "kl": 0.0987396240234375, + "learning_rate": 4.969867662869503e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 10528 + }, + { + "completion_length": 246.54019165039062, + "epoch": 1.7654972966176286, + "grad_norm": 0.1830755227045685, + "kl": 0.120941162109375, + "learning_rate": 4.969849091542014e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 10530 + }, + { + "completion_length": 240.3125123977661, + "epoch": 1.7658325998574962, + "grad_norm": 0.15024188183011833, + "kl": 0.1271514892578125, + "learning_rate": 4.969830514528016e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7696428839117289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10532 + }, + { + "completion_length": 238.25447368621826, + "epoch": 1.7661679030973638, + "grad_norm": 0.149959763615365, + "kl": 0.091094970703125, + "learning_rate": 4.969811931827552e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 10534 + }, + { + "completion_length": 246.3928689956665, + "epoch": 1.7665032063372312, + "grad_norm": 0.3019943513178325, + "kl": 0.098297119140625, + "learning_rate": 4.969793343440666e-07, + "loss": 0.0001, + "reward": 1.8035714998841286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 10536 + }, + { + "completion_length": 238.12500858306885, + "epoch": 1.7668385095770986, + "grad_norm": 0.3349463351982298, + "kl": 0.09002685546875, + "learning_rate": 4.969774749367401e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10538 + }, + { + "completion_length": 232.12054538726807, + "epoch": 1.7671738128169663, + "grad_norm": 0.36886859954996376, + "kl": 0.0915985107421875, + "learning_rate": 4.969756149607796e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 10540 + }, + { + "completion_length": 242.53572750091553, + "epoch": 1.767509116056834, + "grad_norm": 0.2734354801264316, + "kl": 0.1054229736328125, + "learning_rate": 4.969737544161899e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 10542 + }, + { + "completion_length": 246.2500114440918, + "epoch": 1.7678444192967016, + "grad_norm": 0.004447381062977461, + "kl": 0.0968017578125, + "learning_rate": 4.96971893302975e-07, + "loss": 0.0001, + "reward": 1.6928572058677673, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.6928571686148643, + "rewards/format_reward_func": 1.0, + "step": 10544 + }, + { + "completion_length": 237.5000123977661, + "epoch": 1.768179722536569, + "grad_norm": 0.1948318054506246, + "kl": 0.115692138671875, + "learning_rate": 4.969700316211392e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 10546 + }, + { + "completion_length": 239.33929634094238, + "epoch": 1.7685150257764366, + "grad_norm": 0.3184589691220786, + "kl": 0.099212646484375, + "learning_rate": 4.969681693706868e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 10548 + }, + { + "completion_length": 235.21429538726807, + "epoch": 1.768850329016304, + "grad_norm": 0.20532648968982828, + "kl": 0.129791259765625, + "learning_rate": 4.969663065516222e-07, + "loss": 0.0001, + "reward": 1.7803572341799736, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7839286029338837, + "rewards/format_reward_func": 0.9964285716414452, + "step": 10550 + }, + { + "completion_length": 231.64286708831787, + "epoch": 1.7691856322561716, + "grad_norm": 0.24991697110021083, + "kl": 0.096435546875, + "learning_rate": 4.969644431639495e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 10552 + }, + { + "completion_length": 234.38393878936768, + "epoch": 1.7695209354960393, + "grad_norm": 0.21260438942552592, + "kl": 0.11773681640625, + "learning_rate": 4.969625792076731e-07, + "loss": 0.0001, + "reward": 1.692857213318348, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.6928571686148643, + "rewards/format_reward_func": 1.0, + "step": 10554 + }, + { + "completion_length": 238.30358123779297, + "epoch": 1.769856238735907, + "grad_norm": 0.1314157473470005, + "kl": 0.099853515625, + "learning_rate": 4.969607146827972e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 10556 + }, + { + "completion_length": 234.6071548461914, + "epoch": 1.7701915419757743, + "grad_norm": 0.12255947847610604, + "kl": 0.11346435546875, + "learning_rate": 4.969588495893263e-07, + "loss": 0.0001, + "reward": 1.8017857670783997, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8053571842610836, + "rewards/format_reward_func": 0.9964285716414452, + "step": 10558 + }, + { + "completion_length": 232.83483028411865, + "epoch": 1.7705268452156417, + "grad_norm": 0.3855074347280174, + "kl": 0.141265869140625, + "learning_rate": 4.969569839272645e-07, + "loss": 0.0001, + "reward": 1.7089286670088768, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7125000320374966, + "rewards/format_reward_func": 0.9964285790920258, + "step": 10560 + }, + { + "completion_length": 233.3303680419922, + "epoch": 1.7708621484555094, + "grad_norm": 0.18710436318418036, + "kl": 0.1106109619140625, + "learning_rate": 4.969551176966162e-07, + "loss": 0.0001, + "reward": 1.7928572297096252, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 10562 + }, + { + "completion_length": 237.5669765472412, + "epoch": 1.771197451695377, + "grad_norm": 0.2694915854598506, + "kl": 0.1116485595703125, + "learning_rate": 4.969532508973856e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 10564 + }, + { + "completion_length": 249.62054634094238, + "epoch": 1.7715327549352446, + "grad_norm": 0.18554608403376768, + "kl": 0.2052001953125, + "learning_rate": 4.969513835295771e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 10566 + }, + { + "completion_length": 248.68751049041748, + "epoch": 1.7718680581751123, + "grad_norm": 0.14565025528136757, + "kl": 0.138702392578125, + "learning_rate": 4.96949515593195e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428805589676, + "rewards/format_reward_func": 1.0, + "step": 10568 + }, + { + "completion_length": 260.7232275009155, + "epoch": 1.7722033614149797, + "grad_norm": 0.2558570624218233, + "kl": 0.375213623046875, + "learning_rate": 4.969476470882435e-07, + "loss": 0.0004, + "reward": 1.7821428999304771, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428868919611, + "rewards/format_reward_func": 1.0, + "step": 10570 + }, + { + "completion_length": 255.16072463989258, + "epoch": 1.772538664654847, + "grad_norm": 0.2710318747661508, + "kl": 0.5247802734375, + "learning_rate": 4.969457780147268e-07, + "loss": 0.0005, + "reward": 1.717857226729393, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 10572 + }, + { + "completion_length": 239.89733219146729, + "epoch": 1.7728739678947147, + "grad_norm": 0.1223392200473689, + "kl": 0.10552978515625, + "learning_rate": 4.969439083726496e-07, + "loss": 0.0001, + "reward": 1.8357143327593803, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8357143178582191, + "rewards/format_reward_func": 1.0, + "step": 10574 + }, + { + "completion_length": 257.7053689956665, + "epoch": 1.7732092711345824, + "grad_norm": 0.3783049950951984, + "kl": 0.526641845703125, + "learning_rate": 4.969420381620158e-07, + "loss": 0.0005, + "reward": 1.7339286506175995, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7473214641213417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10576 + }, + { + "completion_length": 256.7142972946167, + "epoch": 1.77354457437445, + "grad_norm": 0.23260212557444576, + "kl": 0.114898681640625, + "learning_rate": 4.9694016738283e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 10578 + }, + { + "completion_length": 254.59376335144043, + "epoch": 1.7738798776143174, + "grad_norm": 0.31166389144220735, + "kl": 0.380523681640625, + "learning_rate": 4.969382960350962e-07, + "loss": 0.0004, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 10580 + }, + { + "completion_length": 270.8705472946167, + "epoch": 1.774215180854185, + "grad_norm": 0.16185259134789332, + "kl": 0.2837677001953125, + "learning_rate": 4.969364241188191e-07, + "loss": 0.0003, + "reward": 1.6928572282195091, + "reward_std": 0.08081220462918282, + "rewards/equation_reward_func": 0.7196428887546062, + "rewards/format_reward_func": 0.9732142984867096, + "step": 10582 + }, + { + "completion_length": 262.33036708831787, + "epoch": 1.7745504840940525, + "grad_norm": 0.20630636229383023, + "kl": 0.138214111328125, + "learning_rate": 4.969345516340026e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.10354063473641872, + "rewards/equation_reward_func": 0.8044643104076385, + "rewards/format_reward_func": 0.9687500149011612, + "step": 10584 + }, + { + "completion_length": 266.24108505249023, + "epoch": 1.77488578733392, + "grad_norm": 0.22700971231892747, + "kl": 0.325653076171875, + "learning_rate": 4.969326785806513e-07, + "loss": 0.0003, + "reward": 1.7053571864962578, + "reward_std": 0.123743686825037, + "rewards/equation_reward_func": 0.736607177183032, + "rewards/format_reward_func": 0.9687500149011612, + "step": 10586 + }, + { + "completion_length": 255.2634048461914, + "epoch": 1.7752210905737877, + "grad_norm": 0.31500744912161194, + "kl": 0.1110076904296875, + "learning_rate": 4.969308049587694e-07, + "loss": 0.0001, + "reward": 1.7071429044008255, + "reward_std": 0.08081220369786024, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10588 + }, + { + "completion_length": 269.6607275009155, + "epoch": 1.7755563938136554, + "grad_norm": 0.29098020918540685, + "kl": 0.1125640869140625, + "learning_rate": 4.969289307683612e-07, + "loss": 0.0001, + "reward": 1.698214367032051, + "reward_std": 0.10354063659906387, + "rewards/equation_reward_func": 0.7205357439815998, + "rewards/format_reward_func": 0.977678582072258, + "step": 10590 + }, + { + "completion_length": 266.28125953674316, + "epoch": 1.7758916970535228, + "grad_norm": 0.3319292014716968, + "kl": 0.608734130859375, + "learning_rate": 4.969270560094311e-07, + "loss": 0.0006, + "reward": 1.6875000596046448, + "reward_std": 0.1085914010182023, + "rewards/equation_reward_func": 0.7187500447034836, + "rewards/format_reward_func": 0.9687500149011612, + "step": 10592 + }, + { + "completion_length": 261.5401945114136, + "epoch": 1.7762270002933902, + "grad_norm": 0.26305753900705553, + "kl": 0.69952392578125, + "learning_rate": 4.969251806819834e-07, + "loss": 0.0007, + "reward": 1.7392857745289803, + "reward_std": 0.10606601648032665, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9732142984867096, + "step": 10594 + }, + { + "completion_length": 256.352689743042, + "epoch": 1.7765623035332578, + "grad_norm": 0.11624276670860509, + "kl": 0.237396240234375, + "learning_rate": 4.969233047860223e-07, + "loss": 0.0002, + "reward": 1.73214291036129, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9830357208848, + "step": 10596 + }, + { + "completion_length": 266.1116180419922, + "epoch": 1.7768976067731255, + "grad_norm": 0.24888262657285953, + "kl": 0.37750244140625, + "learning_rate": 4.969214283215523e-07, + "loss": 0.0004, + "reward": 1.700000062584877, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7089285850524902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10598 + }, + { + "completion_length": 255.25894165039062, + "epoch": 1.777232910012993, + "grad_norm": 0.3873078676136931, + "kl": 0.159027099609375, + "learning_rate": 4.969195512885775e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428753435612, + "rewards/format_reward_func": 1.0, + "step": 10600 + }, + { + "completion_length": 258.6294775009155, + "epoch": 1.7775682132528605, + "grad_norm": 0.16038748989821341, + "kl": 0.148468017578125, + "learning_rate": 4.969176736871024e-07, + "loss": 0.0001, + "reward": 1.7142858132719994, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 10602 + }, + { + "completion_length": 252.9553680419922, + "epoch": 1.7779035164927282, + "grad_norm": 0.2520951190407464, + "kl": 0.157440185546875, + "learning_rate": 4.969157955171313e-07, + "loss": 0.0002, + "reward": 1.7000000476837158, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.708928607404232, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10604 + }, + { + "completion_length": 247.06697463989258, + "epoch": 1.7782388197325956, + "grad_norm": 0.26364201619762373, + "kl": 0.3067626953125, + "learning_rate": 4.969139167786684e-07, + "loss": 0.0003, + "reward": 1.7928572222590446, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 10606 + }, + { + "completion_length": 245.12947463989258, + "epoch": 1.7785741229724632, + "grad_norm": 0.1645764154368642, + "kl": 0.153106689453125, + "learning_rate": 4.969120374717182e-07, + "loss": 0.0002, + "reward": 1.7678572162985802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 10608 + }, + { + "completion_length": 237.33929538726807, + "epoch": 1.7789094262123308, + "grad_norm": 0.16426994973474854, + "kl": 0.1124267578125, + "learning_rate": 4.969101575962849e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 10610 + }, + { + "completion_length": 247.6964406967163, + "epoch": 1.7792447294521985, + "grad_norm": 0.182551150654352, + "kl": 0.1142730712890625, + "learning_rate": 4.969082771523728e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10612 + }, + { + "completion_length": 247.01786518096924, + "epoch": 1.7795800326920659, + "grad_norm": 0.23539622917480219, + "kl": 0.1351470947265625, + "learning_rate": 4.969063961399865e-07, + "loss": 0.0001, + "reward": 1.7866071984171867, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.7883928902447224, + "rewards/format_reward_func": 0.9982142895460129, + "step": 10614 + }, + { + "completion_length": 255.25893878936768, + "epoch": 1.7799153359319333, + "grad_norm": 0.20134669385634055, + "kl": 0.119110107421875, + "learning_rate": 4.9690451455913e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10616 + }, + { + "completion_length": 240.79911708831787, + "epoch": 1.780250639171801, + "grad_norm": 0.07603412774145471, + "kl": 0.10906982421875, + "learning_rate": 4.969026324098076e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 10618 + }, + { + "completion_length": 249.82590293884277, + "epoch": 1.7805859424116686, + "grad_norm": 0.26972691648450575, + "kl": 0.1138153076171875, + "learning_rate": 4.96900749692024e-07, + "loss": 0.0001, + "reward": 1.7776786237955093, + "reward_std": 0.07197336852550507, + "rewards/equation_reward_func": 0.7839285917580128, + "rewards/format_reward_func": 0.9937500059604645, + "step": 10620 + }, + { + "completion_length": 239.44197750091553, + "epoch": 1.7809212456515362, + "grad_norm": 0.2305440981797223, + "kl": 0.130126953125, + "learning_rate": 4.968988664057834e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 10622 + }, + { + "completion_length": 243.85268878936768, + "epoch": 1.7812565488914036, + "grad_norm": 0.26768835156722426, + "kl": 0.092803955078125, + "learning_rate": 4.968969825510899e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 10624 + }, + { + "completion_length": 247.4196548461914, + "epoch": 1.7815918521312712, + "grad_norm": 0.23568997635818786, + "kl": 0.155609130859375, + "learning_rate": 4.968950981279481e-07, + "loss": 0.0002, + "reward": 1.7553572058677673, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10626 + }, + { + "completion_length": 248.5937614440918, + "epoch": 1.7819271553711387, + "grad_norm": 0.2304402395430419, + "kl": 0.101806640625, + "learning_rate": 4.968932131363621e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7839285954833031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10628 + }, + { + "completion_length": 252.67858505249023, + "epoch": 1.7822624586110063, + "grad_norm": 0.14773177459543393, + "kl": 0.088470458984375, + "learning_rate": 4.968913275763365e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.762500025331974, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10630 + }, + { + "completion_length": 241.102689743042, + "epoch": 1.782597761850874, + "grad_norm": 0.23951759989016289, + "kl": 0.0987548828125, + "learning_rate": 4.968894414478756e-07, + "loss": 0.0001, + "reward": 1.817857213318348, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 10632 + }, + { + "completion_length": 259.4732246398926, + "epoch": 1.7829330650907416, + "grad_norm": 0.26787557518060745, + "kl": 0.1324462890625, + "learning_rate": 4.968875547509836e-07, + "loss": 0.0001, + "reward": 1.7696429267525673, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071619093418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10634 + }, + { + "completion_length": 251.00447750091553, + "epoch": 1.783268368330609, + "grad_norm": 0.5559621875801689, + "kl": 0.1373291015625, + "learning_rate": 4.968856674856648e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 10636 + }, + { + "completion_length": 244.40626049041748, + "epoch": 1.7836036715704764, + "grad_norm": 0.2907232267990311, + "kl": 0.132568359375, + "learning_rate": 4.968837796519238e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 10638 + }, + { + "completion_length": 245.6741180419922, + "epoch": 1.783938974810344, + "grad_norm": 0.2367535675360497, + "kl": 0.134552001953125, + "learning_rate": 4.968818912497647e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10640 + }, + { + "completion_length": 249.16072463989258, + "epoch": 1.7842742780502117, + "grad_norm": 0.279259478311908, + "kl": 0.3076171875, + "learning_rate": 4.968800022791921e-07, + "loss": 0.0003, + "reward": 1.751785770058632, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7651786021888256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10642 + }, + { + "completion_length": 244.25447463989258, + "epoch": 1.7846095812900793, + "grad_norm": 0.30117573873766484, + "kl": 0.3115234375, + "learning_rate": 4.9687811274021e-07, + "loss": 0.0003, + "reward": 1.7196429371833801, + "reward_std": 0.1035406356677413, + "rewards/equation_reward_func": 0.7330357506871223, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10644 + }, + { + "completion_length": 247.9375123977661, + "epoch": 1.784944884529947, + "grad_norm": 0.31154505385018966, + "kl": 0.31414794921875, + "learning_rate": 4.968762226328231e-07, + "loss": 0.0003, + "reward": 1.7392857819795609, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7482143118977547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10646 + }, + { + "completion_length": 252.08929634094238, + "epoch": 1.7852801877698143, + "grad_norm": 0.16945113473111542, + "kl": 0.0981597900390625, + "learning_rate": 4.968743319570354e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714399069548, + "rewards/format_reward_func": 1.0, + "step": 10648 + }, + { + "completion_length": 240.6160831451416, + "epoch": 1.7856154910096818, + "grad_norm": 0.22287418212263907, + "kl": 0.100494384765625, + "learning_rate": 4.968724407128516e-07, + "loss": 0.0001, + "reward": 1.8160714879631996, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8196428790688515, + "rewards/format_reward_func": 0.9964285716414452, + "step": 10650 + }, + { + "completion_length": 235.75893878936768, + "epoch": 1.7859507942495494, + "grad_norm": 0.1826597093835376, + "kl": 0.0981597900390625, + "learning_rate": 4.968705489002759e-07, + "loss": 0.0001, + "reward": 1.751785770058632, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10652 + }, + { + "completion_length": 238.5044755935669, + "epoch": 1.786286097489417, + "grad_norm": 0.1955615982514768, + "kl": 0.101715087890625, + "learning_rate": 4.968686565193127e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 10654 + }, + { + "completion_length": 247.62054443359375, + "epoch": 1.7866214007292847, + "grad_norm": 0.20413985650724134, + "kl": 0.120880126953125, + "learning_rate": 4.968667635699662e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.808928593993187, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10656 + }, + { + "completion_length": 247.2232255935669, + "epoch": 1.786956703969152, + "grad_norm": 0.2791594220333788, + "kl": 0.272247314453125, + "learning_rate": 4.968648700522411e-07, + "loss": 0.0003, + "reward": 1.6928571984171867, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6928571853786707, + "rewards/format_reward_func": 1.0, + "step": 10658 + }, + { + "completion_length": 234.5089406967163, + "epoch": 1.7872920072090197, + "grad_norm": 0.3520679227334401, + "kl": 0.1893463134765625, + "learning_rate": 4.968629759661414e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 10660 + }, + { + "completion_length": 242.85715293884277, + "epoch": 1.7876273104488871, + "grad_norm": 0.22464576239652737, + "kl": 0.2414398193359375, + "learning_rate": 4.968610813116716e-07, + "loss": 0.0002, + "reward": 1.7678572162985802, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7767857387661934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10662 + }, + { + "completion_length": 238.91072463989258, + "epoch": 1.7879626136887548, + "grad_norm": 0.2595509564574555, + "kl": 0.10626220703125, + "learning_rate": 4.96859186088836e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 10664 + }, + { + "completion_length": 231.3705472946167, + "epoch": 1.7882979169286224, + "grad_norm": 0.5999381979692516, + "kl": 0.176300048828125, + "learning_rate": 4.968572902976392e-07, + "loss": 0.0002, + "reward": 1.8178571835160255, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 10666 + }, + { + "completion_length": 243.08482933044434, + "epoch": 1.78863322016849, + "grad_norm": 0.45039498742021106, + "kl": 0.152374267578125, + "learning_rate": 4.968553939380852e-07, + "loss": 0.0002, + "reward": 1.7696429416537285, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7741071581840515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10668 + }, + { + "completion_length": 238.62055015563965, + "epoch": 1.7889685234083574, + "grad_norm": 0.31604306983363795, + "kl": 0.114898681640625, + "learning_rate": 4.968534970101786e-07, + "loss": 0.0001, + "reward": 1.8392857685685158, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8392857275903225, + "rewards/format_reward_func": 1.0, + "step": 10670 + }, + { + "completion_length": 239.0000123977661, + "epoch": 1.7893038266482248, + "grad_norm": 0.1465661738385793, + "kl": 0.151458740234375, + "learning_rate": 4.968515995139237e-07, + "loss": 0.0002, + "reward": 1.803571492433548, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 10672 + }, + { + "completion_length": 244.06697750091553, + "epoch": 1.7896391298880925, + "grad_norm": 0.29623229382166644, + "kl": 0.254180908203125, + "learning_rate": 4.968497014493251e-07, + "loss": 0.0003, + "reward": 1.7482143267989159, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7616071812808514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10674 + }, + { + "completion_length": 245.59376335144043, + "epoch": 1.7899744331279601, + "grad_norm": 0.26622090051005864, + "kl": 0.1448974609375, + "learning_rate": 4.968478028163867e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10676 + }, + { + "completion_length": 246.6607255935669, + "epoch": 1.7903097363678278, + "grad_norm": 0.16048621493513235, + "kl": 0.18072509765625, + "learning_rate": 4.968459036151132e-07, + "loss": 0.0002, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143241912127, + "rewards/format_reward_func": 1.0, + "step": 10678 + }, + { + "completion_length": 240.69197368621826, + "epoch": 1.7906450396076952, + "grad_norm": 0.1796921365565492, + "kl": 0.12127685546875, + "learning_rate": 4.96844003845509e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000469386578, + "rewards/format_reward_func": 1.0, + "step": 10680 + }, + { + "completion_length": 227.19643688201904, + "epoch": 1.7909803428475628, + "grad_norm": 0.14189694182681398, + "kl": 0.176177978515625, + "learning_rate": 4.968421035075784e-07, + "loss": 0.0002, + "reward": 1.7946429252624512, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10682 + }, + { + "completion_length": 240.73661518096924, + "epoch": 1.7913156460874302, + "grad_norm": 0.2688246315021042, + "kl": 0.352508544921875, + "learning_rate": 4.968402026013256e-07, + "loss": 0.0004, + "reward": 1.7517858073115349, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10684 + }, + { + "completion_length": 234.1294755935669, + "epoch": 1.7916509493272978, + "grad_norm": 0.3868588654613052, + "kl": 0.184417724609375, + "learning_rate": 4.968383011267553e-07, + "loss": 0.0002, + "reward": 1.7785715088248253, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 10686 + }, + { + "completion_length": 235.66518878936768, + "epoch": 1.7919862525671655, + "grad_norm": 0.23331563541205555, + "kl": 0.129638671875, + "learning_rate": 4.968363990838716e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 10688 + }, + { + "completion_length": 239.1384048461914, + "epoch": 1.7923215558070331, + "grad_norm": 0.14848342113524915, + "kl": 0.100128173828125, + "learning_rate": 4.96834496472679e-07, + "loss": 0.0001, + "reward": 1.8107143640518188, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 10690 + }, + { + "completion_length": 235.27679347991943, + "epoch": 1.7926568590469005, + "grad_norm": 0.12301443116228022, + "kl": 0.1092529296875, + "learning_rate": 4.968325932931819e-07, + "loss": 0.0001, + "reward": 1.7821428924798965, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428831666708, + "rewards/format_reward_func": 1.0, + "step": 10692 + }, + { + "completion_length": 245.8437623977661, + "epoch": 1.792992162286768, + "grad_norm": 0.24794920535729859, + "kl": 0.235076904296875, + "learning_rate": 4.968306895453846e-07, + "loss": 0.0002, + "reward": 1.7142857983708382, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 10694 + }, + { + "completion_length": 239.08036613464355, + "epoch": 1.7933274655266356, + "grad_norm": 0.7665548328413366, + "kl": 0.161376953125, + "learning_rate": 4.968287852292916e-07, + "loss": 0.0002, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 10696 + }, + { + "completion_length": 240.67858600616455, + "epoch": 1.7936627687665032, + "grad_norm": 0.26873292722443687, + "kl": 0.1585845947265625, + "learning_rate": 4.968268803449072e-07, + "loss": 0.0002, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 10698 + }, + { + "completion_length": 239.7053689956665, + "epoch": 1.7939980720063708, + "grad_norm": 0.36024769526209427, + "kl": 0.099151611328125, + "learning_rate": 4.968249748922358e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 10700 + }, + { + "completion_length": 235.33037090301514, + "epoch": 1.7943333752462385, + "grad_norm": 0.1891792212945995, + "kl": 0.097320556640625, + "learning_rate": 4.968230688712818e-07, + "loss": 0.0001, + "reward": 1.721428669989109, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 10702 + }, + { + "completion_length": 223.64733123779297, + "epoch": 1.794668678486106, + "grad_norm": 0.18664824171925037, + "kl": 0.09637451171875, + "learning_rate": 4.968211622820495e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 10704 + }, + { + "completion_length": 230.7187614440918, + "epoch": 1.7950039817259733, + "grad_norm": 0.19383546116090142, + "kl": 0.1127166748046875, + "learning_rate": 4.968192551245435e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 10706 + }, + { + "completion_length": 231.93750953674316, + "epoch": 1.795339284965841, + "grad_norm": 0.2077794241490124, + "kl": 0.093048095703125, + "learning_rate": 4.968173473987681e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 10708 + }, + { + "completion_length": 234.94643783569336, + "epoch": 1.7956745882057086, + "grad_norm": 0.21449086730659028, + "kl": 0.096038818359375, + "learning_rate": 4.968154391047274e-07, + "loss": 0.0001, + "reward": 1.7714286595582962, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 10710 + }, + { + "completion_length": 238.0491180419922, + "epoch": 1.7960098914455762, + "grad_norm": 0.29946112981399303, + "kl": 0.09326171875, + "learning_rate": 4.968135302424262e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 10712 + }, + { + "completion_length": 236.37054634094238, + "epoch": 1.7963451946854436, + "grad_norm": 0.11589060979527094, + "kl": 0.091156005859375, + "learning_rate": 4.968116208118688e-07, + "loss": 0.0001, + "reward": 1.8080357685685158, + "reward_std": 0.059346460737288, + "rewards/equation_reward_func": 0.8098214641213417, + "rewards/format_reward_func": 0.9982142895460129, + "step": 10714 + }, + { + "completion_length": 245.41965198516846, + "epoch": 1.796680497925311, + "grad_norm": 0.5277318509362663, + "kl": 0.11151123046875, + "learning_rate": 4.968097108130595e-07, + "loss": 0.0001, + "reward": 1.7250000983476639, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000424683094, + "rewards/format_reward_func": 1.0, + "step": 10716 + }, + { + "completion_length": 237.00447368621826, + "epoch": 1.7970158011651787, + "grad_norm": 0.4631048577308088, + "kl": 0.155487060546875, + "learning_rate": 4.968078002460027e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 10718 + }, + { + "completion_length": 232.86161613464355, + "epoch": 1.7973511044050463, + "grad_norm": 0.31052716637182964, + "kl": 0.121002197265625, + "learning_rate": 4.968058891107029e-07, + "loss": 0.0001, + "reward": 1.7794643267989159, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.7812500279396772, + "rewards/format_reward_func": 0.9982142895460129, + "step": 10720 + }, + { + "completion_length": 240.9241180419922, + "epoch": 1.797686407644914, + "grad_norm": 0.49598330005641567, + "kl": 0.12677001953125, + "learning_rate": 4.968039774071644e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 10722 + }, + { + "completion_length": 236.0134048461914, + "epoch": 1.7980217108847816, + "grad_norm": 0.40670055538330085, + "kl": 0.1128997802734375, + "learning_rate": 4.968020651353916e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 10724 + }, + { + "completion_length": 230.78572463989258, + "epoch": 1.798357014124649, + "grad_norm": 0.2347570156213548, + "kl": 0.13226318359375, + "learning_rate": 4.968001522953889e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 10726 + }, + { + "completion_length": 238.46429634094238, + "epoch": 1.7986923173645164, + "grad_norm": 0.15634411925731584, + "kl": 0.101654052734375, + "learning_rate": 4.967982388871608e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 10728 + }, + { + "completion_length": 240.1785831451416, + "epoch": 1.799027620604384, + "grad_norm": 0.1919579346758296, + "kl": 0.098297119140625, + "learning_rate": 4.967963249107117e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286129921675, + "rewards/format_reward_func": 1.0, + "step": 10730 + }, + { + "completion_length": 236.01340293884277, + "epoch": 1.7993629238442517, + "grad_norm": 0.19750935368932737, + "kl": 0.104766845703125, + "learning_rate": 4.967944103660458e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 10732 + }, + { + "completion_length": 242.94197463989258, + "epoch": 1.7996982270841193, + "grad_norm": 0.18550836957480152, + "kl": 0.106964111328125, + "learning_rate": 4.967924952531678e-07, + "loss": 0.0001, + "reward": 1.753571480512619, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 10734 + }, + { + "completion_length": 239.72769165039062, + "epoch": 1.8000335303239867, + "grad_norm": 0.14716396585075947, + "kl": 0.11041259765625, + "learning_rate": 4.96790579572082e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7535714562982321, + "rewards/format_reward_func": 1.0, + "step": 10736 + }, + { + "completion_length": 234.71429634094238, + "epoch": 1.8003688335638544, + "grad_norm": 0.1392211187955793, + "kl": 0.102142333984375, + "learning_rate": 4.967886633227927e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 10738 + }, + { + "completion_length": 236.9375114440918, + "epoch": 1.8007041368037218, + "grad_norm": 0.2994866979627325, + "kl": 0.105682373046875, + "learning_rate": 4.967867465053044e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10740 + }, + { + "completion_length": 234.30804634094238, + "epoch": 1.8010394400435894, + "grad_norm": 0.18439435986178013, + "kl": 0.102508544921875, + "learning_rate": 4.967848291196216e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 10742 + }, + { + "completion_length": 244.1160831451416, + "epoch": 1.801374743283457, + "grad_norm": 0.19176630269728612, + "kl": 0.114044189453125, + "learning_rate": 4.967829111657485e-07, + "loss": 0.0001, + "reward": 1.7071429267525673, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7071428839117289, + "rewards/format_reward_func": 1.0, + "step": 10744 + }, + { + "completion_length": 249.7500123977661, + "epoch": 1.8017100465233247, + "grad_norm": 0.3886127234059232, + "kl": 0.1446533203125, + "learning_rate": 4.967809926436897e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786126196384, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10746 + }, + { + "completion_length": 240.1875114440918, + "epoch": 1.802045349763192, + "grad_norm": 0.25582454254449444, + "kl": 0.12982177734375, + "learning_rate": 4.967790735534495e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 10748 + }, + { + "completion_length": 243.196439743042, + "epoch": 1.8023806530030595, + "grad_norm": 0.1093102806927311, + "kl": 0.12841796875, + "learning_rate": 4.967771538950325e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143022119999, + "rewards/format_reward_func": 1.0, + "step": 10750 + }, + { + "completion_length": 233.93304538726807, + "epoch": 1.8027159562429271, + "grad_norm": 0.2041405389472402, + "kl": 0.097808837890625, + "learning_rate": 4.967752336684428e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 10752 + }, + { + "completion_length": 243.42857933044434, + "epoch": 1.8030512594827948, + "grad_norm": 0.31358890129485795, + "kl": 0.1039886474609375, + "learning_rate": 4.967733128736852e-07, + "loss": 0.0001, + "reward": 1.7696429044008255, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7830357551574707, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10754 + }, + { + "completion_length": 252.57144165039062, + "epoch": 1.8033865627226624, + "grad_norm": 0.5249165360425068, + "kl": 0.2020263671875, + "learning_rate": 4.967713915107639e-07, + "loss": 0.0002, + "reward": 1.7062500938773155, + "reward_std": 0.06187184248119593, + "rewards/equation_reward_func": 0.7125000283122063, + "rewards/format_reward_func": 0.9937500059604645, + "step": 10756 + }, + { + "completion_length": 249.23662090301514, + "epoch": 1.8037218659625298, + "grad_norm": 0.0033794405458902822, + "kl": 0.105987548828125, + "learning_rate": 4.967694695796833e-07, + "loss": 0.0001, + "reward": 1.7357143387198448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143145054579, + "rewards/format_reward_func": 1.0, + "step": 10758 + }, + { + "completion_length": 249.14733219146729, + "epoch": 1.8040571692023974, + "grad_norm": 0.25247943380170706, + "kl": 0.135498046875, + "learning_rate": 4.967675470804479e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10760 + }, + { + "completion_length": 258.09822940826416, + "epoch": 1.8043924724422649, + "grad_norm": 0.32874928742714987, + "kl": 0.146820068359375, + "learning_rate": 4.967656240130621e-07, + "loss": 0.0001, + "reward": 1.7107143849134445, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7107143104076385, + "rewards/format_reward_func": 1.0, + "step": 10762 + }, + { + "completion_length": 245.71876335144043, + "epoch": 1.8047277756821325, + "grad_norm": 0.2009743767668466, + "kl": 0.3307342529296875, + "learning_rate": 4.967637003775303e-07, + "loss": 0.0003, + "reward": 1.7857143506407738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 10764 + }, + { + "completion_length": 254.3928689956665, + "epoch": 1.8050630789220001, + "grad_norm": 0.24327630118252214, + "kl": 0.181610107421875, + "learning_rate": 4.967617761738571e-07, + "loss": 0.0002, + "reward": 1.7000000849366188, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7000000346451998, + "rewards/format_reward_func": 1.0, + "step": 10766 + }, + { + "completion_length": 247.15626049041748, + "epoch": 1.8053983821618678, + "grad_norm": 0.4902753874075953, + "kl": 0.193634033203125, + "learning_rate": 4.967598514020467e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 10768 + }, + { + "completion_length": 251.9866180419922, + "epoch": 1.8057336854017352, + "grad_norm": 0.22101468467873372, + "kl": 0.237457275390625, + "learning_rate": 4.967579260621036e-07, + "loss": 0.0002, + "reward": 1.7589286416769028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10770 + }, + { + "completion_length": 262.8035840988159, + "epoch": 1.8060689886416026, + "grad_norm": 0.38687489836976613, + "kl": 0.288330078125, + "learning_rate": 4.967560001540324e-07, + "loss": 0.0003, + "reward": 1.7178572118282318, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.71785718947649, + "rewards/format_reward_func": 1.0, + "step": 10772 + }, + { + "completion_length": 258.84375858306885, + "epoch": 1.8064042918814702, + "grad_norm": 0.1725018185461603, + "kl": 0.2459564208984375, + "learning_rate": 4.967540736778373e-07, + "loss": 0.0002, + "reward": 1.7500000968575478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 10774 + }, + { + "completion_length": 245.78126525878906, + "epoch": 1.8067395951213379, + "grad_norm": 0.12436038886983099, + "kl": 0.120635986328125, + "learning_rate": 4.967521466335228e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10776 + }, + { + "completion_length": 259.9955472946167, + "epoch": 1.8070748983612055, + "grad_norm": 0.3770273477501551, + "kl": 0.53118896484375, + "learning_rate": 4.967502190210934e-07, + "loss": 0.0005, + "reward": 1.7482143715023994, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10778 + }, + { + "completion_length": 242.696439743042, + "epoch": 1.8074102016010731, + "grad_norm": 1.098628616791553, + "kl": 0.73077392578125, + "learning_rate": 4.967482908405536e-07, + "loss": 0.0007, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 10780 + }, + { + "completion_length": 244.02233123779297, + "epoch": 1.8077455048409405, + "grad_norm": 0.3056318524063688, + "kl": 0.1494140625, + "learning_rate": 4.967463620919077e-07, + "loss": 0.0001, + "reward": 1.7035715132951736, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7035714611411095, + "rewards/format_reward_func": 1.0, + "step": 10782 + }, + { + "completion_length": 251.45536708831787, + "epoch": 1.808080808080808, + "grad_norm": 0.2017605345416473, + "kl": 0.2183837890625, + "learning_rate": 4.967444327751601e-07, + "loss": 0.0002, + "reward": 1.74642863124609, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7553571723401546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10784 + }, + { + "completion_length": 246.1384038925171, + "epoch": 1.8084161113206756, + "grad_norm": 0.17238070059866234, + "kl": 0.24993896484375, + "learning_rate": 4.967425028903153e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857436090708, + "rewards/format_reward_func": 1.0, + "step": 10786 + }, + { + "completion_length": 247.71429538726807, + "epoch": 1.8087514145605432, + "grad_norm": 0.21629355159149774, + "kl": 0.41064453125, + "learning_rate": 4.96740572437378e-07, + "loss": 0.0004, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 10788 + }, + { + "completion_length": 242.66965579986572, + "epoch": 1.8090867178004109, + "grad_norm": 0.3175852033098951, + "kl": 0.1038665771484375, + "learning_rate": 4.967386414163522e-07, + "loss": 0.0001, + "reward": 1.8196429461240768, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071701049805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10790 + }, + { + "completion_length": 255.8884038925171, + "epoch": 1.8094220210402783, + "grad_norm": 0.32156099273712735, + "kl": 0.1937713623046875, + "learning_rate": 4.967367098272427e-07, + "loss": 0.0002, + "reward": 1.7410715073347092, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.754464328289032, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10792 + }, + { + "completion_length": 255.883939743042, + "epoch": 1.809757324280146, + "grad_norm": 0.0992087667618495, + "kl": 0.1259918212890625, + "learning_rate": 4.967347776700538e-07, + "loss": 0.0001, + "reward": 1.6910714656114578, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7133928891271353, + "rewards/format_reward_func": 0.977678582072258, + "step": 10794 + }, + { + "completion_length": 247.7053680419922, + "epoch": 1.8100926275200133, + "grad_norm": 0.18729247886434083, + "kl": 0.1168670654296875, + "learning_rate": 4.967328449447898e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.787500012665987, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10796 + }, + { + "completion_length": 260.98215675354004, + "epoch": 1.810427930759881, + "grad_norm": 0.21310321098668664, + "kl": 0.1923980712890625, + "learning_rate": 4.967309116514555e-07, + "loss": 0.0002, + "reward": 1.7964286133646965, + "reward_std": 0.035355339758098125, + "rewards/equation_reward_func": 0.8053571805357933, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10798 + }, + { + "completion_length": 256.29911708831787, + "epoch": 1.8107632339997486, + "grad_norm": 0.3038643309134824, + "kl": 0.1059722900390625, + "learning_rate": 4.967289777900551e-07, + "loss": 0.0001, + "reward": 1.6625000908970833, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.6758928969502449, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10800 + }, + { + "completion_length": 240.25001049041748, + "epoch": 1.8110985372396162, + "grad_norm": 0.283155134206445, + "kl": 0.132720947265625, + "learning_rate": 4.96727043360593e-07, + "loss": 0.0001, + "reward": 1.7196429520845413, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10802 + }, + { + "completion_length": 237.68750953674316, + "epoch": 1.8114338404794836, + "grad_norm": 0.08161599208536394, + "kl": 0.1114654541015625, + "learning_rate": 4.967251083630739e-07, + "loss": 0.0001, + "reward": 1.785714328289032, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 10804 + }, + { + "completion_length": 246.34822273254395, + "epoch": 1.811769143719351, + "grad_norm": 0.23240967627608058, + "kl": 0.122589111328125, + "learning_rate": 4.967231727975021e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10806 + }, + { + "completion_length": 248.69197463989258, + "epoch": 1.8121044469592187, + "grad_norm": 0.1302638213649644, + "kl": 0.102691650390625, + "learning_rate": 4.967212366638821e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7553571723401546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10808 + }, + { + "completion_length": 249.09822463989258, + "epoch": 1.8124397501990863, + "grad_norm": 0.309884045882955, + "kl": 0.0946807861328125, + "learning_rate": 4.967192999622183e-07, + "loss": 0.0001, + "reward": 1.7375000789761543, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10810 + }, + { + "completion_length": 233.42858028411865, + "epoch": 1.812775053438954, + "grad_norm": 0.23595094134065409, + "kl": 0.10211181640625, + "learning_rate": 4.967173626925152e-07, + "loss": 0.0001, + "reward": 1.7589286118745804, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10812 + }, + { + "completion_length": 245.852689743042, + "epoch": 1.8131103566788214, + "grad_norm": 0.32348892322344996, + "kl": 0.143402099609375, + "learning_rate": 4.967154248547773e-07, + "loss": 0.0001, + "reward": 1.751785770058632, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10814 + }, + { + "completion_length": 227.3035831451416, + "epoch": 1.813445659918689, + "grad_norm": 0.07325437364817967, + "kl": 0.0951690673828125, + "learning_rate": 4.967134864490089e-07, + "loss": 0.0001, + "reward": 1.782142885029316, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7821428943425417, + "rewards/format_reward_func": 1.0, + "step": 10816 + }, + { + "completion_length": 239.6428689956665, + "epoch": 1.8137809631585564, + "grad_norm": 0.25369394576949483, + "kl": 0.090789794921875, + "learning_rate": 4.967115474752146e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 10818 + }, + { + "completion_length": 241.89733219146729, + "epoch": 1.814116266398424, + "grad_norm": 0.18867217775157238, + "kl": 0.130615234375, + "learning_rate": 4.967096079333989e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 10820 + }, + { + "completion_length": 235.8750123977661, + "epoch": 1.8144515696382917, + "grad_norm": 0.21150017673269703, + "kl": 0.095611572265625, + "learning_rate": 4.967076678235662e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 10822 + }, + { + "completion_length": 251.977689743042, + "epoch": 1.8147868728781593, + "grad_norm": 0.13915586483642634, + "kl": 0.124359130859375, + "learning_rate": 4.96705727145721e-07, + "loss": 0.0001, + "reward": 1.783928632736206, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9964285716414452, + "step": 10824 + }, + { + "completion_length": 236.93304634094238, + "epoch": 1.8151221761180267, + "grad_norm": 0.32561266503298675, + "kl": 0.105865478515625, + "learning_rate": 4.967037858998677e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 10826 + }, + { + "completion_length": 252.33929634094238, + "epoch": 1.8154574793578941, + "grad_norm": 0.28224839859070017, + "kl": 0.1246337890625, + "learning_rate": 4.967018440860109e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.8232143074274063, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10828 + }, + { + "completion_length": 253.1205472946167, + "epoch": 1.8157927825977618, + "grad_norm": 0.15807833293193896, + "kl": 0.157958984375, + "learning_rate": 4.966999017041549e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857473343611, + "rewards/format_reward_func": 1.0, + "step": 10830 + }, + { + "completion_length": 251.11608505249023, + "epoch": 1.8161280858376294, + "grad_norm": 0.20844670553261815, + "kl": 0.195587158203125, + "learning_rate": 4.966979587543043e-07, + "loss": 0.0002, + "reward": 1.735714353621006, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143256813288, + "rewards/format_reward_func": 1.0, + "step": 10832 + }, + { + "completion_length": 250.04465198516846, + "epoch": 1.816463389077497, + "grad_norm": 0.22690609755391183, + "kl": 0.124908447265625, + "learning_rate": 4.966960152364635e-07, + "loss": 0.0001, + "reward": 1.7410714700818062, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10834 + }, + { + "completion_length": 255.36608409881592, + "epoch": 1.8167986923173647, + "grad_norm": 0.1446434148430334, + "kl": 0.151519775390625, + "learning_rate": 4.96694071150637e-07, + "loss": 0.0002, + "reward": 1.7446429207921028, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7580357454717159, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10836 + }, + { + "completion_length": 250.4866189956665, + "epoch": 1.817133995557232, + "grad_norm": 0.47846980714272636, + "kl": 0.1998748779296875, + "learning_rate": 4.966921264968293e-07, + "loss": 0.0002, + "reward": 1.6732143759727478, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.6776786055415869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10838 + }, + { + "completion_length": 243.34375953674316, + "epoch": 1.8174692987970995, + "grad_norm": 0.13234759313529285, + "kl": 0.109649658203125, + "learning_rate": 4.966901812750448e-07, + "loss": 0.0001, + "reward": 1.8428571745753288, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8428571745753288, + "rewards/format_reward_func": 1.0, + "step": 10840 + }, + { + "completion_length": 242.13394165039062, + "epoch": 1.8178046020369671, + "grad_norm": 0.13118584337901576, + "kl": 0.1727294921875, + "learning_rate": 4.966882354852882e-07, + "loss": 0.0002, + "reward": 1.7589286267757416, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10842 + }, + { + "completion_length": 254.74554634094238, + "epoch": 1.8181399052768348, + "grad_norm": 0.19629673610097528, + "kl": 0.15948486328125, + "learning_rate": 4.966862891275637e-07, + "loss": 0.0002, + "reward": 1.7750000357627869, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7839286141097546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10844 + }, + { + "completion_length": 255.2276906967163, + "epoch": 1.8184752085167024, + "grad_norm": 0.21825418465981786, + "kl": 0.15118408203125, + "learning_rate": 4.966843422018758e-07, + "loss": 0.0002, + "reward": 1.771428644657135, + "reward_std": 0.10101525392383337, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10846 + }, + { + "completion_length": 246.93304920196533, + "epoch": 1.8188105117565698, + "grad_norm": 0.14504716918157162, + "kl": 0.1495361328125, + "learning_rate": 4.966823947082292e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10848 + }, + { + "completion_length": 253.6205472946167, + "epoch": 1.8191458149964372, + "grad_norm": 0.20655397846657847, + "kl": 0.129791259765625, + "learning_rate": 4.966804466466282e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.10101525392383337, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10850 + }, + { + "completion_length": 251.24554347991943, + "epoch": 1.8194811182363049, + "grad_norm": 0.1738275679980491, + "kl": 0.1940460205078125, + "learning_rate": 4.966784980170774e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10852 + }, + { + "completion_length": 247.00447750091553, + "epoch": 1.8198164214761725, + "grad_norm": 0.20985469649492897, + "kl": 1.001861572265625, + "learning_rate": 4.966765488195812e-07, + "loss": 0.001, + "reward": 1.753571480512619, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7803571671247482, + "rewards/format_reward_func": 0.9732142984867096, + "step": 10854 + }, + { + "completion_length": 236.7812623977661, + "epoch": 1.8201517247160401, + "grad_norm": 0.11620140241283157, + "kl": 0.25567626953125, + "learning_rate": 4.966745990541442e-07, + "loss": 0.0003, + "reward": 1.7553572058677673, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7687500305473804, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10856 + }, + { + "completion_length": 253.74555015563965, + "epoch": 1.8204870279559078, + "grad_norm": 0.1833107759586828, + "kl": 0.108062744140625, + "learning_rate": 4.966726487207708e-07, + "loss": 0.0001, + "reward": 1.7071429267525673, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 0.9821428656578064, + "step": 10858 + }, + { + "completion_length": 239.290189743042, + "epoch": 1.8208223311957752, + "grad_norm": 0.23300359377419902, + "kl": 0.562469482421875, + "learning_rate": 4.966706978194655e-07, + "loss": 0.0006, + "reward": 1.7089286297559738, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7133928816765547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10860 + }, + { + "completion_length": 236.2634038925171, + "epoch": 1.8211576344356426, + "grad_norm": 0.18866839307399952, + "kl": 0.4730224609375, + "learning_rate": 4.966687463502327e-07, + "loss": 0.0005, + "reward": 1.7660714760422707, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357298254967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10862 + }, + { + "completion_length": 229.13393783569336, + "epoch": 1.8214929376755102, + "grad_norm": 0.10210684142020995, + "kl": 0.330596923828125, + "learning_rate": 4.966667943130771e-07, + "loss": 0.0003, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 10864 + }, + { + "completion_length": 234.0446538925171, + "epoch": 1.8218282409153779, + "grad_norm": 0.23129147813982034, + "kl": 0.1144256591796875, + "learning_rate": 4.966648417080031e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 10866 + }, + { + "completion_length": 235.9107255935669, + "epoch": 1.8221635441552455, + "grad_norm": 0.1554049848557028, + "kl": 0.1997222900390625, + "learning_rate": 4.96662888535015e-07, + "loss": 0.0002, + "reward": 1.7857143506407738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 10868 + }, + { + "completion_length": 235.03572463989258, + "epoch": 1.822498847395113, + "grad_norm": 0.40407649067600887, + "kl": 0.53131103515625, + "learning_rate": 4.966609347941176e-07, + "loss": 0.0005, + "reward": 1.8035714700818062, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 10870 + }, + { + "completion_length": 233.65179824829102, + "epoch": 1.8228341506349806, + "grad_norm": 0.13115162629134425, + "kl": 0.3810577392578125, + "learning_rate": 4.966589804853153e-07, + "loss": 0.0004, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 10872 + }, + { + "completion_length": 224.61161708831787, + "epoch": 1.823169453874848, + "grad_norm": 0.22174956953423863, + "kl": 0.0928497314453125, + "learning_rate": 4.966570256086126e-07, + "loss": 0.0001, + "reward": 1.7928572222590446, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 10874 + }, + { + "completion_length": 231.61608123779297, + "epoch": 1.8235047571147156, + "grad_norm": 0.2175888324277477, + "kl": 0.1910400390625, + "learning_rate": 4.966550701640139e-07, + "loss": 0.0002, + "reward": 1.7464286610484123, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 10876 + }, + { + "completion_length": 224.2098331451416, + "epoch": 1.8238400603545832, + "grad_norm": 0.13761918610296675, + "kl": 0.294647216796875, + "learning_rate": 4.966531141515237e-07, + "loss": 0.0003, + "reward": 1.7464286163449287, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464286144822836, + "rewards/format_reward_func": 1.0, + "step": 10878 + }, + { + "completion_length": 219.59375858306885, + "epoch": 1.8241753635944509, + "grad_norm": 0.12025355892629688, + "kl": 0.126495361328125, + "learning_rate": 4.966511575711467e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 10880 + }, + { + "completion_length": 228.77233219146729, + "epoch": 1.8245106668343183, + "grad_norm": 0.3027116876913923, + "kl": 0.84716796875, + "learning_rate": 4.966492004228872e-07, + "loss": 0.0008, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 10882 + }, + { + "completion_length": 220.33929538726807, + "epoch": 1.8248459700741857, + "grad_norm": 0.15113626205446923, + "kl": 0.207672119140625, + "learning_rate": 4.966472427067499e-07, + "loss": 0.0002, + "reward": 1.8571428954601288, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8571428768336773, + "rewards/format_reward_func": 1.0, + "step": 10884 + }, + { + "completion_length": 223.70536613464355, + "epoch": 1.8251812733140533, + "grad_norm": 0.30749767893684915, + "kl": 0.09979248046875, + "learning_rate": 4.966452844227391e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 10886 + }, + { + "completion_length": 225.04018878936768, + "epoch": 1.825516576553921, + "grad_norm": 0.2152092830261441, + "kl": 0.13427734375, + "learning_rate": 4.966433255708594e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714522004128, + "rewards/format_reward_func": 1.0, + "step": 10888 + }, + { + "completion_length": 237.9241189956665, + "epoch": 1.8258518797937886, + "grad_norm": 0.11807239100575599, + "kl": 0.154876708984375, + "learning_rate": 4.966413661511154e-07, + "loss": 0.0002, + "reward": 1.7250000461935997, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.725000036880374, + "rewards/format_reward_func": 1.0, + "step": 10890 + }, + { + "completion_length": 228.87501049041748, + "epoch": 1.826187183033656, + "grad_norm": 0.273143116307382, + "kl": 0.3214111328125, + "learning_rate": 4.966394061635115e-07, + "loss": 0.0003, + "reward": 1.7428572103381157, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 10892 + }, + { + "completion_length": 223.47322368621826, + "epoch": 1.8265224862735236, + "grad_norm": 0.1521072199861038, + "kl": 0.11309814453125, + "learning_rate": 4.966374456080522e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 10894 + }, + { + "completion_length": 237.30358123779297, + "epoch": 1.826857789513391, + "grad_norm": 0.6139851648931479, + "kl": 0.197723388671875, + "learning_rate": 4.966354844847421e-07, + "loss": 0.0002, + "reward": 1.778571479022503, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 10896 + }, + { + "completion_length": 238.2946548461914, + "epoch": 1.8271930927532587, + "grad_norm": 0.29583958632372603, + "kl": 0.115325927734375, + "learning_rate": 4.966335227935856e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 10898 + }, + { + "completion_length": 225.09822368621826, + "epoch": 1.8275283959931263, + "grad_norm": 0.24015298347241795, + "kl": 0.225799560546875, + "learning_rate": 4.966315605345873e-07, + "loss": 0.0002, + "reward": 1.850000038743019, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8500000089406967, + "rewards/format_reward_func": 1.0, + "step": 10900 + }, + { + "completion_length": 235.571439743042, + "epoch": 1.827863699232994, + "grad_norm": 0.2932754055945961, + "kl": 0.11669921875, + "learning_rate": 4.966295977077518e-07, + "loss": 0.0001, + "reward": 1.7410715073347092, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10902 + }, + { + "completion_length": 238.17411613464355, + "epoch": 1.8281990024728614, + "grad_norm": 0.26385235317977035, + "kl": 0.1941680908203125, + "learning_rate": 4.966276343130835e-07, + "loss": 0.0002, + "reward": 1.7142858132719994, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 10904 + }, + { + "completion_length": 234.19643783569336, + "epoch": 1.8285343057127288, + "grad_norm": 0.3444873572004797, + "kl": 0.1352691650390625, + "learning_rate": 4.966256703505869e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857176065445, + "rewards/format_reward_func": 1.0, + "step": 10906 + }, + { + "completion_length": 247.98215198516846, + "epoch": 1.8288696089525964, + "grad_norm": 0.2911970609785864, + "kl": 0.11505126953125, + "learning_rate": 4.966237058202665e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857529222965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10908 + }, + { + "completion_length": 239.40179538726807, + "epoch": 1.829204912192464, + "grad_norm": 0.25664083609430993, + "kl": 0.21173095703125, + "learning_rate": 4.96621740722127e-07, + "loss": 0.0002, + "reward": 1.7535714730620384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714562982321, + "rewards/format_reward_func": 1.0, + "step": 10910 + }, + { + "completion_length": 233.74554634094238, + "epoch": 1.8295402154323317, + "grad_norm": 0.2616752508689885, + "kl": 0.0946807861328125, + "learning_rate": 4.966197750561728e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10912 + }, + { + "completion_length": 234.1294755935669, + "epoch": 1.8298755186721993, + "grad_norm": 0.2995033876494516, + "kl": 0.1689453125, + "learning_rate": 4.966178088224084e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 10914 + }, + { + "completion_length": 242.03126525878906, + "epoch": 1.8302108219120667, + "grad_norm": 0.31543069402561186, + "kl": 0.10711669921875, + "learning_rate": 4.966158420208383e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10916 + }, + { + "completion_length": 219.94197273254395, + "epoch": 1.8305461251519342, + "grad_norm": 0.21660990217758797, + "kl": 0.1119232177734375, + "learning_rate": 4.966138746514672e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 10918 + }, + { + "completion_length": 227.89733123779297, + "epoch": 1.8308814283918018, + "grad_norm": 0.16069491284800438, + "kl": 0.1295623779296875, + "learning_rate": 4.966119067142995e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7803571783006191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10920 + }, + { + "completion_length": 238.59376049041748, + "epoch": 1.8312167316316694, + "grad_norm": 0.18957200729810142, + "kl": 0.117218017578125, + "learning_rate": 4.966099382093397e-07, + "loss": 0.0001, + "reward": 1.7482143640518188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10922 + }, + { + "completion_length": 232.71875858306885, + "epoch": 1.831552034871537, + "grad_norm": 0.12322576113924998, + "kl": 0.1805267333984375, + "learning_rate": 4.966079691365925e-07, + "loss": 0.0002, + "reward": 1.7803571969270706, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10924 + }, + { + "completion_length": 233.70983028411865, + "epoch": 1.8318873381114045, + "grad_norm": 0.2518008327942554, + "kl": 0.1314697265625, + "learning_rate": 4.966059994960622e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 10926 + }, + { + "completion_length": 247.36608123779297, + "epoch": 1.832222641351272, + "grad_norm": 0.3195704223914685, + "kl": 0.25115966796875, + "learning_rate": 4.966040292877534e-07, + "loss": 0.0003, + "reward": 1.7607143446803093, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7696428894996643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10928 + }, + { + "completion_length": 232.74554538726807, + "epoch": 1.8325579445911395, + "grad_norm": 0.10482360688190698, + "kl": 0.15802001953125, + "learning_rate": 4.966020585116709e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571734577417, + "rewards/format_reward_func": 1.0, + "step": 10930 + }, + { + "completion_length": 230.0937623977661, + "epoch": 1.8328932478310072, + "grad_norm": 0.3141847682191237, + "kl": 0.16552734375, + "learning_rate": 4.966000871678189e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 10932 + }, + { + "completion_length": 232.38393878936768, + "epoch": 1.8332285510708748, + "grad_norm": 0.16685600890208938, + "kl": 0.35284423828125, + "learning_rate": 4.965981152562021e-07, + "loss": 0.0004, + "reward": 1.7446429058909416, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10934 + }, + { + "completion_length": 240.52679634094238, + "epoch": 1.8335638543107424, + "grad_norm": 0.5557902127047117, + "kl": 0.145477294921875, + "learning_rate": 4.96596142776825e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.763392873108387, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10936 + }, + { + "completion_length": 227.29911708831787, + "epoch": 1.8338991575506098, + "grad_norm": 0.25553049632062863, + "kl": 0.1214599609375, + "learning_rate": 4.965941697296922e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 10938 + }, + { + "completion_length": 244.99554538726807, + "epoch": 1.8342344607904773, + "grad_norm": 0.36957619283406395, + "kl": 0.170867919921875, + "learning_rate": 4.965921961148081e-07, + "loss": 0.0002, + "reward": 1.7339286506175995, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7464285884052515, + "rewards/format_reward_func": 0.9875000044703484, + "step": 10940 + }, + { + "completion_length": 235.54465675354004, + "epoch": 1.8345697640303449, + "grad_norm": 0.5029314215918635, + "kl": 0.261199951171875, + "learning_rate": 4.965902219321773e-07, + "loss": 0.0003, + "reward": 1.8000000715255737, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 10942 + }, + { + "completion_length": 234.7009038925171, + "epoch": 1.8349050672702125, + "grad_norm": 0.19529034614225257, + "kl": 0.1197967529296875, + "learning_rate": 4.965882471818045e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7517857477068901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10944 + }, + { + "completion_length": 234.1696538925171, + "epoch": 1.8352403705100802, + "grad_norm": 0.21069830505407974, + "kl": 0.131500244140625, + "learning_rate": 4.965862718636941e-07, + "loss": 0.0001, + "reward": 1.7125000581145287, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7258928883820772, + "rewards/format_reward_func": 0.9866071492433548, + "step": 10946 + }, + { + "completion_length": 230.1964406967163, + "epoch": 1.8355756737499476, + "grad_norm": 0.33352966239595755, + "kl": 0.177215576171875, + "learning_rate": 4.965842959778505e-07, + "loss": 0.0002, + "reward": 1.7482143342494965, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526786029338837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10948 + }, + { + "completion_length": 233.35715198516846, + "epoch": 1.8359109769898152, + "grad_norm": 0.26718184673680234, + "kl": 0.15020751953125, + "learning_rate": 4.965823195242786e-07, + "loss": 0.0002, + "reward": 1.7732143625617027, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10950 + }, + { + "completion_length": 228.6741189956665, + "epoch": 1.8362462802296826, + "grad_norm": 0.6468786012472365, + "kl": 0.130218505859375, + "learning_rate": 4.965803425029828e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500331550837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10952 + }, + { + "completion_length": 217.52679538726807, + "epoch": 1.8365815834695502, + "grad_norm": 0.20986412344697003, + "kl": 0.132232666015625, + "learning_rate": 4.965783649139675e-07, + "loss": 0.0001, + "reward": 1.787500038743019, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10954 + }, + { + "completion_length": 227.6696538925171, + "epoch": 1.8369168867094179, + "grad_norm": 0.13116619544808694, + "kl": 0.1138916015625, + "learning_rate": 4.965763867572375e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 10956 + }, + { + "completion_length": 218.23661708831787, + "epoch": 1.8372521899492855, + "grad_norm": 0.20983926729483543, + "kl": 0.1107177734375, + "learning_rate": 4.965744080327972e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 10958 + }, + { + "completion_length": 223.20090293884277, + "epoch": 1.837587493189153, + "grad_norm": 0.3545627872806163, + "kl": 0.10614013671875, + "learning_rate": 4.965724287406512e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 10960 + }, + { + "completion_length": 224.86608123779297, + "epoch": 1.8379227964290203, + "grad_norm": 0.2105883357527289, + "kl": 0.1058807373046875, + "learning_rate": 4.96570448880804e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7589286044239998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10962 + }, + { + "completion_length": 229.571439743042, + "epoch": 1.838258099668888, + "grad_norm": 0.22364455871043729, + "kl": 0.12042236328125, + "learning_rate": 4.965684684532603e-07, + "loss": 0.0001, + "reward": 1.7468750923871994, + "reward_std": 0.037249374436214566, + "rewards/equation_reward_func": 0.7517857421189547, + "rewards/format_reward_func": 0.9950892888009548, + "step": 10964 + }, + { + "completion_length": 224.60715293884277, + "epoch": 1.8385934029087556, + "grad_norm": 0.26191034704334365, + "kl": 0.14447021484375, + "learning_rate": 4.965664874580244e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.755357164889574, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10966 + }, + { + "completion_length": 225.0803680419922, + "epoch": 1.8389287061486232, + "grad_norm": 0.3470244049399262, + "kl": 0.118988037109375, + "learning_rate": 4.965645058951011e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 10968 + }, + { + "completion_length": 223.38840293884277, + "epoch": 1.8392640093884907, + "grad_norm": 0.3880943554691539, + "kl": 0.1046142578125, + "learning_rate": 4.965625237644949e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 10970 + }, + { + "completion_length": 221.9375114440918, + "epoch": 1.8395993126283583, + "grad_norm": 0.5691958115794947, + "kl": 0.1212158203125, + "learning_rate": 4.965605410662104e-07, + "loss": 0.0001, + "reward": 1.766071505844593, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357372760773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10972 + }, + { + "completion_length": 222.00893878936768, + "epoch": 1.8399346158682257, + "grad_norm": 0.17615148433837938, + "kl": 0.101470947265625, + "learning_rate": 4.965585578002521e-07, + "loss": 0.0001, + "reward": 1.8000000342726707, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 10974 + }, + { + "completion_length": 242.60268878936768, + "epoch": 1.8402699191080933, + "grad_norm": 0.18370374417663582, + "kl": 0.130126953125, + "learning_rate": 4.965565739666245e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10976 + }, + { + "completion_length": 233.42411613464355, + "epoch": 1.840605222347961, + "grad_norm": 0.3542768180228526, + "kl": 0.112030029296875, + "learning_rate": 4.965545895653324e-07, + "loss": 0.0001, + "reward": 1.7607143744826317, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428783237934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10978 + }, + { + "completion_length": 234.0491180419922, + "epoch": 1.8409405255878286, + "grad_norm": 0.21181866329362126, + "kl": 0.11077880859375, + "learning_rate": 4.965526045963801e-07, + "loss": 0.0001, + "reward": 1.7535715326666832, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7625000197440386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 10980 + }, + { + "completion_length": 235.08483505249023, + "epoch": 1.841275828827696, + "grad_norm": 0.14119896069516588, + "kl": 0.11920166015625, + "learning_rate": 4.965506190597723e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 10982 + }, + { + "completion_length": 228.2991180419922, + "epoch": 1.8416111320675634, + "grad_norm": 0.394193532340823, + "kl": 0.11993408203125, + "learning_rate": 4.965486329555136e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 10984 + }, + { + "completion_length": 229.94643878936768, + "epoch": 1.841946435307431, + "grad_norm": 0.22609019385205742, + "kl": 0.100433349609375, + "learning_rate": 4.965466462836085e-07, + "loss": 0.0001, + "reward": 1.8000000715255737, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 10986 + }, + { + "completion_length": 237.7276906967163, + "epoch": 1.8422817385472987, + "grad_norm": 0.3759695986030544, + "kl": 0.1258544921875, + "learning_rate": 4.965446590440616e-07, + "loss": 0.0001, + "reward": 1.8035715073347092, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 10988 + }, + { + "completion_length": 217.1428680419922, + "epoch": 1.8426170417871663, + "grad_norm": 0.2734768653982204, + "kl": 0.109344482421875, + "learning_rate": 4.965426712368776e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 10990 + }, + { + "completion_length": 228.47322368621826, + "epoch": 1.842952345027034, + "grad_norm": 0.30183089643210215, + "kl": 0.117431640625, + "learning_rate": 4.965406828620607e-07, + "loss": 0.0001, + "reward": 1.730357214808464, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214518278837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 10992 + }, + { + "completion_length": 227.852689743042, + "epoch": 1.8432876482669014, + "grad_norm": 0.16928201748186822, + "kl": 0.099700927734375, + "learning_rate": 4.96538693919616e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 10994 + }, + { + "completion_length": 224.49108123779297, + "epoch": 1.8436229515067688, + "grad_norm": 0.2128443559967921, + "kl": 0.099517822265625, + "learning_rate": 4.965367044095477e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 10996 + }, + { + "completion_length": 232.45090293884277, + "epoch": 1.8439582547466364, + "grad_norm": 0.35638057676443574, + "kl": 0.159698486328125, + "learning_rate": 4.965347143318605e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 10998 + }, + { + "completion_length": 220.62054634094238, + "epoch": 1.844293557986504, + "grad_norm": 0.2383076241088169, + "kl": 0.09564208984375, + "learning_rate": 4.96532723686559e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464285958558321, + "rewards/format_reward_func": 1.0, + "step": 11000 + }, + { + "completion_length": 238.58036994934082, + "epoch": 1.8446288612263717, + "grad_norm": 0.3791546490491425, + "kl": 0.20074462890625, + "learning_rate": 4.965307324736477e-07, + "loss": 0.0002, + "reward": 1.8285714909434319, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8285714481025934, + "rewards/format_reward_func": 1.0, + "step": 11002 + }, + { + "completion_length": 223.78126049041748, + "epoch": 1.8449641644662391, + "grad_norm": 0.16352363771720657, + "kl": 0.114044189453125, + "learning_rate": 4.965287406931313e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 11004 + }, + { + "completion_length": 227.56251049041748, + "epoch": 1.8452994677061068, + "grad_norm": 0.3131439397412478, + "kl": 0.1202392578125, + "learning_rate": 4.965267483450144e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.08081220090389252, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 11006 + }, + { + "completion_length": 234.16518878936768, + "epoch": 1.8456347709459742, + "grad_norm": 0.20428976348291153, + "kl": 0.10565185546875, + "learning_rate": 4.965247554293014e-07, + "loss": 0.0001, + "reward": 1.8607143238186836, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8607143051922321, + "rewards/format_reward_func": 1.0, + "step": 11008 + }, + { + "completion_length": 222.62054538726807, + "epoch": 1.8459700741858418, + "grad_norm": 0.27048745939962315, + "kl": 0.1072845458984375, + "learning_rate": 4.965227619459971e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 11010 + }, + { + "completion_length": 221.56251049041748, + "epoch": 1.8463053774257094, + "grad_norm": 0.255154030721768, + "kl": 0.090850830078125, + "learning_rate": 4.96520767895106e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964285835623741, + "rewards/format_reward_func": 1.0, + "step": 11012 + }, + { + "completion_length": 215.44197368621826, + "epoch": 1.846640680665577, + "grad_norm": 0.2318606194787304, + "kl": 0.101898193359375, + "learning_rate": 4.965187732766326e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 11014 + }, + { + "completion_length": 221.33929538726807, + "epoch": 1.8469759839054445, + "grad_norm": 0.14313873384667694, + "kl": 0.1305389404296875, + "learning_rate": 4.965167780905817e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 11016 + }, + { + "completion_length": 231.19197845458984, + "epoch": 1.847311287145312, + "grad_norm": 0.22620464844274196, + "kl": 0.10772705078125, + "learning_rate": 4.965147823369576e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 11018 + }, + { + "completion_length": 225.977689743042, + "epoch": 1.8476465903851795, + "grad_norm": 0.19280302891875264, + "kl": 0.0964508056640625, + "learning_rate": 4.965127860157652e-07, + "loss": 0.0001, + "reward": 1.76517865806818, + "reward_std": 0.059346460737288, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9982142895460129, + "step": 11020 + }, + { + "completion_length": 232.98215198516846, + "epoch": 1.8479818936250472, + "grad_norm": 0.2722219674141182, + "kl": 0.118072509765625, + "learning_rate": 4.96510789127009e-07, + "loss": 0.0001, + "reward": 1.6964286640286446, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.6964286100119352, + "rewards/format_reward_func": 1.0, + "step": 11022 + }, + { + "completion_length": 225.9107255935669, + "epoch": 1.8483171968649148, + "grad_norm": 0.25389911395651005, + "kl": 0.100128173828125, + "learning_rate": 4.965087916706934e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 11024 + }, + { + "completion_length": 221.84822368621826, + "epoch": 1.8486525001047822, + "grad_norm": 0.21637961265921582, + "kl": 0.102935791015625, + "learning_rate": 4.965067936468234e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 11026 + }, + { + "completion_length": 230.5803680419922, + "epoch": 1.8489878033446498, + "grad_norm": 0.18228930297643892, + "kl": 0.1324462890625, + "learning_rate": 4.965047950554032e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 11028 + }, + { + "completion_length": 223.92858123779297, + "epoch": 1.8493231065845173, + "grad_norm": 0.23163988946281985, + "kl": 0.0888214111328125, + "learning_rate": 4.965027958964376e-07, + "loss": 0.0001, + "reward": 1.703571505844593, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714611411095, + "rewards/format_reward_func": 1.0, + "step": 11030 + }, + { + "completion_length": 234.64733219146729, + "epoch": 1.849658409824385, + "grad_norm": 0.15882551249432497, + "kl": 0.11004638671875, + "learning_rate": 4.965007961699312e-07, + "loss": 0.0001, + "reward": 1.787500038743019, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11032 + }, + { + "completion_length": 234.52679634094238, + "epoch": 1.8499937130642525, + "grad_norm": 0.16316429098513455, + "kl": 0.10003662109375, + "learning_rate": 4.964987958758885e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 11034 + }, + { + "completion_length": 235.0267972946167, + "epoch": 1.8503290163041202, + "grad_norm": 0.1258603120735231, + "kl": 0.24822998046875, + "learning_rate": 4.964967950143143e-07, + "loss": 0.0002, + "reward": 1.733928620815277, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11036 + }, + { + "completion_length": 227.22768688201904, + "epoch": 1.8506643195439876, + "grad_norm": 0.28442494129277424, + "kl": 0.114990234375, + "learning_rate": 4.964947935852129e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.7589286081492901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11038 + }, + { + "completion_length": 229.85268878936768, + "epoch": 1.850999622783855, + "grad_norm": 0.3542580625263232, + "kl": 0.116607666015625, + "learning_rate": 4.964927915885893e-07, + "loss": 0.0001, + "reward": 1.703571505844593, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7035714723169804, + "rewards/format_reward_func": 1.0, + "step": 11040 + }, + { + "completion_length": 231.94643878936768, + "epoch": 1.8513349260237226, + "grad_norm": 0.2923367114733701, + "kl": 0.1073455810546875, + "learning_rate": 4.964907890244478e-07, + "loss": 0.0001, + "reward": 1.7946429252624512, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11042 + }, + { + "completion_length": 225.80804252624512, + "epoch": 1.8516702292635903, + "grad_norm": 0.2327532429020024, + "kl": 0.1014404296875, + "learning_rate": 4.964887858927931e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7839286103844643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11044 + }, + { + "completion_length": 228.22768783569336, + "epoch": 1.852005532503458, + "grad_norm": 0.277353104211083, + "kl": 0.1070098876953125, + "learning_rate": 4.964867821936298e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 11046 + }, + { + "completion_length": 226.15179634094238, + "epoch": 1.8523408357433255, + "grad_norm": 0.23078546163027888, + "kl": 0.116180419921875, + "learning_rate": 4.964847779269625e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 11048 + }, + { + "completion_length": 227.08036708831787, + "epoch": 1.852676138983193, + "grad_norm": 0.2231158849058702, + "kl": 0.099700927734375, + "learning_rate": 4.96482773092796e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 11050 + }, + { + "completion_length": 227.75893783569336, + "epoch": 1.8530114422230604, + "grad_norm": 0.140503338610388, + "kl": 0.134063720703125, + "learning_rate": 4.964807676911347e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7000000327825546, + "rewards/format_reward_func": 1.0, + "step": 11052 + }, + { + "completion_length": 226.5759038925171, + "epoch": 1.853346745462928, + "grad_norm": 0.2436791254328514, + "kl": 0.125732421875, + "learning_rate": 4.964787617219832e-07, + "loss": 0.0001, + "reward": 1.8214286044239998, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 11054 + }, + { + "completion_length": 224.81697368621826, + "epoch": 1.8536820487027956, + "grad_norm": 0.2266395282483944, + "kl": 0.10595703125, + "learning_rate": 4.964767551853463e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11056 + }, + { + "completion_length": 218.30804634094238, + "epoch": 1.8540173519426633, + "grad_norm": 0.12301408490279202, + "kl": 0.106903076171875, + "learning_rate": 4.964747480812285e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 11058 + }, + { + "completion_length": 225.54911613464355, + "epoch": 1.8543526551825307, + "grad_norm": 0.2668418387078583, + "kl": 0.118621826171875, + "learning_rate": 4.964727404096344e-07, + "loss": 0.0001, + "reward": 1.6875000819563866, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.6919643320143223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11060 + }, + { + "completion_length": 220.75893592834473, + "epoch": 1.8546879584223983, + "grad_norm": 0.0035520221745497823, + "kl": 0.1208953857421875, + "learning_rate": 4.964707321705687e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 11062 + }, + { + "completion_length": 237.15179443359375, + "epoch": 1.8550232616622657, + "grad_norm": 0.36594927961505463, + "kl": 0.11773681640625, + "learning_rate": 4.96468723364036e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 11064 + }, + { + "completion_length": 222.36608028411865, + "epoch": 1.8553585649021334, + "grad_norm": 0.30599537635174073, + "kl": 0.106658935546875, + "learning_rate": 4.964667139900409e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 11066 + }, + { + "completion_length": 223.11161708831787, + "epoch": 1.855693868142001, + "grad_norm": 0.47049750051047357, + "kl": 0.12823486328125, + "learning_rate": 4.96464704048588e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 11068 + }, + { + "completion_length": 215.1741180419922, + "epoch": 1.8560291713818686, + "grad_norm": 0.2115054002297252, + "kl": 0.0959320068359375, + "learning_rate": 4.964626935396821e-07, + "loss": 0.0001, + "reward": 1.7892857491970062, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857473343611, + "rewards/format_reward_func": 1.0, + "step": 11070 + }, + { + "completion_length": 217.03572273254395, + "epoch": 1.856364474621736, + "grad_norm": 0.18145256750998845, + "kl": 0.108856201171875, + "learning_rate": 4.964606824633276e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 11072 + }, + { + "completion_length": 221.37500953674316, + "epoch": 1.8566997778616035, + "grad_norm": 0.20762713551014986, + "kl": 0.108001708984375, + "learning_rate": 4.964586708195292e-07, + "loss": 0.0001, + "reward": 1.8285714760422707, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8285714536905289, + "rewards/format_reward_func": 1.0, + "step": 11074 + }, + { + "completion_length": 228.83483219146729, + "epoch": 1.857035081101471, + "grad_norm": 0.18659784904839863, + "kl": 0.14788818359375, + "learning_rate": 4.964566586082916e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 11076 + }, + { + "completion_length": 222.52233028411865, + "epoch": 1.8573703843413387, + "grad_norm": 0.19205268624150945, + "kl": 0.10675048828125, + "learning_rate": 4.964546458296194e-07, + "loss": 0.0001, + "reward": 1.8214286416769028, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 11078 + }, + { + "completion_length": 221.46429634094238, + "epoch": 1.8577056875812064, + "grad_norm": 0.2383659915228919, + "kl": 0.12945556640625, + "learning_rate": 4.964526324835172e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7321428935974836, + "rewards/format_reward_func": 1.0, + "step": 11080 + }, + { + "completion_length": 211.19643878936768, + "epoch": 1.8580409908210738, + "grad_norm": 0.11302266977949367, + "kl": 0.1028900146484375, + "learning_rate": 4.964506185699897e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.817857176065445, + "rewards/format_reward_func": 1.0, + "step": 11082 + }, + { + "completion_length": 223.49554538726807, + "epoch": 1.8583762940609414, + "grad_norm": 0.23475124330163089, + "kl": 0.14801025390625, + "learning_rate": 4.964486040890415e-07, + "loss": 0.0001, + "reward": 1.710714377462864, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143215835094, + "rewards/format_reward_func": 1.0, + "step": 11084 + }, + { + "completion_length": 219.42411708831787, + "epoch": 1.8587115973008088, + "grad_norm": 0.32645925916745405, + "kl": 0.200714111328125, + "learning_rate": 4.964465890406773e-07, + "loss": 0.0002, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 11086 + }, + { + "completion_length": 223.35268878936768, + "epoch": 1.8590469005406765, + "grad_norm": 0.2379439074253615, + "kl": 0.1185760498046875, + "learning_rate": 4.964445734249015e-07, + "loss": 0.0001, + "reward": 1.78035718947649, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11088 + }, + { + "completion_length": 218.3526906967163, + "epoch": 1.859382203780544, + "grad_norm": 0.490915657263521, + "kl": 0.137847900390625, + "learning_rate": 4.96442557241719e-07, + "loss": 0.0001, + "reward": 1.721428669989109, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 11090 + }, + { + "completion_length": 219.01786518096924, + "epoch": 1.8597175070204117, + "grad_norm": 0.10672365885385886, + "kl": 0.186767578125, + "learning_rate": 4.964405404911344e-07, + "loss": 0.0002, + "reward": 1.7196429297327995, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071797907352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11092 + }, + { + "completion_length": 216.41072368621826, + "epoch": 1.8600528102602791, + "grad_norm": 0.3824340210957931, + "kl": 0.21832275390625, + "learning_rate": 4.964385231731523e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 11094 + }, + { + "completion_length": 223.16072463989258, + "epoch": 1.8603881135001465, + "grad_norm": 0.23764974494911736, + "kl": 0.187652587890625, + "learning_rate": 4.964365052877773e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 11096 + }, + { + "completion_length": 236.05358409881592, + "epoch": 1.8607234167400142, + "grad_norm": 0.2853067876926874, + "kl": 0.215728759765625, + "learning_rate": 4.96434486835014e-07, + "loss": 0.0002, + "reward": 1.7089286744594574, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.713392898440361, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11098 + }, + { + "completion_length": 223.25001049041748, + "epoch": 1.8610587199798818, + "grad_norm": 0.3091993117408902, + "kl": 0.2955322265625, + "learning_rate": 4.964324678148674e-07, + "loss": 0.0003, + "reward": 1.725000075995922, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 11100 + }, + { + "completion_length": 215.76340293884277, + "epoch": 1.8613940232197494, + "grad_norm": 0.2468966823450781, + "kl": 0.138336181640625, + "learning_rate": 4.964304482273417e-07, + "loss": 0.0001, + "reward": 1.8142857626080513, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857290804386, + "rewards/format_reward_func": 1.0, + "step": 11102 + }, + { + "completion_length": 222.14286613464355, + "epoch": 1.8617293264596169, + "grad_norm": 0.14444154096411288, + "kl": 0.2125244140625, + "learning_rate": 4.964284280724418e-07, + "loss": 0.0002, + "reward": 1.7392857745289803, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 11104 + }, + { + "completion_length": 212.61608028411865, + "epoch": 1.8620646296994845, + "grad_norm": 0.2764355620324343, + "kl": 0.18096923828125, + "learning_rate": 4.964264073501723e-07, + "loss": 0.0002, + "reward": 1.73214291036129, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 11106 + }, + { + "completion_length": 226.53572368621826, + "epoch": 1.862399932939352, + "grad_norm": 0.13167498736808136, + "kl": 0.2672882080078125, + "learning_rate": 4.964243860605378e-07, + "loss": 0.0003, + "reward": 1.8000000715255737, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 11108 + }, + { + "completion_length": 218.80804538726807, + "epoch": 1.8627352361792195, + "grad_norm": 0.45637270918773215, + "kl": 0.367919921875, + "learning_rate": 4.96422364203543e-07, + "loss": 0.0004, + "reward": 1.8285714760422707, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 11110 + }, + { + "completion_length": 221.94643783569336, + "epoch": 1.8630705394190872, + "grad_norm": 0.24640437904082058, + "kl": 0.381561279296875, + "learning_rate": 4.964203417791926e-07, + "loss": 0.0004, + "reward": 1.7607143595814705, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7687500230967999, + "rewards/format_reward_func": 0.9919642955064774, + "step": 11112 + }, + { + "completion_length": 222.49554538726807, + "epoch": 1.8634058426589548, + "grad_norm": 0.08996637829383648, + "kl": 0.1153564453125, + "learning_rate": 4.964183187874912e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 11114 + }, + { + "completion_length": 210.2009038925171, + "epoch": 1.8637411458988222, + "grad_norm": 0.30682680867562007, + "kl": 0.525390625, + "learning_rate": 4.964162952284435e-07, + "loss": 0.0005, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 11116 + }, + { + "completion_length": 219.20983123779297, + "epoch": 1.8640764491386896, + "grad_norm": 0.21939131000569564, + "kl": 0.161834716796875, + "learning_rate": 4.964142711020539e-07, + "loss": 0.0002, + "reward": 1.7696429342031479, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11118 + }, + { + "completion_length": 220.9196548461914, + "epoch": 1.8644117523785573, + "grad_norm": 0.24876917172333088, + "kl": 0.163299560546875, + "learning_rate": 4.964122464083275e-07, + "loss": 0.0002, + "reward": 1.7357143610715866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 11120 + }, + { + "completion_length": 216.77233123779297, + "epoch": 1.864747055618425, + "grad_norm": 0.23915510555329103, + "kl": 0.4630126953125, + "learning_rate": 4.964102211472687e-07, + "loss": 0.0005, + "reward": 1.725000075995922, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 11122 + }, + { + "completion_length": 221.33929538726807, + "epoch": 1.8650823588582925, + "grad_norm": 0.24977953166575798, + "kl": 0.19873046875, + "learning_rate": 4.964081953188822e-07, + "loss": 0.0002, + "reward": 1.8071429058909416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 1.0, + "step": 11124 + }, + { + "completion_length": 226.56250762939453, + "epoch": 1.8654176620981602, + "grad_norm": 0.4095663196737921, + "kl": 0.487213134765625, + "learning_rate": 4.964061689231727e-07, + "loss": 0.0005, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000171363354, + "rewards/format_reward_func": 1.0, + "step": 11126 + }, + { + "completion_length": 228.93750858306885, + "epoch": 1.8657529653380276, + "grad_norm": 0.30583252427411733, + "kl": 0.337738037109375, + "learning_rate": 4.964041419601448e-07, + "loss": 0.0003, + "reward": 1.7250000685453415, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000238418579, + "rewards/format_reward_func": 1.0, + "step": 11128 + }, + { + "completion_length": 224.008939743042, + "epoch": 1.866088268577895, + "grad_norm": 0.13443705049530696, + "kl": 0.214447021484375, + "learning_rate": 4.964021144298032e-07, + "loss": 0.0002, + "reward": 1.7375000640749931, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11130 + }, + { + "completion_length": 229.95536708831787, + "epoch": 1.8664235718177626, + "grad_norm": 0.6325187220459428, + "kl": 1.204498291015625, + "learning_rate": 4.964000863321526e-07, + "loss": 0.0012, + "reward": 1.7160715088248253, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7205357551574707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11132 + }, + { + "completion_length": 231.96429634094238, + "epoch": 1.8667588750576303, + "grad_norm": 0.11808471962014658, + "kl": 0.1728515625, + "learning_rate": 4.963980576671977e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 11134 + }, + { + "completion_length": 233.20090293884277, + "epoch": 1.867094178297498, + "grad_norm": 0.17947428004964994, + "kl": 0.18927001953125, + "learning_rate": 4.96396028434943e-07, + "loss": 0.0002, + "reward": 1.7571429014205933, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 11136 + }, + { + "completion_length": 234.05804920196533, + "epoch": 1.8674294815373653, + "grad_norm": 0.23106300919668743, + "kl": 0.202972412109375, + "learning_rate": 4.963939986353934e-07, + "loss": 0.0002, + "reward": 1.765178643167019, + "reward_std": 0.07954951096326113, + "rewards/equation_reward_func": 0.7669643089175224, + "rewards/format_reward_func": 0.9982142895460129, + "step": 11138 + }, + { + "completion_length": 228.23661708831787, + "epoch": 1.867764784777233, + "grad_norm": 0.2550263269785493, + "kl": 0.206146240234375, + "learning_rate": 4.963919682685533e-07, + "loss": 0.0002, + "reward": 1.8214286416769028, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.821428582072258, + "rewards/format_reward_func": 1.0, + "step": 11140 + }, + { + "completion_length": 228.85715198516846, + "epoch": 1.8681000880171004, + "grad_norm": 0.17487852501349144, + "kl": 0.12628173828125, + "learning_rate": 4.963899373344276e-07, + "loss": 0.0001, + "reward": 1.8607143312692642, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8607143051922321, + "rewards/format_reward_func": 1.0, + "step": 11142 + }, + { + "completion_length": 234.2991180419922, + "epoch": 1.868435391256968, + "grad_norm": 0.1105321170059028, + "kl": 0.12158203125, + "learning_rate": 4.963879058330209e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571660071611, + "rewards/format_reward_func": 1.0, + "step": 11144 + }, + { + "completion_length": 234.05804538726807, + "epoch": 1.8687706944968356, + "grad_norm": 0.14043168729058803, + "kl": 0.199127197265625, + "learning_rate": 4.963858737643379e-07, + "loss": 0.0002, + "reward": 1.7964286357164383, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 11146 + }, + { + "completion_length": 227.93750762939453, + "epoch": 1.8691059977367033, + "grad_norm": 0.23978752547022278, + "kl": 0.2164306640625, + "learning_rate": 4.963838411283834e-07, + "loss": 0.0002, + "reward": 1.7035715207457542, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7035714685916901, + "rewards/format_reward_func": 1.0, + "step": 11148 + }, + { + "completion_length": 239.415189743042, + "epoch": 1.8694413009765707, + "grad_norm": 0.17226054792818743, + "kl": 0.137786865234375, + "learning_rate": 4.963818079251618e-07, + "loss": 0.0001, + "reward": 1.746428668498993, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 11150 + }, + { + "completion_length": 234.7366189956665, + "epoch": 1.869776604216438, + "grad_norm": 0.563115423768281, + "kl": 0.161102294921875, + "learning_rate": 4.963797741546779e-07, + "loss": 0.0002, + "reward": 1.7535715028643608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.753571443259716, + "rewards/format_reward_func": 1.0, + "step": 11152 + }, + { + "completion_length": 231.68750762939453, + "epoch": 1.8701119074563057, + "grad_norm": 0.13526243051831918, + "kl": 0.117584228515625, + "learning_rate": 4.963777398169365e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571902215481, + "rewards/format_reward_func": 1.0, + "step": 11154 + }, + { + "completion_length": 230.14286708831787, + "epoch": 1.8704472106961734, + "grad_norm": 0.39458122341108914, + "kl": 0.1586151123046875, + "learning_rate": 4.963757049119421e-07, + "loss": 0.0002, + "reward": 1.7982143387198448, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.802678607404232, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11156 + }, + { + "completion_length": 232.852689743042, + "epoch": 1.870782513936041, + "grad_norm": 0.2544117423378524, + "kl": 0.13641357421875, + "learning_rate": 4.963736694396996e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 11158 + }, + { + "completion_length": 230.69643878936768, + "epoch": 1.8711178171759084, + "grad_norm": 0.2988204347526544, + "kl": 0.113555908203125, + "learning_rate": 4.963716334002135e-07, + "loss": 0.0001, + "reward": 1.7589286416769028, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11160 + }, + { + "completion_length": 244.852689743042, + "epoch": 1.871453120415776, + "grad_norm": 0.2732977459117658, + "kl": 0.110443115234375, + "learning_rate": 4.963695967934886e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714525729418, + "rewards/format_reward_func": 1.0, + "step": 11162 + }, + { + "completion_length": 231.44643783569336, + "epoch": 1.8717884236556435, + "grad_norm": 0.20912566072607522, + "kl": 0.106414794921875, + "learning_rate": 4.963675596195295e-07, + "loss": 0.0001, + "reward": 1.7089286521077156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11164 + }, + { + "completion_length": 234.3437614440918, + "epoch": 1.872123726895511, + "grad_norm": 0.4023109968185387, + "kl": 0.12664794921875, + "learning_rate": 4.963655218783409e-07, + "loss": 0.0001, + "reward": 1.7339286282658577, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11166 + }, + { + "completion_length": 235.37054824829102, + "epoch": 1.8724590301353787, + "grad_norm": 0.2784812271855098, + "kl": 0.12969970703125, + "learning_rate": 4.963634835699275e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11168 + }, + { + "completion_length": 235.15179538726807, + "epoch": 1.8727943333752464, + "grad_norm": 0.17483775044451044, + "kl": 0.113037109375, + "learning_rate": 4.963614446942941e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 11170 + }, + { + "completion_length": 233.7410831451416, + "epoch": 1.8731296366151138, + "grad_norm": 0.12216847250099579, + "kl": 0.0994415283203125, + "learning_rate": 4.963594052514453e-07, + "loss": 0.0001, + "reward": 1.8071429282426834, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 11172 + }, + { + "completion_length": 230.18304634094238, + "epoch": 1.8734649398549812, + "grad_norm": 0.4168596205302882, + "kl": 0.11090087890625, + "learning_rate": 4.963573652413858e-07, + "loss": 0.0001, + "reward": 1.766071505844593, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11174 + }, + { + "completion_length": 229.29911613464355, + "epoch": 1.8738002430948488, + "grad_norm": 0.2791899452311946, + "kl": 0.118316650390625, + "learning_rate": 4.963553246641203e-07, + "loss": 0.0001, + "reward": 1.8357143625617027, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8357143104076385, + "rewards/format_reward_func": 1.0, + "step": 11176 + }, + { + "completion_length": 231.80358028411865, + "epoch": 1.8741355463347165, + "grad_norm": 0.16582448671376948, + "kl": 0.12030029296875, + "learning_rate": 4.963532835196534e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.807142898440361, + "rewards/format_reward_func": 1.0, + "step": 11178 + }, + { + "completion_length": 234.4509048461914, + "epoch": 1.874470849574584, + "grad_norm": 0.2595491378811056, + "kl": 0.122222900390625, + "learning_rate": 4.9635124180799e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857231199741, + "rewards/format_reward_func": 1.0, + "step": 11180 + }, + { + "completion_length": 240.40179634094238, + "epoch": 1.8748061528144517, + "grad_norm": 0.14194016819276536, + "kl": 0.11871337890625, + "learning_rate": 4.963491995291347e-07, + "loss": 0.0001, + "reward": 1.7875000461935997, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.791964303702116, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11182 + }, + { + "completion_length": 232.55804538726807, + "epoch": 1.8751414560543191, + "grad_norm": 0.12892934407277282, + "kl": 0.11798095703125, + "learning_rate": 4.963471566830922e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214745521545, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11184 + }, + { + "completion_length": 235.81251049041748, + "epoch": 1.8754767592941866, + "grad_norm": 0.25378944592714786, + "kl": 0.11944580078125, + "learning_rate": 4.963451132698672e-07, + "loss": 0.0001, + "reward": 1.8357143253087997, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8357142992317677, + "rewards/format_reward_func": 1.0, + "step": 11186 + }, + { + "completion_length": 250.5669755935669, + "epoch": 1.8758120625340542, + "grad_norm": 0.615027955265174, + "kl": 0.1882781982421875, + "learning_rate": 4.963430692894644e-07, + "loss": 0.0002, + "reward": 1.7375000938773155, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643066823483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11188 + }, + { + "completion_length": 244.79465293884277, + "epoch": 1.8761473657739218, + "grad_norm": 0.2411759958789673, + "kl": 0.1248626708984375, + "learning_rate": 4.963410247418886e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000417232513, + "rewards/format_reward_func": 1.0, + "step": 11190 + }, + { + "completion_length": 244.06250953674316, + "epoch": 1.8764826690137895, + "grad_norm": 0.3607120042957956, + "kl": 0.274566650390625, + "learning_rate": 4.963389796271443e-07, + "loss": 0.0003, + "reward": 1.6946429535746574, + "reward_std": 0.09848987031728029, + "rewards/equation_reward_func": 0.6991071961820126, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11192 + }, + { + "completion_length": 241.02680015563965, + "epoch": 1.8768179722536569, + "grad_norm": 0.179436973418576, + "kl": 0.150360107421875, + "learning_rate": 4.963369339452363e-07, + "loss": 0.0002, + "reward": 1.773214340209961, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776785902678967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11194 + }, + { + "completion_length": 249.25447463989258, + "epoch": 1.8771532754935245, + "grad_norm": 0.22366962597111079, + "kl": 0.215087890625, + "learning_rate": 4.963348876961695e-07, + "loss": 0.0002, + "reward": 1.7660714909434319, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11196 + }, + { + "completion_length": 239.75893783569336, + "epoch": 1.877488578733392, + "grad_norm": 0.5282950834770903, + "kl": 0.2510986328125, + "learning_rate": 4.963328408799484e-07, + "loss": 0.0003, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 11198 + }, + { + "completion_length": 244.62500858306885, + "epoch": 1.8778238819732596, + "grad_norm": 0.17760628742076728, + "kl": 0.205841064453125, + "learning_rate": 4.963307934965777e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7982143275439739, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11200 + }, + { + "completion_length": 248.48215579986572, + "epoch": 1.8781591852131272, + "grad_norm": 0.19333171248821832, + "kl": 0.146392822265625, + "learning_rate": 4.963287455460622e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 11202 + }, + { + "completion_length": 246.0000123977661, + "epoch": 1.8784944884529948, + "grad_norm": 0.26767903257350883, + "kl": 0.162445068359375, + "learning_rate": 4.963266970284067e-07, + "loss": 0.0002, + "reward": 1.7267857789993286, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.731250025331974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11204 + }, + { + "completion_length": 249.42858219146729, + "epoch": 1.8788297916928622, + "grad_norm": 0.15949540283927346, + "kl": 0.176422119140625, + "learning_rate": 4.963246479436157e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 11206 + }, + { + "completion_length": 235.9687614440918, + "epoch": 1.8791650949327297, + "grad_norm": 0.22833940214050732, + "kl": 0.18597412109375, + "learning_rate": 4.96322598291694e-07, + "loss": 0.0002, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 11208 + }, + { + "completion_length": 252.52233695983887, + "epoch": 1.8795003981725973, + "grad_norm": 0.16628686527578662, + "kl": 0.2346954345703125, + "learning_rate": 4.963205480726465e-07, + "loss": 0.0002, + "reward": 1.7964285984635353, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 11210 + }, + { + "completion_length": 240.1741180419922, + "epoch": 1.879835701412465, + "grad_norm": 0.3176503045231156, + "kl": 0.28228759765625, + "learning_rate": 4.963184972864776e-07, + "loss": 0.0003, + "reward": 1.7607143819332123, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 11212 + }, + { + "completion_length": 243.00001525878906, + "epoch": 1.8801710046523326, + "grad_norm": 0.17669946946339002, + "kl": 0.12799072265625, + "learning_rate": 4.963164459331924e-07, + "loss": 0.0001, + "reward": 1.7875000312924385, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7919643241912127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11214 + }, + { + "completion_length": 241.74108219146729, + "epoch": 1.8805063078922, + "grad_norm": 0.11810350754256321, + "kl": 0.107452392578125, + "learning_rate": 4.963143940127953e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 11216 + }, + { + "completion_length": 239.05804443359375, + "epoch": 1.8808416111320676, + "grad_norm": 0.24622105000391542, + "kl": 0.14434814453125, + "learning_rate": 4.963123415252911e-07, + "loss": 0.0001, + "reward": 1.730357214808464, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11218 + }, + { + "completion_length": 247.7544765472412, + "epoch": 1.881176914371935, + "grad_norm": 0.20972882425954445, + "kl": 0.20135498046875, + "learning_rate": 4.963102884706845e-07, + "loss": 0.0002, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 11220 + }, + { + "completion_length": 232.7142972946167, + "epoch": 1.8815122176118027, + "grad_norm": 0.2750278904808387, + "kl": 0.192230224609375, + "learning_rate": 4.963082348489804e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 11222 + }, + { + "completion_length": 242.1250123977661, + "epoch": 1.8818475208516703, + "grad_norm": 0.3315015096714337, + "kl": 0.336578369140625, + "learning_rate": 4.963061806601835e-07, + "loss": 0.0003, + "reward": 1.7642857730388641, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 11224 + }, + { + "completion_length": 234.46429443359375, + "epoch": 1.882182824091538, + "grad_norm": 0.09096484363912223, + "kl": 0.1370697021484375, + "learning_rate": 4.963041259042984e-07, + "loss": 0.0001, + "reward": 1.7589286491274834, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11226 + }, + { + "completion_length": 230.73215198516846, + "epoch": 1.8825181273314053, + "grad_norm": 0.16681827582170852, + "kl": 0.13641357421875, + "learning_rate": 4.963020705813297e-07, + "loss": 0.0001, + "reward": 1.835714340209961, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8357142992317677, + "rewards/format_reward_func": 1.0, + "step": 11228 + }, + { + "completion_length": 234.8303680419922, + "epoch": 1.8828534305712727, + "grad_norm": 0.37642315561234024, + "kl": 0.120574951171875, + "learning_rate": 4.963000146912825e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 11230 + }, + { + "completion_length": 240.70983219146729, + "epoch": 1.8831887338111404, + "grad_norm": 0.23413257547900723, + "kl": 0.123199462890625, + "learning_rate": 4.962979582341613e-07, + "loss": 0.0001, + "reward": 1.8071429282426834, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 1.0, + "step": 11232 + }, + { + "completion_length": 236.7098331451416, + "epoch": 1.883524037051008, + "grad_norm": 0.2838174612146134, + "kl": 0.137664794921875, + "learning_rate": 4.962959012099709e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 11234 + }, + { + "completion_length": 237.9821548461914, + "epoch": 1.8838593402908757, + "grad_norm": 0.3923008916019813, + "kl": 0.157470703125, + "learning_rate": 4.96293843618716e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 11236 + }, + { + "completion_length": 238.98215293884277, + "epoch": 1.884194643530743, + "grad_norm": 0.2385679361089355, + "kl": 0.1591796875, + "learning_rate": 4.962917854604013e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 11238 + }, + { + "completion_length": 235.0759048461914, + "epoch": 1.8845299467706107, + "grad_norm": 0.14434117277451863, + "kl": 0.12469482421875, + "learning_rate": 4.962897267350316e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143256813288, + "rewards/format_reward_func": 1.0, + "step": 11240 + }, + { + "completion_length": 223.95090293884277, + "epoch": 1.884865250010478, + "grad_norm": 0.03234523914231118, + "kl": 0.12738037109375, + "learning_rate": 4.962876674426116e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143267989159, + "rewards/format_reward_func": 1.0, + "step": 11242 + }, + { + "completion_length": 244.4196538925171, + "epoch": 1.8852005532503457, + "grad_norm": 0.3593552515735288, + "kl": 0.238555908203125, + "learning_rate": 4.962856075831462e-07, + "loss": 0.0002, + "reward": 1.6785715222358704, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6785714607685804, + "rewards/format_reward_func": 1.0, + "step": 11244 + }, + { + "completion_length": 246.20537090301514, + "epoch": 1.8855358564902134, + "grad_norm": 0.37697301198383576, + "kl": 0.193267822265625, + "learning_rate": 4.962835471566399e-07, + "loss": 0.0002, + "reward": 1.7839286252856255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928716182709, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11246 + }, + { + "completion_length": 248.55804824829102, + "epoch": 1.885871159730081, + "grad_norm": 0.23996368764434198, + "kl": 0.1488037109375, + "learning_rate": 4.962814861630977e-07, + "loss": 0.0001, + "reward": 1.7375000938773155, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7419643066823483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11248 + }, + { + "completion_length": 253.0000114440918, + "epoch": 1.8862064629699484, + "grad_norm": 0.18681771723186258, + "kl": 0.12933349609375, + "learning_rate": 4.96279424602524e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11250 + }, + { + "completion_length": 238.5446548461914, + "epoch": 1.8865417662098158, + "grad_norm": 0.10783026267363756, + "kl": 0.18572998046875, + "learning_rate": 4.962773624749239e-07, + "loss": 0.0002, + "reward": 1.796428620815277, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 11252 + }, + { + "completion_length": 229.48215293884277, + "epoch": 1.8868770694496835, + "grad_norm": 0.19649646385163824, + "kl": 0.105621337890625, + "learning_rate": 4.96275299780302e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 11254 + }, + { + "completion_length": 248.9866189956665, + "epoch": 1.887212372689551, + "grad_norm": 0.28251356223769303, + "kl": 0.43621826171875, + "learning_rate": 4.96273236518663e-07, + "loss": 0.0004, + "reward": 1.7767857685685158, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.781250037252903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11256 + }, + { + "completion_length": 229.65179634094238, + "epoch": 1.8875476759294187, + "grad_norm": 0.09993092682238044, + "kl": 0.1275787353515625, + "learning_rate": 4.962711726900117e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 11258 + }, + { + "completion_length": 235.91518878936768, + "epoch": 1.8878829791692864, + "grad_norm": 0.2383186917676842, + "kl": 0.11273193359375, + "learning_rate": 4.962691082943528e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7526785843074322, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11260 + }, + { + "completion_length": 240.62947368621826, + "epoch": 1.8882182824091538, + "grad_norm": 0.12872696701242822, + "kl": 0.2894744873046875, + "learning_rate": 4.962670433316912e-07, + "loss": 0.0003, + "reward": 1.7660714834928513, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11262 + }, + { + "completion_length": 237.12054634094238, + "epoch": 1.8885535856490212, + "grad_norm": 0.2637527234686633, + "kl": 0.201141357421875, + "learning_rate": 4.962649778020316e-07, + "loss": 0.0002, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714637488127, + "rewards/format_reward_func": 1.0, + "step": 11264 + }, + { + "completion_length": 241.26340198516846, + "epoch": 1.8888888888888888, + "grad_norm": 0.19499182893264433, + "kl": 0.1104736328125, + "learning_rate": 4.962629117053786e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 11266 + }, + { + "completion_length": 247.0312623977661, + "epoch": 1.8892241921287565, + "grad_norm": 0.34806357799507837, + "kl": 0.128662109375, + "learning_rate": 4.962608450417371e-07, + "loss": 0.0001, + "reward": 1.6964286416769028, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.6964286100119352, + "rewards/format_reward_func": 1.0, + "step": 11268 + }, + { + "completion_length": 243.08037090301514, + "epoch": 1.889559495368624, + "grad_norm": 0.13210932993568505, + "kl": 0.141876220703125, + "learning_rate": 4.962587778111119e-07, + "loss": 0.0001, + "reward": 1.832142896950245, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8321428745985031, + "rewards/format_reward_func": 1.0, + "step": 11270 + }, + { + "completion_length": 247.37947845458984, + "epoch": 1.8898947986084915, + "grad_norm": 0.37825347713712715, + "kl": 0.171356201171875, + "learning_rate": 4.962567100135075e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 11272 + }, + { + "completion_length": 250.8884038925171, + "epoch": 1.8902301018483592, + "grad_norm": 0.25807449253969955, + "kl": 0.112640380859375, + "learning_rate": 4.962546416489289e-07, + "loss": 0.0001, + "reward": 1.7589286491274834, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11274 + }, + { + "completion_length": 252.50894165039062, + "epoch": 1.8905654050882266, + "grad_norm": 0.3161104987993201, + "kl": 0.10699462890625, + "learning_rate": 4.962525727173809e-07, + "loss": 0.0001, + "reward": 1.746428668498993, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 11276 + }, + { + "completion_length": 250.54912090301514, + "epoch": 1.8909007083280942, + "grad_norm": 0.26533155739866404, + "kl": 0.1735382080078125, + "learning_rate": 4.962505032188682e-07, + "loss": 0.0002, + "reward": 1.725000075995922, + "reward_std": 0.05555838719010353, + "rewards/equation_reward_func": 0.7250000331550837, + "rewards/format_reward_func": 1.0, + "step": 11278 + }, + { + "completion_length": 252.9866180419922, + "epoch": 1.8912360115679618, + "grad_norm": 0.269657416281771, + "kl": 0.264495849609375, + "learning_rate": 4.962484331533955e-07, + "loss": 0.0003, + "reward": 1.7000000849366188, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7000000365078449, + "rewards/format_reward_func": 1.0, + "step": 11280 + }, + { + "completion_length": 255.80358600616455, + "epoch": 1.8915713148078295, + "grad_norm": 0.11395970195478738, + "kl": 0.197479248046875, + "learning_rate": 4.962463625209676e-07, + "loss": 0.0002, + "reward": 1.773214340209961, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11282 + }, + { + "completion_length": 252.80804634094238, + "epoch": 1.8919066180476969, + "grad_norm": 0.20015288301172346, + "kl": 0.2366180419921875, + "learning_rate": 4.962442913215892e-07, + "loss": 0.0002, + "reward": 1.7607143446803093, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7696428839117289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11284 + }, + { + "completion_length": 265.46429920196533, + "epoch": 1.8922419212875643, + "grad_norm": 0.3071617414978232, + "kl": 0.1317138671875, + "learning_rate": 4.962422195552652e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7767857313156128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11286 + }, + { + "completion_length": 264.34822845458984, + "epoch": 1.892577224527432, + "grad_norm": 0.5844285027634494, + "kl": 0.6663818359375, + "learning_rate": 4.962401472220004e-07, + "loss": 0.0007, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11288 + }, + { + "completion_length": 257.7276906967163, + "epoch": 1.8929125277672996, + "grad_norm": 0.18094547642963446, + "kl": 0.4002685546875, + "learning_rate": 4.962380743217994e-07, + "loss": 0.0004, + "reward": 1.7250000983476639, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 11290 + }, + { + "completion_length": 256.72322845458984, + "epoch": 1.8932478310071672, + "grad_norm": 0.31669459769976144, + "kl": 0.608367919921875, + "learning_rate": 4.96236000854667e-07, + "loss": 0.0006, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428772062063, + "rewards/format_reward_func": 1.0, + "step": 11292 + }, + { + "completion_length": 249.0491189956665, + "epoch": 1.8935831342470346, + "grad_norm": 0.23547687690667424, + "kl": 0.344940185546875, + "learning_rate": 4.962339268206081e-07, + "loss": 0.0003, + "reward": 1.7875000536441803, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11294 + }, + { + "completion_length": 250.0669755935669, + "epoch": 1.8939184374869023, + "grad_norm": 0.44210786853010475, + "kl": 0.396240234375, + "learning_rate": 4.962318522196274e-07, + "loss": 0.0004, + "reward": 1.7125000953674316, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7169643193483353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11296 + }, + { + "completion_length": 264.73215198516846, + "epoch": 1.8942537407267697, + "grad_norm": 0.2360487677899644, + "kl": 0.343963623046875, + "learning_rate": 4.962297770517296e-07, + "loss": 0.0003, + "reward": 1.801785759627819, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.8151785954833031, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11298 + }, + { + "completion_length": 253.4285831451416, + "epoch": 1.8945890439666373, + "grad_norm": 0.19174211335333957, + "kl": 0.182464599609375, + "learning_rate": 4.962277013169197e-07, + "loss": 0.0002, + "reward": 1.7089286521077156, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7133928909897804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11300 + }, + { + "completion_length": 262.34376335144043, + "epoch": 1.894924347206505, + "grad_norm": 0.4497420045476892, + "kl": 0.34619140625, + "learning_rate": 4.962256250152022e-07, + "loss": 0.0003, + "reward": 1.6500000730156898, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.6678571738302708, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11302 + }, + { + "completion_length": 266.2901906967163, + "epoch": 1.8952596504463726, + "grad_norm": 0.19696851735656268, + "kl": 0.706573486328125, + "learning_rate": 4.962235481465821e-07, + "loss": 0.0007, + "reward": 1.7504464909434319, + "reward_std": 0.0700793326832354, + "rewards/equation_reward_func": 0.7651786096394062, + "rewards/format_reward_func": 0.9852678664028645, + "step": 11304 + }, + { + "completion_length": 257.4821538925171, + "epoch": 1.89559495368624, + "grad_norm": 0.2895527404607826, + "kl": 0.30169677734375, + "learning_rate": 4.962214707110641e-07, + "loss": 0.0003, + "reward": 1.7000000849366188, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.708928607404232, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11306 + }, + { + "completion_length": 245.58929824829102, + "epoch": 1.8959302569261074, + "grad_norm": 0.3727980145309879, + "kl": 1.33642578125, + "learning_rate": 4.96219392708653e-07, + "loss": 0.0013, + "reward": 1.7535715103149414, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7625000365078449, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11308 + }, + { + "completion_length": 251.3259048461914, + "epoch": 1.896265560165975, + "grad_norm": 0.08608763982420042, + "kl": 1.362823486328125, + "learning_rate": 4.962173141393535e-07, + "loss": 0.0014, + "reward": 1.801785759627819, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11310 + }, + { + "completion_length": 251.4330472946167, + "epoch": 1.8966008634058427, + "grad_norm": 0.2428648181228268, + "kl": 0.157745361328125, + "learning_rate": 4.962152350031704e-07, + "loss": 0.0002, + "reward": 1.7303572297096252, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7348214462399483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11312 + }, + { + "completion_length": 260.3080472946167, + "epoch": 1.8969361666457103, + "grad_norm": 0.2209283834755941, + "kl": 0.304290771484375, + "learning_rate": 4.962131553001086e-07, + "loss": 0.0003, + "reward": 1.7446429133415222, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7491071820259094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11314 + }, + { + "completion_length": 248.6696548461914, + "epoch": 1.897271469885578, + "grad_norm": 0.159877997058151, + "kl": 0.5746612548828125, + "learning_rate": 4.962110750301729e-07, + "loss": 0.0006, + "reward": 1.759375050663948, + "reward_std": 0.017046324210241437, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 0.9986607171595097, + "step": 11316 + }, + { + "completion_length": 260.1384038925171, + "epoch": 1.8976067731254453, + "grad_norm": 0.17812440090711842, + "kl": 0.610321044921875, + "learning_rate": 4.96208994193368e-07, + "loss": 0.0006, + "reward": 1.7660714983940125, + "reward_std": 0.07828682288527489, + "rewards/equation_reward_func": 0.7794643212109804, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11318 + }, + { + "completion_length": 248.71875858306885, + "epoch": 1.8979420763653128, + "grad_norm": 0.22698536374050526, + "kl": 0.59393310546875, + "learning_rate": 4.962069127896987e-07, + "loss": 0.0006, + "reward": 1.841071456670761, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.8455357290804386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11320 + }, + { + "completion_length": 253.8259038925171, + "epoch": 1.8982773796051804, + "grad_norm": 0.21589363012833795, + "kl": 0.1286773681640625, + "learning_rate": 4.962048308191698e-07, + "loss": 0.0001, + "reward": 1.7660714983940125, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11322 + }, + { + "completion_length": 256.321439743042, + "epoch": 1.898612682845048, + "grad_norm": 0.13809367925350216, + "kl": 1.128448486328125, + "learning_rate": 4.96202748281786e-07, + "loss": 0.0011, + "reward": 1.7625000551342964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11324 + }, + { + "completion_length": 255.62054538726807, + "epoch": 1.8989479860849157, + "grad_norm": 0.1571339857944827, + "kl": 0.396942138671875, + "learning_rate": 4.962006651775522e-07, + "loss": 0.0004, + "reward": 1.7535714954137802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 11326 + }, + { + "completion_length": 263.977689743042, + "epoch": 1.899283289324783, + "grad_norm": 0.3801393073801637, + "kl": 0.1540679931640625, + "learning_rate": 4.961985815064732e-07, + "loss": 0.0002, + "reward": 1.7803572043776512, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11328 + }, + { + "completion_length": 259.214298248291, + "epoch": 1.8996185925646507, + "grad_norm": 0.007573744361931633, + "kl": 0.76910400390625, + "learning_rate": 4.961964972685539e-07, + "loss": 0.0008, + "reward": 1.78035718947649, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11330 + }, + { + "completion_length": 266.71875953674316, + "epoch": 1.8999538958045181, + "grad_norm": 0.18791643353272014, + "kl": 0.676361083984375, + "learning_rate": 4.961944124637989e-07, + "loss": 0.0007, + "reward": 1.723214365541935, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786081492901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11332 + }, + { + "completion_length": 257.7366199493408, + "epoch": 1.9002891990443858, + "grad_norm": 0.20574448512918195, + "kl": 0.811431884765625, + "learning_rate": 4.96192327092213e-07, + "loss": 0.0008, + "reward": 1.730357214808464, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7348214723169804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11334 + }, + { + "completion_length": 266.5669765472412, + "epoch": 1.9006245022842534, + "grad_norm": 0.12088067703855519, + "kl": 0.2629852294921875, + "learning_rate": 4.961902411538013e-07, + "loss": 0.0003, + "reward": 1.739285759627819, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7482143305242062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11336 + }, + { + "completion_length": 275.45537090301514, + "epoch": 1.900959805524121, + "grad_norm": 0.19524772069955354, + "kl": 0.3109130859375, + "learning_rate": 4.961881546485682e-07, + "loss": 0.0003, + "reward": 1.7625000849366188, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11338 + }, + { + "completion_length": 271.44644355773926, + "epoch": 1.9012951087639884, + "grad_norm": 0.1591140863477574, + "kl": 0.1329803466796875, + "learning_rate": 4.961860675765188e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11340 + }, + { + "completion_length": 278.3035821914673, + "epoch": 1.9016304120038559, + "grad_norm": 0.2581112163246211, + "kl": 0.2618408203125, + "learning_rate": 4.961839799376576e-07, + "loss": 0.0003, + "reward": 1.8392857685685158, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8392857275903225, + "rewards/format_reward_func": 1.0, + "step": 11342 + }, + { + "completion_length": 263.99554920196533, + "epoch": 1.9019657152437235, + "grad_norm": 0.18510317852809857, + "kl": 0.1702880859375, + "learning_rate": 4.961818917319897e-07, + "loss": 0.0002, + "reward": 1.7267857939004898, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11344 + }, + { + "completion_length": 265.2634048461914, + "epoch": 1.9023010184835911, + "grad_norm": 0.1454532985720656, + "kl": 0.371673583984375, + "learning_rate": 4.961798029595199e-07, + "loss": 0.0004, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143130153418, + "rewards/format_reward_func": 1.0, + "step": 11346 + }, + { + "completion_length": 271.0044765472412, + "epoch": 1.9026363217234588, + "grad_norm": 0.11359424624294441, + "kl": 0.22344970703125, + "learning_rate": 4.961777136202528e-07, + "loss": 0.0002, + "reward": 1.7410714998841286, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11348 + }, + { + "completion_length": 269.70536708831787, + "epoch": 1.9029716249633262, + "grad_norm": 0.14220671747259186, + "kl": 0.181732177734375, + "learning_rate": 4.961756237141934e-07, + "loss": 0.0002, + "reward": 1.728571504354477, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714633762836, + "rewards/format_reward_func": 1.0, + "step": 11350 + }, + { + "completion_length": 269.3705530166626, + "epoch": 1.9033069282031938, + "grad_norm": 0.1606879487831985, + "kl": 0.114715576171875, + "learning_rate": 4.961735332413465e-07, + "loss": 0.0001, + "reward": 1.751785770058632, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500443309546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11352 + }, + { + "completion_length": 269.37054538726807, + "epoch": 1.9036422314430612, + "grad_norm": 0.3928058113985592, + "kl": 0.1796112060546875, + "learning_rate": 4.961714422017167e-07, + "loss": 0.0002, + "reward": 1.7678571790456772, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 11354 + }, + { + "completion_length": 267.2500123977661, + "epoch": 1.9039775346829289, + "grad_norm": 0.10755558405850703, + "kl": 0.14447021484375, + "learning_rate": 4.96169350595309e-07, + "loss": 0.0001, + "reward": 1.8464286103844643, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8464285954833031, + "rewards/format_reward_func": 1.0, + "step": 11356 + }, + { + "completion_length": 264.2812623977661, + "epoch": 1.9043128379227965, + "grad_norm": 0.28424929901275836, + "kl": 0.12335205078125, + "learning_rate": 4.961672584221282e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.8089285865426064, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11358 + }, + { + "completion_length": 264.68751525878906, + "epoch": 1.9046481411626641, + "grad_norm": 0.14374849264193768, + "kl": 0.1146240234375, + "learning_rate": 4.961651656821791e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 11360 + }, + { + "completion_length": 273.4866180419922, + "epoch": 1.9049834444025315, + "grad_norm": 0.227732210229808, + "kl": 0.21929931640625, + "learning_rate": 4.961630723754666e-07, + "loss": 0.0002, + "reward": 1.7267857640981674, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500309199095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11362 + }, + { + "completion_length": 270.5803689956665, + "epoch": 1.905318747642399, + "grad_norm": 0.17363189254935005, + "kl": 0.14501953125, + "learning_rate": 4.961609785019954e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 11364 + }, + { + "completion_length": 271.4330472946167, + "epoch": 1.9056540508822666, + "grad_norm": 0.23595968121958819, + "kl": 0.207855224609375, + "learning_rate": 4.961588840617703e-07, + "loss": 0.0002, + "reward": 1.7142857983708382, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7232143171131611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11366 + }, + { + "completion_length": 264.21876430511475, + "epoch": 1.9059893541221342, + "grad_norm": 0.14596728600368417, + "kl": 0.11456298828125, + "learning_rate": 4.961567890547962e-07, + "loss": 0.0001, + "reward": 1.8017857521772385, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11368 + }, + { + "completion_length": 270.11162090301514, + "epoch": 1.9063246573620019, + "grad_norm": 0.23307965611049494, + "kl": 0.19744873046875, + "learning_rate": 4.96154693481078e-07, + "loss": 0.0002, + "reward": 1.7053572162985802, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.709821468219161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11370 + }, + { + "completion_length": 265.9509029388428, + "epoch": 1.9066599606018693, + "grad_norm": 0.32637737542984435, + "kl": 0.22430419921875, + "learning_rate": 4.961525973406203e-07, + "loss": 0.0002, + "reward": 1.8000000640749931, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.8089285865426064, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11372 + }, + { + "completion_length": 261.34376335144043, + "epoch": 1.906995263841737, + "grad_norm": 0.2971004762919588, + "kl": 0.1528472900390625, + "learning_rate": 4.961505006334281e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.8089285790920258, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11374 + }, + { + "completion_length": 266.3169775009155, + "epoch": 1.9073305670816043, + "grad_norm": 0.0754146216793177, + "kl": 0.145721435546875, + "learning_rate": 4.961484033595061e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7660714499652386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11376 + }, + { + "completion_length": 264.87055110931396, + "epoch": 1.907665870321472, + "grad_norm": 0.14408567338438105, + "kl": 0.123016357421875, + "learning_rate": 4.961463055188593e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000428408384, + "rewards/format_reward_func": 1.0, + "step": 11378 + }, + { + "completion_length": 260.95983505249023, + "epoch": 1.9080011735613396, + "grad_norm": 0.21066553363979487, + "kl": 0.201416015625, + "learning_rate": 4.961442071114925e-07, + "loss": 0.0002, + "reward": 1.796428605914116, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286040514708, + "rewards/format_reward_func": 1.0, + "step": 11380 + }, + { + "completion_length": 262.84822845458984, + "epoch": 1.9083364768012072, + "grad_norm": 0.25315756086610974, + "kl": 0.1490478515625, + "learning_rate": 4.961421081374104e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7946428880095482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11382 + }, + { + "completion_length": 263.6250114440918, + "epoch": 1.9086717800410746, + "grad_norm": 0.2453006777873615, + "kl": 0.224761962890625, + "learning_rate": 4.961400085966179e-07, + "loss": 0.0002, + "reward": 1.7285714894533157, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7375000324100256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11384 + }, + { + "completion_length": 259.40179538726807, + "epoch": 1.909007083280942, + "grad_norm": 0.39350043991505274, + "kl": 0.12457275390625, + "learning_rate": 4.961379084891199e-07, + "loss": 0.0001, + "reward": 1.7946429252624512, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11386 + }, + { + "completion_length": 270.33930015563965, + "epoch": 1.9093423865208097, + "grad_norm": 0.2779319904231321, + "kl": 0.175750732421875, + "learning_rate": 4.961358078149211e-07, + "loss": 0.0002, + "reward": 1.735714353621006, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11388 + }, + { + "completion_length": 257.08036708831787, + "epoch": 1.9096776897606773, + "grad_norm": 0.8968730072056478, + "kl": 0.12542724609375, + "learning_rate": 4.961337065740263e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.8196428827941418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11390 + }, + { + "completion_length": 258.45090770721436, + "epoch": 1.910012993000545, + "grad_norm": 0.273157945393228, + "kl": 0.130096435546875, + "learning_rate": 4.961316047664406e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7723214477300644, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11392 + }, + { + "completion_length": 263.4241180419922, + "epoch": 1.9103482962404126, + "grad_norm": 0.13818252928656122, + "kl": 0.154266357421875, + "learning_rate": 4.961295023921687e-07, + "loss": 0.0002, + "reward": 1.7714286297559738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 11394 + }, + { + "completion_length": 265.8214406967163, + "epoch": 1.91068359948028, + "grad_norm": 0.27011759053789397, + "kl": 0.226043701171875, + "learning_rate": 4.961273994512154e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 11396 + }, + { + "completion_length": 250.55358409881592, + "epoch": 1.9110189027201474, + "grad_norm": 0.6885092609424, + "kl": 0.2010955810546875, + "learning_rate": 4.961252959435856e-07, + "loss": 0.0002, + "reward": 1.7464286610484123, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7553571779280901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11398 + }, + { + "completion_length": 268.08929443359375, + "epoch": 1.911354205960015, + "grad_norm": 0.2028304503555617, + "kl": 0.15380859375, + "learning_rate": 4.961231918692839e-07, + "loss": 0.0002, + "reward": 1.7964286282658577, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 11400 + }, + { + "completion_length": 268.0803680419922, + "epoch": 1.9116895091998827, + "grad_norm": 0.8293108782393289, + "kl": 0.17572021484375, + "learning_rate": 4.961210872283157e-07, + "loss": 0.0002, + "reward": 1.7678572237491608, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7767857313156128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11402 + }, + { + "completion_length": 264.0044765472412, + "epoch": 1.9120248124397503, + "grad_norm": 0.3747041193333945, + "kl": 0.146636962890625, + "learning_rate": 4.961189820206852e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11404 + }, + { + "completion_length": 278.9464387893677, + "epoch": 1.9123601156796177, + "grad_norm": 0.13538982335741828, + "kl": 0.15130615234375, + "learning_rate": 4.961168762463978e-07, + "loss": 0.0002, + "reward": 1.774107202887535, + "reward_std": 0.056821079924702644, + "rewards/equation_reward_func": 0.7803571783006191, + "rewards/format_reward_func": 0.9937500059604645, + "step": 11406 + }, + { + "completion_length": 266.1651906967163, + "epoch": 1.9126954189194854, + "grad_norm": 0.8431514704194816, + "kl": 0.159820556640625, + "learning_rate": 4.961147699054579e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7660714611411095, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11408 + }, + { + "completion_length": 277.15180110931396, + "epoch": 1.9130307221593528, + "grad_norm": 1.5579412370998411, + "kl": 0.156585693359375, + "learning_rate": 4.961126629978707e-07, + "loss": 0.0002, + "reward": 1.7482143342494965, + "reward_std": 0.0732360603287816, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9776785783469677, + "step": 11410 + }, + { + "completion_length": 285.42858505249023, + "epoch": 1.9133660253992204, + "grad_norm": 0.41450893660091553, + "kl": 0.1715240478515625, + "learning_rate": 4.961105555236408e-07, + "loss": 0.0002, + "reward": 1.7928572073578835, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8017857484519482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11412 + }, + { + "completion_length": 274.0803689956665, + "epoch": 1.913701328639088, + "grad_norm": 0.24263481536803763, + "kl": 0.14422607421875, + "learning_rate": 4.961084474827731e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7580357491970062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11414 + }, + { + "completion_length": 280.65179443359375, + "epoch": 1.9140366318789557, + "grad_norm": 1.6763596841012791, + "kl": 2.72528076171875, + "learning_rate": 4.961063388752726e-07, + "loss": 0.0027, + "reward": 1.6982143819332123, + "reward_std": 0.11364216078072786, + "rewards/equation_reward_func": 0.7383928745985031, + "rewards/format_reward_func": 0.9598214477300644, + "step": 11416 + }, + { + "completion_length": 286.12054538726807, + "epoch": 1.914371935118823, + "grad_norm": 0.6866111926368569, + "kl": 0.192596435546875, + "learning_rate": 4.961042297011441e-07, + "loss": 0.0002, + "reward": 1.6571429297327995, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.6928571723401546, + "rewards/format_reward_func": 0.9642857275903225, + "step": 11418 + }, + { + "completion_length": 268.3839387893677, + "epoch": 1.9147072383586905, + "grad_norm": 0.2820457558191292, + "kl": 0.11151123046875, + "learning_rate": 4.961021199603923e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7830357402563095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11420 + }, + { + "completion_length": 270.7812623977661, + "epoch": 1.9150425415985581, + "grad_norm": 0.13665542251726445, + "kl": 0.1313629150390625, + "learning_rate": 4.961000096530222e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7839285954833031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11422 + }, + { + "completion_length": 268.5669765472412, + "epoch": 1.9153778448384258, + "grad_norm": 0.39813810964567975, + "kl": 0.13531494140625, + "learning_rate": 4.960978987790386e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428809314966, + "rewards/format_reward_func": 1.0, + "step": 11424 + }, + { + "completion_length": 251.665189743042, + "epoch": 1.9157131480782934, + "grad_norm": 0.10712173856582168, + "kl": 0.0943603515625, + "learning_rate": 4.960957873384465e-07, + "loss": 0.0001, + "reward": 1.8392857536673546, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8392857164144516, + "rewards/format_reward_func": 1.0, + "step": 11426 + }, + { + "completion_length": 255.4062623977661, + "epoch": 1.9160484513181608, + "grad_norm": 0.14126711549786453, + "kl": 0.1662445068359375, + "learning_rate": 4.960936753312506e-07, + "loss": 0.0002, + "reward": 1.807142898440361, + "reward_std": 0.04040610231459141, + "rewards/equation_reward_func": 0.8160714581608772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11428 + }, + { + "completion_length": 248.08037185668945, + "epoch": 1.9163837545580285, + "grad_norm": 0.3138137135372601, + "kl": 0.113006591796875, + "learning_rate": 4.960915627574558e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11430 + }, + { + "completion_length": 247.3169765472412, + "epoch": 1.9167190577978959, + "grad_norm": 0.04930311204874395, + "kl": 0.1185302734375, + "learning_rate": 4.96089449617067e-07, + "loss": 0.0001, + "reward": 1.785714328289032, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 11432 + }, + { + "completion_length": 253.94197368621826, + "epoch": 1.9170543610377635, + "grad_norm": 0.34253288139320753, + "kl": 0.177154541015625, + "learning_rate": 4.96087335910089e-07, + "loss": 0.0002, + "reward": 1.760714329779148, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 11434 + }, + { + "completion_length": 248.2812614440918, + "epoch": 1.9173896642776311, + "grad_norm": 0.15253380622826307, + "kl": 0.120941162109375, + "learning_rate": 4.960852216365268e-07, + "loss": 0.0001, + "reward": 1.7642857655882835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 11436 + }, + { + "completion_length": 259.6741180419922, + "epoch": 1.9177249675174988, + "grad_norm": 0.16595620257856564, + "kl": 0.131561279296875, + "learning_rate": 4.960831067963851e-07, + "loss": 0.0001, + "reward": 1.7392857670783997, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11438 + }, + { + "completion_length": 242.8125114440918, + "epoch": 1.9180602707573662, + "grad_norm": 0.13210774905736541, + "kl": 0.104461669921875, + "learning_rate": 4.960809913896689e-07, + "loss": 0.0001, + "reward": 1.7571429014205933, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 11440 + }, + { + "completion_length": 250.49554920196533, + "epoch": 1.9183955739972336, + "grad_norm": 0.004442574314263273, + "kl": 0.148345947265625, + "learning_rate": 4.960788754163829e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 11442 + }, + { + "completion_length": 235.95090103149414, + "epoch": 1.9187308772371012, + "grad_norm": 0.13640183603938238, + "kl": 0.125457763671875, + "learning_rate": 4.960767588765322e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 11444 + }, + { + "completion_length": 242.54911613464355, + "epoch": 1.9190661804769689, + "grad_norm": 0.3164099307381995, + "kl": 0.1171875, + "learning_rate": 4.960746417701215e-07, + "loss": 0.0001, + "reward": 1.8607143312692642, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8607143051922321, + "rewards/format_reward_func": 1.0, + "step": 11446 + }, + { + "completion_length": 258.3526906967163, + "epoch": 1.9194014837168365, + "grad_norm": 0.24606226713899274, + "kl": 0.09759521484375, + "learning_rate": 4.960725240971558e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7491071633994579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11448 + }, + { + "completion_length": 249.64733219146729, + "epoch": 1.9197367869567041, + "grad_norm": 0.21903366414071526, + "kl": 0.117919921875, + "learning_rate": 4.960704058576399e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857566475868, + "rewards/format_reward_func": 1.0, + "step": 11450 + }, + { + "completion_length": 250.1428680419922, + "epoch": 1.9200720901965715, + "grad_norm": 0.11739844578988058, + "kl": 0.15765380859375, + "learning_rate": 4.960682870515786e-07, + "loss": 0.0002, + "reward": 1.700000062584877, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7000000365078449, + "rewards/format_reward_func": 1.0, + "step": 11452 + }, + { + "completion_length": 252.415189743042, + "epoch": 1.920407393436439, + "grad_norm": 0.23504962226583487, + "kl": 0.130767822265625, + "learning_rate": 4.96066167678977e-07, + "loss": 0.0001, + "reward": 1.8035715073347092, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 11454 + }, + { + "completion_length": 250.7500114440918, + "epoch": 1.9207426966763066, + "grad_norm": 0.17987810570866028, + "kl": 0.1189422607421875, + "learning_rate": 4.960640477398398e-07, + "loss": 0.0001, + "reward": 1.735714390873909, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 11456 + }, + { + "completion_length": 251.9062614440918, + "epoch": 1.9210779999161742, + "grad_norm": 0.5340453361696048, + "kl": 0.1201019287109375, + "learning_rate": 4.96061927234172e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 11458 + }, + { + "completion_length": 257.8660840988159, + "epoch": 1.9214133031560419, + "grad_norm": 0.06368641751747849, + "kl": 0.11151123046875, + "learning_rate": 4.960598061619782e-07, + "loss": 0.0001, + "reward": 1.826785758137703, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8401785865426064, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11460 + }, + { + "completion_length": 260.2812614440918, + "epoch": 1.9217486063959093, + "grad_norm": 0.20438380505560297, + "kl": 0.2633209228515625, + "learning_rate": 4.960576845232637e-07, + "loss": 0.0003, + "reward": 1.7714286297559738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 11462 + }, + { + "completion_length": 272.7500123977661, + "epoch": 1.922083909635777, + "grad_norm": 0.4689894616913192, + "kl": 0.259124755859375, + "learning_rate": 4.960555623180331e-07, + "loss": 0.0003, + "reward": 1.712500087916851, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7169643212109804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11464 + }, + { + "completion_length": 262.7098321914673, + "epoch": 1.9224192128756443, + "grad_norm": 0.198931744948235, + "kl": 0.163665771484375, + "learning_rate": 4.960534395462913e-07, + "loss": 0.0002, + "reward": 1.7660714909434319, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11466 + }, + { + "completion_length": 262.0714406967163, + "epoch": 1.922754516115512, + "grad_norm": 0.16364250738570876, + "kl": 0.1119537353515625, + "learning_rate": 4.960513162080434e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857540398836, + "rewards/format_reward_func": 1.0, + "step": 11468 + }, + { + "completion_length": 254.85715007781982, + "epoch": 1.9230898193553796, + "grad_norm": 0.17039801150252207, + "kl": 0.227020263671875, + "learning_rate": 4.96049192303294e-07, + "loss": 0.0002, + "reward": 1.7178572341799736, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 11470 + }, + { + "completion_length": 248.5312614440918, + "epoch": 1.9234251225952472, + "grad_norm": 0.2090763004651719, + "kl": 0.159637451171875, + "learning_rate": 4.960470678320482e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7732143104076385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11472 + }, + { + "completion_length": 260.42412090301514, + "epoch": 1.9237604258351146, + "grad_norm": 0.23593082654275854, + "kl": 0.112701416015625, + "learning_rate": 4.960449427943108e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714752972126, + "rewards/format_reward_func": 1.0, + "step": 11474 + }, + { + "completion_length": 275.6696548461914, + "epoch": 1.924095729074982, + "grad_norm": 0.20935209515541264, + "kl": 0.1348876953125, + "learning_rate": 4.960428171900868e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7580357491970062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11476 + }, + { + "completion_length": 262.495548248291, + "epoch": 1.9244310323148497, + "grad_norm": 0.21949802358359427, + "kl": 0.119415283203125, + "learning_rate": 4.960406910193809e-07, + "loss": 0.0001, + "reward": 1.7875000461935997, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7919643148779869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11478 + }, + { + "completion_length": 269.5134038925171, + "epoch": 1.9247663355547173, + "grad_norm": 0.15969970131972752, + "kl": 0.154083251953125, + "learning_rate": 4.960385642821982e-07, + "loss": 0.0002, + "reward": 1.8035714775323868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 11480 + }, + { + "completion_length": 271.7321557998657, + "epoch": 1.925101638794585, + "grad_norm": 0.22003888136350389, + "kl": 0.22259521484375, + "learning_rate": 4.960364369785433e-07, + "loss": 0.0002, + "reward": 1.760714329779148, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11482 + }, + { + "completion_length": 273.16965770721436, + "epoch": 1.9254369420344524, + "grad_norm": 0.4368958131528556, + "kl": 0.192474365234375, + "learning_rate": 4.960343091084215e-07, + "loss": 0.0002, + "reward": 1.7357143610715866, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7446428947150707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11484 + }, + { + "completion_length": 275.43750953674316, + "epoch": 1.92577224527432, + "grad_norm": 0.1728484515807659, + "kl": 0.2697906494140625, + "learning_rate": 4.960321806718375e-07, + "loss": 0.0003, + "reward": 1.8178571835160255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 11486 + }, + { + "completion_length": 276.66965675354004, + "epoch": 1.9261075485141874, + "grad_norm": 0.5261602344874771, + "kl": 0.2050933837890625, + "learning_rate": 4.960300516687961e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11488 + }, + { + "completion_length": 269.5848340988159, + "epoch": 1.926442851754055, + "grad_norm": 0.19601843506053898, + "kl": 0.2016143798828125, + "learning_rate": 4.960279220993023e-07, + "loss": 0.0002, + "reward": 1.7053572311997414, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7098214700818062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11490 + }, + { + "completion_length": 270.68751335144043, + "epoch": 1.9267781549939227, + "grad_norm": 0.17990204744519162, + "kl": 0.17333984375, + "learning_rate": 4.960257919633611e-07, + "loss": 0.0002, + "reward": 1.7857143133878708, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7946428991854191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11492 + }, + { + "completion_length": 262.0982265472412, + "epoch": 1.9271134582337903, + "grad_norm": 0.27930705227496755, + "kl": 0.2054595947265625, + "learning_rate": 4.960236612609773e-07, + "loss": 0.0002, + "reward": 1.733928643167019, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7383928876370192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11494 + }, + { + "completion_length": 263.8571557998657, + "epoch": 1.9274487614736577, + "grad_norm": 0.23885652033263852, + "kl": 0.25677490234375, + "learning_rate": 4.960215299921557e-07, + "loss": 0.0003, + "reward": 1.7821429148316383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 11496 + }, + { + "completion_length": 261.25447273254395, + "epoch": 1.9277840647135251, + "grad_norm": 0.11426829648277192, + "kl": 0.13201904296875, + "learning_rate": 4.960193981569014e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7776786088943481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11498 + }, + { + "completion_length": 257.1428689956665, + "epoch": 1.9281193679533928, + "grad_norm": 0.14128657839377187, + "kl": 0.23486328125, + "learning_rate": 4.960172657552192e-07, + "loss": 0.0002, + "reward": 1.7535715028643608, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 11500 + }, + { + "completion_length": 269.8214387893677, + "epoch": 1.9284546711932604, + "grad_norm": 0.04353958717156185, + "kl": 0.460693359375, + "learning_rate": 4.960151327871141e-07, + "loss": 0.0005, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 11502 + }, + { + "completion_length": 252.66072750091553, + "epoch": 1.928789974433128, + "grad_norm": 0.15968991693381757, + "kl": 0.16265869140625, + "learning_rate": 4.960129992525909e-07, + "loss": 0.0002, + "reward": 1.76607146859169, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357577651739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11504 + }, + { + "completion_length": 263.5044775009155, + "epoch": 1.9291252776729955, + "grad_norm": 0.2532534123405873, + "kl": 0.210540771484375, + "learning_rate": 4.960108651516545e-07, + "loss": 0.0002, + "reward": 1.7571429535746574, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 11506 + }, + { + "completion_length": 267.4821548461914, + "epoch": 1.929460580912863, + "grad_norm": 0.14032272575220447, + "kl": 0.331695556640625, + "learning_rate": 4.960087304843099e-07, + "loss": 0.0003, + "reward": 1.8178571835160255, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 11508 + }, + { + "completion_length": 254.3035831451416, + "epoch": 1.9297958841527305, + "grad_norm": 0.14579727584127364, + "kl": 0.116943359375, + "learning_rate": 4.96006595250562e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 11510 + }, + { + "completion_length": 260.56251335144043, + "epoch": 1.9301311873925981, + "grad_norm": 0.2295536831446175, + "kl": 0.098663330078125, + "learning_rate": 4.960044594504158e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11512 + }, + { + "completion_length": 272.0000104904175, + "epoch": 1.9304664906324658, + "grad_norm": 0.1013315839791849, + "kl": 0.131591796875, + "learning_rate": 4.96002323083876e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 11514 + }, + { + "completion_length": 267.3169775009155, + "epoch": 1.9308017938723334, + "grad_norm": 0.2789793491517081, + "kl": 0.112640380859375, + "learning_rate": 4.960001861509477e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11516 + }, + { + "completion_length": 262.5625114440918, + "epoch": 1.9311370971122008, + "grad_norm": 0.23336605649958678, + "kl": 0.1778564453125, + "learning_rate": 4.959980486516358e-07, + "loss": 0.0002, + "reward": 1.725000061094761, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000387430191, + "rewards/format_reward_func": 1.0, + "step": 11518 + }, + { + "completion_length": 275.1562662124634, + "epoch": 1.9314724003520682, + "grad_norm": 0.19566252414623606, + "kl": 0.120880126953125, + "learning_rate": 4.959959105859451e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 11520 + }, + { + "completion_length": 273.3035821914673, + "epoch": 1.9318077035919359, + "grad_norm": 0.26468819573733504, + "kl": 0.131500244140625, + "learning_rate": 4.959937719538806e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7375000342726707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11522 + }, + { + "completion_length": 278.30804920196533, + "epoch": 1.9321430068318035, + "grad_norm": 0.16803667829067487, + "kl": 0.1395416259765625, + "learning_rate": 4.959916327554473e-07, + "loss": 0.0001, + "reward": 1.7107143476605415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143253087997, + "rewards/format_reward_func": 1.0, + "step": 11524 + }, + { + "completion_length": 282.00000953674316, + "epoch": 1.9324783100716711, + "grad_norm": 0.436746719978823, + "kl": 0.212310791015625, + "learning_rate": 4.959894929906499e-07, + "loss": 0.0002, + "reward": 1.7464286535978317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 11526 + }, + { + "completion_length": 274.15180110931396, + "epoch": 1.9328136133115388, + "grad_norm": 0.15089911493730374, + "kl": 0.23822021484375, + "learning_rate": 4.959873526594937e-07, + "loss": 0.0002, + "reward": 1.7196429148316383, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071872413158, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11528 + }, + { + "completion_length": 268.16519355773926, + "epoch": 1.9331489165514062, + "grad_norm": 0.24695749023732816, + "kl": 0.290924072265625, + "learning_rate": 4.959852117619834e-07, + "loss": 0.0003, + "reward": 1.7410714849829674, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7455357518047094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11530 + }, + { + "completion_length": 267.61608505249023, + "epoch": 1.9334842197912736, + "grad_norm": 0.15763417332067145, + "kl": 0.22369384765625, + "learning_rate": 4.959830702981237e-07, + "loss": 0.0002, + "reward": 1.7392857894301414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7392857298254967, + "rewards/format_reward_func": 1.0, + "step": 11532 + }, + { + "completion_length": 281.9687623977661, + "epoch": 1.9338195230311412, + "grad_norm": 0.22574997557198315, + "kl": 0.16107177734375, + "learning_rate": 4.9598092826792e-07, + "loss": 0.0002, + "reward": 1.737500086426735, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11534 + }, + { + "completion_length": 272.75447845458984, + "epoch": 1.9341548262710089, + "grad_norm": 0.20676245907015658, + "kl": 0.257080078125, + "learning_rate": 4.95978785671377e-07, + "loss": 0.0003, + "reward": 1.7642858028411865, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 11536 + }, + { + "completion_length": 273.80358695983887, + "epoch": 1.9344901295108765, + "grad_norm": 0.11482451783793798, + "kl": 0.14068603515625, + "learning_rate": 4.959766425084995e-07, + "loss": 0.0001, + "reward": 1.776785746216774, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7901786006987095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11538 + }, + { + "completion_length": 283.5178699493408, + "epoch": 1.934825432750744, + "grad_norm": 0.42962985392694314, + "kl": 0.11175537109375, + "learning_rate": 4.959744987792926e-07, + "loss": 0.0001, + "reward": 1.816071480512619, + "reward_std": 0.07828682009130716, + "rewards/equation_reward_func": 0.8205357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11540 + }, + { + "completion_length": 262.7500114440918, + "epoch": 1.9351607359906116, + "grad_norm": 0.26195593090194097, + "kl": 0.10430908203125, + "learning_rate": 4.959723544837611e-07, + "loss": 0.0001, + "reward": 1.8785714581608772, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8785714469850063, + "rewards/format_reward_func": 1.0, + "step": 11542 + }, + { + "completion_length": 265.85269260406494, + "epoch": 1.935496039230479, + "grad_norm": 0.22136712208141024, + "kl": 0.103271484375, + "learning_rate": 4.959702096219103e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.8267857432365417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11544 + }, + { + "completion_length": 276.78572845458984, + "epoch": 1.9358313424703466, + "grad_norm": 0.3523477652786812, + "kl": 0.125274658203125, + "learning_rate": 4.959680641937447e-07, + "loss": 0.0001, + "reward": 1.7428572326898575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 11546 + }, + { + "completion_length": 276.51340675354004, + "epoch": 1.9361666457102142, + "grad_norm": 0.12306280651202639, + "kl": 0.116790771484375, + "learning_rate": 4.959659181992695e-07, + "loss": 0.0001, + "reward": 1.7892857566475868, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7982143126428127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11548 + }, + { + "completion_length": 268.9509048461914, + "epoch": 1.9365019489500819, + "grad_norm": 0.1913487796137698, + "kl": 0.123565673828125, + "learning_rate": 4.959637716384895e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 11550 + }, + { + "completion_length": 271.3392972946167, + "epoch": 1.9368372521899493, + "grad_norm": 0.14629526246249022, + "kl": 0.1084747314453125, + "learning_rate": 4.959616245114097e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 11552 + }, + { + "completion_length": 279.4196548461914, + "epoch": 1.9371725554298167, + "grad_norm": 0.09694533133624259, + "kl": 0.129730224609375, + "learning_rate": 4.95959476818035e-07, + "loss": 0.0001, + "reward": 1.7660714760422707, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11554 + }, + { + "completion_length": 271.45983123779297, + "epoch": 1.9375078586696843, + "grad_norm": 0.2217887064929058, + "kl": 0.11297607421875, + "learning_rate": 4.959573285583706e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428686380386, + "rewards/format_reward_func": 1.0, + "step": 11556 + }, + { + "completion_length": 272.4732275009155, + "epoch": 1.937843161909552, + "grad_norm": 0.23558904806238534, + "kl": 0.132354736328125, + "learning_rate": 4.959551797324211e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 11558 + }, + { + "completion_length": 266.68305110931396, + "epoch": 1.9381784651494196, + "grad_norm": 0.22151610735532148, + "kl": 0.1597137451171875, + "learning_rate": 4.959530303401915e-07, + "loss": 0.0002, + "reward": 1.8169643357396126, + "reward_std": 0.05682107899338007, + "rewards/equation_reward_func": 0.8232143074274063, + "rewards/format_reward_func": 0.9937500059604645, + "step": 11560 + }, + { + "completion_length": 267.44644355773926, + "epoch": 1.938513768389287, + "grad_norm": 0.1748234891363921, + "kl": 0.1086578369140625, + "learning_rate": 4.959508803816868e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11562 + }, + { + "completion_length": 284.33930015563965, + "epoch": 1.9388490716291547, + "grad_norm": 0.189758110151058, + "kl": 0.1158447265625, + "learning_rate": 4.959487298569121e-07, + "loss": 0.0001, + "reward": 1.7875000536441803, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.8008928745985031, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11564 + }, + { + "completion_length": 273.1964416503906, + "epoch": 1.939184374869022, + "grad_norm": 0.22127814925587067, + "kl": 0.1219329833984375, + "learning_rate": 4.959465787658723e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7553571723401546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11566 + }, + { + "completion_length": 270.5446538925171, + "epoch": 1.9395196781088897, + "grad_norm": 0.2793978109461439, + "kl": 0.2030029296875, + "learning_rate": 4.959444271085723e-07, + "loss": 0.0002, + "reward": 1.7214286029338837, + "reward_std": 0.0656599160283804, + "rewards/equation_reward_func": 0.7482143081724644, + "rewards/format_reward_func": 0.9732142984867096, + "step": 11568 + }, + { + "completion_length": 273.8259086608887, + "epoch": 1.9398549813487573, + "grad_norm": 0.550426955461273, + "kl": 0.2730712890625, + "learning_rate": 4.959422748850168e-07, + "loss": 0.0003, + "reward": 1.7321429327130318, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7410714421421289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11570 + }, + { + "completion_length": 277.6205472946167, + "epoch": 1.940190284588625, + "grad_norm": 0.19508560241066944, + "kl": 0.1059417724609375, + "learning_rate": 4.959401220952112e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11572 + }, + { + "completion_length": 277.46429443359375, + "epoch": 1.9405255878284924, + "grad_norm": 0.44544147580496724, + "kl": 0.1553955078125, + "learning_rate": 4.959379687391602e-07, + "loss": 0.0002, + "reward": 1.803571492433548, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 11574 + }, + { + "completion_length": 272.8750123977661, + "epoch": 1.9408608910683598, + "grad_norm": 0.20959783360587844, + "kl": 0.17010498046875, + "learning_rate": 4.959358148168687e-07, + "loss": 0.0002, + "reward": 1.7678572088479996, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7767857536673546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11576 + }, + { + "completion_length": 267.2009057998657, + "epoch": 1.9411961943082274, + "grad_norm": 0.23572829651922414, + "kl": 0.221099853515625, + "learning_rate": 4.95933660328342e-07, + "loss": 0.0002, + "reward": 1.8000000715255737, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.8089285977184772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11578 + }, + { + "completion_length": 276.2991180419922, + "epoch": 1.941531497548095, + "grad_norm": 0.1705477545589577, + "kl": 0.1273193359375, + "learning_rate": 4.959315052735846e-07, + "loss": 0.0001, + "reward": 1.7767858058214188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500409781933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11580 + }, + { + "completion_length": 261.55358600616455, + "epoch": 1.9418668007879627, + "grad_norm": 0.4312032446580683, + "kl": 0.151031494140625, + "learning_rate": 4.959293496526018e-07, + "loss": 0.0002, + "reward": 1.7875000536441803, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11582 + }, + { + "completion_length": 269.62054443359375, + "epoch": 1.9422021040278303, + "grad_norm": 0.17382506989411386, + "kl": 0.1229248046875, + "learning_rate": 4.959271934653985e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 11584 + }, + { + "completion_length": 271.15179538726807, + "epoch": 1.9425374072676977, + "grad_norm": 0.22970344709163384, + "kl": 0.3343658447265625, + "learning_rate": 4.959250367119795e-07, + "loss": 0.0003, + "reward": 1.73214291036129, + "reward_std": 0.10606601648032665, + "rewards/equation_reward_func": 0.7500000204890966, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11586 + }, + { + "completion_length": 277.8303680419922, + "epoch": 1.9428727105075652, + "grad_norm": 0.24053185195149798, + "kl": 0.14495849609375, + "learning_rate": 4.9592287939235e-07, + "loss": 0.0001, + "reward": 1.7357143387198448, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7446428835391998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11588 + }, + { + "completion_length": 265.2723340988159, + "epoch": 1.9432080137474328, + "grad_norm": 0.2155865915403086, + "kl": 0.13531494140625, + "learning_rate": 4.959207215065148e-07, + "loss": 0.0001, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785895228386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11590 + }, + { + "completion_length": 263.0491199493408, + "epoch": 1.9435433169873004, + "grad_norm": 0.19655043391107452, + "kl": 0.6219482421875, + "learning_rate": 4.959185630544788e-07, + "loss": 0.0006, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 11592 + }, + { + "completion_length": 264.5535831451416, + "epoch": 1.943878620227168, + "grad_norm": 0.32884497758459014, + "kl": 0.231201171875, + "learning_rate": 4.959164040362473e-07, + "loss": 0.0002, + "reward": 1.7375000789761543, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7419643327593803, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11594 + }, + { + "completion_length": 263.308048248291, + "epoch": 1.9442139234670355, + "grad_norm": 0.12803094395008807, + "kl": 0.25775146484375, + "learning_rate": 4.959142444518249e-07, + "loss": 0.0003, + "reward": 1.7196429371833801, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.7330357320606709, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11596 + }, + { + "completion_length": 263.7901906967163, + "epoch": 1.944549226706903, + "grad_norm": 0.2162849149079578, + "kl": 0.13641357421875, + "learning_rate": 4.959120843012168e-07, + "loss": 0.0001, + "reward": 1.8196428939700127, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8241071589291096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11598 + }, + { + "completion_length": 253.55804538726807, + "epoch": 1.9448845299467705, + "grad_norm": 0.27344404414924084, + "kl": 0.5733642578125, + "learning_rate": 4.959099235844278e-07, + "loss": 0.0006, + "reward": 1.7982143461704254, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11600 + }, + { + "completion_length": 255.9464406967163, + "epoch": 1.9452198331866382, + "grad_norm": 0.19967799496439764, + "kl": 0.13580322265625, + "learning_rate": 4.959077623014631e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 11602 + }, + { + "completion_length": 255.94197750091553, + "epoch": 1.9455551364265058, + "grad_norm": 0.21747384010928997, + "kl": 0.6826171875, + "learning_rate": 4.959056004523275e-07, + "loss": 0.0007, + "reward": 1.7642857506871223, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11604 + }, + { + "completion_length": 252.40179920196533, + "epoch": 1.9458904396663734, + "grad_norm": 0.13116440361104098, + "kl": 0.253143310546875, + "learning_rate": 4.959034380370261e-07, + "loss": 0.0003, + "reward": 1.7142857983708382, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7142857424914837, + "rewards/format_reward_func": 1.0, + "step": 11606 + }, + { + "completion_length": 258.94643783569336, + "epoch": 1.9462257429062408, + "grad_norm": 0.17006587057823047, + "kl": 0.4481964111328125, + "learning_rate": 4.959012750555638e-07, + "loss": 0.0004, + "reward": 1.7553572058677673, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11608 + }, + { + "completion_length": 251.8482255935669, + "epoch": 1.9465610461461083, + "grad_norm": 0.17390186294610058, + "kl": 0.245513916015625, + "learning_rate": 4.958991115079455e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 11610 + }, + { + "completion_length": 252.4509048461914, + "epoch": 1.9468963493859759, + "grad_norm": 0.13355463954568156, + "kl": 0.18939208984375, + "learning_rate": 4.958969473941763e-07, + "loss": 0.0002, + "reward": 1.6750000789761543, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.6750000305473804, + "rewards/format_reward_func": 1.0, + "step": 11612 + }, + { + "completion_length": 263.87055015563965, + "epoch": 1.9472316526258435, + "grad_norm": 0.30431565753795486, + "kl": 0.47369384765625, + "learning_rate": 4.958947827142612e-07, + "loss": 0.0005, + "reward": 1.7303571999073029, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7348214630037546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11614 + }, + { + "completion_length": 260.1562614440918, + "epoch": 1.9475669558657112, + "grad_norm": 0.2853115135319929, + "kl": 0.405059814453125, + "learning_rate": 4.958926174682052e-07, + "loss": 0.0004, + "reward": 1.7303572222590446, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.743750024586916, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11616 + }, + { + "completion_length": 252.3392972946167, + "epoch": 1.9479022591055786, + "grad_norm": 0.15608169259880164, + "kl": 0.25421142578125, + "learning_rate": 4.958904516560132e-07, + "loss": 0.0003, + "reward": 1.771428607404232, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286148548126, + "rewards/format_reward_func": 1.0, + "step": 11618 + }, + { + "completion_length": 247.1339406967163, + "epoch": 1.9482375623454462, + "grad_norm": 0.17523372472153195, + "kl": 0.204986572265625, + "learning_rate": 4.958882852776901e-07, + "loss": 0.0002, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071816533804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11620 + }, + { + "completion_length": 241.5134048461914, + "epoch": 1.9485728655853136, + "grad_norm": 0.2508027085559755, + "kl": 0.18426513671875, + "learning_rate": 4.958861183332411e-07, + "loss": 0.0002, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428824216127, + "rewards/format_reward_func": 1.0, + "step": 11622 + }, + { + "completion_length": 240.7187614440918, + "epoch": 1.9489081688251813, + "grad_norm": 0.19603250128713348, + "kl": 0.172637939453125, + "learning_rate": 4.95883950822671e-07, + "loss": 0.0002, + "reward": 1.8178571835160255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571630269289, + "rewards/format_reward_func": 1.0, + "step": 11624 + }, + { + "completion_length": 245.19197368621826, + "epoch": 1.9492434720650489, + "grad_norm": 0.32498217250259853, + "kl": 0.6026763916015625, + "learning_rate": 4.95881782745985e-07, + "loss": 0.0006, + "reward": 1.7982143610715866, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026785850524902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11626 + }, + { + "completion_length": 244.44643878936768, + "epoch": 1.9495787753049165, + "grad_norm": 0.08157777296903064, + "kl": 0.193695068359375, + "learning_rate": 4.958796141031878e-07, + "loss": 0.0002, + "reward": 1.775000087916851, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 11628 + }, + { + "completion_length": 246.67858409881592, + "epoch": 1.949914078544784, + "grad_norm": 0.24126868290716924, + "kl": 0.1125946044921875, + "learning_rate": 4.958774448942847e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 11630 + }, + { + "completion_length": 250.96429634094238, + "epoch": 1.9502493817846513, + "grad_norm": 0.058257893302917085, + "kl": 0.1483154296875, + "learning_rate": 4.958752751192805e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11632 + }, + { + "completion_length": 262.72768783569336, + "epoch": 1.950584685024519, + "grad_norm": 0.19758306784761007, + "kl": 3.353057861328125, + "learning_rate": 4.958731047781803e-07, + "loss": 0.0034, + "reward": 1.7053572162985802, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.718750037252903, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11634 + }, + { + "completion_length": 250.64287185668945, + "epoch": 1.9509199882643866, + "grad_norm": 0.26712866657750123, + "kl": 0.165252685546875, + "learning_rate": 4.958709338709889e-07, + "loss": 0.0002, + "reward": 1.730357214808464, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7348214685916901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11636 + }, + { + "completion_length": 261.1651916503906, + "epoch": 1.9512552915042543, + "grad_norm": 0.19153741964710466, + "kl": 1.4565277099609375, + "learning_rate": 4.958687623977117e-07, + "loss": 0.0015, + "reward": 1.767857201397419, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11638 + }, + { + "completion_length": 258.1116199493408, + "epoch": 1.9515905947441217, + "grad_norm": 0.1725781835403997, + "kl": 0.469635009765625, + "learning_rate": 4.958665903583533e-07, + "loss": 0.0005, + "reward": 1.8053571954369545, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214603960514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11640 + }, + { + "completion_length": 266.96876335144043, + "epoch": 1.9519258979839893, + "grad_norm": 0.3686855380317879, + "kl": 0.53802490234375, + "learning_rate": 4.958644177529189e-07, + "loss": 0.0005, + "reward": 1.7535715028643608, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11642 + }, + { + "completion_length": 244.67858123779297, + "epoch": 1.9522612012238567, + "grad_norm": 0.08202584755809787, + "kl": 0.1743621826171875, + "learning_rate": 4.958622445814133e-07, + "loss": 0.0002, + "reward": 1.8017857670783997, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11644 + }, + { + "completion_length": 253.4553680419922, + "epoch": 1.9525965044637243, + "grad_norm": 0.28201809123002675, + "kl": 0.7519073486328125, + "learning_rate": 4.958600708438418e-07, + "loss": 0.0008, + "reward": 1.7910714894533157, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.8044643066823483, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11646 + }, + { + "completion_length": 258.1875104904175, + "epoch": 1.952931807703592, + "grad_norm": 0.45200193199739935, + "kl": 1.96221923828125, + "learning_rate": 4.958578965402093e-07, + "loss": 0.002, + "reward": 1.742857187986374, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 0.9821428619325161, + "step": 11648 + }, + { + "completion_length": 256.8973321914673, + "epoch": 1.9532671109434596, + "grad_norm": 0.22485174876966724, + "kl": 1.655181884765625, + "learning_rate": 4.958557216705207e-07, + "loss": 0.0017, + "reward": 1.7232143580913544, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7366071715950966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11650 + }, + { + "completion_length": 252.80358409881592, + "epoch": 1.953602414183327, + "grad_norm": 0.3054215109505365, + "kl": 0.42816162109375, + "learning_rate": 4.95853546234781e-07, + "loss": 0.0004, + "reward": 1.7571429014205933, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7660714574158192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11652 + }, + { + "completion_length": 279.90180110931396, + "epoch": 1.9539377174231944, + "grad_norm": 0.13204873486477667, + "kl": 0.4475860595703125, + "learning_rate": 4.958513702329953e-07, + "loss": 0.0004, + "reward": 1.7357143312692642, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7535714469850063, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11654 + }, + { + "completion_length": 260.99555015563965, + "epoch": 1.954273020663062, + "grad_norm": 0.2930258195465054, + "kl": 0.1341094970703125, + "learning_rate": 4.958491936651687e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.09091372694820166, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11656 + }, + { + "completion_length": 250.63840293884277, + "epoch": 1.9546083239029297, + "grad_norm": 0.1968308325811053, + "kl": 0.443206787109375, + "learning_rate": 4.95847016531306e-07, + "loss": 0.0004, + "reward": 1.7857143506407738, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7946428842842579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11658 + }, + { + "completion_length": 254.2053689956665, + "epoch": 1.9549436271427973, + "grad_norm": 0.21657157782427686, + "kl": 0.255615234375, + "learning_rate": 4.958448388314124e-07, + "loss": 0.0003, + "reward": 1.725000075995922, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 0.9821428656578064, + "step": 11660 + }, + { + "completion_length": 259.3526906967163, + "epoch": 1.955278930382665, + "grad_norm": 0.1188084083595016, + "kl": 0.2137908935546875, + "learning_rate": 4.958426605654928e-07, + "loss": 0.0002, + "reward": 1.8000000342726707, + "reward_std": 0.05050762835890055, + "rewards/equation_reward_func": 0.808928607031703, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11662 + }, + { + "completion_length": 250.0803680419922, + "epoch": 1.9556142336225324, + "grad_norm": 0.45858462645071885, + "kl": 0.361175537109375, + "learning_rate": 4.958404817335522e-07, + "loss": 0.0004, + "reward": 1.7071429342031479, + "reward_std": 0.0909137288108468, + "rewards/equation_reward_func": 0.7339286170899868, + "rewards/format_reward_func": 0.9732142984867096, + "step": 11664 + }, + { + "completion_length": 238.53126049041748, + "epoch": 1.9559495368623998, + "grad_norm": 0.16654047526987906, + "kl": 0.10797119140625, + "learning_rate": 4.958383023355957e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.768750037997961, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11666 + }, + { + "completion_length": 247.0223331451416, + "epoch": 1.9562848401022674, + "grad_norm": 0.21473295137029416, + "kl": 0.17388916015625, + "learning_rate": 4.958361223716282e-07, + "loss": 0.0002, + "reward": 1.728571504354477, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714615136385, + "rewards/format_reward_func": 1.0, + "step": 11668 + }, + { + "completion_length": 236.5491180419922, + "epoch": 1.956620143342135, + "grad_norm": 0.1944749028128767, + "kl": 0.14154052734375, + "learning_rate": 4.95833941841655e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11670 + }, + { + "completion_length": 237.00893878936768, + "epoch": 1.9569554465820027, + "grad_norm": 0.07876787095959105, + "kl": 0.208038330078125, + "learning_rate": 4.958317607456807e-07, + "loss": 0.0002, + "reward": 1.7517857775092125, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11672 + }, + { + "completion_length": 245.40625953674316, + "epoch": 1.9572907498218701, + "grad_norm": 0.11458006446040274, + "kl": 0.266632080078125, + "learning_rate": 4.958295790837106e-07, + "loss": 0.0003, + "reward": 1.719642922282219, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7330357432365417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11674 + }, + { + "completion_length": 238.54018783569336, + "epoch": 1.9576260530617378, + "grad_norm": 0.3717038714285689, + "kl": 0.412078857421875, + "learning_rate": 4.958273968557497e-07, + "loss": 0.0004, + "reward": 1.726785771548748, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.740178607404232, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11676 + }, + { + "completion_length": 231.80358028411865, + "epoch": 1.9579613563016052, + "grad_norm": 0.20329722813535972, + "kl": 0.27691650390625, + "learning_rate": 4.95825214061803e-07, + "loss": 0.0003, + "reward": 1.746428668498993, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 11678 + }, + { + "completion_length": 233.9241189956665, + "epoch": 1.9582966595414728, + "grad_norm": 0.12961171035557645, + "kl": 0.254791259765625, + "learning_rate": 4.958230307018755e-07, + "loss": 0.0003, + "reward": 1.7196429371833801, + "reward_std": 0.0530330091714859, + "rewards/equation_reward_func": 0.7330357432365417, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11680 + }, + { + "completion_length": 231.82143878936768, + "epoch": 1.9586319627813404, + "grad_norm": 0.2286560781940869, + "kl": 0.150177001953125, + "learning_rate": 4.958208467759722e-07, + "loss": 0.0002, + "reward": 1.6982143595814705, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.702678607776761, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11682 + }, + { + "completion_length": 234.32590293884277, + "epoch": 1.958967266021208, + "grad_norm": 0.1759078841403251, + "kl": 0.170379638671875, + "learning_rate": 4.958186622840982e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000204890966, + "rewards/format_reward_func": 1.0, + "step": 11684 + }, + { + "completion_length": 228.33483505249023, + "epoch": 1.9593025692610755, + "grad_norm": 0.25031878839867766, + "kl": 0.27630615234375, + "learning_rate": 4.958164772262584e-07, + "loss": 0.0003, + "reward": 1.7571429312229156, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428846567869, + "rewards/format_reward_func": 1.0, + "step": 11686 + }, + { + "completion_length": 221.42411613464355, + "epoch": 1.959637872500943, + "grad_norm": 0.2143225747696588, + "kl": 0.216766357421875, + "learning_rate": 4.95814291602458e-07, + "loss": 0.0002, + "reward": 1.832142911851406, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321428894996643, + "rewards/format_reward_func": 1.0, + "step": 11688 + }, + { + "completion_length": 223.12500953674316, + "epoch": 1.9599731757408105, + "grad_norm": 0.11656490838104483, + "kl": 0.2996826171875, + "learning_rate": 4.95812105412702e-07, + "loss": 0.0003, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 11690 + }, + { + "completion_length": 222.45090579986572, + "epoch": 1.9603084789806782, + "grad_norm": 0.20553337543261221, + "kl": 0.117279052734375, + "learning_rate": 4.958099186569953e-07, + "loss": 0.0001, + "reward": 1.7303571850061417, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.734821455553174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11692 + }, + { + "completion_length": 223.79465293884277, + "epoch": 1.9606437822205458, + "grad_norm": 0.09753404057868746, + "kl": 0.360626220703125, + "learning_rate": 4.958077313353432e-07, + "loss": 0.0004, + "reward": 1.7500000447034836, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 11694 + }, + { + "completion_length": 216.57590293884277, + "epoch": 1.9609790854604132, + "grad_norm": 0.159109331582923, + "kl": 0.179901123046875, + "learning_rate": 4.958055434477504e-07, + "loss": 0.0002, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 11696 + }, + { + "completion_length": 231.446439743042, + "epoch": 1.9613143887002809, + "grad_norm": 0.2844154985682583, + "kl": 0.23492431640625, + "learning_rate": 4.958033549942222e-07, + "loss": 0.0002, + "reward": 1.825000062584877, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8250000141561031, + "rewards/format_reward_func": 1.0, + "step": 11698 + }, + { + "completion_length": 219.102689743042, + "epoch": 1.9616496919401483, + "grad_norm": 0.28877537376523416, + "kl": 0.134918212890625, + "learning_rate": 4.958011659747635e-07, + "loss": 0.0001, + "reward": 1.7142857909202576, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.714285746216774, + "rewards/format_reward_func": 1.0, + "step": 11700 + }, + { + "completion_length": 226.44643783569336, + "epoch": 1.961984995180016, + "grad_norm": 0.42817238326910484, + "kl": 0.4429931640625, + "learning_rate": 4.957989763893793e-07, + "loss": 0.0004, + "reward": 1.7750000581145287, + "reward_std": 0.045456862077116966, + "rewards/equation_reward_func": 0.7750000152736902, + "rewards/format_reward_func": 1.0, + "step": 11702 + }, + { + "completion_length": 220.68304538726807, + "epoch": 1.9623202984198835, + "grad_norm": 0.09978868725868749, + "kl": 0.122467041015625, + "learning_rate": 4.957967862380749e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 11704 + }, + { + "completion_length": 226.65625762939453, + "epoch": 1.9626556016597512, + "grad_norm": 0.11651728537059637, + "kl": 0.1053619384765625, + "learning_rate": 4.95794595520855e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 11706 + }, + { + "completion_length": 224.50893783569336, + "epoch": 1.9629909048996186, + "grad_norm": 0.2761169766878691, + "kl": 0.214324951171875, + "learning_rate": 4.957924042377248e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 11708 + }, + { + "completion_length": 225.88393592834473, + "epoch": 1.963326208139486, + "grad_norm": 0.19632426609063403, + "kl": 0.237152099609375, + "learning_rate": 4.957902123886895e-07, + "loss": 0.0002, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 11710 + }, + { + "completion_length": 213.43304634094238, + "epoch": 1.9636615113793536, + "grad_norm": 0.18476990547780098, + "kl": 0.12530517578125, + "learning_rate": 4.957880199737539e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714622586966, + "rewards/format_reward_func": 1.0, + "step": 11712 + }, + { + "completion_length": 220.67858123779297, + "epoch": 1.9639968146192213, + "grad_norm": 0.20954613469586167, + "kl": 0.3189544677734375, + "learning_rate": 4.957858269929232e-07, + "loss": 0.0003, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 11714 + }, + { + "completion_length": 230.14733123779297, + "epoch": 1.964332117859089, + "grad_norm": 0.18032852907919253, + "kl": 0.217559814453125, + "learning_rate": 4.957836334462024e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000391155481, + "rewards/format_reward_func": 1.0, + "step": 11716 + }, + { + "completion_length": 247.68304920196533, + "epoch": 1.9646674210989565, + "grad_norm": 0.2596767575265767, + "kl": 0.3243408203125, + "learning_rate": 4.957814393335964e-07, + "loss": 0.0003, + "reward": 1.7017857879400253, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.706250037997961, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11718 + }, + { + "completion_length": 223.7321538925171, + "epoch": 1.965002724338824, + "grad_norm": 0.17295896277556655, + "kl": 0.103607177734375, + "learning_rate": 4.957792446551105e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11720 + }, + { + "completion_length": 244.13393878936768, + "epoch": 1.9653380275786914, + "grad_norm": 0.12359927873014928, + "kl": 0.1367950439453125, + "learning_rate": 4.957770494107496e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 11722 + }, + { + "completion_length": 238.42858409881592, + "epoch": 1.965673330818559, + "grad_norm": 0.2447817104012549, + "kl": 0.10650634765625, + "learning_rate": 4.957748536005189e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11724 + }, + { + "completion_length": 244.68304824829102, + "epoch": 1.9660086340584266, + "grad_norm": 0.25763781640983036, + "kl": 0.107635498046875, + "learning_rate": 4.957726572244233e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643234461546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11726 + }, + { + "completion_length": 256.0982275009155, + "epoch": 1.9663439372982943, + "grad_norm": 0.17076517137928504, + "kl": 0.104156494140625, + "learning_rate": 4.95770460282468e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11728 + }, + { + "completion_length": 250.0982255935669, + "epoch": 1.9666792405381617, + "grad_norm": 0.28394623429905763, + "kl": 0.131378173828125, + "learning_rate": 4.957682627746578e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7589286044239998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11730 + }, + { + "completion_length": 245.6473331451416, + "epoch": 1.967014543778029, + "grad_norm": 0.25643459126742696, + "kl": 0.1122894287109375, + "learning_rate": 4.957660647009981e-07, + "loss": 0.0001, + "reward": 1.7875000685453415, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11732 + }, + { + "completion_length": 244.6428680419922, + "epoch": 1.9673498470178967, + "grad_norm": 0.18632279817106337, + "kl": 0.1124420166015625, + "learning_rate": 4.957638660614938e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 11734 + }, + { + "completion_length": 256.9151916503906, + "epoch": 1.9676851502577644, + "grad_norm": 0.22765448085255816, + "kl": 0.1229400634765625, + "learning_rate": 4.957616668561498e-07, + "loss": 0.0001, + "reward": 1.7446429282426834, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7491071857511997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11736 + }, + { + "completion_length": 246.9017972946167, + "epoch": 1.968020453497632, + "grad_norm": 0.6592077745590738, + "kl": 0.13031005859375, + "learning_rate": 4.957594670849715e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.8196428827941418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11738 + }, + { + "completion_length": 255.5178689956665, + "epoch": 1.9683557567374996, + "grad_norm": 0.26862816593064337, + "kl": 0.117919921875, + "learning_rate": 4.957572667479637e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7348214536905289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11740 + }, + { + "completion_length": 246.37947368621826, + "epoch": 1.968691059977367, + "grad_norm": 0.2317197207042673, + "kl": 0.115509033203125, + "learning_rate": 4.957550658451315e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 11742 + }, + { + "completion_length": 257.39286708831787, + "epoch": 1.9690263632172345, + "grad_norm": 0.1188256513109012, + "kl": 0.1306610107421875, + "learning_rate": 4.957528643764801e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 11744 + }, + { + "completion_length": 254.32144260406494, + "epoch": 1.969361666457102, + "grad_norm": 0.13580848106302223, + "kl": 0.150390625, + "learning_rate": 4.957506623420145e-07, + "loss": 0.0002, + "reward": 1.7446429133415222, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7580357491970062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11746 + }, + { + "completion_length": 249.18304824829102, + "epoch": 1.9696969696969697, + "grad_norm": 0.17056617952178113, + "kl": 0.143218994140625, + "learning_rate": 4.957484597417398e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7866071611642838, + "rewards/format_reward_func": 0.9866071455180645, + "step": 11748 + }, + { + "completion_length": 257.696439743042, + "epoch": 1.9700322729368374, + "grad_norm": 0.05453721168213884, + "kl": 0.112518310546875, + "learning_rate": 4.957462565756609e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11750 + }, + { + "completion_length": 258.0759057998657, + "epoch": 1.9703675761767048, + "grad_norm": 0.1605862489015661, + "kl": 0.10565185546875, + "learning_rate": 4.95744052843783e-07, + "loss": 0.0001, + "reward": 1.798214353621006, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026786111295223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11752 + }, + { + "completion_length": 254.1607265472412, + "epoch": 1.9707028794165724, + "grad_norm": 0.19646260246464867, + "kl": 0.1075439453125, + "learning_rate": 4.957418485461112e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 11754 + }, + { + "completion_length": 252.89733505249023, + "epoch": 1.9710381826564398, + "grad_norm": 0.3499983244450021, + "kl": 0.2943115234375, + "learning_rate": 4.957396436826506e-07, + "loss": 0.0003, + "reward": 1.8098214641213417, + "reward_std": 0.056821079924702644, + "rewards/equation_reward_func": 0.816071443259716, + "rewards/format_reward_func": 0.9937500059604645, + "step": 11756 + }, + { + "completion_length": 257.3348340988159, + "epoch": 1.9713734858963075, + "grad_norm": 0.21213137164595536, + "kl": 0.149383544921875, + "learning_rate": 4.957374382534063e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786085218191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11758 + }, + { + "completion_length": 254.6785831451416, + "epoch": 1.971708789136175, + "grad_norm": 0.16187445714713122, + "kl": 0.0967864990234375, + "learning_rate": 4.957352322583831e-07, + "loss": 0.0001, + "reward": 1.8214286267757416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 1.0, + "step": 11760 + }, + { + "completion_length": 255.33929538726807, + "epoch": 1.9720440923760427, + "grad_norm": 0.17196609458920475, + "kl": 0.127349853515625, + "learning_rate": 4.957330256975865e-07, + "loss": 0.0001, + "reward": 1.7446429431438446, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.749107176437974, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11762 + }, + { + "completion_length": 248.13840579986572, + "epoch": 1.9723793956159101, + "grad_norm": 0.29006952980499107, + "kl": 0.0870361328125, + "learning_rate": 4.957308185710212e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 11764 + }, + { + "completion_length": 249.27679634094238, + "epoch": 1.9727146988557775, + "grad_norm": 0.10785375158854305, + "kl": 0.157928466796875, + "learning_rate": 4.957286108786925e-07, + "loss": 0.0002, + "reward": 1.8107143342494965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 11766 + }, + { + "completion_length": 247.71429920196533, + "epoch": 1.9730500020956452, + "grad_norm": 0.20807208197803714, + "kl": 0.229705810546875, + "learning_rate": 4.957264026206054e-07, + "loss": 0.0002, + "reward": 1.8000000789761543, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 11768 + }, + { + "completion_length": 246.93304920196533, + "epoch": 1.9733853053355128, + "grad_norm": 0.17862243170744227, + "kl": 0.180389404296875, + "learning_rate": 4.957241937967651e-07, + "loss": 0.0002, + "reward": 1.8089286237955093, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8133928701281548, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11770 + }, + { + "completion_length": 252.46876049041748, + "epoch": 1.9737206085753805, + "grad_norm": 0.22827912919286747, + "kl": 0.3739166259765625, + "learning_rate": 4.957219844071765e-07, + "loss": 0.0004, + "reward": 1.7250000908970833, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 11772 + }, + { + "completion_length": 236.30358409881592, + "epoch": 1.9740559118152479, + "grad_norm": 0.18547557891243543, + "kl": 0.1163330078125, + "learning_rate": 4.957197744518449e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 11774 + }, + { + "completion_length": 251.08483409881592, + "epoch": 1.9743912150551155, + "grad_norm": 0.5494083041560429, + "kl": 0.337066650390625, + "learning_rate": 4.957175639307751e-07, + "loss": 0.0003, + "reward": 1.7678572237491608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 11776 + }, + { + "completion_length": 252.4866189956665, + "epoch": 1.974726518294983, + "grad_norm": 0.1988050122697657, + "kl": 0.1005859375, + "learning_rate": 4.957153528439725e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 11778 + }, + { + "completion_length": 250.71429634094238, + "epoch": 1.9750618215348505, + "grad_norm": 0.17271351151488437, + "kl": 0.090301513671875, + "learning_rate": 4.95713141191442e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7142857406288385, + "rewards/format_reward_func": 1.0, + "step": 11780 + }, + { + "completion_length": 247.45983123779297, + "epoch": 1.9753971247747182, + "grad_norm": 0.1843118865578671, + "kl": 0.389495849609375, + "learning_rate": 4.957109289731888e-07, + "loss": 0.0004, + "reward": 1.7821429371833801, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 11782 + }, + { + "completion_length": 236.5982255935669, + "epoch": 1.9757324280145858, + "grad_norm": 0.21773081688597665, + "kl": 0.3225250244140625, + "learning_rate": 4.957087161892179e-07, + "loss": 0.0003, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 11784 + }, + { + "completion_length": 241.3973331451416, + "epoch": 1.9760677312544532, + "grad_norm": 0.26868741544036107, + "kl": 0.7315216064453125, + "learning_rate": 4.957065028395344e-07, + "loss": 0.0007, + "reward": 1.8214285969734192, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 11786 + }, + { + "completion_length": 248.35715579986572, + "epoch": 1.9764030344943206, + "grad_norm": 0.20328298548363283, + "kl": 0.1676788330078125, + "learning_rate": 4.957042889241435e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 11788 + }, + { + "completion_length": 243.7544755935669, + "epoch": 1.9767383377341883, + "grad_norm": 0.1907004951600171, + "kl": 0.1032867431640625, + "learning_rate": 4.957020744430502e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 11790 + }, + { + "completion_length": 233.78572368621826, + "epoch": 1.977073640974056, + "grad_norm": 0.2596566984727189, + "kl": 0.09918212890625, + "learning_rate": 4.956998593962596e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 11792 + }, + { + "completion_length": 236.5089406967163, + "epoch": 1.9774089442139235, + "grad_norm": 0.30107402027910873, + "kl": 0.442413330078125, + "learning_rate": 4.956976437837768e-07, + "loss": 0.0004, + "reward": 1.8000000789761543, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 11794 + }, + { + "completion_length": 242.1071538925171, + "epoch": 1.9777442474537912, + "grad_norm": 0.22727364686394377, + "kl": 0.29437255859375, + "learning_rate": 4.95695427605607e-07, + "loss": 0.0003, + "reward": 1.803571492433548, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 11796 + }, + { + "completion_length": 243.19643878936768, + "epoch": 1.9780795506936586, + "grad_norm": 0.25835029992767844, + "kl": 0.1036529541015625, + "learning_rate": 4.956932108617552e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 11798 + }, + { + "completion_length": 243.7991180419922, + "epoch": 1.978414853933526, + "grad_norm": 0.20062119433370976, + "kl": 0.1900177001953125, + "learning_rate": 4.956909935522265e-07, + "loss": 0.0002, + "reward": 1.74642863124609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464285902678967, + "rewards/format_reward_func": 1.0, + "step": 11800 + }, + { + "completion_length": 237.56251049041748, + "epoch": 1.9787501571733936, + "grad_norm": 0.19331615368744728, + "kl": 0.183807373046875, + "learning_rate": 4.95688775677026e-07, + "loss": 0.0002, + "reward": 1.7892857939004898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 11802 + }, + { + "completion_length": 243.19643878936768, + "epoch": 1.9790854604132613, + "grad_norm": 0.26249176823337705, + "kl": 0.14044189453125, + "learning_rate": 4.956865572361589e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7000000309199095, + "rewards/format_reward_func": 1.0, + "step": 11804 + }, + { + "completion_length": 231.95090198516846, + "epoch": 1.979420763653129, + "grad_norm": 0.11336370989884989, + "kl": 0.0840301513671875, + "learning_rate": 4.956843382296303e-07, + "loss": 0.0001, + "reward": 1.82857146859169, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8285714462399483, + "rewards/format_reward_func": 1.0, + "step": 11806 + }, + { + "completion_length": 234.89733123779297, + "epoch": 1.9797560668929963, + "grad_norm": 0.16502533057972615, + "kl": 0.1882171630859375, + "learning_rate": 4.956821186574453e-07, + "loss": 0.0002, + "reward": 1.8000000417232513, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 11808 + }, + { + "completion_length": 250.22768783569336, + "epoch": 1.980091370132864, + "grad_norm": 0.2793167309232402, + "kl": 0.33477783203125, + "learning_rate": 4.956798985196089e-07, + "loss": 0.0003, + "reward": 1.7535714954137802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 11810 + }, + { + "completion_length": 219.44643878936768, + "epoch": 1.9804266733727314, + "grad_norm": 0.1115267623950336, + "kl": 0.0941162109375, + "learning_rate": 4.956776778161262e-07, + "loss": 0.0001, + "reward": 1.8107143193483353, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 11812 + }, + { + "completion_length": 239.5982255935669, + "epoch": 1.980761976612599, + "grad_norm": 0.23300894138513203, + "kl": 0.304840087890625, + "learning_rate": 4.956754565470025e-07, + "loss": 0.0003, + "reward": 1.762946493923664, + "reward_std": 0.03219861118122935, + "rewards/equation_reward_func": 0.7642857488244772, + "rewards/format_reward_func": 0.9986607171595097, + "step": 11814 + }, + { + "completion_length": 246.20983505249023, + "epoch": 1.9810972798524666, + "grad_norm": 0.24004711572000476, + "kl": 0.123046875, + "learning_rate": 4.956732347122428e-07, + "loss": 0.0001, + "reward": 1.8107143566012383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 11816 + }, + { + "completion_length": 243.9241180419922, + "epoch": 1.9814325830923343, + "grad_norm": 0.26924112990635035, + "kl": 0.195953369140625, + "learning_rate": 4.956710123118522e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 11818 + }, + { + "completion_length": 235.758939743042, + "epoch": 1.9817678863322017, + "grad_norm": 0.19173483626836108, + "kl": 0.0982208251953125, + "learning_rate": 4.956687893458359e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464286107569933, + "rewards/format_reward_func": 1.0, + "step": 11820 + }, + { + "completion_length": 238.8259038925171, + "epoch": 1.982103189572069, + "grad_norm": 0.2064395106648162, + "kl": 0.138946533203125, + "learning_rate": 4.956665658141989e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7000000290572643, + "rewards/format_reward_func": 1.0, + "step": 11822 + }, + { + "completion_length": 241.92858123779297, + "epoch": 1.9824384928119367, + "grad_norm": 0.21597189485682244, + "kl": 0.0922088623046875, + "learning_rate": 4.956643417169464e-07, + "loss": 0.0001, + "reward": 1.750000037252903, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7589286100119352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11824 + }, + { + "completion_length": 241.89733505249023, + "epoch": 1.9827737960518044, + "grad_norm": 0.1910625884590803, + "kl": 0.1044464111328125, + "learning_rate": 4.956621170540834e-07, + "loss": 0.0001, + "reward": 1.7803572118282318, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214469850063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11826 + }, + { + "completion_length": 231.9866180419922, + "epoch": 1.983109099291672, + "grad_norm": 0.13525587988326038, + "kl": 0.090179443359375, + "learning_rate": 4.956598918256151e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 11828 + }, + { + "completion_length": 238.78572273254395, + "epoch": 1.9834444025315394, + "grad_norm": 0.21546699753134915, + "kl": 0.71807861328125, + "learning_rate": 4.956576660315468e-07, + "loss": 0.0007, + "reward": 1.8178571909666061, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8178571797907352, + "rewards/format_reward_func": 1.0, + "step": 11830 + }, + { + "completion_length": 235.4910831451416, + "epoch": 1.983779705771407, + "grad_norm": 0.14166303451460713, + "kl": 0.173187255859375, + "learning_rate": 4.956554396718835e-07, + "loss": 0.0002, + "reward": 1.800000049173832, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 11832 + }, + { + "completion_length": 246.1339406967163, + "epoch": 1.9841150090112745, + "grad_norm": 0.24250097169224633, + "kl": 0.094482421875, + "learning_rate": 4.956532127466302e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11834 + }, + { + "completion_length": 240.32590293884277, + "epoch": 1.984450312251142, + "grad_norm": 0.23198980180167816, + "kl": 0.0967254638671875, + "learning_rate": 4.956509852557921e-07, + "loss": 0.0001, + "reward": 1.8178571686148643, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 11836 + }, + { + "completion_length": 247.0044755935669, + "epoch": 1.9847856154910097, + "grad_norm": 0.2247671504249133, + "kl": 0.10345458984375, + "learning_rate": 4.956487571993744e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 11838 + }, + { + "completion_length": 235.56697177886963, + "epoch": 1.9851209187308774, + "grad_norm": 0.22602112500128277, + "kl": 0.2289276123046875, + "learning_rate": 4.956465285773822e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 11840 + }, + { + "completion_length": 238.22322368621826, + "epoch": 1.9854562219707448, + "grad_norm": 0.2934779266919726, + "kl": 0.118408203125, + "learning_rate": 4.956442993898206e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 11842 + }, + { + "completion_length": 241.77679920196533, + "epoch": 1.9857915252106122, + "grad_norm": 0.25116354054121726, + "kl": 0.3289794921875, + "learning_rate": 4.956420696366947e-07, + "loss": 0.0003, + "reward": 1.8250000476837158, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8250000327825546, + "rewards/format_reward_func": 1.0, + "step": 11844 + }, + { + "completion_length": 235.3169765472412, + "epoch": 1.9861268284504798, + "grad_norm": 0.13130801114902244, + "kl": 0.2256011962890625, + "learning_rate": 4.956398393180097e-07, + "loss": 0.0002, + "reward": 1.807142898440361, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 11846 + }, + { + "completion_length": 248.06250953674316, + "epoch": 1.9864621316903475, + "grad_norm": 0.22475071490244067, + "kl": 0.15399169921875, + "learning_rate": 4.956376084337707e-07, + "loss": 0.0002, + "reward": 1.8035714775323868, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 11848 + }, + { + "completion_length": 230.65625858306885, + "epoch": 1.986797434930215, + "grad_norm": 0.21280194742861228, + "kl": 0.1126708984375, + "learning_rate": 4.956353769839829e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 11850 + }, + { + "completion_length": 245.8348331451416, + "epoch": 1.9871327381700825, + "grad_norm": 0.2820430557228894, + "kl": 0.154571533203125, + "learning_rate": 4.956331449686513e-07, + "loss": 0.0002, + "reward": 1.7035715207457542, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7035714648663998, + "rewards/format_reward_func": 1.0, + "step": 11852 + }, + { + "completion_length": 240.2009038925171, + "epoch": 1.9874680414099501, + "grad_norm": 0.395312790500384, + "kl": 0.4749603271484375, + "learning_rate": 4.956309123877812e-07, + "loss": 0.0005, + "reward": 1.725000038743019, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7339286096394062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11854 + }, + { + "completion_length": 239.75447368621826, + "epoch": 1.9878033446498176, + "grad_norm": 0.13378942220678655, + "kl": 0.110565185546875, + "learning_rate": 4.956286792413776e-07, + "loss": 0.0001, + "reward": 1.8142857626080513, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 11856 + }, + { + "completion_length": 231.61608123779297, + "epoch": 1.9881386478896852, + "grad_norm": 0.18475450589268128, + "kl": 0.114471435546875, + "learning_rate": 4.956264455294459e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 11858 + }, + { + "completion_length": 238.84822463989258, + "epoch": 1.9884739511295528, + "grad_norm": 0.1555984953874135, + "kl": 0.22589111328125, + "learning_rate": 4.956242112519908e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 11860 + }, + { + "completion_length": 232.9776906967163, + "epoch": 1.9888092543694205, + "grad_norm": 0.3377725470032938, + "kl": 0.1129150390625, + "learning_rate": 4.956219764090178e-07, + "loss": 0.0001, + "reward": 1.7875000685453415, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7919643260538578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11862 + }, + { + "completion_length": 235.4509048461914, + "epoch": 1.9891445576092879, + "grad_norm": 0.2210363508356654, + "kl": 0.12188720703125, + "learning_rate": 4.956197410005319e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 11864 + }, + { + "completion_length": 231.74108219146729, + "epoch": 1.9894798608491553, + "grad_norm": 0.1475095473252601, + "kl": 0.1528472900390625, + "learning_rate": 4.956175050265384e-07, + "loss": 0.0002, + "reward": 1.8267857804894447, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8312500193715096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11866 + }, + { + "completion_length": 245.79018878936768, + "epoch": 1.989815164089023, + "grad_norm": 0.15650601562466082, + "kl": 0.136932373046875, + "learning_rate": 4.956152684870422e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286059141159, + "rewards/format_reward_func": 1.0, + "step": 11868 + }, + { + "completion_length": 246.821439743042, + "epoch": 1.9901504673288906, + "grad_norm": 0.1704402931331183, + "kl": 0.185546875, + "learning_rate": 4.956130313820487e-07, + "loss": 0.0002, + "reward": 1.7250000983476639, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7339286003261805, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11870 + }, + { + "completion_length": 253.30358505249023, + "epoch": 1.9904857705687582, + "grad_norm": 0.38526863006362766, + "kl": 0.310333251953125, + "learning_rate": 4.956107937115629e-07, + "loss": 0.0003, + "reward": 1.7892857640981674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 11872 + }, + { + "completion_length": 247.47322368621826, + "epoch": 1.9908210738086258, + "grad_norm": 0.19867008633050648, + "kl": 0.1563720703125, + "learning_rate": 4.9560855547559e-07, + "loss": 0.0002, + "reward": 1.7232143506407738, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11874 + }, + { + "completion_length": 236.4866189956665, + "epoch": 1.9911563770484932, + "grad_norm": 0.1428311178265055, + "kl": 0.11065673828125, + "learning_rate": 4.956063166741351e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 11876 + }, + { + "completion_length": 244.43751049041748, + "epoch": 1.9914916802883607, + "grad_norm": 0.2428530544891383, + "kl": 0.1180877685546875, + "learning_rate": 4.956040773072034e-07, + "loss": 0.0001, + "reward": 1.7696428894996643, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11878 + }, + { + "completion_length": 237.57590293884277, + "epoch": 1.9918269835282283, + "grad_norm": 1.1212923774941903, + "kl": 0.1295166015625, + "learning_rate": 4.956018373748001e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.06060915347188711, + "rewards/equation_reward_func": 0.7732143066823483, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11880 + }, + { + "completion_length": 230.1071538925171, + "epoch": 1.992162286768096, + "grad_norm": 0.1906099981958323, + "kl": 0.11297607421875, + "learning_rate": 4.955995968769302e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 11882 + }, + { + "completion_length": 234.17858219146729, + "epoch": 1.9924975900079636, + "grad_norm": 0.30340923600114456, + "kl": 0.114898681640625, + "learning_rate": 4.955973558135991e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.8089286014437675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11884 + }, + { + "completion_length": 241.93751049041748, + "epoch": 1.992832893247831, + "grad_norm": 0.24126169281411733, + "kl": 0.104156494140625, + "learning_rate": 4.955951141848117e-07, + "loss": 0.0001, + "reward": 1.7982143312692642, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8026785962283611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11886 + }, + { + "completion_length": 242.1875114440918, + "epoch": 1.9931681964876986, + "grad_norm": 0.13715380355658038, + "kl": 0.1492919921875, + "learning_rate": 4.955928719905734e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 11888 + }, + { + "completion_length": 242.13393878936768, + "epoch": 1.993503499727566, + "grad_norm": 0.27291793247372775, + "kl": 0.1129150390625, + "learning_rate": 4.955906292308892e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7392857261002064, + "rewards/format_reward_func": 1.0, + "step": 11890 + }, + { + "completion_length": 227.04465293884277, + "epoch": 1.9938388029674337, + "grad_norm": 0.23860127461571143, + "kl": 0.0947265625, + "learning_rate": 4.955883859057643e-07, + "loss": 0.0001, + "reward": 1.8285714611411095, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714630037546, + "rewards/format_reward_func": 1.0, + "step": 11892 + }, + { + "completion_length": 234.54019165039062, + "epoch": 1.9941741062073013, + "grad_norm": 0.3011874190088839, + "kl": 0.105133056640625, + "learning_rate": 4.955861420152039e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 11894 + }, + { + "completion_length": 233.602689743042, + "epoch": 1.994509409447169, + "grad_norm": 0.24990488377435194, + "kl": 0.118316650390625, + "learning_rate": 4.955838975592131e-07, + "loss": 0.0001, + "reward": 1.7250000908970833, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 11896 + }, + { + "completion_length": 226.99554634094238, + "epoch": 1.9948447126870363, + "grad_norm": 0.37434620571171906, + "kl": 0.120025634765625, + "learning_rate": 4.955816525377971e-07, + "loss": 0.0001, + "reward": 1.8125000670552254, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8169643022119999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11898 + }, + { + "completion_length": 228.21876049041748, + "epoch": 1.9951800159269037, + "grad_norm": 0.1891353284554539, + "kl": 0.113739013671875, + "learning_rate": 4.955794069509611e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 11900 + }, + { + "completion_length": 224.43304538726807, + "epoch": 1.9955153191667714, + "grad_norm": 0.6070216232285166, + "kl": 0.18603515625, + "learning_rate": 4.955771607987104e-07, + "loss": 0.0002, + "reward": 1.7303571924567223, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214685916901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11902 + }, + { + "completion_length": 231.16072368621826, + "epoch": 1.995850622406639, + "grad_norm": 0.2760599269226852, + "kl": 0.164215087890625, + "learning_rate": 4.955749140810499e-07, + "loss": 0.0002, + "reward": 1.7589286342263222, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928768336773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11904 + }, + { + "completion_length": 217.94197463989258, + "epoch": 1.9961859256465067, + "grad_norm": 0.2162571700265734, + "kl": 0.114288330078125, + "learning_rate": 4.955726667979848e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 11906 + }, + { + "completion_length": 233.70536518096924, + "epoch": 1.996521228886374, + "grad_norm": 0.17218754513613752, + "kl": 0.125152587890625, + "learning_rate": 4.955704189495205e-07, + "loss": 0.0001, + "reward": 1.7803572118282318, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214507102966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11908 + }, + { + "completion_length": 229.40179634094238, + "epoch": 1.9968565321262417, + "grad_norm": 0.3804112945798657, + "kl": 0.1534423828125, + "learning_rate": 4.955681705356621e-07, + "loss": 0.0002, + "reward": 1.7053572237491608, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7098214775323868, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11910 + }, + { + "completion_length": 230.25000953674316, + "epoch": 1.9971918353661091, + "grad_norm": 0.9168094662360421, + "kl": 0.241485595703125, + "learning_rate": 4.955659215564145e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 11912 + }, + { + "completion_length": 224.8616189956665, + "epoch": 1.9975271386059767, + "grad_norm": 0.16848882583586025, + "kl": 0.13287353515625, + "learning_rate": 4.955636720117833e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 11914 + }, + { + "completion_length": 230.39732933044434, + "epoch": 1.9978624418458444, + "grad_norm": 0.558286576121209, + "kl": 0.13238525390625, + "learning_rate": 4.955614219017734e-07, + "loss": 0.0001, + "reward": 1.7732143625617027, + "reward_std": 0.08838834892958403, + "rewards/equation_reward_func": 0.7866071723401546, + "rewards/format_reward_func": 0.9866071492433548, + "step": 11916 + }, + { + "completion_length": 218.53572463989258, + "epoch": 1.998197745085712, + "grad_norm": 0.12048965951566463, + "kl": 0.1212921142578125, + "learning_rate": 4.9555917122639e-07, + "loss": 0.0001, + "reward": 1.798214316368103, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11918 + }, + { + "completion_length": 229.7723331451416, + "epoch": 1.9985330483255794, + "grad_norm": 0.26405640247652984, + "kl": 0.13812255859375, + "learning_rate": 4.955569199856384e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11920 + }, + { + "completion_length": 222.51340293884277, + "epoch": 1.9988683515654468, + "grad_norm": 0.20701527554064678, + "kl": 0.119659423828125, + "learning_rate": 4.955546681795238e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 11922 + }, + { + "completion_length": 233.08929634094238, + "epoch": 1.9992036548053145, + "grad_norm": 0.2992797604063413, + "kl": 0.166656494140625, + "learning_rate": 4.955524158080513e-07, + "loss": 0.0002, + "reward": 1.7267858013510704, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11924 + }, + { + "completion_length": 223.50447273254395, + "epoch": 1.9995389580451821, + "grad_norm": 0.21232811921496075, + "kl": 0.116607666015625, + "learning_rate": 4.955501628712259e-07, + "loss": 0.0001, + "reward": 1.783035770058632, + "reward_std": 0.04419417306780815, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9982142895460129, + "step": 11926 + }, + { + "completion_length": 224.19643783569336, + "epoch": 1.9998742612850497, + "grad_norm": 0.22939465547794297, + "kl": 0.9582061767578125, + "learning_rate": 4.955479093690532e-07, + "loss": 0.001, + "reward": 1.7303572073578835, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7348214648663998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11928 + }, + { + "completion_length": 224.04546356201172, + "epoch": 2.0003353032398676, + "grad_norm": 0.20265008955204794, + "kl": 0.11465731534090909, + "learning_rate": 4.955456553015381e-07, + "loss": 0.0002, + "reward": 1.7909091440114109, + "reward_std": 0.053262587298046456, + "rewards/equation_reward_func": 0.7941558740355752, + "rewards/format_reward_func": 0.9967532483014193, + "step": 11930 + }, + { + "completion_length": 219.13393878936768, + "epoch": 2.0006706064797353, + "grad_norm": 0.25991971602352476, + "kl": 0.1035919189453125, + "learning_rate": 4.955434006686859e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857566475868, + "rewards/format_reward_func": 1.0, + "step": 11932 + }, + { + "completion_length": 217.21429634094238, + "epoch": 2.0010059097196025, + "grad_norm": 0.17602677588111465, + "kl": 0.135345458984375, + "learning_rate": 4.955411454705016e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964285835623741, + "rewards/format_reward_func": 1.0, + "step": 11934 + }, + { + "completion_length": 226.6651906967163, + "epoch": 2.00134121295947, + "grad_norm": 0.23395105985453332, + "kl": 0.1150360107421875, + "learning_rate": 4.955388897069907e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 11936 + }, + { + "completion_length": 221.41518783569336, + "epoch": 2.0016765161993377, + "grad_norm": 0.1894342197370109, + "kl": 0.1356201171875, + "learning_rate": 4.955366333781581e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 11938 + }, + { + "completion_length": 224.43304634094238, + "epoch": 2.0020118194392054, + "grad_norm": 0.19172940809934583, + "kl": 0.13311767578125, + "learning_rate": 4.955343764840093e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143007218838, + "rewards/format_reward_func": 1.0, + "step": 11940 + }, + { + "completion_length": 210.82143878936768, + "epoch": 2.002347122679073, + "grad_norm": 0.19378477540179337, + "kl": 0.1277923583984375, + "learning_rate": 4.955321190245491e-07, + "loss": 0.0001, + "reward": 1.7571429461240768, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.757142897695303, + "rewards/format_reward_func": 1.0, + "step": 11942 + }, + { + "completion_length": 218.50000953674316, + "epoch": 2.0026824259189406, + "grad_norm": 0.20741615096888355, + "kl": 0.123382568359375, + "learning_rate": 4.955298609997831e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 11944 + }, + { + "completion_length": 226.2321538925171, + "epoch": 2.003017729158808, + "grad_norm": 0.15846754537055674, + "kl": 0.10369873046875, + "learning_rate": 4.955276024097163e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 11946 + }, + { + "completion_length": 228.56697463989258, + "epoch": 2.0033530323986755, + "grad_norm": 0.3061465529034817, + "kl": 0.119293212890625, + "learning_rate": 4.955253432543539e-07, + "loss": 0.0001, + "reward": 1.7696429267525673, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071600466967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11948 + }, + { + "completion_length": 217.4821538925171, + "epoch": 2.003688335638543, + "grad_norm": 0.373817116939758, + "kl": 0.13421630859375, + "learning_rate": 4.955230835337012e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11950 + }, + { + "completion_length": 220.44643878936768, + "epoch": 2.0040236388784107, + "grad_norm": 0.2275983755411765, + "kl": 0.130859375, + "learning_rate": 4.955208232477633e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 11952 + }, + { + "completion_length": 208.45536708831787, + "epoch": 2.0043589421182784, + "grad_norm": 0.1904971770533389, + "kl": 0.129486083984375, + "learning_rate": 4.955185623965454e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 11954 + }, + { + "completion_length": 225.9821538925171, + "epoch": 2.0046942453581456, + "grad_norm": 0.17717864307604564, + "kl": 0.13543701171875, + "learning_rate": 4.955163009800527e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7410714738070965, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11956 + }, + { + "completion_length": 214.18750953674316, + "epoch": 2.005029548598013, + "grad_norm": 0.16732658290309635, + "kl": 0.115814208984375, + "learning_rate": 4.955140389982904e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 11958 + }, + { + "completion_length": 229.22322463989258, + "epoch": 2.005364851837881, + "grad_norm": 0.21928717715473264, + "kl": 0.118377685546875, + "learning_rate": 4.95511776451264e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7946428954601288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11960 + }, + { + "completion_length": 219.00001049041748, + "epoch": 2.0057001550777485, + "grad_norm": 0.11188186472262818, + "kl": 0.129486083984375, + "learning_rate": 4.955095133389783e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 11962 + }, + { + "completion_length": 216.20536613464355, + "epoch": 2.006035458317616, + "grad_norm": 0.287615123732697, + "kl": 0.158966064453125, + "learning_rate": 4.955072496614386e-07, + "loss": 0.0002, + "reward": 1.7839286401867867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928753435612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11964 + }, + { + "completion_length": 221.66965293884277, + "epoch": 2.0063707615574837, + "grad_norm": 0.13775078111321387, + "kl": 0.1495361328125, + "learning_rate": 4.955049854186503e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285798370838, + "rewards/format_reward_func": 1.0, + "step": 11966 + }, + { + "completion_length": 220.4285831451416, + "epoch": 2.006706064797351, + "grad_norm": 0.20963167919580555, + "kl": 0.142425537109375, + "learning_rate": 4.955027206106184e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 11968 + }, + { + "completion_length": 208.98661518096924, + "epoch": 2.0070413680372186, + "grad_norm": 0.048640389814782435, + "kl": 0.104705810546875, + "learning_rate": 4.955004552373483e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 11970 + }, + { + "completion_length": 224.97322463989258, + "epoch": 2.007376671277086, + "grad_norm": 0.41609726327583996, + "kl": 0.146697998046875, + "learning_rate": 4.95498189298845e-07, + "loss": 0.0001, + "reward": 1.7232143878936768, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7276786044239998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11972 + }, + { + "completion_length": 220.17858123779297, + "epoch": 2.007711974516954, + "grad_norm": 0.2719038569618615, + "kl": 0.10687255859375, + "learning_rate": 4.954959227951139e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 11974 + }, + { + "completion_length": 215.39733123779297, + "epoch": 2.0080472777568215, + "grad_norm": 0.17050510753330186, + "kl": 0.124847412109375, + "learning_rate": 4.954936557261603e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071779280901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11976 + }, + { + "completion_length": 222.39286613464355, + "epoch": 2.008382580996689, + "grad_norm": 0.1061362122372038, + "kl": 0.139892578125, + "learning_rate": 4.954913880919892e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 11978 + }, + { + "completion_length": 219.17858123779297, + "epoch": 2.0087178842365563, + "grad_norm": 0.4607873940896255, + "kl": 0.220184326171875, + "learning_rate": 4.954891198926058e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 11980 + }, + { + "completion_length": 218.12054443359375, + "epoch": 2.009053187476424, + "grad_norm": 0.243853842105626, + "kl": 0.13153076171875, + "learning_rate": 4.954868511280156e-07, + "loss": 0.0001, + "reward": 1.8053572103381157, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8098214529454708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11982 + }, + { + "completion_length": 219.4241180419922, + "epoch": 2.0093884907162916, + "grad_norm": 0.17020714244723772, + "kl": 0.13604736328125, + "learning_rate": 4.954845817982234e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 11984 + }, + { + "completion_length": 213.47322368621826, + "epoch": 2.009723793956159, + "grad_norm": 0.09836840420537803, + "kl": 0.15594482421875, + "learning_rate": 4.95482311903235e-07, + "loss": 0.0002, + "reward": 1.733928643167019, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 11986 + }, + { + "completion_length": 212.58929538726807, + "epoch": 2.010059097196027, + "grad_norm": 0.275191574538384, + "kl": 0.12689208984375, + "learning_rate": 4.954800414430551e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 11988 + }, + { + "completion_length": 232.5357265472412, + "epoch": 2.010394400435894, + "grad_norm": 0.1892092833782436, + "kl": 0.17681884765625, + "learning_rate": 4.954777704176891e-07, + "loss": 0.0002, + "reward": 1.7392857670783997, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857261002064, + "rewards/format_reward_func": 1.0, + "step": 11990 + }, + { + "completion_length": 217.90179538726807, + "epoch": 2.0107297036757616, + "grad_norm": 0.1421242256000931, + "kl": 0.14764404296875, + "learning_rate": 4.954754988271423e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143227010965, + "rewards/format_reward_func": 1.0, + "step": 11992 + }, + { + "completion_length": 220.9910831451416, + "epoch": 2.0110650069156293, + "grad_norm": 0.2826369696564157, + "kl": 0.155670166015625, + "learning_rate": 4.9547322667142e-07, + "loss": 0.0002, + "reward": 1.7919643595814705, + "reward_std": 0.061871842946857214, + "rewards/equation_reward_func": 0.798214316368103, + "rewards/format_reward_func": 0.9937500059604645, + "step": 11994 + }, + { + "completion_length": 226.55358028411865, + "epoch": 2.011400310155497, + "grad_norm": 0.23081500085930848, + "kl": 0.143829345703125, + "learning_rate": 4.954709539505272e-07, + "loss": 0.0001, + "reward": 1.8250000551342964, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.8339286036789417, + "rewards/format_reward_func": 0.9910714328289032, + "step": 11996 + }, + { + "completion_length": 225.42411708831787, + "epoch": 2.0117356133953646, + "grad_norm": 0.15678403010002545, + "kl": 0.1483154296875, + "learning_rate": 4.954686806644692e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286148548126, + "rewards/format_reward_func": 1.0, + "step": 11998 + }, + { + "completion_length": 234.8928689956665, + "epoch": 2.012070916635232, + "grad_norm": 0.41964272204763825, + "kl": 0.121246337890625, + "learning_rate": 4.954664068132514e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.8160714618861675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12000 + }, + { + "completion_length": 230.38840293884277, + "epoch": 2.0124062198750994, + "grad_norm": 0.18041820981293524, + "kl": 0.126129150390625, + "learning_rate": 4.954641323968788e-07, + "loss": 0.0001, + "reward": 1.7553571909666061, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12002 + }, + { + "completion_length": 234.41965579986572, + "epoch": 2.012741523114967, + "grad_norm": 0.2307812936228628, + "kl": 0.127593994140625, + "learning_rate": 4.954618574153569e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107142932713032, + "rewards/format_reward_func": 1.0, + "step": 12004 + }, + { + "completion_length": 247.77233695983887, + "epoch": 2.0130768263548346, + "grad_norm": 0.26071255191800646, + "kl": 0.129608154296875, + "learning_rate": 4.954595818686907e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.7625000216066837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12006 + }, + { + "completion_length": 232.83929634094238, + "epoch": 2.0134121295947023, + "grad_norm": 0.20923252643628967, + "kl": 0.13958740234375, + "learning_rate": 4.954573057568856e-07, + "loss": 0.0001, + "reward": 1.69821435213089, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7026786059141159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12008 + }, + { + "completion_length": 237.15179634094238, + "epoch": 2.01374743283457, + "grad_norm": 0.25729457637255143, + "kl": 0.141448974609375, + "learning_rate": 4.954550290799468e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 12010 + }, + { + "completion_length": 234.2946538925171, + "epoch": 2.014082736074437, + "grad_norm": 0.27225565276967606, + "kl": 0.14306640625, + "learning_rate": 4.954527518378794e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7410714589059353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12012 + }, + { + "completion_length": 229.37500953674316, + "epoch": 2.0144180393143047, + "grad_norm": 0.14623596893714957, + "kl": 0.12762451171875, + "learning_rate": 4.954504740306887e-07, + "loss": 0.0001, + "reward": 1.8089286386966705, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8133928813040257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12014 + }, + { + "completion_length": 230.6339406967163, + "epoch": 2.0147533425541724, + "grad_norm": 0.2575949222882184, + "kl": 0.12567138671875, + "learning_rate": 4.954481956583802e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 12016 + }, + { + "completion_length": 232.95090293884277, + "epoch": 2.01508864579404, + "grad_norm": 0.2922948645964965, + "kl": 0.154632568359375, + "learning_rate": 4.954459167209588e-07, + "loss": 0.0002, + "reward": 1.80892863124609, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.8133928589522839, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12018 + }, + { + "completion_length": 243.82590198516846, + "epoch": 2.0154239490339076, + "grad_norm": 0.30686562447010646, + "kl": 0.15203857421875, + "learning_rate": 4.9544363721843e-07, + "loss": 0.0002, + "reward": 1.7553571984171867, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12020 + }, + { + "completion_length": 225.49107933044434, + "epoch": 2.0157592522737753, + "grad_norm": 0.20599917438686824, + "kl": 0.16033935546875, + "learning_rate": 4.954413571507988e-07, + "loss": 0.0002, + "reward": 1.7892857640981674, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 12022 + }, + { + "completion_length": 236.26786708831787, + "epoch": 2.0160945555136425, + "grad_norm": 0.24825955537246205, + "kl": 0.14947509765625, + "learning_rate": 4.954390765180707e-07, + "loss": 0.0001, + "reward": 1.7696429267525673, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12024 + }, + { + "completion_length": 233.32143878936768, + "epoch": 2.01642985875351, + "grad_norm": 0.10900224773752099, + "kl": 0.155120849609375, + "learning_rate": 4.954367953202509e-07, + "loss": 0.0002, + "reward": 1.7375000566244125, + "reward_std": 0.05808377265930176, + "rewards/equation_reward_func": 0.7508928813040257, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12026 + }, + { + "completion_length": 221.6964406967163, + "epoch": 2.0167651619933777, + "grad_norm": 0.2646779873755802, + "kl": 0.149627685546875, + "learning_rate": 4.954345135573445e-07, + "loss": 0.0001, + "reward": 1.812500037252903, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12028 + }, + { + "completion_length": 223.02679538726807, + "epoch": 2.0171004652332454, + "grad_norm": 0.20961948724422066, + "kl": 0.15570068359375, + "learning_rate": 4.954322312293568e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 12030 + }, + { + "completion_length": 233.98661708831787, + "epoch": 2.017435768473113, + "grad_norm": 0.44244999777054755, + "kl": 0.162322998046875, + "learning_rate": 4.954299483362932e-07, + "loss": 0.0002, + "reward": 1.787500061094761, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12032 + }, + { + "completion_length": 226.696439743042, + "epoch": 2.0177710717129806, + "grad_norm": 0.0029737839904434033, + "kl": 0.13922119140625, + "learning_rate": 4.954276648781588e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 12034 + }, + { + "completion_length": 238.12054634094238, + "epoch": 2.018106374952848, + "grad_norm": 0.15109126581438292, + "kl": 0.155914306640625, + "learning_rate": 4.95425380854959e-07, + "loss": 0.0002, + "reward": 1.7232143580913544, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12036 + }, + { + "completion_length": 240.7991180419922, + "epoch": 2.0184416781927155, + "grad_norm": 0.20111466564363314, + "kl": 0.168670654296875, + "learning_rate": 4.954230962666989e-07, + "loss": 0.0002, + "reward": 1.771428644657135, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 12038 + }, + { + "completion_length": 239.30358219146729, + "epoch": 2.018776981432583, + "grad_norm": 0.1693994281368671, + "kl": 0.142608642578125, + "learning_rate": 4.954208111133839e-07, + "loss": 0.0001, + "reward": 1.8196429014205933, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071701049805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12040 + }, + { + "completion_length": 243.7812614440918, + "epoch": 2.0191122846724507, + "grad_norm": 0.3227749388676018, + "kl": 0.171875, + "learning_rate": 4.954185253950191e-07, + "loss": 0.0002, + "reward": 1.7339286729693413, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7383928839117289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12042 + }, + { + "completion_length": 236.8660831451416, + "epoch": 2.0194475879123184, + "grad_norm": 0.17381341547292486, + "kl": 0.1773681640625, + "learning_rate": 4.9541623911161e-07, + "loss": 0.0002, + "reward": 1.796428643167019, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.8053571730852127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12044 + }, + { + "completion_length": 234.32590198516846, + "epoch": 2.0197828911521856, + "grad_norm": 0.21425360276716288, + "kl": 0.141387939453125, + "learning_rate": 4.954139522631617e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7375000193715096, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12046 + }, + { + "completion_length": 239.37500858306885, + "epoch": 2.020118194392053, + "grad_norm": 0.30991828406477046, + "kl": 0.19769287109375, + "learning_rate": 4.954116648496793e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 12048 + }, + { + "completion_length": 242.7366189956665, + "epoch": 2.020453497631921, + "grad_norm": 0.16492187085630522, + "kl": 0.258819580078125, + "learning_rate": 4.954093768711685e-07, + "loss": 0.0003, + "reward": 1.800000049173832, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 12050 + }, + { + "completion_length": 244.13393878936768, + "epoch": 2.0207888008717885, + "grad_norm": 0.36228155609406354, + "kl": 0.3056640625, + "learning_rate": 4.954070883276342e-07, + "loss": 0.0003, + "reward": 1.8142857626080513, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 12052 + }, + { + "completion_length": 239.1741189956665, + "epoch": 2.021124104111656, + "grad_norm": 0.23091231775013618, + "kl": 0.128692626953125, + "learning_rate": 4.954047992190818e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571712225676, + "rewards/format_reward_func": 1.0, + "step": 12054 + }, + { + "completion_length": 232.46876335144043, + "epoch": 2.0214594073515237, + "grad_norm": 0.24644318616572938, + "kl": 0.320404052734375, + "learning_rate": 4.954025095455166e-07, + "loss": 0.0003, + "reward": 1.8142857402563095, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.8232143186032772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12056 + }, + { + "completion_length": 234.58036708831787, + "epoch": 2.021794710591391, + "grad_norm": 0.18374386788386235, + "kl": 0.17694091796875, + "learning_rate": 4.954002193069438e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 12058 + }, + { + "completion_length": 229.8794755935669, + "epoch": 2.0221300138312586, + "grad_norm": 0.11192113605103671, + "kl": 0.22650146484375, + "learning_rate": 4.953979285033687e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 12060 + }, + { + "completion_length": 248.83929538726807, + "epoch": 2.022465317071126, + "grad_norm": 0.20104151765938574, + "kl": 0.20709228515625, + "learning_rate": 4.953956371347966e-07, + "loss": 0.0002, + "reward": 1.7000000700354576, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7089286129921675, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12062 + }, + { + "completion_length": 228.36161708831787, + "epoch": 2.022800620310994, + "grad_norm": 0.27693609985790313, + "kl": 0.66949462890625, + "learning_rate": 4.953933452012327e-07, + "loss": 0.0007, + "reward": 1.7750000730156898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 12064 + }, + { + "completion_length": 225.81251049041748, + "epoch": 2.0231359235508615, + "grad_norm": 0.26251403881160484, + "kl": 0.167205810546875, + "learning_rate": 4.953910527026824e-07, + "loss": 0.0002, + "reward": 1.7517858073115349, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7562500387430191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12066 + }, + { + "completion_length": 228.04018783569336, + "epoch": 2.0234712267907287, + "grad_norm": 0.29266835993005574, + "kl": 0.438812255859375, + "learning_rate": 4.95388759639151e-07, + "loss": 0.0004, + "reward": 1.7803572043776512, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12068 + }, + { + "completion_length": 226.27679443359375, + "epoch": 2.0238065300305963, + "grad_norm": 0.19577650905885083, + "kl": 0.363250732421875, + "learning_rate": 4.953864660106435e-07, + "loss": 0.0004, + "reward": 1.7946429178118706, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7991071566939354, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12070 + }, + { + "completion_length": 230.3526906967163, + "epoch": 2.024141833270464, + "grad_norm": 0.14275428631407802, + "kl": 0.262969970703125, + "learning_rate": 4.953841718171655e-07, + "loss": 0.0003, + "reward": 1.7803572192788124, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214507102966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12072 + }, + { + "completion_length": 225.99108123779297, + "epoch": 2.0244771365103316, + "grad_norm": 0.17367730450703955, + "kl": 0.56787109375, + "learning_rate": 4.953818770587221e-07, + "loss": 0.0006, + "reward": 1.7321429252624512, + "reward_std": 0.08586296439170837, + "rewards/equation_reward_func": 0.7410714514553547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12074 + }, + { + "completion_length": 230.7009038925171, + "epoch": 2.024812439750199, + "grad_norm": 0.25745718941039286, + "kl": 0.167633056640625, + "learning_rate": 4.953795817353187e-07, + "loss": 0.0002, + "reward": 1.7857143357396126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143022119999, + "rewards/format_reward_func": 1.0, + "step": 12076 + }, + { + "completion_length": 230.60715293884277, + "epoch": 2.025147742990067, + "grad_norm": 0.4703462047223365, + "kl": 0.290985107421875, + "learning_rate": 4.953772858469605e-07, + "loss": 0.0003, + "reward": 1.7857143729925156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 12078 + }, + { + "completion_length": 225.26340198516846, + "epoch": 2.025483046229934, + "grad_norm": 0.209617257104548, + "kl": 0.496490478515625, + "learning_rate": 4.953749893936527e-07, + "loss": 0.0005, + "reward": 1.78035718947649, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.784821467474103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12080 + }, + { + "completion_length": 222.3928680419922, + "epoch": 2.0258183494698017, + "grad_norm": 0.2109313296218021, + "kl": 0.129119873046875, + "learning_rate": 4.953726923754008e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 12082 + }, + { + "completion_length": 228.6473331451416, + "epoch": 2.0261536527096693, + "grad_norm": 0.7119368148754645, + "kl": 0.433197021484375, + "learning_rate": 4.9537039479221e-07, + "loss": 0.0004, + "reward": 1.7767857611179352, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500409781933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12084 + }, + { + "completion_length": 228.45983409881592, + "epoch": 2.026488955949537, + "grad_norm": 0.20029418610940258, + "kl": 0.416046142578125, + "learning_rate": 4.953680966440855e-07, + "loss": 0.0004, + "reward": 1.7535715252161026, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 12086 + }, + { + "completion_length": 227.72768783569336, + "epoch": 2.0268242591894046, + "grad_norm": 0.28945798184832394, + "kl": 0.2611083984375, + "learning_rate": 4.953657979310327e-07, + "loss": 0.0003, + "reward": 1.7125000730156898, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7169643230736256, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12088 + }, + { + "completion_length": 217.47768783569336, + "epoch": 2.0271595624292718, + "grad_norm": 0.3697406638610568, + "kl": 0.174652099609375, + "learning_rate": 4.953634986530569e-07, + "loss": 0.0002, + "reward": 1.742857202887535, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571674972773, + "rewards/format_reward_func": 1.0, + "step": 12090 + }, + { + "completion_length": 231.9598331451416, + "epoch": 2.0274948656691394, + "grad_norm": 0.19183056439734558, + "kl": 0.171661376953125, + "learning_rate": 4.953611988101633e-07, + "loss": 0.0002, + "reward": 1.7464286535978317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 12092 + }, + { + "completion_length": 223.97768783569336, + "epoch": 2.027830168909007, + "grad_norm": 0.2675128520542845, + "kl": 0.1929931640625, + "learning_rate": 4.953588984023573e-07, + "loss": 0.0002, + "reward": 1.7857143506407738, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7946428805589676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12094 + }, + { + "completion_length": 225.57143783569336, + "epoch": 2.0281654721488747, + "grad_norm": 0.2959984170493704, + "kl": 0.1334228515625, + "learning_rate": 4.953565974296441e-07, + "loss": 0.0001, + "reward": 1.6714286804199219, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.6714286021888256, + "rewards/format_reward_func": 1.0, + "step": 12096 + }, + { + "completion_length": 216.91072463989258, + "epoch": 2.0285007753887423, + "grad_norm": 0.3055442488342892, + "kl": 0.158233642578125, + "learning_rate": 4.95354295892029e-07, + "loss": 0.0002, + "reward": 1.8035714998841286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 12098 + }, + { + "completion_length": 229.08483409881592, + "epoch": 2.02883607862861, + "grad_norm": 0.4134827691873043, + "kl": 0.18817138671875, + "learning_rate": 4.953519937895174e-07, + "loss": 0.0002, + "reward": 1.721428632736206, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286159723997, + "rewards/format_reward_func": 1.0, + "step": 12100 + }, + { + "completion_length": 218.18304634094238, + "epoch": 2.029171381868477, + "grad_norm": 0.20157766541376793, + "kl": 0.151275634765625, + "learning_rate": 4.953496911221145e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 12102 + }, + { + "completion_length": 231.22768783569336, + "epoch": 2.0295066851083448, + "grad_norm": 0.2464018435854221, + "kl": 0.17156982421875, + "learning_rate": 4.953473878898258e-07, + "loss": 0.0002, + "reward": 1.810714341700077, + "reward_std": 0.0656599123030901, + "rewards/equation_reward_func": 0.8196428790688515, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12104 + }, + { + "completion_length": 225.34375953674316, + "epoch": 2.0298419883482124, + "grad_norm": 0.18348432319917418, + "kl": 0.15228271484375, + "learning_rate": 4.953450840926563e-07, + "loss": 0.0002, + "reward": 1.7446429282426834, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12106 + }, + { + "completion_length": 228.602689743042, + "epoch": 2.03017729158808, + "grad_norm": 0.2064246228600626, + "kl": 0.357513427734375, + "learning_rate": 4.953427797306115e-07, + "loss": 0.0004, + "reward": 1.7803572043776512, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12108 + }, + { + "completion_length": 236.37054634094238, + "epoch": 2.0305125948279477, + "grad_norm": 0.1549182499060302, + "kl": 0.149200439453125, + "learning_rate": 4.953404748036965e-07, + "loss": 0.0001, + "reward": 1.7553571909666061, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214782774448, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12110 + }, + { + "completion_length": 237.93750953674316, + "epoch": 2.0308478980678153, + "grad_norm": 0.1820479868588014, + "kl": 0.184326171875, + "learning_rate": 4.953381693119169e-07, + "loss": 0.0002, + "reward": 1.74821437895298, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.752678606659174, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12112 + }, + { + "completion_length": 236.83929824829102, + "epoch": 2.0311832013076825, + "grad_norm": 0.1271072757567212, + "kl": 0.206817626953125, + "learning_rate": 4.95335863255278e-07, + "loss": 0.0002, + "reward": 1.76071435213089, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7696428783237934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12114 + }, + { + "completion_length": 237.74108409881592, + "epoch": 2.03151850454755, + "grad_norm": 0.30312186630702503, + "kl": 0.359375, + "learning_rate": 4.953335566337847e-07, + "loss": 0.0004, + "reward": 1.726785808801651, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7312500327825546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12116 + }, + { + "completion_length": 237.87500953674316, + "epoch": 2.0318538077874178, + "grad_norm": 0.34231948362766396, + "kl": 0.210174560546875, + "learning_rate": 4.953312494474427e-07, + "loss": 0.0002, + "reward": 1.7683036476373672, + "reward_std": 0.06502856919541955, + "rewards/equation_reward_func": 0.7830357365310192, + "rewards/format_reward_func": 0.9852678664028645, + "step": 12118 + }, + { + "completion_length": 247.51340293884277, + "epoch": 2.0321891110272854, + "grad_norm": 0.24182887804491468, + "kl": 0.211639404296875, + "learning_rate": 4.953289416962573e-07, + "loss": 0.0002, + "reward": 1.7375000789761543, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12120 + }, + { + "completion_length": 236.508939743042, + "epoch": 2.032524414267153, + "grad_norm": 0.18792396805814143, + "kl": 0.15826416015625, + "learning_rate": 4.953266333802336e-07, + "loss": 0.0002, + "reward": 1.7482143342494965, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7616071589291096, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12122 + }, + { + "completion_length": 234.99554634094238, + "epoch": 2.03285971750702, + "grad_norm": 0.23981471522060907, + "kl": 0.146087646484375, + "learning_rate": 4.953243244993771e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928790688515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12124 + }, + { + "completion_length": 230.20090293884277, + "epoch": 2.033195020746888, + "grad_norm": 0.5070715919217202, + "kl": 0.173065185546875, + "learning_rate": 4.953220150536931e-07, + "loss": 0.0002, + "reward": 1.8071429058909416, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.8071428947150707, + "rewards/format_reward_func": 1.0, + "step": 12126 + }, + { + "completion_length": 235.46429538726807, + "epoch": 2.0335303239867555, + "grad_norm": 0.20365360632250812, + "kl": 0.140228271484375, + "learning_rate": 4.953197050431867e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12128 + }, + { + "completion_length": 228.5982255935669, + "epoch": 2.033865627226623, + "grad_norm": 0.3304221388416682, + "kl": 0.15032958984375, + "learning_rate": 4.953173944678635e-07, + "loss": 0.0002, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 12130 + }, + { + "completion_length": 244.96429443359375, + "epoch": 2.0342009304664908, + "grad_norm": 0.48853927043600925, + "kl": 0.200531005859375, + "learning_rate": 4.953150833277286e-07, + "loss": 0.0002, + "reward": 1.7553571909666061, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12132 + }, + { + "completion_length": 233.75000953674316, + "epoch": 2.0345362337063584, + "grad_norm": 0.5231974895770883, + "kl": 0.131744384765625, + "learning_rate": 4.953127716227875e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571939468384, + "rewards/format_reward_func": 1.0, + "step": 12134 + }, + { + "completion_length": 221.73661708831787, + "epoch": 2.0348715369462256, + "grad_norm": 0.156355831696372, + "kl": 0.13409423828125, + "learning_rate": 4.953104593530455e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 12136 + }, + { + "completion_length": 239.66072463989258, + "epoch": 2.035206840186093, + "grad_norm": 0.28143142463041965, + "kl": 0.167388916015625, + "learning_rate": 4.953081465185077e-07, + "loss": 0.0002, + "reward": 1.7321429178118706, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428991854191, + "rewards/format_reward_func": 1.0, + "step": 12138 + }, + { + "completion_length": 217.82590198516846, + "epoch": 2.035542143425961, + "grad_norm": 0.15327124267949874, + "kl": 0.12646484375, + "learning_rate": 4.953058331191797e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12140 + }, + { + "completion_length": 220.13840198516846, + "epoch": 2.0358774466658285, + "grad_norm": 0.243204474048529, + "kl": 0.13507080078125, + "learning_rate": 4.953035191550667e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000413507223, + "rewards/format_reward_func": 1.0, + "step": 12142 + }, + { + "completion_length": 240.28572845458984, + "epoch": 2.036212749905696, + "grad_norm": 0.3016617870258245, + "kl": 0.14996337890625, + "learning_rate": 4.95301204626174e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.800000011920929, + "rewards/format_reward_func": 1.0, + "step": 12144 + }, + { + "completion_length": 238.83036613464355, + "epoch": 2.0365480531455633, + "grad_norm": 0.9705617575803153, + "kl": 0.199249267578125, + "learning_rate": 4.952988895325071e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285958558321, + "rewards/format_reward_func": 1.0, + "step": 12146 + }, + { + "completion_length": 242.16518688201904, + "epoch": 2.036883356385431, + "grad_norm": 0.24141183406603373, + "kl": 0.160888671875, + "learning_rate": 4.952965738740712e-07, + "loss": 0.0002, + "reward": 1.764285758137703, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 12148 + }, + { + "completion_length": 222.35715198516846, + "epoch": 2.0372186596252986, + "grad_norm": 0.11416317860538366, + "kl": 0.1640625, + "learning_rate": 4.952942576508715e-07, + "loss": 0.0002, + "reward": 1.7892857566475868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 12150 + }, + { + "completion_length": 232.70536708831787, + "epoch": 2.037553962865166, + "grad_norm": 0.5464999282439347, + "kl": 0.185272216796875, + "learning_rate": 4.952919408629136e-07, + "loss": 0.0002, + "reward": 1.8321429267525673, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321429006755352, + "rewards/format_reward_func": 1.0, + "step": 12152 + }, + { + "completion_length": 231.72322463989258, + "epoch": 2.037889266105034, + "grad_norm": 0.19676459706372948, + "kl": 0.166259765625, + "learning_rate": 4.952896235102027e-07, + "loss": 0.0002, + "reward": 1.7178572118282318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571820259094, + "rewards/format_reward_func": 1.0, + "step": 12154 + }, + { + "completion_length": 233.97322463989258, + "epoch": 2.0382245693449015, + "grad_norm": 0.3356855438729618, + "kl": 0.20611572265625, + "learning_rate": 4.95287305592744e-07, + "loss": 0.0002, + "reward": 1.8214286267757416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 12156 + }, + { + "completion_length": 238.13393783569336, + "epoch": 2.0385598725847687, + "grad_norm": 0.11742634321299134, + "kl": 0.263336181640625, + "learning_rate": 4.952849871105431e-07, + "loss": 0.0003, + "reward": 1.7575893327593803, + "reward_std": 0.02967323106713593, + "rewards/equation_reward_func": 0.7633928973227739, + "rewards/format_reward_func": 0.9941964335739613, + "step": 12158 + }, + { + "completion_length": 234.12947273254395, + "epoch": 2.0388951758246363, + "grad_norm": 0.13895650350235308, + "kl": 0.15374755859375, + "learning_rate": 4.952826680636051e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 12160 + }, + { + "completion_length": 241.33483219146729, + "epoch": 2.039230479064504, + "grad_norm": 0.2290955370201332, + "kl": 0.221832275390625, + "learning_rate": 4.952803484519357e-07, + "loss": 0.0002, + "reward": 1.8214285895228386, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214286044239998, + "rewards/format_reward_func": 1.0, + "step": 12162 + }, + { + "completion_length": 248.62054824829102, + "epoch": 2.0395657823043716, + "grad_norm": 0.5380827733704721, + "kl": 0.365081787109375, + "learning_rate": 4.952780282755398e-07, + "loss": 0.0004, + "reward": 1.7196429148316383, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071630269289, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12164 + }, + { + "completion_length": 244.63393783569336, + "epoch": 2.039901085544239, + "grad_norm": 0.21779016308612434, + "kl": 0.276580810546875, + "learning_rate": 4.95275707534423e-07, + "loss": 0.0003, + "reward": 1.785714365541935, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.785714328289032, + "rewards/format_reward_func": 1.0, + "step": 12166 + }, + { + "completion_length": 237.87054347991943, + "epoch": 2.0402363887841064, + "grad_norm": 0.12616860556579165, + "kl": 0.215911865234375, + "learning_rate": 4.952733862285905e-07, + "loss": 0.0002, + "reward": 1.7821429297327995, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7910714633762836, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12168 + }, + { + "completion_length": 262.1384057998657, + "epoch": 2.040571692023974, + "grad_norm": 0.19813592000547936, + "kl": 0.26214599609375, + "learning_rate": 4.952710643580478e-07, + "loss": 0.0003, + "reward": 1.7071429193019867, + "reward_std": 0.08081220556050539, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 0.9821428656578064, + "step": 12170 + }, + { + "completion_length": 254.040189743042, + "epoch": 2.0409069952638417, + "grad_norm": 0.16061184685130958, + "kl": 0.239654541015625, + "learning_rate": 4.952687419228001e-07, + "loss": 0.0002, + "reward": 1.749553620815277, + "reward_std": 0.06124049751088023, + "rewards/equation_reward_func": 0.7616071701049805, + "rewards/format_reward_func": 0.9879464358091354, + "step": 12172 + }, + { + "completion_length": 242.59375953674316, + "epoch": 2.0412422985037093, + "grad_norm": 0.2398218040558381, + "kl": 0.1614990234375, + "learning_rate": 4.952664189228529e-07, + "loss": 0.0002, + "reward": 1.7839286178350449, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12174 + }, + { + "completion_length": 239.91965579986572, + "epoch": 2.041577601743577, + "grad_norm": 0.1620207173064198, + "kl": 0.160888671875, + "learning_rate": 4.952640953582114e-07, + "loss": 0.0002, + "reward": 1.7428572103381157, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571879863739, + "rewards/format_reward_func": 1.0, + "step": 12176 + }, + { + "completion_length": 237.92858028411865, + "epoch": 2.0419129049834446, + "grad_norm": 0.21874752432843478, + "kl": 0.17864990234375, + "learning_rate": 4.95261771228881e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 12178 + }, + { + "completion_length": 251.3616180419922, + "epoch": 2.0422482082233118, + "grad_norm": 0.26590850068573935, + "kl": 0.19781494140625, + "learning_rate": 4.952594465348672e-07, + "loss": 0.0002, + "reward": 1.7482143491506577, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7526785936206579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12180 + }, + { + "completion_length": 225.64286708831787, + "epoch": 2.0425835114631794, + "grad_norm": 0.20064903092848752, + "kl": 0.1510009765625, + "learning_rate": 4.95257121276175e-07, + "loss": 0.0002, + "reward": 1.8196429088711739, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.833035733550787, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12182 + }, + { + "completion_length": 246.2946538925171, + "epoch": 2.042918814703047, + "grad_norm": 0.2570804310225239, + "kl": 0.225006103515625, + "learning_rate": 4.952547954528101e-07, + "loss": 0.0002, + "reward": 1.7053572162985802, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214793950319, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12184 + }, + { + "completion_length": 231.93304634094238, + "epoch": 2.0432541179429147, + "grad_norm": 0.24222254138144964, + "kl": 0.16400146484375, + "learning_rate": 4.952524690647778e-07, + "loss": 0.0002, + "reward": 1.7910714745521545, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7955357544124126, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12186 + }, + { + "completion_length": 236.96876049041748, + "epoch": 2.0435894211827823, + "grad_norm": 0.24676297049278167, + "kl": 0.234893798828125, + "learning_rate": 4.952501421120832e-07, + "loss": 0.0002, + "reward": 1.7892857566475868, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.798214316368103, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12188 + }, + { + "completion_length": 232.9687623977661, + "epoch": 2.04392472442265, + "grad_norm": 0.20240000669952848, + "kl": 0.198699951171875, + "learning_rate": 4.952478145947321e-07, + "loss": 0.0002, + "reward": 1.7482143566012383, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7616071663796902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12190 + }, + { + "completion_length": 234.76340198516846, + "epoch": 2.044260027662517, + "grad_norm": 0.2367568215370992, + "kl": 0.19390869140625, + "learning_rate": 4.952454865127295e-07, + "loss": 0.0002, + "reward": 1.7642857879400253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857320606709, + "rewards/format_reward_func": 1.0, + "step": 12192 + }, + { + "completion_length": 226.4553680419922, + "epoch": 2.0445953309023848, + "grad_norm": 0.3957196761252101, + "kl": 0.1790771484375, + "learning_rate": 4.952431578660807e-07, + "loss": 0.0002, + "reward": 1.7803572118282318, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12194 + }, + { + "completion_length": 238.0446548461914, + "epoch": 2.0449306341422524, + "grad_norm": 0.41923647210505366, + "kl": 0.213134765625, + "learning_rate": 4.952408286547913e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714767873287, + "rewards/format_reward_func": 1.0, + "step": 12196 + }, + { + "completion_length": 230.87054824829102, + "epoch": 2.04526593738212, + "grad_norm": 0.16331890639992575, + "kl": 0.154632568359375, + "learning_rate": 4.952384988788666e-07, + "loss": 0.0002, + "reward": 1.6553572416305542, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.6598214600235224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12198 + }, + { + "completion_length": 224.12500858306885, + "epoch": 2.0456012406219877, + "grad_norm": 0.4519803190382613, + "kl": 0.20135498046875, + "learning_rate": 4.95236168538312e-07, + "loss": 0.0002, + "reward": 1.7107143625617027, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7107143215835094, + "rewards/format_reward_func": 1.0, + "step": 12200 + }, + { + "completion_length": 226.50000762939453, + "epoch": 2.045936543861855, + "grad_norm": 0.338038635536733, + "kl": 0.194580078125, + "learning_rate": 4.952338376331327e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 12202 + }, + { + "completion_length": 223.4464406967163, + "epoch": 2.0462718471017225, + "grad_norm": 0.10915545282494744, + "kl": 0.13482666015625, + "learning_rate": 4.952315061633343e-07, + "loss": 0.0001, + "reward": 1.8285714909434319, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8285714574158192, + "rewards/format_reward_func": 1.0, + "step": 12204 + }, + { + "completion_length": 224.90179824829102, + "epoch": 2.04660715034159, + "grad_norm": 0.09504629135042848, + "kl": 0.133575439453125, + "learning_rate": 4.952291741289221e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.781250037252903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12206 + }, + { + "completion_length": 225.60268878936768, + "epoch": 2.0469424535814578, + "grad_norm": 0.1865660206136097, + "kl": 0.11761474609375, + "learning_rate": 4.952268415299013e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000104308128, + "rewards/format_reward_func": 1.0, + "step": 12208 + }, + { + "completion_length": 228.60268878936768, + "epoch": 2.0472777568213254, + "grad_norm": 0.22629386598093604, + "kl": 0.16473388671875, + "learning_rate": 4.952245083662774e-07, + "loss": 0.0002, + "reward": 1.7535715252161026, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 12210 + }, + { + "completion_length": 230.33929634094238, + "epoch": 2.047613060061193, + "grad_norm": 0.27102016159457293, + "kl": 0.1397705078125, + "learning_rate": 4.952221746380557e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 12212 + }, + { + "completion_length": 231.64286994934082, + "epoch": 2.0479483633010602, + "grad_norm": 0.29415841832577194, + "kl": 0.14801025390625, + "learning_rate": 4.952198403452417e-07, + "loss": 0.0001, + "reward": 1.7142857760190964, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.714285746216774, + "rewards/format_reward_func": 1.0, + "step": 12214 + }, + { + "completion_length": 228.3616180419922, + "epoch": 2.048283666540928, + "grad_norm": 0.27608760539171945, + "kl": 0.14324951171875, + "learning_rate": 4.952175054878407e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 12216 + }, + { + "completion_length": 221.0000057220459, + "epoch": 2.0486189697807955, + "grad_norm": 0.16831740463669606, + "kl": 0.14044189453125, + "learning_rate": 4.952151700658581e-07, + "loss": 0.0001, + "reward": 1.7767857611179352, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12218 + }, + { + "completion_length": 230.52233028411865, + "epoch": 2.048954273020663, + "grad_norm": 0.30209937383700664, + "kl": 0.13238525390625, + "learning_rate": 4.952128340792992e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857488244772, + "rewards/format_reward_func": 1.0, + "step": 12220 + }, + { + "completion_length": 222.59375953674316, + "epoch": 2.0492895762605308, + "grad_norm": 0.3109690344702524, + "kl": 0.151123046875, + "learning_rate": 4.952104975281696e-07, + "loss": 0.0002, + "reward": 1.7535715252161026, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 12222 + }, + { + "completion_length": 232.5134038925171, + "epoch": 2.049624879500398, + "grad_norm": 0.2744549215687352, + "kl": 0.1292724609375, + "learning_rate": 4.952081604124743e-07, + "loss": 0.0001, + "reward": 1.6910715103149414, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.6955357603728771, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12224 + }, + { + "completion_length": 235.00001049041748, + "epoch": 2.0499601827402656, + "grad_norm": 0.5163864123677877, + "kl": 0.248382568359375, + "learning_rate": 4.952058227322191e-07, + "loss": 0.0002, + "reward": 1.7910714820027351, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7955357506871223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12226 + }, + { + "completion_length": 220.38840293884277, + "epoch": 2.0502954859801332, + "grad_norm": 0.07724610768291482, + "kl": 0.1690673828125, + "learning_rate": 4.952034844874091e-07, + "loss": 0.0002, + "reward": 1.7964286133646965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 12228 + }, + { + "completion_length": 242.69197463989258, + "epoch": 2.050630789220001, + "grad_norm": 0.16030103752196245, + "kl": 0.132568359375, + "learning_rate": 4.952011456780497e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500134110451, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12230 + }, + { + "completion_length": 226.81251049041748, + "epoch": 2.0509660924598685, + "grad_norm": 0.17879382796345136, + "kl": 0.123504638671875, + "learning_rate": 4.951988063041464e-07, + "loss": 0.0001, + "reward": 1.7625000327825546, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12232 + }, + { + "completion_length": 227.62500858306885, + "epoch": 2.051301395699736, + "grad_norm": 0.26533960090283815, + "kl": 0.143218994140625, + "learning_rate": 4.951964663657046e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8035714700818062, + "rewards/format_reward_func": 1.0, + "step": 12234 + }, + { + "completion_length": 235.46875953674316, + "epoch": 2.0516366989396033, + "grad_norm": 0.3753427425352448, + "kl": 0.139190673828125, + "learning_rate": 4.951941258627294e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 12236 + }, + { + "completion_length": 230.11608028411865, + "epoch": 2.051972002179471, + "grad_norm": 0.35682220127991493, + "kl": 0.129425048828125, + "learning_rate": 4.951917847952266e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428693830967, + "rewards/format_reward_func": 1.0, + "step": 12238 + }, + { + "completion_length": 222.79465198516846, + "epoch": 2.0523073054193386, + "grad_norm": 0.13520711911359942, + "kl": 0.142059326171875, + "learning_rate": 4.951894431632014e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928969502449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12240 + }, + { + "completion_length": 218.290189743042, + "epoch": 2.0526426086592062, + "grad_norm": 0.13303137022439182, + "kl": 0.121063232421875, + "learning_rate": 4.951871009666591e-07, + "loss": 0.0001, + "reward": 1.7785715088248253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 12242 + }, + { + "completion_length": 223.2455472946167, + "epoch": 2.052977911899074, + "grad_norm": 0.2376218232090186, + "kl": 0.28485107421875, + "learning_rate": 4.951847582056053e-07, + "loss": 0.0003, + "reward": 1.8000000566244125, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 12244 + }, + { + "completion_length": 207.01786708831787, + "epoch": 2.0533132151389415, + "grad_norm": 0.09265186879940798, + "kl": 0.1336669921875, + "learning_rate": 4.951824148800452e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428723633289, + "rewards/format_reward_func": 1.0, + "step": 12246 + }, + { + "completion_length": 219.26786613464355, + "epoch": 2.0536485183788087, + "grad_norm": 0.2203674172881168, + "kl": 0.193115234375, + "learning_rate": 4.951800709899843e-07, + "loss": 0.0002, + "reward": 1.7607143595814705, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 12248 + }, + { + "completion_length": 219.40625762939453, + "epoch": 2.0539838216186763, + "grad_norm": 0.11510668979914045, + "kl": 0.20257568359375, + "learning_rate": 4.95177726535428e-07, + "loss": 0.0002, + "reward": 1.7500000819563866, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 12250 + }, + { + "completion_length": 217.38840198516846, + "epoch": 2.054319124858544, + "grad_norm": 0.16410520017656868, + "kl": 0.1424102783203125, + "learning_rate": 4.951753815163816e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 12252 + }, + { + "completion_length": 211.48215198516846, + "epoch": 2.0546544280984116, + "grad_norm": 0.18802458515738016, + "kl": 0.313232421875, + "learning_rate": 4.951730359328507e-07, + "loss": 0.0003, + "reward": 1.7964286506175995, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 12254 + }, + { + "completion_length": 217.08483123779297, + "epoch": 2.0549897313382792, + "grad_norm": 0.3090721220893524, + "kl": 0.201141357421875, + "learning_rate": 4.951706897848404e-07, + "loss": 0.0002, + "reward": 1.7928571999073029, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 12256 + }, + { + "completion_length": 214.23661518096924, + "epoch": 2.0553250345781464, + "grad_norm": 0.21218909376421832, + "kl": 0.141326904296875, + "learning_rate": 4.951683430723563e-07, + "loss": 0.0001, + "reward": 1.7642858102917671, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 12258 + }, + { + "completion_length": 216.852689743042, + "epoch": 2.055660337818014, + "grad_norm": 0.32296017850789377, + "kl": 0.15814208984375, + "learning_rate": 4.951659957954039e-07, + "loss": 0.0002, + "reward": 1.8017857819795609, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8062500059604645, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12260 + }, + { + "completion_length": 214.16072463989258, + "epoch": 2.0559956410578817, + "grad_norm": 0.5911557976325647, + "kl": 0.276123046875, + "learning_rate": 4.951636479539883e-07, + "loss": 0.0003, + "reward": 1.8267857432365417, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8312500342726707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12262 + }, + { + "completion_length": 226.8125123977661, + "epoch": 2.0563309442977493, + "grad_norm": 0.3042559685781432, + "kl": 0.15435791015625, + "learning_rate": 4.951612995481152e-07, + "loss": 0.0002, + "reward": 1.7553572058677673, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12264 + }, + { + "completion_length": 213.71429634094238, + "epoch": 2.056666247537617, + "grad_norm": 0.2080678635967977, + "kl": 0.33575439453125, + "learning_rate": 4.951589505777899e-07, + "loss": 0.0003, + "reward": 1.7821428999304771, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 12266 + }, + { + "completion_length": 217.05804443359375, + "epoch": 2.0570015507774846, + "grad_norm": 0.1594908542843629, + "kl": 0.2147216796875, + "learning_rate": 4.951566010430177e-07, + "loss": 0.0002, + "reward": 1.7607143372297287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 12268 + }, + { + "completion_length": 209.49554443359375, + "epoch": 2.057336854017352, + "grad_norm": 0.1559413315105943, + "kl": 0.192535400390625, + "learning_rate": 4.951542509438041e-07, + "loss": 0.0002, + "reward": 1.7589286342263222, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7633928954601288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12270 + }, + { + "completion_length": 201.78125858306885, + "epoch": 2.0576721572572194, + "grad_norm": 0.271571152588572, + "kl": 0.2626953125, + "learning_rate": 4.951519002801546e-07, + "loss": 0.0003, + "reward": 1.7142858058214188, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7142857499420643, + "rewards/format_reward_func": 1.0, + "step": 12272 + }, + { + "completion_length": 202.54911613464355, + "epoch": 2.058007460497087, + "grad_norm": 0.30856242436789477, + "kl": 0.436859130859375, + "learning_rate": 4.951495490520745e-07, + "loss": 0.0004, + "reward": 1.7571429386734962, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 12274 + }, + { + "completion_length": 205.22322273254395, + "epoch": 2.0583427637369547, + "grad_norm": 0.21680426274120007, + "kl": 0.1307373046875, + "learning_rate": 4.951471972595694e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 12276 + }, + { + "completion_length": 199.67858123779297, + "epoch": 2.0586780669768223, + "grad_norm": 0.10996703065060714, + "kl": 0.391815185546875, + "learning_rate": 4.951448449026443e-07, + "loss": 0.0004, + "reward": 1.7928571924567223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571552038193, + "rewards/format_reward_func": 1.0, + "step": 12278 + }, + { + "completion_length": 204.74108123779297, + "epoch": 2.0590133702166895, + "grad_norm": 0.09081324500665985, + "kl": 1.105133056640625, + "learning_rate": 4.95142491981305e-07, + "loss": 0.0011, + "reward": 1.8071429207921028, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 12280 + }, + { + "completion_length": 214.883939743042, + "epoch": 2.059348673456557, + "grad_norm": 0.34335413647230867, + "kl": 0.147857666015625, + "learning_rate": 4.951401384955568e-07, + "loss": 0.0001, + "reward": 1.7107143551111221, + "reward_std": 0.035355339758098125, + "rewards/equation_reward_func": 0.7196428906172514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12282 + }, + { + "completion_length": 211.67857933044434, + "epoch": 2.059683976696425, + "grad_norm": 0.42040633375445585, + "kl": 0.171630859375, + "learning_rate": 4.951377844454051e-07, + "loss": 0.0002, + "reward": 1.7160715013742447, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7205357477068901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12284 + }, + { + "completion_length": 216.30804634094238, + "epoch": 2.0600192799362924, + "grad_norm": 0.03826716338430946, + "kl": 1.191925048828125, + "learning_rate": 4.951354298308554e-07, + "loss": 0.0012, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 12286 + }, + { + "completion_length": 212.11161613464355, + "epoch": 2.06035458317616, + "grad_norm": 0.3257955722546039, + "kl": 0.256622314453125, + "learning_rate": 4.951330746519129e-07, + "loss": 0.0003, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 12288 + }, + { + "completion_length": 211.2232255935669, + "epoch": 2.0606898864160277, + "grad_norm": 0.1655865158957351, + "kl": 0.50634765625, + "learning_rate": 4.951307189085833e-07, + "loss": 0.0005, + "reward": 1.7321429178118706, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428973227739, + "rewards/format_reward_func": 1.0, + "step": 12290 + }, + { + "completion_length": 216.52679443359375, + "epoch": 2.061025189655895, + "grad_norm": 0.33884942243688243, + "kl": 0.244232177734375, + "learning_rate": 4.951283626008717e-07, + "loss": 0.0002, + "reward": 1.7321429401636124, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 1.0, + "step": 12292 + }, + { + "completion_length": 208.66072463989258, + "epoch": 2.0613604928957625, + "grad_norm": 0.24913338205065425, + "kl": 0.36474609375, + "learning_rate": 4.951260057287839e-07, + "loss": 0.0004, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 12294 + }, + { + "completion_length": 208.3616180419922, + "epoch": 2.06169579613563, + "grad_norm": 0.19352248282013856, + "kl": 0.12322998046875, + "learning_rate": 4.951236482923252e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857566475868, + "rewards/format_reward_func": 1.0, + "step": 12296 + }, + { + "completion_length": 208.8303689956665, + "epoch": 2.062031099375498, + "grad_norm": 0.08871718127407249, + "kl": 0.1153717041015625, + "learning_rate": 4.951212902915009e-07, + "loss": 0.0001, + "reward": 1.8196429163217545, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8241071626543999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12298 + }, + { + "completion_length": 227.90625762939453, + "epoch": 2.0623664026153654, + "grad_norm": 0.1901467761656163, + "kl": 0.38531494140625, + "learning_rate": 4.951189317263164e-07, + "loss": 0.0004, + "reward": 1.7214286625385284, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7303571626543999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12300 + }, + { + "completion_length": 209.73661518096924, + "epoch": 2.0627017058552326, + "grad_norm": 0.2980171673664818, + "kl": 0.2169189453125, + "learning_rate": 4.951165725967774e-07, + "loss": 0.0002, + "reward": 1.767410784959793, + "reward_std": 0.0460882093757391, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9968750067055225, + "step": 12302 + }, + { + "completion_length": 205.17411518096924, + "epoch": 2.0630370090951002, + "grad_norm": 0.12847052840527082, + "kl": 0.134552001953125, + "learning_rate": 4.951142129028891e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 12304 + }, + { + "completion_length": 215.54018783569336, + "epoch": 2.063372312334968, + "grad_norm": 0.1620875253578144, + "kl": 0.37359619140625, + "learning_rate": 4.951118526446569e-07, + "loss": 0.0004, + "reward": 1.7714286372065544, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 12306 + }, + { + "completion_length": 225.3259038925171, + "epoch": 2.0637076155748355, + "grad_norm": 0.24326793036443256, + "kl": 0.216827392578125, + "learning_rate": 4.951094918220865e-07, + "loss": 0.0002, + "reward": 1.814285770058632, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 12308 + }, + { + "completion_length": 211.29018878936768, + "epoch": 2.064042918814703, + "grad_norm": 0.16845783070573056, + "kl": 0.169158935546875, + "learning_rate": 4.951071304351831e-07, + "loss": 0.0002, + "reward": 1.7928571999073029, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 12310 + }, + { + "completion_length": 205.84822368621826, + "epoch": 2.0643782220545708, + "grad_norm": 0.4214904190744296, + "kl": 0.1233978271484375, + "learning_rate": 4.951047684839522e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 12312 + }, + { + "completion_length": 210.79465007781982, + "epoch": 2.064713525294438, + "grad_norm": 0.21442170307890102, + "kl": 0.15289306640625, + "learning_rate": 4.951024059683993e-07, + "loss": 0.0002, + "reward": 1.7580357789993286, + "reward_std": 0.059346459805965424, + "rewards/equation_reward_func": 0.7598214745521545, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12314 + }, + { + "completion_length": 211.85715103149414, + "epoch": 2.0650488285343056, + "grad_norm": 0.3091366156084564, + "kl": 0.134857177734375, + "learning_rate": 4.951000428885297e-07, + "loss": 0.0001, + "reward": 1.844642885029316, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.849107164889574, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12316 + }, + { + "completion_length": 212.8571538925171, + "epoch": 2.0653841317741732, + "grad_norm": 0.45210288819457994, + "kl": 0.677154541015625, + "learning_rate": 4.95097679244349e-07, + "loss": 0.0007, + "reward": 1.7339286506175995, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928894996643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12318 + }, + { + "completion_length": 227.17411708831787, + "epoch": 2.065719435014041, + "grad_norm": 0.2152976862107728, + "kl": 0.2509307861328125, + "learning_rate": 4.950953150358625e-07, + "loss": 0.0003, + "reward": 1.8232143223285675, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8276785910129547, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12320 + }, + { + "completion_length": 227.540189743042, + "epoch": 2.0660547382539085, + "grad_norm": 0.5459242095577989, + "kl": 0.271026611328125, + "learning_rate": 4.950929502630757e-07, + "loss": 0.0003, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 12322 + }, + { + "completion_length": 220.99554347991943, + "epoch": 2.066390041493776, + "grad_norm": 0.425233638393935, + "kl": 0.907958984375, + "learning_rate": 4.950905849259942e-07, + "loss": 0.0009, + "reward": 1.7821429446339607, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 12324 + }, + { + "completion_length": 234.6250114440918, + "epoch": 2.0667253447336433, + "grad_norm": 0.3270812138751066, + "kl": 0.9796600341796875, + "learning_rate": 4.950882190246232e-07, + "loss": 0.001, + "reward": 1.6357143744826317, + "reward_std": 0.11111677903681993, + "rewards/equation_reward_func": 0.6625000331550837, + "rewards/format_reward_func": 0.9732142984867096, + "step": 12326 + }, + { + "completion_length": 221.82143878936768, + "epoch": 2.067060647973511, + "grad_norm": 0.2345927392282807, + "kl": 0.71343994140625, + "learning_rate": 4.950858525589682e-07, + "loss": 0.0007, + "reward": 1.7857143357396126, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7946428917348385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12328 + }, + { + "completion_length": 222.04018783569336, + "epoch": 2.0673959512133786, + "grad_norm": 0.1827131215605797, + "kl": 0.113525390625, + "learning_rate": 4.950834855290347e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 12330 + }, + { + "completion_length": 212.12500953674316, + "epoch": 2.0677312544532462, + "grad_norm": 0.17331603165530027, + "kl": 0.10479736328125, + "learning_rate": 4.950811179348282e-07, + "loss": 0.0001, + "reward": 1.8535714596509933, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8535714522004128, + "rewards/format_reward_func": 1.0, + "step": 12332 + }, + { + "completion_length": 226.7901906967163, + "epoch": 2.068066557693114, + "grad_norm": 0.2783940467018383, + "kl": 2.2259521484375, + "learning_rate": 4.950787497763541e-07, + "loss": 0.0022, + "reward": 1.7232143580913544, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 0.9875000044703484, + "step": 12334 + }, + { + "completion_length": 220.68751049041748, + "epoch": 2.068401860932981, + "grad_norm": 0.2831385527880495, + "kl": 0.7620086669921875, + "learning_rate": 4.950763810536178e-07, + "loss": 0.0008, + "reward": 1.8035714998841286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 12336 + }, + { + "completion_length": 225.80804634094238, + "epoch": 2.0687371641728487, + "grad_norm": 0.2587481213808961, + "kl": 2.004638671875, + "learning_rate": 4.950740117666248e-07, + "loss": 0.002, + "reward": 1.7678572311997414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 12338 + }, + { + "completion_length": 225.87501049041748, + "epoch": 2.0690724674127163, + "grad_norm": 0.14320682134950544, + "kl": 0.146728515625, + "learning_rate": 4.950716419153806e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 12340 + }, + { + "completion_length": 226.38840198516846, + "epoch": 2.069407770652584, + "grad_norm": 0.1876906013637508, + "kl": 0.278961181640625, + "learning_rate": 4.950692714998906e-07, + "loss": 0.0003, + "reward": 1.7241072282195091, + "reward_std": 0.06692260596901178, + "rewards/equation_reward_func": 0.7303571663796902, + "rewards/format_reward_func": 0.9937500059604645, + "step": 12342 + }, + { + "completion_length": 234.2946538925171, + "epoch": 2.0697430738924516, + "grad_norm": 0.37498717092004047, + "kl": 0.311004638671875, + "learning_rate": 4.950669005201603e-07, + "loss": 0.0003, + "reward": 1.716071493923664, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7205357626080513, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12344 + }, + { + "completion_length": 218.04465579986572, + "epoch": 2.0700783771323192, + "grad_norm": 0.18627335088277233, + "kl": 0.1470947265625, + "learning_rate": 4.950645289761952e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000178813934, + "rewards/format_reward_func": 1.0, + "step": 12346 + }, + { + "completion_length": 228.9017972946167, + "epoch": 2.0704136803721864, + "grad_norm": 0.3317378232716388, + "kl": 0.1026153564453125, + "learning_rate": 4.950621568680006e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12348 + }, + { + "completion_length": 215.88840198516846, + "epoch": 2.070748983612054, + "grad_norm": 0.13501275717490083, + "kl": 0.2255859375, + "learning_rate": 4.950597841955821e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7767857350409031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12350 + }, + { + "completion_length": 223.21429443359375, + "epoch": 2.0710842868519217, + "grad_norm": 0.1918316612561862, + "kl": 0.6068115234375, + "learning_rate": 4.95057410958945e-07, + "loss": 0.0006, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 12352 + }, + { + "completion_length": 222.90179538726807, + "epoch": 2.0714195900917893, + "grad_norm": 0.28909041151334275, + "kl": 0.0965576171875, + "learning_rate": 4.95055037158095e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857428640127, + "rewards/format_reward_func": 1.0, + "step": 12354 + }, + { + "completion_length": 232.17411708831787, + "epoch": 2.071754893331657, + "grad_norm": 0.23212451175861568, + "kl": 0.3805084228515625, + "learning_rate": 4.950526627930374e-07, + "loss": 0.0004, + "reward": 1.7410715073347092, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357536673546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12356 + }, + { + "completion_length": 229.99108028411865, + "epoch": 2.0720901965715246, + "grad_norm": 0.1909832712879006, + "kl": 0.110687255859375, + "learning_rate": 4.950502878637776e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 12358 + }, + { + "completion_length": 220.53572463989258, + "epoch": 2.072425499811392, + "grad_norm": 0.10395956700858099, + "kl": 0.14569091796875, + "learning_rate": 4.950479123703213e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428742259741, + "rewards/format_reward_func": 1.0, + "step": 12360 + }, + { + "completion_length": 219.85715103149414, + "epoch": 2.0727608030512594, + "grad_norm": 0.18317384805951137, + "kl": 0.105224609375, + "learning_rate": 4.950455363126739e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428868919611, + "rewards/format_reward_func": 1.0, + "step": 12362 + }, + { + "completion_length": 223.12500953674316, + "epoch": 2.073096106291127, + "grad_norm": 0.1526248796039914, + "kl": 0.239898681640625, + "learning_rate": 4.950431596908408e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428846567869, + "rewards/format_reward_func": 1.0, + "step": 12364 + }, + { + "completion_length": 220.6250114440918, + "epoch": 2.0734314095309947, + "grad_norm": 0.25779972652365757, + "kl": 0.1161346435546875, + "learning_rate": 4.950407825048273e-07, + "loss": 0.0001, + "reward": 1.791071467101574, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12366 + }, + { + "completion_length": 214.35268688201904, + "epoch": 2.0737667127708623, + "grad_norm": 0.19846381452816875, + "kl": 0.09454345703125, + "learning_rate": 4.950384047546393e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 12368 + }, + { + "completion_length": 220.6384038925171, + "epoch": 2.0741020160107295, + "grad_norm": 0.21754462554569812, + "kl": 0.136505126953125, + "learning_rate": 4.95036026440282e-07, + "loss": 0.0001, + "reward": 1.751785770058632, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12370 + }, + { + "completion_length": 225.25001049041748, + "epoch": 2.074437319250597, + "grad_norm": 0.18328997328817917, + "kl": 0.206512451171875, + "learning_rate": 4.950336475617608e-07, + "loss": 0.0002, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12372 + }, + { + "completion_length": 221.01340293884277, + "epoch": 2.074772622490465, + "grad_norm": 0.2382505115459546, + "kl": 0.163055419921875, + "learning_rate": 4.950312681190813e-07, + "loss": 0.0002, + "reward": 1.7660714983940125, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.770535733550787, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12374 + }, + { + "completion_length": 221.47322368621826, + "epoch": 2.0751079257303324, + "grad_norm": 0.36282886473131615, + "kl": 0.253021240234375, + "learning_rate": 4.950288881122491e-07, + "loss": 0.0003, + "reward": 1.8071429058909416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428742259741, + "rewards/format_reward_func": 1.0, + "step": 12376 + }, + { + "completion_length": 219.15179634094238, + "epoch": 2.0754432289702, + "grad_norm": 0.06999156876761552, + "kl": 0.363494873046875, + "learning_rate": 4.950265075412694e-07, + "loss": 0.0004, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000171363354, + "rewards/format_reward_func": 1.0, + "step": 12378 + }, + { + "completion_length": 207.70536613464355, + "epoch": 2.0757785322100677, + "grad_norm": 0.21250234133357188, + "kl": 0.13916015625, + "learning_rate": 4.95024126406148e-07, + "loss": 0.0001, + "reward": 1.817857176065445, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.817857176065445, + "rewards/format_reward_func": 1.0, + "step": 12380 + }, + { + "completion_length": 221.18750953674316, + "epoch": 2.076113835449935, + "grad_norm": 0.21523313575949132, + "kl": 0.14617919921875, + "learning_rate": 4.950217447068901e-07, + "loss": 0.0001, + "reward": 1.7160715162754059, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.720535758882761, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12382 + }, + { + "completion_length": 211.87500953674316, + "epoch": 2.0764491386898025, + "grad_norm": 0.1939850515633133, + "kl": 0.119110107421875, + "learning_rate": 4.950193624435013e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000242143869, + "rewards/format_reward_func": 1.0, + "step": 12384 + }, + { + "completion_length": 220.07590198516846, + "epoch": 2.07678444192967, + "grad_norm": 0.23250932473523991, + "kl": 0.113616943359375, + "learning_rate": 4.950169796159871e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 12386 + }, + { + "completion_length": 217.35715293884277, + "epoch": 2.077119745169538, + "grad_norm": 0.21520081391524276, + "kl": 0.161041259765625, + "learning_rate": 4.950145962243529e-07, + "loss": 0.0002, + "reward": 1.785714328289032, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 12388 + }, + { + "completion_length": 223.40179538726807, + "epoch": 2.0774550484094054, + "grad_norm": 0.751735218253887, + "kl": 1.289947509765625, + "learning_rate": 4.950122122686044e-07, + "loss": 0.0013, + "reward": 1.792857177555561, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571589291096, + "rewards/format_reward_func": 1.0, + "step": 12390 + }, + { + "completion_length": 220.88840293884277, + "epoch": 2.0777903516492726, + "grad_norm": 0.2627694441752578, + "kl": 0.12933349609375, + "learning_rate": 4.950098277487468e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 12392 + }, + { + "completion_length": 217.602689743042, + "epoch": 2.0781256548891403, + "grad_norm": 0.09369122758849056, + "kl": 0.142425537109375, + "learning_rate": 4.950074426647858e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7348214723169804, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12394 + }, + { + "completion_length": 213.12947368621826, + "epoch": 2.078460958129008, + "grad_norm": 0.22262652267705993, + "kl": 0.17474365234375, + "learning_rate": 4.950050570167268e-07, + "loss": 0.0002, + "reward": 1.8000000640749931, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 12396 + }, + { + "completion_length": 216.79911518096924, + "epoch": 2.0787962613688755, + "grad_norm": 0.252928583623105, + "kl": 0.204071044921875, + "learning_rate": 4.950026708045754e-07, + "loss": 0.0002, + "reward": 1.7803572416305542, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12398 + }, + { + "completion_length": 222.32590293884277, + "epoch": 2.079131564608743, + "grad_norm": 0.1992977674304534, + "kl": 0.112945556640625, + "learning_rate": 4.95000284028337e-07, + "loss": 0.0001, + "reward": 1.756696492433548, + "reward_std": 0.04103744635358453, + "rewards/equation_reward_func": 0.7598214615136385, + "rewards/format_reward_func": 0.9968750029802322, + "step": 12400 + }, + { + "completion_length": 229.6071538925171, + "epoch": 2.079466867848611, + "grad_norm": 0.8789151158222025, + "kl": 0.360076904296875, + "learning_rate": 4.94997896688017e-07, + "loss": 0.0004, + "reward": 1.7660714909434319, + "reward_std": 0.07828682195395231, + "rewards/equation_reward_func": 0.7794643081724644, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12402 + }, + { + "completion_length": 228.88840103149414, + "epoch": 2.079802171088478, + "grad_norm": 0.1887117558689548, + "kl": 0.135833740234375, + "learning_rate": 4.94995508783621e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 12404 + }, + { + "completion_length": 232.11608409881592, + "epoch": 2.0801374743283456, + "grad_norm": 0.19187944328862966, + "kl": 0.108917236328125, + "learning_rate": 4.949931203151547e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7419643066823483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12406 + }, + { + "completion_length": 233.0803689956665, + "epoch": 2.0804727775682132, + "grad_norm": 0.21875754778227985, + "kl": 0.12646484375, + "learning_rate": 4.949907312826231e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7758928798139095, + "rewards/format_reward_func": 0.9866071455180645, + "step": 12408 + }, + { + "completion_length": 221.72768688201904, + "epoch": 2.080808080808081, + "grad_norm": 0.28006826916314953, + "kl": 0.11962890625, + "learning_rate": 4.949883416860322e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12410 + }, + { + "completion_length": 231.99108028411865, + "epoch": 2.0811433840479485, + "grad_norm": 0.10164294385984807, + "kl": 0.131683349609375, + "learning_rate": 4.949859515253873e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 12412 + }, + { + "completion_length": 226.0044755935669, + "epoch": 2.0814786872878157, + "grad_norm": 0.2045897020362465, + "kl": 0.165740966796875, + "learning_rate": 4.949835608006939e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 12414 + }, + { + "completion_length": 236.52679443359375, + "epoch": 2.0818139905276833, + "grad_norm": 0.16084116411428384, + "kl": 0.12066650390625, + "learning_rate": 4.949811695119574e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7517857365310192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12416 + }, + { + "completion_length": 232.37947845458984, + "epoch": 2.082149293767551, + "grad_norm": 0.21511452457931682, + "kl": 0.130126953125, + "learning_rate": 4.949787776591836e-07, + "loss": 0.0001, + "reward": 1.7767857387661934, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500391155481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12418 + }, + { + "completion_length": 229.25447463989258, + "epoch": 2.0824845970074186, + "grad_norm": 0.21129750414378579, + "kl": 0.1319732666015625, + "learning_rate": 4.949763852423776e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143312692642, + "rewards/format_reward_func": 1.0, + "step": 12420 + }, + { + "completion_length": 237.6964406967163, + "epoch": 2.0828199002472862, + "grad_norm": 0.40663412421997575, + "kl": 0.1898193359375, + "learning_rate": 4.949739922615454e-07, + "loss": 0.0002, + "reward": 1.7589286863803864, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928768336773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12422 + }, + { + "completion_length": 230.7946538925171, + "epoch": 2.083155203487154, + "grad_norm": 0.16470876741545637, + "kl": 0.1302337646484375, + "learning_rate": 4.949715987166921e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7839285936206579, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12424 + }, + { + "completion_length": 230.53572368621826, + "epoch": 2.083490506727021, + "grad_norm": 0.15797828640324696, + "kl": 0.15777587890625, + "learning_rate": 4.949692046078232e-07, + "loss": 0.0002, + "reward": 1.7857143580913544, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 12426 + }, + { + "completion_length": 237.6875114440918, + "epoch": 2.0838258099668887, + "grad_norm": 0.22081453106368287, + "kl": 0.161590576171875, + "learning_rate": 4.949668099349446e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 12428 + }, + { + "completion_length": 228.06250858306885, + "epoch": 2.0841611132067563, + "grad_norm": 0.22329552621134163, + "kl": 0.1348876953125, + "learning_rate": 4.949644146980615e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 1.0, + "step": 12430 + }, + { + "completion_length": 230.2991180419922, + "epoch": 2.084496416446624, + "grad_norm": 0.22023536636649677, + "kl": 0.164520263671875, + "learning_rate": 4.949620188971794e-07, + "loss": 0.0002, + "reward": 1.764285795390606, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12432 + }, + { + "completion_length": 224.04018878936768, + "epoch": 2.0848317196864916, + "grad_norm": 0.17440609622680234, + "kl": 0.1301422119140625, + "learning_rate": 4.94959622532304e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 12434 + }, + { + "completion_length": 232.14733219146729, + "epoch": 2.085167022926359, + "grad_norm": 0.17576071552016947, + "kl": 0.13531494140625, + "learning_rate": 4.949572256034406e-07, + "loss": 0.0001, + "reward": 1.783928632736206, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7883928753435612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12436 + }, + { + "completion_length": 232.62947463989258, + "epoch": 2.0855023261662264, + "grad_norm": 0.19723789067097333, + "kl": 0.14813232421875, + "learning_rate": 4.94954828110595e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7875000219792128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12438 + }, + { + "completion_length": 235.81697463989258, + "epoch": 2.085837629406094, + "grad_norm": 0.14950628149768866, + "kl": 0.1369171142578125, + "learning_rate": 4.949524300537726e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7589286081492901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12440 + }, + { + "completion_length": 230.64286708831787, + "epoch": 2.0861729326459617, + "grad_norm": 0.4378233230600497, + "kl": 0.23809814453125, + "learning_rate": 4.949500314329787e-07, + "loss": 0.0002, + "reward": 1.7089286223053932, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7133928835391998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12442 + }, + { + "completion_length": 222.9107255935669, + "epoch": 2.0865082358858293, + "grad_norm": 0.46352014756844934, + "kl": 0.178680419921875, + "learning_rate": 4.949476322482191e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 12444 + }, + { + "completion_length": 215.74554443359375, + "epoch": 2.086843539125697, + "grad_norm": 0.24448623383945575, + "kl": 0.13616943359375, + "learning_rate": 4.949452324994992e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 12446 + }, + { + "completion_length": 226.6250123977661, + "epoch": 2.087178842365564, + "grad_norm": 0.2482961130975891, + "kl": 0.15899658203125, + "learning_rate": 4.949428321868246e-07, + "loss": 0.0002, + "reward": 1.7714286372065544, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 12448 + }, + { + "completion_length": 232.67858409881592, + "epoch": 2.087514145605432, + "grad_norm": 0.31050673652855626, + "kl": 0.240020751953125, + "learning_rate": 4.949404313102008e-07, + "loss": 0.0002, + "reward": 1.7589286416769028, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928842842579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12450 + }, + { + "completion_length": 234.43304443359375, + "epoch": 2.0878494488452994, + "grad_norm": 0.26036864450294417, + "kl": 0.2906341552734375, + "learning_rate": 4.949380298696333e-07, + "loss": 0.0003, + "reward": 1.7723214700818062, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12452 + }, + { + "completion_length": 229.3616180419922, + "epoch": 2.088184752085167, + "grad_norm": 0.09615753218038946, + "kl": 0.14678955078125, + "learning_rate": 4.949356278651277e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.746428582817316, + "rewards/format_reward_func": 1.0, + "step": 12454 + }, + { + "completion_length": 237.2232255935669, + "epoch": 2.0885200553250347, + "grad_norm": 0.24540628173182102, + "kl": 0.163970947265625, + "learning_rate": 4.949332252966893e-07, + "loss": 0.0002, + "reward": 1.750000074505806, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7589285969734192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12456 + }, + { + "completion_length": 221.55804538726807, + "epoch": 2.0888553585649023, + "grad_norm": 0.18356936840482607, + "kl": 0.1224517822265625, + "learning_rate": 4.949308221643239e-07, + "loss": 0.0001, + "reward": 1.731250062584877, + "reward_std": 0.056821079924702644, + "rewards/equation_reward_func": 0.7375000398606062, + "rewards/format_reward_func": 0.9937500059604645, + "step": 12458 + }, + { + "completion_length": 229.7009038925171, + "epoch": 2.0891906618047695, + "grad_norm": 0.4034373111510341, + "kl": 0.2835540771484375, + "learning_rate": 4.949284184680369e-07, + "loss": 0.0003, + "reward": 1.8232143372297287, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8276785835623741, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12460 + }, + { + "completion_length": 220.22768878936768, + "epoch": 2.089525965044637, + "grad_norm": 0.1536477528310614, + "kl": 0.1710205078125, + "learning_rate": 4.949260142078339e-07, + "loss": 0.0002, + "reward": 1.8178571909666061, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 12462 + }, + { + "completion_length": 231.63393878936768, + "epoch": 2.089861268284505, + "grad_norm": 0.244357653418181, + "kl": 0.19512939453125, + "learning_rate": 4.949236093837204e-07, + "loss": 0.0002, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 12464 + }, + { + "completion_length": 225.87054538726807, + "epoch": 2.0901965715243724, + "grad_norm": 0.18870748260664152, + "kl": 0.23162841796875, + "learning_rate": 4.949212039957019e-07, + "loss": 0.0002, + "reward": 1.7629465013742447, + "reward_std": 0.05240166233852506, + "rewards/equation_reward_func": 0.764285733923316, + "rewards/format_reward_func": 0.9986607171595097, + "step": 12466 + }, + { + "completion_length": 216.29911613464355, + "epoch": 2.09053187476424, + "grad_norm": 0.19688421926182828, + "kl": 0.33929443359375, + "learning_rate": 4.949187980437841e-07, + "loss": 0.0003, + "reward": 1.7857143506407738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 12468 + }, + { + "completion_length": 222.70983219146729, + "epoch": 2.0908671780041073, + "grad_norm": 0.5548509345150273, + "kl": 0.38800048828125, + "learning_rate": 4.949163915279722e-07, + "loss": 0.0004, + "reward": 1.717857226729393, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571708500385, + "rewards/format_reward_func": 1.0, + "step": 12470 + }, + { + "completion_length": 210.02232933044434, + "epoch": 2.091202481243975, + "grad_norm": 0.20444495515084055, + "kl": 0.30419921875, + "learning_rate": 4.949139844482721e-07, + "loss": 0.0003, + "reward": 1.7696429193019867, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12472 + }, + { + "completion_length": 224.08483028411865, + "epoch": 2.0915377844838425, + "grad_norm": 0.22854212897739953, + "kl": 0.231231689453125, + "learning_rate": 4.949115768046893e-07, + "loss": 0.0002, + "reward": 1.8196429014205933, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8241071663796902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12474 + }, + { + "completion_length": 218.58929824829102, + "epoch": 2.09187308772371, + "grad_norm": 0.22404496442844024, + "kl": 0.5330810546875, + "learning_rate": 4.949091685972291e-07, + "loss": 0.0005, + "reward": 1.7660714983940125, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7705357223749161, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12476 + }, + { + "completion_length": 225.81251049041748, + "epoch": 2.092208390963578, + "grad_norm": 0.26550278680486267, + "kl": 0.1566162109375, + "learning_rate": 4.949067598258972e-07, + "loss": 0.0002, + "reward": 1.7589286491274834, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928805589676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12478 + }, + { + "completion_length": 228.25894165039062, + "epoch": 2.0925436942034454, + "grad_norm": 0.20035787461152282, + "kl": 0.33837890625, + "learning_rate": 4.949043504906992e-07, + "loss": 0.0003, + "reward": 1.8053571954369545, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8098214566707611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12480 + }, + { + "completion_length": 224.30358123779297, + "epoch": 2.0928789974433126, + "grad_norm": 0.09406973070022828, + "kl": 0.242889404296875, + "learning_rate": 4.949019405916405e-07, + "loss": 0.0002, + "reward": 1.82857146859169, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8285714648663998, + "rewards/format_reward_func": 1.0, + "step": 12482 + }, + { + "completion_length": 229.59375858306885, + "epoch": 2.0932143006831803, + "grad_norm": 0.2354022784534689, + "kl": 0.712188720703125, + "learning_rate": 4.948995301287268e-07, + "loss": 0.0007, + "reward": 1.814285770058632, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857551574707, + "rewards/format_reward_func": 1.0, + "step": 12484 + }, + { + "completion_length": 222.6607255935669, + "epoch": 2.093549603923048, + "grad_norm": 0.14301333677568714, + "kl": 0.337249755859375, + "learning_rate": 4.948971191019635e-07, + "loss": 0.0003, + "reward": 1.7839286401867867, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12486 + }, + { + "completion_length": 222.1116180419922, + "epoch": 2.0938849071629155, + "grad_norm": 0.09825610613763396, + "kl": 0.35711669921875, + "learning_rate": 4.948947075113563e-07, + "loss": 0.0004, + "reward": 1.7500000670552254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 12488 + }, + { + "completion_length": 230.32143783569336, + "epoch": 2.094220210402783, + "grad_norm": 0.01907351915692328, + "kl": 0.244720458984375, + "learning_rate": 4.948922953569107e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 12490 + }, + { + "completion_length": 228.71429443359375, + "epoch": 2.0945555136426504, + "grad_norm": 0.19290737070434144, + "kl": 0.406707763671875, + "learning_rate": 4.948898826386322e-07, + "loss": 0.0004, + "reward": 1.735714353621006, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 1.0, + "step": 12492 + }, + { + "completion_length": 230.04465293884277, + "epoch": 2.094890816882518, + "grad_norm": 0.19890650026038148, + "kl": 0.1217041015625, + "learning_rate": 4.948874693565263e-07, + "loss": 0.0001, + "reward": 1.7410715147852898, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7455357387661934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12494 + }, + { + "completion_length": 232.09375953674316, + "epoch": 2.0952261201223856, + "grad_norm": 0.24585345275777434, + "kl": 0.1280670166015625, + "learning_rate": 4.948850555105988e-07, + "loss": 0.0001, + "reward": 1.7214286476373672, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7303571812808514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12496 + }, + { + "completion_length": 227.74554634094238, + "epoch": 2.0955614233622533, + "grad_norm": 0.2588696781419015, + "kl": 0.45916748046875, + "learning_rate": 4.948826411008551e-07, + "loss": 0.0005, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428697556257, + "rewards/format_reward_func": 1.0, + "step": 12498 + }, + { + "completion_length": 245.97768878936768, + "epoch": 2.095896726602121, + "grad_norm": 0.428716342350395, + "kl": 0.237884521484375, + "learning_rate": 4.948802261273007e-07, + "loss": 0.0002, + "reward": 1.7392857894301414, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.748214315623045, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12500 + }, + { + "completion_length": 225.5044755935669, + "epoch": 2.0962320298419885, + "grad_norm": 0.16797785673849946, + "kl": 0.152435302734375, + "learning_rate": 4.948778105899412e-07, + "loss": 0.0002, + "reward": 1.8142857775092125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 12502 + }, + { + "completion_length": 239.165189743042, + "epoch": 2.0965673330818557, + "grad_norm": 0.13361972927868213, + "kl": 0.1374664306640625, + "learning_rate": 4.948753944887823e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 12504 + }, + { + "completion_length": 223.86608219146729, + "epoch": 2.0969026363217234, + "grad_norm": 0.1260006794495531, + "kl": 0.21466064453125, + "learning_rate": 4.948729778238293e-07, + "loss": 0.0002, + "reward": 1.8000000268220901, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 12506 + }, + { + "completion_length": 233.87947368621826, + "epoch": 2.097237939561591, + "grad_norm": 0.21277466514518686, + "kl": 0.112701416015625, + "learning_rate": 4.948705605950879e-07, + "loss": 0.0001, + "reward": 1.7750000357627869, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7839285973459482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12508 + }, + { + "completion_length": 237.5178689956665, + "epoch": 2.0975732428014586, + "grad_norm": 0.2227883614821665, + "kl": 0.1185150146484375, + "learning_rate": 4.948681428025638e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 12510 + }, + { + "completion_length": 250.70983219146729, + "epoch": 2.0979085460413263, + "grad_norm": 0.20100054630349626, + "kl": 0.16143798828125, + "learning_rate": 4.948657244462624e-07, + "loss": 0.0002, + "reward": 1.719642922282219, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7241071797907352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12512 + }, + { + "completion_length": 248.77233505249023, + "epoch": 2.098243849281194, + "grad_norm": 0.3901435180972612, + "kl": 0.18560791015625, + "learning_rate": 4.948633055261894e-07, + "loss": 0.0002, + "reward": 1.803571492433548, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 12514 + }, + { + "completion_length": 248.1026906967163, + "epoch": 2.098579152521061, + "grad_norm": 0.17118320801676853, + "kl": 0.18536376953125, + "learning_rate": 4.948608860423501e-07, + "loss": 0.0002, + "reward": 1.825000062584877, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000141561031, + "rewards/format_reward_func": 1.0, + "step": 12516 + }, + { + "completion_length": 253.0580472946167, + "epoch": 2.0989144557609287, + "grad_norm": 0.1819693796045121, + "kl": 0.36138916015625, + "learning_rate": 4.948584659947504e-07, + "loss": 0.0004, + "reward": 1.8250000402331352, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000327825546, + "rewards/format_reward_func": 1.0, + "step": 12518 + }, + { + "completion_length": 260.1785840988159, + "epoch": 2.0992497590007964, + "grad_norm": 0.3183018413425357, + "kl": 0.29443359375, + "learning_rate": 4.948560453833956e-07, + "loss": 0.0003, + "reward": 1.7267857939004898, + "reward_std": 0.0530330091714859, + "rewards/equation_reward_func": 0.7401785999536514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12520 + }, + { + "completion_length": 256.9642972946167, + "epoch": 2.099585062240664, + "grad_norm": 0.3034727556135848, + "kl": 0.21856689453125, + "learning_rate": 4.948536242082915e-07, + "loss": 0.0002, + "reward": 1.7339286357164383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12522 + }, + { + "completion_length": 252.75447750091553, + "epoch": 2.0999203654805316, + "grad_norm": 0.2733843588864411, + "kl": 0.21575927734375, + "learning_rate": 4.948512024694436e-07, + "loss": 0.0002, + "reward": 1.783928632736206, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928883820772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12524 + }, + { + "completion_length": 256.7901906967163, + "epoch": 2.100255668720399, + "grad_norm": 0.09788705115293139, + "kl": 0.50946044921875, + "learning_rate": 4.948487801668574e-07, + "loss": 0.0005, + "reward": 1.7785715013742447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 12526 + }, + { + "completion_length": 254.4241180419922, + "epoch": 2.1005909719602665, + "grad_norm": 0.4143288036401865, + "kl": 0.23956298828125, + "learning_rate": 4.948463573005384e-07, + "loss": 0.0002, + "reward": 1.7933036237955093, + "reward_std": 0.05997780663892627, + "rewards/equation_reward_func": 0.7991071790456772, + "rewards/format_reward_func": 0.9941964335739613, + "step": 12528 + }, + { + "completion_length": 253.6160831451416, + "epoch": 2.100926275200134, + "grad_norm": 0.2037976901278312, + "kl": 0.17803955078125, + "learning_rate": 4.948439338704925e-07, + "loss": 0.0002, + "reward": 1.7428572103381157, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7517857495695353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12530 + }, + { + "completion_length": 254.9375114440918, + "epoch": 2.1012615784400017, + "grad_norm": 0.1723439343144399, + "kl": 0.4744873046875, + "learning_rate": 4.94841509876725e-07, + "loss": 0.0005, + "reward": 1.7910715118050575, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357320606709, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12532 + }, + { + "completion_length": 255.24108505249023, + "epoch": 2.1015968816798694, + "grad_norm": 0.555953331373012, + "kl": 0.710052490234375, + "learning_rate": 4.948390853192415e-07, + "loss": 0.0007, + "reward": 1.7714286297559738, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12534 + }, + { + "completion_length": 257.3660840988159, + "epoch": 2.101932184919737, + "grad_norm": 0.2409187824669436, + "kl": 0.139373779296875, + "learning_rate": 4.948366601980479e-07, + "loss": 0.0001, + "reward": 1.805357202887535, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214566707611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12536 + }, + { + "completion_length": 257.7991199493408, + "epoch": 2.102267488159604, + "grad_norm": 0.2669260915173354, + "kl": 0.519439697265625, + "learning_rate": 4.948342345131492e-07, + "loss": 0.0005, + "reward": 1.7803571969270706, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214767873287, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12538 + }, + { + "completion_length": 248.3526906967163, + "epoch": 2.102602791399472, + "grad_norm": 0.24818006289820269, + "kl": 0.297607421875, + "learning_rate": 4.948318082645515e-07, + "loss": 0.0003, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 12540 + }, + { + "completion_length": 247.46429920196533, + "epoch": 2.1029380946393395, + "grad_norm": 0.16690281788865083, + "kl": 0.3175048828125, + "learning_rate": 4.948293814522602e-07, + "loss": 0.0003, + "reward": 1.787500038743019, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.8008929006755352, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12542 + }, + { + "completion_length": 249.2991180419922, + "epoch": 2.103273397879207, + "grad_norm": 0.3490837667156745, + "kl": 0.246368408203125, + "learning_rate": 4.948269540762809e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 12544 + }, + { + "completion_length": 254.8884038925171, + "epoch": 2.1036087011190747, + "grad_norm": 0.13535564372346423, + "kl": 0.2268829345703125, + "learning_rate": 4.94824526136619e-07, + "loss": 0.0002, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12546 + }, + { + "completion_length": 262.5759038925171, + "epoch": 2.103944004358942, + "grad_norm": 0.17458894142526152, + "kl": 0.57427978515625, + "learning_rate": 4.948220976332804e-07, + "loss": 0.0006, + "reward": 1.8000000715255737, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 12548 + }, + { + "completion_length": 257.714298248291, + "epoch": 2.1042793075988095, + "grad_norm": 0.309491138362646, + "kl": 0.987518310546875, + "learning_rate": 4.948196685662705e-07, + "loss": 0.001, + "reward": 1.7714286521077156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 12550 + }, + { + "completion_length": 256.870548248291, + "epoch": 2.104614610838677, + "grad_norm": 0.1550566234114588, + "kl": 0.277984619140625, + "learning_rate": 4.948172389355951e-07, + "loss": 0.0003, + "reward": 1.778571493923664, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 12552 + }, + { + "completion_length": 251.26787185668945, + "epoch": 2.104949914078545, + "grad_norm": 0.16131576931507266, + "kl": 0.341156005859375, + "learning_rate": 4.948148087412594e-07, + "loss": 0.0003, + "reward": 1.7928572073578835, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.8017857372760773, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12554 + }, + { + "completion_length": 250.04911994934082, + "epoch": 2.1052852173184124, + "grad_norm": 0.21502764088900536, + "kl": 0.14691162109375, + "learning_rate": 4.948123779832694e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928991854191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12556 + }, + { + "completion_length": 282.6250104904175, + "epoch": 2.10562052055828, + "grad_norm": 0.3334944368615149, + "kl": 1.33514404296875, + "learning_rate": 4.948099466616307e-07, + "loss": 0.0013, + "reward": 1.750000037252903, + "reward_std": 0.08081220276653767, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 0.9821428656578064, + "step": 12558 + }, + { + "completion_length": 262.6875114440918, + "epoch": 2.1059558237981473, + "grad_norm": 0.0697285926398769, + "kl": 0.35528564453125, + "learning_rate": 4.948075147763484e-07, + "loss": 0.0004, + "reward": 1.7767857760190964, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7901786081492901, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12560 + }, + { + "completion_length": 270.2767963409424, + "epoch": 2.106291127038015, + "grad_norm": 0.18152372568138508, + "kl": 0.301727294921875, + "learning_rate": 4.948050823274287e-07, + "loss": 0.0003, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12562 + }, + { + "completion_length": 267.56251335144043, + "epoch": 2.1066264302778825, + "grad_norm": 0.18210512754892727, + "kl": 0.208160400390625, + "learning_rate": 4.948026493148769e-07, + "loss": 0.0002, + "reward": 1.8107143566012383, + "reward_std": 0.07576144021004438, + "rewards/equation_reward_func": 0.8196428790688515, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12564 + }, + { + "completion_length": 267.49554538726807, + "epoch": 2.10696173351775, + "grad_norm": 0.654924979260132, + "kl": 0.542205810546875, + "learning_rate": 4.948002157386984e-07, + "loss": 0.0005, + "reward": 1.7785714864730835, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7875000312924385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12566 + }, + { + "completion_length": 265.4642963409424, + "epoch": 2.107297036757618, + "grad_norm": 0.10704312735958628, + "kl": 0.801483154296875, + "learning_rate": 4.947977815988992e-07, + "loss": 0.0008, + "reward": 1.7446429431438446, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12568 + }, + { + "completion_length": 278.745548248291, + "epoch": 2.107632339997485, + "grad_norm": 0.1477785650076975, + "kl": 0.212066650390625, + "learning_rate": 4.947953468954848e-07, + "loss": 0.0002, + "reward": 1.7196429148316383, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.977678582072258, + "step": 12570 + }, + { + "completion_length": 269.9241199493408, + "epoch": 2.1079676432373526, + "grad_norm": 0.2375696627186269, + "kl": 0.45843505859375, + "learning_rate": 4.947929116284606e-07, + "loss": 0.0005, + "reward": 1.7196429073810577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12572 + }, + { + "completion_length": 263.5312604904175, + "epoch": 2.1083029464772203, + "grad_norm": 0.15276034133103347, + "kl": 0.269683837890625, + "learning_rate": 4.947904757978325e-07, + "loss": 0.0003, + "reward": 1.7464286610484123, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 12574 + }, + { + "completion_length": 253.84375858306885, + "epoch": 2.108638249717088, + "grad_norm": 0.19924283727198752, + "kl": 0.11798095703125, + "learning_rate": 4.947880394036058e-07, + "loss": 0.0001, + "reward": 1.7571428939700127, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7660714443773031, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12576 + }, + { + "completion_length": 266.6339387893677, + "epoch": 2.1089735529569555, + "grad_norm": 0.5955246532748683, + "kl": 0.442657470703125, + "learning_rate": 4.947856024457865e-07, + "loss": 0.0004, + "reward": 1.7410714998841286, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357443541288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12578 + }, + { + "completion_length": 255.42411708831787, + "epoch": 2.109308856196823, + "grad_norm": 0.1900532254599884, + "kl": 0.520721435546875, + "learning_rate": 4.947831649243798e-07, + "loss": 0.0005, + "reward": 1.8107143193483353, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8196428939700127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12580 + }, + { + "completion_length": 269.7991189956665, + "epoch": 2.1096441594366904, + "grad_norm": 0.31313371529861506, + "kl": 0.828125, + "learning_rate": 4.947807268393915e-07, + "loss": 0.0008, + "reward": 1.7093750685453415, + "reward_std": 0.06755395070649683, + "rewards/equation_reward_func": 0.7196428924798965, + "rewards/format_reward_func": 0.9897321499884129, + "step": 12582 + }, + { + "completion_length": 256.04465198516846, + "epoch": 2.109979462676558, + "grad_norm": 0.2102414212533989, + "kl": 1.605743408203125, + "learning_rate": 4.947782881908273e-07, + "loss": 0.0016, + "reward": 1.8017857819795609, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12584 + }, + { + "completion_length": 251.70536613464355, + "epoch": 2.1103147659164256, + "grad_norm": 0.13884928172541447, + "kl": 0.314422607421875, + "learning_rate": 4.947758489786927e-07, + "loss": 0.0003, + "reward": 1.7464286386966705, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.755357176065445, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12586 + }, + { + "completion_length": 254.7366180419922, + "epoch": 2.1106500691562933, + "grad_norm": 0.19675570563541028, + "kl": 0.25933837890625, + "learning_rate": 4.947734092029934e-07, + "loss": 0.0003, + "reward": 1.714285783469677, + "reward_std": 0.09091372787952423, + "rewards/equation_reward_func": 0.7232143357396126, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12588 + }, + { + "completion_length": 251.1428680419922, + "epoch": 2.110985372396161, + "grad_norm": 0.24465264663289335, + "kl": 0.3800048828125, + "learning_rate": 4.947709688637348e-07, + "loss": 0.0004, + "reward": 1.7107143849134445, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143178582191, + "rewards/format_reward_func": 1.0, + "step": 12590 + }, + { + "completion_length": 250.22322463989258, + "epoch": 2.1113206756360285, + "grad_norm": 0.18432661633018005, + "kl": 0.72412109375, + "learning_rate": 4.947685279609228e-07, + "loss": 0.0007, + "reward": 1.775000087916851, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 12592 + }, + { + "completion_length": 253.7857265472412, + "epoch": 2.1116559788758957, + "grad_norm": 0.2395700514182744, + "kl": 0.30462646484375, + "learning_rate": 4.947660864945629e-07, + "loss": 0.0003, + "reward": 1.7535715028643608, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 12594 + }, + { + "completion_length": 243.6473331451416, + "epoch": 2.1119912821157634, + "grad_norm": 0.4434999353814348, + "kl": 0.285491943359375, + "learning_rate": 4.947636444646605e-07, + "loss": 0.0003, + "reward": 1.7285715267062187, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 12596 + }, + { + "completion_length": 253.28126430511475, + "epoch": 2.112326585355631, + "grad_norm": 0.20279855857111112, + "kl": 0.1322021484375, + "learning_rate": 4.947612018712218e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 12598 + }, + { + "completion_length": 249.29465770721436, + "epoch": 2.1126618885954986, + "grad_norm": 0.28162706638188384, + "kl": 0.419097900390625, + "learning_rate": 4.947587587142518e-07, + "loss": 0.0004, + "reward": 1.7285715192556381, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 12600 + }, + { + "completion_length": 249.19643878936768, + "epoch": 2.1129971918353663, + "grad_norm": 0.18187601895711714, + "kl": 0.145660400390625, + "learning_rate": 4.947563149937565e-07, + "loss": 0.0001, + "reward": 1.8200893625617027, + "reward_std": 0.05240166140720248, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 0.9986607171595097, + "step": 12602 + }, + { + "completion_length": 243.66965579986572, + "epoch": 2.1133324950752335, + "grad_norm": 0.23567424608565962, + "kl": 0.263092041015625, + "learning_rate": 4.947538707097413e-07, + "loss": 0.0003, + "reward": 1.8250000774860382, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 12604 + }, + { + "completion_length": 246.23215579986572, + "epoch": 2.113667798315101, + "grad_norm": 0.26916544402526654, + "kl": 0.621246337890625, + "learning_rate": 4.94751425862212e-07, + "loss": 0.0006, + "reward": 1.7607143446803093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143223285675, + "rewards/format_reward_func": 1.0, + "step": 12606 + }, + { + "completion_length": 238.81697463989258, + "epoch": 2.1140031015549687, + "grad_norm": 0.18268104728073659, + "kl": 0.4063720703125, + "learning_rate": 4.947489804511742e-07, + "loss": 0.0004, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 12608 + }, + { + "completion_length": 251.133939743042, + "epoch": 2.1143384047948364, + "grad_norm": 0.17624304213752912, + "kl": 0.40753173828125, + "learning_rate": 4.947465344766335e-07, + "loss": 0.0004, + "reward": 1.7535715103149414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714451223612, + "rewards/format_reward_func": 1.0, + "step": 12610 + }, + { + "completion_length": 248.90625762939453, + "epoch": 2.114673708034704, + "grad_norm": 0.28952980847030996, + "kl": 0.224822998046875, + "learning_rate": 4.947440879385955e-07, + "loss": 0.0002, + "reward": 1.7214286476373672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 12612 + }, + { + "completion_length": 231.34822463989258, + "epoch": 2.1150090112745716, + "grad_norm": 0.39774063211490374, + "kl": 0.192047119140625, + "learning_rate": 4.947416408370659e-07, + "loss": 0.0002, + "reward": 1.8589285984635353, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8633928783237934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12614 + }, + { + "completion_length": 242.60268783569336, + "epoch": 2.115344314514439, + "grad_norm": 0.24908888175240032, + "kl": 0.17919921875, + "learning_rate": 4.947391931720503e-07, + "loss": 0.0002, + "reward": 1.7357143461704254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143182307482, + "rewards/format_reward_func": 1.0, + "step": 12616 + }, + { + "completion_length": 253.0669755935669, + "epoch": 2.1156796177543065, + "grad_norm": 0.19669393215519845, + "kl": 0.18682861328125, + "learning_rate": 4.947367449435542e-07, + "loss": 0.0002, + "reward": 1.7875000536441803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643111526966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12618 + }, + { + "completion_length": 242.2232255935669, + "epoch": 2.116014920994174, + "grad_norm": 0.1812298854708925, + "kl": 0.133697509765625, + "learning_rate": 4.947342961515835e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 12620 + }, + { + "completion_length": 245.42858505249023, + "epoch": 2.1163502242340417, + "grad_norm": 0.6592117291790457, + "kl": 0.193939208984375, + "learning_rate": 4.947318467961437e-07, + "loss": 0.0002, + "reward": 1.7946429029107094, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12622 + }, + { + "completion_length": 254.62947750091553, + "epoch": 2.1166855274739094, + "grad_norm": 0.1700151949627714, + "kl": 0.1432952880859375, + "learning_rate": 4.947293968772405e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 12624 + }, + { + "completion_length": 242.79911708831787, + "epoch": 2.1170208307137766, + "grad_norm": 0.17733056326423086, + "kl": 0.13336181640625, + "learning_rate": 4.947269463948795e-07, + "loss": 0.0001, + "reward": 1.7964286133646965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286170899868, + "rewards/format_reward_func": 1.0, + "step": 12626 + }, + { + "completion_length": 252.6250114440918, + "epoch": 2.117356133953644, + "grad_norm": 0.1922726528758622, + "kl": 0.155487060546875, + "learning_rate": 4.947244953490662e-07, + "loss": 0.0002, + "reward": 1.760714367032051, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.760714340955019, + "rewards/format_reward_func": 1.0, + "step": 12628 + }, + { + "completion_length": 249.55804920196533, + "epoch": 2.117691437193512, + "grad_norm": 0.24193949809038998, + "kl": 0.129150390625, + "learning_rate": 4.947220437398064e-07, + "loss": 0.0001, + "reward": 1.7750000357627869, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000413507223, + "rewards/format_reward_func": 1.0, + "step": 12630 + }, + { + "completion_length": 243.89286708831787, + "epoch": 2.1180267404333795, + "grad_norm": 0.1576165068164449, + "kl": 0.19390869140625, + "learning_rate": 4.947195915671058e-07, + "loss": 0.0002, + "reward": 1.8035714775323868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714738070965, + "rewards/format_reward_func": 1.0, + "step": 12632 + }, + { + "completion_length": 256.64733505249023, + "epoch": 2.118362043673247, + "grad_norm": 0.10762529730307481, + "kl": 0.192108154296875, + "learning_rate": 4.947171388309699e-07, + "loss": 0.0002, + "reward": 1.8285714536905289, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8285714499652386, + "rewards/format_reward_func": 1.0, + "step": 12634 + }, + { + "completion_length": 255.4642972946167, + "epoch": 2.1186973469131147, + "grad_norm": 0.1776726750654768, + "kl": 0.19140625, + "learning_rate": 4.947146855314045e-07, + "loss": 0.0002, + "reward": 1.721428669989109, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214286103844643, + "rewards/format_reward_func": 1.0, + "step": 12636 + }, + { + "completion_length": 249.75893878936768, + "epoch": 2.119032650152982, + "grad_norm": 0.19551152845046466, + "kl": 0.137237548828125, + "learning_rate": 4.947122316684151e-07, + "loss": 0.0001, + "reward": 1.8178572058677673, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 12638 + }, + { + "completion_length": 263.6919755935669, + "epoch": 2.1193679533928496, + "grad_norm": 0.16631205617192682, + "kl": 0.1456298828125, + "learning_rate": 4.947097772420074e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 12640 + }, + { + "completion_length": 262.1205463409424, + "epoch": 2.119703256632717, + "grad_norm": 0.25879339095547543, + "kl": 0.1573333740234375, + "learning_rate": 4.94707322252187e-07, + "loss": 0.0002, + "reward": 1.7571429461240768, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 12642 + }, + { + "completion_length": 265.4285840988159, + "epoch": 2.120038559872585, + "grad_norm": 0.008186147312404166, + "kl": 0.17401123046875, + "learning_rate": 4.947048666989597e-07, + "loss": 0.0002, + "reward": 1.7107143551111221, + "reward_std": 0.025253813713788986, + "rewards/equation_reward_func": 0.7196428868919611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12644 + }, + { + "completion_length": 255.92858600616455, + "epoch": 2.1203738631124525, + "grad_norm": 0.1079018393352671, + "kl": 0.259307861328125, + "learning_rate": 4.947024105823311e-07, + "loss": 0.0003, + "reward": 1.769642896950245, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7830357346683741, + "rewards/format_reward_func": 0.9866071492433548, + "step": 12646 + }, + { + "completion_length": 261.4285840988159, + "epoch": 2.12070916635232, + "grad_norm": 0.16324194777430936, + "kl": 0.167999267578125, + "learning_rate": 4.946999539023068e-07, + "loss": 0.0002, + "reward": 1.7410715147852898, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.745535746216774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12648 + }, + { + "completion_length": 266.54465770721436, + "epoch": 2.1210444695921873, + "grad_norm": 0.24910040332383881, + "kl": 0.21734619140625, + "learning_rate": 4.946974966588924e-07, + "loss": 0.0002, + "reward": 1.7464286237955093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285958558321, + "rewards/format_reward_func": 1.0, + "step": 12650 + }, + { + "completion_length": 273.5401954650879, + "epoch": 2.121379772832055, + "grad_norm": 0.24830042928049242, + "kl": 0.1650390625, + "learning_rate": 4.946950388520938e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7392857521772385, + "rewards/format_reward_func": 1.0, + "step": 12652 + }, + { + "completion_length": 259.7500104904175, + "epoch": 2.1217150760719226, + "grad_norm": 0.1759505675423626, + "kl": 0.13311767578125, + "learning_rate": 4.946925804819163e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000141561031, + "rewards/format_reward_func": 1.0, + "step": 12654 + }, + { + "completion_length": 263.8080463409424, + "epoch": 2.12205037931179, + "grad_norm": 0.3763846045538354, + "kl": 0.1600341796875, + "learning_rate": 4.946901215483659e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.08586296532303095, + "rewards/equation_reward_func": 0.7625000216066837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12656 + }, + { + "completion_length": 262.22768688201904, + "epoch": 2.122385682551658, + "grad_norm": 0.09794541085463028, + "kl": 0.1695556640625, + "learning_rate": 4.94687662051448e-07, + "loss": 0.0002, + "reward": 1.7892857566475868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 12658 + }, + { + "completion_length": 261.977689743042, + "epoch": 2.122720985791525, + "grad_norm": 0.21298906750501434, + "kl": 0.194488525390625, + "learning_rate": 4.946852019911684e-07, + "loss": 0.0002, + "reward": 1.7750000357627869, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000301748514, + "rewards/format_reward_func": 1.0, + "step": 12660 + }, + { + "completion_length": 253.3482255935669, + "epoch": 2.1230562890313927, + "grad_norm": 0.1329742623791657, + "kl": 0.120574951171875, + "learning_rate": 4.946827413675328e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 12662 + }, + { + "completion_length": 256.8526906967163, + "epoch": 2.1233915922712603, + "grad_norm": 0.27845250518816944, + "kl": 0.246185302734375, + "learning_rate": 4.946802801805467e-07, + "loss": 0.0002, + "reward": 1.7392857819795609, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857410013676, + "rewards/format_reward_func": 1.0, + "step": 12664 + }, + { + "completion_length": 247.48215293884277, + "epoch": 2.123726895511128, + "grad_norm": 0.21240464239704232, + "kl": 0.2212371826171875, + "learning_rate": 4.946778184302161e-07, + "loss": 0.0002, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 12666 + }, + { + "completion_length": 255.99554634094238, + "epoch": 2.1240621987509956, + "grad_norm": 0.2736472912552943, + "kl": 0.261810302734375, + "learning_rate": 4.946753561165462e-07, + "loss": 0.0003, + "reward": 1.7482143640518188, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12668 + }, + { + "completion_length": 255.18751049041748, + "epoch": 2.124397501990863, + "grad_norm": 0.14204252196879363, + "kl": 0.22735595703125, + "learning_rate": 4.946728932395431e-07, + "loss": 0.0002, + "reward": 1.716071493923664, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7205357402563095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12670 + }, + { + "completion_length": 244.4464406967163, + "epoch": 2.1247328052307304, + "grad_norm": 0.44296381897565446, + "kl": 0.254974365234375, + "learning_rate": 4.946704297992121e-07, + "loss": 0.0003, + "reward": 1.8035714998841286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 12672 + }, + { + "completion_length": 250.7544765472412, + "epoch": 2.125068108470598, + "grad_norm": 0.1577363318076351, + "kl": 0.217864990234375, + "learning_rate": 4.946679657955591e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857268452644, + "rewards/format_reward_func": 1.0, + "step": 12674 + }, + { + "completion_length": 248.09376049041748, + "epoch": 2.1254034117104657, + "grad_norm": 0.14258669660016726, + "kl": 0.170196533203125, + "learning_rate": 4.946655012285898e-07, + "loss": 0.0002, + "reward": 1.7589286416769028, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12676 + }, + { + "completion_length": 243.52233123779297, + "epoch": 2.1257387149503333, + "grad_norm": 0.23125273450125225, + "kl": 0.18536376953125, + "learning_rate": 4.946630360983098e-07, + "loss": 0.0002, + "reward": 1.7379465028643608, + "reward_std": 0.0473508988507092, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 0.9986607171595097, + "step": 12678 + }, + { + "completion_length": 251.3750114440918, + "epoch": 2.126074018190201, + "grad_norm": 0.18106699184120606, + "kl": 0.211212158203125, + "learning_rate": 4.946605704047247e-07, + "loss": 0.0002, + "reward": 1.753571480512619, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 12680 + }, + { + "completion_length": 246.27679538726807, + "epoch": 2.126409321430068, + "grad_norm": 0.6107476265404755, + "kl": 0.19964599609375, + "learning_rate": 4.946581041478404e-07, + "loss": 0.0002, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 12682 + }, + { + "completion_length": 250.76340675354004, + "epoch": 2.1267446246699357, + "grad_norm": 0.19478372513176684, + "kl": 0.208770751953125, + "learning_rate": 4.946556373276622e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714711993933, + "rewards/format_reward_func": 1.0, + "step": 12684 + }, + { + "completion_length": 248.64733123779297, + "epoch": 2.1270799279098034, + "grad_norm": 0.0650269589634784, + "kl": 0.2137451171875, + "learning_rate": 4.946531699441963e-07, + "loss": 0.0002, + "reward": 1.6892857998609543, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.6892857532948256, + "rewards/format_reward_func": 1.0, + "step": 12686 + }, + { + "completion_length": 248.41519165039062, + "epoch": 2.127415231149671, + "grad_norm": 0.06310082996410975, + "kl": 0.1319580078125, + "learning_rate": 4.946507019974479e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 12688 + }, + { + "completion_length": 251.1741189956665, + "epoch": 2.1277505343895387, + "grad_norm": 0.15660504026504873, + "kl": 0.162445068359375, + "learning_rate": 4.946482334874229e-07, + "loss": 0.0002, + "reward": 1.8142857626080513, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 12690 + }, + { + "completion_length": 240.34822463989258, + "epoch": 2.1280858376294063, + "grad_norm": 0.3681435884554297, + "kl": 0.135772705078125, + "learning_rate": 4.94645764414127e-07, + "loss": 0.0001, + "reward": 1.8151786178350449, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12692 + }, + { + "completion_length": 234.3750114440918, + "epoch": 2.1284211408692735, + "grad_norm": 0.0995885339917896, + "kl": 0.128265380859375, + "learning_rate": 4.946432947775657e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714570432901, + "rewards/format_reward_func": 1.0, + "step": 12694 + }, + { + "completion_length": 245.80804538726807, + "epoch": 2.128756444109141, + "grad_norm": 0.5983936194365209, + "kl": 0.15484619140625, + "learning_rate": 4.946408245777449e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7660714499652386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12696 + }, + { + "completion_length": 240.03572368621826, + "epoch": 2.1290917473490087, + "grad_norm": 0.21612487513493533, + "kl": 0.141876220703125, + "learning_rate": 4.946383538146704e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714510828257, + "rewards/format_reward_func": 1.0, + "step": 12698 + }, + { + "completion_length": 243.2544755935669, + "epoch": 2.1294270505888764, + "grad_norm": 0.15909532838737647, + "kl": 0.151611328125, + "learning_rate": 4.946358824883476e-07, + "loss": 0.0002, + "reward": 1.8178571686148643, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 12700 + }, + { + "completion_length": 236.58036708831787, + "epoch": 2.129762353828744, + "grad_norm": 0.1954038164117403, + "kl": 0.157379150390625, + "learning_rate": 4.946334105987822e-07, + "loss": 0.0002, + "reward": 1.7214286401867867, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286029338837, + "rewards/format_reward_func": 1.0, + "step": 12702 + }, + { + "completion_length": 237.7410831451416, + "epoch": 2.130097657068611, + "grad_norm": 0.15764717852820267, + "kl": 0.182037353515625, + "learning_rate": 4.9463093814598e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 1.0, + "step": 12704 + }, + { + "completion_length": 241.12501049041748, + "epoch": 2.130432960308479, + "grad_norm": 0.1504773466200583, + "kl": 0.121063232421875, + "learning_rate": 4.946284651299467e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 12706 + }, + { + "completion_length": 237.4419765472412, + "epoch": 2.1307682635483465, + "grad_norm": 0.11417352849954919, + "kl": 0.2191009521484375, + "learning_rate": 4.946259915506879e-07, + "loss": 0.0002, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857566475868, + "rewards/format_reward_func": 1.0, + "step": 12708 + }, + { + "completion_length": 238.57590579986572, + "epoch": 2.131103566788214, + "grad_norm": 0.5617548101285669, + "kl": 0.231964111328125, + "learning_rate": 4.946235174082096e-07, + "loss": 0.0002, + "reward": 1.7808036282658577, + "reward_std": 0.03724937466904521, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 0.9986607171595097, + "step": 12710 + }, + { + "completion_length": 240.80358219146729, + "epoch": 2.1314388700280817, + "grad_norm": 0.23236516414959607, + "kl": 0.173370361328125, + "learning_rate": 4.94621042702517e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000111758709, + "rewards/format_reward_func": 1.0, + "step": 12712 + }, + { + "completion_length": 237.41072463989258, + "epoch": 2.1317741732679494, + "grad_norm": 0.24766922153293103, + "kl": 0.2633056640625, + "learning_rate": 4.946185674336163e-07, + "loss": 0.0003, + "reward": 1.7607143372297287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 12714 + }, + { + "completion_length": 248.7410831451416, + "epoch": 2.1321094765078166, + "grad_norm": 0.2851471685393492, + "kl": 0.199249267578125, + "learning_rate": 4.946160916015128e-07, + "loss": 0.0002, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.767857164144516, + "rewards/format_reward_func": 1.0, + "step": 12716 + }, + { + "completion_length": 235.63393878936768, + "epoch": 2.132444779747684, + "grad_norm": 0.24856425416445746, + "kl": 0.212646484375, + "learning_rate": 4.946136152062123e-07, + "loss": 0.0002, + "reward": 1.74508935213089, + "reward_std": 0.03346130205318332, + "rewards/equation_reward_func": 0.7464286088943481, + "rewards/format_reward_func": 0.9986607171595097, + "step": 12718 + }, + { + "completion_length": 246.4062614440918, + "epoch": 2.132780082987552, + "grad_norm": 0.1818853615657871, + "kl": 0.207855224609375, + "learning_rate": 4.946111382477207e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857637256384, + "rewards/format_reward_func": 1.0, + "step": 12720 + }, + { + "completion_length": 236.11608028411865, + "epoch": 2.1331153862274195, + "grad_norm": 0.2388736773932256, + "kl": 0.304229736328125, + "learning_rate": 4.946086607260436e-07, + "loss": 0.0003, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 12722 + }, + { + "completion_length": 234.6384038925171, + "epoch": 2.133450689467287, + "grad_norm": 0.1481105248262793, + "kl": 0.19354248046875, + "learning_rate": 4.946061826411866e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 12724 + }, + { + "completion_length": 236.2589406967163, + "epoch": 2.1337859927071543, + "grad_norm": 0.19108891475425632, + "kl": 0.15118408203125, + "learning_rate": 4.946037039931554e-07, + "loss": 0.0002, + "reward": 1.787500075995922, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12726 + }, + { + "completion_length": 232.9821538925171, + "epoch": 2.134121295947022, + "grad_norm": 0.1379382492406183, + "kl": 0.180023193359375, + "learning_rate": 4.946012247819559e-07, + "loss": 0.0002, + "reward": 1.74642863124609, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 12728 + }, + { + "completion_length": 238.2142972946167, + "epoch": 2.1344565991868896, + "grad_norm": 0.42170481009249156, + "kl": 0.398681640625, + "learning_rate": 4.945987450075936e-07, + "loss": 0.0004, + "reward": 1.7928571999073029, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 12730 + }, + { + "completion_length": 240.73661708831787, + "epoch": 2.134791902426757, + "grad_norm": 0.13332983942445129, + "kl": 0.1611328125, + "learning_rate": 4.945962646700744e-07, + "loss": 0.0002, + "reward": 1.7142857909202576, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857555299997, + "rewards/format_reward_func": 1.0, + "step": 12732 + }, + { + "completion_length": 230.14732933044434, + "epoch": 2.135127205666625, + "grad_norm": 0.14906506873937503, + "kl": 0.3936309814453125, + "learning_rate": 4.945937837694039e-07, + "loss": 0.0004, + "reward": 1.805357202887535, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.8098214641213417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12734 + }, + { + "completion_length": 242.59822177886963, + "epoch": 2.1354625089064925, + "grad_norm": 0.09027486073294985, + "kl": 0.150177001953125, + "learning_rate": 4.945913023055877e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7214286178350449, + "rewards/format_reward_func": 1.0, + "step": 12736 + }, + { + "completion_length": 246.93750953674316, + "epoch": 2.1357978121463597, + "grad_norm": 0.5243776630881638, + "kl": 0.211212158203125, + "learning_rate": 4.945888202786317e-07, + "loss": 0.0002, + "reward": 1.760714329779148, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143335044384, + "rewards/format_reward_func": 1.0, + "step": 12738 + }, + { + "completion_length": 242.2634048461914, + "epoch": 2.1361331153862273, + "grad_norm": 0.29612204522098395, + "kl": 0.139556884765625, + "learning_rate": 4.945863376885415e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 12740 + }, + { + "completion_length": 239.7812614440918, + "epoch": 2.136468418626095, + "grad_norm": 0.24427987178665603, + "kl": 0.1544189453125, + "learning_rate": 4.94583854535323e-07, + "loss": 0.0002, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7741071581840515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12742 + }, + { + "completion_length": 234.2276906967163, + "epoch": 2.1368037218659626, + "grad_norm": 0.1073191925162693, + "kl": 0.1297607421875, + "learning_rate": 4.945813708189816e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 12744 + }, + { + "completion_length": 248.2634048461914, + "epoch": 2.13713902510583, + "grad_norm": 0.19538950089251983, + "kl": 0.14056396484375, + "learning_rate": 4.945788865395234e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7875000201165676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12746 + }, + { + "completion_length": 246.08929824829102, + "epoch": 2.137474328345698, + "grad_norm": 0.2802740460051694, + "kl": 0.148651123046875, + "learning_rate": 4.945764016969538e-07, + "loss": 0.0001, + "reward": 1.7196429148316383, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071872413158, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12748 + }, + { + "completion_length": 239.8750123977661, + "epoch": 2.137809631585565, + "grad_norm": 0.21579468742525323, + "kl": 0.1495361328125, + "learning_rate": 4.945739162912787e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 12750 + }, + { + "completion_length": 241.4375114440918, + "epoch": 2.1381449348254327, + "grad_norm": 0.1691322578412735, + "kl": 0.12725830078125, + "learning_rate": 4.945714303225037e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 12752 + }, + { + "completion_length": 232.0491180419922, + "epoch": 2.1384802380653003, + "grad_norm": 0.24020569233265468, + "kl": 0.12933349609375, + "learning_rate": 4.945689437906346e-07, + "loss": 0.0001, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 12754 + }, + { + "completion_length": 243.6875123977661, + "epoch": 2.138815541305168, + "grad_norm": 0.2156933055364058, + "kl": 0.146575927734375, + "learning_rate": 4.945664566956771e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 12756 + }, + { + "completion_length": 235.9553680419922, + "epoch": 2.1391508445450356, + "grad_norm": 0.23128610933191962, + "kl": 0.15655517578125, + "learning_rate": 4.94563969037637e-07, + "loss": 0.0002, + "reward": 1.710714340209961, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143141329288, + "rewards/format_reward_func": 1.0, + "step": 12758 + }, + { + "completion_length": 234.47768878936768, + "epoch": 2.139486147784903, + "grad_norm": 0.36352148754101415, + "kl": 0.148345947265625, + "learning_rate": 4.945614808165199e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.07576143834739923, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 12760 + }, + { + "completion_length": 244.48215293884277, + "epoch": 2.1398214510247704, + "grad_norm": 0.14300670479017372, + "kl": 0.15911865234375, + "learning_rate": 4.945589920323317e-07, + "loss": 0.0002, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 12762 + }, + { + "completion_length": 234.37054634094238, + "epoch": 2.140156754264638, + "grad_norm": 0.8349680304170968, + "kl": 0.28076171875, + "learning_rate": 4.945565026850779e-07, + "loss": 0.0003, + "reward": 1.7642857879400253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 12764 + }, + { + "completion_length": 241.1875114440918, + "epoch": 2.1404920575045057, + "grad_norm": 1.056639847178951, + "kl": 0.64544677734375, + "learning_rate": 4.945540127747643e-07, + "loss": 0.0006, + "reward": 1.7839286103844643, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7883928772062063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12766 + }, + { + "completion_length": 227.62501049041748, + "epoch": 2.1408273607443733, + "grad_norm": 0.1428928703085797, + "kl": 0.1385498046875, + "learning_rate": 4.945515223013969e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000156462193, + "rewards/format_reward_func": 1.0, + "step": 12768 + }, + { + "completion_length": 232.0491180419922, + "epoch": 2.141162663984241, + "grad_norm": 0.14921203769840066, + "kl": 0.152801513671875, + "learning_rate": 4.94549031264981e-07, + "loss": 0.0002, + "reward": 1.8250000551342964, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 12770 + }, + { + "completion_length": 233.75000953674316, + "epoch": 2.141497967224108, + "grad_norm": 0.2428657901809172, + "kl": 0.146759033203125, + "learning_rate": 4.945465396655227e-07, + "loss": 0.0001, + "reward": 1.8464286103844643, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8464285954833031, + "rewards/format_reward_func": 1.0, + "step": 12772 + }, + { + "completion_length": 239.32143878936768, + "epoch": 2.1418332704639758, + "grad_norm": 0.12902869004400336, + "kl": 0.138916015625, + "learning_rate": 4.945440475030276e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 12774 + }, + { + "completion_length": 232.8750114440918, + "epoch": 2.1421685737038434, + "grad_norm": 0.19235439708019675, + "kl": 0.1307373046875, + "learning_rate": 4.945415547775013e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 12776 + }, + { + "completion_length": 243.0669755935669, + "epoch": 2.142503876943711, + "grad_norm": 0.435288208029278, + "kl": 0.150848388671875, + "learning_rate": 4.945390614889499e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7839286029338837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 12778 + }, + { + "completion_length": 234.4062614440918, + "epoch": 2.1428391801835787, + "grad_norm": 0.23064924712066712, + "kl": 0.206634521484375, + "learning_rate": 4.945365676373787e-07, + "loss": 0.0002, + "reward": 1.7821429446339607, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 12780 + }, + { + "completion_length": 238.9285831451416, + "epoch": 2.1431744834234463, + "grad_norm": 0.1791756501286884, + "kl": 0.144989013671875, + "learning_rate": 4.945340732227938e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 12782 + }, + { + "completion_length": 244.47769165039062, + "epoch": 2.1435097866633135, + "grad_norm": 0.4171668278383133, + "kl": 0.13970947265625, + "learning_rate": 4.945315782452008e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571626543999, + "rewards/format_reward_func": 1.0, + "step": 12784 + }, + { + "completion_length": 236.4241189956665, + "epoch": 2.143845089903181, + "grad_norm": 0.19374842158996536, + "kl": 0.1378936767578125, + "learning_rate": 4.945290827046053e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143227010965, + "rewards/format_reward_func": 1.0, + "step": 12786 + }, + { + "completion_length": 229.32590198516846, + "epoch": 2.1441803931430488, + "grad_norm": 0.21744398788212368, + "kl": 0.120758056640625, + "learning_rate": 4.945265866010133e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 12788 + }, + { + "completion_length": 226.24108409881592, + "epoch": 2.1445156963829164, + "grad_norm": 0.27278636765832526, + "kl": 0.154998779296875, + "learning_rate": 4.945240899344304e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 12790 + }, + { + "completion_length": 234.80358219146729, + "epoch": 2.144850999622784, + "grad_norm": 0.3670106294704579, + "kl": 0.1717529296875, + "learning_rate": 4.945215927048623e-07, + "loss": 0.0002, + "reward": 1.728571504354477, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 12792 + }, + { + "completion_length": 238.80804824829102, + "epoch": 2.145186302862651, + "grad_norm": 0.34621680075467387, + "kl": 0.151611328125, + "learning_rate": 4.94519094912315e-07, + "loss": 0.0002, + "reward": 1.7857143580913544, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 12794 + }, + { + "completion_length": 234.86608028411865, + "epoch": 2.145521606102519, + "grad_norm": 0.35599495255633246, + "kl": 0.12451171875, + "learning_rate": 4.945165965567939e-07, + "loss": 0.0001, + "reward": 1.8035715073347092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8035714663565159, + "rewards/format_reward_func": 1.0, + "step": 12796 + }, + { + "completion_length": 233.17858123779297, + "epoch": 2.1458569093423865, + "grad_norm": 0.3200094677780826, + "kl": 0.1365966796875, + "learning_rate": 4.945140976383051e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12798 + }, + { + "completion_length": 233.56251049041748, + "epoch": 2.146192212582254, + "grad_norm": 0.4427323544427076, + "kl": 0.123443603515625, + "learning_rate": 4.94511598156854e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 12800 + }, + { + "completion_length": 226.57590579986572, + "epoch": 2.1465275158221218, + "grad_norm": 0.14726816307483223, + "kl": 0.128753662109375, + "learning_rate": 4.945090981124468e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 12802 + }, + { + "completion_length": 231.0535831451416, + "epoch": 2.1468628190619894, + "grad_norm": 0.29032372274327023, + "kl": 0.143157958984375, + "learning_rate": 4.945065975050888e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857544124126, + "rewards/format_reward_func": 1.0, + "step": 12804 + }, + { + "completion_length": 228.04911613464355, + "epoch": 2.1471981223018566, + "grad_norm": 0.12590136262960336, + "kl": 0.145751953125, + "learning_rate": 4.94504096334786e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428835391998, + "rewards/format_reward_func": 1.0, + "step": 12806 + }, + { + "completion_length": 238.69643783569336, + "epoch": 2.147533425541724, + "grad_norm": 0.30009913665500965, + "kl": 0.144683837890625, + "learning_rate": 4.945015946015442e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 1.0, + "step": 12808 + }, + { + "completion_length": 234.6294755935669, + "epoch": 2.147868728781592, + "grad_norm": 0.21758811899769895, + "kl": 0.137908935546875, + "learning_rate": 4.944990923053689e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 12810 + }, + { + "completion_length": 239.12054443359375, + "epoch": 2.1482040320214595, + "grad_norm": 0.27609212343688166, + "kl": 0.1478271484375, + "learning_rate": 4.944965894462662e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.785714304074645, + "rewards/format_reward_func": 1.0, + "step": 12812 + }, + { + "completion_length": 240.602689743042, + "epoch": 2.148539335261327, + "grad_norm": 0.21616130808787837, + "kl": 0.164459228515625, + "learning_rate": 4.944940860242416e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 12814 + }, + { + "completion_length": 237.48661994934082, + "epoch": 2.1488746385011943, + "grad_norm": 0.24664055919005196, + "kl": 0.142486572265625, + "learning_rate": 4.94491582039301e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000417232513, + "rewards/format_reward_func": 1.0, + "step": 12816 + }, + { + "completion_length": 244.68304634094238, + "epoch": 2.149209941741062, + "grad_norm": 0.2179678551592934, + "kl": 0.149383544921875, + "learning_rate": 4.944890774914502e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571671247482, + "rewards/format_reward_func": 1.0, + "step": 12818 + }, + { + "completion_length": 238.2366180419922, + "epoch": 2.1495452449809296, + "grad_norm": 0.10679201305907572, + "kl": 0.14337158203125, + "learning_rate": 4.944865723806948e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 12820 + }, + { + "completion_length": 233.70983123779297, + "epoch": 2.149880548220797, + "grad_norm": 0.3037664974555986, + "kl": 0.140838623046875, + "learning_rate": 4.944840667070406e-07, + "loss": 0.0001, + "reward": 1.8080357685685158, + "reward_std": 0.029041885398328304, + "rewards/equation_reward_func": 0.8098214641213417, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12822 + }, + { + "completion_length": 240.33929824829102, + "epoch": 2.150215851460665, + "grad_norm": 0.17097031750047306, + "kl": 0.132476806640625, + "learning_rate": 4.944815604704936e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 12824 + }, + { + "completion_length": 240.99108123779297, + "epoch": 2.1505511547005325, + "grad_norm": 0.2717364472056746, + "kl": 0.14019775390625, + "learning_rate": 4.944790536710592e-07, + "loss": 0.0001, + "reward": 1.7589286491274834, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7625000290572643, + "rewards/format_reward_func": 0.9964285716414452, + "step": 12826 + }, + { + "completion_length": 240.4062614440918, + "epoch": 2.1508864579403997, + "grad_norm": 0.06785661992573451, + "kl": 0.140716552734375, + "learning_rate": 4.944765463087435e-07, + "loss": 0.0001, + "reward": 1.8160714507102966, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.8205357566475868, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12828 + }, + { + "completion_length": 250.36608600616455, + "epoch": 2.1512217611802673, + "grad_norm": 0.1515955621735846, + "kl": 0.15386962890625, + "learning_rate": 4.944740383835521e-07, + "loss": 0.0002, + "reward": 1.7357143759727478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143256813288, + "rewards/format_reward_func": 1.0, + "step": 12830 + }, + { + "completion_length": 242.3928680419922, + "epoch": 2.151557064420135, + "grad_norm": 0.1461774077036663, + "kl": 0.129852294921875, + "learning_rate": 4.944715298954909e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 12832 + }, + { + "completion_length": 239.88393878936768, + "epoch": 2.1518923676600026, + "grad_norm": 0.11773547519069961, + "kl": 0.15191650390625, + "learning_rate": 4.944690208445656e-07, + "loss": 0.0002, + "reward": 1.8107143267989159, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 12834 + }, + { + "completion_length": 242.55358028411865, + "epoch": 2.15222767089987, + "grad_norm": 0.13150936149774745, + "kl": 0.14276123046875, + "learning_rate": 4.944665112307819e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428827941418, + "rewards/format_reward_func": 1.0, + "step": 12836 + }, + { + "completion_length": 247.94644260406494, + "epoch": 2.1525629741397374, + "grad_norm": 0.004567143634278605, + "kl": 0.137481689453125, + "learning_rate": 4.944640010541457e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 12838 + }, + { + "completion_length": 244.2544755935669, + "epoch": 2.152898277379605, + "grad_norm": 0.17917256243490093, + "kl": 0.1134490966796875, + "learning_rate": 4.944614903146626e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 12840 + }, + { + "completion_length": 239.4955472946167, + "epoch": 2.1532335806194727, + "grad_norm": 0.166648209325707, + "kl": 0.13653564453125, + "learning_rate": 4.944589790123387e-07, + "loss": 0.0001, + "reward": 1.8107143491506577, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 12842 + }, + { + "completion_length": 244.07590579986572, + "epoch": 2.1535688838593403, + "grad_norm": 0.2685607135597953, + "kl": 0.1331787109375, + "learning_rate": 4.944564671471794e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 12844 + }, + { + "completion_length": 246.5491189956665, + "epoch": 2.153904187099208, + "grad_norm": 0.13520245836311928, + "kl": 0.152587890625, + "learning_rate": 4.944539547191908e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 12846 + }, + { + "completion_length": 251.59822940826416, + "epoch": 2.1542394903390756, + "grad_norm": 0.20708427682175876, + "kl": 0.1339111328125, + "learning_rate": 4.944514417283786e-07, + "loss": 0.0001, + "reward": 1.6928572282195091, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.6928571816533804, + "rewards/format_reward_func": 1.0, + "step": 12848 + }, + { + "completion_length": 245.05804538726807, + "epoch": 2.1545747935789428, + "grad_norm": 0.13092118170338693, + "kl": 0.134552001953125, + "learning_rate": 4.944489281747485e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 12850 + }, + { + "completion_length": 238.1785831451416, + "epoch": 2.1549100968188104, + "grad_norm": 0.2020475938040691, + "kl": 0.13726806640625, + "learning_rate": 4.944464140583063e-07, + "loss": 0.0001, + "reward": 1.7616072222590446, + "reward_std": 0.054295698180794716, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12852 + }, + { + "completion_length": 248.54911613464355, + "epoch": 2.155245400058678, + "grad_norm": 0.26429877443982663, + "kl": 0.129852294921875, + "learning_rate": 4.944438993790578e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714637488127, + "rewards/format_reward_func": 1.0, + "step": 12854 + }, + { + "completion_length": 250.86608505249023, + "epoch": 2.1555807032985457, + "grad_norm": 0.18398285158750388, + "kl": 0.1419677734375, + "learning_rate": 4.944413841370088e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 12856 + }, + { + "completion_length": 244.50447368621826, + "epoch": 2.1559160065384133, + "grad_norm": 0.27150227309966846, + "kl": 0.137847900390625, + "learning_rate": 4.944388683321652e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7500000316649675, + "rewards/format_reward_func": 1.0, + "step": 12858 + }, + { + "completion_length": 237.32143878936768, + "epoch": 2.1562513097782805, + "grad_norm": 0.15710165366100282, + "kl": 0.136749267578125, + "learning_rate": 4.944363519645326e-07, + "loss": 0.0001, + "reward": 1.7178572192788124, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7178571745753288, + "rewards/format_reward_func": 1.0, + "step": 12860 + }, + { + "completion_length": 239.290189743042, + "epoch": 2.156586613018148, + "grad_norm": 0.15330536446957824, + "kl": 0.131866455078125, + "learning_rate": 4.944338350341169e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 12862 + }, + { + "completion_length": 237.24108409881592, + "epoch": 2.1569219162580158, + "grad_norm": 0.058822255525960855, + "kl": 0.142547607421875, + "learning_rate": 4.944313175409239e-07, + "loss": 0.0001, + "reward": 1.7928571850061417, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571850061417, + "rewards/format_reward_func": 1.0, + "step": 12864 + }, + { + "completion_length": 254.0759048461914, + "epoch": 2.1572572194978834, + "grad_norm": 0.20590961982409228, + "kl": 0.1185302734375, + "learning_rate": 4.944287994849594e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 12866 + }, + { + "completion_length": 237.5759038925171, + "epoch": 2.157592522737751, + "grad_norm": 0.23532305877840948, + "kl": 0.13934326171875, + "learning_rate": 4.944262808662292e-07, + "loss": 0.0001, + "reward": 1.717857226729393, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571838885546, + "rewards/format_reward_func": 1.0, + "step": 12868 + }, + { + "completion_length": 244.3482255935669, + "epoch": 2.1579278259776187, + "grad_norm": 0.19126064452006622, + "kl": 0.137176513671875, + "learning_rate": 4.94423761684739e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 1.0, + "step": 12870 + }, + { + "completion_length": 233.21876049041748, + "epoch": 2.158263129217486, + "grad_norm": 0.18566948164981376, + "kl": 0.13421630859375, + "learning_rate": 4.944212419404947e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 12872 + }, + { + "completion_length": 236.08036708831787, + "epoch": 2.1585984324573535, + "grad_norm": 0.2988504098040542, + "kl": 0.147857666015625, + "learning_rate": 4.944187216335021e-07, + "loss": 0.0001, + "reward": 1.836607187986374, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.8383928723633289, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12874 + }, + { + "completion_length": 240.16518783569336, + "epoch": 2.158933735697221, + "grad_norm": 0.17006143463215537, + "kl": 0.138519287109375, + "learning_rate": 4.94416200763767e-07, + "loss": 0.0001, + "reward": 1.7696429267525673, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7732143178582191, + "rewards/format_reward_func": 0.9964285790920258, + "step": 12876 + }, + { + "completion_length": 242.79465579986572, + "epoch": 2.1592690389370888, + "grad_norm": 0.24022367229575953, + "kl": 0.15008544921875, + "learning_rate": 4.944136793312952e-07, + "loss": 0.0002, + "reward": 1.7857143506407738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 12878 + }, + { + "completion_length": 237.74554824829102, + "epoch": 2.1596043421769564, + "grad_norm": 0.12534665865683378, + "kl": 0.155029296875, + "learning_rate": 4.944111573360924e-07, + "loss": 0.0002, + "reward": 1.7544643729925156, + "reward_std": 0.03409264795482159, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12880 + }, + { + "completion_length": 238.2053680419922, + "epoch": 2.159939645416824, + "grad_norm": 0.2235627000227893, + "kl": 0.134307861328125, + "learning_rate": 4.944086347781646e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 12882 + }, + { + "completion_length": 242.26340198516846, + "epoch": 2.1602749486566912, + "grad_norm": 0.21898008921483378, + "kl": 0.13482666015625, + "learning_rate": 4.944061116575173e-07, + "loss": 0.0001, + "reward": 1.7651786506175995, + "reward_std": 0.029041884932667017, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12884 + }, + { + "completion_length": 242.3035831451416, + "epoch": 2.160610251896559, + "grad_norm": 0.24976873447595765, + "kl": 0.139129638671875, + "learning_rate": 4.944035879741567e-07, + "loss": 0.0001, + "reward": 1.72946435213089, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.7312500327825546, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12886 + }, + { + "completion_length": 246.10268878936768, + "epoch": 2.1609455551364265, + "grad_norm": 0.35250302204529516, + "kl": 0.134246826171875, + "learning_rate": 4.944010637280884e-07, + "loss": 0.0001, + "reward": 1.7580357939004898, + "reward_std": 0.059346460737288, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12888 + }, + { + "completion_length": 238.5848331451416, + "epoch": 2.161280858376294, + "grad_norm": 0.1638936631207704, + "kl": 0.130126953125, + "learning_rate": 4.943985389193182e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 12890 + }, + { + "completion_length": 236.87947273254395, + "epoch": 2.1616161616161618, + "grad_norm": 0.117822403773612, + "kl": 0.1263427734375, + "learning_rate": 4.943960135478519e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 12892 + }, + { + "completion_length": 240.99108219146729, + "epoch": 2.1619514648560294, + "grad_norm": 0.4960007888798321, + "kl": 0.125946044921875, + "learning_rate": 4.943934876136955e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 12894 + }, + { + "completion_length": 240.76786994934082, + "epoch": 2.1622867680958966, + "grad_norm": 0.2388921461736386, + "kl": 0.147216796875, + "learning_rate": 4.943909611168546e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214285880327225, + "rewards/format_reward_func": 1.0, + "step": 12896 + }, + { + "completion_length": 235.0267972946167, + "epoch": 2.1626220713357642, + "grad_norm": 0.1771875945580934, + "kl": 0.130950927734375, + "learning_rate": 4.94388434057335e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 12898 + }, + { + "completion_length": 242.977689743042, + "epoch": 2.162957374575632, + "grad_norm": 0.11793385067854463, + "kl": 0.14813232421875, + "learning_rate": 4.943859064351426e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 12900 + }, + { + "completion_length": 236.16072273254395, + "epoch": 2.1632926778154995, + "grad_norm": 0.1048863379933829, + "kl": 0.1263427734375, + "learning_rate": 4.943833782502834e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 12902 + }, + { + "completion_length": 248.91965293884277, + "epoch": 2.163627981055367, + "grad_norm": 0.24007095785937319, + "kl": 0.14215087890625, + "learning_rate": 4.94380849502763e-07, + "loss": 0.0001, + "reward": 1.7267858013510704, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7312500439584255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12904 + }, + { + "completion_length": 243.36161708831787, + "epoch": 2.1639632842952343, + "grad_norm": 0.22506223143518816, + "kl": 0.137115478515625, + "learning_rate": 4.943783201925873e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 12906 + }, + { + "completion_length": 237.40626049041748, + "epoch": 2.164298587535102, + "grad_norm": 0.09530809311521778, + "kl": 0.143402099609375, + "learning_rate": 4.943757903197621e-07, + "loss": 0.0001, + "reward": 1.835714340209961, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8357142992317677, + "rewards/format_reward_func": 1.0, + "step": 12908 + }, + { + "completion_length": 242.81251049041748, + "epoch": 2.1646338907749696, + "grad_norm": 0.2760145257903106, + "kl": 0.160980224609375, + "learning_rate": 4.943732598842931e-07, + "loss": 0.0002, + "reward": 1.682142935693264, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.6821429096162319, + "rewards/format_reward_func": 1.0, + "step": 12910 + }, + { + "completion_length": 248.42858409881592, + "epoch": 2.1649691940148372, + "grad_norm": 0.17348253473498154, + "kl": 0.135223388671875, + "learning_rate": 4.943707288861864e-07, + "loss": 0.0001, + "reward": 1.7232143580913544, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276786025613546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12912 + }, + { + "completion_length": 246.4553680419922, + "epoch": 2.165304497254705, + "grad_norm": 0.187496772007708, + "kl": 0.165252685546875, + "learning_rate": 4.943681973254476e-07, + "loss": 0.0002, + "reward": 1.7321429252624512, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321429122239351, + "rewards/format_reward_func": 1.0, + "step": 12914 + }, + { + "completion_length": 236.97322368621826, + "epoch": 2.1656398004945725, + "grad_norm": 0.12459034225641151, + "kl": 0.131591796875, + "learning_rate": 4.943656652020825e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 12916 + }, + { + "completion_length": 238.65179443359375, + "epoch": 2.1659751037344397, + "grad_norm": 0.347977850319333, + "kl": 0.138275146484375, + "learning_rate": 4.943631325160971e-07, + "loss": 0.0001, + "reward": 1.7892857939004898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 12918 + }, + { + "completion_length": 235.53126049041748, + "epoch": 2.1663104069743073, + "grad_norm": 0.17829052465307105, + "kl": 0.133636474609375, + "learning_rate": 4.943605992674973e-07, + "loss": 0.0001, + "reward": 1.7000000849366188, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7000000402331352, + "rewards/format_reward_func": 1.0, + "step": 12920 + }, + { + "completion_length": 242.2455472946167, + "epoch": 2.166645710214175, + "grad_norm": 0.20660870590652766, + "kl": 0.1290283203125, + "learning_rate": 4.943580654562886e-07, + "loss": 0.0001, + "reward": 1.7558036521077156, + "reward_std": 0.07260471233166754, + "rewards/equation_reward_func": 0.7571428790688515, + "rewards/format_reward_func": 0.9986607171595097, + "step": 12922 + }, + { + "completion_length": 238.7053689956665, + "epoch": 2.1669810134540426, + "grad_norm": 0.10815500831875394, + "kl": 0.177398681640625, + "learning_rate": 4.943555310824772e-07, + "loss": 0.0002, + "reward": 1.7946429029107094, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7991071604192257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12924 + }, + { + "completion_length": 234.91518878936768, + "epoch": 2.1673163166939102, + "grad_norm": 0.2615412690593561, + "kl": 0.123870849609375, + "learning_rate": 4.943529961460688e-07, + "loss": 0.0001, + "reward": 1.804464340209961, + "reward_std": 0.03409264795482159, + "rewards/equation_reward_func": 0.8062500320374966, + "rewards/format_reward_func": 0.9982142895460129, + "step": 12926 + }, + { + "completion_length": 241.53125953674316, + "epoch": 2.1676516199337774, + "grad_norm": 0.23643354312928433, + "kl": 0.138916015625, + "learning_rate": 4.943504606470691e-07, + "loss": 0.0001, + "reward": 1.8392857611179352, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8392857238650322, + "rewards/format_reward_func": 1.0, + "step": 12928 + }, + { + "completion_length": 236.5223331451416, + "epoch": 2.167986923173645, + "grad_norm": 0.20165447625301341, + "kl": 0.13446044921875, + "learning_rate": 4.943479245854841e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7875000163912773, + "rewards/format_reward_func": 0.9964285790920258, + "step": 12930 + }, + { + "completion_length": 237.37947273254395, + "epoch": 2.1683222264135127, + "grad_norm": 0.2801653478125558, + "kl": 0.139373779296875, + "learning_rate": 4.943453879613194e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 12932 + }, + { + "completion_length": 235.91965293884277, + "epoch": 2.1686575296533803, + "grad_norm": 0.24652515854083196, + "kl": 0.11895751953125, + "learning_rate": 4.943428507745811e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000171363354, + "rewards/format_reward_func": 1.0, + "step": 12934 + }, + { + "completion_length": 228.6294765472412, + "epoch": 2.168992832893248, + "grad_norm": 0.5322413719457854, + "kl": 0.122833251953125, + "learning_rate": 4.94340313025275e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 12936 + }, + { + "completion_length": 233.21429634094238, + "epoch": 2.1693281361331156, + "grad_norm": 0.10032357589058459, + "kl": 0.120361328125, + "learning_rate": 4.94337774713407e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 12938 + }, + { + "completion_length": 236.88394165039062, + "epoch": 2.169663439372983, + "grad_norm": 0.14355426238650287, + "kl": 0.0960540771484375, + "learning_rate": 4.943352358389827e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 12940 + }, + { + "completion_length": 235.64733123779297, + "epoch": 2.1699987426128504, + "grad_norm": 0.25435027499296264, + "kl": 0.12823486328125, + "learning_rate": 4.943326964020083e-07, + "loss": 0.0001, + "reward": 1.8089286237955093, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.8133928813040257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12942 + }, + { + "completion_length": 244.31250953674316, + "epoch": 2.170334045852718, + "grad_norm": 0.2898807685825902, + "kl": 0.119598388671875, + "learning_rate": 4.943301564024892e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571581840515, + "rewards/format_reward_func": 1.0, + "step": 12944 + }, + { + "completion_length": 247.62947463989258, + "epoch": 2.1706693490925857, + "grad_norm": 0.19631638793532102, + "kl": 0.12933349609375, + "learning_rate": 4.943276158404316e-07, + "loss": 0.0001, + "reward": 1.7000000700354576, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7000000290572643, + "rewards/format_reward_func": 1.0, + "step": 12946 + }, + { + "completion_length": 243.0892972946167, + "epoch": 2.1710046523324533, + "grad_norm": 0.17855137823784265, + "kl": 0.110931396484375, + "learning_rate": 4.943250747158413e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857596278191, + "rewards/format_reward_func": 1.0, + "step": 12948 + }, + { + "completion_length": 237.7991180419922, + "epoch": 2.1713399555723205, + "grad_norm": 0.3307348578135927, + "kl": 0.140533447265625, + "learning_rate": 4.943225330287239e-07, + "loss": 0.0001, + "reward": 1.7714286148548126, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 12950 + }, + { + "completion_length": 235.58036708831787, + "epoch": 2.171675258812188, + "grad_norm": 0.4506722857335034, + "kl": 0.117889404296875, + "learning_rate": 4.943199907790856e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 12952 + }, + { + "completion_length": 236.32590293884277, + "epoch": 2.172010562052056, + "grad_norm": 0.20793578554397196, + "kl": 0.1285400390625, + "learning_rate": 4.943174479669321e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 12954 + }, + { + "completion_length": 237.71429538726807, + "epoch": 2.1723458652919234, + "grad_norm": 0.15868192009752824, + "kl": 0.105560302734375, + "learning_rate": 4.943149045922692e-07, + "loss": 0.0001, + "reward": 1.7482143342494965, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7526786085218191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12956 + }, + { + "completion_length": 237.6071538925171, + "epoch": 2.172681168531791, + "grad_norm": 0.18955779258601502, + "kl": 0.109527587890625, + "learning_rate": 4.943123606551028e-07, + "loss": 0.0001, + "reward": 1.8160714954137802, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.8205357380211353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12958 + }, + { + "completion_length": 248.1562623977661, + "epoch": 2.1730164717716587, + "grad_norm": 0.2265311963984262, + "kl": 0.206207275390625, + "learning_rate": 4.943098161554388e-07, + "loss": 0.0002, + "reward": 1.7696429267525673, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7741071730852127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12960 + }, + { + "completion_length": 239.08483409881592, + "epoch": 2.173351775011526, + "grad_norm": 0.24927354696810647, + "kl": 0.1038360595703125, + "learning_rate": 4.94307271093283e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 12962 + }, + { + "completion_length": 244.32143688201904, + "epoch": 2.1736870782513935, + "grad_norm": 0.12840740398816736, + "kl": 0.1054229736328125, + "learning_rate": 4.943047254686413e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714570432901, + "rewards/format_reward_func": 1.0, + "step": 12964 + }, + { + "completion_length": 238.6785831451416, + "epoch": 2.174022381491261, + "grad_norm": 0.185315617261064, + "kl": 0.110504150390625, + "learning_rate": 4.943021792815194e-07, + "loss": 0.0001, + "reward": 1.8214286342263222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.821428582072258, + "rewards/format_reward_func": 1.0, + "step": 12966 + }, + { + "completion_length": 242.80804824829102, + "epoch": 2.174357684731129, + "grad_norm": 0.48087252874133135, + "kl": 0.13427734375, + "learning_rate": 4.942996325319234e-07, + "loss": 0.0001, + "reward": 1.7625000849366188, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12968 + }, + { + "completion_length": 249.78126049041748, + "epoch": 2.1746929879709964, + "grad_norm": 0.15766073915047, + "kl": 0.11785888671875, + "learning_rate": 4.942970852198591e-07, + "loss": 0.0001, + "reward": 1.6750000715255737, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.6750000286847353, + "rewards/format_reward_func": 1.0, + "step": 12970 + }, + { + "completion_length": 239.28572463989258, + "epoch": 2.1750282912108636, + "grad_norm": 0.09665109152968684, + "kl": 0.13330078125, + "learning_rate": 4.942945373453323e-07, + "loss": 0.0001, + "reward": 1.7589286118745804, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12972 + }, + { + "completion_length": 237.196439743042, + "epoch": 2.1753635944507312, + "grad_norm": 0.21211440287784858, + "kl": 0.1326904296875, + "learning_rate": 4.942919889083488e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000204890966, + "rewards/format_reward_func": 1.0, + "step": 12974 + }, + { + "completion_length": 244.95090675354004, + "epoch": 2.175698897690599, + "grad_norm": 0.5472880673504554, + "kl": 0.135406494140625, + "learning_rate": 4.942894399089148e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 12976 + }, + { + "completion_length": 232.86608219146729, + "epoch": 2.1760342009304665, + "grad_norm": 0.4324622384788192, + "kl": 0.1165771484375, + "learning_rate": 4.942868903470357e-07, + "loss": 0.0001, + "reward": 1.8214286118745804, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 12978 + }, + { + "completion_length": 234.79018878936768, + "epoch": 2.176369504170334, + "grad_norm": 0.23861054744834512, + "kl": 0.129150390625, + "learning_rate": 4.942843402227178e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 12980 + }, + { + "completion_length": 244.60715293884277, + "epoch": 2.176704807410202, + "grad_norm": 0.17599420950231748, + "kl": 0.119537353515625, + "learning_rate": 4.942817895359666e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 12982 + }, + { + "completion_length": 242.7232265472412, + "epoch": 2.177040110650069, + "grad_norm": 0.20729228630689475, + "kl": 0.131744384765625, + "learning_rate": 4.942792382867884e-07, + "loss": 0.0001, + "reward": 1.7142858058214188, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7142857555299997, + "rewards/format_reward_func": 1.0, + "step": 12984 + }, + { + "completion_length": 234.30804824829102, + "epoch": 2.1773754138899366, + "grad_norm": 0.23350022151308217, + "kl": 0.131134033203125, + "learning_rate": 4.942766864751886e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12986 + }, + { + "completion_length": 243.3303689956665, + "epoch": 2.1777107171298042, + "grad_norm": 0.16288105801056224, + "kl": 0.125274658203125, + "learning_rate": 4.942741341011733e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 12988 + }, + { + "completion_length": 238.56697177886963, + "epoch": 2.178046020369672, + "grad_norm": 0.17009587148192523, + "kl": 0.134521484375, + "learning_rate": 4.942715811647484e-07, + "loss": 0.0001, + "reward": 1.701785795390606, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7062500342726707, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12990 + }, + { + "completion_length": 241.102689743042, + "epoch": 2.1783813236095395, + "grad_norm": 0.1392263569922293, + "kl": 0.191131591796875, + "learning_rate": 4.942690276659198e-07, + "loss": 0.0002, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 12992 + }, + { + "completion_length": 241.1116180419922, + "epoch": 2.1787166268494067, + "grad_norm": 0.19459083800717636, + "kl": 0.20111083984375, + "learning_rate": 4.942664736046933e-07, + "loss": 0.0002, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428768336773, + "rewards/format_reward_func": 1.0, + "step": 12994 + }, + { + "completion_length": 240.47322463989258, + "epoch": 2.1790519300892743, + "grad_norm": 0.22337487727499877, + "kl": 0.15386962890625, + "learning_rate": 4.942639189810748e-07, + "loss": 0.0002, + "reward": 1.7375000715255737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12996 + }, + { + "completion_length": 247.71876335144043, + "epoch": 2.179387233329142, + "grad_norm": 0.39531989522954086, + "kl": 0.17596435546875, + "learning_rate": 4.942613637950702e-07, + "loss": 0.0002, + "reward": 1.7339286655187607, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 12998 + }, + { + "completion_length": 243.2232255935669, + "epoch": 2.1797225365690096, + "grad_norm": 0.10995658678733815, + "kl": 0.2958984375, + "learning_rate": 4.942588080466854e-07, + "loss": 0.0003, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 13000 + }, + { + "completion_length": 231.92858123779297, + "epoch": 2.1800578398088772, + "grad_norm": 0.2621288294232232, + "kl": 0.099517822265625, + "learning_rate": 4.942562517359262e-07, + "loss": 0.0001, + "reward": 1.8250000551342964, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 13002 + }, + { + "completion_length": 244.7366189956665, + "epoch": 2.180393143048745, + "grad_norm": 0.2000719271959465, + "kl": 0.135772705078125, + "learning_rate": 4.942536948627986e-07, + "loss": 0.0001, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 13004 + }, + { + "completion_length": 236.33036708831787, + "epoch": 2.180728446288612, + "grad_norm": 0.19829520653945196, + "kl": 0.115936279296875, + "learning_rate": 4.942511374273084e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7250000201165676, + "rewards/format_reward_func": 1.0, + "step": 13006 + }, + { + "completion_length": 242.82143878936768, + "epoch": 2.1810637495284797, + "grad_norm": 0.23086917895470546, + "kl": 0.17926025390625, + "learning_rate": 4.942485794294616e-07, + "loss": 0.0002, + "reward": 1.8178572058677673, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8178571574389935, + "rewards/format_reward_func": 1.0, + "step": 13008 + }, + { + "completion_length": 244.8125123977661, + "epoch": 2.1813990527683473, + "grad_norm": 0.12326582796200601, + "kl": 0.230316162109375, + "learning_rate": 4.942460208692639e-07, + "loss": 0.0002, + "reward": 1.7392857894301414, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857447266579, + "rewards/format_reward_func": 1.0, + "step": 13010 + }, + { + "completion_length": 246.26340579986572, + "epoch": 2.181734356008215, + "grad_norm": 0.5111766646617298, + "kl": 0.141357421875, + "learning_rate": 4.942434617467213e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000149011612, + "rewards/format_reward_func": 1.0, + "step": 13012 + }, + { + "completion_length": 247.73661708831787, + "epoch": 2.1820696592480826, + "grad_norm": 0.18897060945754657, + "kl": 0.142791748046875, + "learning_rate": 4.942409020618398e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7571428883820772, + "rewards/format_reward_func": 1.0, + "step": 13014 + }, + { + "completion_length": 245.17858600616455, + "epoch": 2.1824049624879502, + "grad_norm": 0.192037173413482, + "kl": 0.22442626953125, + "learning_rate": 4.942383418146251e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 13016 + }, + { + "completion_length": 246.90179538726807, + "epoch": 2.1827402657278174, + "grad_norm": 0.1740235049742812, + "kl": 0.486602783203125, + "learning_rate": 4.942357810050832e-07, + "loss": 0.0005, + "reward": 1.743303619325161, + "reward_std": 0.04987628059461713, + "rewards/equation_reward_func": 0.749107176437974, + "rewards/format_reward_func": 0.9941964335739613, + "step": 13018 + }, + { + "completion_length": 244.92858123779297, + "epoch": 2.183075568967685, + "grad_norm": 0.28280715676848855, + "kl": 0.159271240234375, + "learning_rate": 4.9423321963322e-07, + "loss": 0.0002, + "reward": 1.7669643759727478, + "reward_std": 0.04671955481171608, + "rewards/equation_reward_func": 0.7732143066823483, + "rewards/format_reward_func": 0.9937500059604645, + "step": 13020 + }, + { + "completion_length": 241.37501049041748, + "epoch": 2.1834108722075527, + "grad_norm": 0.19996439624931472, + "kl": 0.172210693359375, + "learning_rate": 4.942306576990414e-07, + "loss": 0.0002, + "reward": 1.7910714820027351, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13022 + }, + { + "completion_length": 249.63840579986572, + "epoch": 2.1837461754474203, + "grad_norm": 0.0978921822772246, + "kl": 0.177581787109375, + "learning_rate": 4.942280952025531e-07, + "loss": 0.0002, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 13024 + }, + { + "completion_length": 240.29018878936768, + "epoch": 2.184081478687288, + "grad_norm": 0.3820928046968436, + "kl": 0.283355712890625, + "learning_rate": 4.942255321437614e-07, + "loss": 0.0003, + "reward": 1.78035718947649, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214656114578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13026 + }, + { + "completion_length": 234.20983219146729, + "epoch": 2.1844167819271556, + "grad_norm": 0.31466234028861156, + "kl": 0.243011474609375, + "learning_rate": 4.942229685226719e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 13028 + }, + { + "completion_length": 241.95536613464355, + "epoch": 2.184752085167023, + "grad_norm": 0.14632813185984134, + "kl": 0.114013671875, + "learning_rate": 4.942204043392905e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13030 + }, + { + "completion_length": 238.94197845458984, + "epoch": 2.1850873884068904, + "grad_norm": 0.12801467517810663, + "kl": 0.314605712890625, + "learning_rate": 4.942178395936232e-07, + "loss": 0.0003, + "reward": 1.717857226729393, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7178571540862322, + "rewards/format_reward_func": 1.0, + "step": 13032 + }, + { + "completion_length": 244.12054634094238, + "epoch": 2.185422691646758, + "grad_norm": 0.1482522554310038, + "kl": 0.12371826171875, + "learning_rate": 4.942152742856759e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.753571443259716, + "rewards/format_reward_func": 1.0, + "step": 13034 + }, + { + "completion_length": 252.0535831451416, + "epoch": 2.1857579948866257, + "grad_norm": 0.328016225135122, + "kl": 0.243499755859375, + "learning_rate": 4.942127084154545e-07, + "loss": 0.0002, + "reward": 1.7750000804662704, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 13036 + }, + { + "completion_length": 245.58483409881592, + "epoch": 2.1860932981264933, + "grad_norm": 0.2915103242446317, + "kl": 0.1773681640625, + "learning_rate": 4.942101419829649e-07, + "loss": 0.0002, + "reward": 1.7660715132951736, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13038 + }, + { + "completion_length": 242.22768688201904, + "epoch": 2.1864286013663605, + "grad_norm": 0.15696621358735263, + "kl": 0.229949951171875, + "learning_rate": 4.94207574988213e-07, + "loss": 0.0002, + "reward": 1.7035714760422707, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7035714611411095, + "rewards/format_reward_func": 1.0, + "step": 13040 + }, + { + "completion_length": 249.46876430511475, + "epoch": 2.186763904606228, + "grad_norm": 0.16961914232860997, + "kl": 0.256103515625, + "learning_rate": 4.942050074312047e-07, + "loss": 0.0003, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13042 + }, + { + "completion_length": 246.27233219146729, + "epoch": 2.187099207846096, + "grad_norm": 0.08810562177050113, + "kl": 0.142333984375, + "learning_rate": 4.94202439311946e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 13044 + }, + { + "completion_length": 237.0982255935669, + "epoch": 2.1874345110859634, + "grad_norm": 0.19939925649510315, + "kl": 0.15093994140625, + "learning_rate": 4.941998706304426e-07, + "loss": 0.0002, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 13046 + }, + { + "completion_length": 240.7321538925171, + "epoch": 2.187769814325831, + "grad_norm": 0.37088907779026276, + "kl": 0.181976318359375, + "learning_rate": 4.941973013867007e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428831666708, + "rewards/format_reward_func": 1.0, + "step": 13048 + }, + { + "completion_length": 241.3571548461914, + "epoch": 2.1881051175656987, + "grad_norm": 0.34147771560012263, + "kl": 0.721923828125, + "learning_rate": 4.94194731580726e-07, + "loss": 0.0007, + "reward": 1.766071505844593, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13050 + }, + { + "completion_length": 249.03572368621826, + "epoch": 2.188440420805566, + "grad_norm": 0.19678197516440343, + "kl": 0.230865478515625, + "learning_rate": 4.941921612125246e-07, + "loss": 0.0002, + "reward": 1.7607143446803093, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13052 + }, + { + "completion_length": 241.30358028411865, + "epoch": 2.1887757240454335, + "grad_norm": 0.18948886620622116, + "kl": 0.227386474609375, + "learning_rate": 4.941895902821022e-07, + "loss": 0.0002, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 13054 + }, + { + "completion_length": 237.2455472946167, + "epoch": 2.189111027285301, + "grad_norm": 0.16057536488565435, + "kl": 0.434478759765625, + "learning_rate": 4.941870187894648e-07, + "loss": 0.0004, + "reward": 1.7500000521540642, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 13056 + }, + { + "completion_length": 227.5625114440918, + "epoch": 2.189446330525169, + "grad_norm": 0.22919726377893454, + "kl": 0.162689208984375, + "learning_rate": 4.941844467346183e-07, + "loss": 0.0002, + "reward": 1.8107143491506577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143007218838, + "rewards/format_reward_func": 1.0, + "step": 13058 + }, + { + "completion_length": 238.60268688201904, + "epoch": 2.1897816337650364, + "grad_norm": 0.208520058844818, + "kl": 0.284759521484375, + "learning_rate": 4.941818741175689e-07, + "loss": 0.0003, + "reward": 1.792857177555561, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 13060 + }, + { + "completion_length": 237.09822463989258, + "epoch": 2.1901169370049036, + "grad_norm": 0.13094832960062402, + "kl": 0.207489013671875, + "learning_rate": 4.941793009383221e-07, + "loss": 0.0002, + "reward": 1.8142857626080513, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 13062 + }, + { + "completion_length": 239.67858123779297, + "epoch": 2.1904522402447713, + "grad_norm": 0.12603606773818665, + "kl": 0.319000244140625, + "learning_rate": 4.94176727196884e-07, + "loss": 0.0003, + "reward": 1.7517857775092125, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13064 + }, + { + "completion_length": 230.8125114440918, + "epoch": 2.190787543484639, + "grad_norm": 0.15633282016082847, + "kl": 0.31939697265625, + "learning_rate": 4.941741528932606e-07, + "loss": 0.0003, + "reward": 1.744642935693264, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7491071633994579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13066 + }, + { + "completion_length": 234.54465293884277, + "epoch": 2.1911228467245065, + "grad_norm": 0.43623124322019347, + "kl": 0.6263427734375, + "learning_rate": 4.941715780274578e-07, + "loss": 0.0006, + "reward": 1.7750000804662704, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.783928606659174, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13068 + }, + { + "completion_length": 242.11161708831787, + "epoch": 2.191458149964374, + "grad_norm": 0.2560604452422528, + "kl": 0.178070068359375, + "learning_rate": 4.941690025994814e-07, + "loss": 0.0002, + "reward": 1.7357143759727478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143089175224, + "rewards/format_reward_func": 1.0, + "step": 13070 + }, + { + "completion_length": 229.65626049041748, + "epoch": 2.191793453204242, + "grad_norm": 0.11541466072905084, + "kl": 0.167022705078125, + "learning_rate": 4.941664266093375e-07, + "loss": 0.0002, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 13072 + }, + { + "completion_length": 238.8125114440918, + "epoch": 2.192128756444109, + "grad_norm": 0.1561942106742485, + "kl": 0.19775390625, + "learning_rate": 4.941638500570319e-07, + "loss": 0.0002, + "reward": 1.6517857909202576, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.6562500298023224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13074 + }, + { + "completion_length": 237.8884048461914, + "epoch": 2.1924640596839766, + "grad_norm": 0.17344246627332055, + "kl": 0.162841796875, + "learning_rate": 4.941612729425706e-07, + "loss": 0.0002, + "reward": 1.8446429073810577, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8491071686148643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13076 + }, + { + "completion_length": 241.5491189956665, + "epoch": 2.1927993629238443, + "grad_norm": 0.22191151876539467, + "kl": 0.168365478515625, + "learning_rate": 4.941586952659595e-07, + "loss": 0.0002, + "reward": 1.8071428909897804, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428947150707, + "rewards/format_reward_func": 1.0, + "step": 13078 + }, + { + "completion_length": 245.51787090301514, + "epoch": 2.193134666163712, + "grad_norm": 0.41291817489040716, + "kl": 0.185089111328125, + "learning_rate": 4.941561170272047e-07, + "loss": 0.0002, + "reward": 1.7500000596046448, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 13080 + }, + { + "completion_length": 234.9866180419922, + "epoch": 2.1934699694035795, + "grad_norm": 0.2561760676478586, + "kl": 0.151092529296875, + "learning_rate": 4.941535382263119e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 13082 + }, + { + "completion_length": 228.5625123977661, + "epoch": 2.1938052726434467, + "grad_norm": 0.2149898909073194, + "kl": 0.13726806640625, + "learning_rate": 4.94150958863287e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 13084 + }, + { + "completion_length": 240.17858219146729, + "epoch": 2.1941405758833143, + "grad_norm": 0.1678046221919406, + "kl": 0.171142578125, + "learning_rate": 4.941483789381362e-07, + "loss": 0.0002, + "reward": 1.767857201397419, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 13086 + }, + { + "completion_length": 234.0491180419922, + "epoch": 2.194475879123182, + "grad_norm": 0.11829237495922411, + "kl": 0.215484619140625, + "learning_rate": 4.941457984508653e-07, + "loss": 0.0002, + "reward": 1.7821429446339607, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 13088 + }, + { + "completion_length": 226.1785831451416, + "epoch": 2.1948111823630496, + "grad_norm": 0.1562739900767488, + "kl": 0.147369384765625, + "learning_rate": 4.941432174014803e-07, + "loss": 0.0001, + "reward": 1.780357226729393, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13090 + }, + { + "completion_length": 230.92411613464355, + "epoch": 2.1951464856029173, + "grad_norm": 0.16259796276652824, + "kl": 0.198089599609375, + "learning_rate": 4.941406357899871e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 13092 + }, + { + "completion_length": 232.1875123977661, + "epoch": 2.195481788842785, + "grad_norm": 0.20673151685475438, + "kl": 0.217559814453125, + "learning_rate": 4.941380536163915e-07, + "loss": 0.0002, + "reward": 1.8089286237955093, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8133928813040257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13094 + }, + { + "completion_length": 236.8571538925171, + "epoch": 2.195817092082652, + "grad_norm": 0.1114181529377033, + "kl": 0.215423583984375, + "learning_rate": 4.941354708806996e-07, + "loss": 0.0002, + "reward": 1.7107143476605415, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143104076385, + "rewards/format_reward_func": 1.0, + "step": 13096 + }, + { + "completion_length": 236.03126430511475, + "epoch": 2.1961523953225197, + "grad_norm": 0.2740025408477465, + "kl": 0.39141845703125, + "learning_rate": 4.941328875829175e-07, + "loss": 0.0004, + "reward": 1.7589286267757416, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13098 + }, + { + "completion_length": 228.6875114440918, + "epoch": 2.1964876985623873, + "grad_norm": 0.0945623183408348, + "kl": 0.166290283203125, + "learning_rate": 4.941303037230508e-07, + "loss": 0.0002, + "reward": 1.8250000402331352, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 13100 + }, + { + "completion_length": 232.227689743042, + "epoch": 2.196823001802255, + "grad_norm": 0.22985184246306434, + "kl": 0.164337158203125, + "learning_rate": 4.941277193011057e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 13102 + }, + { + "completion_length": 234.11608219146729, + "epoch": 2.1971583050421226, + "grad_norm": 0.20757059453359875, + "kl": 0.194183349609375, + "learning_rate": 4.94125134317088e-07, + "loss": 0.0002, + "reward": 1.7357143238186836, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143312692642, + "rewards/format_reward_func": 1.0, + "step": 13104 + }, + { + "completion_length": 246.06697463989258, + "epoch": 2.19749360828199, + "grad_norm": 0.23602206759001917, + "kl": 0.2412109375, + "learning_rate": 4.941225487710038e-07, + "loss": 0.0002, + "reward": 1.7232143729925156, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7276786006987095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13106 + }, + { + "completion_length": 237.20983123779297, + "epoch": 2.1978289115218574, + "grad_norm": 0.19656052106365124, + "kl": 0.30426025390625, + "learning_rate": 4.94119962662859e-07, + "loss": 0.0003, + "reward": 1.7767857760190964, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7812500242143869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13108 + }, + { + "completion_length": 233.30804824829102, + "epoch": 2.198164214761725, + "grad_norm": 0.1888572394239757, + "kl": 0.25299072265625, + "learning_rate": 4.941173759926595e-07, + "loss": 0.0003, + "reward": 1.7214286625385284, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.721428606659174, + "rewards/format_reward_func": 1.0, + "step": 13110 + }, + { + "completion_length": 234.5134038925171, + "epoch": 2.1984995180015927, + "grad_norm": 0.14359310245026907, + "kl": 0.162506103515625, + "learning_rate": 4.941147887604113e-07, + "loss": 0.0002, + "reward": 1.7808036133646965, + "reward_std": 0.02714784862473607, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 0.9986607171595097, + "step": 13112 + }, + { + "completion_length": 230.72768878936768, + "epoch": 2.1988348212414603, + "grad_norm": 0.1815342261677029, + "kl": 0.1749267578125, + "learning_rate": 4.941122009661202e-07, + "loss": 0.0002, + "reward": 1.7571429163217545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 13114 + }, + { + "completion_length": 234.0134038925171, + "epoch": 2.199170124481328, + "grad_norm": 0.22855059183061707, + "kl": 0.1219482421875, + "learning_rate": 4.941096126097926e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000275671482, + "rewards/format_reward_func": 1.0, + "step": 13116 + }, + { + "completion_length": 235.95536708831787, + "epoch": 2.199505427721195, + "grad_norm": 0.3166301610655401, + "kl": 0.2043609619140625, + "learning_rate": 4.94107023691434e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428921073675, + "rewards/format_reward_func": 1.0, + "step": 13118 + }, + { + "completion_length": 237.70536613464355, + "epoch": 2.199840730961063, + "grad_norm": 0.17579007961883955, + "kl": 0.1287078857421875, + "learning_rate": 4.941044342110504e-07, + "loss": 0.0001, + "reward": 1.7196429371833801, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13120 + }, + { + "completion_length": 240.477689743042, + "epoch": 2.2001760342009304, + "grad_norm": 0.25137758864888454, + "kl": 0.1131591796875, + "learning_rate": 4.94101844168648e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 13122 + }, + { + "completion_length": 236.290189743042, + "epoch": 2.200511337440798, + "grad_norm": 0.19700262041976796, + "kl": 0.11376953125, + "learning_rate": 4.940992535642327e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964286170899868, + "rewards/format_reward_func": 1.0, + "step": 13124 + }, + { + "completion_length": 231.4375123977661, + "epoch": 2.2008466406806657, + "grad_norm": 0.07527575658926482, + "kl": 0.156158447265625, + "learning_rate": 4.940966623978103e-07, + "loss": 0.0002, + "reward": 1.8214286044239998, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214286044239998, + "rewards/format_reward_func": 1.0, + "step": 13126 + }, + { + "completion_length": 242.0625114440918, + "epoch": 2.201181943920533, + "grad_norm": 0.21908432403033265, + "kl": 0.110595703125, + "learning_rate": 4.94094070669387e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 13128 + }, + { + "completion_length": 244.4509048461914, + "epoch": 2.2015172471604005, + "grad_norm": 0.19426442618048773, + "kl": 0.130706787109375, + "learning_rate": 4.940914783789685e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.775000024586916, + "rewards/format_reward_func": 1.0, + "step": 13130 + }, + { + "completion_length": 230.9821538925171, + "epoch": 2.201852550400268, + "grad_norm": 0.13175834022378757, + "kl": 0.1300048828125, + "learning_rate": 4.940888855265611e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428943425417, + "rewards/format_reward_func": 1.0, + "step": 13132 + }, + { + "completion_length": 225.47768783569336, + "epoch": 2.202187853640136, + "grad_norm": 0.33402959027393114, + "kl": 0.1414337158203125, + "learning_rate": 4.940862921121705e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 13134 + }, + { + "completion_length": 236.03125953674316, + "epoch": 2.2025231568800034, + "grad_norm": 0.1890372794346963, + "kl": 0.11383056640625, + "learning_rate": 4.940836981358027e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 13136 + }, + { + "completion_length": 230.97322463989258, + "epoch": 2.202858460119871, + "grad_norm": 0.1619307898339516, + "kl": 0.1507568359375, + "learning_rate": 4.940811035974638e-07, + "loss": 0.0002, + "reward": 1.7642857879400253, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 13138 + }, + { + "completion_length": 236.21429634094238, + "epoch": 2.2031937633597383, + "grad_norm": 0.24074576556891386, + "kl": 0.128509521484375, + "learning_rate": 4.940785084971597e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000186264515, + "rewards/format_reward_func": 1.0, + "step": 13140 + }, + { + "completion_length": 223.4732265472412, + "epoch": 2.203529066599606, + "grad_norm": 0.11759453324945487, + "kl": 0.268890380859375, + "learning_rate": 4.940759128348963e-07, + "loss": 0.0003, + "reward": 1.7928571850061417, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8017857410013676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13142 + }, + { + "completion_length": 236.6428689956665, + "epoch": 2.2038643698394735, + "grad_norm": 0.17489192703639114, + "kl": 0.14190673828125, + "learning_rate": 4.940733166106797e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 13144 + }, + { + "completion_length": 222.54465198516846, + "epoch": 2.204199673079341, + "grad_norm": 0.2115976973391719, + "kl": 0.13714599609375, + "learning_rate": 4.940707198245158e-07, + "loss": 0.0001, + "reward": 1.7571429386734962, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 13146 + }, + { + "completion_length": 220.4330472946167, + "epoch": 2.204534976319209, + "grad_norm": 0.10467149392557401, + "kl": 0.148040771484375, + "learning_rate": 4.940681224764107e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428880095482, + "rewards/format_reward_func": 1.0, + "step": 13148 + }, + { + "completion_length": 224.23661613464355, + "epoch": 2.2048702795590764, + "grad_norm": 0.218368171315283, + "kl": 0.23980712890625, + "learning_rate": 4.940655245663702e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 13150 + }, + { + "completion_length": 230.19197463989258, + "epoch": 2.2052055827989436, + "grad_norm": 0.1515065738307242, + "kl": 0.230987548828125, + "learning_rate": 4.940629260944004e-07, + "loss": 0.0002, + "reward": 1.785714328289032, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 13152 + }, + { + "completion_length": 223.79018688201904, + "epoch": 2.2055408860388113, + "grad_norm": 0.35081248015729055, + "kl": 0.125213623046875, + "learning_rate": 4.940603270605072e-07, + "loss": 0.0001, + "reward": 1.7946429252624512, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13154 + }, + { + "completion_length": 223.76340293884277, + "epoch": 2.205876189278679, + "grad_norm": 0.4487984507252899, + "kl": 0.14996337890625, + "learning_rate": 4.940577274646967e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7428571786731482, + "rewards/format_reward_func": 1.0, + "step": 13156 + }, + { + "completion_length": 229.00893878936768, + "epoch": 2.2062114925185465, + "grad_norm": 0.12167153735920415, + "kl": 0.259735107421875, + "learning_rate": 4.940551273069748e-07, + "loss": 0.0003, + "reward": 1.8053571805357933, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8098214510828257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13158 + }, + { + "completion_length": 229.25447463989258, + "epoch": 2.206546795758414, + "grad_norm": 0.1730710604136419, + "kl": 0.174346923828125, + "learning_rate": 4.940525265873475e-07, + "loss": 0.0002, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 13160 + }, + { + "completion_length": 225.1875114440918, + "epoch": 2.206882098998282, + "grad_norm": 0.007503315078522894, + "kl": 0.306854248046875, + "learning_rate": 4.940499253058208e-07, + "loss": 0.0003, + "reward": 1.757142923772335, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 13162 + }, + { + "completion_length": 228.38393878936768, + "epoch": 2.207217402238149, + "grad_norm": 0.16247735776919064, + "kl": 0.292724609375, + "learning_rate": 4.940473234624008e-07, + "loss": 0.0003, + "reward": 1.742857187986374, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7517857439815998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13164 + }, + { + "completion_length": 223.4241180419922, + "epoch": 2.2075527054780166, + "grad_norm": 0.4386678302614182, + "kl": 0.526611328125, + "learning_rate": 4.940447210570932e-07, + "loss": 0.0005, + "reward": 1.7714286297559738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 13166 + }, + { + "completion_length": 229.11607933044434, + "epoch": 2.2078880087178843, + "grad_norm": 0.1466795277008058, + "kl": 0.63433837890625, + "learning_rate": 4.940421180899042e-07, + "loss": 0.0006, + "reward": 1.74642863124609, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 13168 + }, + { + "completion_length": 223.15626049041748, + "epoch": 2.208223311957752, + "grad_norm": 0.1795933383348911, + "kl": 0.19903564453125, + "learning_rate": 4.940395145608398e-07, + "loss": 0.0002, + "reward": 1.7750000655651093, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 13170 + }, + { + "completion_length": 224.09822368621826, + "epoch": 2.2085586151976195, + "grad_norm": 0.15875580392748082, + "kl": 0.1873779296875, + "learning_rate": 4.940369104699059e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 13172 + }, + { + "completion_length": 237.92411708831787, + "epoch": 2.2088939184374867, + "grad_norm": 0.21953287765735693, + "kl": 0.3541259765625, + "learning_rate": 4.940343058171086e-07, + "loss": 0.0004, + "reward": 1.7696429044008255, + "reward_std": 0.08333758264780045, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13174 + }, + { + "completion_length": 226.10268878936768, + "epoch": 2.2092292216773544, + "grad_norm": 0.23376125371304202, + "kl": 0.185394287109375, + "learning_rate": 4.940317006024539e-07, + "loss": 0.0002, + "reward": 1.8107143267989159, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.8196428716182709, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13176 + }, + { + "completion_length": 241.2991189956665, + "epoch": 2.209564524917222, + "grad_norm": 0.14580594406211703, + "kl": 0.4073486328125, + "learning_rate": 4.940290948259477e-07, + "loss": 0.0004, + "reward": 1.7267857939004898, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7312500197440386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13178 + }, + { + "completion_length": 227.40625858306885, + "epoch": 2.2098998281570896, + "grad_norm": 0.21299387186202437, + "kl": 0.16229248046875, + "learning_rate": 4.94026488487596e-07, + "loss": 0.0002, + "reward": 1.7767857760190964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13180 + }, + { + "completion_length": 224.30804634094238, + "epoch": 2.2102351313969573, + "grad_norm": 0.27077338842068727, + "kl": 0.13043212890625, + "learning_rate": 4.940238815874049e-07, + "loss": 0.0001, + "reward": 1.8017857819795609, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.806250024586916, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13182 + }, + { + "completion_length": 230.0491180419922, + "epoch": 2.210570434636825, + "grad_norm": 0.17648422454315202, + "kl": 0.1396484375, + "learning_rate": 4.940212741253803e-07, + "loss": 0.0001, + "reward": 1.7660715132951736, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7794643230736256, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13184 + }, + { + "completion_length": 230.9285831451416, + "epoch": 2.210905737876692, + "grad_norm": 0.19626567358030633, + "kl": 0.319732666015625, + "learning_rate": 4.940186661015283e-07, + "loss": 0.0003, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143394649029, + "rewards/format_reward_func": 1.0, + "step": 13186 + }, + { + "completion_length": 233.7767972946167, + "epoch": 2.2112410411165597, + "grad_norm": 0.3021643043558887, + "kl": 0.31884765625, + "learning_rate": 4.940160575158549e-07, + "loss": 0.0003, + "reward": 1.76071435213089, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7696429062634706, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13188 + }, + { + "completion_length": 228.15179538726807, + "epoch": 2.2115763443564274, + "grad_norm": 0.3841834809821566, + "kl": 0.54290771484375, + "learning_rate": 4.94013448368366e-07, + "loss": 0.0005, + "reward": 1.714285783469677, + "reward_std": 0.12121830508112907, + "rewards/equation_reward_func": 0.7321428917348385, + "rewards/format_reward_func": 0.9821428656578064, + "step": 13190 + }, + { + "completion_length": 223.93750953674316, + "epoch": 2.211911647596295, + "grad_norm": 0.32209311190475476, + "kl": 0.2110443115234375, + "learning_rate": 4.940108386590676e-07, + "loss": 0.0002, + "reward": 1.714285783469677, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7232143133878708, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13192 + }, + { + "completion_length": 235.66072463989258, + "epoch": 2.2122469508361626, + "grad_norm": 0.15596004633128052, + "kl": 0.768280029296875, + "learning_rate": 4.940082283879658e-07, + "loss": 0.0008, + "reward": 1.7357143461704254, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7446428947150707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13194 + }, + { + "completion_length": 226.04018783569336, + "epoch": 2.21258225407603, + "grad_norm": 0.21343340214385512, + "kl": 0.2079620361328125, + "learning_rate": 4.940056175550666e-07, + "loss": 0.0002, + "reward": 1.7875000461935997, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.791964303702116, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13196 + }, + { + "completion_length": 229.76340293884277, + "epoch": 2.2129175573158975, + "grad_norm": 0.26267101558592826, + "kl": 0.22930908203125, + "learning_rate": 4.940030061603761e-07, + "loss": 0.0002, + "reward": 1.8053571730852127, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214641213417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13198 + }, + { + "completion_length": 229.1384038925171, + "epoch": 2.213252860555765, + "grad_norm": 0.23785842627214238, + "kl": 0.151458740234375, + "learning_rate": 4.940003942039002e-07, + "loss": 0.0002, + "reward": 1.7125000581145287, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7258928902447224, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13200 + }, + { + "completion_length": 241.13394260406494, + "epoch": 2.2135881637956327, + "grad_norm": 0.2702312817671659, + "kl": 0.29144287109375, + "learning_rate": 4.939977816856447e-07, + "loss": 0.0003, + "reward": 1.7500000670552254, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 13202 + }, + { + "completion_length": 240.51340293884277, + "epoch": 2.2139234670355004, + "grad_norm": 0.14928824964527235, + "kl": 0.140411376953125, + "learning_rate": 4.939951686056161e-07, + "loss": 0.0001, + "reward": 1.7375000715255737, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7419643215835094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13204 + }, + { + "completion_length": 234.3125114440918, + "epoch": 2.214258770275368, + "grad_norm": 0.15811116279839596, + "kl": 0.374359130859375, + "learning_rate": 4.939925549638201e-07, + "loss": 0.0004, + "reward": 1.8535714745521545, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8535714522004128, + "rewards/format_reward_func": 1.0, + "step": 13206 + }, + { + "completion_length": 249.30804824829102, + "epoch": 2.214594073515235, + "grad_norm": 0.3955689485987965, + "kl": 0.528076171875, + "learning_rate": 4.939899407602627e-07, + "loss": 0.0005, + "reward": 1.778571456670761, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7875000406056643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13208 + }, + { + "completion_length": 241.12501335144043, + "epoch": 2.214929376755103, + "grad_norm": 0.4672937520059191, + "kl": 0.34112548828125, + "learning_rate": 4.939873259949499e-07, + "loss": 0.0003, + "reward": 1.8035714700818062, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.8125000149011612, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13210 + }, + { + "completion_length": 249.0446538925171, + "epoch": 2.2152646799949705, + "grad_norm": 0.09535743739495414, + "kl": 0.265960693359375, + "learning_rate": 4.939847106678881e-07, + "loss": 0.0003, + "reward": 1.7321429252624512, + "reward_std": 0.05555839091539383, + "rewards/equation_reward_func": 0.741071468219161, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13212 + }, + { + "completion_length": 250.58037090301514, + "epoch": 2.215599983234838, + "grad_norm": 0.11639680227151614, + "kl": 0.145843505859375, + "learning_rate": 4.939820947790828e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 13214 + }, + { + "completion_length": 236.36608219146729, + "epoch": 2.2159352864747057, + "grad_norm": 0.22684060354639632, + "kl": 0.2822265625, + "learning_rate": 4.939794783285403e-07, + "loss": 0.0003, + "reward": 1.8410714864730835, + "reward_std": 0.0833375845104456, + "rewards/equation_reward_func": 0.8544643111526966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13216 + }, + { + "completion_length": 241.20536518096924, + "epoch": 2.216270589714573, + "grad_norm": 0.06232753458071066, + "kl": 0.929351806640625, + "learning_rate": 4.939768613162666e-07, + "loss": 0.0009, + "reward": 1.7696429044008255, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7830357365310192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13218 + }, + { + "completion_length": 225.6250114440918, + "epoch": 2.2166058929544405, + "grad_norm": 0.17169788855031926, + "kl": 0.16473388671875, + "learning_rate": 4.939742437422677e-07, + "loss": 0.0002, + "reward": 1.8500000312924385, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.858928594738245, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13220 + }, + { + "completion_length": 231.65179634094238, + "epoch": 2.216941196194308, + "grad_norm": 0.22926052211592493, + "kl": 0.64813232421875, + "learning_rate": 4.939716256065496e-07, + "loss": 0.0006, + "reward": 1.7928571999073029, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.8017857559025288, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13222 + }, + { + "completion_length": 231.18304824829102, + "epoch": 2.217276499434176, + "grad_norm": 0.22491708701033208, + "kl": 0.4256591796875, + "learning_rate": 4.939690069091185e-07, + "loss": 0.0004, + "reward": 1.696428656578064, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.7053571753203869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13224 + }, + { + "completion_length": 225.24554443359375, + "epoch": 2.2176118026740435, + "grad_norm": 0.1770503668273261, + "kl": 0.747161865234375, + "learning_rate": 4.939663876499801e-07, + "loss": 0.0007, + "reward": 1.8142857775092125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 13226 + }, + { + "completion_length": 240.5044755935669, + "epoch": 2.217947105913911, + "grad_norm": 0.1134760793769743, + "kl": 1.253082275390625, + "learning_rate": 4.939637678291408e-07, + "loss": 0.0013, + "reward": 1.7678572088479996, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.776785746216774, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13228 + }, + { + "completion_length": 223.47322463989258, + "epoch": 2.2182824091537783, + "grad_norm": 0.32150448400287074, + "kl": 0.858489990234375, + "learning_rate": 4.939611474466063e-07, + "loss": 0.0009, + "reward": 1.744642935693264, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071671247482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13230 + }, + { + "completion_length": 223.62947463989258, + "epoch": 2.218617712393646, + "grad_norm": 0.2605905038870512, + "kl": 0.36175537109375, + "learning_rate": 4.939585265023828e-07, + "loss": 0.0004, + "reward": 1.7660714909434319, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7705357521772385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13232 + }, + { + "completion_length": 239.5937614440918, + "epoch": 2.2189530156335135, + "grad_norm": 0.2594317754827288, + "kl": 0.962310791015625, + "learning_rate": 4.939559049964764e-07, + "loss": 0.001, + "reward": 1.7285715118050575, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7375000398606062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13234 + }, + { + "completion_length": 238.602689743042, + "epoch": 2.219288318873381, + "grad_norm": 0.2423681610989229, + "kl": 0.459014892578125, + "learning_rate": 4.93953282928893e-07, + "loss": 0.0005, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214633762836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13236 + }, + { + "completion_length": 246.2857255935669, + "epoch": 2.219623622113249, + "grad_norm": 0.18164710354197572, + "kl": 0.57537841796875, + "learning_rate": 4.939506602996388e-07, + "loss": 0.0006, + "reward": 1.7642857655882835, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857618629932, + "rewards/format_reward_func": 1.0, + "step": 13238 + }, + { + "completion_length": 231.62054347991943, + "epoch": 2.219958925353116, + "grad_norm": 0.1970273094202194, + "kl": 0.5147247314453125, + "learning_rate": 4.939480371087196e-07, + "loss": 0.0005, + "reward": 1.8089286163449287, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.813392885029316, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13240 + }, + { + "completion_length": 246.5625114440918, + "epoch": 2.2202942285929836, + "grad_norm": 0.2030433424392116, + "kl": 0.534576416015625, + "learning_rate": 4.939454133561415e-07, + "loss": 0.0005, + "reward": 1.7196429148316383, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.733035746961832, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13242 + }, + { + "completion_length": 242.4151906967163, + "epoch": 2.2206295318328513, + "grad_norm": 0.1793058229309209, + "kl": 0.400634765625, + "learning_rate": 4.939427890419108e-07, + "loss": 0.0004, + "reward": 1.7482143715023994, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13244 + }, + { + "completion_length": 246.6116189956665, + "epoch": 2.220964835072719, + "grad_norm": 0.18217815077685284, + "kl": 0.204193115234375, + "learning_rate": 4.939401641660332e-07, + "loss": 0.0002, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 13246 + }, + { + "completion_length": 236.48661613464355, + "epoch": 2.2213001383125865, + "grad_norm": 0.703389472035067, + "kl": 0.359527587890625, + "learning_rate": 4.93937538728515e-07, + "loss": 0.0004, + "reward": 1.750000074505806, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7589285895228386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13248 + }, + { + "completion_length": 230.27233123779297, + "epoch": 2.221635441552454, + "grad_norm": 0.15593416735907303, + "kl": 0.16259765625, + "learning_rate": 4.939349127293621e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 13250 + }, + { + "completion_length": 247.27679920196533, + "epoch": 2.2219707447923214, + "grad_norm": 0.31791197842722513, + "kl": 0.48486328125, + "learning_rate": 4.939322861685806e-07, + "loss": 0.0005, + "reward": 1.7321429252624512, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7410714700818062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13252 + }, + { + "completion_length": 224.00447273254395, + "epoch": 2.222306048032189, + "grad_norm": 0.12457813021055096, + "kl": 0.187591552734375, + "learning_rate": 4.939296590461765e-07, + "loss": 0.0002, + "reward": 1.79464291036129, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071529686451, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13254 + }, + { + "completion_length": 233.6607265472412, + "epoch": 2.2226413512720566, + "grad_norm": 0.16283221198189626, + "kl": 0.1414794921875, + "learning_rate": 4.939270313621559e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857506871223, + "rewards/format_reward_func": 1.0, + "step": 13256 + }, + { + "completion_length": 237.87500667572021, + "epoch": 2.2229766545119243, + "grad_norm": 0.26208369447346597, + "kl": 0.2811279296875, + "learning_rate": 4.939244031165248e-07, + "loss": 0.0003, + "reward": 1.7857143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 13258 + }, + { + "completion_length": 237.2009048461914, + "epoch": 2.223311957751792, + "grad_norm": 0.14783752676243633, + "kl": 0.203338623046875, + "learning_rate": 4.939217743092894e-07, + "loss": 0.0002, + "reward": 1.7250000685453415, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 13260 + }, + { + "completion_length": 233.46876335144043, + "epoch": 2.223647260991659, + "grad_norm": 0.5945979191598133, + "kl": 0.155548095703125, + "learning_rate": 4.939191449404555e-07, + "loss": 0.0002, + "reward": 1.8178571984171867, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 13262 + }, + { + "completion_length": 235.95090579986572, + "epoch": 2.2239825642315267, + "grad_norm": 0.08776234428861629, + "kl": 0.2462158203125, + "learning_rate": 4.939165150100294e-07, + "loss": 0.0002, + "reward": 1.707142949104309, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7071428950875998, + "rewards/format_reward_func": 1.0, + "step": 13264 + }, + { + "completion_length": 233.6250114440918, + "epoch": 2.2243178674713944, + "grad_norm": 0.12217827536623904, + "kl": 0.211669921875, + "learning_rate": 4.93913884518017e-07, + "loss": 0.0002, + "reward": 1.7910714820027351, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7955357544124126, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13266 + }, + { + "completion_length": 229.37947463989258, + "epoch": 2.224653170711262, + "grad_norm": 0.19298089285952225, + "kl": 0.270233154296875, + "learning_rate": 4.939112534644245e-07, + "loss": 0.0003, + "reward": 1.7517857924103737, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7562500201165676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13268 + }, + { + "completion_length": 229.87054538726807, + "epoch": 2.2249884739511296, + "grad_norm": 0.14434383594107386, + "kl": 0.153961181640625, + "learning_rate": 4.939086218492577e-07, + "loss": 0.0002, + "reward": 1.7107143551111221, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7107143308967352, + "rewards/format_reward_func": 1.0, + "step": 13270 + }, + { + "completion_length": 230.58036708831787, + "epoch": 2.2253237771909973, + "grad_norm": 0.16133111860810595, + "kl": 0.159027099609375, + "learning_rate": 4.939059896725228e-07, + "loss": 0.0002, + "reward": 1.8000000640749931, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 13272 + }, + { + "completion_length": 229.78572368621826, + "epoch": 2.2256590804308645, + "grad_norm": 0.2167370871722688, + "kl": 0.201263427734375, + "learning_rate": 4.939033569342259e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714286185801029, + "rewards/format_reward_func": 1.0, + "step": 13274 + }, + { + "completion_length": 229.45090293884277, + "epoch": 2.225994383670732, + "grad_norm": 0.2795147417582892, + "kl": 0.13800048828125, + "learning_rate": 4.939007236343732e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 13276 + }, + { + "completion_length": 229.4419755935669, + "epoch": 2.2263296869105997, + "grad_norm": 0.26344600167128623, + "kl": 0.23748779296875, + "learning_rate": 4.938980897729704e-07, + "loss": 0.0002, + "reward": 1.775000050663948, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 13278 + }, + { + "completion_length": 234.10715293884277, + "epoch": 2.2266649901504674, + "grad_norm": 0.09384786079257194, + "kl": 0.217926025390625, + "learning_rate": 4.938954553500238e-07, + "loss": 0.0002, + "reward": 1.7678571864962578, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 13280 + }, + { + "completion_length": 221.8928680419922, + "epoch": 2.227000293390335, + "grad_norm": 0.009029269373749394, + "kl": 0.191070556640625, + "learning_rate": 4.938928203655396e-07, + "loss": 0.0002, + "reward": 1.846428595483303, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8464285880327225, + "rewards/format_reward_func": 1.0, + "step": 13282 + }, + { + "completion_length": 229.92858219146729, + "epoch": 2.2273355966302026, + "grad_norm": 0.10794024296635173, + "kl": 0.130828857421875, + "learning_rate": 4.938901848195236e-07, + "loss": 0.0001, + "reward": 1.7767857611179352, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7803571745753288, + "rewards/format_reward_func": 0.9964285716414452, + "step": 13284 + }, + { + "completion_length": 232.4821538925171, + "epoch": 2.22767089987007, + "grad_norm": 0.10866707186198227, + "kl": 0.20367431640625, + "learning_rate": 4.938875487119819e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857581377029, + "rewards/format_reward_func": 1.0, + "step": 13286 + }, + { + "completion_length": 235.95983219146729, + "epoch": 2.2280062031099375, + "grad_norm": 0.06240347822895245, + "kl": 0.181243896484375, + "learning_rate": 4.938849120429207e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7642857488244772, + "rewards/format_reward_func": 1.0, + "step": 13288 + }, + { + "completion_length": 228.5937623977661, + "epoch": 2.228341506349805, + "grad_norm": 0.1655080256398611, + "kl": 0.1346435546875, + "learning_rate": 4.93882274812346e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 13290 + }, + { + "completion_length": 234.00000858306885, + "epoch": 2.2286768095896727, + "grad_norm": 0.1567514946119727, + "kl": 0.19720458984375, + "learning_rate": 4.938796370202639e-07, + "loss": 0.0002, + "reward": 1.821428619325161, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285783469677, + "rewards/format_reward_func": 1.0, + "step": 13292 + }, + { + "completion_length": 227.3928680419922, + "epoch": 2.2290121128295404, + "grad_norm": 0.1766307361388533, + "kl": 0.14697265625, + "learning_rate": 4.938769986666804e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000465661287, + "rewards/format_reward_func": 1.0, + "step": 13294 + }, + { + "completion_length": 238.21429920196533, + "epoch": 2.229347416069408, + "grad_norm": 0.09715960629355828, + "kl": 0.177581787109375, + "learning_rate": 4.938743597516017e-07, + "loss": 0.0002, + "reward": 1.7142857983708382, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7142857536673546, + "rewards/format_reward_func": 1.0, + "step": 13296 + }, + { + "completion_length": 232.15179634094238, + "epoch": 2.229682719309275, + "grad_norm": 0.4376923634979283, + "kl": 0.1544189453125, + "learning_rate": 4.938717202750338e-07, + "loss": 0.0002, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 13298 + }, + { + "completion_length": 239.44644165039062, + "epoch": 2.230018022549143, + "grad_norm": 0.14513764982478072, + "kl": 0.25518798828125, + "learning_rate": 4.938690802369827e-07, + "loss": 0.0003, + "reward": 1.750000074505806, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 13300 + }, + { + "completion_length": 235.77679634094238, + "epoch": 2.2303533257890105, + "grad_norm": 0.22726923725353268, + "kl": 0.1492919921875, + "learning_rate": 4.938664396374545e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 13302 + }, + { + "completion_length": 231.8794755935669, + "epoch": 2.230688629028878, + "grad_norm": 0.24320216391633032, + "kl": 0.149627685546875, + "learning_rate": 4.938637984764555e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428723633289, + "rewards/format_reward_func": 1.0, + "step": 13304 + }, + { + "completion_length": 245.3571538925171, + "epoch": 2.2310239322687457, + "grad_norm": 0.22138009336756811, + "kl": 0.219757080078125, + "learning_rate": 4.938611567539915e-07, + "loss": 0.0002, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857328057289, + "rewards/format_reward_func": 1.0, + "step": 13306 + }, + { + "completion_length": 243.50001049041748, + "epoch": 2.231359235508613, + "grad_norm": 0.37270995546541225, + "kl": 0.167022705078125, + "learning_rate": 4.938585144700688e-07, + "loss": 0.0002, + "reward": 1.7767857909202576, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13308 + }, + { + "completion_length": 237.3125123977661, + "epoch": 2.2316945387484806, + "grad_norm": 0.3370206438808287, + "kl": 0.197845458984375, + "learning_rate": 4.938558716246933e-07, + "loss": 0.0002, + "reward": 1.8125000447034836, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13310 + }, + { + "completion_length": 238.8080472946167, + "epoch": 2.232029841988348, + "grad_norm": 0.10725523459583197, + "kl": 0.18255615234375, + "learning_rate": 4.938532282178713e-07, + "loss": 0.0002, + "reward": 1.7428572103381157, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 13312 + }, + { + "completion_length": 249.25447368621826, + "epoch": 2.232365145228216, + "grad_norm": 0.05706609548541911, + "kl": 0.271392822265625, + "learning_rate": 4.938505842496086e-07, + "loss": 0.0003, + "reward": 1.769642911851406, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13314 + }, + { + "completion_length": 241.30358409881592, + "epoch": 2.2327004484680835, + "grad_norm": 0.297844963742036, + "kl": 0.2213134765625, + "learning_rate": 4.938479397199115e-07, + "loss": 0.0002, + "reward": 1.7750000804662704, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 13316 + }, + { + "completion_length": 242.61608505249023, + "epoch": 2.233035751707951, + "grad_norm": 0.06049902245495544, + "kl": 0.239166259765625, + "learning_rate": 4.93845294628786e-07, + "loss": 0.0002, + "reward": 1.7660714909434319, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7705357484519482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13318 + }, + { + "completion_length": 253.62500858306885, + "epoch": 2.2333710549478183, + "grad_norm": 0.20782510215756428, + "kl": 0.47796630859375, + "learning_rate": 4.938426489762382e-07, + "loss": 0.0005, + "reward": 1.8107143715023994, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 13320 + }, + { + "completion_length": 241.4062623977661, + "epoch": 2.233706358187686, + "grad_norm": 0.15660245756764526, + "kl": 0.2474365234375, + "learning_rate": 4.938400027622744e-07, + "loss": 0.0002, + "reward": 1.800000049173832, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 13322 + }, + { + "completion_length": 239.20536994934082, + "epoch": 2.2340416614275536, + "grad_norm": 0.1377272999131664, + "kl": 0.541748046875, + "learning_rate": 4.938373559869003e-07, + "loss": 0.0005, + "reward": 1.7446429282426834, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13324 + }, + { + "completion_length": 234.62054538726807, + "epoch": 2.234376964667421, + "grad_norm": 0.4585849996854235, + "kl": 0.53997802734375, + "learning_rate": 4.938347086501223e-07, + "loss": 0.0005, + "reward": 1.755357213318348, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13326 + }, + { + "completion_length": 233.2634048461914, + "epoch": 2.234712267907289, + "grad_norm": 0.15034751021902695, + "kl": 0.90093994140625, + "learning_rate": 4.938320607519464e-07, + "loss": 0.0009, + "reward": 1.7410714998841286, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357369035482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13328 + }, + { + "completion_length": 249.1116180419922, + "epoch": 2.235047571147156, + "grad_norm": 0.18235629112200313, + "kl": 0.187347412109375, + "learning_rate": 4.938294122923785e-07, + "loss": 0.0002, + "reward": 1.7803572118282318, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13330 + }, + { + "completion_length": 238.00893878936768, + "epoch": 2.2353828743870237, + "grad_norm": 0.2001295789142171, + "kl": 1.100799560546875, + "learning_rate": 4.938267632714252e-07, + "loss": 0.0011, + "reward": 1.7821429148316383, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13332 + }, + { + "completion_length": 231.4910831451416, + "epoch": 2.2357181776268913, + "grad_norm": 0.7877813828478731, + "kl": 0.62371826171875, + "learning_rate": 4.938241136890921e-07, + "loss": 0.0006, + "reward": 1.753571517765522, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7625000178813934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13334 + }, + { + "completion_length": 235.11161708831787, + "epoch": 2.236053480866759, + "grad_norm": 0.15697719157481013, + "kl": 0.374969482421875, + "learning_rate": 4.938214635453854e-07, + "loss": 0.0004, + "reward": 1.7464286535978317, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 13336 + }, + { + "completion_length": 239.9241180419922, + "epoch": 2.2363887841066266, + "grad_norm": 0.45109904732269696, + "kl": 0.39056396484375, + "learning_rate": 4.938188128403114e-07, + "loss": 0.0004, + "reward": 1.7017858028411865, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7062500193715096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13338 + }, + { + "completion_length": 239.34376049041748, + "epoch": 2.236724087346494, + "grad_norm": 0.21322803207128538, + "kl": 1.0438232421875, + "learning_rate": 4.938161615738762e-07, + "loss": 0.001, + "reward": 1.7178571820259094, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7267857324331999, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13340 + }, + { + "completion_length": 240.7142972946167, + "epoch": 2.2370593905863614, + "grad_norm": 0.1442597372037303, + "kl": 0.367919921875, + "learning_rate": 4.938135097460856e-07, + "loss": 0.0004, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526786103844643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13342 + }, + { + "completion_length": 239.2053689956665, + "epoch": 2.237394693826229, + "grad_norm": 0.26108253632311346, + "kl": 0.6842041015625, + "learning_rate": 4.93810857356946e-07, + "loss": 0.0007, + "reward": 1.719642959535122, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071835160255, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13344 + }, + { + "completion_length": 240.7366180419922, + "epoch": 2.2377299970660967, + "grad_norm": 0.19328804186926843, + "kl": 0.4044189453125, + "learning_rate": 4.938082044064634e-07, + "loss": 0.0004, + "reward": 1.7392857745289803, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7482143193483353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13346 + }, + { + "completion_length": 236.4285831451416, + "epoch": 2.2380653003059643, + "grad_norm": 0.10448958503500268, + "kl": 0.347503662109375, + "learning_rate": 4.938055508946439e-07, + "loss": 0.0003, + "reward": 1.7053572088479996, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7098214663565159, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13348 + }, + { + "completion_length": 226.47322463989258, + "epoch": 2.238400603545832, + "grad_norm": 0.3026737005415429, + "kl": 0.236053466796875, + "learning_rate": 4.938028968214937e-07, + "loss": 0.0002, + "reward": 1.7410715147852898, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13350 + }, + { + "completion_length": 227.45090293884277, + "epoch": 2.238735906785699, + "grad_norm": 0.21333624860225478, + "kl": 0.204345703125, + "learning_rate": 4.938002421870187e-07, + "loss": 0.0002, + "reward": 1.741071492433548, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7455357499420643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13352 + }, + { + "completion_length": 242.4241189956665, + "epoch": 2.2390712100255667, + "grad_norm": 0.11296037504989383, + "kl": 0.187530517578125, + "learning_rate": 4.937975869912252e-07, + "loss": 0.0002, + "reward": 1.7718750685453415, + "reward_std": 0.035986683797091246, + "rewards/equation_reward_func": 0.7776785846799612, + "rewards/format_reward_func": 0.9941964335739613, + "step": 13354 + }, + { + "completion_length": 238.2500114440918, + "epoch": 2.2394065132654344, + "grad_norm": 0.4362049106828727, + "kl": 0.346435546875, + "learning_rate": 4.937949312341193e-07, + "loss": 0.0003, + "reward": 1.7183036357164383, + "reward_std": 0.07513009523972869, + "rewards/equation_reward_func": 0.733035746961832, + "rewards/format_reward_func": 0.9852678664028645, + "step": 13356 + }, + { + "completion_length": 229.33929538726807, + "epoch": 2.239741816505302, + "grad_norm": 0.18681869868795595, + "kl": 0.319091796875, + "learning_rate": 4.937922749157071e-07, + "loss": 0.0003, + "reward": 1.750446505844593, + "reward_std": 0.05997780757024884, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9941964335739613, + "step": 13358 + }, + { + "completion_length": 230.15179634094238, + "epoch": 2.2400771197451697, + "grad_norm": 0.565797210429418, + "kl": 0.379913330078125, + "learning_rate": 4.937896180359946e-07, + "loss": 0.0004, + "reward": 1.7678572162985802, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 13360 + }, + { + "completion_length": 229.35715293884277, + "epoch": 2.2404124229850373, + "grad_norm": 0.25123464149445224, + "kl": 0.290374755859375, + "learning_rate": 4.937869605949881e-07, + "loss": 0.0003, + "reward": 1.7910714745521545, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13362 + }, + { + "completion_length": 233.9732255935669, + "epoch": 2.2407477262249045, + "grad_norm": 0.936629094464273, + "kl": 0.34619140625, + "learning_rate": 4.937843025926936e-07, + "loss": 0.0003, + "reward": 1.7446429282426834, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071578115225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13364 + }, + { + "completion_length": 237.1160831451416, + "epoch": 2.241083029464772, + "grad_norm": 0.1248168838109331, + "kl": 0.30145263671875, + "learning_rate": 4.937816440291172e-07, + "loss": 0.0003, + "reward": 1.7750000730156898, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 13366 + }, + { + "completion_length": 224.27233123779297, + "epoch": 2.2414183327046397, + "grad_norm": 0.07464853245578626, + "kl": 0.222442626953125, + "learning_rate": 4.937789849042651e-07, + "loss": 0.0002, + "reward": 1.7803572043776512, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7848214581608772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13368 + }, + { + "completion_length": 218.9910831451416, + "epoch": 2.2417536359445074, + "grad_norm": 0.3339117384872975, + "kl": 0.615142822265625, + "learning_rate": 4.937763252181434e-07, + "loss": 0.0006, + "reward": 1.7892857789993286, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7982143089175224, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13370 + }, + { + "completion_length": 221.57143878936768, + "epoch": 2.242088939184375, + "grad_norm": 0.23947230221231972, + "kl": 0.119140625, + "learning_rate": 4.937736649707582e-07, + "loss": 0.0001, + "reward": 1.853571467101574, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8625000193715096, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13372 + }, + { + "completion_length": 231.50447273254395, + "epoch": 2.242424242424242, + "grad_norm": 0.19491513244225595, + "kl": 0.304962158203125, + "learning_rate": 4.937710041621156e-07, + "loss": 0.0003, + "reward": 1.7964286133646965, + "reward_std": 0.07576143927872181, + "rewards/equation_reward_func": 0.805357176810503, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13374 + }, + { + "completion_length": 217.4017972946167, + "epoch": 2.24275954566411, + "grad_norm": 0.2010389179975474, + "kl": 0.448394775390625, + "learning_rate": 4.937683427922218e-07, + "loss": 0.0004, + "reward": 1.775000050663948, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000152736902, + "rewards/format_reward_func": 1.0, + "step": 13376 + }, + { + "completion_length": 228.98661613464355, + "epoch": 2.2430948489039775, + "grad_norm": 0.34878759281384486, + "kl": 0.397796630859375, + "learning_rate": 4.93765680861083e-07, + "loss": 0.0004, + "reward": 1.7767857760190964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.781250037252903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13378 + }, + { + "completion_length": 231.84822463989258, + "epoch": 2.243430152143845, + "grad_norm": 0.40787454075194673, + "kl": 0.804473876953125, + "learning_rate": 4.93763018368705e-07, + "loss": 0.0008, + "reward": 1.7982143089175224, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.8116071708500385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13380 + }, + { + "completion_length": 228.08929443359375, + "epoch": 2.2437654553837127, + "grad_norm": 0.3151023477866274, + "kl": 0.313934326171875, + "learning_rate": 4.937603553150944e-07, + "loss": 0.0003, + "reward": 1.7803571969270706, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214600235224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13382 + }, + { + "completion_length": 237.78572368621826, + "epoch": 2.2441007586235804, + "grad_norm": 0.13640015825158425, + "kl": 0.5728759765625, + "learning_rate": 4.937576917002569e-07, + "loss": 0.0006, + "reward": 1.662500061094761, + "reward_std": 0.06313453428447247, + "rewards/equation_reward_func": 0.6758929062634706, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13384 + }, + { + "completion_length": 236.40179443359375, + "epoch": 2.2444360618634476, + "grad_norm": 0.17800346490208352, + "kl": 0.17962646484375, + "learning_rate": 4.937550275241989e-07, + "loss": 0.0002, + "reward": 1.7941965013742447, + "reward_std": 0.05871511623263359, + "rewards/equation_reward_func": 0.8017857484519482, + "rewards/format_reward_func": 0.9924107193946838, + "step": 13386 + }, + { + "completion_length": 232.49108409881592, + "epoch": 2.244771365103315, + "grad_norm": 0.6126242940664247, + "kl": 0.170166015625, + "learning_rate": 4.937523627869264e-07, + "loss": 0.0002, + "reward": 1.7683036029338837, + "reward_std": 0.054927044780924916, + "rewards/equation_reward_func": 0.7830357439815998, + "rewards/format_reward_func": 0.9852678664028645, + "step": 13388 + }, + { + "completion_length": 246.34822463989258, + "epoch": 2.245106668343183, + "grad_norm": 0.32475932657472756, + "kl": 0.211639404296875, + "learning_rate": 4.937496974884457e-07, + "loss": 0.0002, + "reward": 1.725000075995922, + "reward_std": 0.11616754159331322, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 0.9821428656578064, + "step": 13390 + }, + { + "completion_length": 239.3794765472412, + "epoch": 2.2454419715830505, + "grad_norm": 0.5876372394781416, + "kl": 0.22247314453125, + "learning_rate": 4.937470316287627e-07, + "loss": 0.0002, + "reward": 1.7875000461935997, + "reward_std": 0.06818529404699802, + "rewards/equation_reward_func": 0.8008928894996643, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13392 + }, + { + "completion_length": 236.8080472946167, + "epoch": 2.245777274822918, + "grad_norm": 0.36167189638405006, + "kl": 0.2952880859375, + "learning_rate": 4.937443652078836e-07, + "loss": 0.0003, + "reward": 1.709375075995922, + "reward_std": 0.04735090141184628, + "rewards/equation_reward_func": 0.7196428962051868, + "rewards/format_reward_func": 0.9897321499884129, + "step": 13394 + }, + { + "completion_length": 232.4241180419922, + "epoch": 2.2461125780627853, + "grad_norm": 0.2621123567593843, + "kl": 0.179229736328125, + "learning_rate": 4.937416982258146e-07, + "loss": 0.0002, + "reward": 1.7910714745521545, + "reward_std": 0.08333758357912302, + "rewards/equation_reward_func": 0.8044643141329288, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13396 + }, + { + "completion_length": 227.4196538925171, + "epoch": 2.246447881302653, + "grad_norm": 0.15406421507840143, + "kl": 0.264862060546875, + "learning_rate": 4.93739030682562e-07, + "loss": 0.0003, + "reward": 1.8053571805357933, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8187500312924385, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13398 + }, + { + "completion_length": 221.07590293884277, + "epoch": 2.2467831845425206, + "grad_norm": 0.19532516413637616, + "kl": 0.7318115234375, + "learning_rate": 4.937363625781317e-07, + "loss": 0.0007, + "reward": 1.7982143238186836, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.802678607404232, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13400 + }, + { + "completion_length": 219.62054634094238, + "epoch": 2.247118487782388, + "grad_norm": 0.1643623346466658, + "kl": 0.149444580078125, + "learning_rate": 4.937336939125299e-07, + "loss": 0.0001, + "reward": 1.7571429088711739, + "reward_std": 0.030304577201604843, + "rewards/equation_reward_func": 0.7660714592784643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13402 + }, + { + "completion_length": 224.09822368621826, + "epoch": 2.247453791022256, + "grad_norm": 0.1959789257161625, + "kl": 0.298797607421875, + "learning_rate": 4.937310246857628e-07, + "loss": 0.0003, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 13404 + }, + { + "completion_length": 221.84375953674316, + "epoch": 2.2477890942621235, + "grad_norm": 0.2076420808661432, + "kl": 0.147674560546875, + "learning_rate": 4.937283548978365e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13406 + }, + { + "completion_length": 223.008939743042, + "epoch": 2.2481243975019907, + "grad_norm": 0.14167261600573977, + "kl": 0.41143798828125, + "learning_rate": 4.937256845487572e-07, + "loss": 0.0004, + "reward": 1.7839286178350449, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7875000350177288, + "rewards/format_reward_func": 0.9964285716414452, + "step": 13408 + }, + { + "completion_length": 221.56250858306885, + "epoch": 2.2484597007418583, + "grad_norm": 0.2508994403319969, + "kl": 0.654296875, + "learning_rate": 4.93723013638531e-07, + "loss": 0.0007, + "reward": 1.725000075995922, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7339285966008902, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13410 + }, + { + "completion_length": 230.81251049041748, + "epoch": 2.248795003981726, + "grad_norm": 0.14106200207665437, + "kl": 0.626007080078125, + "learning_rate": 4.93720342167164e-07, + "loss": 0.0006, + "reward": 1.778571479022503, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 13412 + }, + { + "completion_length": 218.97322463989258, + "epoch": 2.2491303072215936, + "grad_norm": 0.36174404106793095, + "kl": 0.28924560546875, + "learning_rate": 4.937176701346623e-07, + "loss": 0.0003, + "reward": 1.778571493923664, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 13414 + }, + { + "completion_length": 226.5491189956665, + "epoch": 2.249465610461461, + "grad_norm": 0.23314871303389753, + "kl": 0.13134765625, + "learning_rate": 4.937149975410324e-07, + "loss": 0.0001, + "reward": 1.7053572461009026, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7098214589059353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13416 + }, + { + "completion_length": 213.50000953674316, + "epoch": 2.249800913701329, + "grad_norm": 0.27470988123417955, + "kl": 0.274169921875, + "learning_rate": 4.937123243862801e-07, + "loss": 0.0003, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 13418 + }, + { + "completion_length": 219.67858028411865, + "epoch": 2.250136216941196, + "grad_norm": 0.3176363415668159, + "kl": 0.127593994140625, + "learning_rate": 4.937096506704116e-07, + "loss": 0.0001, + "reward": 1.8089286163449287, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.8133928887546062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13420 + }, + { + "completion_length": 221.87501049041748, + "epoch": 2.2504715201810637, + "grad_norm": 0.224929363983825, + "kl": 0.194793701171875, + "learning_rate": 4.937069763934333e-07, + "loss": 0.0002, + "reward": 1.759375050663948, + "reward_std": 0.027147849323228, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 0.9986607171595097, + "step": 13422 + }, + { + "completion_length": 225.65179538726807, + "epoch": 2.2508068234209313, + "grad_norm": 0.018372357162259132, + "kl": 0.30120849609375, + "learning_rate": 4.937043015553511e-07, + "loss": 0.0003, + "reward": 1.7892857566475868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 13424 + }, + { + "completion_length": 226.90626049041748, + "epoch": 2.251142126660799, + "grad_norm": 0.23131515399617453, + "kl": 0.152099609375, + "learning_rate": 4.937016261561712e-07, + "loss": 0.0002, + "reward": 1.8017857670783997, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500264495611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13426 + }, + { + "completion_length": 233.4910831451416, + "epoch": 2.2514774299006666, + "grad_norm": 0.2435184314392744, + "kl": 0.39764404296875, + "learning_rate": 4.936989501958997e-07, + "loss": 0.0004, + "reward": 1.7392857670783997, + "reward_std": 0.0656599160283804, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 0.9821428656578064, + "step": 13428 + }, + { + "completion_length": 229.040189743042, + "epoch": 2.251812733140534, + "grad_norm": 0.14798224341983937, + "kl": 0.17626953125, + "learning_rate": 4.93696273674543e-07, + "loss": 0.0002, + "reward": 1.817857213318348, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.817857164889574, + "rewards/format_reward_func": 1.0, + "step": 13430 + }, + { + "completion_length": 234.95536708831787, + "epoch": 2.2521480363804014, + "grad_norm": 0.2096833069883818, + "kl": 0.19940185546875, + "learning_rate": 4.93693596592107e-07, + "loss": 0.0002, + "reward": 1.725000061094761, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000182539225, + "rewards/format_reward_func": 1.0, + "step": 13432 + }, + { + "completion_length": 219.80358028411865, + "epoch": 2.252483339620269, + "grad_norm": 0.2303255478167584, + "kl": 0.118499755859375, + "learning_rate": 4.936909189485981e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 13434 + }, + { + "completion_length": 228.52679443359375, + "epoch": 2.2528186428601367, + "grad_norm": 0.15056984969771808, + "kl": 0.13458251953125, + "learning_rate": 4.936882407440225e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 13436 + }, + { + "completion_length": 234.61608219146729, + "epoch": 2.2531539461000043, + "grad_norm": 0.23384952531418834, + "kl": 0.36761474609375, + "learning_rate": 4.936855619783859e-07, + "loss": 0.0004, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 13438 + }, + { + "completion_length": 242.87054824829102, + "epoch": 2.253489249339872, + "grad_norm": 0.2586868516066045, + "kl": 0.386260986328125, + "learning_rate": 4.93682882651695e-07, + "loss": 0.0004, + "reward": 1.7625000551342964, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13440 + }, + { + "completion_length": 235.42411708831787, + "epoch": 2.253824552579739, + "grad_norm": 0.15906009822520661, + "kl": 0.144134521484375, + "learning_rate": 4.936802027639557e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 13442 + }, + { + "completion_length": 228.6562614440918, + "epoch": 2.2541598558196068, + "grad_norm": 0.18259206308634707, + "kl": 0.108551025390625, + "learning_rate": 4.936775223151742e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.741071455180645, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13444 + }, + { + "completion_length": 251.96430015563965, + "epoch": 2.2544951590594744, + "grad_norm": 0.0743592992559579, + "kl": 0.2841796875, + "learning_rate": 4.936748413053567e-07, + "loss": 0.0003, + "reward": 1.7517857775092125, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500312924385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13446 + }, + { + "completion_length": 230.42411708831787, + "epoch": 2.254830462299342, + "grad_norm": 0.1739434056610246, + "kl": 0.193389892578125, + "learning_rate": 4.936721597345093e-07, + "loss": 0.0002, + "reward": 1.8107143342494965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 13448 + }, + { + "completion_length": 242.21876049041748, + "epoch": 2.2551657655392097, + "grad_norm": 0.25470983698517347, + "kl": 0.884552001953125, + "learning_rate": 4.936694776026384e-07, + "loss": 0.0009, + "reward": 1.6821429207921028, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.6821429040282965, + "rewards/format_reward_func": 1.0, + "step": 13450 + }, + { + "completion_length": 236.9955472946167, + "epoch": 2.2555010687790773, + "grad_norm": 0.19159469148946703, + "kl": 0.196136474609375, + "learning_rate": 4.9366679490975e-07, + "loss": 0.0002, + "reward": 1.7839286476373672, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13452 + }, + { + "completion_length": 243.32143878936768, + "epoch": 2.2558363720189445, + "grad_norm": 0.2567005414121196, + "kl": 0.144256591796875, + "learning_rate": 4.936641116558502e-07, + "loss": 0.0001, + "reward": 1.7821429297327995, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 13454 + }, + { + "completion_length": 237.25893783569336, + "epoch": 2.256171675258812, + "grad_norm": 0.30205311916900046, + "kl": 0.12261962890625, + "learning_rate": 4.936614278409452e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7669643051922321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13456 + }, + { + "completion_length": 240.8928680419922, + "epoch": 2.2565069784986798, + "grad_norm": 0.1936254461303534, + "kl": 0.12506103515625, + "learning_rate": 4.936587434650414e-07, + "loss": 0.0001, + "reward": 1.819642886519432, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8241071626543999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13458 + }, + { + "completion_length": 233.42858219146729, + "epoch": 2.2568422817385474, + "grad_norm": 0.06795896946491352, + "kl": 0.112396240234375, + "learning_rate": 4.936560585281447e-07, + "loss": 0.0001, + "reward": 1.7732143253087997, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13460 + }, + { + "completion_length": 229.0044755935669, + "epoch": 2.257177584978415, + "grad_norm": 0.24394781620102302, + "kl": 0.173187255859375, + "learning_rate": 4.936533730302615e-07, + "loss": 0.0002, + "reward": 1.7535714879631996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 13462 + }, + { + "completion_length": 226.38393783569336, + "epoch": 2.257512888218282, + "grad_norm": 0.26312285892319304, + "kl": 0.642669677734375, + "learning_rate": 4.936506869713979e-07, + "loss": 0.0006, + "reward": 1.7696429193019867, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13464 + }, + { + "completion_length": 228.5000114440918, + "epoch": 2.25784819145815, + "grad_norm": 0.327693517526902, + "kl": 0.1544189453125, + "learning_rate": 4.9364800035156e-07, + "loss": 0.0002, + "reward": 1.7607143744826317, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607142981141806, + "rewards/format_reward_func": 1.0, + "step": 13466 + }, + { + "completion_length": 228.26340198516846, + "epoch": 2.2581834946980175, + "grad_norm": 0.2079338402791647, + "kl": 0.130828857421875, + "learning_rate": 4.936453131707542e-07, + "loss": 0.0001, + "reward": 1.7214286625385284, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286141097546, + "rewards/format_reward_func": 1.0, + "step": 13468 + }, + { + "completion_length": 235.0491180419922, + "epoch": 2.258518797937885, + "grad_norm": 0.09090744850758828, + "kl": 0.2340087890625, + "learning_rate": 4.936426254289865e-07, + "loss": 0.0002, + "reward": 1.7696429416537285, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7741071712225676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13470 + }, + { + "completion_length": 223.758939743042, + "epoch": 2.2588541011777528, + "grad_norm": 0.24446218583199866, + "kl": 0.10858154296875, + "learning_rate": 4.936399371262631e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 13472 + }, + { + "completion_length": 231.61608028411865, + "epoch": 2.2591894044176204, + "grad_norm": 0.37153122030690916, + "kl": 0.2354736328125, + "learning_rate": 4.936372482625902e-07, + "loss": 0.0002, + "reward": 1.7696429044008255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071693599224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13474 + }, + { + "completion_length": 224.7991180419922, + "epoch": 2.2595247076574876, + "grad_norm": 0.27591459797161505, + "kl": 0.130615234375, + "learning_rate": 4.936345588379742e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 13476 + }, + { + "completion_length": 218.008939743042, + "epoch": 2.259860010897355, + "grad_norm": 0.19964699963299606, + "kl": 0.11248779296875, + "learning_rate": 4.936318688524209e-07, + "loss": 0.0001, + "reward": 1.7785715088248253, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 13478 + }, + { + "completion_length": 222.89286613464355, + "epoch": 2.260195314137223, + "grad_norm": 0.19133067031787318, + "kl": 0.102752685546875, + "learning_rate": 4.936291783059367e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 13480 + }, + { + "completion_length": 225.29465293884277, + "epoch": 2.2605306173770905, + "grad_norm": 0.11899468868728356, + "kl": 0.1959228515625, + "learning_rate": 4.93626487198528e-07, + "loss": 0.0002, + "reward": 1.803571492433548, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714458674192, + "rewards/format_reward_func": 1.0, + "step": 13482 + }, + { + "completion_length": 219.39733123779297, + "epoch": 2.260865920616958, + "grad_norm": 0.21464143001614744, + "kl": 0.13031005859375, + "learning_rate": 4.936237955302006e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 13484 + }, + { + "completion_length": 207.0759048461914, + "epoch": 2.2612012238568253, + "grad_norm": 0.25222121509954876, + "kl": 0.114593505859375, + "learning_rate": 4.936211033009611e-07, + "loss": 0.0001, + "reward": 1.7464286088943481, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 13486 + }, + { + "completion_length": 217.8616180419922, + "epoch": 2.261536527096693, + "grad_norm": 0.15743116447756114, + "kl": 0.11029052734375, + "learning_rate": 4.936184105108153e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 13488 + }, + { + "completion_length": 211.71429538726807, + "epoch": 2.2618718303365606, + "grad_norm": 0.33981219507890775, + "kl": 0.143707275390625, + "learning_rate": 4.936157171597697e-07, + "loss": 0.0001, + "reward": 1.8142857775092125, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857253551483, + "rewards/format_reward_func": 1.0, + "step": 13490 + }, + { + "completion_length": 222.19197273254395, + "epoch": 2.262207133576428, + "grad_norm": 0.20660793689804108, + "kl": 0.121429443359375, + "learning_rate": 4.936130232478303e-07, + "loss": 0.0001, + "reward": 1.7678571864962578, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571920841932, + "rewards/format_reward_func": 1.0, + "step": 13492 + }, + { + "completion_length": 216.99108028411865, + "epoch": 2.262542436816296, + "grad_norm": 0.18929196634838816, + "kl": 0.161285400390625, + "learning_rate": 4.936103287750035e-07, + "loss": 0.0002, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 13494 + }, + { + "completion_length": 211.60715103149414, + "epoch": 2.2628777400561635, + "grad_norm": 0.24113602668906872, + "kl": 0.107635498046875, + "learning_rate": 4.936076337412954e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7848214395344257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13496 + }, + { + "completion_length": 212.38393783569336, + "epoch": 2.2632130432960307, + "grad_norm": 0.13588768404975396, + "kl": 0.2230224609375, + "learning_rate": 4.936049381467121e-07, + "loss": 0.0002, + "reward": 1.807142898440361, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 13498 + }, + { + "completion_length": 215.57590198516846, + "epoch": 2.2635483465358983, + "grad_norm": 0.17110466396901336, + "kl": 0.12445068359375, + "learning_rate": 4.9360224199126e-07, + "loss": 0.0001, + "reward": 1.832142896950245, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8321428708732128, + "rewards/format_reward_func": 1.0, + "step": 13500 + }, + { + "completion_length": 217.27233028411865, + "epoch": 2.263883649775766, + "grad_norm": 0.2187412522132426, + "kl": 0.113922119140625, + "learning_rate": 4.935995452749452e-07, + "loss": 0.0001, + "reward": 1.7178572043776512, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7258928902447224, + "rewards/format_reward_func": 0.9919642955064774, + "step": 13502 + }, + { + "completion_length": 215.1205472946167, + "epoch": 2.2642189530156336, + "grad_norm": 0.15344155196626352, + "kl": 0.102447509765625, + "learning_rate": 4.935968479977738e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714477300644, + "rewards/format_reward_func": 1.0, + "step": 13504 + }, + { + "completion_length": 214.22768688201904, + "epoch": 2.264554256255501, + "grad_norm": 0.06149945562538831, + "kl": 0.119384765625, + "learning_rate": 4.935941501597523e-07, + "loss": 0.0001, + "reward": 1.7357143834233284, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.735714316368103, + "rewards/format_reward_func": 1.0, + "step": 13506 + }, + { + "completion_length": 211.74554538726807, + "epoch": 2.2648895594953684, + "grad_norm": 0.06281976986005453, + "kl": 0.101837158203125, + "learning_rate": 4.935914517608867e-07, + "loss": 0.0001, + "reward": 1.8107143193483353, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 13508 + }, + { + "completion_length": 216.7410831451416, + "epoch": 2.265224862735236, + "grad_norm": 0.18027577339039147, + "kl": 0.097869873046875, + "learning_rate": 4.935887528011833e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 13510 + }, + { + "completion_length": 220.33929538726807, + "epoch": 2.2655601659751037, + "grad_norm": 0.2485328935508096, + "kl": 0.113037109375, + "learning_rate": 4.935860532806482e-07, + "loss": 0.0001, + "reward": 1.8196429163217545, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071663796902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13512 + }, + { + "completion_length": 210.77679634094238, + "epoch": 2.2658954692149713, + "grad_norm": 0.1617038734571405, + "kl": 0.128143310546875, + "learning_rate": 4.935833531992877e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571604192257, + "rewards/format_reward_func": 1.0, + "step": 13514 + }, + { + "completion_length": 210.24554538726807, + "epoch": 2.266230772454839, + "grad_norm": 0.1882055746221909, + "kl": 0.122772216796875, + "learning_rate": 4.93580652557108e-07, + "loss": 0.0001, + "reward": 1.7178572118282318, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7178571783006191, + "rewards/format_reward_func": 1.0, + "step": 13516 + }, + { + "completion_length": 203.39732837677002, + "epoch": 2.2665660756947066, + "grad_norm": 0.05495894567000175, + "kl": 0.1014556884765625, + "learning_rate": 4.935779513541154e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 13518 + }, + { + "completion_length": 212.12947273254395, + "epoch": 2.2669013789345738, + "grad_norm": 0.1476861958550593, + "kl": 0.106475830078125, + "learning_rate": 4.93575249590316e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714484751225, + "rewards/format_reward_func": 1.0, + "step": 13520 + }, + { + "completion_length": 212.41072273254395, + "epoch": 2.2672366821744414, + "grad_norm": 0.32315519728920805, + "kl": 0.111785888671875, + "learning_rate": 4.935725472657161e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 13522 + }, + { + "completion_length": 215.58036708831787, + "epoch": 2.267571985414309, + "grad_norm": 0.19584402312119173, + "kl": 0.115447998046875, + "learning_rate": 4.935698443803218e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 13524 + }, + { + "completion_length": 207.04911613464355, + "epoch": 2.2679072886541767, + "grad_norm": 0.14119223662442124, + "kl": 0.130584716796875, + "learning_rate": 4.935671409341394e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571730852127, + "rewards/format_reward_func": 1.0, + "step": 13526 + }, + { + "completion_length": 209.13840007781982, + "epoch": 2.2682425918940443, + "grad_norm": 0.18142706946143367, + "kl": 0.10601806640625, + "learning_rate": 4.935644369271753e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 13528 + }, + { + "completion_length": 217.67858123779297, + "epoch": 2.2685778951339115, + "grad_norm": 0.3059452585174862, + "kl": 0.129547119140625, + "learning_rate": 4.935617323594355e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 13530 + }, + { + "completion_length": 206.62947463989258, + "epoch": 2.268913198373779, + "grad_norm": 0.2880253369534212, + "kl": 0.128936767578125, + "learning_rate": 4.935590272309261e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 13532 + }, + { + "completion_length": 208.76786708831787, + "epoch": 2.2692485016136468, + "grad_norm": 0.2830816012342474, + "kl": 0.1079864501953125, + "learning_rate": 4.935563215416537e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714544355869, + "rewards/format_reward_func": 1.0, + "step": 13534 + }, + { + "completion_length": 216.34376049041748, + "epoch": 2.2695838048535144, + "grad_norm": 0.1421908094574526, + "kl": 0.144805908203125, + "learning_rate": 4.935536152916243e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714380443096, + "rewards/format_reward_func": 1.0, + "step": 13536 + }, + { + "completion_length": 205.62500762939453, + "epoch": 2.269919108093382, + "grad_norm": 0.08285467355914013, + "kl": 0.24676513671875, + "learning_rate": 4.935509084808441e-07, + "loss": 0.0002, + "reward": 1.7821429073810577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 13538 + }, + { + "completion_length": 211.83483123779297, + "epoch": 2.2702544113332497, + "grad_norm": 0.13214472741476582, + "kl": 0.20220947265625, + "learning_rate": 4.935482011093195e-07, + "loss": 0.0002, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 13540 + }, + { + "completion_length": 207.53572463989258, + "epoch": 2.270589714573117, + "grad_norm": 0.08028323710669062, + "kl": 0.10394287109375, + "learning_rate": 4.935454931770567e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 13542 + }, + { + "completion_length": 206.92858028411865, + "epoch": 2.2709250178129845, + "grad_norm": 0.18685389674187627, + "kl": 0.113983154296875, + "learning_rate": 4.935427846840617e-07, + "loss": 0.0001, + "reward": 1.7678572237491608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 13544 + }, + { + "completion_length": 202.05358028411865, + "epoch": 2.271260321052852, + "grad_norm": 0.3338371078997882, + "kl": 0.1292724609375, + "learning_rate": 4.935400756303411e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 13546 + }, + { + "completion_length": 211.33929538726807, + "epoch": 2.2715956242927198, + "grad_norm": 0.18042370161512275, + "kl": 0.106903076171875, + "learning_rate": 4.935373660159008e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 13548 + }, + { + "completion_length": 212.53572463989258, + "epoch": 2.2719309275325874, + "grad_norm": 0.15448487921955562, + "kl": 0.162017822265625, + "learning_rate": 4.935346558407472e-07, + "loss": 0.0002, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 13550 + }, + { + "completion_length": 207.77233028411865, + "epoch": 2.2722662307724546, + "grad_norm": 0.1929360171540079, + "kl": 0.11138916015625, + "learning_rate": 4.935319451048866e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 13552 + }, + { + "completion_length": 207.915189743042, + "epoch": 2.2726015340123222, + "grad_norm": 0.22371807794920298, + "kl": 0.11895751953125, + "learning_rate": 4.935292338083251e-07, + "loss": 0.0001, + "reward": 1.850000038743019, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8500000275671482, + "rewards/format_reward_func": 1.0, + "step": 13554 + }, + { + "completion_length": 215.48215293884277, + "epoch": 2.27293683725219, + "grad_norm": 0.2786729445248081, + "kl": 0.112701416015625, + "learning_rate": 4.93526521951069e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000469386578, + "rewards/format_reward_func": 1.0, + "step": 13556 + }, + { + "completion_length": 217.72768592834473, + "epoch": 2.2732721404920575, + "grad_norm": 0.1350406475402067, + "kl": 0.15032958984375, + "learning_rate": 4.935238095331246e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 13558 + }, + { + "completion_length": 208.49107933044434, + "epoch": 2.273607443731925, + "grad_norm": 0.3304221891155211, + "kl": 0.2548828125, + "learning_rate": 4.935210965544981e-07, + "loss": 0.0003, + "reward": 1.7892857789993286, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 13560 + }, + { + "completion_length": 215.14733028411865, + "epoch": 2.2739427469717928, + "grad_norm": 0.2805466392633232, + "kl": 0.131866455078125, + "learning_rate": 4.935183830151958e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 13562 + }, + { + "completion_length": 203.00000858306885, + "epoch": 2.2742780502116604, + "grad_norm": 0.2473474152110955, + "kl": 0.250518798828125, + "learning_rate": 4.935156689152238e-07, + "loss": 0.0002, + "reward": 1.8392857313156128, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8392857350409031, + "rewards/format_reward_func": 1.0, + "step": 13564 + }, + { + "completion_length": 219.06697273254395, + "epoch": 2.2746133534515276, + "grad_norm": 0.5644412380449078, + "kl": 0.20465087890625, + "learning_rate": 4.935129542545885e-07, + "loss": 0.0002, + "reward": 1.809821479022503, + "reward_std": 0.04671955434605479, + "rewards/equation_reward_func": 0.8160714544355869, + "rewards/format_reward_func": 0.9937500059604645, + "step": 13566 + }, + { + "completion_length": 213.90179538726807, + "epoch": 2.2749486566913952, + "grad_norm": 0.11572995366572475, + "kl": 0.11407470703125, + "learning_rate": 4.93510239033296e-07, + "loss": 0.0001, + "reward": 1.7428572252392769, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 13568 + }, + { + "completion_length": 216.8973331451416, + "epoch": 2.275283959931263, + "grad_norm": 0.1595908587723372, + "kl": 0.116973876953125, + "learning_rate": 4.935075232513528e-07, + "loss": 0.0001, + "reward": 1.8285714760422707, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8375000096857548, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13570 + }, + { + "completion_length": 217.70983123779297, + "epoch": 2.2756192631711305, + "grad_norm": 0.49599002369885, + "kl": 0.1279296875, + "learning_rate": 4.935048069087648e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857365310192, + "rewards/format_reward_func": 1.0, + "step": 13572 + }, + { + "completion_length": 224.6071548461914, + "epoch": 2.275954566410998, + "grad_norm": 0.21004672028786117, + "kl": 0.272674560546875, + "learning_rate": 4.935020900055386e-07, + "loss": 0.0003, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13574 + }, + { + "completion_length": 224.84822463989258, + "epoch": 2.2762898696508653, + "grad_norm": 0.29430683920252254, + "kl": 0.14007568359375, + "learning_rate": 4.934993725416803e-07, + "loss": 0.0001, + "reward": 1.7607143223285675, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428783237934, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13576 + }, + { + "completion_length": 224.74108409881592, + "epoch": 2.276625172890733, + "grad_norm": 0.1017204526352656, + "kl": 0.1552734375, + "learning_rate": 4.93496654517196e-07, + "loss": 0.0002, + "reward": 1.7535714954137802, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7625000216066837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13578 + }, + { + "completion_length": 225.508939743042, + "epoch": 2.2769604761306006, + "grad_norm": 0.09881116987726597, + "kl": 0.22412109375, + "learning_rate": 4.934939359320924e-07, + "loss": 0.0002, + "reward": 1.8071429207921028, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8160714544355869, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13580 + }, + { + "completion_length": 218.7098331451416, + "epoch": 2.2772957793704682, + "grad_norm": 0.1220410963795954, + "kl": 0.153106689453125, + "learning_rate": 4.934912167863752e-07, + "loss": 0.0002, + "reward": 1.7642857804894447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 13582 + }, + { + "completion_length": 231.77679538726807, + "epoch": 2.277631082610336, + "grad_norm": 0.39462603179441164, + "kl": 0.32989501953125, + "learning_rate": 4.934884970800511e-07, + "loss": 0.0003, + "reward": 1.7303572073578835, + "reward_std": 0.05808377172797918, + "rewards/equation_reward_func": 0.7437500320374966, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13584 + }, + { + "completion_length": 236.2991180419922, + "epoch": 2.2779663858502035, + "grad_norm": 0.14974869337815552, + "kl": 0.371826171875, + "learning_rate": 4.934857768131261e-07, + "loss": 0.0004, + "reward": 1.7214286550879478, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7303571682423353, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13586 + }, + { + "completion_length": 225.37054538726807, + "epoch": 2.2783016890900707, + "grad_norm": 0.11756198998018, + "kl": 0.312469482421875, + "learning_rate": 4.934830559856067e-07, + "loss": 0.0003, + "reward": 1.735714390873909, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143219560385, + "rewards/format_reward_func": 1.0, + "step": 13588 + }, + { + "completion_length": 231.43751049041748, + "epoch": 2.2786369923299383, + "grad_norm": 0.22023421549487873, + "kl": 0.3299560546875, + "learning_rate": 4.934803345974989e-07, + "loss": 0.0003, + "reward": 1.8107143267989159, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8196428865194321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13590 + }, + { + "completion_length": 238.9687614440918, + "epoch": 2.278972295569806, + "grad_norm": 0.14824793990543636, + "kl": 0.301483154296875, + "learning_rate": 4.934776126488091e-07, + "loss": 0.0003, + "reward": 1.7500000521540642, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.7589285969734192, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13592 + }, + { + "completion_length": 244.1294755935669, + "epoch": 2.2793075988096736, + "grad_norm": 15.409615537389927, + "kl": 2.38897705078125, + "learning_rate": 4.934748901395436e-07, + "loss": 0.0024, + "reward": 1.7214286550879478, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7303571775555611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13594 + }, + { + "completion_length": 227.65179634094238, + "epoch": 2.2796429020495412, + "grad_norm": 0.24654334349817073, + "kl": 0.198028564453125, + "learning_rate": 4.934721670697087e-07, + "loss": 0.0002, + "reward": 1.8125000223517418, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.816964328289032, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13596 + }, + { + "completion_length": 235.83929920196533, + "epoch": 2.2799782052894084, + "grad_norm": 0.30922367505226184, + "kl": 0.369232177734375, + "learning_rate": 4.934694434393105e-07, + "loss": 0.0004, + "reward": 1.719642959535122, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7241071797907352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13598 + }, + { + "completion_length": 237.27679443359375, + "epoch": 2.280313508529276, + "grad_norm": 0.15791678668274603, + "kl": 0.239410400390625, + "learning_rate": 4.934667192483553e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571429014205933, + "rewards/format_reward_func": 1.0, + "step": 13600 + }, + { + "completion_length": 230.6384038925171, + "epoch": 2.2806488117691437, + "grad_norm": 0.2840192280306608, + "kl": 0.96710205078125, + "learning_rate": 4.934639944968496e-07, + "loss": 0.001, + "reward": 1.7285715341567993, + "reward_std": 0.0707106776535511, + "rewards/equation_reward_func": 0.7375000305473804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13602 + }, + { + "completion_length": 225.9732265472412, + "epoch": 2.2809841150090113, + "grad_norm": 0.19712667125561653, + "kl": 0.264129638671875, + "learning_rate": 4.934612691847994e-07, + "loss": 0.0003, + "reward": 1.7142857983708382, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7142857573926449, + "rewards/format_reward_func": 1.0, + "step": 13604 + }, + { + "completion_length": 228.01340293884277, + "epoch": 2.281319418248879, + "grad_norm": 0.514089876213886, + "kl": 0.3343505859375, + "learning_rate": 4.934585433122112e-07, + "loss": 0.0003, + "reward": 1.7428572177886963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 13606 + }, + { + "completion_length": 233.58483219146729, + "epoch": 2.2816547214887466, + "grad_norm": 0.2498944000336924, + "kl": 0.374603271484375, + "learning_rate": 4.934558168790912e-07, + "loss": 0.0004, + "reward": 1.7218750789761543, + "reward_std": 0.04987628129310906, + "rewards/equation_reward_func": 0.7276785895228386, + "rewards/format_reward_func": 0.9941964335739613, + "step": 13608 + }, + { + "completion_length": 234.68304920196533, + "epoch": 2.281990024728614, + "grad_norm": 0.4071963245632583, + "kl": 0.64990234375, + "learning_rate": 4.934530898854456e-07, + "loss": 0.0006, + "reward": 1.7714286297559738, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7803571596741676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13610 + }, + { + "completion_length": 239.5089406967163, + "epoch": 2.2823253279684814, + "grad_norm": 0.1957224771809233, + "kl": 0.480499267578125, + "learning_rate": 4.934503623312806e-07, + "loss": 0.0005, + "reward": 1.7428572103381157, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7517857477068901, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13612 + }, + { + "completion_length": 225.28126049041748, + "epoch": 2.282660631208349, + "grad_norm": 0.05916231324178276, + "kl": 0.1220703125, + "learning_rate": 4.934476342166026e-07, + "loss": 0.0001, + "reward": 1.8053571581840515, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.8098214715719223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13614 + }, + { + "completion_length": 228.18304634094238, + "epoch": 2.2829959344482167, + "grad_norm": 0.07388420505371393, + "kl": 0.17498779296875, + "learning_rate": 4.93444905541418e-07, + "loss": 0.0002, + "reward": 1.7553571984171867, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13616 + }, + { + "completion_length": 243.58036613464355, + "epoch": 2.2833312376880843, + "grad_norm": 0.20886338374905272, + "kl": 0.190582275390625, + "learning_rate": 4.93442176305733e-07, + "loss": 0.0002, + "reward": 1.7196429520845413, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13618 + }, + { + "completion_length": 244.8705472946167, + "epoch": 2.2836665409279515, + "grad_norm": 0.16759168247428288, + "kl": 0.393280029296875, + "learning_rate": 4.934394465095537e-07, + "loss": 0.0004, + "reward": 1.7375000715255737, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7508928924798965, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13620 + }, + { + "completion_length": 236.58482933044434, + "epoch": 2.284001844167819, + "grad_norm": 0.23261784632714255, + "kl": 0.28619384765625, + "learning_rate": 4.934367161528866e-07, + "loss": 0.0003, + "reward": 1.789285771548748, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7982142977416515, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13622 + }, + { + "completion_length": 242.65625953674316, + "epoch": 2.284337147407687, + "grad_norm": 0.07630423716878422, + "kl": 0.18927001953125, + "learning_rate": 4.93433985235738e-07, + "loss": 0.0002, + "reward": 1.8071429133415222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428723633289, + "rewards/format_reward_func": 1.0, + "step": 13624 + }, + { + "completion_length": 249.1651906967163, + "epoch": 2.2846724506475544, + "grad_norm": 0.13332822719857498, + "kl": 0.256378173828125, + "learning_rate": 4.934312537581141e-07, + "loss": 0.0003, + "reward": 1.7375000789761543, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7419643178582191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13626 + }, + { + "completion_length": 246.47769165039062, + "epoch": 2.285007753887422, + "grad_norm": 0.2380732736718792, + "kl": 0.46343994140625, + "learning_rate": 4.93428521720021e-07, + "loss": 0.0005, + "reward": 1.8089286237955093, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8133928813040257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13628 + }, + { + "completion_length": 257.6339387893677, + "epoch": 2.2853430571272897, + "grad_norm": 0.14169497889649182, + "kl": 0.411865234375, + "learning_rate": 4.934257891214653e-07, + "loss": 0.0004, + "reward": 1.7625000700354576, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.775892898440361, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13630 + }, + { + "completion_length": 248.64733505249023, + "epoch": 2.285678360367157, + "grad_norm": 0.13220287571089961, + "kl": 0.20489501953125, + "learning_rate": 4.934230559624533e-07, + "loss": 0.0002, + "reward": 1.7321429029107094, + "reward_std": 0.025253813713788986, + "rewards/equation_reward_func": 0.7410714533179998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13632 + }, + { + "completion_length": 255.30358409881592, + "epoch": 2.2860136636070245, + "grad_norm": 0.18408713303571822, + "kl": 0.283905029296875, + "learning_rate": 4.93420322242991e-07, + "loss": 0.0003, + "reward": 1.7517857775092125, + "reward_std": 0.08838834706693888, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.977678582072258, + "step": 13634 + }, + { + "completion_length": 252.05804824829102, + "epoch": 2.286348966846892, + "grad_norm": 0.164943595184681, + "kl": 0.294219970703125, + "learning_rate": 4.934175879630849e-07, + "loss": 0.0003, + "reward": 1.7214286774396896, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.730357188731432, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13636 + }, + { + "completion_length": 251.68751049041748, + "epoch": 2.28668427008676, + "grad_norm": 0.08371264433430739, + "kl": 0.379974365234375, + "learning_rate": 4.934148531227413e-07, + "loss": 0.0004, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13638 + }, + { + "completion_length": 238.352689743042, + "epoch": 2.2870195733266274, + "grad_norm": 0.22735480378235184, + "kl": 0.209197998046875, + "learning_rate": 4.934121177219664e-07, + "loss": 0.0002, + "reward": 1.8339286148548126, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8383928835391998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13640 + }, + { + "completion_length": 238.80358505249023, + "epoch": 2.2873548765664946, + "grad_norm": 0.19841604872169602, + "kl": 0.206878662109375, + "learning_rate": 4.934093817607666e-07, + "loss": 0.0002, + "reward": 1.7000000849366188, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.7089286148548126, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13642 + }, + { + "completion_length": 244.9687614440918, + "epoch": 2.2876901798063622, + "grad_norm": 0.18402523268310608, + "kl": 0.249176025390625, + "learning_rate": 4.934066452391482e-07, + "loss": 0.0002, + "reward": 1.7446429282426834, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7491071783006191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13644 + }, + { + "completion_length": 248.65179634094238, + "epoch": 2.28802548304623, + "grad_norm": 0.2082230382842461, + "kl": 0.2586669921875, + "learning_rate": 4.934039081571174e-07, + "loss": 0.0003, + "reward": 1.7500000521540642, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 13646 + }, + { + "completion_length": 241.821439743042, + "epoch": 2.2883607862860975, + "grad_norm": 0.17611803208102136, + "kl": 0.144500732421875, + "learning_rate": 4.934011705146805e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 13648 + }, + { + "completion_length": 227.36161708831787, + "epoch": 2.288696089525965, + "grad_norm": 0.14735190264686204, + "kl": 0.245391845703125, + "learning_rate": 4.93398432311844e-07, + "loss": 0.0002, + "reward": 1.7785714864730835, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7875000275671482, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13650 + }, + { + "completion_length": 237.90179824829102, + "epoch": 2.289031392765833, + "grad_norm": 0.25570229818737517, + "kl": 0.161041259765625, + "learning_rate": 4.93395693548614e-07, + "loss": 0.0002, + "reward": 1.7625000551342964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13652 + }, + { + "completion_length": 238.3928689956665, + "epoch": 2.2893666960057, + "grad_norm": 0.1867942311968773, + "kl": 0.15655517578125, + "learning_rate": 4.933929542249968e-07, + "loss": 0.0002, + "reward": 1.7428572103381157, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7517857439815998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13654 + }, + { + "completion_length": 224.59376049041748, + "epoch": 2.2897019992455676, + "grad_norm": 0.21953006511119944, + "kl": 0.207977294921875, + "learning_rate": 4.933902143409988e-07, + "loss": 0.0002, + "reward": 1.7705357745289803, + "reward_std": 0.06187184248119593, + "rewards/equation_reward_func": 0.7767857573926449, + "rewards/format_reward_func": 0.9937500059604645, + "step": 13656 + }, + { + "completion_length": 247.86161994934082, + "epoch": 2.2900373024854352, + "grad_norm": 0.3480181787400742, + "kl": 0.210174560546875, + "learning_rate": 4.933874738966264e-07, + "loss": 0.0002, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13658 + }, + { + "completion_length": 233.80358123779297, + "epoch": 2.290372605725303, + "grad_norm": 0.10373686583389505, + "kl": 0.1361083984375, + "learning_rate": 4.933847328918857e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13660 + }, + { + "completion_length": 243.1384048461914, + "epoch": 2.2907079089651705, + "grad_norm": 0.10525744507139827, + "kl": 0.15374755859375, + "learning_rate": 4.933819913267831e-07, + "loss": 0.0002, + "reward": 1.7750000730156898, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 13662 + }, + { + "completion_length": 246.4732265472412, + "epoch": 2.2910432122050377, + "grad_norm": 0.1968361967529215, + "kl": 0.165618896484375, + "learning_rate": 4.933792492013249e-07, + "loss": 0.0002, + "reward": 1.7785714715719223, + "reward_std": 0.07071067672222853, + "rewards/equation_reward_func": 0.7875000163912773, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13664 + }, + { + "completion_length": 235.36161708831787, + "epoch": 2.2913785154449053, + "grad_norm": 0.08932347581233609, + "kl": 0.140716552734375, + "learning_rate": 4.933765065155175e-07, + "loss": 0.0001, + "reward": 1.7839286103844643, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13666 + }, + { + "completion_length": 229.04018878936768, + "epoch": 2.291713818684773, + "grad_norm": 0.0748245735877684, + "kl": 0.135772705078125, + "learning_rate": 4.933737632693671e-07, + "loss": 0.0001, + "reward": 1.8392857313156128, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.839285746216774, + "rewards/format_reward_func": 1.0, + "step": 13668 + }, + { + "completion_length": 231.80358123779297, + "epoch": 2.2920491219246406, + "grad_norm": 0.09018754149817187, + "kl": 0.127655029296875, + "learning_rate": 4.9337101946288e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 13670 + }, + { + "completion_length": 236.7142972946167, + "epoch": 2.2923844251645082, + "grad_norm": 0.400827240902266, + "kl": 0.14300537109375, + "learning_rate": 4.933682750960627e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000264495611, + "rewards/format_reward_func": 1.0, + "step": 13672 + }, + { + "completion_length": 231.1741180419922, + "epoch": 2.292719728404376, + "grad_norm": 0.3008411517499098, + "kl": 0.14361572265625, + "learning_rate": 4.933655301689214e-07, + "loss": 0.0001, + "reward": 1.7446429133415222, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7580357510596514, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13674 + }, + { + "completion_length": 239.4687623977661, + "epoch": 2.293055031644243, + "grad_norm": 0.24453466502782206, + "kl": 0.164886474609375, + "learning_rate": 4.933627846814623e-07, + "loss": 0.0002, + "reward": 1.669642947614193, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.674107177183032, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13676 + }, + { + "completion_length": 240.4553689956665, + "epoch": 2.2933903348841107, + "grad_norm": 0.08003393432284185, + "kl": 0.157135009765625, + "learning_rate": 4.933600386336919e-07, + "loss": 0.0002, + "reward": 1.7500000670552254, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 13678 + }, + { + "completion_length": 244.01340293884277, + "epoch": 2.2937256381239783, + "grad_norm": 0.22095536044182848, + "kl": 0.130096435546875, + "learning_rate": 4.933572920256165e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13680 + }, + { + "completion_length": 248.5803680419922, + "epoch": 2.294060941363846, + "grad_norm": 0.1975708507074759, + "kl": 0.145477294921875, + "learning_rate": 4.933545448572422e-07, + "loss": 0.0001, + "reward": 1.7325893640518188, + "reward_std": 0.04482551896944642, + "rewards/equation_reward_func": 0.7473214603960514, + "rewards/format_reward_func": 0.9852678664028645, + "step": 13682 + }, + { + "completion_length": 235.977689743042, + "epoch": 2.2943962446037136, + "grad_norm": 0.0816102094427138, + "kl": 0.14111328125, + "learning_rate": 4.933517971285758e-07, + "loss": 0.0001, + "reward": 1.8071428909897804, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428760886192, + "rewards/format_reward_func": 1.0, + "step": 13684 + }, + { + "completion_length": 227.9375114440918, + "epoch": 2.294731547843581, + "grad_norm": 0.3697824132658964, + "kl": 0.158905029296875, + "learning_rate": 4.933490488396232e-07, + "loss": 0.0002, + "reward": 1.8000000640749931, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8000000305473804, + "rewards/format_reward_func": 1.0, + "step": 13686 + }, + { + "completion_length": 222.3794708251953, + "epoch": 2.2950668510834484, + "grad_norm": 0.3124872607707278, + "kl": 0.131500244140625, + "learning_rate": 4.933462999903908e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.8026786036789417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13688 + }, + { + "completion_length": 216.25447177886963, + "epoch": 2.295402154323316, + "grad_norm": 0.4198675461813113, + "kl": 0.137725830078125, + "learning_rate": 4.93343550580885e-07, + "loss": 0.0001, + "reward": 1.8089286386966705, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.8133928924798965, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13690 + }, + { + "completion_length": 219.85715293884277, + "epoch": 2.2957374575631837, + "grad_norm": 0.09580375324629241, + "kl": 0.1383056640625, + "learning_rate": 4.933408006111121e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 13692 + }, + { + "completion_length": 222.63393878936768, + "epoch": 2.2960727608030513, + "grad_norm": 0.10388410768653196, + "kl": 0.134613037109375, + "learning_rate": 4.933380500810785e-07, + "loss": 0.0001, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.759821455925703, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13694 + }, + { + "completion_length": 211.35715293884277, + "epoch": 2.296408064042919, + "grad_norm": 0.13906267429864502, + "kl": 0.145172119140625, + "learning_rate": 4.933352989907906e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 13696 + }, + { + "completion_length": 219.80358219146729, + "epoch": 2.2967433672827866, + "grad_norm": 0.13736673153418766, + "kl": 0.1318359375, + "learning_rate": 4.933325473402545e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.769642885774374, + "rewards/format_reward_func": 0.9964285716414452, + "step": 13698 + }, + { + "completion_length": 223.89733028411865, + "epoch": 2.297078670522654, + "grad_norm": 0.20006064325335238, + "kl": 0.117767333984375, + "learning_rate": 4.933297951294767e-07, + "loss": 0.0001, + "reward": 1.7392857745289803, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857596278191, + "rewards/format_reward_func": 1.0, + "step": 13700 + }, + { + "completion_length": 218.90179634094238, + "epoch": 2.2974139737625214, + "grad_norm": 0.06124106879307074, + "kl": 0.128936767578125, + "learning_rate": 4.933270423584634e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 13702 + }, + { + "completion_length": 216.00000858306885, + "epoch": 2.297749277002389, + "grad_norm": 0.12457756050918946, + "kl": 0.126312255859375, + "learning_rate": 4.933242890272211e-07, + "loss": 0.0001, + "reward": 1.7982143238186836, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13704 + }, + { + "completion_length": 215.63840103149414, + "epoch": 2.2980845802422567, + "grad_norm": 0.19463971761404908, + "kl": 0.1267242431640625, + "learning_rate": 4.933215351357561e-07, + "loss": 0.0001, + "reward": 1.7107143625617027, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7107143271714449, + "rewards/format_reward_func": 1.0, + "step": 13706 + }, + { + "completion_length": 222.50001049041748, + "epoch": 2.2984198834821243, + "grad_norm": 0.1579762789545742, + "kl": 0.125274658203125, + "learning_rate": 4.933187806840747e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214286044239998, + "rewards/format_reward_func": 1.0, + "step": 13708 + }, + { + "completion_length": 230.0044755935669, + "epoch": 2.2987551867219915, + "grad_norm": 0.2105210773896155, + "kl": 0.15521240234375, + "learning_rate": 4.933160256721831e-07, + "loss": 0.0002, + "reward": 1.7339286357164383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13710 + }, + { + "completion_length": 213.31251049041748, + "epoch": 2.299090489961859, + "grad_norm": 0.27241427140351204, + "kl": 0.122161865234375, + "learning_rate": 4.933132701000879e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 13712 + }, + { + "completion_length": 219.40626049041748, + "epoch": 2.299425793201727, + "grad_norm": 0.259448491171959, + "kl": 0.1417236328125, + "learning_rate": 4.933105139677954e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.775000037625432, + "rewards/format_reward_func": 1.0, + "step": 13714 + }, + { + "completion_length": 219.7366180419922, + "epoch": 2.2997610964415944, + "grad_norm": 0.25233047889996035, + "kl": 0.137237548828125, + "learning_rate": 4.933077572753118e-07, + "loss": 0.0001, + "reward": 1.7107143849134445, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7107143346220255, + "rewards/format_reward_func": 1.0, + "step": 13716 + }, + { + "completion_length": 221.102689743042, + "epoch": 2.300096399681462, + "grad_norm": 0.3017274371156937, + "kl": 0.140655517578125, + "learning_rate": 4.933050000226437e-07, + "loss": 0.0001, + "reward": 1.7250000685453415, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7250000294297934, + "rewards/format_reward_func": 1.0, + "step": 13718 + }, + { + "completion_length": 209.57143783569336, + "epoch": 2.3004317029213297, + "grad_norm": 0.11165261758553362, + "kl": 0.136688232421875, + "learning_rate": 4.93302242209797e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 13720 + }, + { + "completion_length": 216.10269165039062, + "epoch": 2.300767006161197, + "grad_norm": 0.29489920734681696, + "kl": 0.1304931640625, + "learning_rate": 4.932994838367786e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 13722 + }, + { + "completion_length": 220.83929538726807, + "epoch": 2.3011023094010645, + "grad_norm": 0.0707173188716113, + "kl": 0.13385009765625, + "learning_rate": 4.932967249035944e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7392857559025288, + "rewards/format_reward_func": 1.0, + "step": 13724 + }, + { + "completion_length": 221.10268878936768, + "epoch": 2.301437612640932, + "grad_norm": 0.3062096065804645, + "kl": 0.14453125, + "learning_rate": 4.93293965410251e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 13726 + }, + { + "completion_length": 226.51786613464355, + "epoch": 2.3017729158808, + "grad_norm": 0.23534533660186088, + "kl": 0.1221923828125, + "learning_rate": 4.932912053567546e-07, + "loss": 0.0001, + "reward": 1.807142898440361, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8071428909897804, + "rewards/format_reward_func": 1.0, + "step": 13728 + }, + { + "completion_length": 213.85715293884277, + "epoch": 2.3021082191206674, + "grad_norm": 0.0034166384522816224, + "kl": 0.1214599609375, + "learning_rate": 4.932884447431118e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 13730 + }, + { + "completion_length": 220.40179538726807, + "epoch": 2.3024435223605346, + "grad_norm": 0.15621227615878694, + "kl": 0.13421630859375, + "learning_rate": 4.932856835693287e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 13732 + }, + { + "completion_length": 218.62501049041748, + "epoch": 2.3027788256004023, + "grad_norm": 0.15452761746025206, + "kl": 0.120635986328125, + "learning_rate": 4.932829218354117e-07, + "loss": 0.0001, + "reward": 1.8125000223517418, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8169643133878708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13734 + }, + { + "completion_length": 225.28572368621826, + "epoch": 2.30311412884027, + "grad_norm": 0.2775790745195352, + "kl": 0.13287353515625, + "learning_rate": 4.932801595413673e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 13736 + }, + { + "completion_length": 225.00893688201904, + "epoch": 2.3034494320801375, + "grad_norm": 0.10617741628463849, + "kl": 0.125, + "learning_rate": 4.932773966872017e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571842610836, + "rewards/format_reward_func": 1.0, + "step": 13738 + }, + { + "completion_length": 216.41965198516846, + "epoch": 2.303784735320005, + "grad_norm": 0.13860951964324789, + "kl": 0.115509033203125, + "learning_rate": 4.932746332729214e-07, + "loss": 0.0001, + "reward": 1.7803571820259094, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.784821443259716, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13740 + }, + { + "completion_length": 229.27679634094238, + "epoch": 2.304120038559873, + "grad_norm": 0.4581590275723739, + "kl": 0.1278839111328125, + "learning_rate": 4.932718692985327e-07, + "loss": 0.0001, + "reward": 1.7196429446339607, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071686148643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13742 + }, + { + "completion_length": 224.69197368621826, + "epoch": 2.30445534179974, + "grad_norm": 0.2416406933707828, + "kl": 0.1219482421875, + "learning_rate": 4.93269104764042e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 13744 + }, + { + "completion_length": 225.04911708831787, + "epoch": 2.3047906450396076, + "grad_norm": 0.31154448894666115, + "kl": 0.120697021484375, + "learning_rate": 4.932663396694554e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 13746 + }, + { + "completion_length": 222.9419755935669, + "epoch": 2.3051259482794753, + "grad_norm": 0.14378458200593694, + "kl": 0.121368408203125, + "learning_rate": 4.932635740147797e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 13748 + }, + { + "completion_length": 226.48661613464355, + "epoch": 2.305461251519343, + "grad_norm": 0.2690045565333607, + "kl": 0.12030029296875, + "learning_rate": 4.93260807800021e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 13750 + }, + { + "completion_length": 228.49107933044434, + "epoch": 2.3057965547592105, + "grad_norm": 0.3038352147540726, + "kl": 0.12451171875, + "learning_rate": 4.932580410251857e-07, + "loss": 0.0001, + "reward": 1.7392857894301414, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 1.0, + "step": 13752 + }, + { + "completion_length": 216.6071538925171, + "epoch": 2.3061318579990777, + "grad_norm": 0.19908074429539108, + "kl": 0.13238525390625, + "learning_rate": 4.932552736902803e-07, + "loss": 0.0001, + "reward": 1.7535714879631996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714525729418, + "rewards/format_reward_func": 1.0, + "step": 13754 + }, + { + "completion_length": 220.8794755935669, + "epoch": 2.3064671612389454, + "grad_norm": 0.2938801381346632, + "kl": 0.1182708740234375, + "learning_rate": 4.932525057953109e-07, + "loss": 0.0001, + "reward": 1.814285770058632, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 13756 + }, + { + "completion_length": 224.69643783569336, + "epoch": 2.306802464478813, + "grad_norm": 0.08695741183974194, + "kl": 0.1104888916015625, + "learning_rate": 4.932497373402842e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 13758 + }, + { + "completion_length": 223.71429538726807, + "epoch": 2.3071377677186806, + "grad_norm": 0.280091163735147, + "kl": 0.1106109619140625, + "learning_rate": 4.932469683252062e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 13760 + }, + { + "completion_length": 226.7723331451416, + "epoch": 2.3074730709585483, + "grad_norm": 0.36408205627825097, + "kl": 0.1078948974609375, + "learning_rate": 4.932441987500837e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 13762 + }, + { + "completion_length": 223.4241180419922, + "epoch": 2.307808374198416, + "grad_norm": 0.18035158937625873, + "kl": 0.13739013671875, + "learning_rate": 4.932414286149228e-07, + "loss": 0.0001, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13764 + }, + { + "completion_length": 220.99554634094238, + "epoch": 2.308143677438283, + "grad_norm": 0.12655154468340585, + "kl": 0.12335205078125, + "learning_rate": 4.9323865791973e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 13766 + }, + { + "completion_length": 221.41965293884277, + "epoch": 2.3084789806781507, + "grad_norm": 0.155988583871809, + "kl": 0.122344970703125, + "learning_rate": 4.932358866645116e-07, + "loss": 0.0001, + "reward": 1.8428571820259094, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8428571578115225, + "rewards/format_reward_func": 1.0, + "step": 13768 + }, + { + "completion_length": 230.5044765472412, + "epoch": 2.3088142839180184, + "grad_norm": 0.21690628723905842, + "kl": 0.1177978515625, + "learning_rate": 4.932331148492739e-07, + "loss": 0.0001, + "reward": 1.8178571909666061, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 13770 + }, + { + "completion_length": 230.7500114440918, + "epoch": 2.309149587157886, + "grad_norm": 0.22875686319240196, + "kl": 0.10528564453125, + "learning_rate": 4.932303424740234e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285887777805, + "rewards/format_reward_func": 1.0, + "step": 13772 + }, + { + "completion_length": 240.36608219146729, + "epoch": 2.3094848903977536, + "grad_norm": 0.13109625511493334, + "kl": 0.117034912109375, + "learning_rate": 4.932275695387666e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 13774 + }, + { + "completion_length": 236.40179920196533, + "epoch": 2.309820193637621, + "grad_norm": 0.15888328027894855, + "kl": 0.106597900390625, + "learning_rate": 4.932247960435096e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 13776 + }, + { + "completion_length": 234.8125114440918, + "epoch": 2.3101554968774884, + "grad_norm": 0.15421914534549414, + "kl": 0.1151123046875, + "learning_rate": 4.932220219882591e-07, + "loss": 0.0001, + "reward": 1.825000062584877, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000178813934, + "rewards/format_reward_func": 1.0, + "step": 13778 + }, + { + "completion_length": 236.29911613464355, + "epoch": 2.310490800117356, + "grad_norm": 0.09890678034383357, + "kl": 0.12774658203125, + "learning_rate": 4.932192473730213e-07, + "loss": 0.0001, + "reward": 1.7910714447498322, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7955357562750578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13780 + }, + { + "completion_length": 238.1875123977661, + "epoch": 2.3108261033572237, + "grad_norm": 0.16175457520796585, + "kl": 0.127197265625, + "learning_rate": 4.932164721978024e-07, + "loss": 0.0001, + "reward": 1.7017858028411865, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.706250024959445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13782 + }, + { + "completion_length": 231.01340198516846, + "epoch": 2.3111614065970914, + "grad_norm": 0.19291834599943525, + "kl": 0.1243896484375, + "learning_rate": 4.932136964626093e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000432133675, + "rewards/format_reward_func": 1.0, + "step": 13784 + }, + { + "completion_length": 247.75894165039062, + "epoch": 2.311496709836959, + "grad_norm": 0.1155973995569849, + "kl": 0.126373291015625, + "learning_rate": 4.932109201674478e-07, + "loss": 0.0001, + "reward": 1.7821428924798965, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 13786 + }, + { + "completion_length": 242.05358219146729, + "epoch": 2.311832013076826, + "grad_norm": 0.2314433409991325, + "kl": 0.156005859375, + "learning_rate": 4.932081433123248e-07, + "loss": 0.0002, + "reward": 1.7875000834465027, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919642943888903, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13788 + }, + { + "completion_length": 236.2946548461914, + "epoch": 2.312167316316694, + "grad_norm": 0.0022804056792775137, + "kl": 0.1068115234375, + "learning_rate": 4.932053658972464e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 13790 + }, + { + "completion_length": 229.5982255935669, + "epoch": 2.3125026195565614, + "grad_norm": 0.22285338579905276, + "kl": 0.125946044921875, + "learning_rate": 4.932025879222191e-07, + "loss": 0.0001, + "reward": 1.7392858117818832, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7392857372760773, + "rewards/format_reward_func": 1.0, + "step": 13792 + }, + { + "completion_length": 221.48661708831787, + "epoch": 2.312837922796429, + "grad_norm": 0.32382712126000757, + "kl": 0.119415283203125, + "learning_rate": 4.931998093872492e-07, + "loss": 0.0001, + "reward": 1.8285714760422707, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714574158192, + "rewards/format_reward_func": 1.0, + "step": 13794 + }, + { + "completion_length": 228.21875858306885, + "epoch": 2.3131732260362967, + "grad_norm": 0.19368082670345033, + "kl": 0.11956787109375, + "learning_rate": 4.931970302923433e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7214285992085934, + "rewards/format_reward_func": 1.0, + "step": 13796 + }, + { + "completion_length": 226.20090293884277, + "epoch": 2.313508529276164, + "grad_norm": 0.29231224698111935, + "kl": 0.12823486328125, + "learning_rate": 4.931942506375076e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 13798 + }, + { + "completion_length": 230.42411708831787, + "epoch": 2.3138438325160315, + "grad_norm": 0.18361270601123025, + "kl": 0.19329833984375, + "learning_rate": 4.931914704227486e-07, + "loss": 0.0002, + "reward": 1.7714286521077156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 13800 + }, + { + "completion_length": 228.7053680419922, + "epoch": 2.314179135755899, + "grad_norm": 0.20926399834142734, + "kl": 0.152862548828125, + "learning_rate": 4.931886896480726e-07, + "loss": 0.0002, + "reward": 1.7321429401636124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 13802 + }, + { + "completion_length": 225.32143783569336, + "epoch": 2.314514438995767, + "grad_norm": 0.24781507311502457, + "kl": 0.138458251953125, + "learning_rate": 4.931859083134861e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 13804 + }, + { + "completion_length": 229.91518783569336, + "epoch": 2.3148497422356344, + "grad_norm": 0.13919086617690282, + "kl": 0.14959716796875, + "learning_rate": 4.931831264189954e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714566707611, + "rewards/format_reward_func": 1.0, + "step": 13806 + }, + { + "completion_length": 223.39286994934082, + "epoch": 2.315185045475502, + "grad_norm": 0.2862038229346431, + "kl": 0.141693115234375, + "learning_rate": 4.931803439646071e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321429029107094, + "rewards/format_reward_func": 1.0, + "step": 13808 + }, + { + "completion_length": 229.4330472946167, + "epoch": 2.3155203487153693, + "grad_norm": 0.536633798934367, + "kl": 0.44781494140625, + "learning_rate": 4.931775609503274e-07, + "loss": 0.0004, + "reward": 1.7642857804894447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 13810 + }, + { + "completion_length": 224.14286708831787, + "epoch": 2.315855651955237, + "grad_norm": 0.17537454963307, + "kl": 0.14599609375, + "learning_rate": 4.931747773761628e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 13812 + }, + { + "completion_length": 230.0982255935669, + "epoch": 2.3161909551951045, + "grad_norm": 0.21195475663310653, + "kl": 0.161041259765625, + "learning_rate": 4.931719932421198e-07, + "loss": 0.0002, + "reward": 1.7678572088479996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 13814 + }, + { + "completion_length": 234.0491189956665, + "epoch": 2.316526258434972, + "grad_norm": 0.3517252391572919, + "kl": 0.397308349609375, + "learning_rate": 4.931692085482045e-07, + "loss": 0.0004, + "reward": 1.7522322237491608, + "reward_std": 0.04735089954920113, + "rewards/equation_reward_func": 0.753571480512619, + "rewards/format_reward_func": 0.9986607171595097, + "step": 13816 + }, + { + "completion_length": 218.95982933044434, + "epoch": 2.31686156167484, + "grad_norm": 0.21637620176464864, + "kl": 0.149993896484375, + "learning_rate": 4.931664232944237e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 13818 + }, + { + "completion_length": 233.7946538925171, + "epoch": 2.317196864914707, + "grad_norm": 0.1493142580467565, + "kl": 0.184417724609375, + "learning_rate": 4.931636374807836e-07, + "loss": 0.0002, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428958326578, + "rewards/format_reward_func": 1.0, + "step": 13820 + }, + { + "completion_length": 238.07590293884277, + "epoch": 2.3175321681545746, + "grad_norm": 0.23369268131692472, + "kl": 0.210357666015625, + "learning_rate": 4.931608511072906e-07, + "loss": 0.0002, + "reward": 1.7464286461472511, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 13822 + }, + { + "completion_length": 222.6517972946167, + "epoch": 2.3178674713944423, + "grad_norm": 0.18778130385122507, + "kl": 0.2147216796875, + "learning_rate": 4.931580641739513e-07, + "loss": 0.0002, + "reward": 1.8214286342263222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285932481289, + "rewards/format_reward_func": 1.0, + "step": 13824 + }, + { + "completion_length": 230.2321548461914, + "epoch": 2.31820277463431, + "grad_norm": 0.09521929881811564, + "kl": 0.161590576171875, + "learning_rate": 4.931552766807718e-07, + "loss": 0.0002, + "reward": 1.7892857417464256, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 13826 + }, + { + "completion_length": 234.70536708831787, + "epoch": 2.3185380778741775, + "grad_norm": 0.31405589164753217, + "kl": 0.1912841796875, + "learning_rate": 4.931524886277587e-07, + "loss": 0.0002, + "reward": 1.7267858237028122, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7312500197440386, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13828 + }, + { + "completion_length": 242.9107255935669, + "epoch": 2.318873381114045, + "grad_norm": 0.14866603002861797, + "kl": 0.142608642578125, + "learning_rate": 4.931497000149185e-07, + "loss": 0.0001, + "reward": 1.7401786372065544, + "reward_std": 0.03409264795482159, + "rewards/equation_reward_func": 0.7419643048197031, + "rewards/format_reward_func": 0.9982142895460129, + "step": 13830 + }, + { + "completion_length": 234.8973331451416, + "epoch": 2.319208684353913, + "grad_norm": 0.25162094245415106, + "kl": 0.150634765625, + "learning_rate": 4.931469108422576e-07, + "loss": 0.0002, + "reward": 1.7482143566012383, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13832 + }, + { + "completion_length": 246.16519355773926, + "epoch": 2.31954398759378, + "grad_norm": 0.11166127332516824, + "kl": 0.158416748046875, + "learning_rate": 4.931441211097822e-07, + "loss": 0.0002, + "reward": 1.7937500327825546, + "reward_std": 0.029041884932667017, + "rewards/equation_reward_func": 0.7955357301980257, + "rewards/format_reward_func": 0.9982142895460129, + "step": 13834 + }, + { + "completion_length": 246.6875114440918, + "epoch": 2.3198792908336476, + "grad_norm": 0.21306896478258996, + "kl": 0.1680908203125, + "learning_rate": 4.93141330817499e-07, + "loss": 0.0002, + "reward": 1.725000075995922, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000312924385, + "rewards/format_reward_func": 1.0, + "step": 13836 + }, + { + "completion_length": 245.06251335144043, + "epoch": 2.3202145940735153, + "grad_norm": 0.12055812100393454, + "kl": 0.15869140625, + "learning_rate": 4.931385399654143e-07, + "loss": 0.0002, + "reward": 1.751785784959793, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13838 + }, + { + "completion_length": 242.83929634094238, + "epoch": 2.320549897313383, + "grad_norm": 0.17749519435879707, + "kl": 0.128753662109375, + "learning_rate": 4.931357485535345e-07, + "loss": 0.0001, + "reward": 1.803571455180645, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714589059353, + "rewards/format_reward_func": 1.0, + "step": 13840 + }, + { + "completion_length": 247.52233409881592, + "epoch": 2.3208852005532505, + "grad_norm": 0.05211308765186071, + "kl": 0.16229248046875, + "learning_rate": 4.93132956581866e-07, + "loss": 0.0002, + "reward": 1.775000050663948, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 13842 + }, + { + "completion_length": 242.63840198516846, + "epoch": 2.3212205037931177, + "grad_norm": 0.22663570203284325, + "kl": 0.146820068359375, + "learning_rate": 4.931301640504154e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 13844 + }, + { + "completion_length": 253.0312614440918, + "epoch": 2.3215558070329854, + "grad_norm": 0.16115678842341616, + "kl": 0.141632080078125, + "learning_rate": 4.93127370959189e-07, + "loss": 0.0001, + "reward": 1.7714286595582962, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285925030708, + "rewards/format_reward_func": 1.0, + "step": 13846 + }, + { + "completion_length": 257.8259000778198, + "epoch": 2.321891110272853, + "grad_norm": 0.23600484644051928, + "kl": 0.141021728515625, + "learning_rate": 4.931245773081931e-07, + "loss": 0.0001, + "reward": 1.7285715192556381, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.728571455925703, + "rewards/format_reward_func": 1.0, + "step": 13848 + }, + { + "completion_length": 240.5000123977661, + "epoch": 2.3222264135127206, + "grad_norm": 0.14853695490593902, + "kl": 0.16864013671875, + "learning_rate": 4.931217830974344e-07, + "loss": 0.0002, + "reward": 1.7785714715719223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 13850 + }, + { + "completion_length": 253.27679634094238, + "epoch": 2.3225617167525883, + "grad_norm": 0.442639076297675, + "kl": 0.14019775390625, + "learning_rate": 4.931189883269193e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7607143074274063, + "rewards/format_reward_func": 0.9821428656578064, + "step": 13852 + }, + { + "completion_length": 256.1741199493408, + "epoch": 2.322897019992456, + "grad_norm": 0.2172474753408119, + "kl": 0.162078857421875, + "learning_rate": 4.931161929966541e-07, + "loss": 0.0002, + "reward": 1.821428619325161, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.821428595110774, + "rewards/format_reward_func": 1.0, + "step": 13854 + }, + { + "completion_length": 272.4464416503906, + "epoch": 2.323232323232323, + "grad_norm": 0.25720741051210894, + "kl": 0.141754150390625, + "learning_rate": 4.931133971066451e-07, + "loss": 0.0001, + "reward": 1.7410714849829674, + "reward_std": 0.09343910869210958, + "rewards/equation_reward_func": 0.7544643096625805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13856 + }, + { + "completion_length": 252.602689743042, + "epoch": 2.3235676264721907, + "grad_norm": 0.19960302259651616, + "kl": 0.136322021484375, + "learning_rate": 4.931106006568991e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928939700127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13858 + }, + { + "completion_length": 257.57143783569336, + "epoch": 2.3239029297120584, + "grad_norm": 0.22315189200039845, + "kl": 0.1318359375, + "learning_rate": 4.931078036474224e-07, + "loss": 0.0001, + "reward": 1.7982143461704254, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13860 + }, + { + "completion_length": 268.7410840988159, + "epoch": 2.324238232951926, + "grad_norm": 0.6520892118623528, + "kl": 0.4154052734375, + "learning_rate": 4.931050060782212e-07, + "loss": 0.0004, + "reward": 1.7732143476605415, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13862 + }, + { + "completion_length": 263.2634038925171, + "epoch": 2.3245735361917936, + "grad_norm": 0.29998155060416803, + "kl": 0.1378173828125, + "learning_rate": 4.931022079493024e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714522004128, + "rewards/format_reward_func": 1.0, + "step": 13864 + }, + { + "completion_length": 277.0759038925171, + "epoch": 2.324908839431661, + "grad_norm": 0.09117265999784231, + "kl": 0.144500732421875, + "learning_rate": 4.930994092606719e-07, + "loss": 0.0001, + "reward": 1.7946429029107094, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7991071604192257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13866 + }, + { + "completion_length": 279.62947368621826, + "epoch": 2.3252441426715285, + "grad_norm": 0.17952208593591043, + "kl": 0.141357421875, + "learning_rate": 4.930966100123366e-07, + "loss": 0.0001, + "reward": 1.8535714820027351, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8535714484751225, + "rewards/format_reward_func": 1.0, + "step": 13868 + }, + { + "completion_length": 270.15179538726807, + "epoch": 2.325579445911396, + "grad_norm": 0.1446632522156985, + "kl": 0.142425537109375, + "learning_rate": 4.930938102043027e-07, + "loss": 0.0001, + "reward": 1.8446428999304771, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8491071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13870 + }, + { + "completion_length": 287.2901906967163, + "epoch": 2.3259147491512637, + "grad_norm": 0.09508477728521918, + "kl": 0.1434326171875, + "learning_rate": 4.930910098365768e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7767857275903225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13872 + }, + { + "completion_length": 287.87947845458984, + "epoch": 2.3262500523911314, + "grad_norm": 0.15208370071022806, + "kl": 0.157379150390625, + "learning_rate": 4.930882089091652e-07, + "loss": 0.0002, + "reward": 1.796428643167019, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.8053571730852127, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13874 + }, + { + "completion_length": 267.7678689956665, + "epoch": 2.326585355630999, + "grad_norm": 0.18081008050190195, + "kl": 0.143707275390625, + "learning_rate": 4.930854074220744e-07, + "loss": 0.0001, + "reward": 1.8000000342726707, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 13876 + }, + { + "completion_length": 276.84376335144043, + "epoch": 2.326920658870866, + "grad_norm": 0.048354361422872846, + "kl": 0.125518798828125, + "learning_rate": 4.930826053753109e-07, + "loss": 0.0001, + "reward": 1.7732143253087997, + "reward_std": 0.007576144300401211, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13878 + }, + { + "completion_length": 278.4732303619385, + "epoch": 2.327255962110734, + "grad_norm": 0.1011312406782036, + "kl": 0.16424560546875, + "learning_rate": 4.930798027688811e-07, + "loss": 0.0002, + "reward": 1.7928571924567223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 13880 + }, + { + "completion_length": 274.27233123779297, + "epoch": 2.3275912653506015, + "grad_norm": 0.11005274619203892, + "kl": 0.13623046875, + "learning_rate": 4.930769996027915e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.777678593993187, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13882 + }, + { + "completion_length": 269.85268783569336, + "epoch": 2.327926568590469, + "grad_norm": 0.08785502020528425, + "kl": 0.13714599609375, + "learning_rate": 4.930741958770485e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7741071842610836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13884 + }, + { + "completion_length": 273.22322940826416, + "epoch": 2.3282618718303367, + "grad_norm": 0.04808767039662426, + "kl": 0.149322509765625, + "learning_rate": 4.930713915916585e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13886 + }, + { + "completion_length": 267.8437614440918, + "epoch": 2.328597175070204, + "grad_norm": 0.0033872799839121735, + "kl": 0.1512451171875, + "learning_rate": 4.930685867466281e-07, + "loss": 0.0002, + "reward": 1.8214285969734192, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 13888 + }, + { + "completion_length": 277.64287185668945, + "epoch": 2.3289324783100716, + "grad_norm": 0.17916554169069204, + "kl": 0.153594970703125, + "learning_rate": 4.930657813419638e-07, + "loss": 0.0002, + "reward": 1.7303571850061417, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7437500283122063, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13890 + }, + { + "completion_length": 281.90626525878906, + "epoch": 2.329267781549939, + "grad_norm": 0.3585679228241435, + "kl": 0.16705322265625, + "learning_rate": 4.930629753776717e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7553571686148643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13892 + }, + { + "completion_length": 265.11162185668945, + "epoch": 2.329603084789807, + "grad_norm": 0.18914141170325072, + "kl": 0.13250732421875, + "learning_rate": 4.930601688537586e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.753571443259716, + "rewards/format_reward_func": 1.0, + "step": 13894 + }, + { + "completion_length": 262.7500104904175, + "epoch": 2.3299383880296745, + "grad_norm": 0.03789824323268437, + "kl": 0.121246337890625, + "learning_rate": 4.930573617702311e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13896 + }, + { + "completion_length": 262.20537185668945, + "epoch": 2.330273691269542, + "grad_norm": 0.2800962462007646, + "kl": 0.1329345703125, + "learning_rate": 4.930545541270952e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571753203869, + "rewards/format_reward_func": 1.0, + "step": 13898 + }, + { + "completion_length": 267.5892963409424, + "epoch": 2.3306089945094093, + "grad_norm": 0.31285336060043917, + "kl": 0.133331298828125, + "learning_rate": 4.930517459243576e-07, + "loss": 0.0001, + "reward": 1.7803572043776512, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214544355869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13900 + }, + { + "completion_length": 262.29465770721436, + "epoch": 2.330944297749277, + "grad_norm": 0.2676019483729929, + "kl": 0.142913818359375, + "learning_rate": 4.930489371620247e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7741071656346321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13902 + }, + { + "completion_length": 249.6071538925171, + "epoch": 2.3312796009891446, + "grad_norm": 0.2603757129759202, + "kl": 0.135467529296875, + "learning_rate": 4.930461278401033e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 13904 + }, + { + "completion_length": 255.84822463989258, + "epoch": 2.331614904229012, + "grad_norm": 0.17282725467474225, + "kl": 0.143646240234375, + "learning_rate": 4.930433179585995e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 13906 + }, + { + "completion_length": 260.2187614440918, + "epoch": 2.33195020746888, + "grad_norm": 0.08766943170738209, + "kl": 0.1429443359375, + "learning_rate": 4.930405075175197e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 13908 + }, + { + "completion_length": 256.4375114440918, + "epoch": 2.332285510708747, + "grad_norm": 0.20621206310280166, + "kl": 0.1400146484375, + "learning_rate": 4.930376965168707e-07, + "loss": 0.0001, + "reward": 1.8214286267757416, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8214285969734192, + "rewards/format_reward_func": 1.0, + "step": 13910 + }, + { + "completion_length": 263.1875114440918, + "epoch": 2.3326208139486146, + "grad_norm": 0.20267854080407852, + "kl": 0.13336181640625, + "learning_rate": 4.930348849566587e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000350177288, + "rewards/format_reward_func": 1.0, + "step": 13912 + }, + { + "completion_length": 247.66072750091553, + "epoch": 2.3329561171884823, + "grad_norm": 0.149466506521507, + "kl": 0.114837646484375, + "learning_rate": 4.930320728368904e-07, + "loss": 0.0001, + "reward": 1.730357214808464, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7348214611411095, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13914 + }, + { + "completion_length": 261.1339406967163, + "epoch": 2.33329142042835, + "grad_norm": 0.21434438156759789, + "kl": 0.137481689453125, + "learning_rate": 4.930292601575721e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714328289032, + "rewards/format_reward_func": 1.0, + "step": 13916 + }, + { + "completion_length": 250.38393783569336, + "epoch": 2.3336267236682176, + "grad_norm": 0.16715115682608317, + "kl": 0.128936767578125, + "learning_rate": 4.930264469187103e-07, + "loss": 0.0001, + "reward": 1.764285758137703, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7642857432365417, + "rewards/format_reward_func": 1.0, + "step": 13918 + }, + { + "completion_length": 255.82144165039062, + "epoch": 2.333962026908085, + "grad_norm": 0.12919460625180537, + "kl": 0.16015625, + "learning_rate": 4.930236331203116e-07, + "loss": 0.0002, + "reward": 1.789285808801651, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857380211353, + "rewards/format_reward_func": 1.0, + "step": 13920 + }, + { + "completion_length": 254.96429920196533, + "epoch": 2.3342973301479524, + "grad_norm": 0.19431944763476283, + "kl": 0.1590576171875, + "learning_rate": 4.930208187623823e-07, + "loss": 0.0002, + "reward": 1.7892857640981674, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 13922 + }, + { + "completion_length": 252.14733409881592, + "epoch": 2.33463263338782, + "grad_norm": 0.5509848451277506, + "kl": 0.17974853515625, + "learning_rate": 4.93018003844929e-07, + "loss": 0.0002, + "reward": 1.758928619325161, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13924 + }, + { + "completion_length": 248.67858123779297, + "epoch": 2.3349679366276876, + "grad_norm": 0.06223604978287845, + "kl": 0.12445068359375, + "learning_rate": 4.930151883679582e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 13926 + }, + { + "completion_length": 251.88840293884277, + "epoch": 2.3353032398675553, + "grad_norm": 0.2945459558718312, + "kl": 0.14276123046875, + "learning_rate": 4.930123723314763e-07, + "loss": 0.0001, + "reward": 1.7696429193019867, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7830357365310192, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13928 + }, + { + "completion_length": 246.0223331451416, + "epoch": 2.335638543107423, + "grad_norm": 0.20297409443292536, + "kl": 0.1510009765625, + "learning_rate": 4.930095557354897e-07, + "loss": 0.0002, + "reward": 1.7821429446339607, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428682655096, + "rewards/format_reward_func": 1.0, + "step": 13930 + }, + { + "completion_length": 252.99108505249023, + "epoch": 2.33597384634729, + "grad_norm": 0.17278858736936387, + "kl": 0.183074951171875, + "learning_rate": 4.930067385800051e-07, + "loss": 0.0002, + "reward": 1.785714365541935, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 13932 + }, + { + "completion_length": 261.7634048461914, + "epoch": 2.3363091495871577, + "grad_norm": 0.3142320730365933, + "kl": 0.22943115234375, + "learning_rate": 4.93003920865029e-07, + "loss": 0.0002, + "reward": 1.6785715073347092, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.6785714589059353, + "rewards/format_reward_func": 1.0, + "step": 13934 + }, + { + "completion_length": 263.3169755935669, + "epoch": 2.3366444528270254, + "grad_norm": 0.16214082641153868, + "kl": 0.15496826171875, + "learning_rate": 4.930011025905677e-07, + "loss": 0.0002, + "reward": 1.7714286223053932, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7803571727126837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13936 + }, + { + "completion_length": 264.99554920196533, + "epoch": 2.336979756066893, + "grad_norm": 0.29927774244425503, + "kl": 0.158111572265625, + "learning_rate": 4.929982837566277e-07, + "loss": 0.0002, + "reward": 1.7517857626080513, + "reward_std": 0.08838834520429373, + "rewards/equation_reward_func": 0.7651786096394062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13938 + }, + { + "completion_length": 270.9241199493408, + "epoch": 2.3373150593067606, + "grad_norm": 0.13127494720037441, + "kl": 0.209259033203125, + "learning_rate": 4.929954643632156e-07, + "loss": 0.0002, + "reward": 1.7696429342031479, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071805357933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13940 + }, + { + "completion_length": 260.0669765472412, + "epoch": 2.3376503625466283, + "grad_norm": 0.1928191917215741, + "kl": 0.188751220703125, + "learning_rate": 4.929926444103378e-07, + "loss": 0.0002, + "reward": 1.7767857685685158, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13942 + }, + { + "completion_length": 259.8750123977661, + "epoch": 2.3379856657864955, + "grad_norm": 0.07497032977190916, + "kl": 0.191162109375, + "learning_rate": 4.92989823898001e-07, + "loss": 0.0002, + "reward": 1.6946429312229156, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.699107188731432, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13944 + }, + { + "completion_length": 255.8348331451416, + "epoch": 2.338320969026363, + "grad_norm": 0.1732593334616933, + "kl": 0.129974365234375, + "learning_rate": 4.929870028262113e-07, + "loss": 0.0001, + "reward": 1.7910714596509933, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13946 + }, + { + "completion_length": 258.7187614440918, + "epoch": 2.3386562722662307, + "grad_norm": 0.17122023310010204, + "kl": 33.64996337890625, + "learning_rate": 4.929841811949755e-07, + "loss": 0.0336, + "reward": 1.7410714849829674, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7455357424914837, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13948 + }, + { + "completion_length": 257.1428689956665, + "epoch": 2.3389915755060984, + "grad_norm": 0.15583217546042194, + "kl": 0.14776611328125, + "learning_rate": 4.929813590043001e-07, + "loss": 0.0001, + "reward": 1.6910715028643608, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.695535758510232, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13950 + }, + { + "completion_length": 248.93750858306885, + "epoch": 2.339326878745966, + "grad_norm": 0.20410850654255108, + "kl": 0.15264892578125, + "learning_rate": 4.929785362541915e-07, + "loss": 0.0002, + "reward": 1.7214286401867867, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7303571775555611, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13952 + }, + { + "completion_length": 246.821439743042, + "epoch": 2.339662181985833, + "grad_norm": 0.19591857120170567, + "kl": 0.222259521484375, + "learning_rate": 4.929757129446562e-07, + "loss": 0.0002, + "reward": 1.739285796880722, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7482143118977547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13954 + }, + { + "completion_length": 242.6428689956665, + "epoch": 2.339997485225701, + "grad_norm": 0.1094692940084371, + "kl": 0.172698974609375, + "learning_rate": 4.929728890757007e-07, + "loss": 0.0002, + "reward": 1.8017857670783997, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.806250024586916, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13956 + }, + { + "completion_length": 267.64732933044434, + "epoch": 2.3403327884655685, + "grad_norm": 0.2566541790875162, + "kl": 0.1883544921875, + "learning_rate": 4.929700646473315e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 13958 + }, + { + "completion_length": 254.52233123779297, + "epoch": 2.340668091705436, + "grad_norm": 0.17341243088634806, + "kl": 0.224151611328125, + "learning_rate": 4.929672396595553e-07, + "loss": 0.0002, + "reward": 1.7625000551342964, + "reward_std": 0.07323605939745903, + "rewards/equation_reward_func": 0.7848214488476515, + "rewards/format_reward_func": 0.977678582072258, + "step": 13960 + }, + { + "completion_length": 253.27233028411865, + "epoch": 2.3410033949453037, + "grad_norm": 0.22604397912429652, + "kl": 0.131744384765625, + "learning_rate": 4.929644141123783e-07, + "loss": 0.0001, + "reward": 1.7732143327593803, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13962 + }, + { + "completion_length": 251.7678680419922, + "epoch": 2.3413386981851714, + "grad_norm": 0.6640328142423192, + "kl": 0.463836669921875, + "learning_rate": 4.92961588005807e-07, + "loss": 0.0005, + "reward": 1.7267857789993286, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.740178607404232, + "rewards/format_reward_func": 0.9866071492433548, + "step": 13964 + }, + { + "completion_length": 248.24108600616455, + "epoch": 2.341674001425039, + "grad_norm": 0.10695830413721986, + "kl": 0.135406494140625, + "learning_rate": 4.929587613398481e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 13966 + }, + { + "completion_length": 243.7053689956665, + "epoch": 2.342009304664906, + "grad_norm": 0.24921575355770892, + "kl": 0.1546630859375, + "learning_rate": 4.929559341145082e-07, + "loss": 0.0002, + "reward": 1.7767857685685158, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500409781933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13968 + }, + { + "completion_length": 245.3750114440918, + "epoch": 2.342344607904774, + "grad_norm": 0.1565350284699649, + "kl": 0.681243896484375, + "learning_rate": 4.929531063297936e-07, + "loss": 0.0007, + "reward": 1.800000049173832, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 13970 + }, + { + "completion_length": 242.3437614440918, + "epoch": 2.3426799111446415, + "grad_norm": 0.15593943795376905, + "kl": 0.13665771484375, + "learning_rate": 4.929502779857108e-07, + "loss": 0.0001, + "reward": 1.7160715088248253, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7205357365310192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13972 + }, + { + "completion_length": 246.63840293884277, + "epoch": 2.343015214384509, + "grad_norm": 0.13590393267799425, + "kl": 0.175994873046875, + "learning_rate": 4.929474490822665e-07, + "loss": 0.0002, + "reward": 1.825000062584877, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 13974 + }, + { + "completion_length": 234.18751049041748, + "epoch": 2.3433505176243767, + "grad_norm": 0.4499483960615894, + "kl": 0.209136962890625, + "learning_rate": 4.92944619619467e-07, + "loss": 0.0002, + "reward": 1.7910714894533157, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13976 + }, + { + "completion_length": 241.38394165039062, + "epoch": 2.343685820864244, + "grad_norm": 0.25160773088122984, + "kl": 0.16693115234375, + "learning_rate": 4.929417895973189e-07, + "loss": 0.0002, + "reward": 1.7678572237491608, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 13978 + }, + { + "completion_length": 242.8392972946167, + "epoch": 2.3440211241041116, + "grad_norm": 0.19259065367274678, + "kl": 0.203277587890625, + "learning_rate": 4.929389590158287e-07, + "loss": 0.0002, + "reward": 1.719642922282219, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.724107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13980 + }, + { + "completion_length": 234.7187614440918, + "epoch": 2.344356427343979, + "grad_norm": 0.16228758744483066, + "kl": 0.11175537109375, + "learning_rate": 4.929361278750031e-07, + "loss": 0.0001, + "reward": 1.8107143193483353, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143230736256, + "rewards/format_reward_func": 1.0, + "step": 13982 + }, + { + "completion_length": 237.6205472946167, + "epoch": 2.344691730583847, + "grad_norm": 0.1776062458911018, + "kl": 0.115142822265625, + "learning_rate": 4.929332961748482e-07, + "loss": 0.0001, + "reward": 1.7910714969038963, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13984 + }, + { + "completion_length": 249.27679538726807, + "epoch": 2.3450270338237145, + "grad_norm": 0.26582332087857086, + "kl": 0.185760498046875, + "learning_rate": 4.92930463915371e-07, + "loss": 0.0002, + "reward": 1.7571429461240768, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7660714536905289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13986 + }, + { + "completion_length": 237.51786708831787, + "epoch": 2.345362337063582, + "grad_norm": 0.26260524688968806, + "kl": 0.1392822265625, + "learning_rate": 4.929276310965778e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.06060915160924196, + "rewards/equation_reward_func": 0.7946428805589676, + "rewards/format_reward_func": 0.9910714328289032, + "step": 13988 + }, + { + "completion_length": 242.7634048461914, + "epoch": 2.3456976403034493, + "grad_norm": 0.5846938540785703, + "kl": 0.22698974609375, + "learning_rate": 4.92924797718475e-07, + "loss": 0.0002, + "reward": 1.8017857745289803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13990 + }, + { + "completion_length": 247.4241180419922, + "epoch": 2.346032943543317, + "grad_norm": 0.232436541671342, + "kl": 0.162109375, + "learning_rate": 4.929219637810693e-07, + "loss": 0.0002, + "reward": 1.848214328289032, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8526785969734192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13992 + }, + { + "completion_length": 249.7321548461914, + "epoch": 2.3463682467831846, + "grad_norm": 0.29302366800324925, + "kl": 0.1395263671875, + "learning_rate": 4.929191292843672e-07, + "loss": 0.0001, + "reward": 1.7303572297096252, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7348214462399483, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13994 + }, + { + "completion_length": 246.446439743042, + "epoch": 2.346703550023052, + "grad_norm": 0.18002907127930834, + "kl": 0.11785888671875, + "learning_rate": 4.929162942283751e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7883928865194321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13996 + }, + { + "completion_length": 248.5357255935669, + "epoch": 2.34703885326292, + "grad_norm": 0.30579652194045426, + "kl": 0.1209716796875, + "learning_rate": 4.929134586130998e-07, + "loss": 0.0001, + "reward": 1.758928656578064, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7633928991854191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 13998 + }, + { + "completion_length": 249.0446538925171, + "epoch": 2.347374156502787, + "grad_norm": 0.2357568781576199, + "kl": 0.103302001953125, + "learning_rate": 4.929106224385475e-07, + "loss": 0.0001, + "reward": 1.7964286729693413, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 14000 + }, + { + "completion_length": 258.7678699493408, + "epoch": 2.3477094597426547, + "grad_norm": 0.13462703461072748, + "kl": 0.127471923828125, + "learning_rate": 4.929077857047249e-07, + "loss": 0.0001, + "reward": 1.7250000834465027, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7339286115020514, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14002 + }, + { + "completion_length": 253.50447368621826, + "epoch": 2.3480447629825223, + "grad_norm": 0.25273754623362055, + "kl": 0.14434814453125, + "learning_rate": 4.929049484116386e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.7696428708732128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14004 + }, + { + "completion_length": 258.54018783569336, + "epoch": 2.34838006622239, + "grad_norm": 0.19787426664635152, + "kl": 0.123138427734375, + "learning_rate": 4.929021105592949e-07, + "loss": 0.0001, + "reward": 1.8232143074274063, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.827678594738245, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14006 + }, + { + "completion_length": 271.6116199493408, + "epoch": 2.3487153694622576, + "grad_norm": 0.14992894677494317, + "kl": 0.147003173828125, + "learning_rate": 4.928992721477005e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7803571708500385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14008 + }, + { + "completion_length": 263.29911708831787, + "epoch": 2.349050672702125, + "grad_norm": 0.0617426976233598, + "kl": 0.131195068359375, + "learning_rate": 4.928964331768621e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7571428772062063, + "rewards/format_reward_func": 1.0, + "step": 14010 + }, + { + "completion_length": 253.83929920196533, + "epoch": 2.3493859759419924, + "grad_norm": 0.07002114062756537, + "kl": 0.154541015625, + "learning_rate": 4.928935936467859e-07, + "loss": 0.0002, + "reward": 1.748214341700077, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7526786122471094, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14012 + }, + { + "completion_length": 250.7991180419922, + "epoch": 2.34972127918186, + "grad_norm": 0.25174814949002133, + "kl": 0.28607177734375, + "learning_rate": 4.928907535574786e-07, + "loss": 0.0003, + "reward": 1.7625000551342964, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14014 + }, + { + "completion_length": 253.5625123977661, + "epoch": 2.3500565824217277, + "grad_norm": 0.17426929995654813, + "kl": 0.119384765625, + "learning_rate": 4.928879129089468e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.03788072057068348, + "rewards/equation_reward_func": 0.7776786163449287, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14016 + }, + { + "completion_length": 262.9821538925171, + "epoch": 2.3503918856615953, + "grad_norm": 0.1572711767458029, + "kl": 0.310943603515625, + "learning_rate": 4.928850717011969e-07, + "loss": 0.0003, + "reward": 1.6875000819563866, + "reward_std": 0.07828682288527489, + "rewards/equation_reward_func": 0.7008928880095482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14018 + }, + { + "completion_length": 249.61608219146729, + "epoch": 2.350727188901463, + "grad_norm": 0.1388379776191087, + "kl": 0.14971923828125, + "learning_rate": 4.928822299342355e-07, + "loss": 0.0001, + "reward": 1.7910714969038963, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14020 + }, + { + "completion_length": 249.49554634094238, + "epoch": 2.35106249214133, + "grad_norm": 0.17107019366412024, + "kl": 0.875244140625, + "learning_rate": 4.928793876080692e-07, + "loss": 0.0009, + "reward": 1.7482143491506577, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7526785880327225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14022 + }, + { + "completion_length": 242.68304824829102, + "epoch": 2.3513977953811978, + "grad_norm": 0.1720167275916922, + "kl": 0.295806884765625, + "learning_rate": 4.928765447227044e-07, + "loss": 0.0003, + "reward": 1.753571480512619, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7625000327825546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14024 + }, + { + "completion_length": 244.09376430511475, + "epoch": 2.3517330986210654, + "grad_norm": 0.133536958377386, + "kl": 0.123748779296875, + "learning_rate": 4.928737012781479e-07, + "loss": 0.0001, + "reward": 1.737500049173832, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7419643308967352, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14026 + }, + { + "completion_length": 246.2053680419922, + "epoch": 2.352068401860933, + "grad_norm": 0.26511754390693604, + "kl": 0.18597412109375, + "learning_rate": 4.92870857274406e-07, + "loss": 0.0002, + "reward": 1.7446428909897804, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071857511997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14028 + }, + { + "completion_length": 239.23661708831787, + "epoch": 2.3524037051008007, + "grad_norm": 0.13234529295434244, + "kl": 0.11456298828125, + "learning_rate": 4.928680127114854e-07, + "loss": 0.0001, + "reward": 1.8250000178813934, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000197440386, + "rewards/format_reward_func": 1.0, + "step": 14030 + }, + { + "completion_length": 241.6696548461914, + "epoch": 2.3527390083406683, + "grad_norm": 0.1214244184569556, + "kl": 0.145416259765625, + "learning_rate": 4.928651675893925e-07, + "loss": 0.0001, + "reward": 1.787500038743019, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14032 + }, + { + "completion_length": 240.49108409881592, + "epoch": 2.3530743115805355, + "grad_norm": 0.434751465835638, + "kl": 1.688201904296875, + "learning_rate": 4.92862321908134e-07, + "loss": 0.0017, + "reward": 1.773214340209961, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776785865426064, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14034 + }, + { + "completion_length": 250.0357265472412, + "epoch": 2.353409614820403, + "grad_norm": 0.15028585969842692, + "kl": 0.15380859375, + "learning_rate": 4.928594756677163e-07, + "loss": 0.0002, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 14036 + }, + { + "completion_length": 231.30358219146729, + "epoch": 2.3537449180602708, + "grad_norm": 0.2420700954221213, + "kl": 0.0980072021484375, + "learning_rate": 4.928566288681461e-07, + "loss": 0.0001, + "reward": 1.8075893595814705, + "reward_std": 0.0498762815259397, + "rewards/equation_reward_func": 0.8133928757160902, + "rewards/format_reward_func": 0.9941964335739613, + "step": 14038 + }, + { + "completion_length": 237.0892972946167, + "epoch": 2.3540802213001384, + "grad_norm": 0.14530807503245907, + "kl": 0.228179931640625, + "learning_rate": 4.928537815094299e-07, + "loss": 0.0002, + "reward": 1.8250000402331352, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 14040 + }, + { + "completion_length": 244.4509048461914, + "epoch": 2.354415524540006, + "grad_norm": 0.11180813148997293, + "kl": 1.75067138671875, + "learning_rate": 4.928509335915742e-07, + "loss": 0.0017, + "reward": 1.7803571820259094, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7848214525729418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14042 + }, + { + "completion_length": 241.87947845458984, + "epoch": 2.354750827779873, + "grad_norm": 0.2982834793049277, + "kl": 0.1754150390625, + "learning_rate": 4.928480851145857e-07, + "loss": 0.0002, + "reward": 1.8321428894996643, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8321428783237934, + "rewards/format_reward_func": 1.0, + "step": 14044 + }, + { + "completion_length": 242.92411708831787, + "epoch": 2.355086131019741, + "grad_norm": 0.08737988500493561, + "kl": 0.383636474609375, + "learning_rate": 4.928452360784709e-07, + "loss": 0.0004, + "reward": 1.751785770058632, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500294297934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14046 + }, + { + "completion_length": 240.31697463989258, + "epoch": 2.3554214342596085, + "grad_norm": 0.20044606430797915, + "kl": 0.116455078125, + "learning_rate": 4.928423864832362e-07, + "loss": 0.0001, + "reward": 1.7428571805357933, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7517857290804386, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14048 + }, + { + "completion_length": 235.4866189956665, + "epoch": 2.355756737499476, + "grad_norm": 0.2400343553634341, + "kl": 0.223541259765625, + "learning_rate": 4.928395363288883e-07, + "loss": 0.0002, + "reward": 1.8500000685453415, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.8589285761117935, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14050 + }, + { + "completion_length": 224.12947463989258, + "epoch": 2.3560920407393438, + "grad_norm": 0.11322100214358818, + "kl": 0.1439056396484375, + "learning_rate": 4.928366856154338e-07, + "loss": 0.0001, + "reward": 1.7714286223053932, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7714286111295223, + "rewards/format_reward_func": 1.0, + "step": 14052 + }, + { + "completion_length": 241.1250114440918, + "epoch": 2.3564273439792114, + "grad_norm": 0.2621128270019946, + "kl": 0.87982177734375, + "learning_rate": 4.928338343428791e-07, + "loss": 0.0009, + "reward": 1.7732143253087997, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7866071797907352, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14054 + }, + { + "completion_length": 237.6250114440918, + "epoch": 2.3567626472190786, + "grad_norm": 0.30913754789060155, + "kl": 0.264068603515625, + "learning_rate": 4.928309825112311e-07, + "loss": 0.0003, + "reward": 1.76071435213089, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143186032772, + "rewards/format_reward_func": 1.0, + "step": 14056 + }, + { + "completion_length": 232.92858219146729, + "epoch": 2.357097950458946, + "grad_norm": 0.00804325255606672, + "kl": 0.174468994140625, + "learning_rate": 4.92828130120496e-07, + "loss": 0.0002, + "reward": 1.8035714775323868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 14058 + }, + { + "completion_length": 236.0357255935669, + "epoch": 2.357433253698814, + "grad_norm": 0.10639365932382884, + "kl": 0.263580322265625, + "learning_rate": 4.928252771706805e-07, + "loss": 0.0003, + "reward": 1.8250000476837158, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8250000178813934, + "rewards/format_reward_func": 1.0, + "step": 14060 + }, + { + "completion_length": 228.81250858306885, + "epoch": 2.3577685569386815, + "grad_norm": 0.22752689175076363, + "kl": 0.15313720703125, + "learning_rate": 4.928224236617912e-07, + "loss": 0.0002, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571775555611, + "rewards/format_reward_func": 1.0, + "step": 14062 + }, + { + "completion_length": 244.99554634094238, + "epoch": 2.358103860178549, + "grad_norm": 0.13870489497186492, + "kl": 0.566314697265625, + "learning_rate": 4.928195695938346e-07, + "loss": 0.0006, + "reward": 1.7535714879631996, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7625000327825546, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14064 + }, + { + "completion_length": 239.48215103149414, + "epoch": 2.3584391634184163, + "grad_norm": 0.28570805232389446, + "kl": 0.2423095703125, + "learning_rate": 4.928167149668174e-07, + "loss": 0.0002, + "reward": 1.8071429133415222, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 14066 + }, + { + "completion_length": 234.80804920196533, + "epoch": 2.358774466658284, + "grad_norm": 0.08342651959363287, + "kl": 0.23736572265625, + "learning_rate": 4.928138597807461e-07, + "loss": 0.0002, + "reward": 1.7357143387198448, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7437500357627869, + "rewards/format_reward_func": 0.9919642880558968, + "step": 14068 + }, + { + "completion_length": 242.87947368621826, + "epoch": 2.3591097698981516, + "grad_norm": 0.14099043974641465, + "kl": 1.082427978515625, + "learning_rate": 4.928110040356272e-07, + "loss": 0.0011, + "reward": 1.7553572058677673, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7598214484751225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14070 + }, + { + "completion_length": 228.77233123779297, + "epoch": 2.359445073138019, + "grad_norm": 0.17428381709094845, + "kl": 0.250213623046875, + "learning_rate": 4.928081477314674e-07, + "loss": 0.0003, + "reward": 1.805357202887535, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8098214529454708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14072 + }, + { + "completion_length": 223.40179538726807, + "epoch": 2.359780376377887, + "grad_norm": 0.0054469211535804855, + "kl": 0.17437744140625, + "learning_rate": 4.928052908682731e-07, + "loss": 0.0002, + "reward": 1.783928595483303, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7883929014205933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14074 + }, + { + "completion_length": 228.59822273254395, + "epoch": 2.3601156796177545, + "grad_norm": 0.1825644958873576, + "kl": 0.489898681640625, + "learning_rate": 4.928024334460513e-07, + "loss": 0.0005, + "reward": 1.7553572058677673, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7598214671015739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14076 + }, + { + "completion_length": 227.62947273254395, + "epoch": 2.3604509828576217, + "grad_norm": 0.28957667120115893, + "kl": 0.1666259765625, + "learning_rate": 4.92799575464808e-07, + "loss": 0.0002, + "reward": 1.825000062584877, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 14078 + }, + { + "completion_length": 228.77233123779297, + "epoch": 2.3607862860974893, + "grad_norm": 0.15762237281826855, + "kl": 0.22186279296875, + "learning_rate": 4.9279671692455e-07, + "loss": 0.0002, + "reward": 1.7964286133646965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286003261805, + "rewards/format_reward_func": 1.0, + "step": 14080 + }, + { + "completion_length": 231.45536708831787, + "epoch": 2.361121589337357, + "grad_norm": 0.1459108784338708, + "kl": 0.273040771484375, + "learning_rate": 4.927938578252841e-07, + "loss": 0.0003, + "reward": 1.7892857939004898, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 14082 + }, + { + "completion_length": 233.0803689956665, + "epoch": 2.3614568925772246, + "grad_norm": 0.35305294761911016, + "kl": 0.149017333984375, + "learning_rate": 4.927909981670166e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857413738966, + "rewards/format_reward_func": 1.0, + "step": 14084 + }, + { + "completion_length": 242.602689743042, + "epoch": 2.361792195817092, + "grad_norm": 0.13617110402336785, + "kl": 0.170318603515625, + "learning_rate": 4.927881379497544e-07, + "loss": 0.0002, + "reward": 1.7357143685221672, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 14086 + }, + { + "completion_length": 237.3884038925171, + "epoch": 2.3621274990569594, + "grad_norm": 0.135181646646026, + "kl": 0.1388092041015625, + "learning_rate": 4.927852771735037e-07, + "loss": 0.0001, + "reward": 1.775000050663948, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 14088 + }, + { + "completion_length": 227.25000953674316, + "epoch": 2.362462802296827, + "grad_norm": 0.17809936062049062, + "kl": 0.260589599609375, + "learning_rate": 4.927824158382713e-07, + "loss": 0.0003, + "reward": 1.8053572103381157, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.8098214454948902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14090 + }, + { + "completion_length": 231.6562614440918, + "epoch": 2.3627981055366947, + "grad_norm": 0.2369996115166829, + "kl": 0.130584716796875, + "learning_rate": 4.927795539440638e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 14092 + }, + { + "completion_length": 243.64286613464355, + "epoch": 2.3631334087765623, + "grad_norm": 0.15600298133011206, + "kl": 0.2369384765625, + "learning_rate": 4.927766914908878e-07, + "loss": 0.0002, + "reward": 1.7928571850061417, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 14094 + }, + { + "completion_length": 229.9241189956665, + "epoch": 2.36346871201643, + "grad_norm": 0.2560262655058894, + "kl": 0.256927490234375, + "learning_rate": 4.927738284787497e-07, + "loss": 0.0003, + "reward": 1.7446429207921028, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7491071689873934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14096 + }, + { + "completion_length": 238.24108409881592, + "epoch": 2.3638040152562976, + "grad_norm": 0.13041247601855738, + "kl": 0.255828857421875, + "learning_rate": 4.927709649076564e-07, + "loss": 0.0003, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714514553547, + "rewards/format_reward_func": 1.0, + "step": 14098 + }, + { + "completion_length": 244.14286994934082, + "epoch": 2.364139318496165, + "grad_norm": 0.18896033080219826, + "kl": 0.119232177734375, + "learning_rate": 4.927681007776142e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 14100 + }, + { + "completion_length": 230.97322463989258, + "epoch": 2.3644746217360324, + "grad_norm": 0.2740426705203649, + "kl": 0.307281494140625, + "learning_rate": 4.927652360886299e-07, + "loss": 0.0003, + "reward": 1.7375000566244125, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.74196432903409, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14102 + }, + { + "completion_length": 243.58036994934082, + "epoch": 2.3648099249759, + "grad_norm": 0.28152166534803, + "kl": 0.393951416015625, + "learning_rate": 4.927623708407099e-07, + "loss": 0.0004, + "reward": 1.7339286357164383, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14104 + }, + { + "completion_length": 231.9553689956665, + "epoch": 2.3651452282157677, + "grad_norm": 0.1982005460353846, + "kl": 0.279541015625, + "learning_rate": 4.92759505033861e-07, + "loss": 0.0003, + "reward": 1.750000074505806, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000186264515, + "rewards/format_reward_func": 1.0, + "step": 14106 + }, + { + "completion_length": 236.05358123779297, + "epoch": 2.3654805314556353, + "grad_norm": 0.3090752866343735, + "kl": 0.13604736328125, + "learning_rate": 4.927566386680897e-07, + "loss": 0.0001, + "reward": 1.739285796880722, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143118977547, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14108 + }, + { + "completion_length": 236.45090579986572, + "epoch": 2.365815834695503, + "grad_norm": 0.3107913359321936, + "kl": 0.2520751953125, + "learning_rate": 4.927537717434027e-07, + "loss": 0.0003, + "reward": 1.7785714864730835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714529454708, + "rewards/format_reward_func": 1.0, + "step": 14110 + }, + { + "completion_length": 236.11161994934082, + "epoch": 2.36615113793537, + "grad_norm": 0.29293999141656574, + "kl": 0.412841796875, + "learning_rate": 4.927509042598064e-07, + "loss": 0.0004, + "reward": 1.7250000834465027, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7250000201165676, + "rewards/format_reward_func": 1.0, + "step": 14112 + }, + { + "completion_length": 225.22768783569336, + "epoch": 2.3664864411752378, + "grad_norm": 0.37355280432683863, + "kl": 0.17962646484375, + "learning_rate": 4.927480362173075e-07, + "loss": 0.0002, + "reward": 1.8000000715255737, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 14114 + }, + { + "completion_length": 237.2276906967163, + "epoch": 2.3668217444151054, + "grad_norm": 0.0930373437809911, + "kl": 0.1688232421875, + "learning_rate": 4.927451676159127e-07, + "loss": 0.0002, + "reward": 1.785714328289032, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 14116 + }, + { + "completion_length": 238.4107255935669, + "epoch": 2.367157047654973, + "grad_norm": 0.17907423202000464, + "kl": 0.161224365234375, + "learning_rate": 4.927422984556284e-07, + "loss": 0.0002, + "reward": 1.785714328289032, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7946428991854191, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14118 + }, + { + "completion_length": 230.6384038925171, + "epoch": 2.3674923508948407, + "grad_norm": 0.21644035641179135, + "kl": 0.115264892578125, + "learning_rate": 4.927394287364614e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 14120 + }, + { + "completion_length": 230.62947463989258, + "epoch": 2.3678276541347083, + "grad_norm": 0.1731059367453474, + "kl": 0.129791259765625, + "learning_rate": 4.927365584584184e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.764285746961832, + "rewards/format_reward_func": 1.0, + "step": 14122 + }, + { + "completion_length": 239.2678680419922, + "epoch": 2.3681629573745755, + "grad_norm": 0.1786554966406942, + "kl": 0.126251220703125, + "learning_rate": 4.927336876215055e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7812500149011612, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14124 + }, + { + "completion_length": 235.62500858306885, + "epoch": 2.368498260614443, + "grad_norm": 0.15925308523575013, + "kl": 0.12420654296875, + "learning_rate": 4.927308162257299e-07, + "loss": 0.0001, + "reward": 1.7678571790456772, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 14126 + }, + { + "completion_length": 230.16965293884277, + "epoch": 2.3688335638543108, + "grad_norm": 0.08282613935284787, + "kl": 0.15203857421875, + "learning_rate": 4.927279442710979e-07, + "loss": 0.0002, + "reward": 1.7892857789993286, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857417464256, + "rewards/format_reward_func": 1.0, + "step": 14128 + }, + { + "completion_length": 235.19197368621826, + "epoch": 2.3691688670941784, + "grad_norm": 0.2480821388756342, + "kl": 0.123291015625, + "learning_rate": 4.927250717576161e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 14130 + }, + { + "completion_length": 226.73661708831787, + "epoch": 2.369504170334046, + "grad_norm": 0.17368345542346247, + "kl": 0.1526947021484375, + "learning_rate": 4.927221986852913e-07, + "loss": 0.0002, + "reward": 1.7357143387198448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143238186836, + "rewards/format_reward_func": 1.0, + "step": 14132 + }, + { + "completion_length": 239.01340293884277, + "epoch": 2.3698394735739132, + "grad_norm": 0.20367427730227183, + "kl": 0.244476318359375, + "learning_rate": 4.927193250541299e-07, + "loss": 0.0002, + "reward": 1.7482143491506577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7526785992085934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14134 + }, + { + "completion_length": 237.95536613464355, + "epoch": 2.370174776813781, + "grad_norm": 0.0815353276461526, + "kl": 0.142181396484375, + "learning_rate": 4.927164508641388e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000447034836, + "rewards/format_reward_func": 1.0, + "step": 14136 + }, + { + "completion_length": 229.46429538726807, + "epoch": 2.3705100800536485, + "grad_norm": 0.10713861064394091, + "kl": 0.175323486328125, + "learning_rate": 4.927135761153242e-07, + "loss": 0.0002, + "reward": 1.7767857909202576, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14138 + }, + { + "completion_length": 225.665189743042, + "epoch": 2.370845383293516, + "grad_norm": 0.17673572626422573, + "kl": 0.131622314453125, + "learning_rate": 4.927107008076932e-07, + "loss": 0.0001, + "reward": 1.7160715013742447, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7205357495695353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14140 + }, + { + "completion_length": 227.540189743042, + "epoch": 2.3711806865333838, + "grad_norm": 0.33349945659697183, + "kl": 0.13836669921875, + "learning_rate": 4.92707824941252e-07, + "loss": 0.0001, + "reward": 1.7839286476373672, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14142 + }, + { + "completion_length": 220.33929347991943, + "epoch": 2.3715159897732514, + "grad_norm": 0.1168489542544372, + "kl": 0.1265869140625, + "learning_rate": 4.927049485160075e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 14144 + }, + { + "completion_length": 238.7634048461914, + "epoch": 2.3718512930131186, + "grad_norm": 0.1929672755226574, + "kl": 0.138336181640625, + "learning_rate": 4.927020715319662e-07, + "loss": 0.0001, + "reward": 1.8267857506871223, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8312500268220901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14146 + }, + { + "completion_length": 226.50893783569336, + "epoch": 2.3721865962529862, + "grad_norm": 0.22830195961372327, + "kl": 0.1256103515625, + "learning_rate": 4.926991939891346e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428962051868, + "rewards/format_reward_func": 1.0, + "step": 14148 + }, + { + "completion_length": 223.10268878936768, + "epoch": 2.372521899492854, + "grad_norm": 0.1226224587835622, + "kl": 0.162078857421875, + "learning_rate": 4.926963158875197e-07, + "loss": 0.0002, + "reward": 1.8214286044239998, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214286081492901, + "rewards/format_reward_func": 1.0, + "step": 14150 + }, + { + "completion_length": 239.80358123779297, + "epoch": 2.3728572027327215, + "grad_norm": 0.13037067304308758, + "kl": 0.135223388671875, + "learning_rate": 4.926934372271277e-07, + "loss": 0.0001, + "reward": 1.7625000849366188, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7669643275439739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14152 + }, + { + "completion_length": 234.99554538726807, + "epoch": 2.373192505972589, + "grad_norm": 0.2723590433917922, + "kl": 0.140380859375, + "learning_rate": 4.926905580079655e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7892857305705547, + "rewards/format_reward_func": 1.0, + "step": 14154 + }, + { + "completion_length": 249.51786613464355, + "epoch": 2.3735278092124563, + "grad_norm": 0.2768983996081548, + "kl": 0.42657470703125, + "learning_rate": 4.926876782300397e-07, + "loss": 0.0004, + "reward": 1.76071435213089, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7696428894996643, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14156 + }, + { + "completion_length": 228.89733028411865, + "epoch": 2.373863112452324, + "grad_norm": 0.2700517762375239, + "kl": 0.125091552734375, + "learning_rate": 4.926847978933568e-07, + "loss": 0.0001, + "reward": 1.760714367032051, + "reward_std": 0.04545686487108469, + "rewards/equation_reward_func": 0.7696428839117289, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14158 + }, + { + "completion_length": 228.62947463989258, + "epoch": 2.3741984156921916, + "grad_norm": 0.16048736519894619, + "kl": 0.1554718017578125, + "learning_rate": 4.926819169979235e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571674972773, + "rewards/format_reward_func": 1.0, + "step": 14160 + }, + { + "completion_length": 227.23215293884277, + "epoch": 2.3745337189320592, + "grad_norm": 0.18227184233243968, + "kl": 0.13006591796875, + "learning_rate": 4.926790355437466e-07, + "loss": 0.0001, + "reward": 1.7125000655651093, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7169643193483353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14162 + }, + { + "completion_length": 241.8660831451416, + "epoch": 2.374869022171927, + "grad_norm": 0.289783282635767, + "kl": 0.160186767578125, + "learning_rate": 4.926761535308324e-07, + "loss": 0.0002, + "reward": 1.844642885029316, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.849107176065445, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14164 + }, + { + "completion_length": 232.65179443359375, + "epoch": 2.3752043254117945, + "grad_norm": 0.1962872977790209, + "kl": 0.124542236328125, + "learning_rate": 4.926732709591878e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 14166 + }, + { + "completion_length": 240.821439743042, + "epoch": 2.3755396286516617, + "grad_norm": 0.22684733865237583, + "kl": 0.1270751953125, + "learning_rate": 4.926703878288194e-07, + "loss": 0.0001, + "reward": 1.712500087916851, + "reward_std": 0.07323605846613646, + "rewards/equation_reward_func": 0.716964315623045, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14168 + }, + { + "completion_length": 239.41518878936768, + "epoch": 2.3758749318915293, + "grad_norm": 0.21397832178290516, + "kl": 0.120819091796875, + "learning_rate": 4.926675041397338e-07, + "loss": 0.0001, + "reward": 1.7339286282658577, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7383928783237934, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14170 + }, + { + "completion_length": 232.6428689956665, + "epoch": 2.376210235131397, + "grad_norm": 0.18256904282426623, + "kl": 0.129730224609375, + "learning_rate": 4.926646198919375e-07, + "loss": 0.0001, + "reward": 1.7821428999304771, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428887546062, + "rewards/format_reward_func": 1.0, + "step": 14172 + }, + { + "completion_length": 226.29911613464355, + "epoch": 2.3765455383712646, + "grad_norm": 0.2511379796899202, + "kl": 0.1105499267578125, + "learning_rate": 4.926617350854373e-07, + "loss": 0.0001, + "reward": 1.728571504354477, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714596509933, + "rewards/format_reward_func": 1.0, + "step": 14174 + }, + { + "completion_length": 234.5357265472412, + "epoch": 2.3768808416111322, + "grad_norm": 0.11663380199458673, + "kl": 0.11688232421875, + "learning_rate": 4.926588497202399e-07, + "loss": 0.0001, + "reward": 1.7821428924798965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 14176 + }, + { + "completion_length": 232.5759048461914, + "epoch": 2.3772161448509994, + "grad_norm": 0.14246938662770156, + "kl": 0.097503662109375, + "learning_rate": 4.926559637963518e-07, + "loss": 0.0001, + "reward": 1.733928643167019, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7383928932249546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14178 + }, + { + "completion_length": 243.15179538726807, + "epoch": 2.377551448090867, + "grad_norm": 0.1807404842578879, + "kl": 0.113311767578125, + "learning_rate": 4.926530773137798e-07, + "loss": 0.0001, + "reward": 1.7464286610484123, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 14180 + }, + { + "completion_length": 237.43750953674316, + "epoch": 2.3778867513307347, + "grad_norm": 0.2560356006133875, + "kl": 0.116668701171875, + "learning_rate": 4.926501902725303e-07, + "loss": 0.0001, + "reward": 1.8196429014205933, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8241071775555611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14182 + }, + { + "completion_length": 238.04911708831787, + "epoch": 2.3782220545706023, + "grad_norm": 0.17475930712994994, + "kl": 0.139495849609375, + "learning_rate": 4.926473026726101e-07, + "loss": 0.0001, + "reward": 1.6910715028643608, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.6955357547849417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14184 + }, + { + "completion_length": 245.30358123779297, + "epoch": 2.37855735781047, + "grad_norm": 0.17446587499780564, + "kl": 0.1220703125, + "learning_rate": 4.92644414514026e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14186 + }, + { + "completion_length": 234.08036518096924, + "epoch": 2.3788926610503376, + "grad_norm": 0.27303907383285336, + "kl": 0.19439697265625, + "learning_rate": 4.926415257967843e-07, + "loss": 0.0002, + "reward": 1.7571429386734962, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 14188 + }, + { + "completion_length": 253.4732255935669, + "epoch": 2.3792279642902048, + "grad_norm": 0.14374585376478768, + "kl": 0.106475830078125, + "learning_rate": 4.92638636520892e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714700818062, + "rewards/format_reward_func": 1.0, + "step": 14190 + }, + { + "completion_length": 238.92411613464355, + "epoch": 2.3795632675300724, + "grad_norm": 0.09037984547445888, + "kl": 0.112701416015625, + "learning_rate": 4.926357466863556e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000208616257, + "rewards/format_reward_func": 1.0, + "step": 14192 + }, + { + "completion_length": 250.32143878936768, + "epoch": 2.37989857076994, + "grad_norm": 0.2056104027875523, + "kl": 0.109954833984375, + "learning_rate": 4.926328562931816e-07, + "loss": 0.0001, + "reward": 1.712500087916851, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7169643193483353, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14194 + }, + { + "completion_length": 254.5714406967163, + "epoch": 2.3802338740098077, + "grad_norm": 0.21454443708666152, + "kl": 0.124755859375, + "learning_rate": 4.92629965341377e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 14196 + }, + { + "completion_length": 245.61161708831787, + "epoch": 2.3805691772496753, + "grad_norm": 0.21721769675845912, + "kl": 0.1088714599609375, + "learning_rate": 4.92627073830948e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 14198 + }, + { + "completion_length": 251.32590675354004, + "epoch": 2.3809044804895425, + "grad_norm": 0.14698313947537994, + "kl": 0.118255615234375, + "learning_rate": 4.926241817619017e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 14200 + }, + { + "completion_length": 247.36161518096924, + "epoch": 2.38123978372941, + "grad_norm": 0.2984387094846763, + "kl": 0.11102294921875, + "learning_rate": 4.926212891342445e-07, + "loss": 0.0001, + "reward": 1.8160714954137802, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.820535734295845, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14202 + }, + { + "completion_length": 246.8259038925171, + "epoch": 2.3815750869692778, + "grad_norm": 0.19331540875940945, + "kl": 0.115264892578125, + "learning_rate": 4.926183959479833e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.753571467474103, + "rewards/format_reward_func": 1.0, + "step": 14204 + }, + { + "completion_length": 245.6026906967163, + "epoch": 2.3819103902091454, + "grad_norm": 0.0032176377544802333, + "kl": 0.115203857421875, + "learning_rate": 4.926155022031244e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 14206 + }, + { + "completion_length": 240.30357933044434, + "epoch": 2.382245693449013, + "grad_norm": 0.22095254078246251, + "kl": 0.111480712890625, + "learning_rate": 4.926126078996747e-07, + "loss": 0.0001, + "reward": 1.8017857745289803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500283122063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14208 + }, + { + "completion_length": 252.48661613464355, + "epoch": 2.3825809966888807, + "grad_norm": 0.268309403775508, + "kl": 0.125274658203125, + "learning_rate": 4.926097130376409e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071723401546, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14210 + }, + { + "completion_length": 244.227689743042, + "epoch": 2.382916299928748, + "grad_norm": 0.1499611496941687, + "kl": 0.1090087890625, + "learning_rate": 4.926068176170295e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285962283611, + "rewards/format_reward_func": 1.0, + "step": 14212 + }, + { + "completion_length": 250.77233123779297, + "epoch": 2.3832516031686155, + "grad_norm": 0.108530994446496, + "kl": 0.17041015625, + "learning_rate": 4.926039216378472e-07, + "loss": 0.0002, + "reward": 1.7696429267525673, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7741071619093418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14214 + }, + { + "completion_length": 239.0535831451416, + "epoch": 2.383586906408483, + "grad_norm": 0.14041276773962685, + "kl": 0.10272216796875, + "learning_rate": 4.926010251001008e-07, + "loss": 0.0001, + "reward": 1.8321429044008255, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321428745985031, + "rewards/format_reward_func": 1.0, + "step": 14216 + }, + { + "completion_length": 237.2901906967163, + "epoch": 2.3839222096483508, + "grad_norm": 0.12151716109331748, + "kl": 0.14166259765625, + "learning_rate": 4.92598128003797e-07, + "loss": 0.0001, + "reward": 1.7196429297327995, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7241071742027998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14218 + }, + { + "completion_length": 240.2455472946167, + "epoch": 2.3842575128882184, + "grad_norm": 0.16364630154217485, + "kl": 0.126220703125, + "learning_rate": 4.925952303489422e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7535714693367481, + "rewards/format_reward_func": 1.0, + "step": 14220 + }, + { + "completion_length": 233.61608219146729, + "epoch": 2.3845928161280856, + "grad_norm": 0.17529658175337104, + "kl": 0.133697509765625, + "learning_rate": 4.925923321355433e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143092900515, + "rewards/format_reward_func": 1.0, + "step": 14222 + }, + { + "completion_length": 241.4732255935669, + "epoch": 2.3849281193679532, + "grad_norm": 0.09252615676509375, + "kl": 0.133056640625, + "learning_rate": 4.925894333636069e-07, + "loss": 0.0001, + "reward": 1.7285714969038963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7285714671015739, + "rewards/format_reward_func": 1.0, + "step": 14224 + }, + { + "completion_length": 234.6294765472412, + "epoch": 2.385263422607821, + "grad_norm": 0.12747518818637713, + "kl": 0.110015869140625, + "learning_rate": 4.925865340331398e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7214286122471094, + "rewards/format_reward_func": 1.0, + "step": 14226 + }, + { + "completion_length": 242.12947463989258, + "epoch": 2.3855987258476885, + "grad_norm": 0.23217568953704362, + "kl": 0.135894775390625, + "learning_rate": 4.925836341441484e-07, + "loss": 0.0001, + "reward": 1.6642858162522316, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.6642857529222965, + "rewards/format_reward_func": 1.0, + "step": 14228 + }, + { + "completion_length": 241.42858409881592, + "epoch": 2.385934029087556, + "grad_norm": 0.32283599449486605, + "kl": 0.107147216796875, + "learning_rate": 4.925807336966395e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 14230 + }, + { + "completion_length": 241.62054824829102, + "epoch": 2.3862693323274238, + "grad_norm": 0.4361940214247699, + "kl": 0.099517822265625, + "learning_rate": 4.925778326906198e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428757160902, + "rewards/format_reward_func": 1.0, + "step": 14232 + }, + { + "completion_length": 241.95983409881592, + "epoch": 2.3866046355672914, + "grad_norm": 0.15672447960175115, + "kl": 0.114654541015625, + "learning_rate": 4.925749311260962e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 14234 + }, + { + "completion_length": 247.38840293884277, + "epoch": 2.3869399388071586, + "grad_norm": 0.12357675091633724, + "kl": 0.127593994140625, + "learning_rate": 4.92572029003075e-07, + "loss": 0.0001, + "reward": 1.8392857760190964, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8392857313156128, + "rewards/format_reward_func": 1.0, + "step": 14236 + }, + { + "completion_length": 238.7009048461914, + "epoch": 2.3872752420470262, + "grad_norm": 0.34950113473575367, + "kl": 0.118927001953125, + "learning_rate": 4.92569126321563e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 14238 + }, + { + "completion_length": 238.56250762939453, + "epoch": 2.387610545286894, + "grad_norm": 0.005908223008530719, + "kl": 0.1171875, + "learning_rate": 4.925662230815671e-07, + "loss": 0.0001, + "reward": 1.703571505844593, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7035714685916901, + "rewards/format_reward_func": 1.0, + "step": 14240 + }, + { + "completion_length": 238.3705472946167, + "epoch": 2.3879458485267615, + "grad_norm": 0.23473823838715496, + "kl": 0.10906982421875, + "learning_rate": 4.925633192830937e-07, + "loss": 0.0001, + "reward": 1.7660714909434319, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7705357410013676, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14242 + }, + { + "completion_length": 234.7678680419922, + "epoch": 2.3882811517666287, + "grad_norm": 0.26786727999529003, + "kl": 0.112030029296875, + "learning_rate": 4.925604149261497e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 14244 + }, + { + "completion_length": 233.0223331451416, + "epoch": 2.3886164550064963, + "grad_norm": 0.18724417602482607, + "kl": 0.10772705078125, + "learning_rate": 4.925575100107415e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714286185801029, + "rewards/format_reward_func": 1.0, + "step": 14246 + }, + { + "completion_length": 240.4776906967163, + "epoch": 2.388951758246364, + "grad_norm": 0.1619747174711402, + "kl": 0.1134033203125, + "learning_rate": 4.925546045368762e-07, + "loss": 0.0001, + "reward": 1.850000038743019, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8500000201165676, + "rewards/format_reward_func": 1.0, + "step": 14248 + }, + { + "completion_length": 235.22322463989258, + "epoch": 2.3892870614862316, + "grad_norm": 0.14819214172171727, + "kl": 0.11566162109375, + "learning_rate": 4.925516985045601e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 14250 + }, + { + "completion_length": 238.0446538925171, + "epoch": 2.3896223647260992, + "grad_norm": 0.14841388129257704, + "kl": 0.229644775390625, + "learning_rate": 4.925487919138001e-07, + "loss": 0.0002, + "reward": 1.7928571924567223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571850061417, + "rewards/format_reward_func": 1.0, + "step": 14252 + }, + { + "completion_length": 227.77679634094238, + "epoch": 2.389957667965967, + "grad_norm": 0.20086124023025853, + "kl": 0.114776611328125, + "learning_rate": 4.925458847646029e-07, + "loss": 0.0001, + "reward": 1.8107143566012383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.810714315623045, + "rewards/format_reward_func": 1.0, + "step": 14254 + }, + { + "completion_length": 233.6250114440918, + "epoch": 2.3902929712058345, + "grad_norm": 0.2715224088342795, + "kl": 0.108551025390625, + "learning_rate": 4.925429770569751e-07, + "loss": 0.0001, + "reward": 1.7839286252856255, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.788392897695303, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14256 + }, + { + "completion_length": 227.0446538925171, + "epoch": 2.3906282744457017, + "grad_norm": 0.18961583525723497, + "kl": 0.1087646484375, + "learning_rate": 4.925400687909235e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143294066191, + "rewards/format_reward_func": 1.0, + "step": 14258 + }, + { + "completion_length": 240.30804443359375, + "epoch": 2.3909635776855693, + "grad_norm": 0.20396875238100765, + "kl": 0.1153564453125, + "learning_rate": 4.925371599664547e-07, + "loss": 0.0001, + "reward": 1.8125000298023224, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.816964328289032, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14260 + }, + { + "completion_length": 235.96429538726807, + "epoch": 2.391298880925437, + "grad_norm": 1.6884351687415977, + "kl": 0.130615234375, + "learning_rate": 4.925342505835754e-07, + "loss": 0.0001, + "reward": 1.8107143118977547, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.810714328661561, + "rewards/format_reward_func": 1.0, + "step": 14262 + }, + { + "completion_length": 245.05804538726807, + "epoch": 2.3916341841653046, + "grad_norm": 0.1875430097990074, + "kl": 0.109161376953125, + "learning_rate": 4.925313406422924e-07, + "loss": 0.0001, + "reward": 1.7642858028411865, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857413738966, + "rewards/format_reward_func": 1.0, + "step": 14264 + }, + { + "completion_length": 240.56250953674316, + "epoch": 2.3919694874051722, + "grad_norm": 0.17942496787693438, + "kl": 0.10748291015625, + "learning_rate": 4.925284301426122e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7785714454948902, + "rewards/format_reward_func": 1.0, + "step": 14266 + }, + { + "completion_length": 237.4821538925171, + "epoch": 2.3923047906450394, + "grad_norm": 0.09619269901935326, + "kl": 0.11822509765625, + "learning_rate": 4.925255190845418e-07, + "loss": 0.0001, + "reward": 1.7892857491970062, + "reward_std": 0.005050762556493282, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 14268 + }, + { + "completion_length": 235.8928680419922, + "epoch": 2.392640093884907, + "grad_norm": 0.11474742306725783, + "kl": 0.108551025390625, + "learning_rate": 4.925226074680876e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 14270 + }, + { + "completion_length": 247.57143878936768, + "epoch": 2.3929753971247747, + "grad_norm": 0.15741634440553448, + "kl": 0.1328125, + "learning_rate": 4.925196952932565e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 14272 + }, + { + "completion_length": 251.08929920196533, + "epoch": 2.3933107003646423, + "grad_norm": 0.10766437431018219, + "kl": 0.12640380859375, + "learning_rate": 4.925167825600552e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.775000037625432, + "rewards/format_reward_func": 1.0, + "step": 14274 + }, + { + "completion_length": 257.86609077453613, + "epoch": 2.39364600360451, + "grad_norm": 0.16834165502722323, + "kl": 0.108551025390625, + "learning_rate": 4.925138692684902e-07, + "loss": 0.0001, + "reward": 1.8214286267757416, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214286006987095, + "rewards/format_reward_func": 1.0, + "step": 14276 + }, + { + "completion_length": 247.4509048461914, + "epoch": 2.3939813068443776, + "grad_norm": 0.17944890412209027, + "kl": 0.102783203125, + "learning_rate": 4.925109554185685e-07, + "loss": 0.0001, + "reward": 1.801785759627819, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500283122063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14278 + }, + { + "completion_length": 251.76340198516846, + "epoch": 2.394316610084245, + "grad_norm": 0.11806261551808396, + "kl": 0.233489990234375, + "learning_rate": 4.925080410102965e-07, + "loss": 0.0002, + "reward": 1.6642857864499092, + "reward_std": 0.03030457627028227, + "rewards/equation_reward_func": 0.6732143275439739, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14280 + }, + { + "completion_length": 256.3169755935669, + "epoch": 2.3946519133241124, + "grad_norm": 0.282034937595845, + "kl": 0.12908935546875, + "learning_rate": 4.925051260436812e-07, + "loss": 0.0001, + "reward": 1.7750000581145287, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000283122063, + "rewards/format_reward_func": 1.0, + "step": 14282 + }, + { + "completion_length": 252.32144260406494, + "epoch": 2.39498721656398, + "grad_norm": 0.17036559088687567, + "kl": 0.13946533203125, + "learning_rate": 4.925022105187291e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714790225029, + "rewards/format_reward_func": 1.0, + "step": 14284 + }, + { + "completion_length": 258.7276887893677, + "epoch": 2.3953225198038477, + "grad_norm": 0.1503807619374923, + "kl": 0.144378662109375, + "learning_rate": 4.924992944354472e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 14286 + }, + { + "completion_length": 244.72768783569336, + "epoch": 2.3956578230437153, + "grad_norm": 0.21012577005227273, + "kl": 0.1162109375, + "learning_rate": 4.924963777938417e-07, + "loss": 0.0001, + "reward": 1.8000000715255737, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 14288 + }, + { + "completion_length": 248.89733219146729, + "epoch": 2.3959931262835825, + "grad_norm": 0.3899478536470825, + "kl": 0.423553466796875, + "learning_rate": 4.924934605939199e-07, + "loss": 0.0004, + "reward": 1.7821428924798965, + "reward_std": 0.045456865802407265, + "rewards/equation_reward_func": 0.7910714633762836, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14290 + }, + { + "completion_length": 246.04018688201904, + "epoch": 2.39632842952345, + "grad_norm": 0.13040493195453964, + "kl": 0.135955810546875, + "learning_rate": 4.924905428356882e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7776786088943481, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14292 + }, + { + "completion_length": 256.4910840988159, + "epoch": 2.396663732763318, + "grad_norm": 0.22800534481410561, + "kl": 0.128997802734375, + "learning_rate": 4.924876245191532e-07, + "loss": 0.0001, + "reward": 1.7839286178350449, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7883928902447224, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14294 + }, + { + "completion_length": 260.1651887893677, + "epoch": 2.3969990360031854, + "grad_norm": 0.25404098138978654, + "kl": 0.384521484375, + "learning_rate": 4.924847056443219e-07, + "loss": 0.0004, + "reward": 1.8250000774860382, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8250000216066837, + "rewards/format_reward_func": 1.0, + "step": 14296 + }, + { + "completion_length": 250.84376335144043, + "epoch": 2.397334339243053, + "grad_norm": 0.2945103278601156, + "kl": 0.115875244140625, + "learning_rate": 4.924817862112008e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.05808377172797918, + "rewards/equation_reward_func": 0.7651786096394062, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14298 + }, + { + "completion_length": 262.4776916503906, + "epoch": 2.3976696424829207, + "grad_norm": 0.13702960543474418, + "kl": 0.113006591796875, + "learning_rate": 4.924788662197968e-07, + "loss": 0.0001, + "reward": 1.7053572237491608, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7098214775323868, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14300 + }, + { + "completion_length": 260.9151906967163, + "epoch": 2.398004945722788, + "grad_norm": 0.16906523532274226, + "kl": 0.54217529296875, + "learning_rate": 4.924759456701166e-07, + "loss": 0.0005, + "reward": 1.7392857745289803, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7482143081724644, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14302 + }, + { + "completion_length": 258.808048248291, + "epoch": 2.3983402489626555, + "grad_norm": 0.37236756985967784, + "kl": 0.363616943359375, + "learning_rate": 4.924730245621669e-07, + "loss": 0.0004, + "reward": 1.8000000417232513, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 14304 + }, + { + "completion_length": 255.62054538726807, + "epoch": 2.398675552202523, + "grad_norm": 0.0919150041745982, + "kl": 0.917205810546875, + "learning_rate": 4.924701028959543e-07, + "loss": 0.0009, + "reward": 1.7750000432133675, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 14306 + }, + { + "completion_length": 247.10269165039062, + "epoch": 2.399010855442391, + "grad_norm": 0.20273143904873656, + "kl": 0.12310791015625, + "learning_rate": 4.924671806714856e-07, + "loss": 0.0001, + "reward": 1.7589286342263222, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928917348385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14308 + }, + { + "completion_length": 246.0580472946167, + "epoch": 2.3993461586822584, + "grad_norm": 0.19527974290438713, + "kl": 0.1595458984375, + "learning_rate": 4.924642578887675e-07, + "loss": 0.0002, + "reward": 1.8339286148548126, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8383928760886192, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14310 + }, + { + "completion_length": 260.1919746398926, + "epoch": 2.3996814619221256, + "grad_norm": 0.4782948238589906, + "kl": 0.147247314453125, + "learning_rate": 4.924613345478069e-07, + "loss": 0.0001, + "reward": 1.6857143640518188, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.6857143305242062, + "rewards/format_reward_func": 1.0, + "step": 14312 + }, + { + "completion_length": 261.4419775009155, + "epoch": 2.4000167651619932, + "grad_norm": 0.5121912858089605, + "kl": 2.307708740234375, + "learning_rate": 4.924584106486105e-07, + "loss": 0.0023, + "reward": 1.7446429207921028, + "reward_std": 0.06818529684096575, + "rewards/equation_reward_func": 0.7580357529222965, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14314 + }, + { + "completion_length": 245.25447463989258, + "epoch": 2.400352068401861, + "grad_norm": 0.18907697095639242, + "kl": 0.214569091796875, + "learning_rate": 4.924554861911847e-07, + "loss": 0.0002, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428902447224, + "rewards/format_reward_func": 1.0, + "step": 14316 + }, + { + "completion_length": 242.28125953674316, + "epoch": 2.4006873716417285, + "grad_norm": 0.24829432582435484, + "kl": 1.02117919921875, + "learning_rate": 4.924525611755366e-07, + "loss": 0.001, + "reward": 1.8035714849829674, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.8125000260770321, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14318 + }, + { + "completion_length": 246.0178689956665, + "epoch": 2.401022674881596, + "grad_norm": 0.5520300313907123, + "kl": 0.346954345703125, + "learning_rate": 4.924496356016729e-07, + "loss": 0.0003, + "reward": 1.7267857864499092, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7401785887777805, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14320 + }, + { + "completion_length": 255.9821538925171, + "epoch": 2.401357978121464, + "grad_norm": 0.23108229551636367, + "kl": 0.32366943359375, + "learning_rate": 4.924467094696001e-07, + "loss": 0.0003, + "reward": 1.728571504354477, + "reward_std": 0.06060915347188711, + "rewards/equation_reward_func": 0.7375000342726707, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14322 + }, + { + "completion_length": 258.02679538726807, + "epoch": 2.401693281361331, + "grad_norm": 0.5556203153502393, + "kl": 1.0614013671875, + "learning_rate": 4.924437827793251e-07, + "loss": 0.0011, + "reward": 1.705357201397419, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.718750037252903, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14324 + }, + { + "completion_length": 250.02233505249023, + "epoch": 2.4020285846011986, + "grad_norm": 0.33785567885457496, + "kl": 0.931396484375, + "learning_rate": 4.924408555308547e-07, + "loss": 0.0009, + "reward": 1.7732143327593803, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7866071797907352, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14326 + }, + { + "completion_length": 257.00894260406494, + "epoch": 2.4023638878410662, + "grad_norm": 0.5425677862198246, + "kl": 3.427093505859375, + "learning_rate": 4.924379277241955e-07, + "loss": 0.0034, + "reward": 1.7357143685221672, + "reward_std": 0.0505076264962554, + "rewards/equation_reward_func": 0.7446428909897804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14328 + }, + { + "completion_length": 241.6562614440918, + "epoch": 2.402699191080934, + "grad_norm": 0.08042267566696397, + "kl": 0.248321533203125, + "learning_rate": 4.924349993593543e-07, + "loss": 0.0002, + "reward": 1.7821428924798965, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7910714633762836, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14330 + }, + { + "completion_length": 250.99108409881592, + "epoch": 2.4030344943208015, + "grad_norm": 0.23087088388512614, + "kl": 0.36553955078125, + "learning_rate": 4.924320704363378e-07, + "loss": 0.0004, + "reward": 1.7357143461704254, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 0.9821428656578064, + "step": 14332 + }, + { + "completion_length": 253.00447750091553, + "epoch": 2.4033697975606687, + "grad_norm": 0.32966255660926874, + "kl": 0.310394287109375, + "learning_rate": 4.924291409551529e-07, + "loss": 0.0003, + "reward": 1.717857226729393, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7267857417464256, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14334 + }, + { + "completion_length": 240.29465103149414, + "epoch": 2.4037051008005363, + "grad_norm": 0.23779095458913324, + "kl": 0.32281494140625, + "learning_rate": 4.924262109158061e-07, + "loss": 0.0003, + "reward": 1.7839286252856255, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7973214611411095, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14336 + }, + { + "completion_length": 241.95536994934082, + "epoch": 2.404040404040404, + "grad_norm": 0.1600356620901426, + "kl": 0.13165283203125, + "learning_rate": 4.924232803183044e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.0328299580141902, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14338 + }, + { + "completion_length": 250.74108219146729, + "epoch": 2.4043757072802716, + "grad_norm": 0.24163101976938872, + "kl": 0.1605224609375, + "learning_rate": 4.924203491626544e-07, + "loss": 0.0002, + "reward": 1.7589286267757416, + "reward_std": 0.07828682102262974, + "rewards/equation_reward_func": 0.7723214514553547, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14340 + }, + { + "completion_length": 239.1517972946167, + "epoch": 2.4047110105201392, + "grad_norm": 0.2275927010975339, + "kl": 0.12322998046875, + "learning_rate": 4.924174174488628e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8026785999536514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14342 + }, + { + "completion_length": 230.55804538726807, + "epoch": 2.405046313760007, + "grad_norm": 0.18734378399333992, + "kl": 0.105255126953125, + "learning_rate": 4.924144851769365e-07, + "loss": 0.0001, + "reward": 1.7732143551111221, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14344 + }, + { + "completion_length": 238.5267972946167, + "epoch": 2.405381616999874, + "grad_norm": 0.18580847734440029, + "kl": 0.16204833984375, + "learning_rate": 4.924115523468821e-07, + "loss": 0.0002, + "reward": 1.716071493923664, + "reward_std": 0.05808377172797918, + "rewards/equation_reward_func": 0.7294643204659224, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14346 + }, + { + "completion_length": 231.34375953674316, + "epoch": 2.4057169202397417, + "grad_norm": 0.19578708489691765, + "kl": 0.121368408203125, + "learning_rate": 4.924086189587065e-07, + "loss": 0.0001, + "reward": 1.7910714969038963, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7955357283353806, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14348 + }, + { + "completion_length": 231.26786994934082, + "epoch": 2.4060522234796093, + "grad_norm": 0.19892274262116755, + "kl": 0.140625, + "learning_rate": 4.924056850124164e-07, + "loss": 0.0001, + "reward": 1.719642922282219, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7241071686148643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14350 + }, + { + "completion_length": 241.85715579986572, + "epoch": 2.406387526719477, + "grad_norm": 0.1743829897833669, + "kl": 0.119781494140625, + "learning_rate": 4.924027505080185e-07, + "loss": 0.0001, + "reward": 1.7857143357396126, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143245637417, + "rewards/format_reward_func": 1.0, + "step": 14352 + }, + { + "completion_length": 239.40626049041748, + "epoch": 2.4067228299593446, + "grad_norm": 0.39051117823511666, + "kl": 0.1546630859375, + "learning_rate": 4.923998154455195e-07, + "loss": 0.0002, + "reward": 1.7732143104076385, + "reward_std": 0.06818529777228832, + "rewards/equation_reward_func": 0.7955357357859612, + "rewards/format_reward_func": 0.977678582072258, + "step": 14354 + }, + { + "completion_length": 240.5580472946167, + "epoch": 2.407058133199212, + "grad_norm": 0.003754627012270124, + "kl": 0.13525390625, + "learning_rate": 4.923968798249264e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 14356 + }, + { + "completion_length": 234.4821538925171, + "epoch": 2.4073934364390794, + "grad_norm": 0.1461006787847886, + "kl": 0.112762451171875, + "learning_rate": 4.923939436462457e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 14358 + }, + { + "completion_length": 248.38393878936768, + "epoch": 2.407728739678947, + "grad_norm": 0.24628964012318919, + "kl": 0.124847412109375, + "learning_rate": 4.923910069094843e-07, + "loss": 0.0001, + "reward": 1.7504465207457542, + "reward_std": 0.04987628059461713, + "rewards/equation_reward_func": 0.7562500201165676, + "rewards/format_reward_func": 0.9941964335739613, + "step": 14360 + }, + { + "completion_length": 235.34376049041748, + "epoch": 2.4080640429188147, + "grad_norm": 0.2245625890808432, + "kl": 0.107025146484375, + "learning_rate": 4.92388069614649e-07, + "loss": 0.0001, + "reward": 1.7107143700122833, + "reward_std": 0.05555838905274868, + "rewards/equation_reward_func": 0.7196428887546062, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14362 + }, + { + "completion_length": 249.2187623977661, + "epoch": 2.4083993461586823, + "grad_norm": 0.17260432031337294, + "kl": 0.115936279296875, + "learning_rate": 4.923851317617464e-07, + "loss": 0.0001, + "reward": 1.751785784959793, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14364 + }, + { + "completion_length": 244.25447463989258, + "epoch": 2.40873464939855, + "grad_norm": 0.2208763728434164, + "kl": 0.113006591796875, + "learning_rate": 4.923821933507834e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 14366 + }, + { + "completion_length": 245.88394165039062, + "epoch": 2.4090699526384176, + "grad_norm": 0.23033703151636636, + "kl": 0.10723876953125, + "learning_rate": 4.923792543817667e-07, + "loss": 0.0001, + "reward": 1.7785715013742447, + "reward_std": 0.04040610138326883, + "rewards/equation_reward_func": 0.7875000312924385, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14368 + }, + { + "completion_length": 236.25894165039062, + "epoch": 2.409405255878285, + "grad_norm": 0.2851470594450002, + "kl": 0.143096923828125, + "learning_rate": 4.923763148547031e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 14370 + }, + { + "completion_length": 235.01786708831787, + "epoch": 2.4097405591181524, + "grad_norm": 0.1443877591575158, + "kl": 0.10845947265625, + "learning_rate": 4.923733747695994e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428999304771, + "rewards/format_reward_func": 1.0, + "step": 14372 + }, + { + "completion_length": 236.65179538726807, + "epoch": 2.41007586235802, + "grad_norm": 0.0027849364925823456, + "kl": 0.105072021484375, + "learning_rate": 4.923704341264623e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7812500335276127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14374 + }, + { + "completion_length": 236.2455472946167, + "epoch": 2.4104111655978877, + "grad_norm": 0.2298392056406199, + "kl": 0.108001708984375, + "learning_rate": 4.923674929252985e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714581608772, + "rewards/format_reward_func": 1.0, + "step": 14376 + }, + { + "completion_length": 234.21876049041748, + "epoch": 2.410746468837755, + "grad_norm": 0.1903379262753221, + "kl": 0.114990234375, + "learning_rate": 4.92364551166115e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.803571455180645, + "rewards/format_reward_func": 1.0, + "step": 14378 + }, + { + "completion_length": 238.33036613464355, + "epoch": 2.4110817720776225, + "grad_norm": 0.2296659492073621, + "kl": 0.106414794921875, + "learning_rate": 4.923616088489183e-07, + "loss": 0.0001, + "reward": 1.787500061094761, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14380 + }, + { + "completion_length": 240.4509048461914, + "epoch": 2.41141707531749, + "grad_norm": 0.1183545544888929, + "kl": 0.10882568359375, + "learning_rate": 4.923586659737154e-07, + "loss": 0.0001, + "reward": 1.7696429044008255, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7741071879863739, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14382 + }, + { + "completion_length": 241.94197463989258, + "epoch": 2.411752378557358, + "grad_norm": 0.2451140468106907, + "kl": 0.1093902587890625, + "learning_rate": 4.92355722540513e-07, + "loss": 0.0001, + "reward": 1.7875000536441803, + "reward_std": 0.06818529590964317, + "rewards/equation_reward_func": 0.7919643186032772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14384 + }, + { + "completion_length": 231.56251049041748, + "epoch": 2.4120876817972254, + "grad_norm": 0.15612636244581551, + "kl": 0.103759765625, + "learning_rate": 4.923527785493179e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 14386 + }, + { + "completion_length": 237.1205472946167, + "epoch": 2.412422985037093, + "grad_norm": 0.22794551959835807, + "kl": 0.105438232421875, + "learning_rate": 4.923498340001369e-07, + "loss": 0.0001, + "reward": 1.7839286103844643, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928939700127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14388 + }, + { + "completion_length": 241.57590675354004, + "epoch": 2.4127582882769607, + "grad_norm": 0.21870437344172014, + "kl": 0.112030029296875, + "learning_rate": 4.923468888929766e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428842842579, + "rewards/format_reward_func": 1.0, + "step": 14390 + }, + { + "completion_length": 237.82143878936768, + "epoch": 2.413093591516828, + "grad_norm": 0.3153485246471543, + "kl": 0.113677978515625, + "learning_rate": 4.923439432278439e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 14392 + }, + { + "completion_length": 240.8303680419922, + "epoch": 2.4134288947566955, + "grad_norm": 0.1720802940204395, + "kl": 0.1099853515625, + "learning_rate": 4.923409970047456e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571846336126, + "rewards/format_reward_func": 1.0, + "step": 14394 + }, + { + "completion_length": 242.5312614440918, + "epoch": 2.413764197996563, + "grad_norm": 0.2575036019942336, + "kl": 0.109466552734375, + "learning_rate": 4.923380502236885e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14396 + }, + { + "completion_length": 249.15179824829102, + "epoch": 2.414099501236431, + "grad_norm": 0.12303614393479674, + "kl": 0.11273193359375, + "learning_rate": 4.923351028846794e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000342726707, + "rewards/format_reward_func": 1.0, + "step": 14398 + }, + { + "completion_length": 237.24554634094238, + "epoch": 2.4144348044762984, + "grad_norm": 0.22113499831589165, + "kl": 0.114288330078125, + "learning_rate": 4.92332154987725e-07, + "loss": 0.0001, + "reward": 1.7803572416305542, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.784821443259716, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14400 + }, + { + "completion_length": 246.93304920196533, + "epoch": 2.4147701077161656, + "grad_norm": 0.34530621108268905, + "kl": 0.130767822265625, + "learning_rate": 4.923292065328321e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 14402 + }, + { + "completion_length": 253.29018878936768, + "epoch": 2.4151054109560333, + "grad_norm": 0.22128352179118405, + "kl": 0.125274658203125, + "learning_rate": 4.923262575200076e-07, + "loss": 0.0001, + "reward": 1.7946428805589676, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.7991071753203869, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14404 + }, + { + "completion_length": 251.0848331451416, + "epoch": 2.415440714195901, + "grad_norm": 0.06485660401128394, + "kl": 0.122039794921875, + "learning_rate": 4.923233079492582e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928693830967, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14406 + }, + { + "completion_length": 249.37501430511475, + "epoch": 2.4157760174357685, + "grad_norm": 0.11814345401091123, + "kl": 0.10894775390625, + "learning_rate": 4.923203578205906e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 14408 + }, + { + "completion_length": 246.2678689956665, + "epoch": 2.416111320675636, + "grad_norm": 0.044181690154128005, + "kl": 0.11151123046875, + "learning_rate": 4.923174071340117e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857566475868, + "rewards/format_reward_func": 1.0, + "step": 14410 + }, + { + "completion_length": 254.5000123977661, + "epoch": 2.416446623915504, + "grad_norm": 0.12192966153626056, + "kl": 0.104949951171875, + "learning_rate": 4.923144558895283e-07, + "loss": 0.0001, + "reward": 1.805357187986374, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8098214603960514, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14412 + }, + { + "completion_length": 250.5625123977661, + "epoch": 2.416781927155371, + "grad_norm": 0.18429756344803158, + "kl": 0.102203369140625, + "learning_rate": 4.923115040871472e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 14414 + }, + { + "completion_length": 249.3169755935669, + "epoch": 2.4171172303952386, + "grad_norm": 0.15038284538850447, + "kl": 0.105133056640625, + "learning_rate": 4.923085517268752e-07, + "loss": 0.0001, + "reward": 1.805357187986374, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8187500275671482, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14416 + }, + { + "completion_length": 254.67411994934082, + "epoch": 2.4174525336351063, + "grad_norm": 0.05553958299680993, + "kl": 0.1148681640625, + "learning_rate": 4.92305598808719e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714752972126, + "rewards/format_reward_func": 1.0, + "step": 14418 + }, + { + "completion_length": 251.93751049041748, + "epoch": 2.417787836874974, + "grad_norm": 0.219178777856766, + "kl": 0.1243896484375, + "learning_rate": 4.923026453326856e-07, + "loss": 0.0001, + "reward": 1.739285759627819, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.739285746589303, + "rewards/format_reward_func": 1.0, + "step": 14420 + }, + { + "completion_length": 250.5267972946167, + "epoch": 2.4181231401148415, + "grad_norm": 0.21583952375718596, + "kl": 0.101226806640625, + "learning_rate": 4.922996912987815e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143171131611, + "rewards/format_reward_func": 1.0, + "step": 14422 + }, + { + "completion_length": 252.9821548461914, + "epoch": 2.4184584433547087, + "grad_norm": 0.14685803789461777, + "kl": 0.110504150390625, + "learning_rate": 4.922967367070139e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571619093418, + "rewards/format_reward_func": 1.0, + "step": 14424 + }, + { + "completion_length": 250.14286613464355, + "epoch": 2.4187937465945764, + "grad_norm": 0.19287115760743048, + "kl": 0.1021728515625, + "learning_rate": 4.922937815573892e-07, + "loss": 0.0001, + "reward": 1.7232143580913544, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7276786081492901, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14426 + }, + { + "completion_length": 253.7053689956665, + "epoch": 2.419129049834444, + "grad_norm": 0.17641124181431558, + "kl": 0.116424560546875, + "learning_rate": 4.922908258499144e-07, + "loss": 0.0001, + "reward": 1.74642863124609, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7464286033064127, + "rewards/format_reward_func": 1.0, + "step": 14428 + }, + { + "completion_length": 246.64733505249023, + "epoch": 2.4194643530743116, + "grad_norm": 0.24730402074427776, + "kl": 0.0968017578125, + "learning_rate": 4.922878695845964e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.07071067858487368, + "rewards/equation_reward_func": 0.7589286118745804, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14430 + }, + { + "completion_length": 242.6205472946167, + "epoch": 2.4197996563141793, + "grad_norm": 0.1609843188328104, + "kl": 0.107147216796875, + "learning_rate": 4.922849127614417e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8071428872644901, + "rewards/format_reward_func": 1.0, + "step": 14432 + }, + { + "completion_length": 246.95090293884277, + "epoch": 2.420134959554047, + "grad_norm": 0.08023298552883207, + "kl": 0.1016845703125, + "learning_rate": 4.922819553804575e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 14434 + }, + { + "completion_length": 258.5803689956665, + "epoch": 2.420470262793914, + "grad_norm": 0.1708562641566658, + "kl": 0.09808349609375, + "learning_rate": 4.922789974416503e-07, + "loss": 0.0001, + "reward": 1.805357187986374, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8098214417695999, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14436 + }, + { + "completion_length": 253.1294755935669, + "epoch": 2.4208055660337817, + "grad_norm": 0.1869148624482686, + "kl": 0.091583251953125, + "learning_rate": 4.92276038945027e-07, + "loss": 0.0001, + "reward": 1.7892857864499092, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857529222965, + "rewards/format_reward_func": 1.0, + "step": 14438 + }, + { + "completion_length": 245.75893878936768, + "epoch": 2.4211408692736494, + "grad_norm": 0.23055645746875422, + "kl": 0.1028900146484375, + "learning_rate": 4.922730798905944e-07, + "loss": 0.0001, + "reward": 1.732142947614193, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428898721933, + "rewards/format_reward_func": 1.0, + "step": 14440 + }, + { + "completion_length": 245.9330472946167, + "epoch": 2.421476172513517, + "grad_norm": 0.2640415440209392, + "kl": 0.10809326171875, + "learning_rate": 4.922701202783593e-07, + "loss": 0.0001, + "reward": 1.8107143491506577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 14442 + }, + { + "completion_length": 249.7053689956665, + "epoch": 2.4218114757533846, + "grad_norm": 0.16123161749216006, + "kl": 0.100982666015625, + "learning_rate": 4.922671601083287e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143208384514, + "rewards/format_reward_func": 1.0, + "step": 14444 + }, + { + "completion_length": 243.96429443359375, + "epoch": 2.422146778993252, + "grad_norm": 0.19344738804761819, + "kl": 0.101409912109375, + "learning_rate": 4.922641993805092e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214285954833031, + "rewards/format_reward_func": 1.0, + "step": 14446 + }, + { + "completion_length": 243.48662090301514, + "epoch": 2.4224820822331194, + "grad_norm": 0.1864402544792117, + "kl": 0.108001708984375, + "learning_rate": 4.922612380949077e-07, + "loss": 0.0001, + "reward": 1.800000049173832, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 14448 + }, + { + "completion_length": 254.58483123779297, + "epoch": 2.422817385472987, + "grad_norm": 0.18771169862166331, + "kl": 0.095550537109375, + "learning_rate": 4.922582762515311e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 14450 + }, + { + "completion_length": 256.90626335144043, + "epoch": 2.4231526887128547, + "grad_norm": 0.152067826141475, + "kl": 0.112213134765625, + "learning_rate": 4.92255313850386e-07, + "loss": 0.0001, + "reward": 1.7500000447034836, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 14452 + }, + { + "completion_length": 244.7232255935669, + "epoch": 2.4234879919527224, + "grad_norm": 0.15337086305242245, + "kl": 0.1212158203125, + "learning_rate": 4.922523508914794e-07, + "loss": 0.0001, + "reward": 1.7321429401636124, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428693830967, + "rewards/format_reward_func": 1.0, + "step": 14454 + }, + { + "completion_length": 249.39733123779297, + "epoch": 2.42382329519259, + "grad_norm": 0.19277169712502537, + "kl": 0.10882568359375, + "learning_rate": 4.92249387374818e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7285714447498322, + "rewards/format_reward_func": 1.0, + "step": 14456 + }, + { + "completion_length": 239.508939743042, + "epoch": 2.424158598432457, + "grad_norm": 0.23544722794422707, + "kl": 0.1253662109375, + "learning_rate": 4.922464233004088e-07, + "loss": 0.0001, + "reward": 1.8000000789761543, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000081956387, + "rewards/format_reward_func": 1.0, + "step": 14458 + }, + { + "completion_length": 246.9285831451416, + "epoch": 2.424493901672325, + "grad_norm": 0.16456192966586017, + "kl": 0.104736328125, + "learning_rate": 4.922434586682584e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607142981141806, + "rewards/format_reward_func": 1.0, + "step": 14460 + }, + { + "completion_length": 252.95983219146729, + "epoch": 2.4248292049121924, + "grad_norm": 0.22480382634898416, + "kl": 0.097076416015625, + "learning_rate": 4.922404934783738e-07, + "loss": 0.0001, + "reward": 1.7321429252624512, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7321428898721933, + "rewards/format_reward_func": 1.0, + "step": 14462 + }, + { + "completion_length": 250.0044755935669, + "epoch": 2.42516450815206, + "grad_norm": 0.11925795390631981, + "kl": 0.1038055419921875, + "learning_rate": 4.922375277307618e-07, + "loss": 0.0001, + "reward": 1.7553572207689285, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214522004128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14464 + }, + { + "completion_length": 234.99554538726807, + "epoch": 2.4254998113919277, + "grad_norm": 0.15510926176807355, + "kl": 0.11212158203125, + "learning_rate": 4.922345614254292e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 14466 + }, + { + "completion_length": 239.2634048461914, + "epoch": 2.425835114631795, + "grad_norm": 0.26056520083745166, + "kl": 0.114044189453125, + "learning_rate": 4.922315945623827e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000335276127, + "rewards/format_reward_func": 1.0, + "step": 14468 + }, + { + "completion_length": 241.0669755935669, + "epoch": 2.4261704178716625, + "grad_norm": 0.1362923694488133, + "kl": 0.10235595703125, + "learning_rate": 4.922286271416294e-07, + "loss": 0.0001, + "reward": 1.7892857566475868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 14470 + }, + { + "completion_length": 245.40626335144043, + "epoch": 2.42650572111153, + "grad_norm": 0.1893996972953304, + "kl": 0.101776123046875, + "learning_rate": 4.92225659163176e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 14472 + }, + { + "completion_length": 239.88393878936768, + "epoch": 2.426841024351398, + "grad_norm": 0.2687254884525047, + "kl": 0.107421875, + "learning_rate": 4.922226906270292e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7750000357627869, + "rewards/format_reward_func": 1.0, + "step": 14474 + }, + { + "completion_length": 240.14733219146729, + "epoch": 2.4271763275912654, + "grad_norm": 0.192068117999007, + "kl": 0.109466552734375, + "learning_rate": 4.92219721533196e-07, + "loss": 0.0001, + "reward": 1.8178571835160255, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571686148643, + "rewards/format_reward_func": 1.0, + "step": 14476 + }, + { + "completion_length": 241.20983219146729, + "epoch": 2.427511630831133, + "grad_norm": 0.26444603268357253, + "kl": 0.10882568359375, + "learning_rate": 4.922167518816832e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 14478 + }, + { + "completion_length": 239.41072368621826, + "epoch": 2.4278469340710003, + "grad_norm": 0.09661934193422012, + "kl": 0.1143798828125, + "learning_rate": 4.922137816724976e-07, + "loss": 0.0001, + "reward": 1.7303572073578835, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7348214648663998, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14480 + }, + { + "completion_length": 249.3214406967163, + "epoch": 2.428182237310868, + "grad_norm": 0.11849886046639548, + "kl": 0.1033935546875, + "learning_rate": 4.922108109056461e-07, + "loss": 0.0001, + "reward": 1.769642896950245, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7741071619093418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14482 + }, + { + "completion_length": 241.2544765472412, + "epoch": 2.4285175405507355, + "grad_norm": 0.19818905557945873, + "kl": 0.1063232421875, + "learning_rate": 4.922078395811355e-07, + "loss": 0.0001, + "reward": 1.7928571924567223, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571738302708, + "rewards/format_reward_func": 1.0, + "step": 14484 + }, + { + "completion_length": 257.62501430511475, + "epoch": 2.428852843790603, + "grad_norm": 0.16333273125407996, + "kl": 0.12158203125, + "learning_rate": 4.922048676989728e-07, + "loss": 0.0001, + "reward": 1.691071517765522, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.6955357491970062, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14486 + }, + { + "completion_length": 244.50894260406494, + "epoch": 2.429188147030471, + "grad_norm": 0.06860305031557319, + "kl": 0.102630615234375, + "learning_rate": 4.922018952591645e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7535714656114578, + "rewards/format_reward_func": 1.0, + "step": 14488 + }, + { + "completion_length": 255.4285831451416, + "epoch": 2.429523450270338, + "grad_norm": 0.18951155606865122, + "kl": 0.11529541015625, + "learning_rate": 4.921989222617177e-07, + "loss": 0.0001, + "reward": 1.7714286372065544, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7714285999536514, + "rewards/format_reward_func": 1.0, + "step": 14490 + }, + { + "completion_length": 243.508939743042, + "epoch": 2.4298587535102056, + "grad_norm": 0.06729385120696377, + "kl": 0.10260009765625, + "learning_rate": 4.921959487066392e-07, + "loss": 0.0001, + "reward": 1.7535715103149414, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714600235224, + "rewards/format_reward_func": 1.0, + "step": 14492 + }, + { + "completion_length": 243.26340198516846, + "epoch": 2.4301940567500733, + "grad_norm": 0.21808525209582078, + "kl": 0.1015625, + "learning_rate": 4.921929745939357e-07, + "loss": 0.0001, + "reward": 1.8214286267757416, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8214286044239998, + "rewards/format_reward_func": 1.0, + "step": 14494 + }, + { + "completion_length": 242.9866189956665, + "epoch": 2.430529359989941, + "grad_norm": 0.10459009473345975, + "kl": 0.10888671875, + "learning_rate": 4.921899999236143e-07, + "loss": 0.0001, + "reward": 1.7535714954137802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 14496 + }, + { + "completion_length": 243.4062614440918, + "epoch": 2.4308646632298085, + "grad_norm": 0.20468565131128358, + "kl": 0.09014892578125, + "learning_rate": 4.921870246956817e-07, + "loss": 0.0001, + "reward": 1.7857143729925156, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 14498 + }, + { + "completion_length": 240.4196548461914, + "epoch": 2.431199966469676, + "grad_norm": 0.2839771802697649, + "kl": 0.102447509765625, + "learning_rate": 4.921840489101447e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571827709675, + "rewards/format_reward_func": 1.0, + "step": 14500 + }, + { + "completion_length": 243.91965293884277, + "epoch": 2.431535269709544, + "grad_norm": 0.13837876458820203, + "kl": 0.09967041015625, + "learning_rate": 4.921810725670104e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7964285984635353, + "rewards/format_reward_func": 1.0, + "step": 14502 + }, + { + "completion_length": 248.0759048461914, + "epoch": 2.431870572949411, + "grad_norm": 0.0625324599377449, + "kl": 0.10888671875, + "learning_rate": 4.921780956662853e-07, + "loss": 0.0001, + "reward": 1.7357143461704254, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143275439739, + "rewards/format_reward_func": 1.0, + "step": 14504 + }, + { + "completion_length": 240.58929824829102, + "epoch": 2.4322058761892786, + "grad_norm": 0.14144479185833814, + "kl": 0.116973876953125, + "learning_rate": 4.921751182079765e-07, + "loss": 0.0001, + "reward": 1.7750000655651093, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 14506 + }, + { + "completion_length": 238.99554920196533, + "epoch": 2.4325411794291463, + "grad_norm": 0.12906166659910961, + "kl": 0.115478515625, + "learning_rate": 4.921721401920907e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286051690578, + "rewards/format_reward_func": 1.0, + "step": 14508 + }, + { + "completion_length": 244.4062614440918, + "epoch": 2.432876482669014, + "grad_norm": 0.19870262101296146, + "kl": 0.1026611328125, + "learning_rate": 4.921691616186349e-07, + "loss": 0.0001, + "reward": 1.7892857491970062, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 14510 + }, + { + "completion_length": 244.85269165039062, + "epoch": 2.433211785908881, + "grad_norm": 0.34484337934409914, + "kl": 0.0943756103515625, + "learning_rate": 4.921661824876161e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143022119999, + "rewards/format_reward_func": 1.0, + "step": 14512 + }, + { + "completion_length": 238.5848331451416, + "epoch": 2.4335470891487487, + "grad_norm": 0.16552693092879947, + "kl": 0.10479736328125, + "learning_rate": 4.921632027990408e-07, + "loss": 0.0001, + "reward": 1.7625000551342964, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7669643238186836, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14514 + }, + { + "completion_length": 233.008939743042, + "epoch": 2.4338823923886164, + "grad_norm": 0.23178198865499708, + "kl": 0.109375, + "learning_rate": 4.921602225529159e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 14516 + }, + { + "completion_length": 241.25447940826416, + "epoch": 2.434217695628484, + "grad_norm": 0.23617625683615337, + "kl": 0.1003265380859375, + "learning_rate": 4.921572417492485e-07, + "loss": 0.0001, + "reward": 1.7500000968575478, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000260770321, + "rewards/format_reward_func": 1.0, + "step": 14518 + }, + { + "completion_length": 236.50447463989258, + "epoch": 2.4345529988683516, + "grad_norm": 0.09500009038811856, + "kl": 0.108428955078125, + "learning_rate": 4.921542603880453e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000223517418, + "rewards/format_reward_func": 1.0, + "step": 14520 + }, + { + "completion_length": 241.37947368621826, + "epoch": 2.4348883021082193, + "grad_norm": 0.10477360638602254, + "kl": 0.10235595703125, + "learning_rate": 4.921512784693132e-07, + "loss": 0.0001, + "reward": 1.8017857521772385, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500394880772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14522 + }, + { + "completion_length": 244.4375123977661, + "epoch": 2.435223605348087, + "grad_norm": 0.5641491334390114, + "kl": 0.11566162109375, + "learning_rate": 4.921482959930592e-07, + "loss": 0.0001, + "reward": 1.7464286461472511, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 14524 + }, + { + "completion_length": 239.9107265472412, + "epoch": 2.435558908587954, + "grad_norm": 0.18693819338494735, + "kl": 0.114654541015625, + "learning_rate": 4.9214531295929e-07, + "loss": 0.0001, + "reward": 1.741071492433548, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7455357313156128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14526 + }, + { + "completion_length": 233.2009048461914, + "epoch": 2.4358942118278217, + "grad_norm": 0.20661760679939017, + "kl": 0.104034423828125, + "learning_rate": 4.921423293680124e-07, + "loss": 0.0001, + "reward": 1.8250000551342964, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 14528 + }, + { + "completion_length": 230.52679634094238, + "epoch": 2.4362295150676894, + "grad_norm": 0.10369726032449242, + "kl": 0.112823486328125, + "learning_rate": 4.921393452192334e-07, + "loss": 0.0001, + "reward": 1.8035714700818062, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714626312256, + "rewards/format_reward_func": 1.0, + "step": 14530 + }, + { + "completion_length": 240.18751049041748, + "epoch": 2.436564818307557, + "grad_norm": 0.3213602942407995, + "kl": 0.10504150390625, + "learning_rate": 4.9213636051296e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571467846632, + "rewards/format_reward_func": 1.0, + "step": 14532 + }, + { + "completion_length": 231.95983219146729, + "epoch": 2.4369001215474246, + "grad_norm": 0.2630550350345542, + "kl": 0.102081298828125, + "learning_rate": 4.921333752491987e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428924798965, + "rewards/format_reward_func": 1.0, + "step": 14534 + }, + { + "completion_length": 230.3482265472412, + "epoch": 2.437235424787292, + "grad_norm": 0.13355432159632558, + "kl": 0.104248046875, + "learning_rate": 4.921303894279568e-07, + "loss": 0.0001, + "reward": 1.810714341700077, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8107143118977547, + "rewards/format_reward_func": 1.0, + "step": 14536 + }, + { + "completion_length": 242.2857265472412, + "epoch": 2.4375707280271595, + "grad_norm": 0.23392933237799146, + "kl": 0.1086273193359375, + "learning_rate": 4.921274030492408e-07, + "loss": 0.0001, + "reward": 1.7178572416305542, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7178571857511997, + "rewards/format_reward_func": 1.0, + "step": 14538 + }, + { + "completion_length": 233.24108505249023, + "epoch": 2.437906031267027, + "grad_norm": 0.23157423824280435, + "kl": 0.10504150390625, + "learning_rate": 4.921244161130578e-07, + "loss": 0.0001, + "reward": 1.7500000819563866, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7589286100119352, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14540 + }, + { + "completion_length": 237.29911518096924, + "epoch": 2.4382413345068947, + "grad_norm": 0.08473106919533094, + "kl": 0.11676025390625, + "learning_rate": 4.921214286194147e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571678698063, + "rewards/format_reward_func": 1.0, + "step": 14542 + }, + { + "completion_length": 228.8571548461914, + "epoch": 2.4385766377467624, + "grad_norm": 0.1442209266300916, + "kl": 0.10284423828125, + "learning_rate": 4.921184405683182e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857454717159, + "rewards/format_reward_func": 1.0, + "step": 14544 + }, + { + "completion_length": 227.5669755935669, + "epoch": 2.43891194098663, + "grad_norm": 0.0418952892430739, + "kl": 0.0934600830078125, + "learning_rate": 4.921154519597753e-07, + "loss": 0.0001, + "reward": 1.789285771548748, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.789285734295845, + "rewards/format_reward_func": 1.0, + "step": 14546 + }, + { + "completion_length": 241.20983219146729, + "epoch": 2.439247244226497, + "grad_norm": 0.15438437741472294, + "kl": 0.097076416015625, + "learning_rate": 4.921124627937929e-07, + "loss": 0.0001, + "reward": 1.7946429178118706, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071678698063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14548 + }, + { + "completion_length": 237.89733409881592, + "epoch": 2.439582547466365, + "grad_norm": 0.22889737678043695, + "kl": 0.110015869140625, + "learning_rate": 4.921094730703778e-07, + "loss": 0.0001, + "reward": 1.765178643167019, + "reward_std": 0.03914341004565358, + "rewards/equation_reward_func": 0.7669643200933933, + "rewards/format_reward_func": 0.9982142895460129, + "step": 14550 + }, + { + "completion_length": 236.3214406967163, + "epoch": 2.4399178507062325, + "grad_norm": 0.09313547110453636, + "kl": 0.1150360107421875, + "learning_rate": 4.921064827895369e-07, + "loss": 0.0001, + "reward": 1.8178571686148643, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8178571555763483, + "rewards/format_reward_func": 1.0, + "step": 14552 + }, + { + "completion_length": 237.19197750091553, + "epoch": 2.4402531539461, + "grad_norm": 0.10693348598193116, + "kl": 0.093170166015625, + "learning_rate": 4.921034919512772e-07, + "loss": 0.0001, + "reward": 1.7839286103844643, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7883928827941418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14554 + }, + { + "completion_length": 239.31697368621826, + "epoch": 2.4405884571859677, + "grad_norm": 0.17416553763203155, + "kl": 0.0992889404296875, + "learning_rate": 4.921005005556055e-07, + "loss": 0.0001, + "reward": 1.7750000357627869, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7750000394880772, + "rewards/format_reward_func": 1.0, + "step": 14556 + }, + { + "completion_length": 233.5312614440918, + "epoch": 2.440923760425835, + "grad_norm": 0.5400463217476548, + "kl": 0.1147308349609375, + "learning_rate": 4.920975086025286e-07, + "loss": 0.0001, + "reward": 1.8321429193019867, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8321428708732128, + "rewards/format_reward_func": 1.0, + "step": 14558 + }, + { + "completion_length": 224.7678680419922, + "epoch": 2.4412590636657026, + "grad_norm": 0.246319043845516, + "kl": 0.093414306640625, + "learning_rate": 4.920945160920535e-07, + "loss": 0.0001, + "reward": 1.783928632736206, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7875000312924385, + "rewards/format_reward_func": 0.9964285716414452, + "step": 14560 + }, + { + "completion_length": 239.2634048461914, + "epoch": 2.44159436690557, + "grad_norm": 0.1861008204380244, + "kl": 0.1091461181640625, + "learning_rate": 4.920915230241871e-07, + "loss": 0.0001, + "reward": 1.7535714730620384, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714618861675, + "rewards/format_reward_func": 1.0, + "step": 14562 + }, + { + "completion_length": 232.36161994934082, + "epoch": 2.441929670145438, + "grad_norm": 0.22969570435893646, + "kl": 0.108428955078125, + "learning_rate": 4.920885293989362e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 14564 + }, + { + "completion_length": 236.60268783569336, + "epoch": 2.4422649733853055, + "grad_norm": 0.17641325910517738, + "kl": 0.0992279052734375, + "learning_rate": 4.920855352163077e-07, + "loss": 0.0001, + "reward": 1.8000000417232513, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8000000230967999, + "rewards/format_reward_func": 1.0, + "step": 14566 + }, + { + "completion_length": 243.73661708831787, + "epoch": 2.442600276625173, + "grad_norm": 0.2636670701693931, + "kl": 0.107452392578125, + "learning_rate": 4.920825404763086e-07, + "loss": 0.0001, + "reward": 1.7696429342031479, + "reward_std": 0.07323605753481388, + "rewards/equation_reward_func": 0.7741071619093418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14568 + }, + { + "completion_length": 235.40625858306885, + "epoch": 2.4429355798650403, + "grad_norm": 0.20063605323852532, + "kl": 0.106597900390625, + "learning_rate": 4.920795451789458e-07, + "loss": 0.0001, + "reward": 1.7803572192788124, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7848214507102966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14570 + }, + { + "completion_length": 237.8928689956665, + "epoch": 2.443270883104908, + "grad_norm": 0.2844660939043713, + "kl": 0.105743408203125, + "learning_rate": 4.920765493242261e-07, + "loss": 0.0001, + "reward": 1.7839286401867867, + "reward_std": 0.06313453335314989, + "rewards/equation_reward_func": 0.7883928790688515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14572 + }, + { + "completion_length": 237.571439743042, + "epoch": 2.4436061863447756, + "grad_norm": 0.14908936651878493, + "kl": 0.09649658203125, + "learning_rate": 4.920735529121563e-07, + "loss": 0.0001, + "reward": 1.7910714596509933, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.8044643010944128, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14574 + }, + { + "completion_length": 240.28125953674316, + "epoch": 2.443941489584643, + "grad_norm": 0.1813296833370229, + "kl": 0.111358642578125, + "learning_rate": 4.920705559427436e-07, + "loss": 0.0001, + "reward": 1.7339286282658577, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.738392885774374, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14576 + }, + { + "completion_length": 245.95983219146729, + "epoch": 2.444276792824511, + "grad_norm": 0.08269867957004332, + "kl": 0.124542236328125, + "learning_rate": 4.920675584159947e-07, + "loss": 0.0001, + "reward": 1.7500000521540642, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 14578 + }, + { + "completion_length": 247.1294765472412, + "epoch": 2.444612096064378, + "grad_norm": 0.2082926494731955, + "kl": 0.1058349609375, + "learning_rate": 4.920645603319164e-07, + "loss": 0.0001, + "reward": 1.7428571954369545, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571581840515, + "rewards/format_reward_func": 1.0, + "step": 14580 + }, + { + "completion_length": 238.02233409881592, + "epoch": 2.4449473993042456, + "grad_norm": 0.25334244067411815, + "kl": 0.1026611328125, + "learning_rate": 4.920615616905158e-07, + "loss": 0.0001, + "reward": 1.8089286237955093, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.8133928701281548, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14582 + }, + { + "completion_length": 244.36608219146729, + "epoch": 2.4452827025441133, + "grad_norm": 0.22762717588504205, + "kl": 0.1002197265625, + "learning_rate": 4.920585624917998e-07, + "loss": 0.0001, + "reward": 1.8178571984171867, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.8267857246100903, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14584 + }, + { + "completion_length": 238.6741189956665, + "epoch": 2.445618005783981, + "grad_norm": 0.1654778832690156, + "kl": 0.0996856689453125, + "learning_rate": 4.920555627357752e-07, + "loss": 0.0001, + "reward": 1.7678571864962578, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571734577417, + "rewards/format_reward_func": 1.0, + "step": 14586 + }, + { + "completion_length": 243.37501049041748, + "epoch": 2.4459533090238486, + "grad_norm": 0.2654349418206511, + "kl": 0.099334716796875, + "learning_rate": 4.920525624224489e-07, + "loss": 0.0001, + "reward": 1.76607146859169, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7705357596278191, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14588 + }, + { + "completion_length": 250.31697463989258, + "epoch": 2.446288612263716, + "grad_norm": 0.20491560862196873, + "kl": 0.113525390625, + "learning_rate": 4.920495615518279e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 14590 + }, + { + "completion_length": 256.28126335144043, + "epoch": 2.4466239155035834, + "grad_norm": 0.0950820708130051, + "kl": 0.118316650390625, + "learning_rate": 4.92046560123919e-07, + "loss": 0.0001, + "reward": 1.8107143118977547, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8107143081724644, + "rewards/format_reward_func": 1.0, + "step": 14592 + }, + { + "completion_length": 253.70536613464355, + "epoch": 2.446959218743451, + "grad_norm": 0.15718448474235885, + "kl": 0.112945556640625, + "learning_rate": 4.920435581387293e-07, + "loss": 0.0001, + "reward": 1.805357187986374, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.809821467846632, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14594 + }, + { + "completion_length": 259.1651906967163, + "epoch": 2.4472945219833186, + "grad_norm": 0.09285379066929722, + "kl": 0.122222900390625, + "learning_rate": 4.920405555962654e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 14596 + }, + { + "completion_length": 256.9509057998657, + "epoch": 2.4476298252231863, + "grad_norm": 0.10765725585666376, + "kl": 0.12353515625, + "learning_rate": 4.920375524965346e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7526785954833031, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14598 + }, + { + "completion_length": 254.65179634094238, + "epoch": 2.447965128463054, + "grad_norm": 0.09697412589677633, + "kl": 0.116363525390625, + "learning_rate": 4.920345488395436e-07, + "loss": 0.0001, + "reward": 1.744642935693264, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7491071745753288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14600 + }, + { + "completion_length": 258.3660831451416, + "epoch": 2.448300431702921, + "grad_norm": 0.2110317592273284, + "kl": 0.10479736328125, + "learning_rate": 4.920315446252992e-07, + "loss": 0.0001, + "reward": 1.7214286550879478, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7392857484519482, + "rewards/format_reward_func": 0.9821428656578064, + "step": 14602 + }, + { + "completion_length": 259.3482246398926, + "epoch": 2.4486357349427887, + "grad_norm": 0.16048095784784516, + "kl": 0.11065673828125, + "learning_rate": 4.920285398538085e-07, + "loss": 0.0001, + "reward": 1.6964286491274834, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.6964286062866449, + "rewards/format_reward_func": 1.0, + "step": 14604 + }, + { + "completion_length": 265.58929347991943, + "epoch": 2.4489710381826564, + "grad_norm": 0.3574648813796875, + "kl": 0.12420654296875, + "learning_rate": 4.920255345250784e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.08081220183521509, + "rewards/equation_reward_func": 0.7589286044239998, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14606 + }, + { + "completion_length": 260.214298248291, + "epoch": 2.449306341422524, + "grad_norm": 0.22111108570420177, + "kl": 0.118621826171875, + "learning_rate": 4.920225286391157e-07, + "loss": 0.0001, + "reward": 1.740178644657135, + "reward_std": 0.06439722329378128, + "rewards/equation_reward_func": 0.7419643141329288, + "rewards/format_reward_func": 0.9982142895460129, + "step": 14608 + }, + { + "completion_length": 254.24108123779297, + "epoch": 2.4496416446623916, + "grad_norm": 0.21975558457475472, + "kl": 0.10028076171875, + "learning_rate": 4.920195221959275e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8000000193715096, + "rewards/format_reward_func": 1.0, + "step": 14610 + }, + { + "completion_length": 258.1562623977661, + "epoch": 2.4499769479022593, + "grad_norm": 0.3037889689093855, + "kl": 0.12811279296875, + "learning_rate": 4.920165151955205e-07, + "loss": 0.0001, + "reward": 1.7446429431438446, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7491071708500385, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14612 + }, + { + "completion_length": 251.09376430511475, + "epoch": 2.4503122511421265, + "grad_norm": 0.11148829311822392, + "kl": 0.096405029296875, + "learning_rate": 4.920135076379019e-07, + "loss": 0.0001, + "reward": 1.7446429058909416, + "reward_std": 0.047982245683670044, + "rewards/equation_reward_func": 0.7491071801632643, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14614 + }, + { + "completion_length": 259.15626525878906, + "epoch": 2.450647554381994, + "grad_norm": 0.2130659164666184, + "kl": 0.120697021484375, + "learning_rate": 4.920104995230784e-07, + "loss": 0.0001, + "reward": 1.7910714969038963, + "reward_std": 0.04293148312717676, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14616 + }, + { + "completion_length": 253.92412185668945, + "epoch": 2.4509828576218617, + "grad_norm": 0.1594905819651956, + "kl": 0.123779296875, + "learning_rate": 4.92007490851057e-07, + "loss": 0.0001, + "reward": 1.725000075995922, + "reward_std": 0.06565991509705782, + "rewards/equation_reward_func": 0.733928594738245, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14618 + }, + { + "completion_length": 258.1116180419922, + "epoch": 2.4513181608617294, + "grad_norm": 0.28001387833506153, + "kl": 0.139801025390625, + "learning_rate": 4.920044816218446e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.03535533882677555, + "rewards/equation_reward_func": 0.7910714484751225, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14620 + }, + { + "completion_length": 256.36608028411865, + "epoch": 2.451653464101597, + "grad_norm": 0.15087349096042746, + "kl": 0.132415771484375, + "learning_rate": 4.920014718354481e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14622 + }, + { + "completion_length": 251.73661994934082, + "epoch": 2.451988767341464, + "grad_norm": 0.0777120217393265, + "kl": 0.12347412109375, + "learning_rate": 4.919984614918746e-07, + "loss": 0.0001, + "reward": 1.7642857879400253, + "reward_std": 0.05050762742757797, + "rewards/equation_reward_func": 0.7821428775787354, + "rewards/format_reward_func": 0.9821428656578064, + "step": 14624 + }, + { + "completion_length": 250.46876335144043, + "epoch": 2.452324070581332, + "grad_norm": 0.061192203297932936, + "kl": 0.110107421875, + "learning_rate": 4.919954505911309e-07, + "loss": 0.0001, + "reward": 1.8107143342494965, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8107143193483353, + "rewards/format_reward_func": 1.0, + "step": 14626 + }, + { + "completion_length": 264.4732255935669, + "epoch": 2.4526593738211995, + "grad_norm": 0.25103566878335815, + "kl": 0.142913818359375, + "learning_rate": 4.919924391332239e-07, + "loss": 0.0001, + "reward": 1.730357214808464, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7348214406520128, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14628 + }, + { + "completion_length": 249.88393878936768, + "epoch": 2.452994677061067, + "grad_norm": 0.12985395819053508, + "kl": 0.1202392578125, + "learning_rate": 4.919894271181606e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.045456863939762115, + "rewards/equation_reward_func": 0.798214316368103, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14630 + }, + { + "completion_length": 257.48215675354004, + "epoch": 2.4533299803009347, + "grad_norm": 0.20780368372354519, + "kl": 0.151947021484375, + "learning_rate": 4.91986414545948e-07, + "loss": 0.0002, + "reward": 1.7464286386966705, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7464285977184772, + "rewards/format_reward_func": 1.0, + "step": 14632 + }, + { + "completion_length": 256.6294765472412, + "epoch": 2.4536652835408024, + "grad_norm": 0.20687599287668856, + "kl": 0.11669921875, + "learning_rate": 4.919834014165929e-07, + "loss": 0.0001, + "reward": 1.807142935693264, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.8160714581608772, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14634 + }, + { + "completion_length": 252.1294755935669, + "epoch": 2.45400058678067, + "grad_norm": 0.1576095082803586, + "kl": 0.1048583984375, + "learning_rate": 4.919803877301022e-07, + "loss": 0.0001, + "reward": 1.8017857819795609, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14636 + }, + { + "completion_length": 254.8571538925171, + "epoch": 2.454335890020537, + "grad_norm": 0.10649602704630118, + "kl": 0.137786865234375, + "learning_rate": 4.91977373486483e-07, + "loss": 0.0001, + "reward": 1.7500000596046448, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 14638 + }, + { + "completion_length": 251.45090579986572, + "epoch": 2.454671193260405, + "grad_norm": 0.17848525693874534, + "kl": 0.128631591796875, + "learning_rate": 4.919743586857422e-07, + "loss": 0.0001, + "reward": 1.742857202887535, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7428571749478579, + "rewards/format_reward_func": 1.0, + "step": 14640 + }, + { + "completion_length": 256.9285840988159, + "epoch": 2.4550064965002725, + "grad_norm": 0.1561715109998979, + "kl": 0.1240692138671875, + "learning_rate": 4.919713433278866e-07, + "loss": 0.0001, + "reward": 1.7553571835160255, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7598214596509933, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14642 + }, + { + "completion_length": 250.8928680419922, + "epoch": 2.45534179974014, + "grad_norm": 0.09381988812620205, + "kl": 0.139129638671875, + "learning_rate": 4.919683274129234e-07, + "loss": 0.0001, + "reward": 1.8214286342263222, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285857975483, + "rewards/format_reward_func": 1.0, + "step": 14644 + }, + { + "completion_length": 254.1785831451416, + "epoch": 2.4556771029800073, + "grad_norm": 0.1776446950481363, + "kl": 0.1341552734375, + "learning_rate": 4.919653109408593e-07, + "loss": 0.0001, + "reward": 1.776785746216774, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500260770321, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14646 + }, + { + "completion_length": 251.08483600616455, + "epoch": 2.456012406219875, + "grad_norm": 0.16014605591091147, + "kl": 0.126861572265625, + "learning_rate": 4.919622939117013e-07, + "loss": 0.0001, + "reward": 1.7758929058909416, + "reward_std": 0.06439722329378128, + "rewards/equation_reward_func": 0.7776786088943481, + "rewards/format_reward_func": 0.9982142895460129, + "step": 14648 + }, + { + "completion_length": 254.19197750091553, + "epoch": 2.4563477094597426, + "grad_norm": 0.0025472339599904464, + "kl": 0.126617431640625, + "learning_rate": 4.919592763254564e-07, + "loss": 0.0001, + "reward": 1.791071467101574, + "reward_std": 0.022728432901203632, + "rewards/equation_reward_func": 0.7955357506871223, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14650 + }, + { + "completion_length": 250.99108409881592, + "epoch": 2.45668301269961, + "grad_norm": 0.294690839121686, + "kl": 0.19085693359375, + "learning_rate": 4.919562581821316e-07, + "loss": 0.0002, + "reward": 1.7410714626312256, + "reward_std": 0.09343910962343216, + "rewards/equation_reward_func": 0.7544643059372902, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14652 + }, + { + "completion_length": 241.83929634094238, + "epoch": 2.457018315939478, + "grad_norm": 0.2685941546167499, + "kl": 0.141021728515625, + "learning_rate": 4.919532394817338e-07, + "loss": 0.0001, + "reward": 1.8000000640749931, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 14654 + }, + { + "completion_length": 249.29911994934082, + "epoch": 2.4573536191793455, + "grad_norm": 0.12056836743824613, + "kl": 0.155242919921875, + "learning_rate": 4.919502202242699e-07, + "loss": 0.0002, + "reward": 1.7767857685685158, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500316649675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14656 + }, + { + "completion_length": 237.00893878936768, + "epoch": 2.457688922419213, + "grad_norm": 0.1632951800033488, + "kl": 0.137969970703125, + "learning_rate": 4.919472004097468e-07, + "loss": 0.0001, + "reward": 1.7517857551574707, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7562500350177288, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14658 + }, + { + "completion_length": 242.84375953674316, + "epoch": 2.4580242256590803, + "grad_norm": 0.38433586783614954, + "kl": 0.147674560546875, + "learning_rate": 4.919441800381717e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.06565991323441267, + "rewards/equation_reward_func": 0.7607143148779869, + "rewards/format_reward_func": 1.0, + "step": 14660 + }, + { + "completion_length": 233.5178689956665, + "epoch": 2.458359528898948, + "grad_norm": 0.15426940267481715, + "kl": 0.13128662109375, + "learning_rate": 4.919411591095512e-07, + "loss": 0.0001, + "reward": 1.7357143387198448, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 14662 + }, + { + "completion_length": 238.9330472946167, + "epoch": 2.4586948321388156, + "grad_norm": 0.2574320114609894, + "kl": 0.115234375, + "learning_rate": 4.919381376238927e-07, + "loss": 0.0001, + "reward": 1.8214286267757416, + "reward_std": 0.07071067579090595, + "rewards/equation_reward_func": 0.821428582072258, + "rewards/format_reward_func": 1.0, + "step": 14664 + }, + { + "completion_length": 235.0625123977661, + "epoch": 2.459030135378683, + "grad_norm": 0.2578124993607625, + "kl": 0.14154052734375, + "learning_rate": 4.919351155812027e-07, + "loss": 0.0001, + "reward": 1.8017857521772385, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14666 + }, + { + "completion_length": 229.8616180419922, + "epoch": 2.459365438618551, + "grad_norm": 0.18994664944682665, + "kl": 0.11993408203125, + "learning_rate": 4.919320929814884e-07, + "loss": 0.0001, + "reward": 1.7571429163217545, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428883820772, + "rewards/format_reward_func": 1.0, + "step": 14668 + }, + { + "completion_length": 226.6071538925171, + "epoch": 2.459700741858418, + "grad_norm": 0.08334989454251703, + "kl": 0.107177734375, + "learning_rate": 4.919290698247568e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.825000025331974, + "rewards/format_reward_func": 1.0, + "step": 14670 + }, + { + "completion_length": 236.5759038925171, + "epoch": 2.4600360450982857, + "grad_norm": 0.24064977610399677, + "kl": 0.124664306640625, + "learning_rate": 4.919260461110146e-07, + "loss": 0.0001, + "reward": 1.7428572177886963, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.742857176810503, + "rewards/format_reward_func": 1.0, + "step": 14672 + }, + { + "completion_length": 228.68304538726807, + "epoch": 2.4603713483381533, + "grad_norm": 0.3965086922599267, + "kl": 0.116363525390625, + "learning_rate": 4.919230218402692e-07, + "loss": 0.0001, + "reward": 1.750000074505806, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7500000298023224, + "rewards/format_reward_func": 1.0, + "step": 14674 + }, + { + "completion_length": 227.08483219146729, + "epoch": 2.460706651578021, + "grad_norm": 0.07705829929685584, + "kl": 0.12017822265625, + "learning_rate": 4.919199970125271e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714365541935, + "rewards/format_reward_func": 1.0, + "step": 14676 + }, + { + "completion_length": 229.67411518096924, + "epoch": 2.4610419548178886, + "grad_norm": 0.13454988026363998, + "kl": 0.10888671875, + "learning_rate": 4.919169716277956e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7321428898721933, + "rewards/format_reward_func": 1.0, + "step": 14678 + }, + { + "completion_length": 236.2053680419922, + "epoch": 2.461377258057756, + "grad_norm": 0.251892997636684, + "kl": 0.114898681640625, + "learning_rate": 4.919139456860814e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 14680 + }, + { + "completion_length": 227.08483028411865, + "epoch": 2.4617125612976234, + "grad_norm": 0.3806151196214663, + "kl": 0.105987548828125, + "learning_rate": 4.919109191873917e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 14682 + }, + { + "completion_length": 240.7991189956665, + "epoch": 2.462047864537491, + "grad_norm": 0.15874235465177416, + "kl": 0.12335205078125, + "learning_rate": 4.919078921317334e-07, + "loss": 0.0001, + "reward": 1.7196429371833801, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.7241071946918964, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14684 + }, + { + "completion_length": 233.2812614440918, + "epoch": 2.4623831677773587, + "grad_norm": 0.24433920234307233, + "kl": 0.116546630859375, + "learning_rate": 4.919048645191133e-07, + "loss": 0.0001, + "reward": 1.8196429163217545, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.8241071589291096, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14686 + }, + { + "completion_length": 233.66518878936768, + "epoch": 2.4627184710172263, + "grad_norm": 0.1526931970382, + "kl": 0.1051025390625, + "learning_rate": 4.919018363495386e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 14688 + }, + { + "completion_length": 238.74108219146729, + "epoch": 2.463053774257094, + "grad_norm": 0.10079218212524813, + "kl": 0.10546875, + "learning_rate": 4.918988076230161e-07, + "loss": 0.0001, + "reward": 1.7714286297559738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7714286036789417, + "rewards/format_reward_func": 1.0, + "step": 14690 + }, + { + "completion_length": 239.29911708831787, + "epoch": 2.463389077496961, + "grad_norm": 0.14274660573087072, + "kl": 0.113311767578125, + "learning_rate": 4.91895778339553e-07, + "loss": 0.0001, + "reward": 1.796428605914116, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 14692 + }, + { + "completion_length": 246.5580472946167, + "epoch": 2.4637243807368288, + "grad_norm": 0.20910002876872932, + "kl": 0.101806640625, + "learning_rate": 4.91892748499156e-07, + "loss": 0.0001, + "reward": 1.7517857626080513, + "reward_std": 0.05808377079665661, + "rewards/equation_reward_func": 0.7562500163912773, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14694 + }, + { + "completion_length": 240.37054538726807, + "epoch": 2.4640596839766964, + "grad_norm": 0.3170325404349844, + "kl": 0.119476318359375, + "learning_rate": 4.918897181018323e-07, + "loss": 0.0001, + "reward": 1.762500062584877, + "reward_std": 0.053033008240163326, + "rewards/equation_reward_func": 0.7669643126428127, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14696 + }, + { + "completion_length": 237.92411518096924, + "epoch": 2.464394987216564, + "grad_norm": 0.17860278813200464, + "kl": 0.110260009765625, + "learning_rate": 4.918866871475887e-07, + "loss": 0.0001, + "reward": 1.791071467101574, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.795535746961832, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14698 + }, + { + "completion_length": 240.602689743042, + "epoch": 2.4647302904564317, + "grad_norm": 0.19296638822722686, + "kl": 0.103668212890625, + "learning_rate": 4.918836556364324e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 14700 + }, + { + "completion_length": 244.63393783569336, + "epoch": 2.4650655936962993, + "grad_norm": 0.1819558090672466, + "kl": 0.107757568359375, + "learning_rate": 4.918806235683701e-07, + "loss": 0.0001, + "reward": 1.8017857745289803, + "reward_std": 0.04798224475234747, + "rewards/equation_reward_func": 0.8062500208616257, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14702 + }, + { + "completion_length": 239.3884048461914, + "epoch": 2.4654008969361665, + "grad_norm": 0.16922460661830055, + "kl": 0.110870361328125, + "learning_rate": 4.91877590943409e-07, + "loss": 0.0001, + "reward": 1.8500000685453415, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.850000012665987, + "rewards/format_reward_func": 1.0, + "step": 14704 + }, + { + "completion_length": 243.34822177886963, + "epoch": 2.465736200176034, + "grad_norm": 0.174083358236681, + "kl": 0.11090087890625, + "learning_rate": 4.918745577615559e-07, + "loss": 0.0001, + "reward": 1.758928619325161, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7633928712457418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14706 + }, + { + "completion_length": 236.4866180419922, + "epoch": 2.4660715034159018, + "grad_norm": 0.18333538920013637, + "kl": 0.104248046875, + "learning_rate": 4.918715240228181e-07, + "loss": 0.0001, + "reward": 1.7321429327130318, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7321428954601288, + "rewards/format_reward_func": 1.0, + "step": 14708 + }, + { + "completion_length": 249.57590198516846, + "epoch": 2.4664068066557694, + "grad_norm": 0.050767253993794496, + "kl": 0.119903564453125, + "learning_rate": 4.918684897272022e-07, + "loss": 0.0001, + "reward": 1.7482143491506577, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.7526785843074322, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14710 + }, + { + "completion_length": 240.40179634094238, + "epoch": 2.466742109895637, + "grad_norm": 0.20012551468951106, + "kl": 0.118896484375, + "learning_rate": 4.918654548747154e-07, + "loss": 0.0001, + "reward": 1.7625000774860382, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.766964316368103, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14712 + }, + { + "completion_length": 244.321439743042, + "epoch": 2.467077413135504, + "grad_norm": 0.18902951420859376, + "kl": 0.1083984375, + "learning_rate": 4.918624194653646e-07, + "loss": 0.0001, + "reward": 1.771428644657135, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.771428607404232, + "rewards/format_reward_func": 1.0, + "step": 14714 + }, + { + "completion_length": 238.95983028411865, + "epoch": 2.467412716375372, + "grad_norm": 0.16578032130190862, + "kl": 0.136688232421875, + "learning_rate": 4.918593834991569e-07, + "loss": 0.0001, + "reward": 1.821428619325161, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.8214285895228386, + "rewards/format_reward_func": 1.0, + "step": 14716 + }, + { + "completion_length": 240.70983123779297, + "epoch": 2.4677480196152395, + "grad_norm": 0.2250877718203696, + "kl": 0.108428955078125, + "learning_rate": 4.918563469760992e-07, + "loss": 0.0001, + "reward": 1.8000000715255737, + "reward_std": 0.06060915254056454, + "rewards/equation_reward_func": 0.8089285865426064, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14718 + }, + { + "completion_length": 248.5803689956665, + "epoch": 2.468083322855107, + "grad_norm": 0.13311610670420726, + "kl": 0.112213134765625, + "learning_rate": 4.918533098961985e-07, + "loss": 0.0001, + "reward": 1.8142857551574707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857439815998, + "rewards/format_reward_func": 1.0, + "step": 14720 + }, + { + "completion_length": 242.0312614440918, + "epoch": 2.4684186260949748, + "grad_norm": 0.17924485307370552, + "kl": 0.13079833984375, + "learning_rate": 4.918502722594619e-07, + "loss": 0.0001, + "reward": 1.7642857730388641, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7642857600003481, + "rewards/format_reward_func": 1.0, + "step": 14722 + }, + { + "completion_length": 249.31697940826416, + "epoch": 2.4687539293348424, + "grad_norm": 0.2366084108225645, + "kl": 0.127410888671875, + "learning_rate": 4.918472340658961e-07, + "loss": 0.0001, + "reward": 1.807142935693264, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 14724 + }, + { + "completion_length": 248.94644451141357, + "epoch": 2.4690892325747096, + "grad_norm": 0.1999633689515607, + "kl": 0.123321533203125, + "learning_rate": 4.918441953155085e-07, + "loss": 0.0001, + "reward": 1.7500000670552254, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7500000409781933, + "rewards/format_reward_func": 1.0, + "step": 14726 + }, + { + "completion_length": 251.47768878936768, + "epoch": 2.469424535814577, + "grad_norm": 0.1271370337954408, + "kl": 0.123931884765625, + "learning_rate": 4.918411560083058e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.796428594738245, + "rewards/format_reward_func": 1.0, + "step": 14728 + }, + { + "completion_length": 249.41965293884277, + "epoch": 2.469759839054445, + "grad_norm": 0.11077971547908103, + "kl": 0.110931396484375, + "learning_rate": 4.918381161442951e-07, + "loss": 0.0001, + "reward": 1.7732143178582191, + "reward_std": 0.017677669413387775, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14730 + }, + { + "completion_length": 246.1651906967163, + "epoch": 2.4700951422943125, + "grad_norm": 0.21287808362756122, + "kl": 0.1270751953125, + "learning_rate": 4.918350757234834e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7821428701281548, + "rewards/format_reward_func": 1.0, + "step": 14732 + }, + { + "completion_length": 250.77233409881592, + "epoch": 2.47043044553418, + "grad_norm": 0.1836312903043879, + "kl": 0.114288330078125, + "learning_rate": 4.918320347458777e-07, + "loss": 0.0001, + "reward": 1.7839286476373672, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7883928790688515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14734 + }, + { + "completion_length": 248.48661708831787, + "epoch": 2.4707657487740473, + "grad_norm": 0.13193506465499494, + "kl": 0.11639404296875, + "learning_rate": 4.91828993211485e-07, + "loss": 0.0001, + "reward": 1.8285714760422707, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8285714536905289, + "rewards/format_reward_func": 1.0, + "step": 14736 + }, + { + "completion_length": 257.45536613464355, + "epoch": 2.471101052013915, + "grad_norm": 0.14764558623734747, + "kl": 0.123046875, + "learning_rate": 4.918259511203122e-07, + "loss": 0.0001, + "reward": 1.8017857670783997, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.8062500152736902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14738 + }, + { + "completion_length": 239.48215293884277, + "epoch": 2.4714363552537826, + "grad_norm": 0.22586394380238098, + "kl": 0.103912353515625, + "learning_rate": 4.918229084723665e-07, + "loss": 0.0001, + "reward": 1.82589291036129, + "reward_std": 0.0340926474891603, + "rewards/equation_reward_func": 0.8276785910129547, + "rewards/format_reward_func": 0.9982142895460129, + "step": 14740 + }, + { + "completion_length": 254.2321548461914, + "epoch": 2.47177165849365, + "grad_norm": 0.25298517588656405, + "kl": 0.115631103515625, + "learning_rate": 4.918198652676547e-07, + "loss": 0.0001, + "reward": 1.8125000447034836, + "reward_std": 0.06313453242182732, + "rewards/equation_reward_func": 0.8169643059372902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14742 + }, + { + "completion_length": 259.14287090301514, + "epoch": 2.472106961733518, + "grad_norm": 0.16073060918881626, + "kl": 0.110382080078125, + "learning_rate": 4.918168215061841e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8053571619093418, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14744 + }, + { + "completion_length": 260.5535821914673, + "epoch": 2.4724422649733855, + "grad_norm": 0.3182146734872481, + "kl": 0.12188720703125, + "learning_rate": 4.918137771879614e-07, + "loss": 0.0001, + "reward": 1.76071435213089, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143316417933, + "rewards/format_reward_func": 1.0, + "step": 14746 + }, + { + "completion_length": 251.88840579986572, + "epoch": 2.4727775682132527, + "grad_norm": 0.3186785826320642, + "kl": 0.12640380859375, + "learning_rate": 4.918107323129937e-07, + "loss": 0.0001, + "reward": 1.7964286282658577, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 14748 + }, + { + "completion_length": 255.2187614440918, + "epoch": 2.4731128714531203, + "grad_norm": 0.2865873297261211, + "kl": 0.111236572265625, + "learning_rate": 4.91807686881288e-07, + "loss": 0.0001, + "reward": 1.7892857789993286, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 14750 + }, + { + "completion_length": 248.99554920196533, + "epoch": 2.473448174692988, + "grad_norm": 0.1606260793115146, + "kl": 0.104583740234375, + "learning_rate": 4.918046408928515e-07, + "loss": 0.0001, + "reward": 1.8178572207689285, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571611642838, + "rewards/format_reward_func": 1.0, + "step": 14752 + }, + { + "completion_length": 252.3928737640381, + "epoch": 2.4737834779328556, + "grad_norm": 0.3196020255422196, + "kl": 0.107391357421875, + "learning_rate": 4.91801594347691e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 14754 + }, + { + "completion_length": 254.96429824829102, + "epoch": 2.474118781172723, + "grad_norm": 0.13566741367227556, + "kl": 0.116729736328125, + "learning_rate": 4.917985472458135e-07, + "loss": 0.0001, + "reward": 1.7517857775092125, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7562500275671482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14756 + }, + { + "completion_length": 254.2500114440918, + "epoch": 2.4744540844125904, + "grad_norm": 0.20523246500830702, + "kl": 0.113677978515625, + "learning_rate": 4.917954995872262e-07, + "loss": 0.0001, + "reward": 1.773214340209961, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7767857313156128, + "rewards/format_reward_func": 0.9964285716414452, + "step": 14758 + }, + { + "completion_length": 255.88393878936768, + "epoch": 2.474789387652458, + "grad_norm": 0.17088202418743903, + "kl": 0.1111907958984375, + "learning_rate": 4.917924513719359e-07, + "loss": 0.0001, + "reward": 1.7500000894069672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.750000037252903, + "rewards/format_reward_func": 1.0, + "step": 14760 + }, + { + "completion_length": 248.63840770721436, + "epoch": 2.4751246908923257, + "grad_norm": 0.060042085703267425, + "kl": 0.103485107421875, + "learning_rate": 4.917894025999498e-07, + "loss": 0.0001, + "reward": 1.7857143580913544, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7857143059372902, + "rewards/format_reward_func": 1.0, + "step": 14762 + }, + { + "completion_length": 258.9285821914673, + "epoch": 2.4754599941321933, + "grad_norm": 0.10379061448307643, + "kl": 0.118896484375, + "learning_rate": 4.917863532712748e-07, + "loss": 0.0001, + "reward": 1.7750000730156898, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7839286029338837, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14764 + }, + { + "completion_length": 245.46876335144043, + "epoch": 2.475795297372061, + "grad_norm": 0.08166250755892202, + "kl": 0.11309814453125, + "learning_rate": 4.91783303385918e-07, + "loss": 0.0001, + "reward": 1.769642911851406, + "reward_std": 0.022728431969881058, + "rewards/equation_reward_func": 0.774107176810503, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14766 + }, + { + "completion_length": 242.16965579986572, + "epoch": 2.4761306006119286, + "grad_norm": 0.11966890403423652, + "kl": 0.120269775390625, + "learning_rate": 4.917802529438863e-07, + "loss": 0.0001, + "reward": 1.7857143506407738, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7857143096625805, + "rewards/format_reward_func": 1.0, + "step": 14768 + }, + { + "completion_length": 244.65179538726807, + "epoch": 2.476465903851796, + "grad_norm": 0.06438476377658822, + "kl": 0.12042236328125, + "learning_rate": 4.91777201945187e-07, + "loss": 0.0001, + "reward": 1.7535714730620384, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7535714507102966, + "rewards/format_reward_func": 1.0, + "step": 14770 + }, + { + "completion_length": 248.00000953674316, + "epoch": 2.4768012070916634, + "grad_norm": 0.25166210607398165, + "kl": 0.1202392578125, + "learning_rate": 4.917741503898268e-07, + "loss": 0.0001, + "reward": 1.8035714849829674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8035714495927095, + "rewards/format_reward_func": 1.0, + "step": 14772 + }, + { + "completion_length": 243.7455472946167, + "epoch": 2.477136510331531, + "grad_norm": 0.12492238999535173, + "kl": 0.111572265625, + "learning_rate": 4.917710982778129e-07, + "loss": 0.0001, + "reward": 1.7107143700122833, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7107143048197031, + "rewards/format_reward_func": 1.0, + "step": 14774 + }, + { + "completion_length": 256.3303689956665, + "epoch": 2.4774718135713987, + "grad_norm": 0.23558559483290095, + "kl": 0.10693359375, + "learning_rate": 4.917680456091523e-07, + "loss": 0.0001, + "reward": 1.798214353621006, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8026785925030708, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14776 + }, + { + "completion_length": 251.633939743042, + "epoch": 2.4778071168112663, + "grad_norm": 0.18466931698986533, + "kl": 0.118194580078125, + "learning_rate": 4.917649923838521e-07, + "loss": 0.0001, + "reward": 1.7607143595814705, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 14778 + }, + { + "completion_length": 254.3214406967163, + "epoch": 2.4781424200511335, + "grad_norm": 0.18040244234467673, + "kl": 0.122772216796875, + "learning_rate": 4.917619386019191e-07, + "loss": 0.0001, + "reward": 1.7482143640518188, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7526785880327225, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14780 + }, + { + "completion_length": 246.53572463989258, + "epoch": 2.478477723291001, + "grad_norm": 0.0026001646841756804, + "kl": 0.106536865234375, + "learning_rate": 4.917588842633605e-07, + "loss": 0.0001, + "reward": 1.8035714775323868, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8035714402794838, + "rewards/format_reward_func": 1.0, + "step": 14782 + }, + { + "completion_length": 257.0714406967163, + "epoch": 2.4788130265308688, + "grad_norm": 0.2176137843266907, + "kl": 0.1085205078125, + "learning_rate": 4.917558293681834e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.778571454808116, + "rewards/format_reward_func": 1.0, + "step": 14784 + }, + { + "completion_length": 252.01340293884277, + "epoch": 2.4791483297707364, + "grad_norm": 0.08079521274065427, + "kl": 0.102691650390625, + "learning_rate": 4.917527739163947e-07, + "loss": 0.0001, + "reward": 1.7750000804662704, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000320374966, + "rewards/format_reward_func": 1.0, + "step": 14786 + }, + { + "completion_length": 241.19197750091553, + "epoch": 2.479483633010604, + "grad_norm": 0.06630944765696818, + "kl": 0.09710693359375, + "learning_rate": 4.917497179080013e-07, + "loss": 0.0001, + "reward": 1.8250000476837158, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8250000178813934, + "rewards/format_reward_func": 1.0, + "step": 14788 + }, + { + "completion_length": 244.2857255935669, + "epoch": 2.4798189362504717, + "grad_norm": 0.15402009555722682, + "kl": 0.108489990234375, + "learning_rate": 4.917466613430107e-07, + "loss": 0.0001, + "reward": 1.7678572162985802, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 14790 + }, + { + "completion_length": 253.34376430511475, + "epoch": 2.4801542394903393, + "grad_norm": 0.22679226122504897, + "kl": 0.116607666015625, + "learning_rate": 4.917436042214294e-07, + "loss": 0.0001, + "reward": 1.7464286386966705, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.755357164889574, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14792 + }, + { + "completion_length": 248.57143878936768, + "epoch": 2.4804895427302065, + "grad_norm": 0.14180333609406645, + "kl": 0.107330322265625, + "learning_rate": 4.917405465432649e-07, + "loss": 0.0001, + "reward": 1.757142923772335, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428939700127, + "rewards/format_reward_func": 1.0, + "step": 14794 + }, + { + "completion_length": 250.1607255935669, + "epoch": 2.480824845970074, + "grad_norm": 0.21500318228619053, + "kl": 0.118072509765625, + "learning_rate": 4.91737488308524e-07, + "loss": 0.0001, + "reward": 1.7821428924798965, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428813040257, + "rewards/format_reward_func": 1.0, + "step": 14796 + }, + { + "completion_length": 248.2678689956665, + "epoch": 2.4811601492099418, + "grad_norm": 0.17072486776960497, + "kl": 0.1063385009765625, + "learning_rate": 4.917344295172137e-07, + "loss": 0.0001, + "reward": 1.8000000566244125, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8000000268220901, + "rewards/format_reward_func": 1.0, + "step": 14798 + }, + { + "completion_length": 241.50447845458984, + "epoch": 2.4814954524498094, + "grad_norm": 0.16210839854375575, + "kl": 0.114166259765625, + "learning_rate": 4.917313701693412e-07, + "loss": 0.0001, + "reward": 1.792857214808464, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571701049805, + "rewards/format_reward_func": 1.0, + "step": 14800 + }, + { + "completion_length": 245.27233695983887, + "epoch": 2.481830755689677, + "grad_norm": 0.18009075898418456, + "kl": 0.11663818359375, + "learning_rate": 4.917283102649133e-07, + "loss": 0.0001, + "reward": 1.7821429371833801, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 14802 + }, + { + "completion_length": 252.86161994934082, + "epoch": 2.4821660589295442, + "grad_norm": 0.22212604391809182, + "kl": 0.1253662109375, + "learning_rate": 4.917252498039374e-07, + "loss": 0.0001, + "reward": 1.7589286267757416, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7633928880095482, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14804 + }, + { + "completion_length": 247.05804824829102, + "epoch": 2.482501362169412, + "grad_norm": 0.17060599530084553, + "kl": 0.10174560546875, + "learning_rate": 4.917221887864202e-07, + "loss": 0.0001, + "reward": 1.776785783469677, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7812500186264515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14806 + }, + { + "completion_length": 248.0044765472412, + "epoch": 2.4828366654092795, + "grad_norm": 0.19718760886727332, + "kl": 0.108856201171875, + "learning_rate": 4.91719127212369e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143092900515, + "rewards/format_reward_func": 1.0, + "step": 14808 + }, + { + "completion_length": 256.0312604904175, + "epoch": 2.483171968649147, + "grad_norm": 0.002740858816126789, + "kl": 0.102996826171875, + "learning_rate": 4.917160650817906e-07, + "loss": 0.0001, + "reward": 1.735714353621006, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7357143200933933, + "rewards/format_reward_func": 1.0, + "step": 14810 + }, + { + "completion_length": 246.3884048461914, + "epoch": 2.4835072718890148, + "grad_norm": 0.24967328188320786, + "kl": 0.11102294921875, + "learning_rate": 4.917130023946924e-07, + "loss": 0.0001, + "reward": 1.7678572088479996, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7678571790456772, + "rewards/format_reward_func": 1.0, + "step": 14812 + }, + { + "completion_length": 247.1696548461914, + "epoch": 2.4838425751288824, + "grad_norm": 0.14186057536510902, + "kl": 0.105712890625, + "learning_rate": 4.917099391510811e-07, + "loss": 0.0001, + "reward": 1.8464286103844643, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8464285992085934, + "rewards/format_reward_func": 1.0, + "step": 14814 + }, + { + "completion_length": 243.70090293884277, + "epoch": 2.4841778783687496, + "grad_norm": 0.16875123376236464, + "kl": 0.115570068359375, + "learning_rate": 4.917068753509639e-07, + "loss": 0.0001, + "reward": 1.7821429148316383, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.782142885029316, + "rewards/format_reward_func": 1.0, + "step": 14816 + }, + { + "completion_length": 251.5178680419922, + "epoch": 2.4845131816086172, + "grad_norm": 0.18352318078910093, + "kl": 0.108489990234375, + "learning_rate": 4.917038109943479e-07, + "loss": 0.0001, + "reward": 1.7071429267525673, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7071429006755352, + "rewards/format_reward_func": 1.0, + "step": 14818 + }, + { + "completion_length": 245.8437623977661, + "epoch": 2.484848484848485, + "grad_norm": 0.16090355001841908, + "kl": 0.105133056640625, + "learning_rate": 4.917007460812401e-07, + "loss": 0.0001, + "reward": 1.8214286118745804, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8214285857975483, + "rewards/format_reward_func": 1.0, + "step": 14820 + }, + { + "completion_length": 246.42858219146729, + "epoch": 2.4851837880883525, + "grad_norm": 0.1380564013172623, + "kl": 0.09600830078125, + "learning_rate": 4.916976806116476e-07, + "loss": 0.0001, + "reward": 1.8196429163217545, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8241071663796902, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14822 + }, + { + "completion_length": 246.72322750091553, + "epoch": 2.48551909132822, + "grad_norm": 0.15147188409912418, + "kl": 0.097808837890625, + "learning_rate": 4.916946145855774e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 14824 + }, + { + "completion_length": 249.93304634094238, + "epoch": 2.4858543945680873, + "grad_norm": 0.25257574082870304, + "kl": 0.1040191650390625, + "learning_rate": 4.916915480030365e-07, + "loss": 0.0001, + "reward": 1.7464286535978317, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.746428593993187, + "rewards/format_reward_func": 1.0, + "step": 14826 + }, + { + "completion_length": 247.65626049041748, + "epoch": 2.486189697807955, + "grad_norm": 0.09384685704779432, + "kl": 0.09881591796875, + "learning_rate": 4.916884808640323e-07, + "loss": 0.0001, + "reward": 1.7464286163449287, + "reward_std": 0.005050762556493282, + "rewards/equation_reward_func": 0.7464286126196384, + "rewards/format_reward_func": 1.0, + "step": 14828 + }, + { + "completion_length": 248.2321548461914, + "epoch": 2.4865250010478226, + "grad_norm": 0.16828925861257932, + "kl": 0.105712890625, + "learning_rate": 4.916854131685713e-07, + "loss": 0.0001, + "reward": 1.755357213318348, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.7598214708268642, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14830 + }, + { + "completion_length": 242.9285831451416, + "epoch": 2.4868603042876902, + "grad_norm": 0.15619959437078137, + "kl": 0.10552978515625, + "learning_rate": 4.916823449166611e-07, + "loss": 0.0001, + "reward": 1.7267857939004898, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.7312500365078449, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14832 + }, + { + "completion_length": 254.9955472946167, + "epoch": 2.487195607527558, + "grad_norm": 0.2567715716716288, + "kl": 0.0962982177734375, + "learning_rate": 4.916792761083084e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 14834 + }, + { + "completion_length": 255.8794755935669, + "epoch": 2.4875309107674255, + "grad_norm": 0.13534000071902677, + "kl": 0.105438232421875, + "learning_rate": 4.916762067435204e-07, + "loss": 0.0001, + "reward": 1.7928571999073029, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 14836 + }, + { + "completion_length": 252.67858600616455, + "epoch": 2.4878662140072927, + "grad_norm": 0.24524325288408574, + "kl": 0.112060546875, + "learning_rate": 4.916731368223042e-07, + "loss": 0.0001, + "reward": 1.7142857983708382, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.714285746216774, + "rewards/format_reward_func": 1.0, + "step": 14838 + }, + { + "completion_length": 254.23215198516846, + "epoch": 2.4882015172471603, + "grad_norm": 0.21139274158098173, + "kl": 0.104644775390625, + "learning_rate": 4.916700663446668e-07, + "loss": 0.0001, + "reward": 1.7607143446803093, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7607143297791481, + "rewards/format_reward_func": 1.0, + "step": 14840 + }, + { + "completion_length": 259.102689743042, + "epoch": 2.488536820487028, + "grad_norm": 0.14614650915831776, + "kl": 0.117706298828125, + "learning_rate": 4.916669953106154e-07, + "loss": 0.0001, + "reward": 1.7214286401867867, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7214285917580128, + "rewards/format_reward_func": 1.0, + "step": 14842 + }, + { + "completion_length": 248.5491189956665, + "epoch": 2.4888721237268956, + "grad_norm": 0.24970700735528656, + "kl": 0.1097412109375, + "learning_rate": 4.916639237201568e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7357143126428127, + "rewards/format_reward_func": 1.0, + "step": 14844 + }, + { + "completion_length": 250.30804538726807, + "epoch": 2.4892074269667632, + "grad_norm": 0.15992926840127522, + "kl": 0.1121826171875, + "learning_rate": 4.916608515732984e-07, + "loss": 0.0001, + "reward": 1.7035714834928513, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7035714685916901, + "rewards/format_reward_func": 1.0, + "step": 14846 + }, + { + "completion_length": 249.04018878936768, + "epoch": 2.4895427302066304, + "grad_norm": 0.20425696513495342, + "kl": 0.096221923828125, + "learning_rate": 4.916577788700471e-07, + "loss": 0.0001, + "reward": 1.79464291036129, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7991071715950966, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14848 + }, + { + "completion_length": 255.94644165039062, + "epoch": 2.489878033446498, + "grad_norm": 0.2703017241730625, + "kl": 0.104248046875, + "learning_rate": 4.9165470561041e-07, + "loss": 0.0001, + "reward": 1.7660714834928513, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7705357447266579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14850 + }, + { + "completion_length": 251.53572750091553, + "epoch": 2.4902133366863657, + "grad_norm": 0.08715365745771998, + "kl": 0.109375, + "learning_rate": 4.916516317943942e-07, + "loss": 0.0001, + "reward": 1.7464286237955093, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7464286014437675, + "rewards/format_reward_func": 1.0, + "step": 14852 + }, + { + "completion_length": 242.1071548461914, + "epoch": 2.4905486399262333, + "grad_norm": 0.1947782628519035, + "kl": 0.105072021484375, + "learning_rate": 4.916485574220066e-07, + "loss": 0.0001, + "reward": 1.7767857760190964, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.7812500223517418, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14854 + }, + { + "completion_length": 244.5178689956665, + "epoch": 2.490883943166101, + "grad_norm": 0.17149906856996142, + "kl": 0.099365234375, + "learning_rate": 4.916454824932545e-07, + "loss": 0.0001, + "reward": 1.8321429193019867, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.8321428745985031, + "rewards/format_reward_func": 1.0, + "step": 14856 + }, + { + "completion_length": 253.70536613464355, + "epoch": 2.4912192464059686, + "grad_norm": 0.225006882706186, + "kl": 0.106292724609375, + "learning_rate": 4.916424070081448e-07, + "loss": 0.0001, + "reward": 1.7357143685221672, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7357143089175224, + "rewards/format_reward_func": 1.0, + "step": 14858 + }, + { + "completion_length": 258.6651906967163, + "epoch": 2.491554549645836, + "grad_norm": 0.19474515995727912, + "kl": 0.110687255859375, + "learning_rate": 4.916393309666849e-07, + "loss": 0.0001, + "reward": 1.7892857640981674, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7892857491970062, + "rewards/format_reward_func": 1.0, + "step": 14860 + }, + { + "completion_length": 252.4821548461914, + "epoch": 2.4918898528857034, + "grad_norm": 0.25165869699775634, + "kl": 0.0942230224609375, + "learning_rate": 4.916362543688816e-07, + "loss": 0.0001, + "reward": 1.7392857819795609, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.739285746589303, + "rewards/format_reward_func": 1.0, + "step": 14862 + }, + { + "completion_length": 253.31697750091553, + "epoch": 2.492225156125571, + "grad_norm": 0.4725974865113826, + "kl": 0.11981201171875, + "learning_rate": 4.91633177214742e-07, + "loss": 0.0001, + "reward": 1.7517857924103737, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7562500238418579, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14864 + }, + { + "completion_length": 246.87947463989258, + "epoch": 2.4925604593654387, + "grad_norm": 0.16160872714609656, + "kl": 0.10552978515625, + "learning_rate": 4.916300995042732e-07, + "loss": 0.0001, + "reward": 1.8142857626080513, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857514321804, + "rewards/format_reward_func": 1.0, + "step": 14866 + }, + { + "completion_length": 250.52679443359375, + "epoch": 2.4928957626053063, + "grad_norm": 0.09402971091803046, + "kl": 0.097747802734375, + "learning_rate": 4.916270212374824e-07, + "loss": 0.0001, + "reward": 1.803571492433548, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8035714440047741, + "rewards/format_reward_func": 1.0, + "step": 14868 + }, + { + "completion_length": 256.6116189956665, + "epoch": 2.4932310658451735, + "grad_norm": 0.16822883694092985, + "kl": 0.1134033203125, + "learning_rate": 4.916239424143766e-07, + "loss": 0.0001, + "reward": 1.7375000640749931, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7419643253087997, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14870 + }, + { + "completion_length": 247.7544765472412, + "epoch": 2.493566369085041, + "grad_norm": 0.17898003366783408, + "kl": 0.10797119140625, + "learning_rate": 4.916208630349628e-07, + "loss": 0.0001, + "reward": 1.835714340209961, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.835714302957058, + "rewards/format_reward_func": 1.0, + "step": 14872 + }, + { + "completion_length": 237.7410831451416, + "epoch": 2.493901672324909, + "grad_norm": 0.15222057489193858, + "kl": 0.100433349609375, + "learning_rate": 4.916177830992482e-07, + "loss": 0.0001, + "reward": 1.8107143267989159, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.8107143044471741, + "rewards/format_reward_func": 1.0, + "step": 14874 + }, + { + "completion_length": 253.51786994934082, + "epoch": 2.4942369755647764, + "grad_norm": 0.16371729482573208, + "kl": 0.10443115234375, + "learning_rate": 4.9161470260724e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7776785977184772, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14876 + }, + { + "completion_length": 263.071439743042, + "epoch": 2.494572278804644, + "grad_norm": 0.19259044880307022, + "kl": 0.119903564453125, + "learning_rate": 4.916116215589451e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.0883883461356163, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14878 + }, + { + "completion_length": 253.60268878936768, + "epoch": 2.4949075820445117, + "grad_norm": 0.13104247323147747, + "kl": 0.109039306640625, + "learning_rate": 4.916085399543707e-07, + "loss": 0.0001, + "reward": 1.7321429178118706, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7321428973227739, + "rewards/format_reward_func": 1.0, + "step": 14880 + }, + { + "completion_length": 247.28126335144043, + "epoch": 2.495242885284379, + "grad_norm": 0.0019853055196513178, + "kl": 0.099822998046875, + "learning_rate": 4.916054577935238e-07, + "loss": 0.0001, + "reward": 1.791071467101574, + "reward_std": 0.012626906856894493, + "rewards/equation_reward_func": 0.7955357432365417, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14882 + }, + { + "completion_length": 252.61608600616455, + "epoch": 2.4955781885242465, + "grad_norm": 0.18710983271999201, + "kl": 0.117523193359375, + "learning_rate": 4.916023750764116e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 14884 + }, + { + "completion_length": 252.84375858306885, + "epoch": 2.495913491764114, + "grad_norm": 0.18797587280849457, + "kl": 0.1116943359375, + "learning_rate": 4.915992918030412e-07, + "loss": 0.0001, + "reward": 1.8125000596046448, + "reward_std": 0.03282995708286762, + "rewards/equation_reward_func": 0.8169643096625805, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14886 + }, + { + "completion_length": 247.48662090301514, + "epoch": 2.496248795003982, + "grad_norm": 0.27004660458537005, + "kl": 0.1292724609375, + "learning_rate": 4.915962079734195e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.04545686300843954, + "rewards/equation_reward_func": 0.7964285910129547, + "rewards/format_reward_func": 1.0, + "step": 14888 + }, + { + "completion_length": 252.14733219146729, + "epoch": 2.4965840982438494, + "grad_norm": 0.12158380895645579, + "kl": 0.120574951171875, + "learning_rate": 4.915931235875538e-07, + "loss": 0.0001, + "reward": 1.7785714864730835, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 14890 + }, + { + "completion_length": 261.8794765472412, + "epoch": 2.4969194014837166, + "grad_norm": 0.12848837493711257, + "kl": 0.1173095703125, + "learning_rate": 4.915900386454512e-07, + "loss": 0.0001, + "reward": 1.817857213318348, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.8178571723401546, + "rewards/format_reward_func": 1.0, + "step": 14892 + }, + { + "completion_length": 257.508939743042, + "epoch": 2.4972547047235842, + "grad_norm": 0.37214306927685425, + "kl": 0.11920166015625, + "learning_rate": 4.915869531471188e-07, + "loss": 0.0001, + "reward": 1.7642857804894447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857357859612, + "rewards/format_reward_func": 1.0, + "step": 14894 + }, + { + "completion_length": 257.415189743042, + "epoch": 2.497590007963452, + "grad_norm": 0.19139334417935533, + "kl": 0.1249847412109375, + "learning_rate": 4.915838670925636e-07, + "loss": 0.0001, + "reward": 1.778571479022503, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7785714585334063, + "rewards/format_reward_func": 1.0, + "step": 14896 + }, + { + "completion_length": 258.83037090301514, + "epoch": 2.4979253112033195, + "grad_norm": 0.1739975764465377, + "kl": 0.11895751953125, + "learning_rate": 4.915807804817927e-07, + "loss": 0.0001, + "reward": 1.7625000402331352, + "reward_std": 0.06313453521579504, + "rewards/equation_reward_func": 0.775892885401845, + "rewards/format_reward_func": 0.9866071492433548, + "step": 14898 + }, + { + "completion_length": 257.80358028411865, + "epoch": 2.498260614443187, + "grad_norm": 0.12046699172711495, + "kl": 0.124237060546875, + "learning_rate": 4.915776933148135e-07, + "loss": 0.0001, + "reward": 1.81428574770689, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 14900 + }, + { + "completion_length": 263.2187604904175, + "epoch": 2.498595917683055, + "grad_norm": 0.35322802369354245, + "kl": 0.1260986328125, + "learning_rate": 4.915746055916327e-07, + "loss": 0.0001, + "reward": 1.7803571969270706, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7848214488476515, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14902 + }, + { + "completion_length": 246.6428689956665, + "epoch": 2.4989312209229224, + "grad_norm": 0.004772484218738324, + "kl": 0.1209869384765625, + "learning_rate": 4.915715173122575e-07, + "loss": 0.0001, + "reward": 1.7857143431901932, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 14904 + }, + { + "completion_length": 258.6875104904175, + "epoch": 2.4992665241627896, + "grad_norm": 0.14919904400282405, + "kl": 0.1072998046875, + "learning_rate": 4.915684284766953e-07, + "loss": 0.0001, + "reward": 1.7785714715719223, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714641213417, + "rewards/format_reward_func": 1.0, + "step": 14906 + }, + { + "completion_length": 253.6205472946167, + "epoch": 2.4996018274026572, + "grad_norm": 0.25196101649712355, + "kl": 0.10772705078125, + "learning_rate": 4.915653390849529e-07, + "loss": 0.0001, + "reward": 1.832142911851406, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8321428708732128, + "rewards/format_reward_func": 1.0, + "step": 14908 + }, + { + "completion_length": 253.071439743042, + "epoch": 2.499937130642525, + "grad_norm": 0.1945168808074864, + "kl": 0.124420166015625, + "learning_rate": 4.915622491370376e-07, + "loss": 0.0001, + "reward": 1.7821429073810577, + "reward_std": 0.055558389984071255, + "rewards/equation_reward_func": 0.7910714522004128, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14910 + }, + { + "completion_length": 250.1384038925171, + "epoch": 2.5002724338823925, + "grad_norm": 0.15781246856605685, + "kl": 0.1234130859375, + "learning_rate": 4.915591586329563e-07, + "loss": 0.0001, + "reward": 1.7928571850061417, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7928571812808514, + "rewards/format_reward_func": 1.0, + "step": 14912 + }, + { + "completion_length": 255.00894165039062, + "epoch": 2.5006077371222597, + "grad_norm": 0.2201245062876672, + "kl": 0.155975341796875, + "learning_rate": 4.915560675727164e-07, + "loss": 0.0002, + "reward": 1.8053572103381157, + "reward_std": 0.05303300730884075, + "rewards/equation_reward_func": 0.8098214566707611, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14914 + }, + { + "completion_length": 252.48215579986572, + "epoch": 2.5009430403621273, + "grad_norm": 0.14755556808748146, + "kl": 0.123779296875, + "learning_rate": 4.915529759563248e-07, + "loss": 0.0001, + "reward": 1.796428620815277, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286096394062, + "rewards/format_reward_func": 1.0, + "step": 14916 + }, + { + "completion_length": 250.58483123779297, + "epoch": 2.501278343601995, + "grad_norm": 0.16455636465986687, + "kl": 0.120697021484375, + "learning_rate": 4.915498837837887e-07, + "loss": 0.0001, + "reward": 1.785714365541935, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857143133878708, + "rewards/format_reward_func": 1.0, + "step": 14918 + }, + { + "completion_length": 260.75894260406494, + "epoch": 2.5016136468418626, + "grad_norm": 0.17272260899471706, + "kl": 0.1383056640625, + "learning_rate": 4.915467910551153e-07, + "loss": 0.0001, + "reward": 1.814285784959793, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8142857477068901, + "rewards/format_reward_func": 1.0, + "step": 14920 + }, + { + "completion_length": 256.6205463409424, + "epoch": 2.5019489500817302, + "grad_norm": 0.19178782600827965, + "kl": 0.25885009765625, + "learning_rate": 4.915436977703114e-07, + "loss": 0.0003, + "reward": 1.8107143566012383, + "reward_std": 0.055558388121426105, + "rewards/equation_reward_func": 0.8107143063098192, + "rewards/format_reward_func": 1.0, + "step": 14922 + }, + { + "completion_length": 258.1964416503906, + "epoch": 2.502284253321598, + "grad_norm": 0.39726649210975157, + "kl": 0.156768798828125, + "learning_rate": 4.915406039293845e-07, + "loss": 0.0002, + "reward": 1.7821428924798965, + "reward_std": 0.06565991416573524, + "rewards/equation_reward_func": 0.7910714633762836, + "rewards/format_reward_func": 0.9910714328289032, + "step": 14924 + }, + { + "completion_length": 268.22769355773926, + "epoch": 2.5026195565614655, + "grad_norm": 0.2066904623535352, + "kl": 0.150665283203125, + "learning_rate": 4.915375095323417e-07, + "loss": 0.0002, + "reward": 1.780357226729393, + "reward_std": 0.0681852949783206, + "rewards/equation_reward_func": 0.7848214618861675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14926 + }, + { + "completion_length": 257.2232246398926, + "epoch": 2.5029548598013327, + "grad_norm": 0.21913485169174565, + "kl": 0.150390625, + "learning_rate": 4.915344145791898e-07, + "loss": 0.0002, + "reward": 1.7142858058214188, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.7142857611179352, + "rewards/format_reward_func": 1.0, + "step": 14928 + }, + { + "completion_length": 263.6607255935669, + "epoch": 2.5032901630412003, + "grad_norm": 0.2649583866173159, + "kl": 0.17791748046875, + "learning_rate": 4.915313190699362e-07, + "loss": 0.0002, + "reward": 1.7160714864730835, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7205357383936644, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14930 + }, + { + "completion_length": 266.9955472946167, + "epoch": 2.503625466281068, + "grad_norm": 0.20459147581001738, + "kl": 0.191192626953125, + "learning_rate": 4.915282230045878e-07, + "loss": 0.0002, + "reward": 1.7732143253087997, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786051690578, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14932 + }, + { + "completion_length": 255.90180015563965, + "epoch": 2.5039607695209356, + "grad_norm": 0.25484307197764544, + "kl": 0.142059326171875, + "learning_rate": 4.91525126383152e-07, + "loss": 0.0001, + "reward": 1.7964286357164383, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7964286021888256, + "rewards/format_reward_func": 1.0, + "step": 14934 + }, + { + "completion_length": 267.54912090301514, + "epoch": 2.504296072760803, + "grad_norm": 0.15722467533159465, + "kl": 0.2144775390625, + "learning_rate": 4.915220292056359e-07, + "loss": 0.0002, + "reward": 1.753571517765522, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714730620384, + "rewards/format_reward_func": 1.0, + "step": 14936 + }, + { + "completion_length": 250.33483219146729, + "epoch": 2.5046313760006704, + "grad_norm": 0.13771303946559407, + "kl": 0.122711181640625, + "learning_rate": 4.915189314720465e-07, + "loss": 0.0001, + "reward": 1.7982143387198448, + "reward_std": 0.04293148219585419, + "rewards/equation_reward_func": 0.802678607404232, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14938 + }, + { + "completion_length": 247.95536708831787, + "epoch": 2.504966679240538, + "grad_norm": 0.15147710220354377, + "kl": 0.14837646484375, + "learning_rate": 4.915158331823909e-07, + "loss": 0.0001, + "reward": 1.796428643167019, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7964285910129547, + "rewards/format_reward_func": 1.0, + "step": 14940 + }, + { + "completion_length": 252.75000953674316, + "epoch": 2.5053019824804057, + "grad_norm": 0.09408187522982184, + "kl": 0.2237548828125, + "learning_rate": 4.915127343366763e-07, + "loss": 0.0002, + "reward": 1.760714367032051, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7607143111526966, + "rewards/format_reward_func": 1.0, + "step": 14942 + }, + { + "completion_length": 250.71430015563965, + "epoch": 2.5056372857202733, + "grad_norm": 0.09563232486259707, + "kl": 0.1910400390625, + "learning_rate": 4.915096349349098e-07, + "loss": 0.0002, + "reward": 1.7875000312924385, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.7919643223285675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14944 + }, + { + "completion_length": 249.0937623977661, + "epoch": 2.505972588960141, + "grad_norm": 0.17560738401963297, + "kl": 0.17327880859375, + "learning_rate": 4.915065349770987e-07, + "loss": 0.0002, + "reward": 1.7428572252392769, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7428571805357933, + "rewards/format_reward_func": 1.0, + "step": 14946 + }, + { + "completion_length": 247.321439743042, + "epoch": 2.5063078922000086, + "grad_norm": 0.4454910772698461, + "kl": 0.14111328125, + "learning_rate": 4.9150343446325e-07, + "loss": 0.0001, + "reward": 1.7571429312229156, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7571428865194321, + "rewards/format_reward_func": 1.0, + "step": 14948 + }, + { + "completion_length": 249.8571538925171, + "epoch": 2.506643195439876, + "grad_norm": 0.24536262681171336, + "kl": 0.170684814453125, + "learning_rate": 4.915003333933708e-07, + "loss": 0.0002, + "reward": 1.7937500774860382, + "reward_std": 0.049244935624301434, + "rewards/equation_reward_func": 0.7955357395112514, + "rewards/format_reward_func": 0.9982142895460129, + "step": 14950 + }, + { + "completion_length": 238.4687614440918, + "epoch": 2.5069784986797434, + "grad_norm": 0.25230609198150733, + "kl": 0.148651123046875, + "learning_rate": 4.914972317674683e-07, + "loss": 0.0001, + "reward": 1.782142885029316, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428831666708, + "rewards/format_reward_func": 1.0, + "step": 14952 + }, + { + "completion_length": 250.16072368621826, + "epoch": 2.507313801919611, + "grad_norm": 0.1501950138335119, + "kl": 0.25262451171875, + "learning_rate": 4.914941295855496e-07, + "loss": 0.0003, + "reward": 1.7785714715719223, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 14954 + }, + { + "completion_length": 233.84376335144043, + "epoch": 2.5076491051594787, + "grad_norm": 0.10516466220468688, + "kl": 0.159942626953125, + "learning_rate": 4.91491026847622e-07, + "loss": 0.0002, + "reward": 1.7857143431901932, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.7857142947614193, + "rewards/format_reward_func": 1.0, + "step": 14956 + }, + { + "completion_length": 245.0982265472412, + "epoch": 2.507984408399346, + "grad_norm": 0.22351192817369384, + "kl": 0.1419677734375, + "learning_rate": 4.914879235536924e-07, + "loss": 0.0001, + "reward": 1.8250000402331352, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.8250000290572643, + "rewards/format_reward_func": 1.0, + "step": 14958 + }, + { + "completion_length": 234.88840293884277, + "epoch": 2.5083197116392135, + "grad_norm": 0.10862469988182176, + "kl": 0.117095947265625, + "learning_rate": 4.914848197037681e-07, + "loss": 0.0001, + "reward": 1.7232143506407738, + "reward_std": 0.02777919452637434, + "rewards/equation_reward_func": 0.727678595110774, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14960 + }, + { + "completion_length": 246.5982255935669, + "epoch": 2.508655014879081, + "grad_norm": 0.34570987382328267, + "kl": 0.117919921875, + "learning_rate": 4.914817152978561e-07, + "loss": 0.0001, + "reward": 1.7428572103381157, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7428571693599224, + "rewards/format_reward_func": 1.0, + "step": 14962 + }, + { + "completion_length": 245.7232265472412, + "epoch": 2.508990318118949, + "grad_norm": 0.16383356087510462, + "kl": 0.11492919921875, + "learning_rate": 4.914786103359639e-07, + "loss": 0.0001, + "reward": 1.7678571939468384, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7678571902215481, + "rewards/format_reward_func": 1.0, + "step": 14964 + }, + { + "completion_length": 229.80804634094238, + "epoch": 2.5093256213588164, + "grad_norm": 0.3562128005704095, + "kl": 0.173583984375, + "learning_rate": 4.914755048180981e-07, + "loss": 0.0002, + "reward": 1.7875000536441803, + "reward_std": 0.058083769865334034, + "rewards/equation_reward_func": 0.7919643074274063, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14966 + }, + { + "completion_length": 235.73215293884277, + "epoch": 2.509660924598684, + "grad_norm": 0.20378825155179361, + "kl": 0.159698486328125, + "learning_rate": 4.914723987442664e-07, + "loss": 0.0002, + "reward": 1.7785715013742447, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7785714603960514, + "rewards/format_reward_func": 1.0, + "step": 14968 + }, + { + "completion_length": 240.1250114440918, + "epoch": 2.5099962278385517, + "grad_norm": 0.0031109090255418614, + "kl": 0.117584228515625, + "learning_rate": 4.914692921144756e-07, + "loss": 0.0001, + "reward": 1.8500000312924385, + "reward_std": 0.010101525112986565, + "rewards/equation_reward_func": 0.8500000312924385, + "rewards/format_reward_func": 1.0, + "step": 14970 + }, + { + "completion_length": 243.4732265472412, + "epoch": 2.510331531078419, + "grad_norm": 0.1671077710760345, + "kl": 0.12384033203125, + "learning_rate": 4.91466184928733e-07, + "loss": 0.0001, + "reward": 1.8285714611411095, + "reward_std": 0.04040610045194626, + "rewards/equation_reward_func": 0.8285714574158192, + "rewards/format_reward_func": 1.0, + "step": 14972 + }, + { + "completion_length": 243.3616180419922, + "epoch": 2.5106668343182865, + "grad_norm": 0.0040697662114411014, + "kl": 0.123992919921875, + "learning_rate": 4.914630771870457e-07, + "loss": 0.0001, + "reward": 1.7607143372297287, + "reward_std": 0.015152287669479847, + "rewards/equation_reward_func": 0.7607143260538578, + "rewards/format_reward_func": 1.0, + "step": 14974 + }, + { + "completion_length": 233.06251049041748, + "epoch": 2.511002137558154, + "grad_norm": 0.2969220308731453, + "kl": 0.117645263671875, + "learning_rate": 4.914599688894208e-07, + "loss": 0.0001, + "reward": 1.7928572073578835, + "reward_std": 0.06060915067791939, + "rewards/equation_reward_func": 0.7928571663796902, + "rewards/format_reward_func": 1.0, + "step": 14976 + }, + { + "completion_length": 244.55804443359375, + "epoch": 2.511337440798022, + "grad_norm": 0.14675272791289684, + "kl": 0.14947509765625, + "learning_rate": 4.914568600358656e-07, + "loss": 0.0001, + "reward": 1.7750000432133675, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7750000432133675, + "rewards/format_reward_func": 1.0, + "step": 14978 + }, + { + "completion_length": 242.08929824829102, + "epoch": 2.5116727440378894, + "grad_norm": 0.23792415652608712, + "kl": 0.123748779296875, + "learning_rate": 4.914537506263871e-07, + "loss": 0.0001, + "reward": 1.7285715118050575, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7285714522004128, + "rewards/format_reward_func": 1.0, + "step": 14980 + }, + { + "completion_length": 237.70983219146729, + "epoch": 2.5120080472777566, + "grad_norm": 0.16721352864248729, + "kl": 0.1259307861328125, + "learning_rate": 4.914506406609927e-07, + "loss": 0.0001, + "reward": 1.7732143476605415, + "reward_std": 0.037880719639360905, + "rewards/equation_reward_func": 0.7776786014437675, + "rewards/format_reward_func": 0.9955357164144516, + "step": 14982 + }, + { + "completion_length": 242.0669755935669, + "epoch": 2.5123433505176243, + "grad_norm": 0.1793158228133712, + "kl": 0.114105224609375, + "learning_rate": 4.914475301396893e-07, + "loss": 0.0001, + "reward": 1.767857201397419, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7678571715950966, + "rewards/format_reward_func": 1.0, + "step": 14984 + }, + { + "completion_length": 234.45090198516846, + "epoch": 2.512678653757492, + "grad_norm": 0.1808318129904359, + "kl": 0.1202392578125, + "learning_rate": 4.914444190624842e-07, + "loss": 0.0001, + "reward": 1.778571493923664, + "reward_std": 0.02020305022597313, + "rewards/equation_reward_func": 0.7785714492201805, + "rewards/format_reward_func": 1.0, + "step": 14986 + }, + { + "completion_length": 232.07590198516846, + "epoch": 2.5130139569973595, + "grad_norm": 0.00458665470919745, + "kl": 0.172943115234375, + "learning_rate": 4.914413074293845e-07, + "loss": 0.0002, + "reward": 1.7642857655882835, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7642857395112514, + "rewards/format_reward_func": 1.0, + "step": 14988 + }, + { + "completion_length": 241.29018783569336, + "epoch": 2.513349260237227, + "grad_norm": 0.21982994897106903, + "kl": 0.13336181640625, + "learning_rate": 4.914381952403974e-07, + "loss": 0.0001, + "reward": 1.8142857626080513, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8142857402563095, + "rewards/format_reward_func": 1.0, + "step": 14990 + }, + { + "completion_length": 230.55358123779297, + "epoch": 2.513684563477095, + "grad_norm": 0.23499309856862302, + "kl": 0.129669189453125, + "learning_rate": 4.9143508249553e-07, + "loss": 0.0001, + "reward": 1.7535715028643608, + "reward_std": 0.035355337895452976, + "rewards/equation_reward_func": 0.7535714767873287, + "rewards/format_reward_func": 1.0, + "step": 14992 + }, + { + "completion_length": 240.4241180419922, + "epoch": 2.5140198667169624, + "grad_norm": 0.15111986044046005, + "kl": 0.165252685546875, + "learning_rate": 4.914319691947896e-07, + "loss": 0.0002, + "reward": 1.807142898440361, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 14994 + }, + { + "completion_length": 227.11608028411865, + "epoch": 2.5143551699568296, + "grad_norm": 0.13801908442240357, + "kl": 0.121490478515625, + "learning_rate": 4.914288553381833e-07, + "loss": 0.0001, + "reward": 1.782142922282219, + "reward_std": 0.02525381278246641, + "rewards/equation_reward_func": 0.7821428906172514, + "rewards/format_reward_func": 1.0, + "step": 14996 + }, + { + "completion_length": 232.95090293884277, + "epoch": 2.5146904731966973, + "grad_norm": 0.13381548188941406, + "kl": 0.12237548828125, + "learning_rate": 4.914257409257182e-07, + "loss": 0.0001, + "reward": 1.8071429058909416, + "reward_std": 0.05050762556493282, + "rewards/equation_reward_func": 0.8071428798139095, + "rewards/format_reward_func": 1.0, + "step": 14998 + }, + { + "completion_length": 230.81250953674316, + "epoch": 2.515025776436565, + "grad_norm": 0.003928641599324006, + "kl": 0.128692626953125, + "learning_rate": 4.914226259574015e-07, + "loss": 0.0001, + "reward": 1.7357143610715866, + "reward_std": 0.030304575338959694, + "rewards/equation_reward_func": 0.7357143014669418, + "rewards/format_reward_func": 1.0, + "step": 15000 + } + ], + "logging_steps": 2, + "max_steps": 134996, + "num_input_tokens_seen": 0, + "num_train_epochs": 23, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}