{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.48154093097913325, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 560.3958587646484, "epoch": 0.0032102728731942215, "grad_norm": 0.1884765625, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.6299200654029846, "reward_std": 0.34568188339471817, "rewards/expression_based_accuracy_reward_length_penalized": 0.3343471363186836, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2955729365348816, "step": 1 }, { "completion_length": 574.9948120117188, "epoch": 0.006420545746388443, "grad_norm": 0.20703125, "kl": 0.0, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.6667226850986481, "reward_std": 0.3381393700838089, "rewards/expression_based_accuracy_reward_length_penalized": 0.3724518120288849, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2942708432674408, "step": 2 }, { "completion_length": 597.8541870117188, "epoch": 0.009630818619582664, "grad_norm": 0.185546875, "kl": 0.00022509081827593036, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.636066347360611, "reward_std": 0.35888948291540146, "rewards/expression_based_accuracy_reward_length_penalized": 0.33658717572689056, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.299479179084301, "step": 3 }, { "completion_length": 568.2656555175781, "epoch": 0.012841091492776886, "grad_norm": 0.2412109375, "kl": 0.00023551580670755357, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.6489640921354294, "reward_std": 0.344046413898468, "rewards/expression_based_accuracy_reward_length_penalized": 0.3677141070365906, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2812500149011612, "step": 4 }, { "completion_length": 637.7135620117188, "epoch": 0.016051364365971106, "grad_norm": 0.169921875, "kl": 0.00023702834732830524, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.6146693080663681, "reward_std": 0.35021649301052094, "rewards/expression_based_accuracy_reward_length_penalized": 0.3256068006157875, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2890625037252903, "step": 5 }, { "completion_length": 607.0260620117188, "epoch": 0.019261637239165328, "grad_norm": 0.2080078125, "kl": 0.00022915955560165457, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.5901845693588257, "reward_std": 0.3410582020878792, "rewards/expression_based_accuracy_reward_length_penalized": 0.3063303604722023, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.283854179084301, "step": 6 }, { "completion_length": 536.9713745117188, "epoch": 0.02247191011235955, "grad_norm": 0.20703125, "kl": 0.00023870709992479533, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.5996608734130859, "reward_std": 0.3273390009999275, "rewards/expression_based_accuracy_reward_length_penalized": 0.29497333616018295, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3046875074505806, "step": 7 }, { "completion_length": 579.4062652587891, "epoch": 0.025682182985553772, "grad_norm": 0.193359375, "kl": 0.0002171014821215067, "learning_rate": 5.333333333333333e-07, "loss": 0.0, "reward": 0.6376358270645142, "reward_std": 0.34004897624254227, "rewards/expression_based_accuracy_reward_length_penalized": 0.355083703994751, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2825520932674408, "step": 8 }, { "completion_length": 546.3750228881836, "epoch": 0.028892455858747994, "grad_norm": 0.212890625, "kl": 0.00022866667859489098, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.6513122767210007, "reward_std": 0.32241296768188477, "rewards/expression_based_accuracy_reward_length_penalized": 0.3576924651861191, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2936197966337204, "step": 9 }, { "completion_length": 573.7396087646484, "epoch": 0.03210272873194221, "grad_norm": 0.1904296875, "kl": 0.0002470466679369565, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.679995134472847, "reward_std": 0.3322247415781021, "rewards/expression_based_accuracy_reward_length_penalized": 0.3909326568245888, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2890625074505806, "step": 10 }, { "completion_length": 563.1927337646484, "epoch": 0.03531300160513644, "grad_norm": 0.220703125, "kl": 0.00023816750763216987, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "reward": 0.6119517982006073, "reward_std": 0.3330337107181549, "rewards/expression_based_accuracy_reward_length_penalized": 0.3124726414680481, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2994791865348816, "step": 11 }, { "completion_length": 520.0416793823242, "epoch": 0.038523274478330656, "grad_norm": 0.2421875, "kl": 0.0002197102876380086, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.608386904001236, "reward_std": 0.3292866423726082, "rewards/expression_based_accuracy_reward_length_penalized": 0.31997546553611755, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2884114608168602, "step": 12 }, { "completion_length": 564.2135543823242, "epoch": 0.04173354735152488, "grad_norm": 0.2158203125, "kl": 0.00023579742264701054, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "reward": 0.6887014210224152, "reward_std": 0.3306478410959244, "rewards/expression_based_accuracy_reward_length_penalized": 0.40614935383200645, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2825520895421505, "step": 13 }, { "completion_length": 597.3411712646484, "epoch": 0.0449438202247191, "grad_norm": 0.18359375, "kl": 0.00021818295135744847, "learning_rate": 9.333333333333333e-07, "loss": 0.0, "reward": 0.5946466475725174, "reward_std": 0.322207048535347, "rewards/expression_based_accuracy_reward_length_penalized": 0.3166518397629261, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2779947966337204, "step": 14 }, { "completion_length": 584.4505310058594, "epoch": 0.048154093097913325, "grad_norm": 0.17578125, "kl": 0.00022104514937382191, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.5846492573618889, "reward_std": 0.3315364196896553, "rewards/expression_based_accuracy_reward_length_penalized": 0.2994930259883404, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2851562574505806, "step": 15 }, { "completion_length": 554.7083587646484, "epoch": 0.051364365971107544, "grad_norm": 0.2119140625, "kl": 0.00023191924265120178, "learning_rate": 9.998781585307575e-07, "loss": 0.0, "reward": 0.6661794185638428, "reward_std": 0.3503050282597542, "rewards/expression_based_accuracy_reward_length_penalized": 0.3647470995783806, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.301432304084301, "step": 16 }, { "completion_length": 580.8854522705078, "epoch": 0.05457463884430177, "grad_norm": 0.1826171875, "kl": 0.0002030548857874237, "learning_rate": 9.99512700102336e-07, "loss": 0.0, "reward": 0.6631067544221878, "reward_std": 0.3135067969560623, "rewards/expression_based_accuracy_reward_length_penalized": 0.3707890138030052, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2923177182674408, "step": 17 }, { "completion_length": 580.8359527587891, "epoch": 0.05778491171749599, "grad_norm": 0.2001953125, "kl": 0.0002304925255884882, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.651978924870491, "reward_std": 0.36701615154743195, "rewards/expression_based_accuracy_reward_length_penalized": 0.3407810106873512, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.311197929084301, "step": 18 }, { "completion_length": 572.1041717529297, "epoch": 0.060995184590690206, "grad_norm": 0.2236328125, "kl": 0.00021570282842731103, "learning_rate": 9.98051855792412e-07, "loss": 0.0, "reward": 0.631376326084137, "reward_std": 0.34789177030324936, "rewards/expression_based_accuracy_reward_length_penalized": 0.3279908671975136, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.303385429084301, "step": 19 }, { "completion_length": 569.4166717529297, "epoch": 0.06420545746388442, "grad_norm": 0.177734375, "kl": 0.00021378670862759463, "learning_rate": 9.969572609838744e-07, "loss": 0.0, "reward": 0.5896809697151184, "reward_std": 0.3236342519521713, "rewards/expression_based_accuracy_reward_length_penalized": 0.2973632514476776, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2923177182674408, "step": 20 }, { "completion_length": 580.0417022705078, "epoch": 0.06741573033707865, "grad_norm": 0.1923828125, "kl": 0.00023035979393171147, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": 0.6523794531822205, "reward_std": 0.3744150176644325, "rewards/expression_based_accuracy_reward_length_penalized": 0.35159818083047867, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3007812574505806, "step": 21 }, { "completion_length": 587.8021087646484, "epoch": 0.07062600321027288, "grad_norm": 0.1962890625, "kl": 0.0002475921137374826, "learning_rate": 9.940426894506606e-07, "loss": 0.0, "reward": 0.6196304857730865, "reward_std": 0.3361932933330536, "rewards/expression_based_accuracy_reward_length_penalized": 0.32015133649110794, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2994791753590107, "step": 22 }, { "completion_length": 518.0104217529297, "epoch": 0.0738362760834671, "grad_norm": 0.1904296875, "kl": 0.00022199605882633477, "learning_rate": 9.922242910178859e-07, "loss": 0.0, "reward": 0.737170621752739, "reward_std": 0.3162895292043686, "rewards/expression_based_accuracy_reward_length_penalized": 0.44485291838645935, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2923177182674408, "step": 23 }, { "completion_length": 539.7265625, "epoch": 0.07704654895666131, "grad_norm": 0.1943359375, "kl": 0.0002175298322981689, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": 0.7175260633230209, "reward_std": 0.3508952334523201, "rewards/expression_based_accuracy_reward_length_penalized": 0.40307293832302094, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3144531399011612, "step": 24 }, { "completion_length": 512.5755310058594, "epoch": 0.08025682182985554, "grad_norm": 0.224609375, "kl": 0.0002242086047772318, "learning_rate": 9.878701917609207e-07, "loss": 0.0, "reward": 0.6891498863697052, "reward_std": 0.3474579304456711, "rewards/expression_based_accuracy_reward_length_penalized": 0.38055606931447983, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3085937574505806, "step": 25 }, { "completion_length": 568.2630310058594, "epoch": 0.08346709470304976, "grad_norm": 0.20703125, "kl": 0.0002301457461726386, "learning_rate": 9.853368487582886e-07, "loss": 0.0, "reward": 0.6178770214319229, "reward_std": 0.35181906819343567, "rewards/expression_based_accuracy_reward_length_penalized": 0.3281634747982025, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2897135466337204, "step": 26 }, { "completion_length": 538.7916946411133, "epoch": 0.08667736757624397, "grad_norm": 0.2041015625, "kl": 0.00026182403962593526, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": 0.7029251009225845, "reward_std": 0.360026091337204, "rewards/expression_based_accuracy_reward_length_penalized": 0.39693548530340195, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3059896007180214, "step": 27 }, { "completion_length": 552.0781478881836, "epoch": 0.0898876404494382, "grad_norm": 0.19140625, "kl": 0.00023814345331629738, "learning_rate": 9.795644345114794e-07, "loss": 0.0, "reward": 0.7071576714515686, "reward_std": 0.33075109869241714, "rewards/expression_based_accuracy_reward_length_penalized": 0.4206993207335472, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2864583507180214, "step": 28 }, { "completion_length": 503.5703353881836, "epoch": 0.09309791332263243, "grad_norm": 0.2197265625, "kl": 0.00023130706176743843, "learning_rate": 9.76328489131448e-07, "loss": 0.0, "reward": 0.6565393060445786, "reward_std": 0.2805866673588753, "rewards/expression_based_accuracy_reward_length_penalized": 0.36096640676259995, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.295572929084301, "step": 29 }, { "completion_length": 532.4583435058594, "epoch": 0.09630818619582665, "grad_norm": 0.21484375, "kl": 0.00023216806584969163, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.6117298901081085, "reward_std": 0.32376599311828613, "rewards/expression_based_accuracy_reward_length_penalized": 0.3122507072985172, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.299479179084301, "step": 30 }, { "completion_length": 576.1484527587891, "epoch": 0.09951845906902086, "grad_norm": 0.1953125, "kl": 0.00021910631767241284, "learning_rate": 9.69165882516764e-07, "loss": 0.0, "reward": 0.6560553312301636, "reward_std": 0.3462247848510742, "rewards/expression_based_accuracy_reward_length_penalized": 0.37480536848306656, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2812500074505806, "step": 31 }, { "completion_length": 592.3385696411133, "epoch": 0.10272873194221509, "grad_norm": 0.1767578125, "kl": 0.0002467254307703115, "learning_rate": 9.65243099959949e-07, "loss": 0.0, "reward": 0.5856707692146301, "reward_std": 0.31634171307086945, "rewards/expression_based_accuracy_reward_length_penalized": 0.28033220022916794, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.305338554084301, "step": 32 }, { "completion_length": 583.9010620117188, "epoch": 0.10593900481540931, "grad_norm": 0.2265625, "kl": 0.00024941361698438413, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": 0.6140669733285904, "reward_std": 0.32649289071559906, "rewards/expression_based_accuracy_reward_length_penalized": 0.327608622610569, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2864583358168602, "step": 33 }, { "completion_length": 538.0364685058594, "epoch": 0.10914927768860354, "grad_norm": 0.208984375, "kl": 0.0002286795133841224, "learning_rate": 9.567251964768342e-07, "loss": 0.0, "reward": 0.6336007714271545, "reward_std": 0.32907337695360184, "rewards/expression_based_accuracy_reward_length_penalized": 0.34258514642715454, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2910156399011612, "step": 34 }, { "completion_length": 507.6510543823242, "epoch": 0.11235955056179775, "grad_norm": 0.255859375, "kl": 0.00024302997917402536, "learning_rate": 9.521346881455354e-07, "loss": 0.0, "reward": 0.7129171043634415, "reward_std": 0.35209202766418457, "rewards/expression_based_accuracy_reward_length_penalized": 0.40757858008146286, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3053385615348816, "step": 35 }, { "completion_length": 584.4531555175781, "epoch": 0.11556982343499198, "grad_norm": 0.2138671875, "kl": 0.00023655666518607177, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "reward": 0.6754663735628128, "reward_std": 0.33357472717761993, "rewards/expression_based_accuracy_reward_length_penalized": 0.3831486627459526, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2923177182674408, "step": 36 }, { "completion_length": 619.2396087646484, "epoch": 0.1187800963081862, "grad_norm": 0.1953125, "kl": 0.0002523561015550513, "learning_rate": 9.42302986163543e-07, "loss": 0.0, "reward": 0.6422896459698677, "reward_std": 0.3401486799120903, "rewards/expression_based_accuracy_reward_length_penalized": 0.3831750750541687, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2591145858168602, "step": 37 }, { "completion_length": 632.1067962646484, "epoch": 0.12199036918138041, "grad_norm": 0.19140625, "kl": 0.00025913729768944904, "learning_rate": 9.370671165529144e-07, "loss": 0.0, "reward": 0.5953093469142914, "reward_std": 0.33438971638679504, "rewards/expression_based_accuracy_reward_length_penalized": 0.3147103600203991, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2805989757180214, "step": 38 }, { "completion_length": 569.0026397705078, "epoch": 0.12520064205457465, "grad_norm": 0.208984375, "kl": 0.0002631417410157155, "learning_rate": 9.316216432703916e-07, "loss": 0.0, "reward": 0.6718064844608307, "reward_std": 0.3528323844075203, "rewards/expression_based_accuracy_reward_length_penalized": 0.3859991952776909, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2858073115348816, "step": 39 }, { "completion_length": 542.0260620117188, "epoch": 0.12841091492776885, "grad_norm": 0.2353515625, "kl": 0.0002535913408792112, "learning_rate": 9.259695151358214e-07, "loss": 0.0, "reward": 0.6311447024345398, "reward_std": 0.3200613558292389, "rewards/expression_based_accuracy_reward_length_penalized": 0.3459884449839592, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2851562574505806, "step": 40 }, { "completion_length": 573.9167022705078, "epoch": 0.13162118780096307, "grad_norm": 0.1923828125, "kl": 0.0002568592317402363, "learning_rate": 9.20113792876298e-07, "loss": 0.0, "reward": 0.6579329371452332, "reward_std": 0.33611204475164413, "rewards/expression_based_accuracy_reward_length_penalized": 0.3617089316248894, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2962239682674408, "step": 41 }, { "completion_length": 563.0416793823242, "epoch": 0.1348314606741573, "grad_norm": 0.205078125, "kl": 0.00026875592448050156, "learning_rate": 9.140576474687263e-07, "loss": 0.0, "reward": 0.6627669483423233, "reward_std": 0.3593166694045067, "rewards/expression_based_accuracy_reward_length_penalized": 0.3750064894556999, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2877604253590107, "step": 42 }, { "completion_length": 496.56251525878906, "epoch": 0.13804173354735153, "grad_norm": 0.2275390625, "kl": 0.0002509369187464472, "learning_rate": 9.078043584226815e-07, "loss": 0.0, "reward": 0.693062499165535, "reward_std": 0.3470332473516464, "rewards/expression_based_accuracy_reward_length_penalized": 0.3753541484475136, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3177083432674408, "step": 43 }, { "completion_length": 589.6536712646484, "epoch": 0.14125200642054575, "grad_norm": 0.1884765625, "kl": 0.0002775079774437472, "learning_rate": 9.013573120044966e-07, "loss": 0.0, "reward": 0.5451524406671524, "reward_std": 0.3420337289571762, "rewards/expression_based_accuracy_reward_length_penalized": 0.2665066123008728, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2786458432674408, "step": 44 }, { "completion_length": 515.5677261352539, "epoch": 0.14446227929373998, "grad_norm": 0.2431640625, "kl": 0.00026737275038613006, "learning_rate": 8.9471999940354e-07, "loss": 0.0, "reward": 0.6689368337392807, "reward_std": 0.3494330644607544, "rewards/expression_based_accuracy_reward_length_penalized": 0.3753170371055603, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2936197966337204, "step": 45 }, { "completion_length": 531.6041870117188, "epoch": 0.1476725521669342, "grad_norm": 0.2158203125, "kl": 0.00027584553754422814, "learning_rate": 8.878960148416747e-07, "loss": 0.0, "reward": 0.6247715353965759, "reward_std": 0.3459451347589493, "rewards/expression_based_accuracy_reward_length_penalized": 0.3357090353965759, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2890625149011612, "step": 46 }, { "completion_length": 523.8359527587891, "epoch": 0.1508828250401284, "grad_norm": 0.2109375, "kl": 0.0002594252800918184, "learning_rate": 8.808890536269229e-07, "loss": 0.0, "reward": 0.6625895947217941, "reward_std": 0.35964568704366684, "rewards/expression_based_accuracy_reward_length_penalized": 0.36180833727121353, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3007812574505806, "step": 47 }, { "completion_length": 572.2395935058594, "epoch": 0.15409309791332262, "grad_norm": 0.2001953125, "kl": 0.0002760118877631612, "learning_rate": 8.737029101523929e-07, "loss": 0.0, "reward": 0.6687695384025574, "reward_std": 0.3379608243703842, "rewards/expression_based_accuracy_reward_length_penalized": 0.36733726412057877, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.301432304084301, "step": 48 }, { "completion_length": 565.372428894043, "epoch": 0.15730337078651685, "grad_norm": 0.2109375, "kl": 0.00026545282889856026, "learning_rate": 8.663414758415478e-07, "loss": 0.0, "reward": 0.6460029184818268, "reward_std": 0.33386022597551346, "rewards/expression_based_accuracy_reward_length_penalized": 0.3458726927638054, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3001302108168602, "step": 49 }, { "completion_length": 540.8411560058594, "epoch": 0.16051364365971107, "grad_norm": 0.2177734375, "kl": 0.0002867219809559174, "learning_rate": 8.588087370409302e-07, "loss": 0.0, "reward": 0.6432211250066757, "reward_std": 0.35255035012960434, "rewards/expression_based_accuracy_reward_length_penalized": 0.3235596604645252, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3196614682674408, "step": 50 }, { "completion_length": 583.2864685058594, "epoch": 0.1637239165329053, "grad_norm": 0.220703125, "kl": 0.0003001616059918888, "learning_rate": 8.511087728614862e-07, "loss": 0.0, "reward": 0.6296520233154297, "reward_std": 0.3602987676858902, "rewards/expression_based_accuracy_reward_length_penalized": 0.3295218013226986, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3001302182674408, "step": 51 }, { "completion_length": 596.6302185058594, "epoch": 0.16693418940609953, "grad_norm": 0.2412109375, "kl": 0.0002572698904259596, "learning_rate": 8.432457529696548e-07, "loss": 0.0, "reward": 0.6288764774799347, "reward_std": 0.3630865290760994, "rewards/expression_based_accuracy_reward_length_penalized": 0.3443712741136551, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2845052257180214, "step": 52 }, { "completion_length": 476.7343978881836, "epoch": 0.17014446227929375, "grad_norm": 0.23828125, "kl": 0.0003045099292648956, "learning_rate": 8.352239353294194e-07, "loss": 0.0, "reward": 0.6977786123752594, "reward_std": 0.36942026019096375, "rewards/expression_based_accuracy_reward_length_penalized": 0.3748619332909584, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.322916679084301, "step": 53 }, { "completion_length": 594.0599060058594, "epoch": 0.17335473515248795, "grad_norm": 0.1982421875, "kl": 0.0002901406696764752, "learning_rate": 8.270476638965461e-07, "loss": 0.0, "reward": 0.614113561809063, "reward_std": 0.30325619876384735, "rewards/expression_based_accuracy_reward_length_penalized": 0.30291564762592316, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3111979216337204, "step": 54 }, { "completion_length": 540.8698120117188, "epoch": 0.17656500802568217, "grad_norm": 0.2177734375, "kl": 0.0002815077095874585, "learning_rate": 8.187213662662538e-07, "loss": 0.0, "reward": 0.7013998925685883, "reward_std": 0.345312163233757, "rewards/expression_based_accuracy_reward_length_penalized": 0.39671240001916885, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3046875074505806, "step": 55 }, { "completion_length": 551.1432342529297, "epoch": 0.1797752808988764, "grad_norm": 0.208984375, "kl": 0.0003022913369932212, "learning_rate": 8.102495512755938e-07, "loss": 0.0, "reward": 0.6621358841657639, "reward_std": 0.3478364497423172, "rewards/expression_based_accuracy_reward_length_penalized": 0.35614627599716187, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3059896007180214, "step": 56 }, { "completion_length": 541.5338668823242, "epoch": 0.18298555377207062, "grad_norm": 0.251953125, "kl": 0.00029883202660130337, "learning_rate": 8.01636806561836e-07, "loss": 0.0, "reward": 0.6321840733289719, "reward_std": 0.3268617168068886, "rewards/expression_based_accuracy_reward_length_penalized": 0.3522360995411873, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2799479216337204, "step": 57 }, { "completion_length": 534.2135620117188, "epoch": 0.18619582664526485, "grad_norm": 0.2177734375, "kl": 0.00031317536922870204, "learning_rate": 7.928877960781808e-07, "loss": 0.0, "reward": 0.6300312578678131, "reward_std": 0.31237364560365677, "rewards/expression_based_accuracy_reward_length_penalized": 0.3422708138823509, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2877604216337204, "step": 58 }, { "completion_length": 569.7265930175781, "epoch": 0.18940609951845908, "grad_norm": 0.1943359375, "kl": 0.0002944675215985626, "learning_rate": 7.840072575681468e-07, "loss": 0.0, "reward": 0.6045078411698341, "reward_std": 0.33760548382997513, "rewards/expression_based_accuracy_reward_length_penalized": 0.30958598107099533, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2949218824505806, "step": 59 }, { "completion_length": 546.7135467529297, "epoch": 0.1926163723916533, "grad_norm": 0.244140625, "kl": 0.0003155921949655749, "learning_rate": 7.75e-07, "loss": 0.0, "reward": 0.6555080115795135, "reward_std": 0.32254888117313385, "rewards/expression_based_accuracy_reward_length_penalized": 0.33779964968562126, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3177083432674408, "step": 60 }, { "completion_length": 544.5573120117188, "epoch": 0.1958266452648475, "grad_norm": 0.193359375, "kl": 0.00029893887403886765, "learning_rate": 7.658709009626109e-07, "loss": 0.0, "reward": 0.6744174212217331, "reward_std": 0.33529237657785416, "rewards/expression_based_accuracy_reward_length_penalized": 0.3684278205037117, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3059895932674408, "step": 61 }, { "completion_length": 499.6224136352539, "epoch": 0.19903691813804172, "grad_norm": 0.2353515625, "kl": 0.00032137856032932177, "learning_rate": 7.566249040241553e-07, "loss": 0.0, "reward": 0.6523666083812714, "reward_std": 0.32566210627555847, "rewards/expression_based_accuracy_reward_length_penalized": 0.34898117184638977, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3033854216337204, "step": 62 }, { "completion_length": 578.7239837646484, "epoch": 0.20224719101123595, "grad_norm": 0.20703125, "kl": 0.0003287481522420421, "learning_rate": 7.472670160550848e-07, "loss": 0.0, "reward": 0.634161502122879, "reward_std": 0.34120889008045197, "rewards/expression_based_accuracy_reward_length_penalized": 0.3372865132987499, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2968750074505806, "step": 63 }, { "completion_length": 506.3177261352539, "epoch": 0.20545746388443017, "grad_norm": 0.2412109375, "kl": 0.0003212923475075513, "learning_rate": 7.37802304516818e-07, "loss": 0.0, "reward": 0.6933595240116119, "reward_std": 0.3754495605826378, "rewards/expression_based_accuracy_reward_length_penalized": 0.38085950165987015, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3125000074505806, "step": 64 }, { "completion_length": 581.5833587646484, "epoch": 0.2086677367576244, "grad_norm": 0.181640625, "kl": 0.00029418900521704927, "learning_rate": 7.282358947176205e-07, "loss": 0.0, "reward": 0.6189248859882355, "reward_std": 0.33084874600172043, "rewards/expression_based_accuracy_reward_length_penalized": 0.3142373785376549, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3046875149011612, "step": 65 }, { "completion_length": 534.5729446411133, "epoch": 0.21187800963081863, "grad_norm": 0.224609375, "kl": 0.00033117266866611317, "learning_rate": 7.185729670371604e-07, "loss": 0.0, "reward": 0.6608574390411377, "reward_std": 0.31472062319517136, "rewards/expression_based_accuracy_reward_length_penalized": 0.3600761741399765, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3007812574505806, "step": 66 }, { "completion_length": 517.5755386352539, "epoch": 0.21508828250401285, "grad_norm": 0.23828125, "kl": 0.00034513785067247227, "learning_rate": 7.08818754121241e-07, "loss": 0.0, "reward": 0.6840898096561432, "reward_std": 0.3518378511071205, "rewards/expression_based_accuracy_reward_length_penalized": 0.3644283339381218, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3196614682674408, "step": 67 }, { "completion_length": 534.2578353881836, "epoch": 0.21829855537720708, "grad_norm": 0.20703125, "kl": 0.00032389759144280106, "learning_rate": 6.989785380482312e-07, "loss": 0.0, "reward": 0.7169905304908752, "reward_std": 0.3356803208589554, "rewards/expression_based_accuracy_reward_length_penalized": 0.39472493529319763, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3222656324505806, "step": 68 }, { "completion_length": 593.1797027587891, "epoch": 0.22150882825040127, "grad_norm": 0.205078125, "kl": 0.00034336688258918, "learning_rate": 6.890576474687263e-07, "loss": 0.0, "reward": 0.6631477773189545, "reward_std": 0.37854011356830597, "rewards/expression_based_accuracy_reward_length_penalized": 0.34023110568523407, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.322916679084301, "step": 69 }, { "completion_length": 629.6015625, "epoch": 0.2247191011235955, "grad_norm": 0.185546875, "kl": 0.00032993722561514005, "learning_rate": 6.790614547199906e-07, "loss": 0.0, "reward": 0.5925078019499779, "reward_std": 0.3088828846812248, "rewards/expression_based_accuracy_reward_length_penalized": 0.3125598691403866, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.279947929084301, "step": 70 }, { "completion_length": 559.5963897705078, "epoch": 0.22792937399678972, "grad_norm": 0.2255859375, "kl": 0.0003137872990919277, "learning_rate": 6.68995372916741e-07, "loss": 0.0, "reward": 0.7026459574699402, "reward_std": 0.33306000381708145, "rewards/expression_based_accuracy_reward_length_penalized": 0.392750084400177, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3098958432674408, "step": 71 }, { "completion_length": 494.39845275878906, "epoch": 0.23113964686998395, "grad_norm": 0.2275390625, "kl": 0.0003587143437471241, "learning_rate": 6.588648530198504e-07, "loss": 0.0, "reward": 0.6391649395227432, "reward_std": 0.3157573267817497, "rewards/expression_based_accuracy_reward_length_penalized": 0.32080554217100143, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3183593824505806, "step": 72 }, { "completion_length": 583.8672027587891, "epoch": 0.23434991974317818, "grad_norm": 0.1875, "kl": 0.0002944624357041903, "learning_rate": 6.486753808845564e-07, "loss": 0.0, "reward": 0.6462114006280899, "reward_std": 0.33720824867486954, "rewards/expression_based_accuracy_reward_length_penalized": 0.3584509789943695, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.287760429084301, "step": 73 }, { "completion_length": 509.7161560058594, "epoch": 0.2375601926163724, "grad_norm": 0.2333984375, "kl": 0.00037064859498059377, "learning_rate": 6.384324742897735e-07, "loss": 0.0, "reward": 0.6612931340932846, "reward_std": 0.3572119027376175, "rewards/expression_based_accuracy_reward_length_penalized": 0.3448868505656719, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3164062649011612, "step": 74 }, { "completion_length": 549.5651245117188, "epoch": 0.24077046548956663, "grad_norm": 0.197265625, "kl": 0.00032304248452419415, "learning_rate": 6.281416799501187e-07, "loss": 0.0, "reward": 0.6797159165143967, "reward_std": 0.34857943654060364, "rewards/expression_based_accuracy_reward_length_penalized": 0.36135654896497726, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3183593824505806, "step": 75 }, { "completion_length": 572.7578353881836, "epoch": 0.24398073836276082, "grad_norm": 0.2119140625, "kl": 0.0003249031215091236, "learning_rate": 6.178085705122674e-07, "loss": 0.0, "reward": 0.6995292603969574, "reward_std": 0.3806586042046547, "rewards/expression_based_accuracy_reward_length_penalized": 0.3727063462138176, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3268229216337204, "step": 76 }, { "completion_length": 507.5078353881836, "epoch": 0.24719101123595505, "grad_norm": 0.255859375, "kl": 0.0003559839096851647, "learning_rate": 6.074387415372676e-07, "loss": 0.0, "reward": 0.7540216147899628, "reward_std": 0.38066261261701584, "rewards/expression_based_accuracy_reward_length_penalized": 0.43045392632484436, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3235677182674408, "step": 77 }, { "completion_length": 618.5390930175781, "epoch": 0.2504012841091493, "grad_norm": 0.220703125, "kl": 0.0003840129793388769, "learning_rate": 5.97037808470444e-07, "loss": 0.0, "reward": 0.5318701416254044, "reward_std": 0.35173140466213226, "rewards/expression_based_accuracy_reward_length_penalized": 0.26559409499168396, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2662760466337204, "step": 78 }, { "completion_length": 514.5781555175781, "epoch": 0.2536115569823435, "grad_norm": 0.234375, "kl": 0.00037649404839612544, "learning_rate": 5.866114036005362e-07, "loss": 0.0, "reward": 0.677052691578865, "reward_std": 0.36026471108198166, "rewards/expression_based_accuracy_reward_length_penalized": 0.35348496586084366, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3235677257180214, "step": 79 }, { "completion_length": 546.5338745117188, "epoch": 0.2568218298555377, "grad_norm": 0.19921875, "kl": 0.0003384711453691125, "learning_rate": 5.761651730097142e-07, "loss": 0.0, "reward": 0.6351290941238403, "reward_std": 0.34162163734436035, "rewards/expression_based_accuracy_reward_length_penalized": 0.34281135350465775, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2923177257180214, "step": 80 }, { "completion_length": 545.1432495117188, "epoch": 0.26003210272873195, "grad_norm": 0.2001953125, "kl": 0.0003302170734968968, "learning_rate": 5.657047735161255e-07, "loss": 0.0, "reward": 0.7321957647800446, "reward_std": 0.3832404538989067, "rewards/expression_based_accuracy_reward_length_penalized": 0.42946138232946396, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3027343824505806, "step": 81 }, { "completion_length": 602.4036712646484, "epoch": 0.26324237560192615, "grad_norm": 0.169921875, "kl": 0.0003239936995669268, "learning_rate": 5.552358696106288e-07, "loss": 0.0, "reward": 0.6142081022262573, "reward_std": 0.33728527277708054, "rewards/expression_based_accuracy_reward_length_penalized": 0.31277579814195633, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.301432304084301, "step": 82 }, { "completion_length": 562.7057495117188, "epoch": 0.2664526484751204, "grad_norm": 0.240234375, "kl": 0.00037012308894190937, "learning_rate": 5.447641303893714e-07, "loss": 0.0, "reward": 0.6191717982292175, "reward_std": 0.3545895963907242, "rewards/expression_based_accuracy_reward_length_penalized": 0.31578636169433594, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.303385429084301, "step": 83 }, { "completion_length": 543.9427261352539, "epoch": 0.2696629213483146, "grad_norm": 0.2314453125, "kl": 0.00037831455847481266, "learning_rate": 5.342952264838747e-07, "loss": 0.0, "reward": 0.7242841571569443, "reward_std": 0.3670550063252449, "rewards/expression_based_accuracy_reward_length_penalized": 0.4020185172557831, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3222656324505806, "step": 84 }, { "completion_length": 558.5781555175781, "epoch": 0.27287319422150885, "grad_norm": 0.22265625, "kl": 0.00036308395647211, "learning_rate": 5.238348269902859e-07, "loss": 0.0, "reward": 0.6587125062942505, "reward_std": 0.36182229965925217, "rewards/expression_based_accuracy_reward_length_penalized": 0.3572801947593689, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.301432304084301, "step": 85 }, { "completion_length": 581.3099060058594, "epoch": 0.27608346709470305, "grad_norm": 0.23046875, "kl": 0.00038044428947614506, "learning_rate": 5.133885963994639e-07, "loss": 0.0, "reward": 0.6719960719347, "reward_std": 0.3624914661049843, "rewards/expression_based_accuracy_reward_length_penalized": 0.3588450253009796, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.313151054084301, "step": 86 }, { "completion_length": 580.1745147705078, "epoch": 0.27929373996789725, "grad_norm": 0.1767578125, "kl": 0.00034601552761159837, "learning_rate": 5.02962191529556e-07, "loss": 0.0, "reward": 0.6323724538087845, "reward_std": 0.32785172015428543, "rewards/expression_based_accuracy_reward_length_penalized": 0.3433099538087845, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2890625074505806, "step": 87 }, { "completion_length": 530.0156326293945, "epoch": 0.2825040128410915, "grad_norm": 0.220703125, "kl": 0.00040156069735530764, "learning_rate": 4.925612584627324e-07, "loss": 0.0, "reward": 0.7260984629392624, "reward_std": 0.38204891979694366, "rewards/expression_based_accuracy_reward_length_penalized": 0.3940671756863594, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.33203125, "step": 88 }, { "completion_length": 548.8020935058594, "epoch": 0.2857142857142857, "grad_norm": 0.248046875, "kl": 0.0004189757601125166, "learning_rate": 4.821914294877326e-07, "loss": 0.0, "reward": 0.6541236937046051, "reward_std": 0.344281330704689, "rewards/expression_based_accuracy_reward_length_penalized": 0.3533423990011215, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3007812574505806, "step": 89 }, { "completion_length": 609.1432342529297, "epoch": 0.28892455858747995, "grad_norm": 0.1982421875, "kl": 0.0003810434936895035, "learning_rate": 4.7185832004988133e-07, "loss": 0.0, "reward": 0.5851198732852936, "reward_std": 0.32441411167383194, "rewards/expression_based_accuracy_reward_length_penalized": 0.2947552725672722, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2903645858168602, "step": 90 }, { "completion_length": 574.8021087646484, "epoch": 0.29213483146067415, "grad_norm": 0.2255859375, "kl": 0.0003523045379552059, "learning_rate": 4.6156752571022637e-07, "loss": 0.0, "reward": 0.6396794319152832, "reward_std": 0.33973030745983124, "rewards/expression_based_accuracy_reward_length_penalized": 0.3525700494647026, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.287109375, "step": 91 }, { "completion_length": 551.4505462646484, "epoch": 0.2953451043338684, "grad_norm": 0.1845703125, "kl": 0.00035572806518757716, "learning_rate": 4.513246191154434e-07, "loss": 0.0, "reward": 0.6876581907272339, "reward_std": 0.3704243451356888, "rewards/expression_based_accuracy_reward_length_penalized": 0.3888300210237503, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2988281399011612, "step": 92 }, { "completion_length": 560.6718902587891, "epoch": 0.2985553772070626, "grad_norm": 0.2001953125, "kl": 0.0003872549714287743, "learning_rate": 4.4113514698014953e-07, "loss": 0.0, "reward": 0.6538409739732742, "reward_std": 0.35449104756116867, "rewards/expression_based_accuracy_reward_length_penalized": 0.35175760090351105, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3020833432674408, "step": 93 }, { "completion_length": 607.7396087646484, "epoch": 0.3017656500802568, "grad_norm": 0.263671875, "kl": 0.0003801950879278593, "learning_rate": 4.3100462708325914e-07, "loss": 0.0, "reward": 0.5898270905017853, "reward_std": 0.3407137244939804, "rewards/expression_based_accuracy_reward_length_penalized": 0.3066239655017853, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2832031287252903, "step": 94 }, { "completion_length": 496.19793701171875, "epoch": 0.30497592295345105, "grad_norm": 0.240234375, "kl": 0.0003671470913104713, "learning_rate": 4.209385452800095e-07, "loss": 0.0, "reward": 0.7184917479753494, "reward_std": 0.3648832216858864, "rewards/expression_based_accuracy_reward_length_penalized": 0.38646050542593, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3320312574505806, "step": 95 }, { "completion_length": 508.57554626464844, "epoch": 0.30818619582664525, "grad_norm": 0.2265625, "kl": 0.00038119566306704655, "learning_rate": 4.1094235253127374e-07, "loss": 0.0, "reward": 0.6568552851676941, "reward_std": 0.3511122092604637, "rewards/expression_based_accuracy_reward_length_penalized": 0.3293813392519951, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3274739682674408, "step": 96 }, { "completion_length": 549.0312652587891, "epoch": 0.3113964686998395, "grad_norm": 0.21484375, "kl": 0.0003632343214121647, "learning_rate": 4.0102146195176887e-07, "loss": 0.0, "reward": 0.7204606682062149, "reward_std": 0.3499609977006912, "rewards/expression_based_accuracy_reward_length_penalized": 0.4001481980085373, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3203125074505806, "step": 97 }, { "completion_length": 490.3802261352539, "epoch": 0.3146067415730337, "grad_norm": 0.2255859375, "kl": 0.00044602488924283534, "learning_rate": 3.911812458787591e-07, "loss": 0.0, "reward": 0.6808420717716217, "reward_std": 0.35114526003599167, "rewards/expression_based_accuracy_reward_length_penalized": 0.3728993684053421, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3079427182674408, "step": 98 }, { "completion_length": 524.2682342529297, "epoch": 0.31781701444622795, "grad_norm": 0.20703125, "kl": 0.0003882949022226967, "learning_rate": 3.8142703296283953e-07, "loss": 0.0, "reward": 0.6448683142662048, "reward_std": 0.3429142013192177, "rewards/expression_based_accuracy_reward_length_penalized": 0.33367037773132324, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.311197929084301, "step": 99 }, { "completion_length": 557.1823043823242, "epoch": 0.32102728731942215, "grad_norm": 0.1884765625, "kl": 0.00035858208866557106, "learning_rate": 3.7176410528237945e-07, "loss": 0.0, "reward": 0.6761815696954727, "reward_std": 0.3675583600997925, "rewards/expression_based_accuracy_reward_length_penalized": 0.35326486080884933, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.322916679084301, "step": 100 }, { "completion_length": 564.6927337646484, "epoch": 0.32423756019261635, "grad_norm": 0.22265625, "kl": 0.00038343547930708155, "learning_rate": 3.62197695483182e-07, "loss": 0.0, "reward": 0.6524051502346992, "reward_std": 0.36947014927864075, "rewards/expression_based_accuracy_reward_length_penalized": 0.34055614471435547, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3118489682674408, "step": 101 }, { "completion_length": 551.9453353881836, "epoch": 0.3274478330658106, "grad_norm": 0.2294921875, "kl": 0.0003793273790506646, "learning_rate": 3.5273298394491515e-07, "loss": 0.0, "reward": 0.6944572031497955, "reward_std": 0.37888605892658234, "rewards/expression_based_accuracy_reward_length_penalized": 0.37870199978351593, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3157552182674408, "step": 102 }, { "completion_length": 530.1771011352539, "epoch": 0.3306581059390048, "grad_norm": 0.22265625, "kl": 0.00038907503767404705, "learning_rate": 3.433750959758446e-07, "loss": 0.0, "reward": 0.6862371563911438, "reward_std": 0.3600939214229584, "rewards/expression_based_accuracy_reward_length_penalized": 0.3555079624056816, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3307291716337204, "step": 103 }, { "completion_length": 504.72398376464844, "epoch": 0.33386837881219905, "grad_norm": 0.2265625, "kl": 0.0004411861809785478, "learning_rate": 3.3412909903738936e-07, "loss": 0.0, "reward": 0.7003691345453262, "reward_std": 0.34579480439424515, "rewards/expression_based_accuracy_reward_length_penalized": 0.38917120546102524, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.311197929084301, "step": 104 }, { "completion_length": 579.5859527587891, "epoch": 0.33707865168539325, "grad_norm": 0.2060546875, "kl": 0.0003610364656196907, "learning_rate": 3.250000000000001e-07, "loss": 0.0, "reward": 0.7041359394788742, "reward_std": 0.3546976149082184, "rewards/expression_based_accuracy_reward_length_penalized": 0.3883807212114334, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3157552182674408, "step": 105 }, { "completion_length": 504.9349136352539, "epoch": 0.3402889245585875, "grad_norm": 0.2294921875, "kl": 0.0004345797060523182, "learning_rate": 3.159927424318531e-07, "loss": 0.0, "reward": 0.7195965945720673, "reward_std": 0.34991642087697983, "rewards/expression_based_accuracy_reward_length_penalized": 0.39863305538892746, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3209635466337204, "step": 106 }, { "completion_length": 521.9349060058594, "epoch": 0.3434991974317817, "grad_norm": 0.2333984375, "kl": 0.0004348123256932013, "learning_rate": 3.0711220392181934e-07, "loss": 0.0, "reward": 0.5767635926604271, "reward_std": 0.3495699018239975, "rewards/expression_based_accuracy_reward_length_penalized": 0.2896541878581047, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2871093824505806, "step": 107 }, { "completion_length": 578.0781402587891, "epoch": 0.3467094703049759, "grad_norm": 0.26171875, "kl": 0.0003971747573814355, "learning_rate": 2.9836319343816397e-07, "loss": 0.0, "reward": 0.5868955999612808, "reward_std": 0.3408031612634659, "rewards/expression_based_accuracy_reward_length_penalized": 0.29587996006011963, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2910156324505806, "step": 108 }, { "completion_length": 558.0286712646484, "epoch": 0.34991974317817015, "grad_norm": 0.1845703125, "kl": 0.00036870845360681415, "learning_rate": 2.897504487244061e-07, "loss": 0.0, "reward": 0.6787082105875015, "reward_std": 0.3448420986533165, "rewards/expression_based_accuracy_reward_length_penalized": 0.3570936322212219, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3216145932674408, "step": 109 }, { "completion_length": 547.6562652587891, "epoch": 0.35313001605136435, "grad_norm": 0.2158203125, "kl": 0.00039495840610470623, "learning_rate": 2.812786337337463e-07, "loss": 0.0, "reward": 0.5997674912214279, "reward_std": 0.32131277769804, "rewards/expression_based_accuracy_reward_length_penalized": 0.3054966703057289, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2942708432674408, "step": 110 }, { "completion_length": 552.3463745117188, "epoch": 0.3563402889245586, "grad_norm": 0.2578125, "kl": 0.00039361264498438686, "learning_rate": 2.729523361034538e-07, "loss": 0.0, "reward": 0.5880802720785141, "reward_std": 0.34414373338222504, "rewards/expression_based_accuracy_reward_length_penalized": 0.2827417254447937, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.305338554084301, "step": 111 }, { "completion_length": 542.1927185058594, "epoch": 0.3595505617977528, "grad_norm": 0.1904296875, "kl": 0.0003947726945625618, "learning_rate": 2.6477606467058035e-07, "loss": 0.0, "reward": 0.6639807671308517, "reward_std": 0.3379776254296303, "rewards/expression_based_accuracy_reward_length_penalized": 0.32934536039829254, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.334635429084301, "step": 112 }, { "completion_length": 550.3698043823242, "epoch": 0.36276083467094705, "grad_norm": 0.2119140625, "kl": 0.00041885858081514016, "learning_rate": 2.567542470303452e-07, "loss": 0.0, "reward": 0.6352178752422333, "reward_std": 0.3331167697906494, "rewards/expression_based_accuracy_reward_length_penalized": 0.3402960002422333, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2949218824505806, "step": 113 }, { "completion_length": 520.3073120117188, "epoch": 0.36597110754414125, "grad_norm": 0.2158203125, "kl": 0.00037509016692638397, "learning_rate": 2.488912271385139e-07, "loss": 0.0, "reward": 0.6496723890304565, "reward_std": 0.36061549186706543, "rewards/expression_based_accuracy_reward_length_penalized": 0.33131300657987595, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3183593824505806, "step": 114 }, { "completion_length": 545.2890701293945, "epoch": 0.36918138041733545, "grad_norm": 0.2412109375, "kl": 0.00040495285793440416, "learning_rate": 2.411912629590699e-07, "loss": 0.0, "reward": 0.6173844560980797, "reward_std": 0.3021947294473648, "rewards/expression_based_accuracy_reward_length_penalized": 0.3328792154788971, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2845052108168602, "step": 115 }, { "completion_length": 597.0130462646484, "epoch": 0.3723916532905297, "grad_norm": 0.205078125, "kl": 0.0003835263050859794, "learning_rate": 2.336585241584522e-07, "loss": 0.0, "reward": 0.6083859652280807, "reward_std": 0.34971795231103897, "rewards/expression_based_accuracy_reward_length_penalized": 0.31671928614377975, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2916666716337204, "step": 116 }, { "completion_length": 505.15106201171875, "epoch": 0.3756019261637239, "grad_norm": 0.259765625, "kl": 0.0004204789365758188, "learning_rate": 2.2629708984760706e-07, "loss": 0.0, "reward": 0.6160649359226227, "reward_std": 0.3238491714000702, "rewards/expression_based_accuracy_reward_length_penalized": 0.3263513892889023, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2897135466337204, "step": 117 }, { "completion_length": 567.8411712646484, "epoch": 0.37881219903691815, "grad_norm": 0.197265625, "kl": 0.0003820292549789883, "learning_rate": 2.1911094637307714e-07, "loss": 0.0, "reward": 0.5847776532173157, "reward_std": 0.33124052733182907, "rewards/expression_based_accuracy_reward_length_penalized": 0.2970172315835953, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2877604216337204, "step": 118 }, { "completion_length": 539.3724136352539, "epoch": 0.38202247191011235, "grad_norm": 0.212890625, "kl": 0.0003783565916819498, "learning_rate": 2.1210398515832536e-07, "loss": 0.0, "reward": 0.7074552923440933, "reward_std": 0.33786971867084503, "rewards/expression_based_accuracy_reward_length_penalized": 0.3910490423440933, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3164062574505806, "step": 119 }, { "completion_length": 578.6484527587891, "epoch": 0.3852327447833066, "grad_norm": 0.208984375, "kl": 0.00036553355312207714, "learning_rate": 2.0528000059645995e-07, "loss": 0.0, "reward": 0.6493179947137833, "reward_std": 0.35857032984495163, "rewards/expression_based_accuracy_reward_length_penalized": 0.35374507308006287, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.295572929084301, "step": 120 }, { "completion_length": 516.8437576293945, "epoch": 0.3884430176565008, "grad_norm": 0.2373046875, "kl": 0.00045376412163022906, "learning_rate": 1.986426879955034e-07, "loss": 0.0, "reward": 0.684567391872406, "reward_std": 0.3590117618441582, "rewards/expression_based_accuracy_reward_length_penalized": 0.3818329870700836, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3027343824505806, "step": 121 }, { "completion_length": 551.9687652587891, "epoch": 0.391653290529695, "grad_norm": 0.193359375, "kl": 0.0003975575600634329, "learning_rate": 1.9219564157731844e-07, "loss": 0.0, "reward": 0.6631377786397934, "reward_std": 0.377517007291317, "rewards/expression_based_accuracy_reward_length_penalized": 0.3408721387386322, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3222656399011612, "step": 122 }, { "completion_length": 525.5026245117188, "epoch": 0.39486356340288925, "grad_norm": 0.21875, "kl": 0.00042099927668459713, "learning_rate": 1.8594235253127372e-07, "loss": 0.0, "reward": 0.7239128798246384, "reward_std": 0.35999199748039246, "rewards/expression_based_accuracy_reward_length_penalized": 0.40815767645835876, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3157552182674408, "step": 123 }, { "completion_length": 523.8567886352539, "epoch": 0.39807383627608345, "grad_norm": 0.318359375, "kl": 0.00044889742275699973, "learning_rate": 1.7988620712370195e-07, "loss": 0.0, "reward": 0.716105192899704, "reward_std": 0.345996156334877, "rewards/expression_based_accuracy_reward_length_penalized": 0.4315999895334244, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2845052182674408, "step": 124 }, { "completion_length": 522.7838668823242, "epoch": 0.4012841091492777, "grad_norm": 0.255859375, "kl": 0.00039373226172756404, "learning_rate": 1.7403048486417868e-07, "loss": 0.0, "reward": 0.6855793744325638, "reward_std": 0.3608446344733238, "rewards/expression_based_accuracy_reward_length_penalized": 0.3704752177000046, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.315104179084301, "step": 125 }, { "completion_length": 557.4557495117188, "epoch": 0.4044943820224719, "grad_norm": 0.2060546875, "kl": 0.00039951602957444265, "learning_rate": 1.6837835672960831e-07, "loss": 0.0, "reward": 0.5974871069192886, "reward_std": 0.3423160910606384, "rewards/expression_based_accuracy_reward_length_penalized": 0.27131520584225655, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3261718824505806, "step": 126 }, { "completion_length": 578.3697967529297, "epoch": 0.40770465489566615, "grad_norm": 0.2041015625, "kl": 0.00037851801607757807, "learning_rate": 1.6293288344708566e-07, "loss": 0.0, "reward": 0.633305624127388, "reward_std": 0.372529074549675, "rewards/expression_based_accuracy_reward_length_penalized": 0.3253629058599472, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3079427108168602, "step": 127 }, { "completion_length": 535.1666870117188, "epoch": 0.41091492776886035, "grad_norm": 0.21484375, "kl": 0.0003694754414027557, "learning_rate": 1.5769701383645698e-07, "loss": 0.0, "reward": 0.6848493814468384, "reward_std": 0.344666950404644, "rewards/expression_based_accuracy_reward_length_penalized": 0.3814639300107956, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3033854216337204, "step": 128 }, { "completion_length": 513.1224060058594, "epoch": 0.41412520064205455, "grad_norm": 0.19921875, "kl": 0.0003918400325346738, "learning_rate": 1.5267358321348285e-07, "loss": 0.0, "reward": 0.6448424756526947, "reward_std": 0.3401818424463272, "rewards/expression_based_accuracy_reward_length_penalized": 0.3212747722864151, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3235677257180214, "step": 129 }, { "completion_length": 541.6172027587891, "epoch": 0.4173354735152488, "grad_norm": 0.2060546875, "kl": 0.0003936137800337747, "learning_rate": 1.4786531185446452e-07, "loss": 0.0, "reward": 0.583847850561142, "reward_std": 0.33960337191820145, "rewards/expression_based_accuracy_reward_length_penalized": 0.27004576474428177, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3138020858168602, "step": 130 }, { "completion_length": 533.5599136352539, "epoch": 0.420545746388443, "grad_norm": 0.2197265625, "kl": 0.00039682938950136304, "learning_rate": 1.432748035231658e-07, "loss": 0.0, "reward": 0.6769755631685257, "reward_std": 0.3399392068386078, "rewards/expression_based_accuracy_reward_length_penalized": 0.3683818504214287, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3085937649011612, "step": 131 }, { "completion_length": 523.9271087646484, "epoch": 0.42375601926163725, "grad_norm": 0.2265625, "kl": 0.00040404664468951523, "learning_rate": 1.3890454406082956e-07, "loss": 0.0, "reward": 0.6483045816421509, "reward_std": 0.32681532204151154, "rewards/expression_based_accuracy_reward_length_penalized": 0.3390597552061081, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.309244804084301, "step": 132 }, { "completion_length": 471.9661636352539, "epoch": 0.42696629213483145, "grad_norm": 0.24609375, "kl": 0.00040609255665913224, "learning_rate": 1.3475690004005097e-07, "loss": 0.0, "reward": 0.7119551748037338, "reward_std": 0.34096624702215195, "rewards/expression_based_accuracy_reward_length_penalized": 0.39619994908571243, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3157552182674408, "step": 133 }, { "completion_length": 565.4192962646484, "epoch": 0.4301765650080257, "grad_norm": 0.205078125, "kl": 0.00037678072112612426, "learning_rate": 1.308341174832359e-07, "loss": 0.0, "reward": 0.6749380528926849, "reward_std": 0.3803337290883064, "rewards/expression_based_accuracy_reward_length_penalized": 0.37480782717466354, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3001302182674408, "step": 134 }, { "completion_length": 496.27345275878906, "epoch": 0.4333868378812199, "grad_norm": 0.27734375, "kl": 0.0004564332193695009, "learning_rate": 1.2713832064634125e-07, "loss": 0.0, "reward": 0.7301954329013824, "reward_std": 0.3306322991847992, "rewards/expression_based_accuracy_reward_length_penalized": 0.41378918290138245, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3164062649011612, "step": 135 }, { "completion_length": 530.8125228881836, "epoch": 0.43659711075441415, "grad_norm": 0.205078125, "kl": 0.0003717996005434543, "learning_rate": 1.2367151086855187e-07, "loss": 0.0, "reward": 0.6495877057313919, "reward_std": 0.3487004414200783, "rewards/expression_based_accuracy_reward_length_penalized": 0.3253689482808113, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3242187649011612, "step": 136 }, { "completion_length": 554.2031402587891, "epoch": 0.43980738362760835, "grad_norm": 0.2109375, "kl": 0.0003636257752077654, "learning_rate": 1.2043556548852063e-07, "loss": 0.0, "reward": 0.5865623354911804, "reward_std": 0.30131980776786804, "rewards/expression_based_accuracy_reward_length_penalized": 0.3040102533996105, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2825520932674408, "step": 137 }, { "completion_length": 545.6093826293945, "epoch": 0.44301765650080255, "grad_norm": 0.22265625, "kl": 0.00041512529423926026, "learning_rate": 1.1743223682775649e-07, "loss": 0.0, "reward": 0.6579451262950897, "reward_std": 0.3593253716826439, "rewards/expression_based_accuracy_reward_length_penalized": 0.36367426812648773, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2942708432674408, "step": 138 }, { "completion_length": 573.9088745117188, "epoch": 0.4462279293739968, "grad_norm": 0.1982421875, "kl": 0.0003498100923025049, "learning_rate": 1.1466315124171128e-07, "loss": 0.0, "reward": 0.6012589037418365, "reward_std": 0.34214527904987335, "rewards/expression_based_accuracy_reward_length_penalized": 0.31414950639009476, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2871093899011612, "step": 139 }, { "completion_length": 546.6432495117188, "epoch": 0.449438202247191, "grad_norm": 0.21875, "kl": 0.0004052919539390132, "learning_rate": 1.1212980823907929e-07, "loss": 0.0, "reward": 0.63412706553936, "reward_std": 0.36361514031887054, "rewards/expression_based_accuracy_reward_length_penalized": 0.33920522779226303, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2949218899011612, "step": 140 }, { "completion_length": 547.6432495117188, "epoch": 0.45264847512038525, "grad_norm": 0.2001953125, "kl": 0.0003856433249893598, "learning_rate": 1.0983357966978745e-07, "loss": 0.0, "reward": 0.7091409862041473, "reward_std": 0.3494722992181778, "rewards/expression_based_accuracy_reward_length_penalized": 0.37906285375356674, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3300781324505806, "step": 141 }, { "completion_length": 562.7890777587891, "epoch": 0.45585874799357945, "grad_norm": 0.28515625, "kl": 0.0004331854870542884, "learning_rate": 1.0777570898211405e-07, "loss": 0.0, "reward": 0.677094116806984, "reward_std": 0.36977435648441315, "rewards/expression_based_accuracy_reward_length_penalized": 0.35808368027210236, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.319010429084301, "step": 142 }, { "completion_length": 514.6145858764648, "epoch": 0.4590690208667737, "grad_norm": 0.26171875, "kl": 0.0004561090827337466, "learning_rate": 1.0595731054933934e-07, "loss": 0.0, "reward": 0.7047944366931915, "reward_std": 0.3853035420179367, "rewards/expression_based_accuracy_reward_length_penalized": 0.39815381169319153, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3066406399011612, "step": 143 }, { "completion_length": 515.0520858764648, "epoch": 0.4622792937399679, "grad_norm": 0.228515625, "kl": 0.00042895031219813973, "learning_rate": 1.0437936906629334e-07, "loss": 0.0, "reward": 0.687195435166359, "reward_std": 0.39286451041698456, "rewards/expression_based_accuracy_reward_length_penalized": 0.37925272434949875, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3079427257180214, "step": 144 }, { "completion_length": 532.7969055175781, "epoch": 0.4654895666131621, "grad_norm": 0.212890625, "kl": 0.00040866951167117804, "learning_rate": 1.0304273901612565e-07, "loss": 0.0, "reward": 0.7079404592514038, "reward_std": 0.3612729534506798, "rewards/expression_based_accuracy_reward_length_penalized": 0.3934873268008232, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3144531324505806, "step": 145 }, { "completion_length": 552.0989837646484, "epoch": 0.46869983948635635, "grad_norm": 0.2158203125, "kl": 0.0003939080925192684, "learning_rate": 1.0194814420758804e-07, "loss": 0.0, "reward": 0.6515837609767914, "reward_std": 0.3383214473724365, "rewards/expression_based_accuracy_reward_length_penalized": 0.3384326733648777, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3131510466337204, "step": 146 }, { "completion_length": 595.5234527587891, "epoch": 0.47191011235955055, "grad_norm": 0.2099609375, "kl": 0.0003522088081808761, "learning_rate": 1.0109617738307911e-07, "loss": 0.0, "reward": 0.5905841588973999, "reward_std": 0.36369770765304565, "rewards/expression_based_accuracy_reward_length_penalized": 0.3151935264468193, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.2753906287252903, "step": 147 }, { "completion_length": 493.57032012939453, "epoch": 0.4751203852327448, "grad_norm": 0.228515625, "kl": 0.0003881813900079578, "learning_rate": 1.0048729989766394e-07, "loss": 0.0, "reward": 0.7446072101593018, "reward_std": 0.37431684136390686, "rewards/expression_based_accuracy_reward_length_penalized": 0.42234158515930176, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3222656399011612, "step": 148 }, { "completion_length": 543.2708587646484, "epoch": 0.478330658105939, "grad_norm": 0.1953125, "kl": 0.00041060569492401555, "learning_rate": 1.0012184146924223e-07, "loss": 0.0, "reward": 0.6233467310667038, "reward_std": 0.3531793877482414, "rewards/expression_based_accuracy_reward_length_penalized": 0.3141019344329834, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3092447966337204, "step": 149 }, { "completion_length": 487.1823043823242, "epoch": 0.48154093097913325, "grad_norm": 0.28515625, "kl": 0.0004451891945791431, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.7395021021366119, "reward_std": 0.35496869683265686, "rewards/expression_based_accuracy_reward_length_penalized": 0.41202811151742935, "rewards/format_reward": 0.0, "rewards/soft_format_reward": 0.0, "rewards/tag_count_reward": 0.3274739682674408, "step": 150 } ], "logging_steps": 1, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }