|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 375, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 270.8035888671875, |
|
"epoch": 0.0026666666666666666, |
|
"grad_norm": 0.7334129323891274, |
|
"kl": 0.0001461505889892578, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0712, |
|
"reward": 0.6089551746845245, |
|
"reward_std": 0.1534397415816784, |
|
"rewards/length_reward": 0.06294643878936768, |
|
"rewards/similarity_reward": 0.546008750796318, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 248.49555206298828, |
|
"epoch": 0.005333333333333333, |
|
"grad_norm": 0.9035152344267738, |
|
"kl": 0.00016069412231445312, |
|
"learning_rate": 2e-06, |
|
"loss": 0.06, |
|
"reward": 0.5894998908042908, |
|
"reward_std": 0.17237309366464615, |
|
"rewards/length_reward": 0.061160728335380554, |
|
"rewards/similarity_reward": 0.528339147567749, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 271.5848388671875, |
|
"epoch": 0.008, |
|
"grad_norm": 0.8948061881977104, |
|
"kl": 0.00017070770263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0442, |
|
"reward": 0.6276047825813293, |
|
"reward_std": 0.15094101428985596, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.5561762154102325, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 257.46429443359375, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.7848935176364024, |
|
"kl": 0.00016450881958007812, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0583, |
|
"reward": 0.5882956981658936, |
|
"reward_std": 0.1704816073179245, |
|
"rewards/length_reward": 0.06473215110599995, |
|
"rewards/similarity_reward": 0.5235635340213776, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 240.4821548461914, |
|
"epoch": 0.013333333333333334, |
|
"grad_norm": 0.7884926165963533, |
|
"kl": 0.000179290771484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0444, |
|
"reward": 0.6010282635688782, |
|
"reward_std": 0.10747816786170006, |
|
"rewards/length_reward": 0.058482151478528976, |
|
"rewards/similarity_reward": 0.5425460636615753, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 271.5758972167969, |
|
"epoch": 0.016, |
|
"grad_norm": 0.7176003677981444, |
|
"kl": 0.0001480579376220703, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1772, |
|
"reward": 0.632064938545227, |
|
"reward_std": 0.13313144445419312, |
|
"rewards/length_reward": 0.06562501192092896, |
|
"rewards/similarity_reward": 0.5664399564266205, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 248.83483123779297, |
|
"epoch": 0.018666666666666668, |
|
"grad_norm": 0.583690210979672, |
|
"kl": 0.000118255615234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.01, |
|
"reward": 0.6557432115077972, |
|
"reward_std": 0.09923132508993149, |
|
"rewards/length_reward": 0.0714285783469677, |
|
"rewards/similarity_reward": 0.584314614534378, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 240.2991180419922, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 0.8285932775348754, |
|
"kl": 0.00018167495727539062, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0352, |
|
"reward": 0.5436868965625763, |
|
"reward_std": 0.19458478689193726, |
|
"rewards/length_reward": 0.053571440279483795, |
|
"rewards/similarity_reward": 0.4901154637336731, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 274.74554443359375, |
|
"epoch": 0.024, |
|
"grad_norm": 0.6297183637280267, |
|
"kl": 0.00014972686767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0198, |
|
"reward": 0.6230277419090271, |
|
"reward_std": 0.15007304400205612, |
|
"rewards/length_reward": 0.06562501192092896, |
|
"rewards/similarity_reward": 0.5574026703834534, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 271.2991180419922, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.6908976518587052, |
|
"kl": 0.00015354156494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0383, |
|
"reward": 0.6355253756046295, |
|
"reward_std": 0.16409822553396225, |
|
"rewards/length_reward": 0.0647321492433548, |
|
"rewards/similarity_reward": 0.5707932114601135, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 263.7901916503906, |
|
"epoch": 0.029333333333333333, |
|
"grad_norm": 0.7190922283358612, |
|
"kl": 0.00019359588623046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.039, |
|
"reward": 0.6213563680648804, |
|
"reward_std": 0.14211475104093552, |
|
"rewards/length_reward": 0.07053572311997414, |
|
"rewards/similarity_reward": 0.5508206188678741, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 268.21429443359375, |
|
"epoch": 0.032, |
|
"grad_norm": 0.7674287367860577, |
|
"kl": 0.0001621246337890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0666, |
|
"reward": 0.6959913372993469, |
|
"reward_std": 0.13710426539182663, |
|
"rewards/length_reward": 0.07098215445876122, |
|
"rewards/similarity_reward": 0.6250091791152954, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 308.23216247558594, |
|
"epoch": 0.034666666666666665, |
|
"grad_norm": 0.6961227618622208, |
|
"kl": 0.00017309188842773438, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1339, |
|
"reward": 0.6554811000823975, |
|
"reward_std": 0.14746695756912231, |
|
"rewards/length_reward": 0.06383929960429668, |
|
"rewards/similarity_reward": 0.5916417837142944, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 255.1696548461914, |
|
"epoch": 0.037333333333333336, |
|
"grad_norm": 0.6790296137610172, |
|
"kl": 0.00018310546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0102, |
|
"reward": 0.639388918876648, |
|
"reward_std": 0.13504448905587196, |
|
"rewards/length_reward": 0.07232143729925156, |
|
"rewards/similarity_reward": 0.5670674443244934, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 288.5982208251953, |
|
"epoch": 0.04, |
|
"grad_norm": 0.5592059952702082, |
|
"kl": 0.00015926361083984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0083, |
|
"reward": 0.6401915550231934, |
|
"reward_std": 0.10427659377455711, |
|
"rewards/length_reward": 0.061607155948877335, |
|
"rewards/similarity_reward": 0.5785843580961227, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 256.5669708251953, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 0.6493017975612209, |
|
"kl": 0.0001373291015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0011, |
|
"reward": 0.6510922610759735, |
|
"reward_std": 0.12598292529582977, |
|
"rewards/length_reward": 0.0669642984867096, |
|
"rewards/similarity_reward": 0.5841279327869415, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 306.2857208251953, |
|
"epoch": 0.04533333333333334, |
|
"grad_norm": 0.6883394230623637, |
|
"kl": 0.00019121170043945312, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0685, |
|
"reward": 0.5360667556524277, |
|
"reward_std": 0.17403440922498703, |
|
"rewards/length_reward": 0.04866072162985802, |
|
"rewards/similarity_reward": 0.48740604519844055, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 332.41966247558594, |
|
"epoch": 0.048, |
|
"grad_norm": 0.5705875740256017, |
|
"kl": 0.00019073486328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0755, |
|
"reward": 0.6520929932594299, |
|
"reward_std": 0.11369618400931358, |
|
"rewards/length_reward": 0.06428572162985802, |
|
"rewards/similarity_reward": 0.5878072679042816, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 285.03126525878906, |
|
"epoch": 0.050666666666666665, |
|
"grad_norm": 0.6938225330516384, |
|
"kl": 0.00016689300537109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1144, |
|
"reward": 0.6287901103496552, |
|
"reward_std": 0.14219095557928085, |
|
"rewards/length_reward": 0.06071429327130318, |
|
"rewards/similarity_reward": 0.5680757761001587, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 271.70982360839844, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.7297528927947673, |
|
"kl": 0.00022459030151367188, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0185, |
|
"reward": 0.6177776157855988, |
|
"reward_std": 0.16364814341068268, |
|
"rewards/length_reward": 0.058482155203819275, |
|
"rewards/similarity_reward": 0.5592954754829407, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 288.85716247558594, |
|
"epoch": 0.056, |
|
"grad_norm": 0.7214386901249736, |
|
"kl": 0.0002155303955078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0434, |
|
"reward": 0.597628653049469, |
|
"reward_std": 0.14737623929977417, |
|
"rewards/length_reward": 0.06205358728766441, |
|
"rewards/similarity_reward": 0.5355750769376755, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 286.6875228881836, |
|
"epoch": 0.058666666666666666, |
|
"grad_norm": 0.6632688424489647, |
|
"kl": 0.0001938343048095703, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0844, |
|
"reward": 0.6639349460601807, |
|
"reward_std": 0.13361597061157227, |
|
"rewards/length_reward": 0.06205357797443867, |
|
"rewards/similarity_reward": 0.6018813252449036, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 274.4419708251953, |
|
"epoch": 0.06133333333333333, |
|
"grad_norm": 0.6481364691610889, |
|
"kl": 0.00017881393432617188, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0025, |
|
"reward": 0.6752828657627106, |
|
"reward_std": 0.1561322584748268, |
|
"rewards/length_reward": 0.0625000149011612, |
|
"rewards/similarity_reward": 0.612782746553421, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 286.56697845458984, |
|
"epoch": 0.064, |
|
"grad_norm": 0.6864769985687215, |
|
"kl": 0.00017595291137695312, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0716, |
|
"reward": 0.5987082123756409, |
|
"reward_std": 0.1384410411119461, |
|
"rewards/length_reward": 0.06428572349250317, |
|
"rewards/similarity_reward": 0.5344224572181702, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 279.7053680419922, |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 0.6989530874534239, |
|
"kl": 0.0001468658447265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1417, |
|
"reward": 0.7139560580253601, |
|
"reward_std": 0.12727026268839836, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.6447596251964569, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 269.00000762939453, |
|
"epoch": 0.06933333333333333, |
|
"grad_norm": 0.8072710062956403, |
|
"kl": 0.00020742416381835938, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1312, |
|
"reward": 0.5711838901042938, |
|
"reward_std": 0.16566643118858337, |
|
"rewards/length_reward": 0.055803582072257996, |
|
"rewards/similarity_reward": 0.515380322933197, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 261.1339340209961, |
|
"epoch": 0.072, |
|
"grad_norm": 0.6813535460039397, |
|
"kl": 0.00016641616821289062, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0717, |
|
"reward": 0.6621068716049194, |
|
"reward_std": 0.126276895403862, |
|
"rewards/length_reward": 0.0669642984867096, |
|
"rewards/similarity_reward": 0.5951426327228546, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 301.8169708251953, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.7408476748329949, |
|
"kl": 0.00023174285888671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0763, |
|
"reward": 0.5747545063495636, |
|
"reward_std": 0.13357429951429367, |
|
"rewards/length_reward": 0.05178572237491608, |
|
"rewards/similarity_reward": 0.5229687094688416, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 292.96876525878906, |
|
"epoch": 0.07733333333333334, |
|
"grad_norm": 0.6666274319697456, |
|
"kl": 0.0001964569091796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1043, |
|
"reward": 0.6386013031005859, |
|
"reward_std": 0.12155063822865486, |
|
"rewards/length_reward": 0.0669642984867096, |
|
"rewards/similarity_reward": 0.5716370046138763, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 295.7589416503906, |
|
"epoch": 0.08, |
|
"grad_norm": 0.6572218858197006, |
|
"kl": 0.0002288818359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0662, |
|
"reward": 0.6807651817798615, |
|
"reward_std": 0.16092021018266678, |
|
"rewards/length_reward": 0.0691964328289032, |
|
"rewards/similarity_reward": 0.6115686893463135, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 307.6383972167969, |
|
"epoch": 0.08266666666666667, |
|
"grad_norm": 0.7925535842055806, |
|
"kl": 0.00022029876708984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2064, |
|
"reward": 0.5656463354825974, |
|
"reward_std": 0.1584705486893654, |
|
"rewards/length_reward": 0.051785726100206375, |
|
"rewards/similarity_reward": 0.5138606131076813, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 232.94197845458984, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.7532498555139676, |
|
"kl": 0.00029754638671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0905, |
|
"reward": 0.6083326637744904, |
|
"reward_std": 0.14686080068349838, |
|
"rewards/length_reward": 0.06339287012815475, |
|
"rewards/similarity_reward": 0.544939786195755, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 282.5089416503906, |
|
"epoch": 0.088, |
|
"grad_norm": 0.8072212411776664, |
|
"kl": 0.0002989768981933594, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0575, |
|
"reward": 0.6441718935966492, |
|
"reward_std": 0.14405860379338264, |
|
"rewards/length_reward": 0.06919643096625805, |
|
"rewards/similarity_reward": 0.574975460767746, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 255.03125762939453, |
|
"epoch": 0.09066666666666667, |
|
"grad_norm": 0.7214716728436158, |
|
"kl": 0.00023508071899414062, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0549, |
|
"reward": 0.65805384516716, |
|
"reward_std": 0.10812044516205788, |
|
"rewards/length_reward": 0.054017869755625725, |
|
"rewards/similarity_reward": 0.6040360331535339, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 318.4464416503906, |
|
"epoch": 0.09333333333333334, |
|
"grad_norm": 0.6443834325802673, |
|
"kl": 0.0002117156982421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0879, |
|
"reward": 0.6600002646446228, |
|
"reward_std": 0.13175412267446518, |
|
"rewards/length_reward": 0.0656250137835741, |
|
"rewards/similarity_reward": 0.5943752527236938, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 265.6607360839844, |
|
"epoch": 0.096, |
|
"grad_norm": 0.7350551326118709, |
|
"kl": 0.000255584716796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0763, |
|
"reward": 0.6379535496234894, |
|
"reward_std": 0.11777842044830322, |
|
"rewards/length_reward": 0.061160726472735405, |
|
"rewards/similarity_reward": 0.57679283618927, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 257.3928756713867, |
|
"epoch": 0.09866666666666667, |
|
"grad_norm": 0.7838614111079478, |
|
"kl": 0.0003414154052734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0207, |
|
"reward": 0.6382779181003571, |
|
"reward_std": 0.10474509000778198, |
|
"rewards/length_reward": 0.07053572684526443, |
|
"rewards/similarity_reward": 0.567742183804512, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 281.02232360839844, |
|
"epoch": 0.10133333333333333, |
|
"grad_norm": 0.718502539431113, |
|
"kl": 0.000362396240234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0634, |
|
"reward": 0.684599369764328, |
|
"reward_std": 0.12997651100158691, |
|
"rewards/length_reward": 0.0669642947614193, |
|
"rewards/similarity_reward": 0.6176350712776184, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 263.25447845458984, |
|
"epoch": 0.104, |
|
"grad_norm": 0.6560992159018658, |
|
"kl": 0.00020933151245117188, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0628, |
|
"reward": 0.6398553550243378, |
|
"reward_std": 0.14855975657701492, |
|
"rewards/length_reward": 0.058035727590322495, |
|
"rewards/similarity_reward": 0.5818196535110474, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 315.5982360839844, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.6477813051659901, |
|
"kl": 0.000324249267578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1156, |
|
"reward": 0.57817542552948, |
|
"reward_std": 0.13820311054587364, |
|
"rewards/length_reward": 0.052678581327199936, |
|
"rewards/similarity_reward": 0.525496780872345, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 239.44644165039062, |
|
"epoch": 0.10933333333333334, |
|
"grad_norm": 0.6691411219790073, |
|
"kl": 0.0003085136413574219, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0558, |
|
"reward": 0.6137453615665436, |
|
"reward_std": 0.13270848989486694, |
|
"rewards/length_reward": 0.061607152223587036, |
|
"rewards/similarity_reward": 0.5521382689476013, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 256.9866256713867, |
|
"epoch": 0.112, |
|
"grad_norm": 0.7883991158904521, |
|
"kl": 0.00034618377685546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0302, |
|
"reward": 0.5703277885913849, |
|
"reward_std": 0.16237758100032806, |
|
"rewards/length_reward": 0.04910715110599995, |
|
"rewards/similarity_reward": 0.5212206840515137, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 294.6339416503906, |
|
"epoch": 0.11466666666666667, |
|
"grad_norm": 0.6538334220808693, |
|
"kl": 0.00020074844360351562, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1178, |
|
"reward": 0.6531505286693573, |
|
"reward_std": 0.1386118158698082, |
|
"rewards/length_reward": 0.06964286789298058, |
|
"rewards/similarity_reward": 0.583507627248764, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 259.49554443359375, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.6454158665655293, |
|
"kl": 0.0002779960632324219, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0229, |
|
"reward": 0.6056394279003143, |
|
"reward_std": 0.1302252970635891, |
|
"rewards/length_reward": 0.07008930295705795, |
|
"rewards/similarity_reward": 0.5355501472949982, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 281.0982360839844, |
|
"epoch": 0.12, |
|
"grad_norm": 0.819280226455994, |
|
"kl": 0.00036144256591796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1387, |
|
"reward": 0.5956102013587952, |
|
"reward_std": 0.1252099834382534, |
|
"rewards/length_reward": 0.05089286528527737, |
|
"rewards/similarity_reward": 0.5447173118591309, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 259.5848388671875, |
|
"epoch": 0.12266666666666666, |
|
"grad_norm": 0.7348999369116066, |
|
"kl": 0.00026702880859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0863, |
|
"reward": 0.6511934101581573, |
|
"reward_std": 0.1133498027920723, |
|
"rewards/length_reward": 0.0625000111758709, |
|
"rewards/similarity_reward": 0.588693380355835, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 255.55358123779297, |
|
"epoch": 0.12533333333333332, |
|
"grad_norm": 0.8374758265958845, |
|
"kl": 0.00042057037353515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.094, |
|
"reward": 0.5409015715122223, |
|
"reward_std": 0.10493671149015427, |
|
"rewards/length_reward": 0.058928582817316055, |
|
"rewards/similarity_reward": 0.48197296261787415, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 262.93304443359375, |
|
"epoch": 0.128, |
|
"grad_norm": 0.6793546139834532, |
|
"kl": 0.000423431396484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0649, |
|
"reward": 0.5910505056381226, |
|
"reward_std": 0.16320697963237762, |
|
"rewards/length_reward": 0.06428572721779346, |
|
"rewards/similarity_reward": 0.5267648100852966, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 274.53572845458984, |
|
"epoch": 0.13066666666666665, |
|
"grad_norm": 0.7145465386525555, |
|
"kl": 0.0003376007080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0209, |
|
"reward": 0.65211421251297, |
|
"reward_std": 0.16534826159477234, |
|
"rewards/length_reward": 0.06339287012815475, |
|
"rewards/similarity_reward": 0.588721364736557, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 267.5714340209961, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.6146172464604014, |
|
"kl": 0.00035190582275390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0452, |
|
"reward": 0.6328105330467224, |
|
"reward_std": 0.16772306710481644, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.5662927031517029, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 244.4910888671875, |
|
"epoch": 0.136, |
|
"grad_norm": 0.9373778001138064, |
|
"kl": 0.00045299530029296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1341, |
|
"reward": 0.5600460767745972, |
|
"reward_std": 0.1334986537694931, |
|
"rewards/length_reward": 0.059375012293457985, |
|
"rewards/similarity_reward": 0.5006710439920425, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 251.4553680419922, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.6828077611773323, |
|
"kl": 0.00024318695068359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0511, |
|
"reward": 0.6347568333148956, |
|
"reward_std": 0.07518525794148445, |
|
"rewards/length_reward": 0.060267867520451546, |
|
"rewards/similarity_reward": 0.5744889676570892, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 258.74554443359375, |
|
"epoch": 0.14133333333333334, |
|
"grad_norm": 0.7291338070723046, |
|
"kl": 0.00031948089599609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0178, |
|
"reward": 0.6101614236831665, |
|
"reward_std": 0.15437154471874237, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.5436435341835022, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 275.62501525878906, |
|
"epoch": 0.144, |
|
"grad_norm": 0.5749401912567113, |
|
"kl": 0.0002770423889160156, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1447, |
|
"reward": 0.7116933763027191, |
|
"reward_std": 0.10556156933307648, |
|
"rewards/length_reward": 0.06964286789298058, |
|
"rewards/similarity_reward": 0.6420504748821259, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 297.88841247558594, |
|
"epoch": 0.14666666666666667, |
|
"grad_norm": 0.6712391841955945, |
|
"kl": 0.00043582916259765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0543, |
|
"reward": 0.6665964424610138, |
|
"reward_std": 0.13994581252336502, |
|
"rewards/length_reward": 0.07232143357396126, |
|
"rewards/similarity_reward": 0.5942749679088593, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 261.2946548461914, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.7543785996872888, |
|
"kl": 0.000606536865234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0927, |
|
"reward": 0.5725409537553787, |
|
"reward_std": 0.19671519845724106, |
|
"rewards/length_reward": 0.05178572237491608, |
|
"rewards/similarity_reward": 0.5207552313804626, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 232.9196548461914, |
|
"epoch": 0.152, |
|
"grad_norm": 0.6310665650295014, |
|
"kl": 0.00034427642822265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0251, |
|
"reward": 0.710765928030014, |
|
"reward_std": 0.11330895498394966, |
|
"rewards/length_reward": 0.07678571715950966, |
|
"rewards/similarity_reward": 0.6339801549911499, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 275.8839340209961, |
|
"epoch": 0.15466666666666667, |
|
"grad_norm": 0.6081646973355043, |
|
"kl": 0.00048542022705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0153, |
|
"reward": 0.634895533323288, |
|
"reward_std": 0.12484097108244896, |
|
"rewards/length_reward": 0.0736607201397419, |
|
"rewards/similarity_reward": 0.5612348020076752, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 296.9151916503906, |
|
"epoch": 0.15733333333333333, |
|
"grad_norm": 0.6547927211710071, |
|
"kl": 0.0003204345703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.101, |
|
"reward": 0.679833859205246, |
|
"reward_std": 0.13348159193992615, |
|
"rewards/length_reward": 0.07053572311997414, |
|
"rewards/similarity_reward": 0.6092981398105621, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 281.6919708251953, |
|
"epoch": 0.16, |
|
"grad_norm": 0.6398386646259059, |
|
"kl": 0.000492095947265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0297, |
|
"reward": 0.6490921080112457, |
|
"reward_std": 0.14558688551187515, |
|
"rewards/length_reward": 0.07008930295705795, |
|
"rewards/similarity_reward": 0.5790028274059296, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 255.53126525878906, |
|
"epoch": 0.16266666666666665, |
|
"grad_norm": 0.7177382491967137, |
|
"kl": 0.0004730224609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0431, |
|
"reward": 0.6999586224555969, |
|
"reward_std": 0.13094842806458473, |
|
"rewards/length_reward": 0.06383929774165154, |
|
"rewards/similarity_reward": 0.6361193954944611, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 269.15626525878906, |
|
"epoch": 0.16533333333333333, |
|
"grad_norm": 0.5891947471632187, |
|
"kl": 0.00038623809814453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0092, |
|
"reward": 0.7129235565662384, |
|
"reward_std": 0.0941559188067913, |
|
"rewards/length_reward": 0.07455357909202576, |
|
"rewards/similarity_reward": 0.6383699178695679, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 294.62054443359375, |
|
"epoch": 0.168, |
|
"grad_norm": 0.8226717384917975, |
|
"kl": 0.0005388259887695312, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0827, |
|
"reward": 0.6617253720760345, |
|
"reward_std": 0.10956033691763878, |
|
"rewards/length_reward": 0.06250001303851604, |
|
"rewards/similarity_reward": 0.5992253720760345, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 265.2410888671875, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.7642480042855918, |
|
"kl": 0.0007228851318359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0968, |
|
"reward": 0.6403190791606903, |
|
"reward_std": 0.1368250623345375, |
|
"rewards/length_reward": 0.06071429327130318, |
|
"rewards/similarity_reward": 0.5796047747135162, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 241.55358123779297, |
|
"epoch": 0.17333333333333334, |
|
"grad_norm": 0.8292179764665186, |
|
"kl": 0.00046443939208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.065, |
|
"reward": 0.6242024898529053, |
|
"reward_std": 0.15151464939117432, |
|
"rewards/length_reward": 0.05491072125732899, |
|
"rewards/similarity_reward": 0.5692917704582214, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 234.7946548461914, |
|
"epoch": 0.176, |
|
"grad_norm": 0.6954246830187802, |
|
"kl": 0.00040721893310546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0081, |
|
"reward": 0.6392101645469666, |
|
"reward_std": 0.13006018847227097, |
|
"rewards/length_reward": 0.06562500819563866, |
|
"rewards/similarity_reward": 0.5735851526260376, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 272.9776916503906, |
|
"epoch": 0.17866666666666667, |
|
"grad_norm": 0.8379515100728363, |
|
"kl": 0.0006618499755859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0778, |
|
"reward": 0.6054639518260956, |
|
"reward_std": 0.15866044908761978, |
|
"rewards/length_reward": 0.056250011548399925, |
|
"rewards/similarity_reward": 0.5492139458656311, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 246.08482360839844, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.6633622096307521, |
|
"kl": 0.000507354736328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0327, |
|
"reward": 0.6771285533905029, |
|
"reward_std": 0.11619659885764122, |
|
"rewards/length_reward": 0.06696430034935474, |
|
"rewards/similarity_reward": 0.6101642847061157, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 260.4776916503906, |
|
"epoch": 0.184, |
|
"grad_norm": 0.7742612358878467, |
|
"kl": 0.0005159378051757812, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0148, |
|
"reward": 0.711969256401062, |
|
"reward_std": 0.14254993572831154, |
|
"rewards/length_reward": 0.07276786491274834, |
|
"rewards/similarity_reward": 0.6392013728618622, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 218.6696548461914, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.7878879808209311, |
|
"kl": 0.0006504058837890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1186, |
|
"reward": 0.5868320167064667, |
|
"reward_std": 0.139366053044796, |
|
"rewards/length_reward": 0.050892867147922516, |
|
"rewards/similarity_reward": 0.5359391123056412, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 273.07591247558594, |
|
"epoch": 0.18933333333333333, |
|
"grad_norm": 0.6879154147040387, |
|
"kl": 0.00072479248046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0361, |
|
"reward": 0.5751383006572723, |
|
"reward_std": 0.16684868186712265, |
|
"rewards/length_reward": 0.061607155948877335, |
|
"rewards/similarity_reward": 0.5135311335325241, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 271.7053680419922, |
|
"epoch": 0.192, |
|
"grad_norm": 0.6918784121385549, |
|
"kl": 0.0006055831909179688, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0412, |
|
"reward": 0.6015307903289795, |
|
"reward_std": 0.11276621744036674, |
|
"rewards/length_reward": 0.06383929401636124, |
|
"rewards/similarity_reward": 0.5376915037631989, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 239.18304443359375, |
|
"epoch": 0.19466666666666665, |
|
"grad_norm": 0.6763207999389166, |
|
"kl": 0.00046539306640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0378, |
|
"reward": 0.738762378692627, |
|
"reward_std": 0.07676012441515923, |
|
"rewards/length_reward": 0.0669642947614193, |
|
"rewards/similarity_reward": 0.671798050403595, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 298.2321472167969, |
|
"epoch": 0.19733333333333333, |
|
"grad_norm": 0.7183360474036766, |
|
"kl": 0.00069427490234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1536, |
|
"reward": 0.6193753182888031, |
|
"reward_std": 0.13081318512558937, |
|
"rewards/length_reward": 0.055803585797548294, |
|
"rewards/similarity_reward": 0.5635717213153839, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 278.71876525878906, |
|
"epoch": 0.2, |
|
"grad_norm": 0.6422209743450289, |
|
"kl": 0.0005578994750976562, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1141, |
|
"reward": 0.7105484306812286, |
|
"reward_std": 0.1271527223289013, |
|
"rewards/length_reward": 0.0758928656578064, |
|
"rewards/similarity_reward": 0.6346555352210999, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 232.0759048461914, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.8972600571762434, |
|
"kl": 0.0006427764892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.051, |
|
"reward": 0.5786014497280121, |
|
"reward_std": 0.13573814183473587, |
|
"rewards/length_reward": 0.06964286416769028, |
|
"rewards/similarity_reward": 0.5089586079120636, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 281.2232208251953, |
|
"epoch": 0.20533333333333334, |
|
"grad_norm": 0.8130156685147141, |
|
"kl": 0.00061798095703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1302, |
|
"reward": 0.6210056841373444, |
|
"reward_std": 0.19511007517576218, |
|
"rewards/length_reward": 0.053125010803341866, |
|
"rewards/similarity_reward": 0.5678807199001312, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 258.6651916503906, |
|
"epoch": 0.208, |
|
"grad_norm": 0.7255155232620163, |
|
"kl": 0.0008106231689453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0153, |
|
"reward": 0.619497686624527, |
|
"reward_std": 0.10088447853922844, |
|
"rewards/length_reward": 0.060267867520451546, |
|
"rewards/similarity_reward": 0.5592298209667206, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 255.79466247558594, |
|
"epoch": 0.21066666666666667, |
|
"grad_norm": 0.6729672044279214, |
|
"kl": 0.000576019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0819, |
|
"reward": 0.7009203135967255, |
|
"reward_std": 0.10652491822838783, |
|
"rewards/length_reward": 0.07142858020961285, |
|
"rewards/similarity_reward": 0.6294918060302734, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 275.15626525878906, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.7063326351608399, |
|
"kl": 0.0008029937744140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.106, |
|
"reward": 0.6244928240776062, |
|
"reward_std": 0.14966875314712524, |
|
"rewards/length_reward": 0.056696439161896706, |
|
"rewards/similarity_reward": 0.5677963495254517, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 280.9464416503906, |
|
"epoch": 0.216, |
|
"grad_norm": 0.5928309311542868, |
|
"kl": 0.00060272216796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0107, |
|
"reward": 0.6575084030628204, |
|
"reward_std": 0.10019119456410408, |
|
"rewards/length_reward": 0.06964286789298058, |
|
"rewards/similarity_reward": 0.587865561246872, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 232.42858123779297, |
|
"epoch": 0.21866666666666668, |
|
"grad_norm": 0.728258017728098, |
|
"kl": 0.0008029937744140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0327, |
|
"reward": 0.6456558704376221, |
|
"reward_std": 0.10026798397302628, |
|
"rewards/length_reward": 0.07187500596046448, |
|
"rewards/similarity_reward": 0.5737808644771576, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 272.4241180419922, |
|
"epoch": 0.22133333333333333, |
|
"grad_norm": 0.7474863840166143, |
|
"kl": 0.0006437301635742188, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0835, |
|
"reward": 0.6149618327617645, |
|
"reward_std": 0.11428236961364746, |
|
"rewards/length_reward": 0.06205357797443867, |
|
"rewards/similarity_reward": 0.552908256649971, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 281.40179443359375, |
|
"epoch": 0.224, |
|
"grad_norm": 0.5698111928088367, |
|
"kl": 0.0008392333984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0067, |
|
"reward": 0.6963447332382202, |
|
"reward_std": 0.10215538740158081, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.6280410885810852, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 249.6696548461914, |
|
"epoch": 0.22666666666666666, |
|
"grad_norm": 0.774725228553314, |
|
"kl": 0.0007877349853515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0906, |
|
"reward": 0.6118488311767578, |
|
"reward_std": 0.14368778094649315, |
|
"rewards/length_reward": 0.06428572349250317, |
|
"rewards/similarity_reward": 0.5475630760192871, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 265.3482208251953, |
|
"epoch": 0.22933333333333333, |
|
"grad_norm": 0.8923548112445079, |
|
"kl": 0.0011768341064453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0958, |
|
"reward": 0.6147017180919647, |
|
"reward_std": 0.1361084319651127, |
|
"rewards/length_reward": 0.057589296251535416, |
|
"rewards/similarity_reward": 0.557112455368042, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 251.7901840209961, |
|
"epoch": 0.232, |
|
"grad_norm": 0.618718290820211, |
|
"kl": 0.0007457733154296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0706, |
|
"reward": 0.6785232722759247, |
|
"reward_std": 0.09245472028851509, |
|
"rewards/length_reward": 0.07500000670552254, |
|
"rewards/similarity_reward": 0.6035232245922089, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 311.21876525878906, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.5515407468114694, |
|
"kl": 0.000797271728515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.012, |
|
"reward": 0.6955881416797638, |
|
"reward_std": 0.10647468641400337, |
|
"rewards/length_reward": 0.08303571492433548, |
|
"rewards/similarity_reward": 0.6125523746013641, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 241.92858123779297, |
|
"epoch": 0.23733333333333334, |
|
"grad_norm": 0.7246692380980303, |
|
"kl": 0.0008525848388671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0111, |
|
"reward": 0.7196685373783112, |
|
"reward_std": 0.11786854639649391, |
|
"rewards/length_reward": 0.06562501564621925, |
|
"rewards/similarity_reward": 0.6540435552597046, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 266.3482208251953, |
|
"epoch": 0.24, |
|
"grad_norm": 0.8337410916485611, |
|
"kl": 0.00109100341796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0753, |
|
"reward": 0.6227089762687683, |
|
"reward_std": 0.11949346587061882, |
|
"rewards/length_reward": 0.05178572237491608, |
|
"rewards/similarity_reward": 0.570923238992691, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 312.4553680419922, |
|
"epoch": 0.24266666666666667, |
|
"grad_norm": 0.7247658170183164, |
|
"kl": 0.0009250640869140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.074, |
|
"reward": 0.6411847472190857, |
|
"reward_std": 0.15169727057218552, |
|
"rewards/length_reward": 0.06651787087321281, |
|
"rewards/similarity_reward": 0.5746668875217438, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 240.4821548461914, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.7784189357920823, |
|
"kl": 0.000835418701171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0335, |
|
"reward": 0.7014745771884918, |
|
"reward_std": 0.08627640455961227, |
|
"rewards/length_reward": 0.06517857871949673, |
|
"rewards/similarity_reward": 0.6362960338592529, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 253.05805206298828, |
|
"epoch": 0.248, |
|
"grad_norm": 0.7201282634879667, |
|
"kl": 0.000904083251953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0981, |
|
"reward": 0.6915363669395447, |
|
"reward_std": 0.14938431978225708, |
|
"rewards/length_reward": 0.06875001266598701, |
|
"rewards/similarity_reward": 0.6227863430976868, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 281.4776916503906, |
|
"epoch": 0.25066666666666665, |
|
"grad_norm": 0.6973337009868545, |
|
"kl": 0.0007686614990234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0789, |
|
"reward": 0.6481258869171143, |
|
"reward_std": 0.10927876830101013, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.5798223316669464, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 205.28125762939453, |
|
"epoch": 0.25333333333333335, |
|
"grad_norm": 0.8693659568274648, |
|
"kl": 0.00103759765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0431, |
|
"reward": 0.6439125239849091, |
|
"reward_std": 0.11249563843011856, |
|
"rewards/length_reward": 0.058035727590322495, |
|
"rewards/similarity_reward": 0.5858767628669739, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 292.4241180419922, |
|
"epoch": 0.256, |
|
"grad_norm": 0.5457303629614552, |
|
"kl": 0.0009708404541015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0148, |
|
"reward": 0.7119653820991516, |
|
"reward_std": 0.1094476543366909, |
|
"rewards/length_reward": 0.08169642835855484, |
|
"rewards/similarity_reward": 0.630268931388855, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 300.5089416503906, |
|
"epoch": 0.25866666666666666, |
|
"grad_norm": 0.5831645575431905, |
|
"kl": 0.0009822845458984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0146, |
|
"reward": 0.6134711802005768, |
|
"reward_std": 0.10632903501391411, |
|
"rewards/length_reward": 0.07142858020961285, |
|
"rewards/similarity_reward": 0.5420425981283188, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 239.30358123779297, |
|
"epoch": 0.2613333333333333, |
|
"grad_norm": 0.7748344420880229, |
|
"kl": 0.0009174346923828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0451, |
|
"reward": 0.682037353515625, |
|
"reward_std": 0.08752915635704994, |
|
"rewards/length_reward": 0.07410715520381927, |
|
"rewards/similarity_reward": 0.6079301834106445, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 247.00001525878906, |
|
"epoch": 0.264, |
|
"grad_norm": 0.6574205211811521, |
|
"kl": 0.00130462646484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0383, |
|
"reward": 0.6871787905693054, |
|
"reward_std": 0.12282633036375046, |
|
"rewards/length_reward": 0.06607143767178059, |
|
"rewards/similarity_reward": 0.6211073100566864, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 255.9866180419922, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.6550349068969015, |
|
"kl": 0.0011653900146484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.031, |
|
"reward": 0.6847736537456512, |
|
"reward_std": 0.11585507914423943, |
|
"rewards/length_reward": 0.07767857611179352, |
|
"rewards/similarity_reward": 0.6070950329303741, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 302.05804443359375, |
|
"epoch": 0.2693333333333333, |
|
"grad_norm": 0.6581037881537074, |
|
"kl": 0.0008869171142578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0275, |
|
"reward": 0.7058568000793457, |
|
"reward_std": 0.1038212925195694, |
|
"rewards/length_reward": 0.0736607238650322, |
|
"rewards/similarity_reward": 0.6321960687637329, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 252.37055206298828, |
|
"epoch": 0.272, |
|
"grad_norm": 0.7659892153308909, |
|
"kl": 0.0009002685546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0099, |
|
"reward": 0.6600384712219238, |
|
"reward_std": 0.11620288342237473, |
|
"rewards/length_reward": 0.06964286416769028, |
|
"rewards/similarity_reward": 0.5903956294059753, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 260.9910888671875, |
|
"epoch": 0.27466666666666667, |
|
"grad_norm": 0.7873803837848522, |
|
"kl": 0.0011749267578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.067, |
|
"reward": 0.647940456867218, |
|
"reward_std": 0.12016744539141655, |
|
"rewards/length_reward": 0.060267869383096695, |
|
"rewards/similarity_reward": 0.587672621011734, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 303.0982208251953, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.6698404225945914, |
|
"kl": 0.0008029937744140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0326, |
|
"reward": 0.6724097728729248, |
|
"reward_std": 0.08444990962743759, |
|
"rewards/length_reward": 0.07232143729925156, |
|
"rewards/similarity_reward": 0.600088357925415, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 263.43751525878906, |
|
"epoch": 0.28, |
|
"grad_norm": 0.7035133703979454, |
|
"kl": 0.0010471343994140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0377, |
|
"reward": 0.6536130607128143, |
|
"reward_std": 0.16632840782403946, |
|
"rewards/length_reward": 0.06741072982549667, |
|
"rewards/similarity_reward": 0.5862023532390594, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 265.3526916503906, |
|
"epoch": 0.2826666666666667, |
|
"grad_norm": 0.7008056428586112, |
|
"kl": 0.0012378692626953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0104, |
|
"reward": 0.6467622220516205, |
|
"reward_std": 0.11927535384893417, |
|
"rewards/length_reward": 0.06651787087321281, |
|
"rewards/similarity_reward": 0.5802443325519562, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 275.6026916503906, |
|
"epoch": 0.2853333333333333, |
|
"grad_norm": 0.6158480236246975, |
|
"kl": 0.001171112060546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0208, |
|
"reward": 0.6745641827583313, |
|
"reward_std": 0.11500725150108337, |
|
"rewards/length_reward": 0.06964286416769028, |
|
"rewards/similarity_reward": 0.604921281337738, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 260.33036041259766, |
|
"epoch": 0.288, |
|
"grad_norm": 0.7224564328760787, |
|
"kl": 0.000965118408203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0761, |
|
"reward": 0.5630394518375397, |
|
"reward_std": 0.1106732226908207, |
|
"rewards/length_reward": 0.053571442142128944, |
|
"rewards/similarity_reward": 0.5094679594039917, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 243.68751525878906, |
|
"epoch": 0.2906666666666667, |
|
"grad_norm": 0.8254406902750699, |
|
"kl": 0.001445770263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0425, |
|
"reward": 0.6382045447826385, |
|
"reward_std": 0.12056771293282509, |
|
"rewards/length_reward": 0.062053581699728966, |
|
"rewards/similarity_reward": 0.5761510133743286, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 224.87501525878906, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 0.6268606551971334, |
|
"kl": 0.0011119842529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0105, |
|
"reward": 0.5971523225307465, |
|
"reward_std": 0.10623245313763618, |
|
"rewards/length_reward": 0.06517858244478703, |
|
"rewards/similarity_reward": 0.5319737493991852, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 259.1339340209961, |
|
"epoch": 0.296, |
|
"grad_norm": 0.6687342397274337, |
|
"kl": 0.00101470947265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1205, |
|
"reward": 0.6370533108711243, |
|
"reward_std": 0.12068188562989235, |
|
"rewards/length_reward": 0.06294644251465797, |
|
"rewards/similarity_reward": 0.5741068720817566, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 249.50000762939453, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 0.6806425164227126, |
|
"kl": 0.001422882080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.052, |
|
"reward": 0.6264582276344299, |
|
"reward_std": 0.11265718936920166, |
|
"rewards/length_reward": 0.06428572908043861, |
|
"rewards/similarity_reward": 0.562172532081604, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 270.3883972167969, |
|
"epoch": 0.30133333333333334, |
|
"grad_norm": 0.7014564695453412, |
|
"kl": 0.0009479522705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1126, |
|
"reward": 0.7100532650947571, |
|
"reward_std": 0.10190926492214203, |
|
"rewards/length_reward": 0.07857143878936768, |
|
"rewards/similarity_reward": 0.6314818263053894, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 271.99554443359375, |
|
"epoch": 0.304, |
|
"grad_norm": 0.8072459229444039, |
|
"kl": 0.001361846923828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1128, |
|
"reward": 0.6362347304821014, |
|
"reward_std": 0.13582201302051544, |
|
"rewards/length_reward": 0.07008929550647736, |
|
"rewards/similarity_reward": 0.5661454498767853, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 223.00447845458984, |
|
"epoch": 0.30666666666666664, |
|
"grad_norm": 0.8550809137589263, |
|
"kl": 0.001277923583984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1073, |
|
"reward": 0.6763125956058502, |
|
"reward_std": 0.11202903836965561, |
|
"rewards/length_reward": 0.06607143580913544, |
|
"rewards/similarity_reward": 0.6102411448955536, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 279.54466247558594, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.7634314721408118, |
|
"kl": 0.001033782958984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1038, |
|
"reward": 0.6390451192855835, |
|
"reward_std": 0.11136174201965332, |
|
"rewards/length_reward": 0.0714285746216774, |
|
"rewards/similarity_reward": 0.5676165223121643, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 251.08929443359375, |
|
"epoch": 0.312, |
|
"grad_norm": 0.8512986272863483, |
|
"kl": 0.0017547607421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.027, |
|
"reward": 0.5662170350551605, |
|
"reward_std": 0.1223873421549797, |
|
"rewards/length_reward": 0.051785726100206375, |
|
"rewards/similarity_reward": 0.5144313126802444, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 279.3526916503906, |
|
"epoch": 0.31466666666666665, |
|
"grad_norm": 0.5648142162422048, |
|
"kl": 0.001270294189453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.058, |
|
"reward": 0.6961429119110107, |
|
"reward_std": 0.08081653341650963, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.6296250522136688, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 228.95536041259766, |
|
"epoch": 0.31733333333333336, |
|
"grad_norm": 0.87246537686519, |
|
"kl": 0.00152587890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0655, |
|
"reward": 0.5725010931491852, |
|
"reward_std": 0.13679825142025948, |
|
"rewards/length_reward": 0.051785724237561226, |
|
"rewards/similarity_reward": 0.5207154005765915, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 213.12500762939453, |
|
"epoch": 0.32, |
|
"grad_norm": 0.90775103522726, |
|
"kl": 0.00186920166015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0752, |
|
"reward": 0.5904355347156525, |
|
"reward_std": 0.13604873418807983, |
|
"rewards/length_reward": 0.060714298859238625, |
|
"rewards/similarity_reward": 0.5297212451696396, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 272.4598388671875, |
|
"epoch": 0.32266666666666666, |
|
"grad_norm": 0.679207876261083, |
|
"kl": 0.001651763916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0208, |
|
"reward": 0.5465417206287384, |
|
"reward_std": 0.11999019607901573, |
|
"rewards/length_reward": 0.060714296996593475, |
|
"rewards/similarity_reward": 0.48582740128040314, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 264.77679443359375, |
|
"epoch": 0.3253333333333333, |
|
"grad_norm": 0.6586415817875254, |
|
"kl": 0.0012359619140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0806, |
|
"reward": 0.68158820271492, |
|
"reward_std": 0.1025921143591404, |
|
"rewards/length_reward": 0.07008929923176765, |
|
"rewards/similarity_reward": 0.6114989817142487, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 256.1607208251953, |
|
"epoch": 0.328, |
|
"grad_norm": 0.7273109756128965, |
|
"kl": 0.00136566162109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0542, |
|
"reward": 0.6582387089729309, |
|
"reward_std": 0.10396385937929153, |
|
"rewards/length_reward": 0.062053581699728966, |
|
"rewards/similarity_reward": 0.5961851477622986, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 258.7723388671875, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.7158563788139325, |
|
"kl": 0.0016632080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0495, |
|
"reward": 0.680817037820816, |
|
"reward_std": 0.15591518580913544, |
|
"rewards/length_reward": 0.0669643022119999, |
|
"rewards/similarity_reward": 0.6138526499271393, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 287.1026916503906, |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.7257840755435857, |
|
"kl": 0.0021820068359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1379, |
|
"reward": 0.6818304061889648, |
|
"reward_std": 0.14189621806144714, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.6153125464916229, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 272.3794860839844, |
|
"epoch": 0.336, |
|
"grad_norm": 0.716044862480554, |
|
"kl": 0.00128936767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0606, |
|
"reward": 0.607832282781601, |
|
"reward_std": 0.10822133347392082, |
|
"rewards/length_reward": 0.06517858244478703, |
|
"rewards/similarity_reward": 0.5426536649465561, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 267.2008972167969, |
|
"epoch": 0.33866666666666667, |
|
"grad_norm": 0.7078650786713803, |
|
"kl": 0.0017242431640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0553, |
|
"reward": 0.6737089157104492, |
|
"reward_std": 0.12828165292739868, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.6054053008556366, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 247.7276840209961, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.9316222904188678, |
|
"kl": 0.0016021728515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0704, |
|
"reward": 0.6324192881584167, |
|
"reward_std": 0.09700313210487366, |
|
"rewards/length_reward": 0.056250011548399925, |
|
"rewards/similarity_reward": 0.5761693120002747, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 266.2946548461914, |
|
"epoch": 0.344, |
|
"grad_norm": 0.6821391109445131, |
|
"kl": 0.00136566162109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0922, |
|
"reward": 0.600318193435669, |
|
"reward_std": 0.14177118614315987, |
|
"rewards/length_reward": 0.05892857722938061, |
|
"rewards/similarity_reward": 0.5413895845413208, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 260.25447845458984, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 0.6135827945376331, |
|
"kl": 0.001003265380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0975, |
|
"reward": 0.6941237151622772, |
|
"reward_std": 0.11082329601049423, |
|
"rewards/length_reward": 0.07232144102454185, |
|
"rewards/similarity_reward": 0.6218023300170898, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 301.4598388671875, |
|
"epoch": 0.34933333333333333, |
|
"grad_norm": 0.727973261017594, |
|
"kl": 0.00146484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1049, |
|
"reward": 0.6211672425270081, |
|
"reward_std": 0.1372002437710762, |
|
"rewards/length_reward": 0.05625000782310963, |
|
"rewards/similarity_reward": 0.5649172067642212, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 235.4910888671875, |
|
"epoch": 0.352, |
|
"grad_norm": 0.7386524195731257, |
|
"kl": 0.0012664794921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0175, |
|
"reward": 0.6908943951129913, |
|
"reward_std": 0.09667576849460602, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.6225908398628235, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 220.09376525878906, |
|
"epoch": 0.3546666666666667, |
|
"grad_norm": 0.6544388171069107, |
|
"kl": 0.00138092041015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0025, |
|
"reward": 0.6801804006099701, |
|
"reward_std": 0.11540298722684383, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.6118768155574799, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 273.5714340209961, |
|
"epoch": 0.35733333333333334, |
|
"grad_norm": 0.5808123143329609, |
|
"kl": 0.0009899139404296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0302, |
|
"reward": 0.6981923878192902, |
|
"reward_std": 0.12756088376045227, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.6267638206481934, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 273.83483123779297, |
|
"epoch": 0.36, |
|
"grad_norm": 0.6257800546867194, |
|
"kl": 0.001750946044921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0255, |
|
"reward": 0.6324395537376404, |
|
"reward_std": 0.11662106215953827, |
|
"rewards/length_reward": 0.07276786491274834, |
|
"rewards/similarity_reward": 0.5596716403961182, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 209.4821548461914, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 0.7210088344519984, |
|
"kl": 0.00167083740234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0129, |
|
"reward": 0.6599233448505402, |
|
"reward_std": 0.08075730502605438, |
|
"rewards/length_reward": 0.06339286640286446, |
|
"rewards/similarity_reward": 0.5965304374694824, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 258.7232208251953, |
|
"epoch": 0.36533333333333334, |
|
"grad_norm": 0.7922066598101163, |
|
"kl": 0.001499176025390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0838, |
|
"reward": 0.6245545744895935, |
|
"reward_std": 0.12391168251633644, |
|
"rewards/length_reward": 0.0625000111758709, |
|
"rewards/similarity_reward": 0.5620545446872711, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 290.34822845458984, |
|
"epoch": 0.368, |
|
"grad_norm": 0.6230240946470257, |
|
"kl": 0.00208282470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0447, |
|
"reward": 0.6634125411510468, |
|
"reward_std": 0.10979421064257622, |
|
"rewards/length_reward": 0.06428572535514832, |
|
"rewards/similarity_reward": 0.599126785993576, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 265.3839416503906, |
|
"epoch": 0.37066666666666664, |
|
"grad_norm": 0.7078887986803306, |
|
"kl": 0.002201080322265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0458, |
|
"reward": 0.70221146941185, |
|
"reward_std": 0.10299094393849373, |
|
"rewards/length_reward": 0.06964286789298058, |
|
"rewards/similarity_reward": 0.6325685381889343, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 265.7678680419922, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.7842037190011205, |
|
"kl": 0.001491546630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0973, |
|
"reward": 0.644274890422821, |
|
"reward_std": 0.10034830123186111, |
|
"rewards/length_reward": 0.061160728335380554, |
|
"rewards/similarity_reward": 0.5831141769886017, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 249.66964721679688, |
|
"epoch": 0.376, |
|
"grad_norm": 0.567490680527846, |
|
"kl": 0.001689910888671875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.024, |
|
"reward": 0.7013588547706604, |
|
"reward_std": 0.0840650200843811, |
|
"rewards/length_reward": 0.06651786342263222, |
|
"rewards/similarity_reward": 0.6348409652709961, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 226.25447845458984, |
|
"epoch": 0.37866666666666665, |
|
"grad_norm": 0.912319551814626, |
|
"kl": 0.001934051513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0961, |
|
"reward": 0.6543315052986145, |
|
"reward_std": 0.10859474539756775, |
|
"rewards/length_reward": 0.06607143953442574, |
|
"rewards/similarity_reward": 0.5882599949836731, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 298.8616180419922, |
|
"epoch": 0.38133333333333336, |
|
"grad_norm": 0.5902099764974178, |
|
"kl": 0.001720428466796875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0059, |
|
"reward": 0.7150795757770538, |
|
"reward_std": 0.08074402436614037, |
|
"rewards/length_reward": 0.0781250111758709, |
|
"rewards/similarity_reward": 0.6369545459747314, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 258.6026916503906, |
|
"epoch": 0.384, |
|
"grad_norm": 0.7182388748669096, |
|
"kl": 0.0016632080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.073, |
|
"reward": 0.6721713840961456, |
|
"reward_std": 0.10838266462087631, |
|
"rewards/length_reward": 0.07321429625153542, |
|
"rewards/similarity_reward": 0.5989570617675781, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 249.93304443359375, |
|
"epoch": 0.38666666666666666, |
|
"grad_norm": 0.736716904979314, |
|
"kl": 0.001873016357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0098, |
|
"reward": 0.6520076394081116, |
|
"reward_std": 0.08915667980909348, |
|
"rewards/length_reward": 0.059821439906954765, |
|
"rewards/similarity_reward": 0.5921862125396729, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 249.5401840209961, |
|
"epoch": 0.3893333333333333, |
|
"grad_norm": 0.696504045867252, |
|
"kl": 0.0015106201171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0287, |
|
"reward": 0.6746790409088135, |
|
"reward_std": 0.10656709223985672, |
|
"rewards/length_reward": 0.06785715371370316, |
|
"rewards/similarity_reward": 0.6068219244480133, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 335.7098388671875, |
|
"epoch": 0.392, |
|
"grad_norm": 0.5956601576776329, |
|
"kl": 0.001354217529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0678, |
|
"reward": 0.6324039399623871, |
|
"reward_std": 0.0887768529355526, |
|
"rewards/length_reward": 0.07053572684526443, |
|
"rewards/similarity_reward": 0.5618681907653809, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 245.93304443359375, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 0.7100692950335996, |
|
"kl": 0.001987457275390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0427, |
|
"reward": 0.6061528325080872, |
|
"reward_std": 0.09401173889636993, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.5378492623567581, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 220.54911041259766, |
|
"epoch": 0.3973333333333333, |
|
"grad_norm": 0.8672496323443106, |
|
"kl": 0.00212860107421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0339, |
|
"reward": 0.6478115916252136, |
|
"reward_std": 0.12049812451004982, |
|
"rewards/length_reward": 0.06562500819563866, |
|
"rewards/similarity_reward": 0.5821865797042847, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 277.59376525878906, |
|
"epoch": 0.4, |
|
"grad_norm": 0.808658829177956, |
|
"kl": 0.002712249755859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0533, |
|
"reward": 0.6419509947299957, |
|
"reward_std": 0.11133093386888504, |
|
"rewards/length_reward": 0.06517858430743217, |
|
"rewards/similarity_reward": 0.5767723917961121, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 284.6026916503906, |
|
"epoch": 0.4026666666666667, |
|
"grad_norm": 0.5726944777875907, |
|
"kl": 0.0017871856689453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0081, |
|
"reward": 0.6313992738723755, |
|
"reward_std": 0.1251782327890396, |
|
"rewards/length_reward": 0.07008929550647736, |
|
"rewards/similarity_reward": 0.5613099485635757, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 269.39286041259766, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 0.6397037674423297, |
|
"kl": 0.001956939697265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0305, |
|
"reward": 0.6884091794490814, |
|
"reward_std": 0.08797129616141319, |
|
"rewards/length_reward": 0.07187500968575478, |
|
"rewards/similarity_reward": 0.6165341138839722, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 249.98214721679688, |
|
"epoch": 0.408, |
|
"grad_norm": 0.7056757365461955, |
|
"kl": 0.001522064208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.051, |
|
"reward": 0.6953409612178802, |
|
"reward_std": 0.10321801900863647, |
|
"rewards/length_reward": 0.07321429625153542, |
|
"rewards/similarity_reward": 0.6221266388893127, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 301.4196472167969, |
|
"epoch": 0.4106666666666667, |
|
"grad_norm": 0.672677703608687, |
|
"kl": 0.001739501953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0471, |
|
"reward": 0.6628727614879608, |
|
"reward_std": 0.09228364005684853, |
|
"rewards/length_reward": 0.06339287012815475, |
|
"rewards/similarity_reward": 0.5994798243045807, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 255.06697845458984, |
|
"epoch": 0.41333333333333333, |
|
"grad_norm": 0.7128891033714289, |
|
"kl": 0.00139617919921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1005, |
|
"reward": 0.7157993614673615, |
|
"reward_std": 0.09617481008172035, |
|
"rewards/length_reward": 0.06339286454021931, |
|
"rewards/similarity_reward": 0.6524064540863037, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 283.53126525878906, |
|
"epoch": 0.416, |
|
"grad_norm": 0.6441555100001085, |
|
"kl": 0.00176239013671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0274, |
|
"reward": 0.7534910440444946, |
|
"reward_std": 0.08639111369848251, |
|
"rewards/length_reward": 0.0781250037252903, |
|
"rewards/similarity_reward": 0.675366073846817, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 238.80805206298828, |
|
"epoch": 0.4186666666666667, |
|
"grad_norm": 0.6969348755424246, |
|
"kl": 0.002040863037109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0883, |
|
"reward": 0.7300522923469543, |
|
"reward_std": 0.0814172811806202, |
|
"rewards/length_reward": 0.07991071790456772, |
|
"rewards/similarity_reward": 0.650141566991806, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 277.8526840209961, |
|
"epoch": 0.42133333333333334, |
|
"grad_norm": 0.6143618956533469, |
|
"kl": 0.002044677734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0655, |
|
"reward": 0.6692302823066711, |
|
"reward_std": 0.11947489529848099, |
|
"rewards/length_reward": 0.07053572311997414, |
|
"rewards/similarity_reward": 0.5986945033073425, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 272.3526916503906, |
|
"epoch": 0.424, |
|
"grad_norm": 0.7167795024215189, |
|
"kl": 0.00182342529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0687, |
|
"reward": 0.6641163527965546, |
|
"reward_std": 0.11917684972286224, |
|
"rewards/length_reward": 0.0625000111758709, |
|
"rewards/similarity_reward": 0.6016163527965546, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 248.9821548461914, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.7327545151839033, |
|
"kl": 0.00356292724609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0745, |
|
"reward": 0.6552646160125732, |
|
"reward_std": 0.11251871287822723, |
|
"rewards/length_reward": 0.06294644065201283, |
|
"rewards/similarity_reward": 0.5923182666301727, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 256.07591247558594, |
|
"epoch": 0.42933333333333334, |
|
"grad_norm": 0.7535276542214958, |
|
"kl": 0.0027008056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0314, |
|
"reward": 0.6351377367973328, |
|
"reward_std": 0.10685055330395699, |
|
"rewards/length_reward": 0.059375012293457985, |
|
"rewards/similarity_reward": 0.5757627189159393, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 257.43750762939453, |
|
"epoch": 0.432, |
|
"grad_norm": 0.6595271125555469, |
|
"kl": 0.00250244140625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0056, |
|
"reward": 0.6979295313358307, |
|
"reward_std": 0.09611150622367859, |
|
"rewards/length_reward": 0.07187500968575478, |
|
"rewards/similarity_reward": 0.6260544955730438, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 262.4419860839844, |
|
"epoch": 0.43466666666666665, |
|
"grad_norm": 0.6185607216367858, |
|
"kl": 0.002216339111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0996, |
|
"reward": 0.6647298634052277, |
|
"reward_std": 0.15266689658164978, |
|
"rewards/length_reward": 0.06964286416769028, |
|
"rewards/similarity_reward": 0.5950870215892792, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 307.3526916503906, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 0.7121773803870707, |
|
"kl": 0.001800537109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0489, |
|
"reward": 0.7000181972980499, |
|
"reward_std": 0.12559669092297554, |
|
"rewards/length_reward": 0.06741072237491608, |
|
"rewards/similarity_reward": 0.6326074600219727, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 246.79019165039062, |
|
"epoch": 0.44, |
|
"grad_norm": 0.6721019148616484, |
|
"kl": 0.00197601318359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0431, |
|
"reward": 0.7078234553337097, |
|
"reward_std": 0.09047579020261765, |
|
"rewards/length_reward": 0.06741072610020638, |
|
"rewards/similarity_reward": 0.6404128074645996, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 272.7723388671875, |
|
"epoch": 0.44266666666666665, |
|
"grad_norm": 0.5673761372545018, |
|
"kl": 0.002292633056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0246, |
|
"reward": 0.7385044991970062, |
|
"reward_std": 0.07343112863600254, |
|
"rewards/length_reward": 0.0714285783469677, |
|
"rewards/similarity_reward": 0.6670758426189423, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 236.49108123779297, |
|
"epoch": 0.44533333333333336, |
|
"grad_norm": 0.7731297375677283, |
|
"kl": 0.00337982177734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0703, |
|
"reward": 0.6287428140640259, |
|
"reward_std": 0.13660584390163422, |
|
"rewards/length_reward": 0.058928582817316055, |
|
"rewards/similarity_reward": 0.5698141753673553, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 250.63839721679688, |
|
"epoch": 0.448, |
|
"grad_norm": 0.6844249610340352, |
|
"kl": 0.002716064453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.005, |
|
"reward": 0.6882721185684204, |
|
"reward_std": 0.08635392412543297, |
|
"rewards/length_reward": 0.07276786863803864, |
|
"rewards/similarity_reward": 0.6155042350292206, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 299.9151916503906, |
|
"epoch": 0.45066666666666666, |
|
"grad_norm": 0.5305327006388908, |
|
"kl": 0.001537322998046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.009, |
|
"reward": 0.7003590762615204, |
|
"reward_std": 0.06643648073077202, |
|
"rewards/length_reward": 0.0781250074505806, |
|
"rewards/similarity_reward": 0.6222340762615204, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 274.7143096923828, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.6964990599197315, |
|
"kl": 0.002025604248046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0422, |
|
"reward": 0.6924863159656525, |
|
"reward_std": 0.10944930091500282, |
|
"rewards/length_reward": 0.06830358505249023, |
|
"rewards/similarity_reward": 0.6241826713085175, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 260.29911041259766, |
|
"epoch": 0.456, |
|
"grad_norm": 0.7007252255586068, |
|
"kl": 0.0022430419921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0892, |
|
"reward": 0.6610677242279053, |
|
"reward_std": 0.08702437207102776, |
|
"rewards/length_reward": 0.0625000111758709, |
|
"rewards/similarity_reward": 0.5985676944255829, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 291.8883972167969, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 0.6818242026608151, |
|
"kl": 0.00199127197265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0296, |
|
"reward": 0.6590520441532135, |
|
"reward_std": 0.11536738649010658, |
|
"rewards/length_reward": 0.058035723865032196, |
|
"rewards/similarity_reward": 0.6010163724422455, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 262.5982360839844, |
|
"epoch": 0.4613333333333333, |
|
"grad_norm": 0.7058845306121532, |
|
"kl": 0.00278472900390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0147, |
|
"reward": 0.6392101943492889, |
|
"reward_std": 0.112693652510643, |
|
"rewards/length_reward": 0.0736607238650322, |
|
"rewards/similarity_reward": 0.5655494332313538, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 243.60269165039062, |
|
"epoch": 0.464, |
|
"grad_norm": 0.6664009632817656, |
|
"kl": 0.00189208984375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0054, |
|
"reward": 0.6843527555465698, |
|
"reward_std": 0.1105504259467125, |
|
"rewards/length_reward": 0.06964286416769028, |
|
"rewards/similarity_reward": 0.6147099137306213, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 250.46875762939453, |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 0.6473092015687406, |
|
"kl": 0.00228118896484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0463, |
|
"reward": 0.6191678941249847, |
|
"reward_std": 0.1144786849617958, |
|
"rewards/length_reward": 0.06294644251465797, |
|
"rewards/similarity_reward": 0.5562214553356171, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 246.2276840209961, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 0.782293454462268, |
|
"kl": 0.00267791748046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.104, |
|
"reward": 0.6215286254882812, |
|
"reward_std": 0.10633666813373566, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.552332192659378, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 288.9598388671875, |
|
"epoch": 0.472, |
|
"grad_norm": 0.6068876990848109, |
|
"kl": 0.0028076171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0499, |
|
"reward": 0.6609079539775848, |
|
"reward_std": 0.12471455708146095, |
|
"rewards/length_reward": 0.06294644251465797, |
|
"rewards/similarity_reward": 0.5979615151882172, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 231.6919708251953, |
|
"epoch": 0.4746666666666667, |
|
"grad_norm": 0.6072574366547386, |
|
"kl": 0.00218963623046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0049, |
|
"reward": 0.6936376392841339, |
|
"reward_std": 0.09063273295760155, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.6244412362575531, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 244.82144165039062, |
|
"epoch": 0.47733333333333333, |
|
"grad_norm": 0.6731140452564113, |
|
"kl": 0.002288818359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0199, |
|
"reward": 0.6732902526855469, |
|
"reward_std": 0.12003319337964058, |
|
"rewards/length_reward": 0.06741072982549667, |
|
"rewards/similarity_reward": 0.6058795154094696, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 248.89733123779297, |
|
"epoch": 0.48, |
|
"grad_norm": 0.6436090869352918, |
|
"kl": 0.00261688232421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0504, |
|
"reward": 0.6384674608707428, |
|
"reward_std": 0.13010089099407196, |
|
"rewards/length_reward": 0.057589298114180565, |
|
"rewards/similarity_reward": 0.5808781981468201, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 246.43304443359375, |
|
"epoch": 0.4826666666666667, |
|
"grad_norm": 0.6282260624355238, |
|
"kl": 0.005481719970703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0045, |
|
"reward": 0.717792272567749, |
|
"reward_std": 0.07219249755144119, |
|
"rewards/length_reward": 0.07008929550647736, |
|
"rewards/similarity_reward": 0.6477029919624329, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 250.3660888671875, |
|
"epoch": 0.48533333333333334, |
|
"grad_norm": 0.8187805221092193, |
|
"kl": 0.00237274169921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0499, |
|
"reward": 0.6446585357189178, |
|
"reward_std": 0.1213221587240696, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.573229968547821, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 238.0535888671875, |
|
"epoch": 0.488, |
|
"grad_norm": 0.7872695013078711, |
|
"kl": 0.0045166015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0325, |
|
"reward": 0.6350027918815613, |
|
"reward_std": 0.08915703371167183, |
|
"rewards/length_reward": 0.06205358728766441, |
|
"rewards/similarity_reward": 0.5729491412639618, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 239.0446548461914, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 0.6850244703959089, |
|
"kl": 0.002655029296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0088, |
|
"reward": 0.6566590070724487, |
|
"reward_std": 0.08741006627678871, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.5901412069797516, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 228.43304443359375, |
|
"epoch": 0.49333333333333335, |
|
"grad_norm": 0.640590428870281, |
|
"kl": 0.00246429443359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0211, |
|
"reward": 0.7332558631896973, |
|
"reward_std": 0.08987564593553543, |
|
"rewards/length_reward": 0.07098215445876122, |
|
"rewards/similarity_reward": 0.6622737348079681, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 275.46875762939453, |
|
"epoch": 0.496, |
|
"grad_norm": 0.7034238309978142, |
|
"kl": 0.00243377685546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0345, |
|
"reward": 0.7175212800502777, |
|
"reward_std": 0.08901329711079597, |
|
"rewards/length_reward": 0.07544643431901932, |
|
"rewards/similarity_reward": 0.6420748233795166, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 280.00000762939453, |
|
"epoch": 0.49866666666666665, |
|
"grad_norm": 0.657474847366372, |
|
"kl": 0.00211334228515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0942, |
|
"reward": 0.702696293592453, |
|
"reward_std": 0.08004150912165642, |
|
"rewards/length_reward": 0.0669642984867096, |
|
"rewards/similarity_reward": 0.635731965303421, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 246.83929443359375, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 0.6840180489866146, |
|
"kl": 0.002269744873046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0305, |
|
"reward": 0.6730459034442902, |
|
"reward_std": 0.08726384490728378, |
|
"rewards/length_reward": 0.07053572684526443, |
|
"rewards/similarity_reward": 0.6025101542472839, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 253.83929443359375, |
|
"epoch": 0.504, |
|
"grad_norm": 0.5974802952790658, |
|
"kl": 0.00257110595703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0762, |
|
"reward": 0.6664688587188721, |
|
"reward_std": 0.11599006876349449, |
|
"rewards/length_reward": 0.07723214477300644, |
|
"rewards/similarity_reward": 0.5892367660999298, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 278.80804443359375, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 0.6466637537752649, |
|
"kl": 0.00311279296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0959, |
|
"reward": 0.6832419335842133, |
|
"reward_std": 0.11445752903819084, |
|
"rewards/length_reward": 0.06830358505249023, |
|
"rewards/similarity_reward": 0.614938348531723, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 235.5491180419922, |
|
"epoch": 0.5093333333333333, |
|
"grad_norm": 0.7153062964233342, |
|
"kl": 0.0019378662109375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0045, |
|
"reward": 0.6687261462211609, |
|
"reward_std": 0.09642783552408218, |
|
"rewards/length_reward": 0.06562500447034836, |
|
"rewards/similarity_reward": 0.6031011044979095, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 252.80358123779297, |
|
"epoch": 0.512, |
|
"grad_norm": 0.6313047035707975, |
|
"kl": 0.00281524658203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0303, |
|
"reward": 0.6900187730789185, |
|
"reward_std": 0.10082340613007545, |
|
"rewards/length_reward": 0.07008929923176765, |
|
"rewards/similarity_reward": 0.6199295520782471, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 297.0580596923828, |
|
"epoch": 0.5146666666666667, |
|
"grad_norm": 0.6070377598000647, |
|
"kl": 0.001934051513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0237, |
|
"reward": 0.7174617052078247, |
|
"reward_std": 0.10485106706619263, |
|
"rewards/length_reward": 0.07544643804430962, |
|
"rewards/similarity_reward": 0.642015278339386, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 235.87500762939453, |
|
"epoch": 0.5173333333333333, |
|
"grad_norm": 0.6962391024556379, |
|
"kl": 0.00244140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0154, |
|
"reward": 0.64629727602005, |
|
"reward_std": 0.0772336795926094, |
|
"rewards/length_reward": 0.07008929550647736, |
|
"rewards/similarity_reward": 0.5762079358100891, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 206.8482208251953, |
|
"epoch": 0.52, |
|
"grad_norm": 0.8558133302744291, |
|
"kl": 0.00318145751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0683, |
|
"reward": 0.6385850310325623, |
|
"reward_std": 0.0856214202940464, |
|
"rewards/length_reward": 0.06517857871949673, |
|
"rewards/similarity_reward": 0.5734064280986786, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 306.2633972167969, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 0.5439460695909182, |
|
"kl": 0.0030517578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0583, |
|
"reward": 0.6826609075069427, |
|
"reward_std": 0.08685707673430443, |
|
"rewards/length_reward": 0.07857143133878708, |
|
"rewards/similarity_reward": 0.6040894687175751, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 284.8169708251953, |
|
"epoch": 0.5253333333333333, |
|
"grad_norm": 0.597796232109364, |
|
"kl": 0.00238800048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0081, |
|
"reward": 0.7354407906532288, |
|
"reward_std": 0.06834723800420761, |
|
"rewards/length_reward": 0.07500000670552254, |
|
"rewards/similarity_reward": 0.6604407727718353, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 271.2901916503906, |
|
"epoch": 0.528, |
|
"grad_norm": 0.7779344829533027, |
|
"kl": 0.00433349609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1361, |
|
"reward": 0.6831106841564178, |
|
"reward_std": 0.16374623030424118, |
|
"rewards/length_reward": 0.06875000894069672, |
|
"rewards/similarity_reward": 0.6143606305122375, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 257.1294708251953, |
|
"epoch": 0.5306666666666666, |
|
"grad_norm": 0.6115200963053203, |
|
"kl": 0.002285003662109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0286, |
|
"reward": 0.6817789673805237, |
|
"reward_std": 0.07882888615131378, |
|
"rewards/length_reward": 0.07857143133878708, |
|
"rewards/similarity_reward": 0.6032074391841888, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 232.77679443359375, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.7171528103086203, |
|
"kl": 0.0023651123046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0367, |
|
"reward": 0.7090138792991638, |
|
"reward_std": 0.1117062047123909, |
|
"rewards/length_reward": 0.06741072610020638, |
|
"rewards/similarity_reward": 0.6416031718254089, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 292.0446472167969, |
|
"epoch": 0.536, |
|
"grad_norm": 0.6795728974282024, |
|
"kl": 0.0023193359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0438, |
|
"reward": 0.6276125609874725, |
|
"reward_std": 0.09431122243404388, |
|
"rewards/length_reward": 0.06875000894069672, |
|
"rewards/similarity_reward": 0.5588625073432922, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 211.61161041259766, |
|
"epoch": 0.5386666666666666, |
|
"grad_norm": 0.9251828325921423, |
|
"kl": 0.0043487548828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0439, |
|
"reward": 0.5786231458187103, |
|
"reward_std": 0.1044333316385746, |
|
"rewards/length_reward": 0.058928582817316055, |
|
"rewards/similarity_reward": 0.5196945518255234, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 241.27680206298828, |
|
"epoch": 0.5413333333333333, |
|
"grad_norm": 0.6521091947128457, |
|
"kl": 0.00319671630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0361, |
|
"reward": 0.6666609346866608, |
|
"reward_std": 0.092064518481493, |
|
"rewards/length_reward": 0.07544643431901932, |
|
"rewards/similarity_reward": 0.5912144482135773, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 211.28125762939453, |
|
"epoch": 0.544, |
|
"grad_norm": 0.6743614543760093, |
|
"kl": 0.00234222412109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0135, |
|
"reward": 0.6388188004493713, |
|
"reward_std": 0.11344650015234947, |
|
"rewards/length_reward": 0.06785715371370316, |
|
"rewards/similarity_reward": 0.570961594581604, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 279.90179443359375, |
|
"epoch": 0.5466666666666666, |
|
"grad_norm": 0.6332153076352647, |
|
"kl": 0.00287628173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0423, |
|
"reward": 0.655907928943634, |
|
"reward_std": 0.0848810151219368, |
|
"rewards/length_reward": 0.07187500968575478, |
|
"rewards/similarity_reward": 0.5840329229831696, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 201.8169708251953, |
|
"epoch": 0.5493333333333333, |
|
"grad_norm": 0.85099814384088, |
|
"kl": 0.00351715087890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0037, |
|
"reward": 0.5818662643432617, |
|
"reward_std": 0.09155267104506493, |
|
"rewards/length_reward": 0.058482155203819275, |
|
"rewards/similarity_reward": 0.5233840942382812, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 265.27679443359375, |
|
"epoch": 0.552, |
|
"grad_norm": 0.6261335014996906, |
|
"kl": 0.00209808349609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0181, |
|
"reward": 0.7043133974075317, |
|
"reward_std": 0.09633800387382507, |
|
"rewards/length_reward": 0.06875001266598701, |
|
"rewards/similarity_reward": 0.6355634033679962, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 280.0669860839844, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 0.6804923498664105, |
|
"kl": 0.00217437744140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0146, |
|
"reward": 0.6569553017616272, |
|
"reward_std": 0.08773703873157501, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.585526704788208, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 274.3482208251953, |
|
"epoch": 0.5573333333333333, |
|
"grad_norm": 0.7277401805105727, |
|
"kl": 0.0023345947265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1005, |
|
"reward": 0.7013810873031616, |
|
"reward_std": 0.09931684285402298, |
|
"rewards/length_reward": 0.07455357909202576, |
|
"rewards/similarity_reward": 0.6268274784088135, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 260.08036041259766, |
|
"epoch": 0.56, |
|
"grad_norm": 0.7807615068386718, |
|
"kl": 0.002655029296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0773, |
|
"reward": 0.6310366690158844, |
|
"reward_std": 0.10352174565196037, |
|
"rewards/length_reward": 0.062053579837083817, |
|
"rewards/similarity_reward": 0.5689830929040909, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 259.75447845458984, |
|
"epoch": 0.5626666666666666, |
|
"grad_norm": 0.7093685270251532, |
|
"kl": 0.0039825439453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0714, |
|
"reward": 0.6279307305812836, |
|
"reward_std": 0.10738151893019676, |
|
"rewards/length_reward": 0.06607143767178059, |
|
"rewards/similarity_reward": 0.5618593096733093, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 219.70983123779297, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 0.8108428594239084, |
|
"kl": 0.00278472900390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0942, |
|
"reward": 0.6396916508674622, |
|
"reward_std": 0.10166310518980026, |
|
"rewards/length_reward": 0.06785715371370316, |
|
"rewards/similarity_reward": 0.5718345046043396, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 232.99555206298828, |
|
"epoch": 0.568, |
|
"grad_norm": 0.6621005503481664, |
|
"kl": 0.00279998779296875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0055, |
|
"reward": 0.6306145489215851, |
|
"reward_std": 0.06187950074672699, |
|
"rewards/length_reward": 0.058482153341174126, |
|
"rewards/similarity_reward": 0.5721323788166046, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 252.12947845458984, |
|
"epoch": 0.5706666666666667, |
|
"grad_norm": 0.5674408374580422, |
|
"kl": 0.00339508056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0245, |
|
"reward": 0.6192521750926971, |
|
"reward_std": 0.07048613205552101, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.5527342855930328, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 255.9732208251953, |
|
"epoch": 0.5733333333333334, |
|
"grad_norm": 0.6904920555987271, |
|
"kl": 0.00270843505859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0648, |
|
"reward": 0.6510066092014313, |
|
"reward_std": 0.056849006563425064, |
|
"rewards/length_reward": 0.0669642947614193, |
|
"rewards/similarity_reward": 0.5840422213077545, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 259.75001525878906, |
|
"epoch": 0.576, |
|
"grad_norm": 0.7131778652424658, |
|
"kl": 0.00354766845703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0514, |
|
"reward": 0.6677349805831909, |
|
"reward_std": 0.10058147087693214, |
|
"rewards/length_reward": 0.058035725727677345, |
|
"rewards/similarity_reward": 0.6096992194652557, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 259.1294708251953, |
|
"epoch": 0.5786666666666667, |
|
"grad_norm": 0.5695386420901992, |
|
"kl": 0.00286102294921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0224, |
|
"reward": 0.6971063017845154, |
|
"reward_std": 0.09981782734394073, |
|
"rewards/length_reward": 0.06741072610020638, |
|
"rewards/similarity_reward": 0.6296955347061157, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 236.69644927978516, |
|
"epoch": 0.5813333333333334, |
|
"grad_norm": 0.8176974415818429, |
|
"kl": 0.00765228271484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0594, |
|
"reward": 0.611341118812561, |
|
"reward_std": 0.10692798718810081, |
|
"rewards/length_reward": 0.06428572721779346, |
|
"rewards/similarity_reward": 0.5470553934574127, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 251.71875762939453, |
|
"epoch": 0.584, |
|
"grad_norm": 0.5850098610329177, |
|
"kl": 0.002689361572265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0162, |
|
"reward": 0.7100738286972046, |
|
"reward_std": 0.08857924118638039, |
|
"rewards/length_reward": 0.0714285783469677, |
|
"rewards/similarity_reward": 0.638645201921463, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 263.6875, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.5616868949162326, |
|
"kl": 0.00293731689453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.014, |
|
"reward": 0.7153567671775818, |
|
"reward_std": 0.1150759868323803, |
|
"rewards/length_reward": 0.07857143506407738, |
|
"rewards/similarity_reward": 0.6367852687835693, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 259.24108123779297, |
|
"epoch": 0.5893333333333334, |
|
"grad_norm": 0.6551918580682736, |
|
"kl": 0.0029296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.015, |
|
"reward": 0.734772652387619, |
|
"reward_std": 0.07534275390207767, |
|
"rewards/length_reward": 0.07410714775323868, |
|
"rewards/similarity_reward": 0.6606654524803162, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 258.8303680419922, |
|
"epoch": 0.592, |
|
"grad_norm": 0.7150710475065644, |
|
"kl": 0.00344085693359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0202, |
|
"reward": 0.6679186522960663, |
|
"reward_std": 0.10778569802641869, |
|
"rewards/length_reward": 0.06875000521540642, |
|
"rewards/similarity_reward": 0.5991686284542084, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 263.20091247558594, |
|
"epoch": 0.5946666666666667, |
|
"grad_norm": 0.6293755467193611, |
|
"kl": 0.00225067138671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0133, |
|
"reward": 0.7472398281097412, |
|
"reward_std": 0.09251785278320312, |
|
"rewards/length_reward": 0.07723214849829674, |
|
"rewards/similarity_reward": 0.6700076460838318, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 270.21876525878906, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 0.7206087593599326, |
|
"kl": 0.00225067138671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1129, |
|
"reward": 0.6448234021663666, |
|
"reward_std": 0.13600966706871986, |
|
"rewards/length_reward": 0.060714298859238625, |
|
"rewards/similarity_reward": 0.5841090679168701, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 277.3839416503906, |
|
"epoch": 0.6, |
|
"grad_norm": 0.7555717124296486, |
|
"kl": 0.003326416015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1094, |
|
"reward": 0.6888637244701385, |
|
"reward_std": 0.12441981956362724, |
|
"rewards/length_reward": 0.0669642947614193, |
|
"rewards/similarity_reward": 0.6218994557857513, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 288.4553680419922, |
|
"epoch": 0.6026666666666667, |
|
"grad_norm": 0.545561898466039, |
|
"kl": 0.00273895263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0106, |
|
"reward": 0.6741566956043243, |
|
"reward_std": 0.08976901695132256, |
|
"rewards/length_reward": 0.0825892873108387, |
|
"rewards/similarity_reward": 0.5915673971176147, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 291.31251525878906, |
|
"epoch": 0.6053333333333333, |
|
"grad_norm": 0.6862146524208239, |
|
"kl": 0.00433349609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0199, |
|
"reward": 0.7247207760810852, |
|
"reward_std": 0.10038316994905472, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.6555242836475372, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 237.46429443359375, |
|
"epoch": 0.608, |
|
"grad_norm": 0.6514945534975937, |
|
"kl": 0.00385284423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0598, |
|
"reward": 0.639589786529541, |
|
"reward_std": 0.08173046633601189, |
|
"rewards/length_reward": 0.06517858058214188, |
|
"rewards/similarity_reward": 0.5744112133979797, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 258.8393020629883, |
|
"epoch": 0.6106666666666667, |
|
"grad_norm": 0.5587705250798694, |
|
"kl": 0.001895904541015625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0066, |
|
"reward": 0.7333488166332245, |
|
"reward_std": 0.07437730580568314, |
|
"rewards/length_reward": 0.07946429029107094, |
|
"rewards/similarity_reward": 0.6538845300674438, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 237.80358123779297, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 0.7749006841963979, |
|
"kl": 0.00295257568359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0066, |
|
"reward": 0.6269674301147461, |
|
"reward_std": 0.09678621962666512, |
|
"rewards/length_reward": 0.06562500819563866, |
|
"rewards/similarity_reward": 0.5613424628973007, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 234.6116180419922, |
|
"epoch": 0.616, |
|
"grad_norm": 0.8857992900076056, |
|
"kl": 0.002620697021484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.036, |
|
"reward": 0.6997538506984711, |
|
"reward_std": 0.08686410635709763, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.6305573880672455, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 287.6339416503906, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 0.7389133589246519, |
|
"kl": 0.00316619873046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0551, |
|
"reward": 0.6583640575408936, |
|
"reward_std": 0.1155795156955719, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.5918461382389069, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 288.3482208251953, |
|
"epoch": 0.6213333333333333, |
|
"grad_norm": 0.5841788620661817, |
|
"kl": 0.003204345703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0042, |
|
"reward": 0.6696169376373291, |
|
"reward_std": 0.09103860706090927, |
|
"rewards/length_reward": 0.07410714775323868, |
|
"rewards/similarity_reward": 0.5955097377300262, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 277.43304443359375, |
|
"epoch": 0.624, |
|
"grad_norm": 0.5987965468099127, |
|
"kl": 0.00360107421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0078, |
|
"reward": 0.7439869344234467, |
|
"reward_std": 0.08124737069010735, |
|
"rewards/length_reward": 0.07991072162985802, |
|
"rewards/similarity_reward": 0.664076179265976, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 264.87501525878906, |
|
"epoch": 0.6266666666666667, |
|
"grad_norm": 0.6848960422881881, |
|
"kl": 0.002899169921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.044, |
|
"reward": 0.7305464148521423, |
|
"reward_std": 0.05242483504116535, |
|
"rewards/length_reward": 0.0691964440047741, |
|
"rewards/similarity_reward": 0.6613499820232391, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 282.09376525878906, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 0.6580896863492091, |
|
"kl": 0.0034637451171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0489, |
|
"reward": 0.671272337436676, |
|
"reward_std": 0.10606055706739426, |
|
"rewards/length_reward": 0.059375010430812836, |
|
"rewards/similarity_reward": 0.6118973195552826, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 255.2946548461914, |
|
"epoch": 0.632, |
|
"grad_norm": 0.6382622236431467, |
|
"kl": 0.00327301025390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0185, |
|
"reward": 0.7024641931056976, |
|
"reward_std": 0.09816553816199303, |
|
"rewards/length_reward": 0.07187501341104507, |
|
"rewards/similarity_reward": 0.6305891871452332, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 290.4375, |
|
"epoch": 0.6346666666666667, |
|
"grad_norm": 0.6729725697653197, |
|
"kl": 0.00388336181640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0427, |
|
"reward": 0.6770188510417938, |
|
"reward_std": 0.08811097219586372, |
|
"rewards/length_reward": 0.07232144102454185, |
|
"rewards/similarity_reward": 0.6046973466873169, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 232.1384048461914, |
|
"epoch": 0.6373333333333333, |
|
"grad_norm": 0.729361913476319, |
|
"kl": 0.0040283203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0698, |
|
"reward": 0.6408596038818359, |
|
"reward_std": 0.10725216567516327, |
|
"rewards/length_reward": 0.06741072237491608, |
|
"rewards/similarity_reward": 0.5734489262104034, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 284.1964416503906, |
|
"epoch": 0.64, |
|
"grad_norm": 0.6823808496445949, |
|
"kl": 0.0027923583984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0463, |
|
"reward": 0.6450429260730743, |
|
"reward_std": 0.10368061438202858, |
|
"rewards/length_reward": 0.0669642984867096, |
|
"rewards/similarity_reward": 0.57807856798172, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 260.84822845458984, |
|
"epoch": 0.6426666666666667, |
|
"grad_norm": 0.5701250379634731, |
|
"kl": 0.00244140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.025, |
|
"reward": 0.7130960524082184, |
|
"reward_std": 0.0966276079416275, |
|
"rewards/length_reward": 0.08125000074505806, |
|
"rewards/similarity_reward": 0.631846010684967, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 224.5669708251953, |
|
"epoch": 0.6453333333333333, |
|
"grad_norm": 0.8310327870734144, |
|
"kl": 0.0076446533203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0555, |
|
"reward": 0.5563714802265167, |
|
"reward_std": 0.12328523397445679, |
|
"rewards/length_reward": 0.054464295506477356, |
|
"rewards/similarity_reward": 0.5019071996212006, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 248.47322845458984, |
|
"epoch": 0.648, |
|
"grad_norm": 0.8542415499139471, |
|
"kl": 0.00330352783203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1058, |
|
"reward": 0.6085337102413177, |
|
"reward_std": 0.12796913087368011, |
|
"rewards/length_reward": 0.056696439161896706, |
|
"rewards/similarity_reward": 0.5518373250961304, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 230.2321548461914, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 0.8214724780628195, |
|
"kl": 0.0033111572265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0369, |
|
"reward": 0.6595875322818756, |
|
"reward_std": 0.07628211192786694, |
|
"rewards/length_reward": 0.0656250100582838, |
|
"rewards/similarity_reward": 0.5939625054597855, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 271.4419860839844, |
|
"epoch": 0.6533333333333333, |
|
"grad_norm": 0.5339131510223507, |
|
"kl": 0.0025787353515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0016, |
|
"reward": 0.7837041914463043, |
|
"reward_std": 0.07424943707883358, |
|
"rewards/length_reward": 0.07857143506407738, |
|
"rewards/similarity_reward": 0.7051327228546143, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 211.97322845458984, |
|
"epoch": 0.656, |
|
"grad_norm": 0.7281589894604906, |
|
"kl": 0.00310516357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0412, |
|
"reward": 0.6736421883106232, |
|
"reward_std": 0.08142560347914696, |
|
"rewards/length_reward": 0.07455357909202576, |
|
"rewards/similarity_reward": 0.5990886092185974, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 244.30805206298828, |
|
"epoch": 0.6586666666666666, |
|
"grad_norm": 0.6881733301939125, |
|
"kl": 0.002948760986328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0521, |
|
"reward": 0.7186852991580963, |
|
"reward_std": 0.07804353907704353, |
|
"rewards/length_reward": 0.07544643431901932, |
|
"rewards/similarity_reward": 0.6432388722896576, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 204.83929443359375, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 0.776497059513588, |
|
"kl": 0.00371551513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0183, |
|
"reward": 0.7021584212779999, |
|
"reward_std": 0.10252076759934425, |
|
"rewards/length_reward": 0.06562500819563866, |
|
"rewards/similarity_reward": 0.6365334391593933, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 248.63839721679688, |
|
"epoch": 0.664, |
|
"grad_norm": 0.6263280793364865, |
|
"kl": 0.0031890869140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0264, |
|
"reward": 0.7189670205116272, |
|
"reward_std": 0.07904865220189095, |
|
"rewards/length_reward": 0.07410715520381927, |
|
"rewards/similarity_reward": 0.6448598206043243, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 247.37501525878906, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.7055287610856288, |
|
"kl": 0.0048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0909, |
|
"reward": 0.6185382604598999, |
|
"reward_std": 0.10694902017712593, |
|
"rewards/length_reward": 0.06517858430743217, |
|
"rewards/similarity_reward": 0.553359717130661, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 247.83482360839844, |
|
"epoch": 0.6693333333333333, |
|
"grad_norm": 0.6610447004218395, |
|
"kl": 0.0035247802734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0112, |
|
"reward": 0.5839682221412659, |
|
"reward_std": 0.0852314792573452, |
|
"rewards/length_reward": 0.0647321529686451, |
|
"rewards/similarity_reward": 0.5192360877990723, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 247.12501525878906, |
|
"epoch": 0.672, |
|
"grad_norm": 0.6773638528799676, |
|
"kl": 0.00295257568359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0296, |
|
"reward": 0.6655312776565552, |
|
"reward_std": 0.10929644852876663, |
|
"rewards/length_reward": 0.060267869383096695, |
|
"rewards/similarity_reward": 0.6052633821964264, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 221.02679443359375, |
|
"epoch": 0.6746666666666666, |
|
"grad_norm": 0.7655042728512782, |
|
"kl": 0.00439453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0034, |
|
"reward": 0.6889495253562927, |
|
"reward_std": 0.10270040482282639, |
|
"rewards/length_reward": 0.07410715147852898, |
|
"rewards/similarity_reward": 0.6148423850536346, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 251.55804443359375, |
|
"epoch": 0.6773333333333333, |
|
"grad_norm": 0.6277014120461456, |
|
"kl": 0.00347900390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0195, |
|
"reward": 0.6920295655727386, |
|
"reward_std": 0.07308394275605679, |
|
"rewards/length_reward": 0.07276786491274834, |
|
"rewards/similarity_reward": 0.6192616820335388, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 247.24108123779297, |
|
"epoch": 0.68, |
|
"grad_norm": 0.8278631058587137, |
|
"kl": 0.0052032470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0801, |
|
"reward": 0.5797223746776581, |
|
"reward_std": 0.14392751455307007, |
|
"rewards/length_reward": 0.05446429364383221, |
|
"rewards/similarity_reward": 0.5252580344676971, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 283.0178680419922, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 0.6453549089492197, |
|
"kl": 0.002655029296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0299, |
|
"reward": 0.7264284193515778, |
|
"reward_std": 0.07789020985364914, |
|
"rewards/length_reward": 0.06964286416769028, |
|
"rewards/similarity_reward": 0.6567855477333069, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 299.46875, |
|
"epoch": 0.6853333333333333, |
|
"grad_norm": 0.5750252920299316, |
|
"kl": 0.00243377685546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0429, |
|
"reward": 0.7198934257030487, |
|
"reward_std": 0.06834491342306137, |
|
"rewards/length_reward": 0.07232143729925156, |
|
"rewards/similarity_reward": 0.6475720107555389, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 250.46429443359375, |
|
"epoch": 0.688, |
|
"grad_norm": 0.6989181650264392, |
|
"kl": 0.00299835205078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0029, |
|
"reward": 0.6974795460700989, |
|
"reward_std": 0.08845745399594307, |
|
"rewards/length_reward": 0.0691964365541935, |
|
"rewards/similarity_reward": 0.6282830834388733, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 249.0848388671875, |
|
"epoch": 0.6906666666666667, |
|
"grad_norm": 0.768500388572952, |
|
"kl": 0.0030670166015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.052, |
|
"reward": 0.6331368982791901, |
|
"reward_std": 0.09397462010383606, |
|
"rewards/length_reward": 0.06741072237491608, |
|
"rewards/similarity_reward": 0.5657261312007904, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 205.52233123779297, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.8190032734025428, |
|
"kl": 0.0068511962890625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0181, |
|
"reward": 0.6275625824928284, |
|
"reward_std": 0.0882430449128151, |
|
"rewards/length_reward": 0.062053583562374115, |
|
"rewards/similarity_reward": 0.5655089616775513, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 227.0134048461914, |
|
"epoch": 0.696, |
|
"grad_norm": 0.6819806218771027, |
|
"kl": 0.0030364990234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0396, |
|
"reward": 0.7068421840667725, |
|
"reward_std": 0.0854947492480278, |
|
"rewards/length_reward": 0.06741072610020638, |
|
"rewards/similarity_reward": 0.63943150639534, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 277.65625762939453, |
|
"epoch": 0.6986666666666667, |
|
"grad_norm": 0.6041036479677462, |
|
"kl": 0.002208709716796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0162, |
|
"reward": 0.7707177698612213, |
|
"reward_std": 0.060075582936406136, |
|
"rewards/length_reward": 0.08169642835855484, |
|
"rewards/similarity_reward": 0.6890212595462799, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 273.18750762939453, |
|
"epoch": 0.7013333333333334, |
|
"grad_norm": 0.6716948511043099, |
|
"kl": 0.00411224365234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0791, |
|
"reward": 0.6893300712108612, |
|
"reward_std": 0.08914920315146446, |
|
"rewards/length_reward": 0.07098215073347092, |
|
"rewards/similarity_reward": 0.6183479726314545, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 195.03125762939453, |
|
"epoch": 0.704, |
|
"grad_norm": 0.7288219165266566, |
|
"kl": 0.0046539306640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0365, |
|
"reward": 0.6311258971691132, |
|
"reward_std": 0.08954550698399544, |
|
"rewards/length_reward": 0.06741072610020638, |
|
"rewards/similarity_reward": 0.5637151300907135, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 230.5714340209961, |
|
"epoch": 0.7066666666666667, |
|
"grad_norm": 0.6590047515085815, |
|
"kl": 0.00440216064453125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0116, |
|
"reward": 0.6623952388763428, |
|
"reward_std": 0.07462666183710098, |
|
"rewards/length_reward": 0.06562501192092896, |
|
"rewards/similarity_reward": 0.5967701971530914, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 227.8616180419922, |
|
"epoch": 0.7093333333333334, |
|
"grad_norm": 0.6922227485667228, |
|
"kl": 0.00439453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0636, |
|
"reward": 0.6782627999782562, |
|
"reward_std": 0.11519866064190865, |
|
"rewards/length_reward": 0.061607154086232185, |
|
"rewards/similarity_reward": 0.6166556179523468, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 257.0714416503906, |
|
"epoch": 0.712, |
|
"grad_norm": 0.5520470941775752, |
|
"kl": 0.00386810302734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0104, |
|
"reward": 0.7550633549690247, |
|
"reward_std": 0.09709116071462631, |
|
"rewards/length_reward": 0.07008930295705795, |
|
"rewards/similarity_reward": 0.6849740147590637, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 232.32591247558594, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 0.7138805826191377, |
|
"kl": 0.003936767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0331, |
|
"reward": 0.669314980506897, |
|
"reward_std": 0.09859981015324593, |
|
"rewards/length_reward": 0.06517858244478703, |
|
"rewards/similarity_reward": 0.6041364073753357, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 237.11162567138672, |
|
"epoch": 0.7173333333333334, |
|
"grad_norm": 0.6150700165348351, |
|
"kl": 0.00328826904296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0028, |
|
"reward": 0.7752477526664734, |
|
"reward_std": 0.057502467185258865, |
|
"rewards/length_reward": 0.0781250037252903, |
|
"rewards/similarity_reward": 0.6971226334571838, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 254.75447845458984, |
|
"epoch": 0.72, |
|
"grad_norm": 0.7962475762237703, |
|
"kl": 0.00536346435546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0608, |
|
"reward": 0.6644669771194458, |
|
"reward_std": 0.10142809525132179, |
|
"rewards/length_reward": 0.056250009685754776, |
|
"rewards/similarity_reward": 0.6082169711589813, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 263.02679443359375, |
|
"epoch": 0.7226666666666667, |
|
"grad_norm": 0.6290318550051345, |
|
"kl": 0.00321197509765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0428, |
|
"reward": 0.6874348521232605, |
|
"reward_std": 0.08607123605906963, |
|
"rewards/length_reward": 0.0691964328289032, |
|
"rewards/similarity_reward": 0.6182384490966797, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 253.59375762939453, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 0.5631283015221202, |
|
"kl": 0.0034027099609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0079, |
|
"reward": 0.7192228138446808, |
|
"reward_std": 0.05196613632142544, |
|
"rewards/length_reward": 0.08392857387661934, |
|
"rewards/similarity_reward": 0.6352941989898682, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 275.14732360839844, |
|
"epoch": 0.728, |
|
"grad_norm": 0.5719219206376979, |
|
"kl": 0.00231170654296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0147, |
|
"reward": 0.7571655213832855, |
|
"reward_std": 0.07733196392655373, |
|
"rewards/length_reward": 0.07678572088479996, |
|
"rewards/similarity_reward": 0.6803797781467438, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 235.5982208251953, |
|
"epoch": 0.7306666666666667, |
|
"grad_norm": 0.694430050013244, |
|
"kl": 0.0037841796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0622, |
|
"reward": 0.7187261283397675, |
|
"reward_std": 0.11743078380823135, |
|
"rewards/length_reward": 0.0669642947614193, |
|
"rewards/similarity_reward": 0.6517618000507355, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 284.7678680419922, |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 0.5786627728822946, |
|
"kl": 0.00365447998046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0085, |
|
"reward": 0.7249119877815247, |
|
"reward_std": 0.06474133767187595, |
|
"rewards/length_reward": 0.07633929327130318, |
|
"rewards/similarity_reward": 0.648572713136673, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 268.0580596923828, |
|
"epoch": 0.736, |
|
"grad_norm": 0.554118778302666, |
|
"kl": 0.0027008056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0133, |
|
"reward": 0.7421092092990875, |
|
"reward_std": 0.06751231849193573, |
|
"rewards/length_reward": 0.07544643431901932, |
|
"rewards/similarity_reward": 0.6666627824306488, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 191.9866180419922, |
|
"epoch": 0.7386666666666667, |
|
"grad_norm": 0.7484110778385685, |
|
"kl": 0.0081329345703125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0117, |
|
"reward": 0.5663988292217255, |
|
"reward_std": 0.11341952532529831, |
|
"rewards/length_reward": 0.055357156321406364, |
|
"rewards/similarity_reward": 0.5110416412353516, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 262.1428680419922, |
|
"epoch": 0.7413333333333333, |
|
"grad_norm": 0.6308341820211528, |
|
"kl": 0.003448486328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0113, |
|
"reward": 0.6712258756160736, |
|
"reward_std": 0.10108800232410431, |
|
"rewards/length_reward": 0.0691964440047741, |
|
"rewards/similarity_reward": 0.6020293831825256, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 257.31251525878906, |
|
"epoch": 0.744, |
|
"grad_norm": 0.7444571736292193, |
|
"kl": 0.0075836181640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0194, |
|
"reward": 0.6420714855194092, |
|
"reward_std": 0.10531632602214813, |
|
"rewards/length_reward": 0.06830358505249023, |
|
"rewards/similarity_reward": 0.5737679302692413, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 273.0714416503906, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.6772210722840282, |
|
"kl": 0.0051116943359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0147, |
|
"reward": 0.6341734230518341, |
|
"reward_std": 0.0923330970108509, |
|
"rewards/length_reward": 0.07321428880095482, |
|
"rewards/similarity_reward": 0.5609591007232666, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 270.4151840209961, |
|
"epoch": 0.7493333333333333, |
|
"grad_norm": 0.6993063816092533, |
|
"kl": 0.0049591064453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0501, |
|
"reward": 0.6939212083816528, |
|
"reward_std": 0.10334672406315804, |
|
"rewards/length_reward": 0.0736607201397419, |
|
"rewards/similarity_reward": 0.6202605366706848, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 233.01339721679688, |
|
"epoch": 0.752, |
|
"grad_norm": 0.7285449434821649, |
|
"kl": 0.0037841796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.013, |
|
"reward": 0.6847032904624939, |
|
"reward_std": 0.06913377530872822, |
|
"rewards/length_reward": 0.06651786714792252, |
|
"rewards/similarity_reward": 0.618185430765152, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 238.3482208251953, |
|
"epoch": 0.7546666666666667, |
|
"grad_norm": 0.90960398931583, |
|
"kl": 0.0053253173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.019, |
|
"reward": 0.5968413352966309, |
|
"reward_std": 0.12051042169332504, |
|
"rewards/length_reward": 0.058035727590322495, |
|
"rewards/similarity_reward": 0.5388056635856628, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 296.4821472167969, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 0.6037765863353864, |
|
"kl": 0.004730224609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0322, |
|
"reward": 0.6755504310131073, |
|
"reward_std": 0.06930392794311047, |
|
"rewards/length_reward": 0.07053572684526443, |
|
"rewards/similarity_reward": 0.6050147414207458, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 235.1339340209961, |
|
"epoch": 0.76, |
|
"grad_norm": 0.5442699124141502, |
|
"kl": 0.002349853515625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0114, |
|
"reward": 0.7083780765533447, |
|
"reward_std": 0.06875683926045895, |
|
"rewards/length_reward": 0.07633928954601288, |
|
"rewards/similarity_reward": 0.632038801908493, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 259.0401916503906, |
|
"epoch": 0.7626666666666667, |
|
"grad_norm": 0.7121048521613264, |
|
"kl": 0.00444793701171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0064, |
|
"reward": 0.6680812537670135, |
|
"reward_std": 0.077706228941679, |
|
"rewards/length_reward": 0.06607143953442574, |
|
"rewards/similarity_reward": 0.6020097434520721, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 270.7366256713867, |
|
"epoch": 0.7653333333333333, |
|
"grad_norm": 0.6208584555138446, |
|
"kl": 0.003662109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.015, |
|
"reward": 0.706127256155014, |
|
"reward_std": 0.11445373296737671, |
|
"rewards/length_reward": 0.07053572684526443, |
|
"rewards/similarity_reward": 0.635591596364975, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 250.53573608398438, |
|
"epoch": 0.768, |
|
"grad_norm": 0.7385667089013174, |
|
"kl": 0.0044708251953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.032, |
|
"reward": 0.6525153517723083, |
|
"reward_std": 0.08994012698531151, |
|
"rewards/length_reward": 0.07053572311997414, |
|
"rewards/similarity_reward": 0.5819795727729797, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 271.4241180419922, |
|
"epoch": 0.7706666666666667, |
|
"grad_norm": 0.571768569159425, |
|
"kl": 0.00421142578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0514, |
|
"reward": 0.6859732568264008, |
|
"reward_std": 0.10971596091985703, |
|
"rewards/length_reward": 0.07410715147852898, |
|
"rewards/similarity_reward": 0.6118661463260651, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 279.0089416503906, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 0.5580184118619138, |
|
"kl": 0.00385284423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0172, |
|
"reward": 0.7247184813022614, |
|
"reward_std": 0.11090399697422981, |
|
"rewards/length_reward": 0.07232144102454185, |
|
"rewards/similarity_reward": 0.652397096157074, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 268.5357208251953, |
|
"epoch": 0.776, |
|
"grad_norm": 0.6568855951756748, |
|
"kl": 0.0045928955078125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0124, |
|
"reward": 0.6895658373832703, |
|
"reward_std": 0.09936371073126793, |
|
"rewards/length_reward": 0.07321429252624512, |
|
"rewards/similarity_reward": 0.6163516044616699, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 290.41072845458984, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 0.5426211050313083, |
|
"kl": 0.0032196044921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.019, |
|
"reward": 0.7200377881526947, |
|
"reward_std": 0.06104239821434021, |
|
"rewards/length_reward": 0.07187500968575478, |
|
"rewards/similarity_reward": 0.6481626629829407, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 246.53572845458984, |
|
"epoch": 0.7813333333333333, |
|
"grad_norm": 0.7741354483614902, |
|
"kl": 0.0047454833984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.05, |
|
"reward": 0.6441958248615265, |
|
"reward_std": 0.13777245581150055, |
|
"rewards/length_reward": 0.06428572908043861, |
|
"rewards/similarity_reward": 0.5799101293087006, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 282.1339416503906, |
|
"epoch": 0.784, |
|
"grad_norm": 0.6078928600918315, |
|
"kl": 0.0030059814453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0654, |
|
"reward": 0.7353447079658508, |
|
"reward_std": 0.11103275418281555, |
|
"rewards/length_reward": 0.0736607238650322, |
|
"rewards/similarity_reward": 0.6616839170455933, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 277.2723388671875, |
|
"epoch": 0.7866666666666666, |
|
"grad_norm": 0.6681511715620417, |
|
"kl": 0.0036773681640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0328, |
|
"reward": 0.6674771308898926, |
|
"reward_std": 0.0996098667383194, |
|
"rewards/length_reward": 0.07321429252624512, |
|
"rewards/similarity_reward": 0.5942628383636475, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 239.18304443359375, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 0.587894018606086, |
|
"kl": 0.0035858154296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0416, |
|
"reward": 0.6579557359218597, |
|
"reward_std": 0.08152700588107109, |
|
"rewards/length_reward": 0.06919643469154835, |
|
"rewards/similarity_reward": 0.5887593328952789, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 234.94197845458984, |
|
"epoch": 0.792, |
|
"grad_norm": 0.6099426936791735, |
|
"kl": 0.0045318603515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0133, |
|
"reward": 0.6847122013568878, |
|
"reward_std": 0.1407642886042595, |
|
"rewards/length_reward": 0.06830357573926449, |
|
"rewards/similarity_reward": 0.6164086163043976, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 238.58483123779297, |
|
"epoch": 0.7946666666666666, |
|
"grad_norm": 0.7568584099378661, |
|
"kl": 0.0032196044921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0816, |
|
"reward": 0.7135740518569946, |
|
"reward_std": 0.08400523662567139, |
|
"rewards/length_reward": 0.07678572088479996, |
|
"rewards/similarity_reward": 0.63678839802742, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 265.5178680419922, |
|
"epoch": 0.7973333333333333, |
|
"grad_norm": 0.559632299831719, |
|
"kl": 0.00363922119140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0075, |
|
"reward": 0.7292493581771851, |
|
"reward_std": 0.06625411659479141, |
|
"rewards/length_reward": 0.07544643431901932, |
|
"rewards/similarity_reward": 0.6538029313087463, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 266.7366256713867, |
|
"epoch": 0.8, |
|
"grad_norm": 0.6245003701307471, |
|
"kl": 0.00376129150390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0309, |
|
"reward": 0.6834622323513031, |
|
"reward_std": 0.09874312952160835, |
|
"rewards/length_reward": 0.07053572684526443, |
|
"rewards/similarity_reward": 0.6129264831542969, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 268.4464416503906, |
|
"epoch": 0.8026666666666666, |
|
"grad_norm": 0.6001886392680107, |
|
"kl": 0.00341796875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0133, |
|
"reward": 0.7123071253299713, |
|
"reward_std": 0.05348840728402138, |
|
"rewards/length_reward": 0.06741072982549667, |
|
"rewards/similarity_reward": 0.6448963582515717, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 226.2321548461914, |
|
"epoch": 0.8053333333333333, |
|
"grad_norm": 0.6336093254745343, |
|
"kl": 0.00481414794921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.011, |
|
"reward": 0.6465486586093903, |
|
"reward_std": 0.09404471330344677, |
|
"rewards/length_reward": 0.06607143767178059, |
|
"rewards/similarity_reward": 0.5804772675037384, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 269.90179443359375, |
|
"epoch": 0.808, |
|
"grad_norm": 0.59634105150443, |
|
"kl": 0.00426483154296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0036, |
|
"reward": 0.6335479617118835, |
|
"reward_std": 0.0826010163873434, |
|
"rewards/length_reward": 0.07500001415610313, |
|
"rewards/similarity_reward": 0.5585480332374573, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 277.3571472167969, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 0.6002262528664385, |
|
"kl": 0.00386810302734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.081, |
|
"reward": 0.6462258398532867, |
|
"reward_std": 0.10935888066887856, |
|
"rewards/length_reward": 0.06875000894069672, |
|
"rewards/similarity_reward": 0.5774758458137512, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 291.7276916503906, |
|
"epoch": 0.8133333333333334, |
|
"grad_norm": 0.6664322809544898, |
|
"kl": 0.0032958984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0743, |
|
"reward": 0.754040002822876, |
|
"reward_std": 0.07836959138512611, |
|
"rewards/length_reward": 0.07455357536673546, |
|
"rewards/similarity_reward": 0.6794863939285278, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 256.62501525878906, |
|
"epoch": 0.816, |
|
"grad_norm": 0.6918122711031781, |
|
"kl": 0.003997802734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0019, |
|
"reward": 0.651542603969574, |
|
"reward_std": 0.11015984788537025, |
|
"rewards/length_reward": 0.06383929774165154, |
|
"rewards/similarity_reward": 0.587703287601471, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 269.5803680419922, |
|
"epoch": 0.8186666666666667, |
|
"grad_norm": 0.6835757950482498, |
|
"kl": 0.0058746337890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0137, |
|
"reward": 0.6834548711776733, |
|
"reward_std": 0.09235357493162155, |
|
"rewards/length_reward": 0.06517858058214188, |
|
"rewards/similarity_reward": 0.6182762682437897, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 249.43751525878906, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 0.6594580830657123, |
|
"kl": 0.00449371337890625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0019, |
|
"reward": 0.6838014125823975, |
|
"reward_std": 0.12604531273245811, |
|
"rewards/length_reward": 0.07008929550647736, |
|
"rewards/similarity_reward": 0.6137120425701141, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 272.4151916503906, |
|
"epoch": 0.824, |
|
"grad_norm": 0.616617412578935, |
|
"kl": 0.00392913818359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0372, |
|
"reward": 0.7449055016040802, |
|
"reward_std": 0.08290699683129787, |
|
"rewards/length_reward": 0.07276786491274834, |
|
"rewards/similarity_reward": 0.6721376478672028, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 255.8660888671875, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 0.6324884031752415, |
|
"kl": 0.00347900390625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.014, |
|
"reward": 0.6722660958766937, |
|
"reward_std": 0.08029109984636307, |
|
"rewards/length_reward": 0.07232143729925156, |
|
"rewards/similarity_reward": 0.5999446511268616, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 229.05358123779297, |
|
"epoch": 0.8293333333333334, |
|
"grad_norm": 0.8069840909052901, |
|
"kl": 0.00518798828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0471, |
|
"reward": 0.6889167726039886, |
|
"reward_std": 0.10098548233509064, |
|
"rewards/length_reward": 0.0714285783469677, |
|
"rewards/similarity_reward": 0.6174881756305695, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 244.28125762939453, |
|
"epoch": 0.832, |
|
"grad_norm": 0.6703576381079311, |
|
"kl": 0.00594329833984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0115, |
|
"reward": 0.7172508239746094, |
|
"reward_std": 0.07987817749381065, |
|
"rewards/length_reward": 0.07008929178118706, |
|
"rewards/similarity_reward": 0.647161453962326, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 235.50000762939453, |
|
"epoch": 0.8346666666666667, |
|
"grad_norm": 0.6595218129039281, |
|
"kl": 0.00487518310546875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0028, |
|
"reward": 0.661961704492569, |
|
"reward_std": 0.09113451465964317, |
|
"rewards/length_reward": 0.06607143767178059, |
|
"rewards/similarity_reward": 0.5958903431892395, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 219.86608123779297, |
|
"epoch": 0.8373333333333334, |
|
"grad_norm": 0.7020826871224611, |
|
"kl": 0.0062713623046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0162, |
|
"reward": 0.6874544024467468, |
|
"reward_std": 0.07596220076084137, |
|
"rewards/length_reward": 0.0647321566939354, |
|
"rewards/similarity_reward": 0.6227222084999084, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 264.8616180419922, |
|
"epoch": 0.84, |
|
"grad_norm": 0.586765871425098, |
|
"kl": 0.00507354736328125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0005, |
|
"reward": 0.7921656966209412, |
|
"reward_std": 0.0773557759821415, |
|
"rewards/length_reward": 0.0781250074505806, |
|
"rewards/similarity_reward": 0.7140407264232635, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 268.8348388671875, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 0.556140838025419, |
|
"kl": 0.004425048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0045, |
|
"reward": 0.7406611740589142, |
|
"reward_std": 0.07832159847021103, |
|
"rewards/length_reward": 0.07232143729925156, |
|
"rewards/similarity_reward": 0.668339729309082, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 286.6696548461914, |
|
"epoch": 0.8453333333333334, |
|
"grad_norm": 0.6317684457245647, |
|
"kl": 0.00397491455078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0177, |
|
"reward": 0.791156679391861, |
|
"reward_std": 0.06108024716377258, |
|
"rewards/length_reward": 0.0781250074505806, |
|
"rewards/similarity_reward": 0.7130315899848938, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 223.43305206298828, |
|
"epoch": 0.848, |
|
"grad_norm": 0.7967746076439991, |
|
"kl": 0.0052642822265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0238, |
|
"reward": 0.7096598744392395, |
|
"reward_std": 0.08100546710193157, |
|
"rewards/length_reward": 0.07098214700818062, |
|
"rewards/similarity_reward": 0.6386776566505432, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 248.56697845458984, |
|
"epoch": 0.8506666666666667, |
|
"grad_norm": 0.6967301959645911, |
|
"kl": 0.004791259765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0788, |
|
"reward": 0.712384045124054, |
|
"reward_std": 0.08104405552148819, |
|
"rewards/length_reward": 0.07633929327130318, |
|
"rewards/similarity_reward": 0.6360447704792023, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 253.03126525878906, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.6240879358111411, |
|
"kl": 0.0043487548828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0052, |
|
"reward": 0.6519544422626495, |
|
"reward_std": 0.0803583487868309, |
|
"rewards/length_reward": 0.0691964440047741, |
|
"rewards/similarity_reward": 0.5827580094337463, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 242.95091247558594, |
|
"epoch": 0.856, |
|
"grad_norm": 0.549313108944523, |
|
"kl": 0.0028839111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0208, |
|
"reward": 0.7446077167987823, |
|
"reward_std": 0.09269357472658157, |
|
"rewards/length_reward": 0.07410714775323868, |
|
"rewards/similarity_reward": 0.6705006062984467, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 224.89286041259766, |
|
"epoch": 0.8586666666666667, |
|
"grad_norm": 0.7316053385982602, |
|
"kl": 0.00386810302734375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0086, |
|
"reward": 0.6810529232025146, |
|
"reward_std": 0.06378122419118881, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.6096242964267731, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 236.2009048461914, |
|
"epoch": 0.8613333333333333, |
|
"grad_norm": 0.6760501533209452, |
|
"kl": 0.0053558349609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0282, |
|
"reward": 0.6928089559078217, |
|
"reward_std": 0.07963042706251144, |
|
"rewards/length_reward": 0.07410715147852898, |
|
"rewards/similarity_reward": 0.6187017858028412, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 241.27233123779297, |
|
"epoch": 0.864, |
|
"grad_norm": 0.7174681952375248, |
|
"kl": 0.0048065185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0184, |
|
"reward": 0.7107245922088623, |
|
"reward_std": 0.07196836359798908, |
|
"rewards/length_reward": 0.07321429252624512, |
|
"rewards/similarity_reward": 0.6375102698802948, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 219.32144165039062, |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 0.6619044506684808, |
|
"kl": 0.005645751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0104, |
|
"reward": 0.6078022122383118, |
|
"reward_std": 0.09693693742156029, |
|
"rewards/length_reward": 0.07098215073347092, |
|
"rewards/similarity_reward": 0.5368200242519379, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 228.84375762939453, |
|
"epoch": 0.8693333333333333, |
|
"grad_norm": 0.6998337254787387, |
|
"kl": 0.00328826904296875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0045, |
|
"reward": 0.7553087770938873, |
|
"reward_std": 0.07146935909986496, |
|
"rewards/length_reward": 0.0736607275903225, |
|
"rewards/similarity_reward": 0.6816481053829193, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 288.2901916503906, |
|
"epoch": 0.872, |
|
"grad_norm": 0.6156275180911798, |
|
"kl": 0.00383758544921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0138, |
|
"reward": 0.7228290438652039, |
|
"reward_std": 0.08385154232382774, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.6514004170894623, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 260.9821472167969, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 0.6463359471511301, |
|
"kl": 0.0041351318359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0624, |
|
"reward": 0.7054339051246643, |
|
"reward_std": 0.08808758854866028, |
|
"rewards/length_reward": 0.06339286640286446, |
|
"rewards/similarity_reward": 0.6420409977436066, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 247.34376525878906, |
|
"epoch": 0.8773333333333333, |
|
"grad_norm": 0.7642501038807545, |
|
"kl": 0.00417327880859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0269, |
|
"reward": 0.6869321167469025, |
|
"reward_std": 0.06861887872219086, |
|
"rewards/length_reward": 0.06741072982549667, |
|
"rewards/similarity_reward": 0.61952143907547, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 267.3928756713867, |
|
"epoch": 0.88, |
|
"grad_norm": 0.6571448003835118, |
|
"kl": 0.0063934326171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0592, |
|
"reward": 0.6967031061649323, |
|
"reward_std": 0.08689341694116592, |
|
"rewards/length_reward": 0.07455357909202576, |
|
"rewards/similarity_reward": 0.6221494972705841, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 251.84822845458984, |
|
"epoch": 0.8826666666666667, |
|
"grad_norm": 0.6160126783055467, |
|
"kl": 0.00362396240234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0047, |
|
"reward": 0.6543110907077789, |
|
"reward_std": 0.08520649373531342, |
|
"rewards/length_reward": 0.06562501192092896, |
|
"rewards/similarity_reward": 0.58868607878685, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 277.6384048461914, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 0.6951712149387916, |
|
"kl": 0.00399017333984375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0197, |
|
"reward": 0.6851289868354797, |
|
"reward_std": 0.09784185141324997, |
|
"rewards/length_reward": 0.07232143357396126, |
|
"rewards/similarity_reward": 0.6128075420856476, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 256.7366180419922, |
|
"epoch": 0.888, |
|
"grad_norm": 0.5992862799755034, |
|
"kl": 0.004791259765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0491, |
|
"reward": 0.6802008152008057, |
|
"reward_std": 0.09179921820759773, |
|
"rewards/length_reward": 0.07008929178118706, |
|
"rewards/similarity_reward": 0.6101114749908447, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 215.21430206298828, |
|
"epoch": 0.8906666666666667, |
|
"grad_norm": 0.709856632058597, |
|
"kl": 0.0052032470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0124, |
|
"reward": 0.692722350358963, |
|
"reward_std": 0.08042297512292862, |
|
"rewards/length_reward": 0.060714298859238625, |
|
"rewards/similarity_reward": 0.6320080757141113, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 244.75001525878906, |
|
"epoch": 0.8933333333333333, |
|
"grad_norm": 0.623193882721459, |
|
"kl": 0.00424957275390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0358, |
|
"reward": 0.7010281980037689, |
|
"reward_std": 0.08354394137859344, |
|
"rewards/length_reward": 0.07098215445876122, |
|
"rewards/similarity_reward": 0.630046010017395, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 222.87500762939453, |
|
"epoch": 0.896, |
|
"grad_norm": 0.7264124980613719, |
|
"kl": 0.0045928955078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0194, |
|
"reward": 0.6454309821128845, |
|
"reward_std": 0.10383091494441032, |
|
"rewards/length_reward": 0.06562501192092896, |
|
"rewards/similarity_reward": 0.579805999994278, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 260.1696548461914, |
|
"epoch": 0.8986666666666666, |
|
"grad_norm": 0.5907324344548502, |
|
"kl": 0.0032806396484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0162, |
|
"reward": 0.6906337440013885, |
|
"reward_std": 0.06872132420539856, |
|
"rewards/length_reward": 0.07723214477300644, |
|
"rewards/similarity_reward": 0.6134015619754791, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 245.22322845458984, |
|
"epoch": 0.9013333333333333, |
|
"grad_norm": 0.6126450019945038, |
|
"kl": 0.00359344482421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0172, |
|
"reward": 0.7013856172561646, |
|
"reward_std": 0.07948705554008484, |
|
"rewards/length_reward": 0.07678572088479996, |
|
"rewards/similarity_reward": 0.6245998740196228, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 250.85269927978516, |
|
"epoch": 0.904, |
|
"grad_norm": 0.7156180383403373, |
|
"kl": 0.00537109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0313, |
|
"reward": 0.6530185341835022, |
|
"reward_std": 0.0950470082461834, |
|
"rewards/length_reward": 0.07187500596046448, |
|
"rewards/similarity_reward": 0.5811434835195541, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 253.91964721679688, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.5742388325200097, |
|
"kl": 0.004302978515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.01, |
|
"reward": 0.6900386214256287, |
|
"reward_std": 0.06002875231206417, |
|
"rewards/length_reward": 0.07991071790456772, |
|
"rewards/similarity_reward": 0.6101278960704803, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 214.0446548461914, |
|
"epoch": 0.9093333333333333, |
|
"grad_norm": 0.7992075593440748, |
|
"kl": 0.0047760009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.017, |
|
"reward": 0.6177321374416351, |
|
"reward_std": 0.0978124737739563, |
|
"rewards/length_reward": 0.058482153341174126, |
|
"rewards/similarity_reward": 0.559249997138977, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 244.4732208251953, |
|
"epoch": 0.912, |
|
"grad_norm": 0.8258653848140411, |
|
"kl": 0.0045623779296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0631, |
|
"reward": 0.6902224123477936, |
|
"reward_std": 0.08696568384766579, |
|
"rewards/length_reward": 0.061607154086232185, |
|
"rewards/similarity_reward": 0.6286152601242065, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 217.93304443359375, |
|
"epoch": 0.9146666666666666, |
|
"grad_norm": 0.6997396550994213, |
|
"kl": 0.005523681640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0019, |
|
"reward": 0.6602669954299927, |
|
"reward_std": 0.09806296601891518, |
|
"rewards/length_reward": 0.06428572349250317, |
|
"rewards/similarity_reward": 0.5959812104701996, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 285.45538330078125, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 0.6020909063855457, |
|
"kl": 0.00421142578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0088, |
|
"reward": 0.6885746419429779, |
|
"reward_std": 0.09227034822106361, |
|
"rewards/length_reward": 0.07232143729925156, |
|
"rewards/similarity_reward": 0.6162531673908234, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 260.15179443359375, |
|
"epoch": 0.92, |
|
"grad_norm": 0.8278574941445127, |
|
"kl": 0.0059051513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1217, |
|
"reward": 0.6081913709640503, |
|
"reward_std": 0.18488339334726334, |
|
"rewards/length_reward": 0.058482155203819275, |
|
"rewards/similarity_reward": 0.5497092008590698, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 272.45091247558594, |
|
"epoch": 0.9226666666666666, |
|
"grad_norm": 0.5703333877662986, |
|
"kl": 0.00386810302734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.021, |
|
"reward": 0.7158257365226746, |
|
"reward_std": 0.08523573726415634, |
|
"rewards/length_reward": 0.07991071790456772, |
|
"rewards/similarity_reward": 0.6359150111675262, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 291.5714416503906, |
|
"epoch": 0.9253333333333333, |
|
"grad_norm": 0.664603610390973, |
|
"kl": 0.00350189208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0368, |
|
"reward": 0.7160574495792389, |
|
"reward_std": 0.0684958528727293, |
|
"rewards/length_reward": 0.06651787087321281, |
|
"rewards/similarity_reward": 0.6495395302772522, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 301.0357360839844, |
|
"epoch": 0.928, |
|
"grad_norm": 0.5288343523218547, |
|
"kl": 0.00455474853515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.015, |
|
"reward": 0.7117947041988373, |
|
"reward_std": 0.10228880494832993, |
|
"rewards/length_reward": 0.06741072237491608, |
|
"rewards/similarity_reward": 0.6443840265274048, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 260.44197845458984, |
|
"epoch": 0.9306666666666666, |
|
"grad_norm": 0.6372539241149925, |
|
"kl": 0.00432586669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.031, |
|
"reward": 0.7437091171741486, |
|
"reward_std": 0.07142951153218746, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.6745127141475677, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 266.7232208251953, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.7417544957869086, |
|
"kl": 0.003936767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0914, |
|
"reward": 0.7335431277751923, |
|
"reward_std": 0.10938547924160957, |
|
"rewards/length_reward": 0.07321429252624512, |
|
"rewards/similarity_reward": 0.6603288650512695, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 263.49554443359375, |
|
"epoch": 0.936, |
|
"grad_norm": 0.7076854337089484, |
|
"kl": 0.008270263671875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0148, |
|
"reward": 0.6161931157112122, |
|
"reward_std": 0.12820692360401154, |
|
"rewards/length_reward": 0.06339287385344505, |
|
"rewards/similarity_reward": 0.5528002381324768, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 263.9866180419922, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 0.5813263791328529, |
|
"kl": 0.00344085693359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0071, |
|
"reward": 0.6747315526008606, |
|
"reward_std": 0.09237341955304146, |
|
"rewards/length_reward": 0.06607143953442574, |
|
"rewards/similarity_reward": 0.6086601614952087, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 242.46875762939453, |
|
"epoch": 0.9413333333333334, |
|
"grad_norm": 0.6091815171465548, |
|
"kl": 0.0047607421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0248, |
|
"reward": 0.6233691573143005, |
|
"reward_std": 0.12349110096693039, |
|
"rewards/length_reward": 0.0647321529686451, |
|
"rewards/similarity_reward": 0.5586370527744293, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 265.0089416503906, |
|
"epoch": 0.944, |
|
"grad_norm": 0.5824155704971612, |
|
"kl": 0.00321197509765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0139, |
|
"reward": 0.7225279808044434, |
|
"reward_std": 0.0762885119765997, |
|
"rewards/length_reward": 0.0714285857975483, |
|
"rewards/similarity_reward": 0.651099443435669, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 276.56697845458984, |
|
"epoch": 0.9466666666666667, |
|
"grad_norm": 0.5410198221272389, |
|
"kl": 0.0042724609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0064, |
|
"reward": 0.7190838754177094, |
|
"reward_std": 0.11137175559997559, |
|
"rewards/length_reward": 0.07544643431901932, |
|
"rewards/similarity_reward": 0.6436374485492706, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 299.20982360839844, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 0.5828251751367955, |
|
"kl": 0.0042877197265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0076, |
|
"reward": 0.7062688171863556, |
|
"reward_std": 0.0902559757232666, |
|
"rewards/length_reward": 0.0781250037252903, |
|
"rewards/similarity_reward": 0.6281438171863556, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 261.5714416503906, |
|
"epoch": 0.952, |
|
"grad_norm": 0.662401613193247, |
|
"kl": 0.00417327880859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1072, |
|
"reward": 0.6910728216171265, |
|
"reward_std": 0.12025736272335052, |
|
"rewards/length_reward": 0.06517858058214188, |
|
"rewards/similarity_reward": 0.6258941888809204, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 273.22322845458984, |
|
"epoch": 0.9546666666666667, |
|
"grad_norm": 0.5897487229990825, |
|
"kl": 0.0035400390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0006, |
|
"reward": 0.7075876295566559, |
|
"reward_std": 0.06886249966919422, |
|
"rewards/length_reward": 0.07098215445876122, |
|
"rewards/similarity_reward": 0.6366054713726044, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 287.7634048461914, |
|
"epoch": 0.9573333333333334, |
|
"grad_norm": 0.5282572684933031, |
|
"kl": 0.00276947021484375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0016, |
|
"reward": 0.7663991749286652, |
|
"reward_std": 0.08505138382315636, |
|
"rewards/length_reward": 0.07767857611179352, |
|
"rewards/similarity_reward": 0.6887206435203552, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 245.16519165039062, |
|
"epoch": 0.96, |
|
"grad_norm": 0.6891670466809943, |
|
"kl": 0.00555419921875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0045, |
|
"reward": 0.6559944748878479, |
|
"reward_std": 0.10601956769824028, |
|
"rewards/length_reward": 0.0691964402794838, |
|
"rewards/similarity_reward": 0.5867980420589447, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 270.0357208251953, |
|
"epoch": 0.9626666666666667, |
|
"grad_norm": 0.5428726789809684, |
|
"kl": 0.0041961669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0106, |
|
"reward": 0.6817942559719086, |
|
"reward_std": 0.07338645495474339, |
|
"rewards/length_reward": 0.07767857611179352, |
|
"rewards/similarity_reward": 0.6041156351566315, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 243.84376525878906, |
|
"epoch": 0.9653333333333334, |
|
"grad_norm": 0.643839801785035, |
|
"kl": 0.0121307373046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0103, |
|
"reward": 0.6772570908069611, |
|
"reward_std": 0.06569460779428482, |
|
"rewards/length_reward": 0.06964286789298058, |
|
"rewards/similarity_reward": 0.6076142191886902, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 242.50894165039062, |
|
"epoch": 0.968, |
|
"grad_norm": 0.5946163970209724, |
|
"kl": 0.00385284423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0095, |
|
"reward": 0.684212863445282, |
|
"reward_std": 0.09431854262948036, |
|
"rewards/length_reward": 0.07455357909202576, |
|
"rewards/similarity_reward": 0.6096592247486115, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 266.8080520629883, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 0.5952393619644073, |
|
"kl": 0.0045318603515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0054, |
|
"reward": 0.6641288101673126, |
|
"reward_std": 0.08274504169821739, |
|
"rewards/length_reward": 0.06651787459850311, |
|
"rewards/similarity_reward": 0.5976109206676483, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 280.81251525878906, |
|
"epoch": 0.9733333333333334, |
|
"grad_norm": 0.704408476901472, |
|
"kl": 0.006378173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0302, |
|
"reward": 0.6689063608646393, |
|
"reward_std": 0.06241089664399624, |
|
"rewards/length_reward": 0.06964286789298058, |
|
"rewards/similarity_reward": 0.5992635488510132, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 272.53126525878906, |
|
"epoch": 0.976, |
|
"grad_norm": 0.6740721279945435, |
|
"kl": 0.005615234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0723, |
|
"reward": 0.6874203383922577, |
|
"reward_std": 0.08594712242484093, |
|
"rewards/length_reward": 0.06785715371370316, |
|
"rewards/similarity_reward": 0.6195632219314575, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 237.32144165039062, |
|
"epoch": 0.9786666666666667, |
|
"grad_norm": 0.6453259649723537, |
|
"kl": 0.0041046142578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0057, |
|
"reward": 0.6572641432285309, |
|
"reward_std": 0.07327758520841599, |
|
"rewards/length_reward": 0.07053572125732899, |
|
"rewards/similarity_reward": 0.5867283642292023, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 230.68304443359375, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 0.6672094875832669, |
|
"kl": 0.00567626953125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0058, |
|
"reward": 0.6544754207134247, |
|
"reward_std": 0.09319397434592247, |
|
"rewards/length_reward": 0.071428582072258, |
|
"rewards/similarity_reward": 0.5830467939376831, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 183.78125762939453, |
|
"epoch": 0.984, |
|
"grad_norm": 0.7366023492253149, |
|
"kl": 0.00495147705078125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0059, |
|
"reward": 0.6258625388145447, |
|
"reward_std": 0.08410684764385223, |
|
"rewards/length_reward": 0.07187500968575478, |
|
"rewards/similarity_reward": 0.5539875626564026, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 242.49555206298828, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 0.6621715728429993, |
|
"kl": 0.0055999755859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0227, |
|
"reward": 0.6772402822971344, |
|
"reward_std": 0.08659335412085056, |
|
"rewards/length_reward": 0.0714285783469677, |
|
"rewards/similarity_reward": 0.6058117151260376, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 223.2366180419922, |
|
"epoch": 0.9893333333333333, |
|
"grad_norm": 0.7315442476679772, |
|
"kl": 0.0076751708984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0034, |
|
"reward": 0.6550423204898834, |
|
"reward_std": 0.08679429814219475, |
|
"rewards/length_reward": 0.06830358132719994, |
|
"rewards/similarity_reward": 0.5867387652397156, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 279.18751525878906, |
|
"epoch": 0.992, |
|
"grad_norm": 0.7043801053601064, |
|
"kl": 0.005035400390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0732, |
|
"reward": 0.6848048269748688, |
|
"reward_std": 0.10013966262340546, |
|
"rewards/length_reward": 0.07991071417927742, |
|
"rewards/similarity_reward": 0.6048941314220428, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 221.90625762939453, |
|
"epoch": 0.9946666666666667, |
|
"grad_norm": 0.6692794616147932, |
|
"kl": 0.0042877197265625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0054, |
|
"reward": 0.6771494746208191, |
|
"reward_std": 0.0797378458082676, |
|
"rewards/length_reward": 0.061607155948877335, |
|
"rewards/similarity_reward": 0.6155422627925873, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 287.4330596923828, |
|
"epoch": 0.9973333333333333, |
|
"grad_norm": 0.6988535931211539, |
|
"kl": 0.0048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0135, |
|
"reward": 0.7209034264087677, |
|
"reward_std": 0.0916130281984806, |
|
"rewards/length_reward": 0.07321429252624512, |
|
"rewards/similarity_reward": 0.647689163684845, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 234.6508026123047, |
|
"epoch": 1.0, |
|
"grad_norm": 0.7155440816719996, |
|
"kl": 0.0072784423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0237, |
|
"reward": 0.6349575817584991, |
|
"reward_std": 0.13314566388726234, |
|
"rewards/length_reward": 0.056696439161896706, |
|
"rewards/similarity_reward": 0.578261137008667, |
|
"step": 375 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 375, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|