flozi00 commited on
Commit
194fa9a
·
verified ·
1 Parent(s): e143380

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. config.json +1 -1
  2. model.safetensors +1 -1
  3. trainer_state.json +651 -1471
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "../educlassifier/checkpoint-500",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "VAGOsolutions/SauerkrautLM-1.5b",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d71009c8fed2e4c4c5d680199215b792fce20c0ea5f451cb5cefced062a9c6c
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:308b0909c8b941e293cffef3c22bb8ffbb5a9f307ddc235f9370c9ffaa10de45
3
  size 3087467144
trainer_state.json CHANGED
@@ -1,1516 +1,696 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.028088778094264185,
5
  "eval_steps": 5000,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.00028088778094264183,
13
- "grad_norm": 1.578125,
14
- "learning_rate": 1.4044943820224718e-07,
15
- "logits/chosen": -6.981852054595947,
16
- "logits/rejected": -6.981904029846191,
17
- "logps/chosen": -0.7819840312004089,
18
- "logps/rejected": -3.407517910003662,
19
- "loss": 0.8348,
20
- "rewards/accuracies": 0.8359375,
21
- "rewards/chosen": -0.0781984031200409,
22
- "rewards/margins": 0.2625533938407898,
23
- "rewards/rejected": -0.3407517671585083,
24
- "step": 1
25
- },
26
- {
27
- "epoch": 0.0005617755618852837,
28
- "grad_norm": 1.90625,
29
- "learning_rate": 2.8089887640449437e-07,
30
- "logits/chosen": -6.981129169464111,
31
- "logits/rejected": -6.981235027313232,
32
- "logps/chosen": -0.5972335338592529,
33
- "logps/rejected": -3.7509422302246094,
34
- "loss": 0.8099,
35
- "rewards/accuracies": 0.8203125,
36
- "rewards/chosen": -0.059723351150751114,
37
- "rewards/margins": 0.3153708577156067,
38
- "rewards/rejected": -0.3750942647457123,
39
- "step": 2
40
- },
41
- {
42
- "epoch": 0.0008426633428279256,
43
- "grad_norm": 1.328125,
44
- "learning_rate": 4.213483146067416e-07,
45
- "logits/chosen": -6.982983589172363,
46
- "logits/rejected": -6.983180522918701,
47
- "logps/chosen": -0.6279873847961426,
48
- "logps/rejected": -3.3199615478515625,
49
- "loss": 0.8299,
50
- "rewards/accuracies": 0.8671875,
51
- "rewards/chosen": -0.06279874593019485,
52
- "rewards/margins": 0.26919740438461304,
53
- "rewards/rejected": -0.3319961428642273,
54
- "step": 3
55
- },
56
- {
57
- "epoch": 0.0011235511237705673,
58
- "grad_norm": 1.046875,
59
- "learning_rate": 5.617977528089887e-07,
60
- "logits/chosen": -6.978228569030762,
61
- "logits/rejected": -6.978395462036133,
62
- "logps/chosen": -0.663998007774353,
63
- "logps/rejected": -3.635097026824951,
64
- "loss": 0.818,
65
- "rewards/accuracies": 0.9140625,
66
- "rewards/chosen": -0.06639980524778366,
67
- "rewards/margins": 0.2971099317073822,
68
- "rewards/rejected": -0.3635097146034241,
69
- "step": 4
70
- },
71
- {
72
- "epoch": 0.0014044389047132091,
73
- "grad_norm": 1.9921875,
74
- "learning_rate": 7.02247191011236e-07,
75
- "logits/chosen": -6.984366416931152,
76
- "logits/rejected": -6.984519004821777,
77
- "logps/chosen": -0.5929118990898132,
78
- "logps/rejected": -3.171377420425415,
79
- "loss": 0.833,
80
- "rewards/accuracies": 0.828125,
81
- "rewards/chosen": -0.05929119512438774,
82
- "rewards/margins": 0.25784653425216675,
83
- "rewards/rejected": -0.317137748003006,
84
- "step": 5
85
- },
86
- {
87
- "epoch": 0.0016853266856558511,
88
- "grad_norm": 0.81640625,
89
- "learning_rate": 8.426966292134832e-07,
90
- "logits/chosen": -6.9845099449157715,
91
- "logits/rejected": -6.9846272468566895,
92
- "logps/chosen": -0.6544881463050842,
93
- "logps/rejected": -3.7314887046813965,
94
- "loss": 0.8135,
95
- "rewards/accuracies": 0.8515625,
96
- "rewards/chosen": -0.0654488131403923,
97
- "rewards/margins": 0.3077000379562378,
98
- "rewards/rejected": -0.3731488585472107,
99
- "step": 6
100
- },
101
- {
102
- "epoch": 0.001966214466598493,
103
- "grad_norm": 2.0625,
104
- "learning_rate": 9.831460674157304e-07,
105
- "logits/chosen": -6.983869552612305,
106
- "logits/rejected": -6.98401403427124,
107
- "logps/chosen": -0.6330832839012146,
108
- "logps/rejected": -3.969212055206299,
109
- "loss": 0.7987,
110
- "rewards/accuracies": 0.8984375,
111
- "rewards/chosen": -0.06330832839012146,
112
- "rewards/margins": 0.3336128890514374,
113
- "rewards/rejected": -0.39692118763923645,
114
- "step": 7
115
- },
116
- {
117
- "epoch": 0.0022471022475411347,
118
- "grad_norm": 1.2265625,
119
- "learning_rate": 1.1235955056179775e-06,
120
- "logits/chosen": -6.984411239624023,
121
- "logits/rejected": -6.984560012817383,
122
- "logps/chosen": -0.5015445351600647,
123
- "logps/rejected": -3.5077507495880127,
124
- "loss": 0.812,
125
- "rewards/accuracies": 0.890625,
126
- "rewards/chosen": -0.05015445500612259,
127
- "rewards/margins": 0.30062058568000793,
128
- "rewards/rejected": -0.3507750630378723,
129
- "step": 8
130
- },
131
- {
132
- "epoch": 0.0025279900284837765,
133
- "grad_norm": 3.484375,
134
- "learning_rate": 1.2640449438202247e-06,
135
- "logits/chosen": -6.9832258224487305,
136
- "logits/rejected": -6.9833831787109375,
137
- "logps/chosen": -0.5916701555252075,
138
- "logps/rejected": -3.901883125305176,
139
- "loss": 0.7984,
140
- "rewards/accuracies": 0.875,
141
- "rewards/chosen": -0.05916702002286911,
142
- "rewards/margins": 0.33102133870124817,
143
- "rewards/rejected": -0.3901883363723755,
144
- "step": 9
145
- },
146
- {
147
- "epoch": 0.0028088778094264182,
148
- "grad_norm": 1.6484375,
149
- "learning_rate": 1.404494382022472e-06,
150
- "logits/chosen": -6.982822418212891,
151
- "logits/rejected": -6.982968330383301,
152
- "logps/chosen": -0.6599135994911194,
153
- "logps/rejected": -3.5486698150634766,
154
- "loss": 0.8177,
155
- "rewards/accuracies": 0.8671875,
156
- "rewards/chosen": -0.0659913569688797,
157
- "rewards/margins": 0.28887563943862915,
158
- "rewards/rejected": -0.35486698150634766,
159
  "step": 10
160
  },
161
  {
162
- "epoch": 0.00308976559036906,
163
- "grad_norm": 0.8125,
164
- "learning_rate": 1.544943820224719e-06,
165
- "logits/chosen": -6.987037658691406,
166
- "logits/rejected": -6.987265586853027,
167
- "logps/chosen": -0.5249046087265015,
168
- "logps/rejected": -3.0017249584198,
169
- "loss": 0.8425,
170
- "rewards/accuracies": 0.90625,
171
- "rewards/chosen": -0.05249045789241791,
172
- "rewards/margins": 0.24768203496932983,
173
- "rewards/rejected": -0.30017250776290894,
174
- "step": 11
175
- },
176
- {
177
- "epoch": 0.0033706533713117022,
178
- "grad_norm": 1.546875,
179
- "learning_rate": 1.6853932584269663e-06,
180
- "logits/chosen": -6.9844279289245605,
181
- "logits/rejected": -6.984550952911377,
182
- "logps/chosen": -0.6450465321540833,
183
- "logps/rejected": -3.3710899353027344,
184
- "loss": 0.8281,
185
- "rewards/accuracies": 0.859375,
186
- "rewards/chosen": -0.06450465321540833,
187
- "rewards/margins": 0.2726043164730072,
188
- "rewards/rejected": -0.3371089994907379,
189
- "step": 12
190
- },
191
- {
192
- "epoch": 0.003651541152254344,
193
- "grad_norm": 1.390625,
194
- "learning_rate": 1.8258426966292136e-06,
195
- "logits/chosen": -6.985495567321777,
196
- "logits/rejected": -6.985678672790527,
197
- "logps/chosen": -0.6660882234573364,
198
- "logps/rejected": -3.941418170928955,
199
- "loss": 0.8009,
200
- "rewards/accuracies": 0.90625,
201
- "rewards/chosen": -0.06660880893468857,
202
- "rewards/margins": 0.3275330066680908,
203
- "rewards/rejected": -0.3941417932510376,
204
- "step": 13
205
- },
206
- {
207
- "epoch": 0.003932428933196986,
208
- "grad_norm": 1.6015625,
209
- "learning_rate": 1.966292134831461e-06,
210
- "logits/chosen": -6.982288837432861,
211
- "logits/rejected": -6.982420921325684,
212
- "logps/chosen": -0.7159062623977661,
213
- "logps/rejected": -3.4875435829162598,
214
- "loss": 0.8243,
215
- "rewards/accuracies": 0.8984375,
216
- "rewards/chosen": -0.07159063965082169,
217
- "rewards/margins": 0.27716371417045593,
218
- "rewards/rejected": -0.348754346370697,
219
- "step": 14
220
- },
221
- {
222
- "epoch": 0.004213316714139628,
223
- "grad_norm": 1.4296875,
224
- "learning_rate": 2.106741573033708e-06,
225
- "logits/chosen": -6.986008644104004,
226
- "logits/rejected": -6.986215114593506,
227
- "logps/chosen": -0.6564575433731079,
228
- "logps/rejected": -3.4806642532348633,
229
- "loss": 0.8223,
230
- "rewards/accuracies": 0.84375,
231
- "rewards/chosen": -0.0656457468867302,
232
- "rewards/margins": 0.28242069482803345,
233
- "rewards/rejected": -0.34806641936302185,
234
- "step": 15
235
- },
236
- {
237
- "epoch": 0.004494204495082269,
238
- "grad_norm": 0.94921875,
239
- "learning_rate": 2.247191011235955e-06,
240
- "logits/chosen": -6.985368251800537,
241
- "logits/rejected": -6.985543727874756,
242
- "logps/chosen": -0.5949140787124634,
243
- "logps/rejected": -3.1833853721618652,
244
- "loss": 0.8354,
245
- "rewards/accuracies": 0.875,
246
- "rewards/chosen": -0.05949141085147858,
247
- "rewards/margins": 0.25884711742401123,
248
- "rewards/rejected": -0.3183385133743286,
249
- "step": 16
250
- },
251
- {
252
- "epoch": 0.004775092276024911,
253
- "grad_norm": 1.53125,
254
- "learning_rate": 2.3876404494382022e-06,
255
- "logits/chosen": -6.983895778656006,
256
- "logits/rejected": -6.984076023101807,
257
- "logps/chosen": -0.6407002806663513,
258
- "logps/rejected": -3.6408045291900635,
259
- "loss": 0.8183,
260
- "rewards/accuracies": 0.8671875,
261
- "rewards/chosen": -0.06407003104686737,
262
- "rewards/margins": 0.30001044273376465,
263
- "rewards/rejected": -0.3640804588794708,
264
- "step": 17
265
- },
266
- {
267
- "epoch": 0.005055980056967553,
268
- "grad_norm": 0.91796875,
269
- "learning_rate": 2.5280898876404495e-06,
270
- "logits/chosen": -6.984615802764893,
271
- "logits/rejected": -6.984807014465332,
272
- "logps/chosen": -0.5760031938552856,
273
- "logps/rejected": -2.942106008529663,
274
- "loss": 0.8485,
275
- "rewards/accuracies": 0.828125,
276
- "rewards/chosen": -0.057600319385528564,
277
- "rewards/margins": 0.2366103082895279,
278
- "rewards/rejected": -0.29421064257621765,
279
- "step": 18
280
- },
281
- {
282
- "epoch": 0.005336867837910195,
283
- "grad_norm": 2.765625,
284
- "learning_rate": 2.6685393258426968e-06,
285
- "logits/chosen": -6.985834121704102,
286
- "logits/rejected": -6.986002445220947,
287
- "logps/chosen": -0.603966474533081,
288
- "logps/rejected": -3.6589133739471436,
289
- "loss": 0.8121,
290
- "rewards/accuracies": 0.8984375,
291
- "rewards/chosen": -0.06039665639400482,
292
- "rewards/margins": 0.3054946959018707,
293
- "rewards/rejected": -0.36589139699935913,
294
- "step": 19
295
- },
296
- {
297
- "epoch": 0.0056177556188528365,
298
- "grad_norm": 1.1484375,
299
- "learning_rate": 2.808988764044944e-06,
300
- "logits/chosen": -6.985572814941406,
301
- "logits/rejected": -6.985714912414551,
302
- "logps/chosen": -0.520312488079071,
303
- "logps/rejected": -2.815075159072876,
304
- "loss": 0.8505,
305
- "rewards/accuracies": 0.8046875,
306
- "rewards/chosen": -0.0520312525331974,
307
- "rewards/margins": 0.22947625815868378,
308
- "rewards/rejected": -0.2815075218677521,
309
  "step": 20
310
  },
311
  {
312
- "epoch": 0.005898643399795478,
313
- "grad_norm": 3.421875,
314
- "learning_rate": 2.9494382022471913e-06,
315
- "logits/chosen": -6.986955165863037,
316
- "logits/rejected": -6.987088680267334,
317
- "logps/chosen": -0.6575434803962708,
318
- "logps/rejected": -3.354898691177368,
319
- "loss": 0.8309,
320
- "rewards/accuracies": 0.8515625,
321
- "rewards/chosen": -0.06575435400009155,
322
- "rewards/margins": 0.26973551511764526,
323
- "rewards/rejected": -0.3354898989200592,
324
- "step": 21
325
- },
326
- {
327
- "epoch": 0.00617953118073812,
328
- "grad_norm": 1.359375,
329
- "learning_rate": 3.089887640449438e-06,
330
- "logits/chosen": -6.985095500946045,
331
- "logits/rejected": -6.985231876373291,
332
- "logps/chosen": -0.6375535726547241,
333
- "logps/rejected": -3.722036600112915,
334
- "loss": 0.8155,
335
- "rewards/accuracies": 0.8125,
336
- "rewards/chosen": -0.06375535577535629,
337
- "rewards/margins": 0.30844831466674805,
338
- "rewards/rejected": -0.37220367789268494,
339
- "step": 22
340
- },
341
- {
342
- "epoch": 0.006460418961680763,
343
- "grad_norm": 1.3359375,
344
- "learning_rate": 3.230337078651686e-06,
345
- "logits/chosen": -6.985586166381836,
346
- "logits/rejected": -6.985759258270264,
347
- "logps/chosen": -0.6368163824081421,
348
- "logps/rejected": -3.551453113555908,
349
- "loss": 0.8186,
350
- "rewards/accuracies": 0.828125,
351
- "rewards/chosen": -0.06368163973093033,
352
- "rewards/margins": 0.2914636731147766,
353
- "rewards/rejected": -0.35514530539512634,
354
- "step": 23
355
- },
356
- {
357
- "epoch": 0.0067413067426234045,
358
- "grad_norm": 0.9453125,
359
- "learning_rate": 3.3707865168539327e-06,
360
- "logits/chosen": -6.983944892883301,
361
- "logits/rejected": -6.984124660491943,
362
- "logps/chosen": -0.6929716467857361,
363
- "logps/rejected": -3.0338144302368164,
364
- "loss": 0.8463,
365
- "rewards/accuracies": 0.828125,
366
- "rewards/chosen": -0.06929716467857361,
367
- "rewards/margins": 0.23408427834510803,
368
- "rewards/rejected": -0.30338141322135925,
369
- "step": 24
370
- },
371
- {
372
- "epoch": 0.007022194523566046,
373
- "grad_norm": 2.15625,
374
- "learning_rate": 3.5112359550561803e-06,
375
- "logits/chosen": -6.988375663757324,
376
- "logits/rejected": -6.988600730895996,
377
- "logps/chosen": -0.5282725691795349,
378
- "logps/rejected": -2.989157199859619,
379
- "loss": 0.84,
380
- "rewards/accuracies": 0.8671875,
381
- "rewards/chosen": -0.05282726138830185,
382
- "rewards/margins": 0.24608850479125977,
383
- "rewards/rejected": -0.2989157438278198,
384
- "step": 25
385
- },
386
- {
387
- "epoch": 0.007303082304508688,
388
- "grad_norm": 0.90234375,
389
- "learning_rate": 3.651685393258427e-06,
390
- "logits/chosen": -6.9826226234436035,
391
- "logits/rejected": -6.982814311981201,
392
- "logps/chosen": -0.576172411441803,
393
- "logps/rejected": -3.1249589920043945,
394
- "loss": 0.8391,
395
- "rewards/accuracies": 0.8984375,
396
- "rewards/chosen": -0.05761724337935448,
397
- "rewards/margins": 0.2548786401748657,
398
- "rewards/rejected": -0.3124958872795105,
399
- "step": 26
400
- },
401
- {
402
- "epoch": 0.00758397008545133,
403
- "grad_norm": 0.8984375,
404
- "learning_rate": 3.7921348314606744e-06,
405
- "logits/chosen": -6.98524284362793,
406
- "logits/rejected": -6.985445499420166,
407
- "logps/chosen": -0.567965567111969,
408
- "logps/rejected": -3.0560202598571777,
409
- "loss": 0.8399,
410
- "rewards/accuracies": 0.890625,
411
- "rewards/chosen": -0.05679655820131302,
412
- "rewards/margins": 0.2488054633140564,
413
- "rewards/rejected": -0.3056020438671112,
414
- "step": 27
415
- },
416
- {
417
- "epoch": 0.007864857866393972,
418
- "grad_norm": 2.046875,
419
- "learning_rate": 3.932584269662922e-06,
420
- "logits/chosen": -6.987497806549072,
421
- "logits/rejected": -6.9876604080200195,
422
- "logps/chosen": -0.671168863773346,
423
- "logps/rejected": -3.572070837020874,
424
- "loss": 0.8193,
425
- "rewards/accuracies": 0.8515625,
426
- "rewards/chosen": -0.0671168863773346,
427
- "rewards/margins": 0.2900902330875397,
428
- "rewards/rejected": -0.3572070896625519,
429
- "step": 28
430
- },
431
- {
432
- "epoch": 0.008145745647336613,
433
- "grad_norm": 1.7578125,
434
- "learning_rate": 4.073033707865169e-06,
435
- "logits/chosen": -6.984347343444824,
436
- "logits/rejected": -6.98446798324585,
437
- "logps/chosen": -0.7242225408554077,
438
- "logps/rejected": -3.0529520511627197,
439
- "loss": 0.8501,
440
- "rewards/accuracies": 0.8671875,
441
- "rewards/chosen": -0.07242225855588913,
442
- "rewards/margins": 0.23287296295166016,
443
- "rewards/rejected": -0.30529525876045227,
444
- "step": 29
445
- },
446
- {
447
- "epoch": 0.008426633428279255,
448
- "grad_norm": 0.90234375,
449
- "learning_rate": 4.213483146067416e-06,
450
- "logits/chosen": -6.987656116485596,
451
- "logits/rejected": -6.987755298614502,
452
- "logps/chosen": -0.5501436591148376,
453
- "logps/rejected": -2.847120761871338,
454
- "loss": 0.8488,
455
- "rewards/accuracies": 0.875,
456
- "rewards/chosen": -0.05501437187194824,
457
- "rewards/margins": 0.22969767451286316,
458
- "rewards/rejected": -0.2847120463848114,
459
  "step": 30
460
  },
461
  {
462
- "epoch": 0.008707521209221898,
463
- "grad_norm": 1.765625,
464
- "learning_rate": 4.3539325842696635e-06,
465
- "logits/chosen": -6.9837751388549805,
466
- "logits/rejected": -6.98394775390625,
467
- "logps/chosen": -0.6806322932243347,
468
- "logps/rejected": -3.402733325958252,
469
- "loss": 0.826,
470
- "rewards/accuracies": 0.8671875,
471
- "rewards/chosen": -0.06806322932243347,
472
- "rewards/margins": 0.27221009135246277,
473
- "rewards/rejected": -0.34027332067489624,
474
- "step": 31
475
- },
476
- {
477
- "epoch": 0.008988408990164539,
478
- "grad_norm": 1.28125,
479
- "learning_rate": 4.49438202247191e-06,
480
- "logits/chosen": -6.984766960144043,
481
- "logits/rejected": -6.984991073608398,
482
- "logps/chosen": -0.5861184597015381,
483
- "logps/rejected": -3.729720115661621,
484
- "loss": 0.8089,
485
- "rewards/accuracies": 0.9140625,
486
- "rewards/chosen": -0.05861184746026993,
487
- "rewards/margins": 0.3143601715564728,
488
- "rewards/rejected": -0.3729720413684845,
489
- "step": 32
490
- },
491
- {
492
- "epoch": 0.009269296771107181,
493
- "grad_norm": 1.046875,
494
- "learning_rate": 4.634831460674158e-06,
495
- "logits/chosen": -6.988133907318115,
496
- "logits/rejected": -6.988280296325684,
497
- "logps/chosen": -0.6212756037712097,
498
- "logps/rejected": -3.2797980308532715,
499
- "loss": 0.8298,
500
- "rewards/accuracies": 0.859375,
501
- "rewards/chosen": -0.06212755665183067,
502
- "rewards/margins": 0.26585227251052856,
503
- "rewards/rejected": -0.32797983288764954,
504
- "step": 33
505
- },
506
- {
507
- "epoch": 0.009550184552049822,
508
- "grad_norm": 1.21875,
509
- "learning_rate": 4.7752808988764044e-06,
510
- "logits/chosen": -6.988953113555908,
511
- "logits/rejected": -6.989142894744873,
512
- "logps/chosen": -0.5206266641616821,
513
- "logps/rejected": -3.0407469272613525,
514
- "loss": 0.8377,
515
- "rewards/accuracies": 0.875,
516
- "rewards/chosen": -0.05206267163157463,
517
- "rewards/margins": 0.2520120143890381,
518
- "rewards/rejected": -0.3040746748447418,
519
- "step": 34
520
- },
521
- {
522
- "epoch": 0.009831072332992465,
523
- "grad_norm": 1.0390625,
524
- "learning_rate": 4.915730337078652e-06,
525
- "logits/chosen": -6.989229202270508,
526
- "logits/rejected": -6.989412307739258,
527
- "logps/chosen": -0.5842424631118774,
528
- "logps/rejected": -3.2295570373535156,
529
- "loss": 0.8331,
530
- "rewards/accuracies": 0.8828125,
531
- "rewards/chosen": -0.05842424929141998,
532
- "rewards/margins": 0.2645314335823059,
533
- "rewards/rejected": -0.3229556977748871,
534
- "step": 35
535
- },
536
- {
537
- "epoch": 0.010111960113935106,
538
- "grad_norm": 1.6015625,
539
- "learning_rate": 5.056179775280899e-06,
540
- "logits/chosen": -6.989651203155518,
541
- "logits/rejected": -6.989835739135742,
542
- "logps/chosen": -0.5448687672615051,
543
- "logps/rejected": -3.445011615753174,
544
- "loss": 0.8174,
545
- "rewards/accuracies": 0.875,
546
- "rewards/chosen": -0.05448687821626663,
547
- "rewards/margins": 0.2900142967700958,
548
- "rewards/rejected": -0.34450116753578186,
549
- "step": 36
550
- },
551
- {
552
- "epoch": 0.010392847894877748,
553
- "grad_norm": 0.97265625,
554
- "learning_rate": 5.196629213483146e-06,
555
- "logits/chosen": -6.987741470336914,
556
- "logits/rejected": -6.987899303436279,
557
- "logps/chosen": -0.6063218116760254,
558
- "logps/rejected": -3.657203197479248,
559
- "loss": 0.8103,
560
- "rewards/accuracies": 0.8671875,
561
- "rewards/chosen": -0.06063217669725418,
562
- "rewards/margins": 0.3050881326198578,
563
- "rewards/rejected": -0.36572033166885376,
564
- "step": 37
565
- },
566
- {
567
- "epoch": 0.01067373567582039,
568
- "grad_norm": 1.3125,
569
- "learning_rate": 5.3370786516853935e-06,
570
- "logits/chosen": -6.992892265319824,
571
- "logits/rejected": -6.993043899536133,
572
- "logps/chosen": -0.6687143445014954,
573
- "logps/rejected": -3.4205684661865234,
574
- "loss": 0.8262,
575
- "rewards/accuracies": 0.921875,
576
- "rewards/chosen": -0.06687143445014954,
577
- "rewards/margins": 0.2751854360103607,
578
- "rewards/rejected": -0.34205687046051025,
579
- "step": 38
580
- },
581
- {
582
- "epoch": 0.010954623456763032,
583
- "grad_norm": 1.4453125,
584
- "learning_rate": 5.477528089887641e-06,
585
- "logits/chosen": -6.9873948097229,
586
- "logits/rejected": -6.987576484680176,
587
- "logps/chosen": -0.6130951046943665,
588
- "logps/rejected": -3.9130940437316895,
589
- "loss": 0.8023,
590
- "rewards/accuracies": 0.8828125,
591
- "rewards/chosen": -0.061309512704610825,
592
- "rewards/margins": 0.3299998939037323,
593
- "rewards/rejected": -0.3913094103336334,
594
- "step": 39
595
- },
596
- {
597
- "epoch": 0.011235511237705673,
598
- "grad_norm": 0.98828125,
599
- "learning_rate": 5.617977528089888e-06,
600
- "logits/chosen": -6.99265718460083,
601
- "logits/rejected": -6.992844104766846,
602
- "logps/chosen": -0.6354062557220459,
603
- "logps/rejected": -2.818499803543091,
604
- "loss": 0.8565,
605
- "rewards/accuracies": 0.7890625,
606
- "rewards/chosen": -0.06354063004255295,
607
- "rewards/margins": 0.21830937266349792,
608
- "rewards/rejected": -0.2818499803543091,
609
  "step": 40
610
  },
611
  {
612
- "epoch": 0.011516399018648316,
613
- "grad_norm": 0.80078125,
614
- "learning_rate": 5.758426966292135e-06,
615
- "logits/chosen": -6.9895339012146,
616
- "logits/rejected": -6.989684104919434,
617
- "logps/chosen": -0.6787976026535034,
618
- "logps/rejected": -3.6232690811157227,
619
- "loss": 0.8171,
620
- "rewards/accuracies": 0.890625,
621
- "rewards/chosen": -0.06787975877523422,
622
- "rewards/margins": 0.2944471836090088,
623
- "rewards/rejected": -0.3623269200325012,
624
- "step": 41
625
- },
626
- {
627
- "epoch": 0.011797286799590957,
628
- "grad_norm": 0.77734375,
629
- "learning_rate": 5.8988764044943826e-06,
630
- "logits/chosen": -6.996128082275391,
631
- "logits/rejected": -6.996224403381348,
632
- "logps/chosen": -0.666965901851654,
633
- "logps/rejected": -3.1539041996002197,
634
- "loss": 0.8416,
635
- "rewards/accuracies": 0.796875,
636
- "rewards/chosen": -0.06669659167528152,
637
- "rewards/margins": 0.2486938238143921,
638
- "rewards/rejected": -0.315390408039093,
639
- "step": 42
640
- },
641
- {
642
- "epoch": 0.0120781745805336,
643
- "grad_norm": 3.265625,
644
- "learning_rate": 6.03932584269663e-06,
645
- "logits/chosen": -6.990922927856445,
646
- "logits/rejected": -6.991119861602783,
647
- "logps/chosen": -0.6920310258865356,
648
- "logps/rejected": -3.7208621501922607,
649
- "loss": 0.8148,
650
- "rewards/accuracies": 0.8203125,
651
- "rewards/chosen": -0.06920310109853745,
652
- "rewards/margins": 0.3028830885887146,
653
- "rewards/rejected": -0.37208619713783264,
654
- "step": 43
655
- },
656
- {
657
- "epoch": 0.01235906236147624,
658
- "grad_norm": 0.953125,
659
- "learning_rate": 6.179775280898876e-06,
660
- "logits/chosen": -6.995143413543701,
661
- "logits/rejected": -6.995302200317383,
662
- "logps/chosen": -0.6558566093444824,
663
- "logps/rejected": -3.2403218746185303,
664
- "loss": 0.8344,
665
- "rewards/accuracies": 0.8359375,
666
- "rewards/chosen": -0.065585657954216,
667
- "rewards/margins": 0.2584465444087982,
668
- "rewards/rejected": -0.3240322172641754,
669
- "step": 44
670
- },
671
- {
672
- "epoch": 0.012639950142418883,
673
- "grad_norm": 0.94921875,
674
- "learning_rate": 6.320224719101124e-06,
675
- "logits/chosen": -6.992753028869629,
676
- "logits/rejected": -6.992949962615967,
677
- "logps/chosen": -0.6340925097465515,
678
- "logps/rejected": -4.1242756843566895,
679
- "loss": 0.7913,
680
- "rewards/accuracies": 0.84375,
681
- "rewards/chosen": -0.06340925395488739,
682
- "rewards/margins": 0.34901827573776245,
683
- "rewards/rejected": -0.41242751479148865,
684
- "step": 45
685
- },
686
- {
687
- "epoch": 0.012920837923361525,
688
- "grad_norm": 1.140625,
689
- "learning_rate": 6.460674157303372e-06,
690
- "logits/chosen": -6.998230934143066,
691
- "logits/rejected": -6.998466491699219,
692
- "logps/chosen": -0.48865756392478943,
693
- "logps/rejected": -3.8250622749328613,
694
- "loss": 0.7962,
695
- "rewards/accuracies": 0.9296875,
696
- "rewards/chosen": -0.048865754157304764,
697
- "rewards/margins": 0.333640456199646,
698
- "rewards/rejected": -0.38250622153282166,
699
- "step": 46
700
- },
701
- {
702
- "epoch": 0.013201725704304166,
703
- "grad_norm": 2.296875,
704
- "learning_rate": 6.601123595505618e-06,
705
- "logits/chosen": -6.998146057128906,
706
- "logits/rejected": -6.998294830322266,
707
- "logps/chosen": -0.6883760690689087,
708
- "logps/rejected": -3.535900831222534,
709
- "loss": 0.8262,
710
- "rewards/accuracies": 0.796875,
711
- "rewards/chosen": -0.06883761286735535,
712
- "rewards/margins": 0.2847524583339691,
713
- "rewards/rejected": -0.35359007120132446,
714
- "step": 47
715
- },
716
- {
717
- "epoch": 0.013482613485246809,
718
- "grad_norm": 1.0,
719
- "learning_rate": 6.741573033707865e-06,
720
- "logits/chosen": -6.998913288116455,
721
- "logits/rejected": -6.9990620613098145,
722
- "logps/chosen": -0.5589190721511841,
723
- "logps/rejected": -2.9924392700195312,
724
- "loss": 0.8434,
725
- "rewards/accuracies": 0.8671875,
726
- "rewards/chosen": -0.055891912430524826,
727
- "rewards/margins": 0.2433520257472992,
728
- "rewards/rejected": -0.2992439270019531,
729
- "step": 48
730
- },
731
- {
732
- "epoch": 0.01376350126618945,
733
- "grad_norm": 2.234375,
734
- "learning_rate": 6.8820224719101126e-06,
735
- "logits/chosen": -6.999466419219971,
736
- "logits/rejected": -6.999658107757568,
737
- "logps/chosen": -0.5700675845146179,
738
- "logps/rejected": -3.315387487411499,
739
- "loss": 0.8248,
740
- "rewards/accuracies": 0.890625,
741
- "rewards/chosen": -0.05700676143169403,
742
- "rewards/margins": 0.2745319902896881,
743
- "rewards/rejected": -0.33153873682022095,
744
- "step": 49
745
- },
746
- {
747
- "epoch": 0.014044389047132093,
748
- "grad_norm": 1.078125,
749
- "learning_rate": 7.022471910112361e-06,
750
- "logits/chosen": -7.00152587890625,
751
- "logits/rejected": -7.001701831817627,
752
- "logps/chosen": -0.6014066934585571,
753
- "logps/rejected": -3.370955228805542,
754
- "loss": 0.8292,
755
- "rewards/accuracies": 0.8125,
756
- "rewards/chosen": -0.060140661895275116,
757
- "rewards/margins": 0.27695485949516296,
758
- "rewards/rejected": -0.3370955288410187,
759
  "step": 50
760
  },
761
  {
762
- "epoch": 0.014325276828074733,
763
- "grad_norm": 1.84375,
764
- "learning_rate": 7.162921348314606e-06,
765
- "logits/chosen": -7.0027384757995605,
766
- "logits/rejected": -7.002865791320801,
767
- "logps/chosen": -0.6463361978530884,
768
- "logps/rejected": -3.259683609008789,
769
- "loss": 0.8333,
770
- "rewards/accuracies": 0.875,
771
- "rewards/chosen": -0.06463362276554108,
772
- "rewards/margins": 0.26133471727371216,
773
- "rewards/rejected": -0.32596835494041443,
774
- "step": 51
775
- },
776
- {
777
- "epoch": 0.014606164609017376,
778
- "grad_norm": 1.0703125,
779
- "learning_rate": 7.303370786516854e-06,
780
- "logits/chosen": -7.000398635864258,
781
- "logits/rejected": -7.000590801239014,
782
- "logps/chosen": -0.650733470916748,
783
- "logps/rejected": -3.551283359527588,
784
- "loss": 0.8207,
785
- "rewards/accuracies": 0.859375,
786
- "rewards/chosen": -0.06507335603237152,
787
- "rewards/margins": 0.29005497694015503,
788
- "rewards/rejected": -0.35512834787368774,
789
- "step": 52
790
- },
791
- {
792
- "epoch": 0.014887052389960017,
793
- "grad_norm": 4.78125,
794
- "learning_rate": 7.443820224719102e-06,
795
- "logits/chosen": -7.001314640045166,
796
- "logits/rejected": -7.001479148864746,
797
- "logps/chosen": -0.6439567804336548,
798
- "logps/rejected": -3.231163263320923,
799
- "loss": 0.8379,
800
- "rewards/accuracies": 0.8359375,
801
- "rewards/chosen": -0.06439567357301712,
802
- "rewards/margins": 0.2587206959724426,
803
- "rewards/rejected": -0.32311639189720154,
804
- "step": 53
805
- },
806
- {
807
- "epoch": 0.01516794017090266,
808
- "grad_norm": 9.125,
809
- "learning_rate": 7.584269662921349e-06,
810
- "logits/chosen": -7.009494304656982,
811
- "logits/rejected": -7.009639739990234,
812
- "logps/chosen": -0.6828871965408325,
813
- "logps/rejected": -3.5422186851501465,
814
- "loss": 0.8226,
815
- "rewards/accuracies": 0.84375,
816
- "rewards/chosen": -0.06828872114419937,
817
- "rewards/margins": 0.28593313694000244,
818
- "rewards/rejected": -0.3542218804359436,
819
- "step": 54
820
- },
821
- {
822
- "epoch": 0.0154488279518453,
823
- "grad_norm": 1.9765625,
824
- "learning_rate": 7.724719101123595e-06,
825
- "logits/chosen": -7.009527206420898,
826
- "logits/rejected": -7.009775161743164,
827
- "logps/chosen": -0.5451503992080688,
828
- "logps/rejected": -3.2060601711273193,
829
- "loss": 0.831,
830
- "rewards/accuracies": 0.8828125,
831
- "rewards/chosen": -0.054515041410923004,
832
- "rewards/margins": 0.2660909593105316,
833
- "rewards/rejected": -0.3206060528755188,
834
- "step": 55
835
- },
836
- {
837
- "epoch": 0.015729715732787943,
838
- "grad_norm": 1.203125,
839
- "learning_rate": 7.865168539325843e-06,
840
- "logits/chosen": -7.00971794128418,
841
- "logits/rejected": -7.009875774383545,
842
- "logps/chosen": -0.6500095129013062,
843
- "logps/rejected": -3.940423011779785,
844
- "loss": 0.8,
845
- "rewards/accuracies": 0.8984375,
846
- "rewards/chosen": -0.06500095129013062,
847
- "rewards/margins": 0.32904139161109924,
848
- "rewards/rejected": -0.39404234290122986,
849
- "step": 56
850
- },
851
- {
852
- "epoch": 0.016010603513730584,
853
- "grad_norm": 2.703125,
854
- "learning_rate": 8.00561797752809e-06,
855
- "logits/chosen": -7.013581275939941,
856
- "logits/rejected": -7.013791561126709,
857
- "logps/chosen": -0.5986226797103882,
858
- "logps/rejected": -3.3548402786254883,
859
- "loss": 0.8256,
860
- "rewards/accuracies": 0.828125,
861
- "rewards/chosen": -0.05986226722598076,
862
- "rewards/margins": 0.2756217420101166,
863
- "rewards/rejected": -0.33548396825790405,
864
- "step": 57
865
- },
866
- {
867
- "epoch": 0.016291491294673225,
868
- "grad_norm": 1.2109375,
869
- "learning_rate": 8.146067415730338e-06,
870
- "logits/chosen": -7.008817672729492,
871
- "logits/rejected": -7.008937358856201,
872
- "logps/chosen": -0.5546457171440125,
873
- "logps/rejected": -3.4342713356018066,
874
- "loss": 0.8211,
875
- "rewards/accuracies": 0.8671875,
876
- "rewards/chosen": -0.055464569479227066,
877
- "rewards/margins": 0.28796258568763733,
878
- "rewards/rejected": -0.3434271812438965,
879
- "step": 58
880
- },
881
- {
882
- "epoch": 0.01657237907561587,
883
- "grad_norm": 1.7109375,
884
- "learning_rate": 8.286516853932584e-06,
885
- "logits/chosen": -7.012298583984375,
886
- "logits/rejected": -7.012528896331787,
887
- "logps/chosen": -0.5672339797019958,
888
- "logps/rejected": -3.319044828414917,
889
- "loss": 0.8259,
890
- "rewards/accuracies": 0.90625,
891
- "rewards/chosen": -0.05672340467572212,
892
- "rewards/margins": 0.2751810550689697,
893
- "rewards/rejected": -0.33190447092056274,
894
- "step": 59
895
- },
896
- {
897
- "epoch": 0.01685326685655851,
898
- "grad_norm": 1.265625,
899
- "learning_rate": 8.426966292134832e-06,
900
- "logits/chosen": -7.013304710388184,
901
- "logits/rejected": -7.013603210449219,
902
- "logps/chosen": -0.5733633637428284,
903
- "logps/rejected": -3.944350004196167,
904
- "loss": 0.796,
905
- "rewards/accuracies": 0.921875,
906
- "rewards/chosen": -0.057336337864398956,
907
- "rewards/margins": 0.33709871768951416,
908
- "rewards/rejected": -0.3944350481033325,
909
  "step": 60
910
  },
911
  {
912
- "epoch": 0.01713415463750115,
913
- "grad_norm": 2.609375,
914
- "learning_rate": 8.567415730337079e-06,
915
- "logits/chosen": -7.01167106628418,
916
- "logits/rejected": -7.011828899383545,
917
- "logps/chosen": -0.6211044192314148,
918
- "logps/rejected": -3.431169033050537,
919
- "loss": 0.8257,
920
- "rewards/accuracies": 0.875,
921
- "rewards/chosen": -0.06211044266819954,
922
- "rewards/margins": 0.28100642561912537,
923
- "rewards/rejected": -0.3431169092655182,
924
- "step": 61
925
- },
926
- {
927
- "epoch": 0.017415042418443796,
928
- "grad_norm": 4.15625,
929
- "learning_rate": 8.707865168539327e-06,
930
- "logits/chosen": -7.013251781463623,
931
- "logits/rejected": -7.013393878936768,
932
- "logps/chosen": -0.6521888971328735,
933
- "logps/rejected": -3.21026873588562,
934
- "loss": 0.8371,
935
- "rewards/accuracies": 0.8671875,
936
- "rewards/chosen": -0.06521890312433243,
937
- "rewards/margins": 0.2558079659938812,
938
- "rewards/rejected": -0.32102692127227783,
939
- "step": 62
940
- },
941
- {
942
- "epoch": 0.017695930199386437,
943
- "grad_norm": 1.1796875,
944
- "learning_rate": 8.848314606741573e-06,
945
- "logits/chosen": -7.01942253112793,
946
- "logits/rejected": -7.019570350646973,
947
- "logps/chosen": -0.5423337817192078,
948
- "logps/rejected": -3.554921865463257,
949
- "loss": 0.8147,
950
- "rewards/accuracies": 0.875,
951
- "rewards/chosen": -0.054233379662036896,
952
- "rewards/margins": 0.3012588322162628,
953
- "rewards/rejected": -0.3554922044277191,
954
- "step": 63
955
- },
956
- {
957
- "epoch": 0.017976817980329077,
958
- "grad_norm": 0.84375,
959
- "learning_rate": 8.98876404494382e-06,
960
- "logits/chosen": -7.020681858062744,
961
- "logits/rejected": -7.020779132843018,
962
- "logps/chosen": -0.6097487807273865,
963
- "logps/rejected": -3.4758450984954834,
964
- "loss": 0.8241,
965
- "rewards/accuracies": 0.8671875,
966
- "rewards/chosen": -0.06097487360239029,
967
- "rewards/margins": 0.28660961985588074,
968
- "rewards/rejected": -0.3475845158100128,
969
- "step": 64
970
- },
971
- {
972
- "epoch": 0.01825770576127172,
973
- "grad_norm": 1.0703125,
974
- "learning_rate": 9.129213483146068e-06,
975
- "logits/chosen": -7.021762847900391,
976
- "logits/rejected": -7.022034645080566,
977
- "logps/chosen": -0.6843651533126831,
978
- "logps/rejected": -4.37304162979126,
979
- "loss": 0.7824,
980
- "rewards/accuracies": 0.921875,
981
- "rewards/chosen": -0.06843651831150055,
982
- "rewards/margins": 0.3688676655292511,
983
- "rewards/rejected": -0.43730416893959045,
984
- "step": 65
985
- },
986
- {
987
- "epoch": 0.018538593542214363,
988
- "grad_norm": 1.359375,
989
- "learning_rate": 9.269662921348316e-06,
990
- "logits/chosen": -7.02295446395874,
991
- "logits/rejected": -7.023041248321533,
992
- "logps/chosen": -0.6040046215057373,
993
- "logps/rejected": -3.583864688873291,
994
- "loss": 0.8154,
995
- "rewards/accuracies": 0.875,
996
- "rewards/chosen": -0.06040046364068985,
997
- "rewards/margins": 0.2979860007762909,
998
- "rewards/rejected": -0.3583865165710449,
999
- "step": 66
1000
- },
1001
- {
1002
- "epoch": 0.018819481323157004,
1003
- "grad_norm": 1.15625,
1004
- "learning_rate": 9.410112359550562e-06,
1005
- "logits/chosen": -7.029426097869873,
1006
- "logits/rejected": -7.0295729637146,
1007
- "logps/chosen": -0.5566189289093018,
1008
- "logps/rejected": -3.2764642238616943,
1009
- "loss": 0.8283,
1010
- "rewards/accuracies": 0.859375,
1011
- "rewards/chosen": -0.055661890655756,
1012
- "rewards/margins": 0.2719845473766327,
1013
- "rewards/rejected": -0.3276464343070984,
1014
- "step": 67
1015
- },
1016
- {
1017
- "epoch": 0.019100369104099645,
1018
- "grad_norm": 1.234375,
1019
- "learning_rate": 9.550561797752809e-06,
1020
- "logits/chosen": -7.031888961791992,
1021
- "logits/rejected": -7.032125473022461,
1022
- "logps/chosen": -0.5691148638725281,
1023
- "logps/rejected": -3.4460644721984863,
1024
- "loss": 0.821,
1025
- "rewards/accuracies": 0.890625,
1026
- "rewards/chosen": -0.05691148713231087,
1027
- "rewards/margins": 0.2876949906349182,
1028
- "rewards/rejected": -0.3446064591407776,
1029
- "step": 68
1030
- },
1031
- {
1032
- "epoch": 0.019381256885042285,
1033
- "grad_norm": 1.59375,
1034
- "learning_rate": 9.691011235955057e-06,
1035
- "logits/chosen": -7.036633014678955,
1036
- "logits/rejected": -7.036778926849365,
1037
- "logps/chosen": -0.6520639061927795,
1038
- "logps/rejected": -3.434640645980835,
1039
- "loss": 0.8252,
1040
- "rewards/accuracies": 0.8671875,
1041
- "rewards/chosen": -0.06520639359951019,
1042
- "rewards/margins": 0.27825766801834106,
1043
- "rewards/rejected": -0.34346407651901245,
1044
- "step": 69
1045
- },
1046
- {
1047
- "epoch": 0.01966214466598493,
1048
- "grad_norm": 1.046875,
1049
- "learning_rate": 9.831460674157303e-06,
1050
- "logits/chosen": -7.03872013092041,
1051
- "logits/rejected": -7.038879871368408,
1052
- "logps/chosen": -0.5827143788337708,
1053
- "logps/rejected": -3.6182808876037598,
1054
- "loss": 0.8103,
1055
- "rewards/accuracies": 0.9296875,
1056
- "rewards/chosen": -0.058271437883377075,
1057
- "rewards/margins": 0.3035566508769989,
1058
- "rewards/rejected": -0.36182811856269836,
1059
  "step": 70
1060
  },
1061
  {
1062
- "epoch": 0.01994303244692757,
1063
- "grad_norm": 1.0078125,
1064
- "learning_rate": 9.97191011235955e-06,
1065
- "logits/chosen": -7.044363021850586,
1066
- "logits/rejected": -7.044580936431885,
1067
- "logps/chosen": -0.6912685036659241,
1068
- "logps/rejected": -3.794344425201416,
1069
- "loss": 0.8079,
1070
- "rewards/accuracies": 0.8359375,
1071
- "rewards/chosen": -0.06912684440612793,
1072
- "rewards/margins": 0.3103076219558716,
1073
- "rewards/rejected": -0.3794344663619995,
1074
- "step": 71
1075
- },
1076
- {
1077
- "epoch": 0.02022392022787021,
1078
- "grad_norm": 3.609375,
1079
- "learning_rate": 1.0112359550561798e-05,
1080
- "logits/chosen": -7.046081066131592,
1081
- "logits/rejected": -7.046180725097656,
1082
- "logps/chosen": -0.683323085308075,
1083
- "logps/rejected": -3.082217216491699,
1084
- "loss": 0.843,
1085
- "rewards/accuracies": 0.875,
1086
- "rewards/chosen": -0.06833230704069138,
1087
- "rewards/margins": 0.23988942801952362,
1088
- "rewards/rejected": -0.3082217574119568,
1089
- "step": 72
1090
- },
1091
- {
1092
- "epoch": 0.020504808008812853,
1093
- "grad_norm": 1.1484375,
1094
- "learning_rate": 1.0252808988764046e-05,
1095
- "logits/chosen": -7.047407150268555,
1096
- "logits/rejected": -7.047583103179932,
1097
- "logps/chosen": -0.6924074292182922,
1098
- "logps/rejected": -3.506157159805298,
1099
- "loss": 0.8246,
1100
- "rewards/accuracies": 0.8046875,
1101
- "rewards/chosen": -0.0692407488822937,
1102
- "rewards/margins": 0.281374990940094,
1103
- "rewards/rejected": -0.3506157100200653,
1104
- "step": 73
1105
- },
1106
- {
1107
- "epoch": 0.020785695789755497,
1108
- "grad_norm": 74.5,
1109
- "learning_rate": 1.0393258426966292e-05,
1110
- "logits/chosen": -7.055050373077393,
1111
- "logits/rejected": -7.055255889892578,
1112
- "logps/chosen": -0.4969649910926819,
1113
- "logps/rejected": -3.5049824714660645,
1114
- "loss": 0.8142,
1115
- "rewards/accuracies": 0.8984375,
1116
- "rewards/chosen": -0.04969649761915207,
1117
- "rewards/margins": 0.30080172419548035,
1118
- "rewards/rejected": -0.350498229265213,
1119
- "step": 74
1120
- },
1121
- {
1122
- "epoch": 0.021066583570698138,
1123
- "grad_norm": 1.3515625,
1124
- "learning_rate": 1.0533707865168539e-05,
1125
- "logits/chosen": -7.057081699371338,
1126
- "logits/rejected": -7.057204723358154,
1127
- "logps/chosen": -0.6645674705505371,
1128
- "logps/rejected": -3.872537136077881,
1129
- "loss": 0.8049,
1130
- "rewards/accuracies": 0.890625,
1131
- "rewards/chosen": -0.06645674258470535,
1132
- "rewards/margins": 0.32079699635505676,
1133
- "rewards/rejected": -0.38725370168685913,
1134
- "step": 75
1135
- },
1136
- {
1137
- "epoch": 0.02134747135164078,
1138
- "grad_norm": 0.984375,
1139
- "learning_rate": 1.0674157303370787e-05,
1140
- "logits/chosen": -7.062127113342285,
1141
- "logits/rejected": -7.0622992515563965,
1142
- "logps/chosen": -0.683305561542511,
1143
- "logps/rejected": -3.850494861602783,
1144
- "loss": 0.8067,
1145
- "rewards/accuracies": 0.875,
1146
- "rewards/chosen": -0.0683305561542511,
1147
- "rewards/margins": 0.3167189657688141,
1148
- "rewards/rejected": -0.3850494921207428,
1149
- "step": 76
1150
- },
1151
- {
1152
- "epoch": 0.021628359132583423,
1153
- "grad_norm": 0.7890625,
1154
- "learning_rate": 1.0814606741573033e-05,
1155
- "logits/chosen": -7.0633769035339355,
1156
- "logits/rejected": -7.0634965896606445,
1157
- "logps/chosen": -0.6324968338012695,
1158
- "logps/rejected": -3.4045989513397217,
1159
- "loss": 0.8247,
1160
- "rewards/accuracies": 0.890625,
1161
- "rewards/chosen": -0.06324967741966248,
1162
- "rewards/margins": 0.27721020579338074,
1163
- "rewards/rejected": -0.3404598832130432,
1164
- "step": 77
1165
- },
1166
- {
1167
- "epoch": 0.021909246913526064,
1168
- "grad_norm": 1.1875,
1169
- "learning_rate": 1.0955056179775282e-05,
1170
- "logits/chosen": -7.069698333740234,
1171
- "logits/rejected": -7.0697784423828125,
1172
- "logps/chosen": -0.6205027103424072,
1173
- "logps/rejected": -3.5221495628356934,
1174
- "loss": 0.8222,
1175
- "rewards/accuracies": 0.8515625,
1176
- "rewards/chosen": -0.06205027177929878,
1177
- "rewards/margins": 0.2901647090911865,
1178
- "rewards/rejected": -0.3522149622440338,
1179
- "step": 78
1180
- },
1181
- {
1182
- "epoch": 0.022190134694468705,
1183
- "grad_norm": 1.2109375,
1184
- "learning_rate": 1.1095505617977528e-05,
1185
- "logits/chosen": -7.070184707641602,
1186
- "logits/rejected": -7.070287227630615,
1187
- "logps/chosen": -0.5837754607200623,
1188
- "logps/rejected": -3.566255807876587,
1189
- "loss": 0.8126,
1190
- "rewards/accuracies": 0.875,
1191
- "rewards/chosen": -0.058377549052238464,
1192
- "rewards/margins": 0.2982480823993683,
1193
- "rewards/rejected": -0.35662561655044556,
1194
- "step": 79
1195
- },
1196
- {
1197
- "epoch": 0.022471022475411346,
1198
- "grad_norm": 1.5859375,
1199
- "learning_rate": 1.1235955056179776e-05,
1200
- "logits/chosen": -7.073416709899902,
1201
- "logits/rejected": -7.073558807373047,
1202
- "logps/chosen": -0.6057207584381104,
1203
- "logps/rejected": -3.7733519077301025,
1204
- "loss": 0.8087,
1205
- "rewards/accuracies": 0.8515625,
1206
- "rewards/chosen": -0.060572076588869095,
1207
- "rewards/margins": 0.31676313281059265,
1208
- "rewards/rejected": -0.37733519077301025,
1209
  "step": 80
1210
  },
1211
  {
1212
- "epoch": 0.02275191025635399,
1213
- "grad_norm": 1.0390625,
1214
- "learning_rate": 1.1376404494382022e-05,
1215
- "logits/chosen": -7.079671859741211,
1216
- "logits/rejected": -7.079834461212158,
1217
- "logps/chosen": -0.7357921004295349,
1218
- "logps/rejected": -3.681584596633911,
1219
- "loss": 0.8192,
1220
- "rewards/accuracies": 0.84375,
1221
- "rewards/chosen": -0.07357922196388245,
1222
- "rewards/margins": 0.29457923769950867,
1223
- "rewards/rejected": -0.3681584596633911,
1224
- "step": 81
1225
- },
1226
- {
1227
- "epoch": 0.02303279803729663,
1228
- "grad_norm": 1.5625,
1229
- "learning_rate": 1.151685393258427e-05,
1230
- "logits/chosen": -7.0821213722229,
1231
- "logits/rejected": -7.082340240478516,
1232
- "logps/chosen": -0.628500759601593,
1233
- "logps/rejected": -3.7704579830169678,
1234
- "loss": 0.8099,
1235
- "rewards/accuracies": 0.890625,
1236
- "rewards/chosen": -0.06285007297992706,
1237
- "rewards/margins": 0.31419575214385986,
1238
- "rewards/rejected": -0.37704581022262573,
1239
- "step": 82
1240
- },
1241
- {
1242
- "epoch": 0.023313685818239272,
1243
- "grad_norm": 0.9765625,
1244
- "learning_rate": 1.1657303370786517e-05,
1245
- "logits/chosen": -7.087411880493164,
1246
- "logits/rejected": -7.0875701904296875,
1247
- "logps/chosen": -0.7774583101272583,
1248
- "logps/rejected": -3.098719358444214,
1249
- "loss": 0.8513,
1250
- "rewards/accuracies": 0.7890625,
1251
- "rewards/chosen": -0.07774583995342255,
1252
- "rewards/margins": 0.2321261167526245,
1253
- "rewards/rejected": -0.3098719120025635,
1254
- "step": 83
1255
- },
1256
- {
1257
- "epoch": 0.023594573599181913,
1258
- "grad_norm": 0.96484375,
1259
- "learning_rate": 1.1797752808988765e-05,
1260
- "logits/chosen": -7.087563991546631,
1261
- "logits/rejected": -7.087766647338867,
1262
- "logps/chosen": -0.5942208766937256,
1263
- "logps/rejected": -3.334862232208252,
1264
- "loss": 0.8292,
1265
- "rewards/accuracies": 0.875,
1266
- "rewards/chosen": -0.0594220906496048,
1267
- "rewards/margins": 0.2740641236305237,
1268
- "rewards/rejected": -0.3334861993789673,
1269
- "step": 84
1270
- },
1271
- {
1272
- "epoch": 0.023875461380124557,
1273
- "grad_norm": 1.21875,
1274
- "learning_rate": 1.1938202247191012e-05,
1275
- "logits/chosen": -7.094630241394043,
1276
- "logits/rejected": -7.094793319702148,
1277
- "logps/chosen": -0.6228974461555481,
1278
- "logps/rejected": -4.061489105224609,
1279
- "loss": 0.7951,
1280
- "rewards/accuracies": 0.875,
1281
- "rewards/chosen": -0.06228974461555481,
1282
- "rewards/margins": 0.34385910630226135,
1283
- "rewards/rejected": -0.40614888072013855,
1284
- "step": 85
1285
- },
1286
- {
1287
- "epoch": 0.0241563491610672,
1288
- "grad_norm": 1.6484375,
1289
- "learning_rate": 1.207865168539326e-05,
1290
- "logits/chosen": -7.10193395614624,
1291
- "logits/rejected": -7.1020917892456055,
1292
- "logps/chosen": -0.6275684237480164,
1293
- "logps/rejected": -3.7310783863067627,
1294
- "loss": 0.8106,
1295
- "rewards/accuracies": 0.8984375,
1296
- "rewards/chosen": -0.06275683641433716,
1297
- "rewards/margins": 0.31035101413726807,
1298
- "rewards/rejected": -0.3731078505516052,
1299
- "step": 86
1300
- },
1301
- {
1302
- "epoch": 0.02443723694200984,
1303
- "grad_norm": 1.1171875,
1304
- "learning_rate": 1.2219101123595506e-05,
1305
- "logits/chosen": -7.102446556091309,
1306
- "logits/rejected": -7.102678298950195,
1307
- "logps/chosen": -0.6499467492103577,
1308
- "logps/rejected": -3.5946946144104004,
1309
- "loss": 0.8177,
1310
- "rewards/accuracies": 0.8828125,
1311
- "rewards/chosen": -0.064994677901268,
1312
- "rewards/margins": 0.2944748103618622,
1313
- "rewards/rejected": -0.3594695031642914,
1314
- "step": 87
1315
- },
1316
- {
1317
- "epoch": 0.02471812472295248,
1318
- "grad_norm": 1.4921875,
1319
- "learning_rate": 1.2359550561797752e-05,
1320
- "logits/chosen": -7.104351997375488,
1321
- "logits/rejected": -7.104528427124023,
1322
- "logps/chosen": -0.6288101077079773,
1323
- "logps/rejected": -3.6652326583862305,
1324
- "loss": 0.814,
1325
- "rewards/accuracies": 0.8671875,
1326
- "rewards/chosen": -0.0628810003399849,
1327
- "rewards/margins": 0.303642213344574,
1328
- "rewards/rejected": -0.36652323603630066,
1329
- "step": 88
1330
- },
1331
- {
1332
- "epoch": 0.024999012503895125,
1333
- "grad_norm": 2.78125,
1334
- "learning_rate": 1.25e-05,
1335
- "logits/chosen": -7.110116004943848,
1336
- "logits/rejected": -7.110256671905518,
1337
- "logps/chosen": -0.6746590733528137,
1338
- "logps/rejected": -3.6932754516601562,
1339
- "loss": 0.8167,
1340
- "rewards/accuracies": 0.8515625,
1341
- "rewards/chosen": -0.06746591627597809,
1342
- "rewards/margins": 0.30186164379119873,
1343
- "rewards/rejected": -0.3693275451660156,
1344
- "step": 89
1345
- },
1346
- {
1347
- "epoch": 0.025279900284837765,
1348
- "grad_norm": 1.4140625,
1349
- "learning_rate": 1.2640449438202249e-05,
1350
- "logits/chosen": -7.113439559936523,
1351
- "logits/rejected": -7.113628387451172,
1352
- "logps/chosen": -0.6546343564987183,
1353
- "logps/rejected": -3.9948794841766357,
1354
- "loss": 0.7989,
1355
- "rewards/accuracies": 0.875,
1356
- "rewards/chosen": -0.06546343863010406,
1357
- "rewards/margins": 0.3340245485305786,
1358
- "rewards/rejected": -0.3994879722595215,
1359
  "step": 90
1360
  },
1361
  {
1362
- "epoch": 0.025560788065780406,
1363
- "grad_norm": 1.1171875,
1364
- "learning_rate": 1.2780898876404495e-05,
1365
- "logits/chosen": -7.119630336761475,
1366
- "logits/rejected": -7.119820594787598,
1367
- "logps/chosen": -0.5485891103744507,
1368
- "logps/rejected": -3.8539154529571533,
1369
- "loss": 0.8,
1370
- "rewards/accuracies": 0.890625,
1371
- "rewards/chosen": -0.054858915507793427,
1372
- "rewards/margins": 0.33053261041641235,
1373
- "rewards/rejected": -0.38539156317710876,
1374
- "step": 91
1375
- },
1376
- {
1377
- "epoch": 0.02584167584672305,
1378
- "grad_norm": 1.84375,
1379
- "learning_rate": 1.2921348314606743e-05,
1380
- "logits/chosen": -7.124433994293213,
1381
- "logits/rejected": -7.124541282653809,
1382
- "logps/chosen": -0.5675304532051086,
1383
- "logps/rejected": -3.8615882396698,
1384
- "loss": 0.8041,
1385
- "rewards/accuracies": 0.8828125,
1386
- "rewards/chosen": -0.056753043085336685,
1387
- "rewards/margins": 0.3294057846069336,
1388
- "rewards/rejected": -0.38615882396698,
1389
- "step": 92
1390
- },
1391
- {
1392
- "epoch": 0.02612256362766569,
1393
- "grad_norm": 1.1875,
1394
- "learning_rate": 1.3061797752808991e-05,
1395
- "logits/chosen": -7.122183799743652,
1396
- "logits/rejected": -7.122403621673584,
1397
- "logps/chosen": -0.6670414805412292,
1398
- "logps/rejected": -4.148705959320068,
1399
- "loss": 0.7911,
1400
- "rewards/accuracies": 0.8984375,
1401
- "rewards/chosen": -0.0667041540145874,
1402
- "rewards/margins": 0.34816643595695496,
1403
- "rewards/rejected": -0.41487061977386475,
1404
- "step": 93
1405
- },
1406
- {
1407
- "epoch": 0.026403451408608333,
1408
- "grad_norm": 4.0625,
1409
- "learning_rate": 1.3202247191011236e-05,
1410
- "logits/chosen": -7.128363609313965,
1411
- "logits/rejected": -7.128509998321533,
1412
- "logps/chosen": -0.6008510589599609,
1413
- "logps/rejected": -4.413697719573975,
1414
- "loss": 0.7794,
1415
- "rewards/accuracies": 0.875,
1416
- "rewards/chosen": -0.06008511036634445,
1417
- "rewards/margins": 0.3812846839427948,
1418
- "rewards/rejected": -0.44136980175971985,
1419
- "step": 94
1420
  },
1421
  {
1422
- "epoch": 0.026684339189550974,
1423
- "grad_norm": 2.171875,
1424
- "learning_rate": 1.3342696629213482e-05,
1425
- "logits/chosen": -7.137217044830322,
1426
- "logits/rejected": -7.137389183044434,
1427
- "logps/chosen": -0.6029202938079834,
1428
- "logps/rejected": -3.891530990600586,
1429
- "loss": 0.801,
1430
- "rewards/accuracies": 0.8828125,
1431
- "rewards/chosen": -0.06029203534126282,
1432
- "rewards/margins": 0.3288610577583313,
1433
- "rewards/rejected": -0.3891531229019165,
1434
- "step": 95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1435
  },
1436
  {
1437
- "epoch": 0.026965226970493618,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1438
  "grad_norm": 1.921875,
1439
- "learning_rate": 1.348314606741573e-05,
1440
- "logits/chosen": -7.1387410163879395,
1441
- "logits/rejected": -7.1389546394348145,
1442
- "logps/chosen": -0.6319260597229004,
1443
- "logps/rejected": -4.000391006469727,
1444
- "loss": 0.7998,
1445
- "rewards/accuracies": 0.875,
1446
- "rewards/chosen": -0.06319259852170944,
1447
- "rewards/margins": 0.3368465006351471,
1448
- "rewards/rejected": -0.40003910660743713,
1449
- "step": 96
1450
- },
1451
- {
1452
- "epoch": 0.02724611475143626,
1453
- "grad_norm": 1.0078125,
1454
- "learning_rate": 1.3623595505617979e-05,
1455
- "logits/chosen": -7.144716739654541,
1456
- "logits/rejected": -7.144957065582275,
1457
- "logps/chosen": -0.5787003040313721,
1458
- "logps/rejected": -3.752794027328491,
1459
- "loss": 0.8073,
1460
- "rewards/accuracies": 0.8984375,
1461
- "rewards/chosen": -0.05787003040313721,
1462
- "rewards/margins": 0.31740936636924744,
1463
- "rewards/rejected": -0.37527936697006226,
1464
- "step": 97
1465
- },
1466
- {
1467
- "epoch": 0.0275270025323789,
1468
- "grad_norm": 1.1875,
1469
- "learning_rate": 1.3764044943820225e-05,
1470
- "logits/chosen": -7.144606113433838,
1471
- "logits/rejected": -7.144704818725586,
1472
- "logps/chosen": -0.5937943458557129,
1473
- "logps/rejected": -3.984668493270874,
1474
- "loss": 0.7967,
1475
- "rewards/accuracies": 0.890625,
1476
- "rewards/chosen": -0.05937943235039711,
1477
- "rewards/margins": 0.33908742666244507,
1478
- "rewards/rejected": -0.3984668552875519,
1479
- "step": 98
1480
- },
1481
- {
1482
- "epoch": 0.02780789031332154,
1483
- "grad_norm": 1.1875,
1484
- "learning_rate": 1.3904494382022473e-05,
1485
- "logits/chosen": -7.152622222900391,
1486
- "logits/rejected": -7.152801990509033,
1487
- "logps/chosen": -0.5818836688995361,
1488
- "logps/rejected": -3.607451915740967,
1489
- "loss": 0.8172,
1490
- "rewards/accuracies": 0.8828125,
1491
- "rewards/chosen": -0.05818836763501167,
1492
- "rewards/margins": 0.3025568425655365,
1493
- "rewards/rejected": -0.3607451915740967,
1494
- "step": 99
1495
- },
1496
- {
1497
- "epoch": 0.028088778094264185,
1498
- "grad_norm": 1.0625,
1499
- "learning_rate": 1.4044943820224721e-05,
1500
- "logits/chosen": -7.15784215927124,
1501
- "logits/rejected": -7.1580023765563965,
1502
- "logps/chosen": -0.6804911494255066,
1503
- "logps/rejected": -4.50457239151001,
1504
- "loss": 0.7831,
1505
- "rewards/accuracies": 0.8515625,
1506
- "rewards/chosen": -0.0680491179227829,
1507
- "rewards/margins": 0.38240814208984375,
1508
- "rewards/rejected": -0.45045721530914307,
1509
- "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1510
  }
1511
  ],
1512
- "logging_steps": 1,
1513
- "max_steps": 3560,
1514
  "num_input_tokens_seen": 0,
1515
  "num_train_epochs": 1,
1516
  "save_steps": 100,
@@ -1526,7 +706,7 @@
1526
  "attributes": {}
1527
  }
1528
  },
1529
- "total_flos": 1.966690847510692e+17,
1530
  "train_batch_size": 2,
1531
  "trial_name": null,
1532
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5712596162965194,
5
  "eval_steps": 5000,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.014281490407412986,
13
+ "grad_norm": 49.5,
14
+ "learning_rate": 4.285714285714285e-05,
15
+ "logits/chosen": -2.928096294403076,
16
+ "logits/rejected": -2.9280941486358643,
17
+ "logps/chosen": -3.81945538520813,
18
+ "logps/rejected": -4.2658491134643555,
19
+ "loss": 3.8875,
20
+ "odds_ratio_loss": 7.880957126617432,
21
+ "rewards/accuracies": 0.641796886920929,
22
+ "rewards/chosen": -0.38194555044174194,
23
+ "rewards/margins": 0.044639457017183304,
24
+ "rewards/rejected": -0.42658501863479614,
25
+ "sft_loss": 3.099365711212158,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  "step": 10
27
  },
28
  {
29
+ "epoch": 0.028562980814825972,
30
+ "grad_norm": 9.4375,
31
+ "learning_rate": 8.57142857142857e-05,
32
+ "logits/chosen": -3.5747146606445312,
33
+ "logits/rejected": -3.5747463703155518,
34
+ "logps/chosen": -1.0031383037567139,
35
+ "logps/rejected": -1.795668601989746,
36
+ "loss": 1.0575,
37
+ "odds_ratio_loss": 3.653076171875,
38
+ "rewards/accuracies": 0.7490234375,
39
+ "rewards/chosen": -0.10031384229660034,
40
+ "rewards/margins": 0.07925303280353546,
41
+ "rewards/rejected": -0.1795668601989746,
42
+ "sft_loss": 0.6921551823616028,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  "step": 20
44
  },
45
  {
46
+ "epoch": 0.04284447122223896,
47
+ "grad_norm": 6.375,
48
+ "learning_rate": 0.00012857142857142855,
49
+ "logits/chosen": -4.056135177612305,
50
+ "logits/rejected": -4.05615234375,
51
+ "logps/chosen": -0.7499507665634155,
52
+ "logps/rejected": -1.9470783472061157,
53
+ "loss": 0.7945,
54
+ "odds_ratio_loss": 2.919811248779297,
55
+ "rewards/accuracies": 0.791210949420929,
56
+ "rewards/chosen": -0.07499508559703827,
57
+ "rewards/margins": 0.11971275508403778,
58
+ "rewards/rejected": -0.19470782577991486,
59
+ "sft_loss": 0.5025397539138794,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "step": 30
61
  },
62
  {
63
+ "epoch": 0.057125961629651945,
64
+ "grad_norm": 6.4375,
65
+ "learning_rate": 0.0001714285714285714,
66
+ "logits/chosen": -3.9972000122070312,
67
+ "logits/rejected": -3.997206211090088,
68
+ "logps/chosen": -0.6447885632514954,
69
+ "logps/rejected": -2.2042269706726074,
70
+ "loss": 0.6819,
71
+ "odds_ratio_loss": 2.3861823081970215,
72
+ "rewards/accuracies": 0.8414062261581421,
73
+ "rewards/chosen": -0.06447885185480118,
74
+ "rewards/margins": 0.1559438556432724,
75
+ "rewards/rejected": -0.22042270004749298,
76
+ "sft_loss": 0.4432622492313385,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  "step": 40
78
  },
79
  {
80
+ "epoch": 0.07140745203706493,
81
+ "grad_norm": 2.53125,
82
+ "learning_rate": 0.00021428571428571427,
83
+ "logits/chosen": -3.6315770149230957,
84
+ "logits/rejected": -3.631592273712158,
85
+ "logps/chosen": -0.5242542028427124,
86
+ "logps/rejected": -2.490429401397705,
87
+ "loss": 0.5524,
88
+ "odds_ratio_loss": 1.806905746459961,
89
+ "rewards/accuracies": 0.887890636920929,
90
+ "rewards/chosen": -0.05242542549967766,
91
+ "rewards/margins": 0.1966175138950348,
92
+ "rewards/rejected": -0.24904294312000275,
93
+ "sft_loss": 0.37168318033218384,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  "step": 50
95
  },
96
  {
97
+ "epoch": 0.08568894244447792,
98
+ "grad_norm": 242.0,
99
+ "learning_rate": 0.0002571428571428571,
100
+ "logits/chosen": -3.232466459274292,
101
+ "logits/rejected": -3.2325031757354736,
102
+ "logps/chosen": -0.8311947584152222,
103
+ "logps/rejected": -2.9961485862731934,
104
+ "loss": 0.8575,
105
+ "odds_ratio_loss": 3.2764008045196533,
106
+ "rewards/accuracies": 0.8935546875,
107
+ "rewards/chosen": -0.0831194818019867,
108
+ "rewards/margins": 0.21649539470672607,
109
+ "rewards/rejected": -0.2996148467063904,
110
+ "sft_loss": 0.5298588275909424,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  "step": 60
112
  },
113
  {
114
+ "epoch": 0.0999704328518909,
115
+ "grad_norm": 5.65625,
116
+ "learning_rate": 0.0003,
117
+ "logits/chosen": -2.657118558883667,
118
+ "logits/rejected": -2.6572413444519043,
119
+ "logps/chosen": -0.6468337774276733,
120
+ "logps/rejected": -2.5454134941101074,
121
+ "loss": 0.6815,
122
+ "odds_ratio_loss": 2.252271890640259,
123
+ "rewards/accuracies": 0.869921863079071,
124
+ "rewards/chosen": -0.06468339264392853,
125
+ "rewards/margins": 0.18985795974731445,
126
+ "rewards/rejected": -0.2545413374900818,
127
+ "sft_loss": 0.4562531113624573,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "step": 70
129
  },
130
  {
131
+ "epoch": 0.11425192325930389,
132
+ "grad_norm": 4.25,
133
+ "learning_rate": 0.0002998135381828383,
134
+ "logits/chosen": -2.8170955181121826,
135
+ "logits/rejected": -2.8171167373657227,
136
+ "logps/chosen": -0.4997388422489166,
137
+ "logps/rejected": -2.737879514694214,
138
+ "loss": 0.5264,
139
+ "odds_ratio_loss": 1.6635347604751587,
140
+ "rewards/accuracies": 0.8994140625,
141
+ "rewards/chosen": -0.04997389018535614,
142
+ "rewards/margins": 0.22381405532360077,
143
+ "rewards/rejected": -0.2737879753112793,
144
+ "sft_loss": 0.3600570261478424,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  "step": 80
146
  },
147
  {
148
+ "epoch": 0.12853341366671686,
149
+ "grad_norm": 3.671875,
150
+ "learning_rate": 0.0002992546163048102,
151
+ "logits/chosen": -3.062329053878784,
152
+ "logits/rejected": -3.0623464584350586,
153
+ "logps/chosen": -0.4833168089389801,
154
+ "logps/rejected": -2.676713466644287,
155
+ "loss": 0.5082,
156
+ "odds_ratio_loss": 1.625109314918518,
157
+ "rewards/accuracies": 0.9037109613418579,
158
+ "rewards/chosen": -0.048331670463085175,
159
+ "rewards/margins": 0.21933968365192413,
160
+ "rewards/rejected": -0.2676713764667511,
161
+ "sft_loss": 0.3456498384475708,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  "step": 90
163
  },
164
  {
165
+ "epoch": 0.14281490407412986,
166
+ "grad_norm": 3.828125,
167
+ "learning_rate": 0.0002983246239337692,
168
+ "logits/chosen": -2.9990651607513428,
169
+ "logits/rejected": -2.9990792274475098,
170
+ "logps/chosen": -0.45154619216918945,
171
+ "logps/rejected": -2.796322822570801,
172
+ "loss": 0.4752,
173
+ "odds_ratio_loss": 1.524524450302124,
174
+ "rewards/accuracies": 0.907421886920929,
175
+ "rewards/chosen": -0.04515461623668671,
176
+ "rewards/margins": 0.2344777137041092,
177
+ "rewards/rejected": -0.2796323001384735,
178
+ "sft_loss": 0.3227214813232422,
179
+ "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  },
181
  {
182
+ "epoch": 0.15709639448154283,
183
+ "grad_norm": 4.625,
184
+ "learning_rate": 0.00029702587317728153,
185
+ "logits/chosen": -3.0073421001434326,
186
+ "logits/rejected": -3.0073623657226562,
187
+ "logps/chosen": -0.4845556318759918,
188
+ "logps/rejected": -2.689493417739868,
189
+ "loss": 0.5115,
190
+ "odds_ratio_loss": 1.6119966506958008,
191
+ "rewards/accuracies": 0.8880859613418579,
192
+ "rewards/chosen": -0.04845556616783142,
193
+ "rewards/margins": 0.22049376368522644,
194
+ "rewards/rejected": -0.26894932985305786,
195
+ "sft_loss": 0.3503072261810303,
196
+ "step": 110
197
+ },
198
+ {
199
+ "epoch": 0.17137788488895583,
200
+ "grad_norm": 4.25,
201
+ "learning_rate": 0.00029536159293436166,
202
+ "logits/chosen": -3.0959103107452393,
203
+ "logits/rejected": -3.095935583114624,
204
+ "logps/chosen": -0.4538491368293762,
205
+ "logps/rejected": -2.8936073780059814,
206
+ "loss": 0.4767,
207
+ "odds_ratio_loss": 1.5220377445220947,
208
+ "rewards/accuracies": 0.9091796875,
209
+ "rewards/chosen": -0.04538491368293762,
210
+ "rewards/margins": 0.24397583305835724,
211
+ "rewards/rejected": -0.28936073184013367,
212
+ "sft_loss": 0.32447534799575806,
213
+ "step": 120
214
+ },
215
+ {
216
+ "epoch": 0.1856593752963688,
217
+ "grad_norm": 3.015625,
218
+ "learning_rate": 0.00029333592086792107,
219
+ "logits/chosen": -3.2102882862091064,
220
+ "logits/rejected": -3.210312604904175,
221
+ "logps/chosen": -0.4560007154941559,
222
+ "logps/rejected": -2.9516916275024414,
223
+ "loss": 0.4775,
224
+ "odds_ratio_loss": 1.4568021297454834,
225
+ "rewards/accuracies": 0.9140625,
226
+ "rewards/chosen": -0.04560007154941559,
227
+ "rewards/margins": 0.2495690882205963,
228
+ "rewards/rejected": -0.2951691746711731,
229
+ "sft_loss": 0.33186882734298706,
230
+ "step": 130
231
+ },
232
+ {
233
+ "epoch": 0.1999408657037818,
234
+ "grad_norm": 4.625,
235
+ "learning_rate": 0.0002909538931178862,
236
+ "logits/chosen": -3.1817660331726074,
237
+ "logits/rejected": -3.181790828704834,
238
+ "logps/chosen": -0.4694591164588928,
239
+ "logps/rejected": -2.694068193435669,
240
+ "loss": 0.4932,
241
+ "odds_ratio_loss": 1.5894033908843994,
242
+ "rewards/accuracies": 0.903515636920929,
243
+ "rewards/chosen": -0.046945907175540924,
244
+ "rewards/margins": 0.22246094048023224,
245
+ "rewards/rejected": -0.26940685510635376,
246
+ "sft_loss": 0.3343026041984558,
247
+ "step": 140
248
+ },
249
+ {
250
+ "epoch": 0.21422235611119478,
251
+ "grad_norm": 2.109375,
252
+ "learning_rate": 0.00028822143178056114,
253
+ "logits/chosen": -3.19804310798645,
254
+ "logits/rejected": -3.1980957984924316,
255
+ "logps/chosen": -0.4429679811000824,
256
+ "logps/rejected": -2.8848023414611816,
257
+ "loss": 0.4646,
258
+ "odds_ratio_loss": 1.4565680027008057,
259
+ "rewards/accuracies": 0.9175781011581421,
260
+ "rewards/chosen": -0.04429679363965988,
261
+ "rewards/margins": 0.24418342113494873,
262
+ "rewards/rejected": -0.2884802222251892,
263
+ "sft_loss": 0.31895238161087036,
264
+ "step": 150
265
+ },
266
+ {
267
+ "epoch": 0.22850384651860778,
268
+ "grad_norm": 2.53125,
269
+ "learning_rate": 0.0002851453301853628,
270
+ "logits/chosen": -3.1286864280700684,
271
+ "logits/rejected": -3.1287217140197754,
272
+ "logps/chosen": -0.4620634913444519,
273
+ "logps/rejected": -2.8395490646362305,
274
+ "loss": 0.4861,
275
+ "odds_ratio_loss": 1.5178568363189697,
276
+ "rewards/accuracies": 0.8980468511581421,
277
+ "rewards/chosen": -0.04620635136961937,
278
+ "rewards/margins": 0.23774857819080353,
279
+ "rewards/rejected": -0.283954918384552,
280
+ "sft_loss": 0.33431634306907654,
281
+ "step": 160
282
+ },
283
+ {
284
+ "epoch": 0.24278533692602075,
285
+ "grad_norm": 3.8125,
286
+ "learning_rate": 0.0002817332360055343,
287
+ "logits/chosen": -3.0237438678741455,
288
+ "logits/rejected": -3.0237746238708496,
289
+ "logps/chosen": -0.4375666677951813,
290
+ "logps/rejected": -2.892333507537842,
291
+ "loss": 0.4602,
292
+ "odds_ratio_loss": 1.4504070281982422,
293
+ "rewards/accuracies": 0.9097656011581421,
294
+ "rewards/chosen": -0.04375666379928589,
295
+ "rewards/margins": 0.24547667801380157,
296
+ "rewards/rejected": -0.28923335671424866,
297
+ "sft_loss": 0.31515270471572876,
298
+ "step": 170
299
+ },
300
+ {
301
+ "epoch": 0.2570668273334337,
302
+ "grad_norm": 2.625,
303
+ "learning_rate": 0.0002779936322448233,
304
+ "logits/chosen": -3.0108470916748047,
305
+ "logits/rejected": -3.0108840465545654,
306
+ "logps/chosen": -0.43512678146362305,
307
+ "logps/rejected": -3.0354106426239014,
308
+ "loss": 0.4572,
309
+ "odds_ratio_loss": 1.3499114513397217,
310
+ "rewards/accuracies": 0.9091796875,
311
+ "rewards/chosen": -0.043512679636478424,
312
+ "rewards/margins": 0.2600283920764923,
313
+ "rewards/rejected": -0.30354106426239014,
314
+ "sft_loss": 0.32224926352500916,
315
+ "step": 180
316
+ },
317
+ {
318
+ "epoch": 0.2713483177408467,
319
+ "grad_norm": 2.78125,
320
+ "learning_rate": 0.00027393581614739923,
321
+ "logits/chosen": -3.0553345680236816,
322
+ "logits/rejected": -3.055368423461914,
323
+ "logps/chosen": -0.42374086380004883,
324
+ "logps/rejected": -2.915168523788452,
325
+ "loss": 0.445,
326
+ "odds_ratio_loss": 1.4213359355926514,
327
+ "rewards/accuracies": 0.9175781011581421,
328
+ "rewards/chosen": -0.042374081909656525,
329
+ "rewards/margins": 0.24914276599884033,
330
+ "rewards/rejected": -0.29151684045791626,
331
+ "sft_loss": 0.3028421401977539,
332
+ "step": 190
333
+ },
334
+ {
335
+ "epoch": 0.2856298081482597,
336
+ "grad_norm": 2.421875,
337
+ "learning_rate": 0.0002695698760834384,
338
+ "logits/chosen": -2.994476318359375,
339
+ "logits/rejected": -2.9945011138916016,
340
+ "logps/chosen": -0.4544607102870941,
341
+ "logps/rejected": -2.8547749519348145,
342
+ "loss": 0.4776,
343
+ "odds_ratio_loss": 1.4777902364730835,
344
+ "rewards/accuracies": 0.9033203125,
345
+ "rewards/chosen": -0.04544607177376747,
346
+ "rewards/margins": 0.240031436085701,
347
+ "rewards/rejected": -0.28547748923301697,
348
+ "sft_loss": 0.32984623312950134,
349
+ "step": 200
350
+ },
351
+ {
352
+ "epoch": 0.2999112985556727,
353
+ "grad_norm": 3.546875,
354
+ "learning_rate": 0.00026490666646784665,
355
+ "logits/chosen": -3.063324451446533,
356
+ "logits/rejected": -3.063349723815918,
357
+ "logps/chosen": -0.43639254570007324,
358
+ "logps/rejected": -3.105325698852539,
359
+ "loss": 0.4578,
360
+ "odds_ratio_loss": 1.3640453815460205,
361
+ "rewards/accuracies": 0.9140625,
362
+ "rewards/chosen": -0.04363925755023956,
363
+ "rewards/margins": 0.26689332723617554,
364
+ "rewards/rejected": -0.3105325698852539,
365
+ "sft_loss": 0.3214019536972046,
366
+ "step": 210
367
  },
368
  {
369
+ "epoch": 0.31419278896308567,
370
+ "grad_norm": 2.296875,
371
+ "learning_rate": 0.0002599577807744739,
372
+ "logits/chosen": -3.115455389022827,
373
+ "logits/rejected": -3.1154801845550537,
374
+ "logps/chosen": -0.4168368875980377,
375
+ "logps/rejected": -3.096985340118408,
376
+ "loss": 0.4371,
377
+ "odds_ratio_loss": 1.3493207693099976,
378
+ "rewards/accuracies": 0.9164062738418579,
379
+ "rewards/chosen": -0.04168368875980377,
380
+ "rewards/margins": 0.2680148482322693,
381
+ "rewards/rejected": -0.30969855189323425,
382
+ "sft_loss": 0.3021194040775299,
383
+ "step": 220
384
+ },
385
+ {
386
+ "epoch": 0.32847427937049867,
387
  "grad_norm": 1.921875,
388
+ "learning_rate": 0.0002547355227129109,
389
+ "logits/chosen": -3.162436008453369,
390
+ "logits/rejected": -3.1624579429626465,
391
+ "logps/chosen": -0.4433667063713074,
392
+ "logps/rejected": -3.0064072608947754,
393
+ "loss": 0.4655,
394
+ "odds_ratio_loss": 1.3855293989181519,
395
+ "rewards/accuracies": 0.912304699420929,
396
+ "rewards/chosen": -0.04433666914701462,
397
+ "rewards/margins": 0.2563040852546692,
398
+ "rewards/rejected": -0.3006407618522644,
399
+ "sft_loss": 0.3269914984703064,
400
+ "step": 230
401
+ },
402
+ {
403
+ "epoch": 0.34275576977791167,
404
+ "grad_norm": 1.2734375,
405
+ "learning_rate": 0.0002492528756395289,
406
+ "logits/chosen": -3.1590659618377686,
407
+ "logits/rejected": -3.1590869426727295,
408
+ "logps/chosen": -0.4178268015384674,
409
+ "logps/rejected": -3.0285942554473877,
410
+ "loss": 0.4376,
411
+ "odds_ratio_loss": 1.3180664777755737,
412
+ "rewards/accuracies": 0.917187511920929,
413
+ "rewards/chosen": -0.0417826846241951,
414
+ "rewards/margins": 0.26107674837112427,
415
+ "rewards/rejected": -0.30285942554473877,
416
+ "sft_loss": 0.3058391213417053,
417
+ "step": 240
418
+ },
419
+ {
420
+ "epoch": 0.35703726018532467,
421
+ "grad_norm": 4.125,
422
+ "learning_rate": 0.00024352347027881003,
423
+ "logits/chosen": -3.2828221321105957,
424
+ "logits/rejected": -3.282838821411133,
425
+ "logps/chosen": -0.4194249212741852,
426
+ "logps/rejected": -3.0415470600128174,
427
+ "loss": 0.4403,
428
+ "odds_ratio_loss": 1.3526116609573364,
429
+ "rewards/accuracies": 0.9166015386581421,
430
+ "rewards/chosen": -0.041942495852708817,
431
+ "rewards/margins": 0.26221221685409546,
432
+ "rewards/rejected": -0.3041546940803528,
433
+ "sft_loss": 0.3050472140312195,
434
+ "step": 250
435
+ },
436
+ {
437
+ "epoch": 0.3713187505927376,
438
+ "grad_norm": 2.640625,
439
+ "learning_rate": 0.00023756155083521846,
440
+ "logits/chosen": -3.22637677192688,
441
+ "logits/rejected": -3.2263927459716797,
442
+ "logps/chosen": -0.41995421051979065,
443
+ "logps/rejected": -3.0328941345214844,
444
+ "loss": 0.4417,
445
+ "odds_ratio_loss": 1.3745537996292114,
446
+ "rewards/accuracies": 0.9140625,
447
+ "rewards/chosen": -0.04199542477726936,
448
+ "rewards/margins": 0.26129403710365295,
449
+ "rewards/rejected": -0.3032894432544708,
450
+ "sft_loss": 0.30421775579452515,
451
+ "step": 260
452
+ },
453
+ {
454
+ "epoch": 0.3856002410001506,
455
+ "grad_norm": 2.578125,
456
+ "learning_rate": 0.0002313819395798639,
457
+ "logits/chosen": -3.186093330383301,
458
+ "logits/rejected": -3.1861069202423096,
459
+ "logps/chosen": -0.4216841161251068,
460
+ "logps/rejected": -3.125148057937622,
461
+ "loss": 0.4439,
462
+ "odds_ratio_loss": 1.3766355514526367,
463
+ "rewards/accuracies": 0.913281261920929,
464
+ "rewards/chosen": -0.04216841608285904,
465
+ "rewards/margins": 0.27034634351730347,
466
+ "rewards/rejected": -0.3125148117542267,
467
+ "sft_loss": 0.30621883273124695,
468
+ "step": 270
469
+ },
470
+ {
471
+ "epoch": 0.3998817314075636,
472
+ "grad_norm": 2.03125,
473
+ "learning_rate": 0.000225,
474
+ "logits/chosen": -3.167736530303955,
475
+ "logits/rejected": -3.167752265930176,
476
+ "logps/chosen": -0.42215681076049805,
477
+ "logps/rejected": -3.0445384979248047,
478
+ "loss": 0.4415,
479
+ "odds_ratio_loss": 1.3319975137710571,
480
+ "rewards/accuracies": 0.9189453125,
481
+ "rewards/chosen": -0.04221567511558533,
482
+ "rewards/margins": 0.26223814487457275,
483
+ "rewards/rejected": -0.30445384979248047,
484
+ "sft_loss": 0.3082923889160156,
485
+ "step": 280
486
+ },
487
+ {
488
+ "epoch": 0.4141632218149766,
489
+ "grad_norm": 1.9140625,
490
+ "learning_rate": 0.00021843159860297442,
491
+ "logits/chosen": -3.2080886363983154,
492
+ "logits/rejected": -3.208108425140381,
493
+ "logps/chosen": -0.3955201208591461,
494
+ "logps/rejected": -3.1413300037384033,
495
+ "loss": 0.4162,
496
+ "odds_ratio_loss": 1.298853874206543,
497
+ "rewards/accuracies": 0.9166015386581421,
498
+ "rewards/chosen": -0.03955201059579849,
499
+ "rewards/margins": 0.27458101511001587,
500
+ "rewards/rejected": -0.3141329884529114,
501
+ "sft_loss": 0.28630274534225464,
502
+ "step": 290
503
+ },
504
+ {
505
+ "epoch": 0.42844471222238956,
506
+ "grad_norm": 2.296875,
507
+ "learning_rate": 0.00021169306546959174,
508
+ "logits/chosen": -3.1625964641571045,
509
+ "logits/rejected": -3.1626217365264893,
510
+ "logps/chosen": -0.4014604091644287,
511
+ "logps/rejected": -3.231706142425537,
512
+ "loss": 0.4211,
513
+ "odds_ratio_loss": 1.2652801275253296,
514
+ "rewards/accuracies": 0.91796875,
515
+ "rewards/chosen": -0.04014604538679123,
516
+ "rewards/margins": 0.2830246090888977,
517
+ "rewards/rejected": -0.32317066192626953,
518
+ "sft_loss": 0.2946000099182129,
519
+ "step": 300
520
+ },
521
+ {
522
+ "epoch": 0.44272620262980256,
523
+ "grad_norm": 1.9765625,
524
+ "learning_rate": 0.00020480115365495926,
525
+ "logits/chosen": -3.1747231483459473,
526
+ "logits/rejected": -3.1747519969940186,
527
+ "logps/chosen": -0.3960801064968109,
528
+ "logps/rejected": -3.082359790802002,
529
+ "loss": 0.4173,
530
+ "odds_ratio_loss": 1.3159233331680298,
531
+ "rewards/accuracies": 0.9173828363418579,
532
+ "rewards/chosen": -0.03960801288485527,
533
+ "rewards/margins": 0.26862797141075134,
534
+ "rewards/rejected": -0.30823594331741333,
535
+ "sft_loss": 0.2857065200805664,
536
+ "step": 310
537
+ },
538
+ {
539
+ "epoch": 0.45700769303721556,
540
+ "grad_norm": 2.90625,
541
+ "learning_rate": 0.00019777299753775265,
542
+ "logits/chosen": -3.2027382850646973,
543
+ "logits/rejected": -3.202775478363037,
544
+ "logps/chosen": -0.3917561173439026,
545
+ "logps/rejected": -3.128166675567627,
546
+ "loss": 0.4113,
547
+ "odds_ratio_loss": 1.213888168334961,
548
+ "rewards/accuracies": 0.923046886920929,
549
+ "rewards/chosen": -0.0391756072640419,
550
+ "rewards/margins": 0.27364104986190796,
551
+ "rewards/rejected": -0.31281667947769165,
552
+ "sft_loss": 0.28995418548583984,
553
+ "step": 320
554
+ },
555
+ {
556
+ "epoch": 0.47128918344462856,
557
+ "grad_norm": 1.1171875,
558
+ "learning_rate": 0.00019062607022145078,
559
+ "logits/chosen": -3.223431348800659,
560
+ "logits/rejected": -3.2234749794006348,
561
+ "logps/chosen": -0.3959726393222809,
562
+ "logps/rejected": -3.1301980018615723,
563
+ "loss": 0.4159,
564
+ "odds_ratio_loss": 1.2752103805541992,
565
+ "rewards/accuracies": 0.9134765863418579,
566
+ "rewards/chosen": -0.03959726542234421,
567
+ "rewards/margins": 0.2734225392341614,
568
+ "rewards/rejected": -0.3130198121070862,
569
+ "sft_loss": 0.2884255647659302,
570
+ "step": 330
571
+ },
572
+ {
573
+ "epoch": 0.4855706738520415,
574
+ "grad_norm": 2.421875,
575
+ "learning_rate": 0.00018337814009344714,
576
+ "logits/chosen": -3.229165554046631,
577
+ "logits/rejected": -3.22920298576355,
578
+ "logps/chosen": -0.40863022208213806,
579
+ "logps/rejected": -3.2035133838653564,
580
+ "loss": 0.4293,
581
+ "odds_ratio_loss": 1.3022868633270264,
582
+ "rewards/accuracies": 0.923046886920929,
583
+ "rewards/chosen": -0.040863025933504105,
584
+ "rewards/margins": 0.27948835492134094,
585
+ "rewards/rejected": -0.32035139203071594,
586
+ "sft_loss": 0.29906368255615234,
587
+ "step": 340
588
+ },
589
+ {
590
+ "epoch": 0.4998521642594545,
591
+ "grad_norm": 1.6953125,
592
+ "learning_rate": 0.00017604722665003956,
593
+ "logits/chosen": -3.268237590789795,
594
+ "logits/rejected": -3.268270969390869,
595
+ "logps/chosen": -0.3820918798446655,
596
+ "logps/rejected": -3.3142802715301514,
597
+ "loss": 0.4012,
598
+ "odds_ratio_loss": 1.2229855060577393,
599
+ "rewards/accuracies": 0.924023449420929,
600
+ "rewards/chosen": -0.03820918872952461,
601
+ "rewards/margins": 0.29321882128715515,
602
+ "rewards/rejected": -0.33142799139022827,
603
+ "sft_loss": 0.278933584690094,
604
+ "step": 350
605
+ },
606
+ {
607
+ "epoch": 0.5141336546668674,
608
+ "grad_norm": 2.375,
609
+ "learning_rate": 0.00016865155569712278,
610
+ "logits/chosen": -3.3011035919189453,
611
+ "logits/rejected": -3.3011412620544434,
612
+ "logps/chosen": -0.38040798902511597,
613
+ "logps/rejected": -3.2756595611572266,
614
+ "loss": 0.3985,
615
+ "odds_ratio_loss": 1.1769336462020874,
616
+ "rewards/accuracies": 0.9302734136581421,
617
+ "rewards/chosen": -0.03804079815745354,
618
+ "rewards/margins": 0.2895251214504242,
619
+ "rewards/rejected": -0.3275659680366516,
620
+ "sft_loss": 0.28078263998031616,
621
+ "step": 360
622
+ },
623
+ {
624
+ "epoch": 0.5284151450742804,
625
+ "grad_norm": 1.484375,
626
+ "learning_rate": 0.00016120951403796364,
627
+ "logits/chosen": -3.336045026779175,
628
+ "logits/rejected": -3.3360836505889893,
629
+ "logps/chosen": -0.3747532069683075,
630
+ "logps/rejected": -3.3229317665100098,
631
+ "loss": 0.3938,
632
+ "odds_ratio_loss": 1.2170263528823853,
633
+ "rewards/accuracies": 0.924023449420929,
634
+ "rewards/chosen": -0.03747531771659851,
635
+ "rewards/margins": 0.29481783509254456,
636
+ "rewards/rejected": -0.33229315280914307,
637
+ "sft_loss": 0.2720716595649719,
638
+ "step": 370
639
+ },
640
+ {
641
+ "epoch": 0.5426966354816934,
642
+ "grad_norm": 1.1953125,
643
+ "learning_rate": 0.00015373960376071093,
644
+ "logits/chosen": -3.3047919273376465,
645
+ "logits/rejected": -3.3048160076141357,
646
+ "logps/chosen": -0.37664586305618286,
647
+ "logps/rejected": -3.1220898628234863,
648
+ "loss": 0.3961,
649
+ "odds_ratio_loss": 1.2487261295318604,
650
+ "rewards/accuracies": 0.921875,
651
+ "rewards/chosen": -0.037664588540792465,
652
+ "rewards/margins": 0.2745443880558014,
653
+ "rewards/rejected": -0.31220895051956177,
654
+ "sft_loss": 0.2712169289588928,
655
+ "step": 380
656
+ },
657
+ {
658
+ "epoch": 0.5569781258891064,
659
+ "grad_norm": 2.015625,
660
+ "learning_rate": 0.00014626039623928907,
661
+ "logits/chosen": -3.3392891883850098,
662
+ "logits/rejected": -3.339310884475708,
663
+ "logps/chosen": -0.3588925302028656,
664
+ "logps/rejected": -3.4983272552490234,
665
+ "loss": 0.3772,
666
+ "odds_ratio_loss": 1.1625574827194214,
667
+ "rewards/accuracies": 0.927734375,
668
+ "rewards/chosen": -0.03588924929499626,
669
+ "rewards/margins": 0.31394344568252563,
670
+ "rewards/rejected": -0.3498327136039734,
671
+ "sft_loss": 0.2609647512435913,
672
+ "step": 390
673
+ },
674
+ {
675
+ "epoch": 0.5712596162965194,
676
+ "grad_norm": 1.5078125,
677
+ "learning_rate": 0.00013879048596203636,
678
+ "logits/chosen": -3.382007598876953,
679
+ "logits/rejected": -3.3820137977600098,
680
+ "logps/chosen": -0.3696475028991699,
681
+ "logps/rejected": -3.3737378120422363,
682
+ "loss": 0.3872,
683
+ "odds_ratio_loss": 1.1461818218231201,
684
+ "rewards/accuracies": 0.929882824420929,
685
+ "rewards/chosen": -0.03696475178003311,
686
+ "rewards/margins": 0.3004090189933777,
687
+ "rewards/rejected": -0.3373737931251526,
688
+ "sft_loss": 0.27257078886032104,
689
+ "step": 400
690
  }
691
  ],
692
+ "logging_steps": 10,
693
+ "max_steps": 700,
694
  "num_input_tokens_seen": 0,
695
  "num_train_epochs": 1,
696
  "save_steps": 100,
 
706
  "attributes": {}
707
  }
708
  },
709
+ "total_flos": 3.1756323933484155e+18,
710
  "train_batch_size": 2,
711
  "trial_name": null,
712
  "trial_params": null