hanxinyan committed on
Commit ae5bbaf · verified · 1 Parent(s): 71f87d4

Model save

Files changed (4)
  1. README.md +2 -4
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +2764 -440
README.md CHANGED
@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-1.5B-Instruct
-datasets: open-r1/OpenR1-Math-220k
 library_name: transformers
 model_name: Qwen2.5-1.5B-Open-R1-Distill
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - sft
 licence: license
@@ -13,7 +11,7 @@ licence: license
 
 # Model Card for Qwen2.5-1.5B-Open-R1-Distill
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -36,7 +34,7 @@ This model was trained with SFT.
 
 ### Framework versions
 
-- TRL: 0.16.0.dev0
+- TRL: 0.15.0
 - Transformers: 4.49.0
 - Pytorch: 2.5.1
 - Datasets: 3.3.2
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "total_flos": 488621249396736.0,
-    "train_loss": 0.5826025370564034,
-    "train_runtime": 12677.8495,
+    "total_flos": 250577921507328.0,
+    "train_loss": 0.49602713585984115,
+    "train_runtime": 3019.3082,
     "train_samples": 93733,
-    "train_samples_per_second": 2.706,
-    "train_steps_per_second": 0.042
+    "train_samples_per_second": 93.134,
+    "train_steps_per_second": 0.728
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "total_flos": 488621249396736.0,
-    "train_loss": 0.5826025370564034,
-    "train_runtime": 12677.8495,
+    "total_flos": 250577921507328.0,
+    "train_loss": 0.49602713585984115,
+    "train_runtime": 3019.3082,
     "train_samples": 93733,
-    "train_samples_per_second": 2.706,
-    "train_steps_per_second": 0.042
+    "train_samples_per_second": 93.134,
+    "train_steps_per_second": 0.728
 }
trainer_state.json CHANGED
@@ -1,776 +1,3100 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 536,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.009328358208955223,
13
- "grad_norm": 1.9582884722650218,
14
- "learning_rate": 3.7037037037037037e-06,
15
- "loss": 0.8531,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 0.018656716417910446,
20
- "grad_norm": 1.5507688578118357,
21
- "learning_rate": 7.4074074074074075e-06,
22
- "loss": 0.829,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.027985074626865673,
27
- "grad_norm": 0.9808848931456882,
28
- "learning_rate": 1.1111111111111113e-05,
29
- "loss": 0.7718,
30
  "step": 15
31
  },
32
  {
33
- "epoch": 0.03731343283582089,
34
- "grad_norm": 0.7057547622223923,
35
- "learning_rate": 1.4814814814814815e-05,
36
- "loss": 0.7146,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.04664179104477612,
41
- "grad_norm": 0.5153515693635202,
42
- "learning_rate": 1.851851851851852e-05,
43
- "loss": 0.6855,
44
  "step": 25
45
  },
46
  {
47
- "epoch": 0.055970149253731345,
48
- "grad_norm": 0.42512955547277975,
49
- "learning_rate": 1.9998457210069423e-05,
50
- "loss": 0.6828,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.06529850746268656,
55
- "grad_norm": 0.35017714168063874,
56
- "learning_rate": 1.9989030964793824e-05,
57
- "loss": 0.6655,
58
  "step": 35
59
  },
60
  {
61
- "epoch": 0.07462686567164178,
62
- "grad_norm": 0.30321555619821305,
63
- "learning_rate": 1.9971044545405168e-05,
64
- "loss": 0.649,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.08395522388059702,
69
- "grad_norm": 0.25480072292993633,
70
- "learning_rate": 1.9944515080210415e-05,
71
- "loss": 0.6294,
72
  "step": 45
73
  },
74
  {
75
- "epoch": 0.09328358208955224,
76
- "grad_norm": 0.24437407015433799,
77
- "learning_rate": 1.9909467832982947e-05,
78
- "loss": 0.631,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.10261194029850747,
83
- "grad_norm": 0.27312552235212906,
84
- "learning_rate": 1.9865936178904096e-05,
85
- "loss": 0.634,
86
  "step": 55
87
  },
88
  {
89
- "epoch": 0.11194029850746269,
90
- "grad_norm": 0.2974145556250763,
91
- "learning_rate": 1.9813961572780254e-05,
92
- "loss": 0.6196,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.12126865671641791,
97
- "grad_norm": 0.2804969799737878,
98
- "learning_rate": 1.9753593509565844e-05,
99
- "loss": 0.616,
100
  "step": 65
101
  },
102
  {
103
- "epoch": 0.13059701492537312,
104
- "grad_norm": 0.29481795383951065,
105
- "learning_rate": 1.968488947722971e-05,
106
- "loss": 0.6121,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.13992537313432835,
111
- "grad_norm": 0.2675399493219817,
112
- "learning_rate": 1.9607914902009818e-05,
113
- "loss": 0.6078,
114
  "step": 75
115
  },
116
  {
117
- "epoch": 0.14925373134328357,
118
- "grad_norm": 0.2766868403860743,
119
- "learning_rate": 1.9522743086108458e-05,
120
- "loss": 0.6073,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.15858208955223882,
125
- "grad_norm": 0.2789954475671177,
126
- "learning_rate": 1.94294551378872e-05,
127
- "loss": 0.6119,
128
  "step": 85
129
  },
130
  {
131
- "epoch": 0.16791044776119404,
132
- "grad_norm": 0.2739304731521366,
133
- "learning_rate": 1.932813989462812e-05,
134
- "loss": 0.6088,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.17723880597014927,
139
- "grad_norm": 0.24762169104339068,
140
- "learning_rate": 1.921889383793486e-05,
141
- "loss": 0.596,
142
  "step": 95
143
  },
144
  {
145
- "epoch": 0.1865671641791045,
146
- "grad_norm": 0.26048399604907696,
147
- "learning_rate": 1.9101821001854028e-05,
148
- "loss": 0.5926,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.1958955223880597,
153
- "grad_norm": 0.23913719486892107,
154
- "learning_rate": 1.897703287380454e-05,
155
- "loss": 0.6026,
156
  "step": 105
157
  },
158
  {
159
- "epoch": 0.20522388059701493,
160
- "grad_norm": 0.2669491913905435,
161
- "learning_rate": 1.8844648288409122e-05,
162
- "loss": 0.5939,
163
  "step": 110
164
  },
165
  {
166
- "epoch": 0.21455223880597016,
167
- "grad_norm": 0.2825642677905445,
168
- "learning_rate": 1.870479331432915e-05,
169
- "loss": 0.5888,
170
  "step": 115
171
  },
172
  {
173
- "epoch": 0.22388059701492538,
174
- "grad_norm": 0.26698835275706084,
175
- "learning_rate": 1.8557601134210615e-05,
176
- "loss": 0.6,
177
  "step": 120
178
  },
179
  {
180
- "epoch": 0.2332089552238806,
181
- "grad_norm": 0.24988039852201316,
182
- "learning_rate": 1.840321191785544e-05,
183
- "loss": 0.5839,
184
  "step": 125
185
  },
186
  {
187
- "epoch": 0.24253731343283583,
188
- "grad_norm": 0.23724751101455963,
189
- "learning_rate": 1.8241772688739053e-05,
190
- "loss": 0.5868,
191
  "step": 130
192
  },
193
  {
194
- "epoch": 0.251865671641791,
195
- "grad_norm": 0.25353094061148207,
196
- "learning_rate": 1.8073437184001245e-05,
197
- "loss": 0.5797,
198
  "step": 135
199
  },
200
  {
201
- "epoch": 0.26119402985074625,
202
- "grad_norm": 0.28310908056906503,
203
- "learning_rate": 1.7898365708043625e-05,
204
- "loss": 0.5851,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 0.27052238805970147,
209
- "grad_norm": 0.3057205030340506,
210
- "learning_rate": 1.7716724979873204e-05,
211
- "loss": 0.5844,
212
  "step": 145
213
  },
214
  {
215
- "epoch": 0.2798507462686567,
216
- "grad_norm": 0.25557080693275586,
217
- "learning_rate": 1.7528687974337362e-05,
218
- "loss": 0.5782,
219
  "step": 150
220
  },
221
  {
222
- "epoch": 0.2891791044776119,
223
- "grad_norm": 0.29118310404885117,
224
- "learning_rate": 1.733443375740142e-05,
225
- "loss": 0.5745,
226
  "step": 155
227
  },
228
  {
229
- "epoch": 0.29850746268656714,
230
- "grad_norm": 0.31536471725958115,
231
- "learning_rate": 1.7134147315625745e-05,
232
- "loss": 0.5875,
233
  "step": 160
234
  },
235
  {
236
- "epoch": 0.30783582089552236,
237
- "grad_norm": 0.27386069561360055,
238
- "learning_rate": 1.6928019380004676e-05,
239
- "loss": 0.5808,
240
  "step": 165
241
  },
242
  {
243
- "epoch": 0.31716417910447764,
244
- "grad_norm": 0.2554841411662944,
245
- "learning_rate": 1.6716246244335107e-05,
246
- "loss": 0.5732,
247
  "step": 170
248
  },
249
  {
250
- "epoch": 0.32649253731343286,
251
- "grad_norm": 0.240119376051681,
252
- "learning_rate": 1.6499029578287657e-05,
253
- "loss": 0.5794,
254
  "step": 175
255
  },
256
  {
257
- "epoch": 0.3358208955223881,
258
- "grad_norm": 0.3149497936923061,
259
- "learning_rate": 1.6276576235358418e-05,
260
- "loss": 0.5806,
261
  "step": 180
262
  },
263
  {
264
- "epoch": 0.3451492537313433,
265
- "grad_norm": 0.29962666315829234,
266
- "learning_rate": 1.6049098055884245e-05,
267
- "loss": 0.5846,
268
  "step": 185
269
  },
270
  {
271
- "epoch": 0.35447761194029853,
272
- "grad_norm": 0.24917143721758442,
273
- "learning_rate": 1.5816811665309056e-05,
274
- "loss": 0.5727,
275
  "step": 190
276
  },
277
  {
278
- "epoch": 0.36380597014925375,
279
- "grad_norm": 0.26717681329209175,
280
- "learning_rate": 1.5579938267893384e-05,
281
- "loss": 0.5728,
282
  "step": 195
283
  },
284
  {
285
- "epoch": 0.373134328358209,
286
- "grad_norm": 0.2727272803589457,
287
- "learning_rate": 1.5338703436063506e-05,
288
- "loss": 0.5768,
289
  "step": 200
290
  },
291
  {
292
- "epoch": 0.3824626865671642,
293
- "grad_norm": 0.25711804455030257,
294
- "learning_rate": 1.509333689560084e-05,
295
- "loss": 0.5678,
296
  "step": 205
297
  },
298
  {
299
- "epoch": 0.3917910447761194,
300
- "grad_norm": 0.2755159161222621,
301
- "learning_rate": 1.4844072306876138e-05,
302
- "loss": 0.575,
303
  "step": 210
304
  },
305
  {
306
- "epoch": 0.40111940298507465,
307
- "grad_norm": 0.2916452631396203,
308
- "learning_rate": 1.4591147042336775e-05,
309
- "loss": 0.5796,
310
  "step": 215
311
  },
312
  {
313
- "epoch": 0.41044776119402987,
314
- "grad_norm": 0.24664116560867164,
315
- "learning_rate": 1.4334801960459105e-05,
316
- "loss": 0.5779,
317
  "step": 220
318
  },
319
  {
320
- "epoch": 0.4197761194029851,
321
- "grad_norm": 0.2799305252096292,
322
- "learning_rate": 1.4075281176381066e-05,
323
- "loss": 0.5739,
324
  "step": 225
325
  },
326
  {
327
- "epoch": 0.4291044776119403,
328
- "grad_norm": 0.25956452060232377,
329
- "learning_rate": 1.3812831829433511e-05,
330
- "loss": 0.5657,
331
  "step": 230
332
  },
333
  {
334
- "epoch": 0.43843283582089554,
335
- "grad_norm": 0.2524561937779275,
336
- "learning_rate": 1.3547703847791627e-05,
337
- "loss": 0.567,
338
  "step": 235
339
  },
340
  {
341
- "epoch": 0.44776119402985076,
342
- "grad_norm": 0.2776928850104372,
343
- "learning_rate": 1.3280149710470556e-05,
344
- "loss": 0.5749,
345
  "step": 240
346
  },
347
  {
348
- "epoch": 0.457089552238806,
349
- "grad_norm": 0.25702076508566124,
350
- "learning_rate": 1.301042420689189e-05,
351
- "loss": 0.5734,
352
  "step": 245
353
  },
354
  {
355
- "epoch": 0.4664179104477612,
356
- "grad_norm": 0.22797355115908421,
357
- "learning_rate": 1.2738784194249954e-05,
358
- "loss": 0.5624,
359
  "step": 250
360
  },
361
  {
362
- "epoch": 0.47574626865671643,
363
- "grad_norm": 0.24254293214473366,
364
- "learning_rate": 1.2465488352909005e-05,
365
- "loss": 0.5706,
366
  "step": 255
367
  },
368
  {
369
- "epoch": 0.48507462686567165,
370
- "grad_norm": 0.255268618507525,
371
- "learning_rate": 1.2190796940064225e-05,
372
- "loss": 0.5632,
373
  "step": 260
374
  },
375
  {
376
- "epoch": 0.4944029850746269,
377
- "grad_norm": 0.2639171300781658,
378
- "learning_rate": 1.1914971541901094e-05,
379
- "loss": 0.5567,
380
  "step": 265
381
  },
382
  {
383
- "epoch": 0.503731343283582,
384
- "grad_norm": 0.25681696403278814,
385
- "learning_rate": 1.1638274824489224e-05,
386
- "loss": 0.5717,
387
  "step": 270
388
  },
389
  {
390
- "epoch": 0.5130597014925373,
391
- "grad_norm": 0.24386938842368983,
392
- "learning_rate": 1.1360970283647774e-05,
393
- "loss": 0.5712,
394
  "step": 275
395
  },
396
  {
397
- "epoch": 0.5223880597014925,
398
- "grad_norm": 0.23806492070210172,
399
- "learning_rate": 1.1083321994020748e-05,
400
- "loss": 0.5681,
401
  "step": 280
402
  },
403
  {
404
- "epoch": 0.5317164179104478,
405
- "grad_norm": 0.22991480578545048,
406
- "learning_rate": 1.080559435760105e-05,
407
- "loss": 0.5555,
408
  "step": 285
409
  },
410
  {
411
- "epoch": 0.5410447761194029,
412
- "grad_norm": 0.2520057600566624,
413
- "learning_rate": 1.0528051851942837e-05,
414
- "loss": 0.569,
415
  "step": 290
416
  },
417
  {
418
- "epoch": 0.5503731343283582,
419
- "grad_norm": 0.2209222924413786,
420
- "learning_rate": 1.025095877830188e-05,
421
- "loss": 0.5582,
422
  "step": 295
423
  },
424
  {
425
- "epoch": 0.5597014925373134,
426
- "grad_norm": 0.23977946340271009,
427
- "learning_rate": 9.97457900994385e-06,
428
- "loss": 0.5628,
429
  "step": 300
430
  },
431
  {
432
- "epoch": 0.5690298507462687,
433
- "grad_norm": 0.22571103566953005,
434
- "learning_rate": 9.699175740860154e-06,
435
- "loss": 0.5577,
436
  "step": 305
437
  },
438
  {
439
- "epoch": 0.5783582089552238,
440
- "grad_norm": 0.21591058152406564,
441
- "learning_rate": 9.42501123513063e-06,
442
- "loss": 0.5639,
443
  "step": 310
444
  },
445
  {
446
- "epoch": 0.5876865671641791,
447
- "grad_norm": 0.2167881232829152,
448
- "learning_rate": 9.152346577171818e-06,
449
- "loss": 0.561,
450
  "step": 315
451
  },
452
  {
453
- "epoch": 0.5970149253731343,
454
- "grad_norm": 0.22870575477314312,
455
- "learning_rate": 8.881441423108578e-06,
456
- "loss": 0.5572,
457
  "step": 320
458
  },
459
  {
460
- "epoch": 0.6063432835820896,
461
- "grad_norm": 0.25737341600608027,
462
- "learning_rate": 8.612553753505893e-06,
463
- "loss": 0.5558,
464
  "step": 325
465
  },
466
  {
467
- "epoch": 0.6156716417910447,
468
- "grad_norm": 0.22792887760278,
469
- "learning_rate": 8.345939627696258e-06,
470
- "loss": 0.5605,
471
  "step": 330
472
  },
473
  {
474
- "epoch": 0.625,
475
- "grad_norm": 0.21205982412192553,
476
- "learning_rate": 8.081852939936668e-06,
477
- "loss": 0.5576,
478
  "step": 335
479
  },
480
  {
481
- "epoch": 0.6343283582089553,
482
- "grad_norm": 0.24057275333558703,
483
- "learning_rate": 7.820545177627385e-06,
484
- "loss": 0.5597,
485
  "step": 340
486
  },
487
  {
488
- "epoch": 0.6436567164179104,
489
- "grad_norm": 0.22465764228731216,
490
- "learning_rate": 7.562265181822712e-06,
491
- "loss": 0.5533,
492
  "step": 345
493
  },
494
  {
495
- "epoch": 0.6529850746268657,
496
- "grad_norm": 0.21415330979830366,
497
- "learning_rate": 7.307258910261883e-06,
498
- "loss": 0.5636,
499
  "step": 350
500
  },
501
  {
502
- "epoch": 0.6623134328358209,
503
- "grad_norm": 0.21866500027167923,
504
- "learning_rate": 7.055769203145701e-06,
505
- "loss": 0.5502,
506
  "step": 355
507
  },
508
  {
509
- "epoch": 0.6716417910447762,
510
- "grad_norm": 0.21062825583929337,
511
- "learning_rate": 6.808035551881964e-06,
512
- "loss": 0.551,
513
  "step": 360
514
  },
515
  {
516
- "epoch": 0.6809701492537313,
517
- "grad_norm": 0.2712485586267244,
518
- "learning_rate": 6.564293871019949e-06,
519
- "loss": 0.5547,
520
  "step": 365
521
  },
522
  {
523
- "epoch": 0.6902985074626866,
524
- "grad_norm": 0.19984536466486436,
525
- "learning_rate": 6.32477627359108e-06,
526
- "loss": 0.5536,
527
  "step": 370
528
  },
529
  {
530
- "epoch": 0.6996268656716418,
531
- "grad_norm": 0.21558313058123174,
532
- "learning_rate": 6.089710850069755e-06,
533
- "loss": 0.5566,
534
  "step": 375
535
  },
536
  {
537
- "epoch": 0.7089552238805971,
538
- "grad_norm": 0.22554670287610917,
539
- "learning_rate": 5.8593214511648165e-06,
540
- "loss": 0.5592,
541
  "step": 380
542
  },
543
  {
544
- "epoch": 0.7182835820895522,
545
- "grad_norm": 0.20601098663874678,
546
- "learning_rate": 5.633827474648525e-06,
547
- "loss": 0.5615,
548
  "step": 385
549
  },
550
  {
551
- "epoch": 0.7276119402985075,
552
- "grad_norm": 0.19913008795158682,
553
- "learning_rate": 5.413443656426003e-06,
554
- "loss": 0.56,
555
  "step": 390
556
  },
557
  {
558
- "epoch": 0.7369402985074627,
559
- "grad_norm": 0.2169264111526138,
560
- "learning_rate": 5.198379866044152e-06,
561
- "loss": 0.5574,
562
  "step": 395
563
  },
564
  {
565
- "epoch": 0.746268656716418,
566
- "grad_norm": 0.20465918277941866,
567
- "learning_rate": 4.988840906834762e-06,
568
- "loss": 0.5531,
569
  "step": 400
570
  },
571
  {
572
- "epoch": 0.7555970149253731,
573
- "grad_norm": 0.21189384802184624,
574
- "learning_rate": 4.785026320882102e-06,
575
- "loss": 0.5596,
576
  "step": 405
577
  },
578
  {
579
- "epoch": 0.7649253731343284,
580
- "grad_norm": 0.21275940358631434,
581
- "learning_rate": 4.5871301990007814e-06,
582
- "loss": 0.5458,
583
  "step": 410
584
  },
585
  {
586
- "epoch": 0.7742537313432836,
587
- "grad_norm": 0.21099375846338056,
588
- "learning_rate": 4.395340995904802e-06,
589
- "loss": 0.5623,
590
  "step": 415
591
  },
592
  {
593
- "epoch": 0.7835820895522388,
594
- "grad_norm": 0.20468394486262131,
595
- "learning_rate": 4.209841350743804e-06,
596
- "loss": 0.5537,
597
  "step": 420
598
  },
599
  {
600
- "epoch": 0.792910447761194,
601
- "grad_norm": 0.2018709806008376,
602
- "learning_rate": 4.030807913177434e-06,
603
- "loss": 0.552,
604
  "step": 425
605
  },
606
  {
607
- "epoch": 0.8022388059701493,
608
- "grad_norm": 0.19195580300578935,
609
- "learning_rate": 3.858411175153449e-06,
610
- "loss": 0.5528,
611
  "step": 430
612
  },
613
  {
614
- "epoch": 0.8115671641791045,
615
- "grad_norm": 0.19513026612777026,
616
- "learning_rate": 3.6928153085497507e-06,
617
- "loss": 0.5552,
618
  "step": 435
619
  },
620
  {
621
- "epoch": 0.8208955223880597,
622
- "grad_norm": 0.20961199025649588,
623
- "learning_rate": 3.53417800883497e-06,
624
- "loss": 0.5495,
625
  "step": 440
626
  },
627
  {
628
- "epoch": 0.8302238805970149,
629
- "grad_norm": 0.21115306166452463,
630
- "learning_rate": 3.382650344896477e-06,
631
- "loss": 0.5602,
632
  "step": 445
633
  },
634
  {
635
- "epoch": 0.8395522388059702,
636
- "grad_norm": 0.19169723690704785,
637
- "learning_rate": 3.2383766151788232e-06,
638
- "loss": 0.5575,
639
  "step": 450
640
  },
641
  {
642
- "epoch": 0.8488805970149254,
643
- "grad_norm": 0.202529193254928,
644
- "learning_rate": 3.101494210269623e-06,
645
- "loss": 0.5499,
646
  "step": 455
647
  },
648
  {
649
- "epoch": 0.8582089552238806,
650
- "grad_norm": 0.20647105085898998,
651
- "learning_rate": 2.9721334820637142e-06,
652
- "loss": 0.5607,
653
  "step": 460
654
  },
655
  {
656
- "epoch": 0.8675373134328358,
657
- "grad_norm": 0.20610143681398768,
658
- "learning_rate": 2.850417619630219e-06,
659
- "loss": 0.5659,
660
  "step": 465
661
  },
662
  {
663
- "epoch": 0.8768656716417911,
664
- "grad_norm": 0.19801134463801753,
665
- "learning_rate": 2.73646253190068e-06,
666
- "loss": 0.5451,
667
  "step": 470
668
  },
669
  {
670
- "epoch": 0.8861940298507462,
671
- "grad_norm": 0.2005832233779882,
672
- "learning_rate": 2.6303767372900275e-06,
673
- "loss": 0.5493,
674
  "step": 475
675
  },
676
  {
677
- "epoch": 0.8955223880597015,
678
- "grad_norm": 0.22302976023377888,
679
- "learning_rate": 2.5322612603554467e-06,
680
- "loss": 0.554,
681
  "step": 480
682
  },
683
  {
684
- "epoch": 0.9048507462686567,
685
- "grad_norm": 0.22541364558555915,
686
- "learning_rate": 2.442209535591609e-06,
687
- "loss": 0.5623,
688
  "step": 485
689
  },
690
  {
691
- "epoch": 0.914179104477612,
692
- "grad_norm": 0.20000271941339307,
693
- "learning_rate": 2.3603073184538205e-06,
694
- "loss": 0.5489,
695
  "step": 490
696
  },
697
  {
698
- "epoch": 0.9235074626865671,
699
- "grad_norm": 0.20258730610423065,
700
- "learning_rate": 2.286632603693873e-06,
701
- "loss": 0.5514,
702
  "step": 495
703
  },
704
  {
705
- "epoch": 0.9328358208955224,
706
- "grad_norm": 0.19560816719935362,
707
- "learning_rate": 2.2212555510863334e-06,
708
- "loss": 0.5528,
709
  "step": 500
710
  },
711
  {
712
- "epoch": 0.9421641791044776,
713
- "grad_norm": 0.1910933154854795,
714
- "learning_rate": 2.16423841861602e-06,
715
- "loss": 0.5478,
716
  "step": 505
717
  },
718
  {
719
- "epoch": 0.9514925373134329,
720
- "grad_norm": 0.19921777014416955,
721
- "learning_rate": 2.115635503190272e-06,
722
- "loss": 0.5484,
723
  "step": 510
724
  },
725
  {
726
- "epoch": 0.960820895522388,
727
- "grad_norm": 0.2038655433791115,
728
- "learning_rate": 2.075493088932492e-06,
729
- "loss": 0.5496,
730
  "step": 515
731
  },
732
  {
733
- "epoch": 0.9701492537313433,
734
- "grad_norm": 0.1926153617962426,
735
- "learning_rate": 2.0438494031061866e-06,
736
- "loss": 0.5542,
737
  "step": 520
738
  },
739
  {
740
- "epoch": 0.9794776119402985,
741
- "grad_norm": 0.20095357931700691,
742
- "learning_rate": 2.0207345797114862e-06,
743
- "loss": 0.5562,
744
  "step": 525
745
  },
746
  {
747
- "epoch": 0.9888059701492538,
748
- "grad_norm": 0.19961768497451118,
749
- "learning_rate": 2.006170630788801e-06,
750
- "loss": 0.5551,
751
  "step": 530
752
  },
753
  {
754
- "epoch": 0.9981343283582089,
755
- "grad_norm": 0.2003009243538159,
756
- "learning_rate": 2.000171425456953e-06,
757
- "loss": 0.5553,
758
  "step": 535
759
  },
760
  {
761
- "epoch": 1.0,
762
- "step": 536,
763
- "total_flos": 488621249396736.0,
764
- "train_loss": 0.5826025370564034,
765
- "train_runtime": 12677.8495,
766
- "train_samples_per_second": 2.706,
767
- "train_steps_per_second": 0.042
768
  }
769
  ],
770
  "logging_steps": 5,
771
- "max_steps": 536,
772
  "num_input_tokens_seen": 0,
773
- "num_train_epochs": 1,
774
  "save_steps": 100,
775
  "stateful_callbacks": {
776
  "TrainerControl": {
@@ -784,7 +3108,7 @@
784
  "attributes": {}
785
  }
786
  },
787
- "total_flos": 488621249396736.0,
788
  "train_batch_size": 16,
789
  "trial_name": null,
790
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 2199,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0068212824010914054,
13
+ "grad_norm": 18.465126450848995,
14
+ "learning_rate": 9.090909090909091e-07,
15
+ "loss": 1.2862,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.013642564802182811,
20
+ "grad_norm": 9.620521609167104,
21
+ "learning_rate": 1.8181818181818183e-06,
22
+ "loss": 1.2471,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.020463847203274217,
27
+ "grad_norm": 5.6700954908479115,
28
+ "learning_rate": 2.7272727272727272e-06,
29
+ "loss": 1.1088,
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.027285129604365622,
34
+ "grad_norm": 4.396294181846934,
35
+ "learning_rate": 3.6363636363636366e-06,
36
+ "loss": 0.9952,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.034106412005457026,
41
+ "grad_norm": 1.9531017991370636,
42
+ "learning_rate": 4.5454545454545455e-06,
43
+ "loss": 0.8603,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.040927694406548434,
48
+ "grad_norm": 1.0277693546437923,
49
+ "learning_rate": 5.4545454545454545e-06,
50
+ "loss": 0.7665,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.047748976807639835,
55
+ "grad_norm": 0.6910089460223975,
56
+ "learning_rate": 6.363636363636364e-06,
57
+ "loss": 0.7234,
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.054570259208731244,
62
+ "grad_norm": 0.6835108712614276,
63
+ "learning_rate": 7.272727272727273e-06,
64
+ "loss": 0.6918,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.061391541609822645,
69
+ "grad_norm": 0.5962965736622774,
70
+ "learning_rate": 8.181818181818183e-06,
71
+ "loss": 0.6764,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.06821282401091405,
76
+ "grad_norm": 0.6256141251393179,
77
+ "learning_rate": 9.090909090909091e-06,
78
+ "loss": 0.6703,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.07503410641200546,
83
+ "grad_norm": 0.6342042835021994,
84
+ "learning_rate": 1e-05,
85
+ "loss": 0.6716,
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.08185538881309687,
90
+ "grad_norm": 0.5891899162044236,
91
+ "learning_rate": 1.0909090909090909e-05,
92
+ "loss": 0.6433,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.08867667121418826,
97
+ "grad_norm": 0.5578840971439548,
98
+ "learning_rate": 1.181818181818182e-05,
99
+ "loss": 0.6437,
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.09549795361527967,
104
+ "grad_norm": 0.5704118464239262,
105
+ "learning_rate": 1.2727272727272728e-05,
106
+ "loss": 0.634,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.10231923601637108,
111
+ "grad_norm": 0.5450983610009776,
112
+ "learning_rate": 1.3636363636363637e-05,
113
+ "loss": 0.6358,
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 0.10914051841746249,
118
+ "grad_norm": 0.5970474778155904,
119
+ "learning_rate": 1.4545454545454546e-05,
120
+ "loss": 0.6153,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.11596180081855388,
125
+ "grad_norm": 0.6082777501619802,
126
+ "learning_rate": 1.5454545454545454e-05,
127
+ "loss": 0.6005,
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 0.12278308321964529,
132
+ "grad_norm": 0.6267538603338134,
133
+ "learning_rate": 1.6363636363636366e-05,
134
+ "loss": 0.618,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.1296043656207367,
139
+ "grad_norm": 0.6468344635295971,
140
+ "learning_rate": 1.7272727272727274e-05,
141
+ "loss": 0.6142,
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 0.1364256480218281,
146
+ "grad_norm": 0.6177323893910839,
147
+ "learning_rate": 1.8181818181818182e-05,
148
+ "loss": 0.6097,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.1432469304229195,
153
+ "grad_norm": 0.5904826597809681,
154
+ "learning_rate": 1.9090909090909094e-05,
155
+ "loss": 0.6081,
156
  "step": 105
157
  },
158
  {
159
+ "epoch": 0.15006821282401092,
160
+ "grad_norm": 0.6497802879480495,
161
+ "learning_rate": 2e-05,
162
+ "loss": 0.6067,
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.15688949522510232,
167
+ "grad_norm": 0.6230122222887485,
168
+ "learning_rate": 1.9999720123762578e-05,
169
+ "loss": 0.6061,
170
  "step": 115
171
  },
172
  {
173
+ "epoch": 0.16371077762619374,
174
+ "grad_norm": 0.7293635666196662,
175
+ "learning_rate": 1.9998880510874693e-05,
176
+ "loss": 0.5971,
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.17053206002728513,
181
+ "grad_norm": 0.6952053234528929,
182
+ "learning_rate": 1.9997481208808608e-05,
183
+ "loss": 0.5915,
184
  "step": 125
185
  },
186
  {
187
+ "epoch": 0.17735334242837653,
188
+ "grad_norm": 0.6281217832903768,
189
+ "learning_rate": 1.9995522296681776e-05,
190
+ "loss": 0.6005,
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.18417462482946795,
195
+ "grad_norm": 0.6023348866340367,
196
+ "learning_rate": 1.999300388525237e-05,
197
+ "loss": 0.5965,
198
  "step": 135
199
  },
200
  {
201
+ "epoch": 0.19099590723055934,
202
+ "grad_norm": 0.6399678150664048,
203
+ "learning_rate": 1.998992611691302e-05,
204
+ "loss": 0.6061,
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.19781718963165076,
209
+ "grad_norm": 0.6509643588209947,
210
+ "learning_rate": 1.9986289165682767e-05,
211
+ "loss": 0.591,
212
  "step": 145
213
  },
214
  {
215
+ "epoch": 0.20463847203274216,
216
+ "grad_norm": 0.5672164251292882,
217
+ "learning_rate": 1.9982093237197198e-05,
218
+ "loss": 0.5953,
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.21145975443383355,
223
+ "grad_norm": 0.61442736571869,
224
+ "learning_rate": 1.9977338568696867e-05,
225
+ "loss": 0.583,
226
  "step": 155
227
  },
228
  {
229
+ "epoch": 0.21828103683492497,
230
+ "grad_norm": 0.6163516135212436,
231
+ "learning_rate": 1.9972025429013836e-05,
232
+ "loss": 0.5996,
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.22510231923601637,
237
+ "grad_norm": 0.7074468528850713,
238
+ "learning_rate": 1.9966154118556494e-05,
239
+ "loss": 0.587,
240
  "step": 165
241
  },
242
  {
243
+ "epoch": 0.23192360163710776,
244
+ "grad_norm": 0.6437004756190762,
245
+ "learning_rate": 1.995972496929258e-05,
246
+ "loss": 0.5816,
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.23874488403819918,
251
+ "grad_norm": 0.6166442107748029,
252
+ "learning_rate": 1.9952738344730395e-05,
253
+ "loss": 0.5945,
254
  "step": 175
255
  },
256
  {
257
+ "epoch": 0.24556616643929058,
258
+ "grad_norm": 0.6226394993981523,
259
+ "learning_rate": 1.994519463989826e-05,
260
+ "loss": 0.5861,
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.252387448840382,
265
+ "grad_norm": 0.6352005030903742,
266
+ "learning_rate": 1.9937094281322186e-05,
267
+ "loss": 0.5852,
268
  "step": 185
269
  },
270
  {
271
+ "epoch": 0.2592087312414734,
272
+ "grad_norm": 0.6052650272558217,
273
+ "learning_rate": 1.9928437727001743e-05,
274
+ "loss": 0.5918,
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.2660300136425648,
279
+ "grad_norm": 0.6460373146884708,
280
+ "learning_rate": 1.9919225466384163e-05,
281
+ "loss": 0.582,
282
  "step": 195
283
  },
284
  {
285
+ "epoch": 0.2728512960436562,
286
+ "grad_norm": 0.5731142179764637,
287
+ "learning_rate": 1.9909458020336697e-05,
288
+ "loss": 0.5837,
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.27967257844474763,
293
+ "grad_norm": 0.5560540050724747,
294
+ "learning_rate": 1.9899135941117135e-05,
295
+ "loss": 0.587,
296
  "step": 205
297
  },
298
  {
299
+ "epoch": 0.286493860845839,
300
+ "grad_norm": 0.5680417036845223,
301
+ "learning_rate": 1.988825981234258e-05,
302
+ "loss": 0.5741,
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.2933151432469304,
307
+ "grad_norm": 0.6156811375088975,
308
+ "learning_rate": 1.987683024895647e-05,
309
+ "loss": 0.5842,
310
  "step": 215
311
  },
312
  {
313
+ "epoch": 0.30013642564802184,
314
+ "grad_norm": 0.6561647146790152,
315
+ "learning_rate": 1.986484789719379e-05,
316
+ "loss": 0.5764,
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.3069577080491132,
321
+ "grad_norm": 0.6196711771702965,
322
+ "learning_rate": 1.985231343454455e-05,
323
+ "loss": 0.5767,
324
  "step": 225
325
  },
326
  {
327
+ "epoch": 0.31377899045020463,
328
+ "grad_norm": 0.6223261390308151,
329
+ "learning_rate": 1.9839227569715463e-05,
330
+ "loss": 0.5703,
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.32060027285129605,
335
+ "grad_norm": 0.6148219036902507,
336
+ "learning_rate": 1.9825591042589888e-05,
337
+ "loss": 0.5644,
338
  "step": 235
339
  },
340
  {
341
+ "epoch": 0.3274215552523875,
342
+ "grad_norm": 0.6109107619627493,
343
+ "learning_rate": 1.9811404624185984e-05,
344
+ "loss": 0.5731,
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.33424283765347884,
349
+ "grad_norm": 0.597948954920139,
350
+ "learning_rate": 1.9796669116613128e-05,
351
+ "loss": 0.583,
352
  "step": 245
353
  },
354
  {
355
+ "epoch": 0.34106412005457026,
356
+ "grad_norm": 0.5869886008100728,
357
+ "learning_rate": 1.9781385353026557e-05,
358
+ "loss": 0.5722,
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.3478854024556617,
363
+ "grad_norm": 0.6206830241642184,
364
+ "learning_rate": 1.9765554197580257e-05,
365
+ "loss": 0.5663,
366
  "step": 255
367
  },
368
  {
369
+ "epoch": 0.35470668485675305,
370
+ "grad_norm": 0.5964076326071935,
371
+ "learning_rate": 1.974917654537811e-05,
372
+ "loss": 0.5535,
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.3615279672578445,
377
+ "grad_norm": 0.5837129031281011,
378
+ "learning_rate": 1.9732253322423283e-05,
379
+ "loss": 0.5714,
380
  "step": 265
381
  },
382
  {
383
+ "epoch": 0.3683492496589359,
384
+ "grad_norm": 0.5668697682461173,
385
+ "learning_rate": 1.9714785485565878e-05,
386
+ "loss": 0.5663,
387
  "step": 270
388
  },
389
  {
390
+ "epoch": 0.37517053206002726,
391
+ "grad_norm": 0.6431513321796068,
392
+ "learning_rate": 1.9696774022448806e-05,
393
+ "loss": 0.571,
394
  "step": 275
395
  },
396
  {
397
+ "epoch": 0.3819918144611187,
398
+ "grad_norm": 0.6550933809459418,
399
+ "learning_rate": 1.9678219951451977e-05,
400
+ "loss": 0.5701,
401
  "step": 280
402
  },
403
  {
404
+ "epoch": 0.3888130968622101,
405
+ "grad_norm": 0.5874407279816264,
406
+ "learning_rate": 1.9659124321634692e-05,
407
+ "loss": 0.5745,
408
  "step": 285
409
  },
410
  {
411
+ "epoch": 0.3956343792633015,
412
+ "grad_norm": 0.5768705911971546,
413
+ "learning_rate": 1.963948821267636e-05,
414
+ "loss": 0.571,
415
  "step": 290
416
  },
417
  {
418
+ "epoch": 0.4024556616643929,
419
+ "grad_norm": 0.6747901801550915,
420
+ "learning_rate": 1.961931273481542e-05,
421
+ "loss": 0.5655,
422
  "step": 295
423
  },
424
  {
425
+ "epoch": 0.4092769440654843,
426
+ "grad_norm": 0.6456186229768464,
427
+ "learning_rate": 1.9598599028786584e-05,
428
+ "loss": 0.558,
429
  "step": 300
430
  },
431
  {
432
+ "epoch": 0.41609822646657574,
433
+ "grad_norm": 0.6161384905122704,
434
+ "learning_rate": 1.9577348265756338e-05,
435
+ "loss": 0.5705,
436
  "step": 305
437
  },
438
  {
439
+ "epoch": 0.4229195088676671,
440
+ "grad_norm": 0.6359310743099941,
441
+ "learning_rate": 1.9555561647256723e-05,
442
+ "loss": 0.5762,
443
  "step": 310
444
  },
445
  {
446
+ "epoch": 0.4297407912687585,
447
+ "grad_norm": 0.7060625563181914,
448
+ "learning_rate": 1.95332404051174e-05,
449
+ "loss": 0.5736,
450
  "step": 315
451
  },
452
  {
453
+ "epoch": 0.43656207366984995,
454
+ "grad_norm": 0.5367387545489988,
455
+ "learning_rate": 1.9510385801396e-05,
456
+ "loss": 0.5581,
457
  "step": 320
458
  },
459
  {
460
+ "epoch": 0.4433833560709413,
461
+ "grad_norm": 0.5927035459206245,
462
+ "learning_rate": 1.9486999128306757e-05,
463
+ "loss": 0.5692,
464
  "step": 325
465
  },
466
  {
467
+ "epoch": 0.45020463847203274,
468
+ "grad_norm": 0.6008067119022888,
469
+ "learning_rate": 1.946308170814747e-05,
470
+ "loss": 0.5617,
471
  "step": 330
472
  },
473
  {
474
+ "epoch": 0.45702592087312416,
475
+ "grad_norm": 0.6171323825640702,
476
+ "learning_rate": 1.9438634893224714e-05,
477
+ "loss": 0.5726,
478
  "step": 335
479
  },
480
  {
481
+ "epoch": 0.4638472032742155,
482
+ "grad_norm": 0.6348087429739124,
483
+ "learning_rate": 1.9413660065777398e-05,
484
+ "loss": 0.5698,
485
  "step": 340
486
  },
487
  {
488
+ "epoch": 0.47066848567530695,
489
+ "grad_norm": 0.6311336601700943,
490
+ "learning_rate": 1.9388158637898592e-05,
491
+ "loss": 0.5673,
492
  "step": 345
493
  },
494
  {
495
+ "epoch": 0.47748976807639837,
496
+ "grad_norm": 0.5754972961666545,
497
+ "learning_rate": 1.9362132051455713e-05,
498
+ "loss": 0.5693,
499
  "step": 350
500
  },
501
  {
502
+ "epoch": 0.4843110504774898,
503
+ "grad_norm": 0.6112403337157682,
504
+ "learning_rate": 1.933558177800898e-05,
505
+ "loss": 0.5726,
506
  "step": 355
507
  },
508
  {
509
+ "epoch": 0.49113233287858116,
510
+ "grad_norm": 0.608579178603472,
511
+ "learning_rate": 1.9308509318728216e-05,
512
+ "loss": 0.5587,
513
  "step": 360
514
  },
515
  {
516
+ "epoch": 0.4979536152796726,
517
+ "grad_norm": 0.5585318192657098,
518
+ "learning_rate": 1.9280916204307976e-05,
519
+ "loss": 0.5599,
520
  "step": 365
521
  },
522
  {
523
+ "epoch": 0.504774897680764,
524
+ "grad_norm": 0.6397266266283088,
525
+ "learning_rate": 1.9252803994880994e-05,
526
+ "loss": 0.5549,
527
  "step": 370
528
  },
529
  {
530
+ "epoch": 0.5115961800818554,
531
+ "grad_norm": 0.5758670924617978,
532
+ "learning_rate": 1.9224174279929983e-05,
533
+ "loss": 0.5631,
534
  "step": 375
535
  },
536
  {
537
+ "epoch": 0.5184174624829468,
538
+ "grad_norm": 0.6118586009405445,
539
+ "learning_rate": 1.9195028678197748e-05,
540
+ "loss": 0.5538,
541
  "step": 380
542
  },
543
  {
544
+ "epoch": 0.5252387448840382,
545
+ "grad_norm": 0.6333491320668606,
546
+ "learning_rate": 1.916536883759567e-05,
547
+ "loss": 0.5589,
548
  "step": 385
549
  },
550
  {
551
+ "epoch": 0.5320600272851296,
552
+ "grad_norm": 0.663075678783174,
553
+ "learning_rate": 1.9135196435110534e-05,
554
+ "loss": 0.5679,
555
  "step": 390
556
  },
557
  {
558
+ "epoch": 0.538881309686221,
559
+ "grad_norm": 0.6476208401862266,
560
+ "learning_rate": 1.9104513176709713e-05,
561
+ "loss": 0.5553,
562
  "step": 395
563
  },
564
  {
565
+ "epoch": 0.5457025920873124,
566
+ "grad_norm": 0.5634361444996904,
567
+ "learning_rate": 1.9073320797244707e-05,
568
+ "loss": 0.5642,
569
  "step": 400
570
  },
571
  {
572
+ "epoch": 0.5525238744884038,
573
+ "grad_norm": 0.5943511261893936,
574
+ "learning_rate": 1.9041621060353047e-05,
575
+ "loss": 0.5638,
576
  "step": 405
577
  },
578
  {
579
+ "epoch": 0.5593451568894953,
580
+ "grad_norm": 0.5692219403482403,
581
+ "learning_rate": 1.9009415758358588e-05,
582
+ "loss": 0.5582,
583
  "step": 410
584
  },
585
  {
586
+ "epoch": 0.5661664392905866,
587
+ "grad_norm": 0.5878247286961229,
588
+ "learning_rate": 1.8976706712170178e-05,
589
+ "loss": 0.5492,
590
  "step": 415
591
  },
592
  {
593
+ "epoch": 0.572987721691678,
594
+ "grad_norm": 0.594118308461736,
595
+ "learning_rate": 1.8943495771178665e-05,
596
+ "loss": 0.5529,
597
  "step": 420
598
  },
599
  {
600
+ "epoch": 0.5798090040927695,
601
+ "grad_norm": 0.5706855416163217,
602
+ "learning_rate": 1.8909784813152388e-05,
603
+ "loss": 0.5582,
604
  "step": 425
605
  },
606
  {
607
+ "epoch": 0.5866302864938608,
608
+ "grad_norm": 0.5988482524277083,
609
+ "learning_rate": 1.8875575744130957e-05,
610
+ "loss": 0.5453,
611
  "step": 430
612
  },
613
  {
614
+ "epoch": 0.5934515688949522,
615
+ "grad_norm": 0.5522453262711926,
616
+ "learning_rate": 1.8840870498317505e-05,
617
+ "loss": 0.5533,
618
  "step": 435
619
  },
620
  {
621
+ "epoch": 0.6002728512960437,
622
+ "grad_norm": 0.5654931208823876,
623
+ "learning_rate": 1.8805671037969332e-05,
624
+ "loss": 0.5453,
625
  "step": 440
626
  },
627
  {
628
+ "epoch": 0.607094133697135,
629
+ "grad_norm": 0.6018236788722027,
630
+ "learning_rate": 1.8769979353286947e-05,
631
+ "loss": 0.5479,
632
  "step": 445
633
  },
634
  {
635
+ "epoch": 0.6139154160982264,
636
+ "grad_norm": 0.6010303061293923,
637
+ "learning_rate": 1.873379746230154e-05,
638
+ "loss": 0.5573,
639
  "step": 450
640
  },
641
  {
642
+ "epoch": 0.6207366984993179,
643
+ "grad_norm": 0.5864635637945892,
644
+ "learning_rate": 1.86971274107609e-05,
645
+ "loss": 0.559,
646
  "step": 455
647
  },
648
  {
649
+ "epoch": 0.6275579809004093,
650
+ "grad_norm": 0.5108412439167901,
651
+ "learning_rate": 1.8659971272013716e-05,
652
+ "loss": 0.5482,
653
  "step": 460
654
  },
655
  {
656
+ "epoch": 0.6343792633015006,
657
+ "grad_norm": 0.5696450573133378,
658
+ "learning_rate": 1.8622331146892377e-05,
659
+ "loss": 0.543,
660
  "step": 465
661
  },
662
  {
663
+ "epoch": 0.6412005457025921,
664
+ "grad_norm": 0.5930551057489514,
665
+ "learning_rate": 1.8584209163594177e-05,
666
+ "loss": 0.5608,
667
  "step": 470
668
  },
669
  {
670
+ "epoch": 0.6480218281036835,
671
+ "grad_norm": 0.5425007208596792,
672
+ "learning_rate": 1.854560747756098e-05,
673
+ "loss": 0.5484,
674
  "step": 475
675
  },
676
  {
677
+ "epoch": 0.654843110504775,
678
+ "grad_norm": 0.5535649664874269,
679
+ "learning_rate": 1.8506528271357365e-05,
680
+ "loss": 0.5407,
681
  "step": 480
682
  },
683
  {
684
+ "epoch": 0.6616643929058663,
685
+ "grad_norm": 0.5678822910354981,
686
+ "learning_rate": 1.846697375454721e-05,
687
+ "loss": 0.5534,
688
  "step": 485
689
  },
690
  {
691
+ "epoch": 0.6684856753069577,
692
+ "grad_norm": 0.5531176845799359,
693
+ "learning_rate": 1.8426946163568764e-05,
694
+ "loss": 0.5543,
695
  "step": 490
696
  },
697
  {
698
+ "epoch": 0.6753069577080492,
699
+ "grad_norm": 0.545737056928321,
700
+ "learning_rate": 1.8386447761608197e-05,
701
+ "loss": 0.5452,
702
  "step": 495
703
  },
704
  {
705
+ "epoch": 0.6821282401091405,
706
+ "grad_norm": 0.5548974682360532,
707
+ "learning_rate": 1.834548083847164e-05,
708
+ "loss": 0.5451,
709
  "step": 500
710
  },
711
  {
712
+ "epoch": 0.6889495225102319,
713
+ "grad_norm": 0.6041267515319638,
714
+ "learning_rate": 1.830404771045573e-05,
715
+ "loss": 0.5516,
716
  "step": 505
717
  },
718
  {
719
+ "epoch": 0.6957708049113234,
720
+ "grad_norm": 0.5535683940400501,
721
+ "learning_rate": 1.826215072021662e-05,
722
+ "loss": 0.5492,
723
  "step": 510
724
  },
725
  {
726
+ "epoch": 0.7025920873124147,
727
+ "grad_norm": 0.5608852748171831,
728
+ "learning_rate": 1.821979223663754e-05,
729
+ "loss": 0.5488,
730
  "step": 515
731
  },
732
  {
733
+ "epoch": 0.7094133697135061,
734
+ "grad_norm": 0.6408084203178623,
735
+ "learning_rate": 1.817697465469485e-05,
736
+ "loss": 0.5602,
737
  "step": 520
738
  },
739
  {
740
+ "epoch": 0.7162346521145976,
741
+ "grad_norm": 0.5932168436922046,
742
+ "learning_rate": 1.813370039532264e-05,
743
+ "loss": 0.5493,
744
  "step": 525
745
  },
746
  {
747
+ "epoch": 0.723055934515689,
748
+ "grad_norm": 0.5842574049113836,
749
+ "learning_rate": 1.808997190527584e-05,
750
+ "loss": 0.5476,
751
  "step": 530
752
  },
753
  {
754
+ "epoch": 0.7298772169167803,
755
+ "grad_norm": 0.5437121097077668,
756
+ "learning_rate": 1.804579165699187e-05,
757
+ "loss": 0.5467,
758
  "step": 535
759
  },
760
  {
761
+ "epoch": 0.7366984993178718,
762
+ "grad_norm": 0.6303163851086648,
763
+ "learning_rate": 1.800116214845087e-05,
764
+ "loss": 0.5461,
765
+ "step": 540
766
+ },
767
+ {
768
+ "epoch": 0.7435197817189632,
769
+ "grad_norm": 0.5335832562666002,
770
+ "learning_rate": 1.795608590303444e-05,
771
+ "loss": 0.5558,
772
+ "step": 545
773
+ },
774
+ {
775
+ "epoch": 0.7503410641200545,
776
+ "grad_norm": 0.6721936398991404,
777
+ "learning_rate": 1.7910565469382974e-05,
778
+ "loss": 0.5483,
779
+ "step": 550
780
+ },
781
+ {
782
+ "epoch": 0.757162346521146,
783
+ "grad_norm": 0.5687080013947179,
784
+ "learning_rate": 1.786460342125157e-05,
785
+ "loss": 0.5335,
786
+ "step": 555
787
+ },
788
+ {
789
+ "epoch": 0.7639836289222374,
790
+ "grad_norm": 0.7069597062861273,
791
+ "learning_rate": 1.7818202357364478e-05,
792
+ "loss": 0.5423,
793
+ "step": 560
794
+ },
795
+ {
796
+ "epoch": 0.7708049113233287,
797
+ "grad_norm": 0.5824735587256229,
798
+ "learning_rate": 1.7771364901268205e-05,
799
+ "loss": 0.5337,
800
+ "step": 565
801
+ },
802
+ {
803
+ "epoch": 0.7776261937244202,
804
+ "grad_norm": 0.6959844177948984,
805
+ "learning_rate": 1.772409370118315e-05,
806
+ "loss": 0.5444,
807
+ "step": 570
808
+ },
809
+ {
810
+ "epoch": 0.7844474761255116,
811
+ "grad_norm": 0.6873130749235091,
812
+ "learning_rate": 1.767639142985388e-05,
813
+ "loss": 0.5429,
814
+ "step": 575
815
+ },
816
+ {
817
+ "epoch": 0.791268758526603,
818
+ "grad_norm": 0.5767437929893657,
819
+ "learning_rate": 1.762826078439802e-05,
820
+ "loss": 0.5416,
821
+ "step": 580
822
+ },
823
+ {
824
+ "epoch": 0.7980900409276944,
825
+ "grad_norm": 0.5441214283588395,
826
+ "learning_rate": 1.7579704486153738e-05,
827
+ "loss": 0.5485,
828
+ "step": 585
829
+ },
830
+ {
831
+ "epoch": 0.8049113233287858,
832
+ "grad_norm": 0.5993079641441991,
833
+ "learning_rate": 1.75307252805259e-05,
834
+ "loss": 0.5534,
835
+ "step": 590
836
+ },
837
+ {
838
+ "epoch": 0.8117326057298773,
839
+ "grad_norm": 0.5308939722920609,
840
+ "learning_rate": 1.7481325936830816e-05,
841
+ "loss": 0.5433,
842
+ "step": 595
843
+ },
844
+ {
845
+ "epoch": 0.8185538881309686,
846
+ "grad_norm": 0.5805766641945108,
847
+ "learning_rate": 1.7431509248139693e-05,
848
+ "loss": 0.5334,
849
+ "step": 600
850
+ },
851
+ {
852
+ "epoch": 0.82537517053206,
853
+ "grad_norm": 0.5536344487365091,
854
+ "learning_rate": 1.738127803112069e-05,
855
+ "loss": 0.5432,
856
+ "step": 605
857
+ },
858
+ {
859
+ "epoch": 0.8321964529331515,
860
+ "grad_norm": 0.6323856480342268,
861
+ "learning_rate": 1.733063512587967e-05,
862
+ "loss": 0.5442,
863
+ "step": 610
864
+ },
865
+ {
866
+ "epoch": 0.8390177353342428,
867
+ "grad_norm": 0.6559245786686516,
868
+ "learning_rate": 1.7279583395799617e-05,
869
+ "loss": 0.5402,
870
+ "step": 615
871
+ },
872
+ {
873
+ "epoch": 0.8458390177353342,
874
+ "grad_norm": 0.5851284475852746,
875
+ "learning_rate": 1.7228125727378736e-05,
876
+ "loss": 0.5393,
877
+ "step": 620
878
+ },
879
+ {
880
+ "epoch": 0.8526603001364257,
881
+ "grad_norm": 0.5597777332221687,
882
+ "learning_rate": 1.7176265030067262e-05,
883
+ "loss": 0.5375,
884
+ "step": 625
885
+ },
886
+ {
887
+ "epoch": 0.859481582537517,
888
+ "grad_norm": 0.5895269122186833,
889
+ "learning_rate": 1.7124004236102928e-05,
890
+ "loss": 0.5472,
891
+ "step": 630
892
+ },
893
+ {
894
+ "epoch": 0.8663028649386084,
895
+ "grad_norm": 0.5796038353372596,
896
+ "learning_rate": 1.7071346300345214e-05,
897
+ "loss": 0.5419,
898
+ "step": 635
899
+ },
900
+ {
901
+ "epoch": 0.8731241473396999,
902
+ "grad_norm": 0.5660780180688785,
903
+ "learning_rate": 1.7018294200108244e-05,
904
+ "loss": 0.535,
905
+ "step": 640
906
+ },
907
+ {
908
+ "epoch": 0.8799454297407913,
909
+ "grad_norm": 0.5364650044963768,
910
+ "learning_rate": 1.696485093499246e-05,
911
+ "loss": 0.5339,
912
+ "step": 645
913
+ },
914
+ {
915
+ "epoch": 0.8867667121418826,
916
+ "grad_norm": 0.5342927826503541,
917
+ "learning_rate": 1.6911019526715034e-05,
918
+ "loss": 0.5345,
919
+ "step": 650
920
+ },
921
+ {
922
+ "epoch": 0.8935879945429741,
923
+ "grad_norm": 0.5430545153952824,
924
+ "learning_rate": 1.6856803018938985e-05,
925
+ "loss": 0.5359,
926
+ "step": 655
927
+ },
928
+ {
929
+ "epoch": 0.9004092769440655,
930
+ "grad_norm": 0.557430477072381,
931
+ "learning_rate": 1.680220447710113e-05,
932
+ "loss": 0.5419,
933
+ "step": 660
934
+ },
935
+ {
936
+ "epoch": 0.9072305593451568,
937
+ "grad_norm": 0.542713559965484,
938
+ "learning_rate": 1.6747226988238726e-05,
939
+ "loss": 0.525,
940
+ "step": 665
941
+ },
942
+ {
943
+ "epoch": 0.9140518417462483,
944
+ "grad_norm": 0.5853720489252487,
945
+ "learning_rate": 1.6691873660814957e-05,
946
+ "loss": 0.5419,
947
+ "step": 670
948
+ },
949
+ {
950
+ "epoch": 0.9208731241473397,
951
+ "grad_norm": 0.5815003789876279,
952
+ "learning_rate": 1.663614762454316e-05,
953
+ "loss": 0.5334,
954
+ "step": 675
955
+ },
956
+ {
957
+ "epoch": 0.927694406548431,
958
+ "grad_norm": 0.5751964067857557,
959
+ "learning_rate": 1.658005203020986e-05,
960
+ "loss": 0.5424,
961
+ "step": 680
962
+ },
963
+ {
964
+ "epoch": 0.9345156889495225,
965
+ "grad_norm": 0.5788952727225173,
966
+ "learning_rate": 1.652359004949666e-05,
967
+ "loss": 0.5371,
968
+ "step": 685
969
+ },
970
+ {
971
+ "epoch": 0.9413369713506139,
972
+ "grad_norm": 0.6331625206720841,
973
+ "learning_rate": 1.6466764874800874e-05,
974
+ "loss": 0.5378,
975
+ "step": 690
976
+ },
977
+ {
978
+ "epoch": 0.9481582537517054,
979
+ "grad_norm": 0.5668796713248969,
980
+ "learning_rate": 1.640957971905504e-05,
981
+ "loss": 0.5429,
982
+ "step": 695
983
+ },
984
+ {
985
+ "epoch": 0.9549795361527967,
986
+ "grad_norm": 0.5377074944991675,
987
+ "learning_rate": 1.635203781554527e-05,
988
+ "loss": 0.5424,
989
+ "step": 700
990
+ },
991
+ {
992
+ "epoch": 0.9618008185538881,
993
+ "grad_norm": 0.5484681746224047,
994
+ "learning_rate": 1.629414241772842e-05,
995
+ "loss": 0.5287,
996
+ "step": 705
997
+ },
998
+ {
999
+ "epoch": 0.9686221009549796,
1000
+ "grad_norm": 0.5991504387787833,
1001
+ "learning_rate": 1.6235896799048145e-05,
1002
+ "loss": 0.5345,
1003
+ "step": 710
1004
+ },
1005
+ {
1006
+ "epoch": 0.975443383356071,
1007
+ "grad_norm": 0.5578925025937715,
1008
+ "learning_rate": 1.6177304252749826e-05,
1009
+ "loss": 0.5422,
1010
+ "step": 715
1011
+ },
1012
+ {
1013
+ "epoch": 0.9822646657571623,
1014
+ "grad_norm": 0.585777983796673,
1015
+ "learning_rate": 1.611836809169435e-05,
1016
+ "loss": 0.5378,
1017
+ "step": 720
1018
+ },
1019
+ {
1020
+ "epoch": 0.9890859481582538,
1021
+ "grad_norm": 0.5557445054519161,
1022
+ "learning_rate": 1.6059091648170803e-05,
1023
+ "loss": 0.5371,
1024
+ "step": 725
1025
+ },
1026
+ {
1027
+ "epoch": 0.9959072305593452,
1028
+ "grad_norm": 0.5937106051672906,
1029
+ "learning_rate": 1.599947827370807e-05,
1030
+ "loss": 0.537,
1031
+ "step": 730
1032
+ },
1033
+ {
1034
+ "epoch": 1.0027285129604366,
1035
+ "grad_norm": 0.6017563200393091,
1036
+ "learning_rate": 1.593953133888534e-05,
1037
+ "loss": 0.5097,
1038
+ "step": 735
1039
+ },
1040
+ {
1041
+ "epoch": 1.009549795361528,
1042
+ "grad_norm": 0.5789953133346527,
1043
+ "learning_rate": 1.587925423314151e-05,
1044
+ "loss": 0.4832,
1045
+ "step": 740
1046
+ },
1047
+ {
1048
+ "epoch": 1.0163710777626194,
1049
+ "grad_norm": 0.5848419672480973,
1050
+ "learning_rate": 1.5818650364583558e-05,
1051
+ "loss": 0.4869,
1052
+ "step": 745
1053
+ },
1054
+ {
1055
+ "epoch": 1.0231923601637107,
1056
+ "grad_norm": 0.578364888371545,
1057
+ "learning_rate": 1.5757723159793856e-05,
1058
+ "loss": 0.4896,
1059
+ "step": 750
1060
+ },
1061
+ {
1062
+ "epoch": 1.030013642564802,
1063
+ "grad_norm": 0.594731084476994,
1064
+ "learning_rate": 1.569647606363641e-05,
1065
+ "loss": 0.4907,
1066
+ "step": 755
1067
+ },
1068
+ {
1069
+ "epoch": 1.0368349249658937,
1070
+ "grad_norm": 0.5564767523675749,
1071
+ "learning_rate": 1.5634912539062082e-05,
1072
+ "loss": 0.4876,
1073
+ "step": 760
1074
+ },
1075
+ {
1076
+ "epoch": 1.043656207366985,
1077
+ "grad_norm": 0.5327928111213208,
1078
+ "learning_rate": 1.5573036066912828e-05,
1079
+ "loss": 0.4837,
1080
+ "step": 765
1081
+ },
1082
+ {
1083
+ "epoch": 1.0504774897680764,
1084
+ "grad_norm": 0.592472989786851,
1085
+ "learning_rate": 1.551085014572485e-05,
1086
+ "loss": 0.4912,
1087
+ "step": 770
1088
+ },
1089
+ {
1090
+ "epoch": 1.0572987721691678,
1091
+ "grad_norm": 0.5227283978305189,
1092
+ "learning_rate": 1.54483582915308e-05,
1093
+ "loss": 0.4771,
1094
+ "step": 775
1095
+ },
1096
+ {
1097
+ "epoch": 1.0641200545702592,
1098
+ "grad_norm": 0.5379678465775848,
1099
+ "learning_rate": 1.538556403766099e-05,
1100
+ "loss": 0.4851,
1101
+ "step": 780
1102
+ },
1103
+ {
1104
+ "epoch": 1.0709413369713505,
1105
+ "grad_norm": 0.5474394476151215,
1106
+ "learning_rate": 1.53224709345436e-05,
1107
+ "loss": 0.4808,
1108
+ "step": 785
1109
+ },
1110
+ {
1111
+ "epoch": 1.077762619372442,
1112
+ "grad_norm": 0.5710971261744433,
1113
+ "learning_rate": 1.525908254950394e-05,
1114
+ "loss": 0.4937,
1115
+ "step": 790
1116
+ },
1117
+ {
1118
+ "epoch": 1.0845839017735335,
1119
+ "grad_norm": 0.5610883325946029,
1120
+ "learning_rate": 1.5195402466562763e-05,
1121
+ "loss": 0.4786,
1122
+ "step": 795
1123
+ },
1124
+ {
1125
+ "epoch": 1.0914051841746248,
1126
+ "grad_norm": 0.5421423204699313,
1127
+ "learning_rate": 1.5131434286233609e-05,
1128
+ "loss": 0.485,
1129
+ "step": 800
1130
+ },
1131
+ {
1132
+ "epoch": 1.0982264665757162,
1133
+ "grad_norm": 0.5609906221418015,
1134
+ "learning_rate": 1.5067181625319226e-05,
1135
+ "loss": 0.4816,
1136
+ "step": 805
1137
+ },
1138
+ {
1139
+ "epoch": 1.1050477489768076,
1140
+ "grad_norm": 0.5566971437091169,
1141
+ "learning_rate": 1.5002648116707088e-05,
1142
+ "loss": 0.4835,
1143
+ "step": 810
1144
+ },
1145
+ {
1146
+ "epoch": 1.111869031377899,
1147
+ "grad_norm": 0.5887214075043982,
1148
+ "learning_rate": 1.4937837409163974e-05,
1149
+ "loss": 0.4815,
1150
+ "step": 815
1151
+ },
1152
+ {
1153
+ "epoch": 1.1186903137789905,
1154
+ "grad_norm": 0.5271592242391223,
1155
+ "learning_rate": 1.4872753167129681e-05,
1156
+ "loss": 0.4877,
1157
+ "step": 820
1158
+ },
1159
+ {
1160
+ "epoch": 1.125511596180082,
1161
+ "grad_norm": 0.5316500397899709,
1162
+ "learning_rate": 1.480739907050982e-05,
1163
+ "loss": 0.4801,
1164
+ "step": 825
1165
+ },
1166
+ {
1167
+ "epoch": 1.1323328785811733,
1168
+ "grad_norm": 0.5377073263146588,
1169
+ "learning_rate": 1.4741778814467752e-05,
1170
+ "loss": 0.4701,
1171
+ "step": 830
1172
+ },
1173
+ {
1174
+ "epoch": 1.1391541609822646,
1175
+ "grad_norm": 0.5624072894347254,
1176
+ "learning_rate": 1.467589610921568e-05,
1177
+ "loss": 0.4788,
1178
+ "step": 835
1179
+ },
1180
+ {
1181
+ "epoch": 1.145975443383356,
1182
+ "grad_norm": 0.5906743104838152,
1183
+ "learning_rate": 1.460975467980484e-05,
1184
+ "loss": 0.4876,
1185
+ "step": 840
1186
+ },
1187
+ {
1188
+ "epoch": 1.1527967257844476,
1189
+ "grad_norm": 0.5718421588816008,
1190
+ "learning_rate": 1.4543358265914908e-05,
1191
+ "loss": 0.493,
1192
+ "step": 845
1193
+ },
1194
+ {
1195
+ "epoch": 1.159618008185539,
1196
+ "grad_norm": 0.5631554530445114,
1197
+ "learning_rate": 1.4476710621642558e-05,
1198
+ "loss": 0.4739,
1199
+ "step": 850
1200
+ },
1201
+ {
1202
+ "epoch": 1.1664392905866303,
1203
+ "grad_norm": 0.6070569685998446,
1204
+ "learning_rate": 1.440981551528918e-05,
1205
+ "loss": 0.4902,
1206
+ "step": 855
1207
+ },
1208
+ {
1209
+ "epoch": 1.1732605729877217,
1210
+ "grad_norm": 0.5658464706783969,
1211
+ "learning_rate": 1.4342676729147843e-05,
1212
+ "loss": 0.4908,
1213
+ "step": 860
1214
+ },
1215
+ {
1216
+ "epoch": 1.180081855388813,
1217
+ "grad_norm": 0.5416876036701612,
1218
+ "learning_rate": 1.4275298059289425e-05,
1219
+ "loss": 0.4831,
1220
+ "step": 865
1221
+ },
1222
+ {
1223
+ "epoch": 1.1869031377899044,
1224
+ "grad_norm": 0.560469117786521,
1225
+ "learning_rate": 1.4207683315347982e-05,
1226
+ "loss": 0.4844,
1227
+ "step": 870
1228
+ },
1229
+ {
1230
+ "epoch": 1.1937244201909958,
1231
+ "grad_norm": 0.5481177877539514,
1232
+ "learning_rate": 1.413983632030536e-05,
1233
+ "loss": 0.4795,
1234
+ "step": 875
1235
+ },
1236
+ {
1237
+ "epoch": 1.2005457025920874,
1238
+ "grad_norm": 0.5681558280441125,
1239
+ "learning_rate": 1.4071760910275038e-05,
1240
+ "loss": 0.4784,
1241
+ "step": 880
1242
+ },
1243
+ {
1244
+ "epoch": 1.2073669849931787,
1245
+ "grad_norm": 0.5318718671934336,
1246
+ "learning_rate": 1.4003460934285218e-05,
1247
+ "loss": 0.4787,
1248
+ "step": 885
1249
+ },
1250
+ {
1251
+ "epoch": 1.21418826739427,
1252
+ "grad_norm": 0.5718755695129025,
1253
+ "learning_rate": 1.393494025406121e-05,
1254
+ "loss": 0.495,
1255
+ "step": 890
1256
+ },
1257
+ {
1258
+ "epoch": 1.2210095497953615,
1259
+ "grad_norm": 0.5599038488798523,
1260
+ "learning_rate": 1.3866202743807093e-05,
1261
+ "loss": 0.4791,
1262
+ "step": 895
1263
+ },
1264
+ {
1265
+ "epoch": 1.2278308321964528,
1266
+ "grad_norm": 0.521420144525357,
1267
+ "learning_rate": 1.3797252289986652e-05,
1268
+ "loss": 0.4878,
1269
+ "step": 900
1270
+ },
1271
+ {
1272
+ "epoch": 1.2346521145975444,
1273
+ "grad_norm": 0.5502016562716098,
1274
+ "learning_rate": 1.3728092791103636e-05,
1275
+ "loss": 0.4747,
1276
+ "step": 905
1277
+ },
1278
+ {
1279
+ "epoch": 1.2414733969986358,
1280
+ "grad_norm": 0.5622573206127941,
1281
+ "learning_rate": 1.3658728157481348e-05,
1282
+ "loss": 0.4835,
1283
+ "step": 910
1284
+ },
1285
+ {
1286
+ "epoch": 1.2482946793997272,
1287
+ "grad_norm": 0.5263982972376154,
1288
+ "learning_rate": 1.3589162311041541e-05,
1289
+ "loss": 0.469,
1290
+ "step": 915
1291
+ },
1292
+ {
1293
+ "epoch": 1.2551159618008185,
1294
+ "grad_norm": 0.5500878868160901,
1295
+ "learning_rate": 1.3519399185082667e-05,
1296
+ "loss": 0.4739,
1297
+ "step": 920
1298
+ },
1299
+ {
1300
+ "epoch": 1.26193724420191,
1301
+ "grad_norm": 0.6026056698282768,
1302
+ "learning_rate": 1.3449442724057497e-05,
1303
+ "loss": 0.4924,
1304
+ "step": 925
1305
+ },
1306
+ {
1307
+ "epoch": 1.2687585266030013,
1308
+ "grad_norm": 0.5431124968837951,
1309
+ "learning_rate": 1.3379296883350088e-05,
1310
+ "loss": 0.4892,
1311
+ "step": 930
1312
+ },
1313
+ {
1314
+ "epoch": 1.2755798090040928,
1315
+ "grad_norm": 0.580656703476472,
1316
+ "learning_rate": 1.3308965629052152e-05,
1317
+ "loss": 0.4727,
1318
+ "step": 935
1319
+ },
1320
+ {
1321
+ "epoch": 1.2824010914051842,
1322
+ "grad_norm": 0.5565788524737827,
1323
+ "learning_rate": 1.3238452937738808e-05,
1324
+ "loss": 0.4801,
1325
+ "step": 940
1326
+ },
1327
+ {
1328
+ "epoch": 1.2892223738062756,
1329
+ "grad_norm": 0.555006125503962,
1330
+ "learning_rate": 1.316776279624374e-05,
1331
+ "loss": 0.4767,
1332
+ "step": 945
1333
+ },
1334
+ {
1335
+ "epoch": 1.296043656207367,
1336
+ "grad_norm": 0.5134647139179601,
1337
+ "learning_rate": 1.3096899201433773e-05,
1338
+ "loss": 0.4884,
1339
+ "step": 950
1340
+ },
1341
+ {
1342
+ "epoch": 1.3028649386084583,
1343
+ "grad_norm": 0.5348048148302267,
1344
+ "learning_rate": 1.3025866159982911e-05,
1345
+ "loss": 0.4861,
1346
+ "step": 955
1347
+ },
1348
+ {
1349
+ "epoch": 1.30968622100955,
1350
+ "grad_norm": 0.49693099647843164,
1351
+ "learning_rate": 1.2954667688145776e-05,
1352
+ "loss": 0.4786,
1353
+ "step": 960
1354
+ },
1355
+ {
1356
+ "epoch": 1.3165075034106413,
1357
+ "grad_norm": 0.5577481647212771,
1358
+ "learning_rate": 1.288330781153053e-05,
1359
+ "loss": 0.4868,
1360
+ "step": 965
1361
+ },
1362
+ {
1363
+ "epoch": 1.3233287858117326,
1364
+ "grad_norm": 0.5399989818626563,
1365
+ "learning_rate": 1.2811790564871267e-05,
1366
+ "loss": 0.4806,
1367
+ "step": 970
1368
+ },
1369
+ {
1370
+ "epoch": 1.330150068212824,
1371
+ "grad_norm": 0.5575148329842877,
1372
+ "learning_rate": 1.2740119991799886e-05,
1373
+ "loss": 0.476,
1374
+ "step": 975
1375
+ },
1376
+ {
1377
+ "epoch": 1.3369713506139154,
1378
+ "grad_norm": 0.5547901315683899,
1379
+ "learning_rate": 1.2668300144617462e-05,
1380
+ "loss": 0.4863,
1381
+ "step": 980
1382
+ },
1383
+ {
1384
+ "epoch": 1.3437926330150067,
1385
+ "grad_norm": 0.5949235354356381,
1386
+ "learning_rate": 1.2596335084065132e-05,
1387
+ "loss": 0.4853,
1388
+ "step": 985
1389
+ },
1390
+ {
1391
+ "epoch": 1.350613915416098,
1392
+ "grad_norm": 0.5964373026015896,
1393
+ "learning_rate": 1.2524228879094482e-05,
1394
+ "loss": 0.484,
1395
+ "step": 990
1396
+ },
1397
+ {
1398
+ "epoch": 1.3574351978171897,
1399
+ "grad_norm": 0.5370227190774329,
1400
+ "learning_rate": 1.2451985606637507e-05,
1401
+ "loss": 0.4746,
1402
+ "step": 995
1403
+ },
1404
+ {
1405
+ "epoch": 1.364256480218281,
1406
+ "grad_norm": 0.5369132356481505,
1407
+ "learning_rate": 1.2379609351376069e-05,
1408
+ "loss": 0.4836,
1409
+ "step": 1000
1410
+ },
1411
+ {
1412
+ "epoch": 1.3710777626193724,
1413
+ "grad_norm": 0.547761037213123,
1414
+ "learning_rate": 1.2307104205510983e-05,
1415
+ "loss": 0.49,
1416
+ "step": 1005
1417
+ },
1418
+ {
1419
+ "epoch": 1.3778990450204638,
1420
+ "grad_norm": 0.5883424816418631,
1421
+ "learning_rate": 1.2234474268530617e-05,
1422
+ "loss": 0.4846,
1423
+ "step": 1010
1424
+ },
1425
+ {
1426
+ "epoch": 1.3847203274215554,
1427
+ "grad_norm": 0.5246360871622825,
1428
+ "learning_rate": 1.2161723646979114e-05,
1429
+ "loss": 0.4841,
1430
+ "step": 1015
1431
+ },
1432
+ {
1433
+ "epoch": 1.3915416098226467,
1434
+ "grad_norm": 0.5749156554017738,
1435
+ "learning_rate": 1.2088856454224184e-05,
1436
+ "loss": 0.4814,
1437
+ "step": 1020
1438
+ },
1439
+ {
1440
+ "epoch": 1.398362892223738,
1441
+ "grad_norm": 0.6039253797362124,
1442
+ "learning_rate": 1.2015876810224576e-05,
1443
+ "loss": 0.4808,
1444
+ "step": 1025
1445
+ },
1446
+ {
1447
+ "epoch": 1.4051841746248295,
1448
+ "grad_norm": 0.5497121870941617,
1449
+ "learning_rate": 1.1942788841297081e-05,
1450
+ "loss": 0.4862,
1451
+ "step": 1030
1452
+ },
1453
+ {
1454
+ "epoch": 1.4120054570259208,
1455
+ "grad_norm": 0.5672274153305226,
1456
+ "learning_rate": 1.1869596679883273e-05,
1457
+ "loss": 0.4836,
1458
+ "step": 1035
1459
+ },
1460
+ {
1461
+ "epoch": 1.4188267394270122,
1462
+ "grad_norm": 0.5415049021394123,
1463
+ "learning_rate": 1.1796304464315827e-05,
1464
+ "loss": 0.4721,
1465
+ "step": 1040
1466
+ },
1467
+ {
1468
+ "epoch": 1.4256480218281036,
1469
+ "grad_norm": 0.5483284640349972,
1470
+ "learning_rate": 1.172291633858454e-05,
1471
+ "loss": 0.4783,
1472
+ "step": 1045
1473
+ },
1474
+ {
1475
+ "epoch": 1.4324693042291952,
1476
+ "grad_norm": 0.5395763465174175,
1477
+ "learning_rate": 1.164943645210204e-05,
1478
+ "loss": 0.4742,
1479
+ "step": 1050
1480
+ },
1481
+ {
1482
+ "epoch": 1.4392905866302865,
1483
+ "grad_norm": 0.5536500830425668,
1484
+ "learning_rate": 1.157586895946917e-05,
1485
+ "loss": 0.4777,
1486
+ "step": 1055
1487
+ },
1488
+ {
1489
+ "epoch": 1.446111869031378,
1490
+ "grad_norm": 0.5744294294074994,
1491
+ "learning_rate": 1.1502218020240075e-05,
1492
+ "loss": 0.4797,
1493
+ "step": 1060
1494
+ },
1495
+ {
1496
+ "epoch": 1.4529331514324693,
1497
+ "grad_norm": 0.5568183996723581,
1498
+ "learning_rate": 1.1428487798687034e-05,
1499
+ "loss": 0.4796,
1500
+ "step": 1065
1501
+ },
1502
+ {
1503
+ "epoch": 1.4597544338335606,
1504
+ "grad_norm": 0.566358557717989,
1505
+ "learning_rate": 1.1354682463564985e-05,
1506
+ "loss": 0.4811,
1507
+ "step": 1070
1508
+ },
1509
+ {
1510
+ "epoch": 1.4665757162346522,
1511
+ "grad_norm": 0.5779108434063646,
1512
+ "learning_rate": 1.1280806187875855e-05,
1513
+ "loss": 0.4779,
1514
+ "step": 1075
1515
+ },
1516
+ {
1517
+ "epoch": 1.4733969986357436,
1518
+ "grad_norm": 0.5743272903122454,
1519
+ "learning_rate": 1.1206863148632588e-05,
1520
+ "loss": 0.4714,
1521
+ "step": 1080
1522
+ },
1523
+ {
1524
+ "epoch": 1.480218281036835,
1525
+ "grad_norm": 0.5406195644541576,
1526
+ "learning_rate": 1.1132857526622978e-05,
1527
+ "loss": 0.4763,
1528
+ "step": 1085
1529
+ },
1530
+ {
1531
+ "epoch": 1.4870395634379263,
1532
+ "grad_norm": 0.5214541285752015,
1533
+ "learning_rate": 1.10587935061733e-05,
1534
+ "loss": 0.475,
1535
+ "step": 1090
1536
+ },
1537
+ {
1538
+ "epoch": 1.4938608458390177,
1539
+ "grad_norm": 0.5173465487655502,
1540
+ "learning_rate": 1.0984675274911707e-05,
1541
+ "loss": 0.4844,
1542
+ "step": 1095
1543
+ },
1544
+ {
1545
+ "epoch": 1.500682128240109,
1546
+ "grad_norm": 0.5752958457077928,
1547
+ "learning_rate": 1.0910507023531472e-05,
1548
+ "loss": 0.4804,
1549
+ "step": 1100
1550
+ },
1551
+ {
1552
+ "epoch": 1.5075034106412004,
1553
+ "grad_norm": 0.5211397629386293,
1554
+ "learning_rate": 1.0836292945554032e-05,
1555
+ "loss": 0.4748,
1556
+ "step": 1105
1557
+ },
1558
+ {
1559
+ "epoch": 1.514324693042292,
1560
+ "grad_norm": 0.547130320582026,
1561
+ "learning_rate": 1.0762037237091899e-05,
1562
+ "loss": 0.4766,
1563
+ "step": 1110
1564
+ },
1565
+ {
1566
+ "epoch": 1.5211459754433834,
1567
+ "grad_norm": 0.5038548664760011,
1568
+ "learning_rate": 1.068774409661139e-05,
1569
+ "loss": 0.469,
1570
+ "step": 1115
1571
+ },
1572
+ {
1573
+ "epoch": 1.5279672578444747,
1574
+ "grad_norm": 0.5656095933677551,
1575
+ "learning_rate": 1.0613417724695261e-05,
1576
+ "loss": 0.4794,
1577
+ "step": 1120
1578
+ },
1579
+ {
1580
+ "epoch": 1.5347885402455663,
1581
+ "grad_norm": 0.5581357926336143,
1582
+ "learning_rate": 1.053906232380519e-05,
1583
+ "loss": 0.4823,
1584
+ "step": 1125
1585
+ },
1586
+ {
1587
+ "epoch": 1.5416098226466577,
1588
+ "grad_norm": 0.5263311155085898,
1589
+ "learning_rate": 1.0464682098044173e-05,
1590
+ "loss": 0.4681,
1591
+ "step": 1130
1592
+ },
1593
+ {
1594
+ "epoch": 1.548431105047749,
1595
+ "grad_norm": 0.5286624501454517,
1596
+ "learning_rate": 1.039028125291882e-05,
1597
+ "loss": 0.4728,
1598
+ "step": 1135
1599
+ },
1600
+ {
1601
+ "epoch": 1.5552523874488404,
1602
+ "grad_norm": 0.5141887196812334,
1603
+ "learning_rate": 1.0315863995101577e-05,
1604
+ "loss": 0.482,
1605
+ "step": 1140
1606
+ },
1607
+ {
1608
+ "epoch": 1.5620736698499318,
1609
+ "grad_norm": 0.541557132690286,
1610
+ "learning_rate": 1.0241434532192869e-05,
1611
+ "loss": 0.4811,
1612
+ "step": 1145
1613
+ },
1614
+ {
1615
+ "epoch": 1.5688949522510232,
1616
+ "grad_norm": 0.5483345732529048,
1617
+ "learning_rate": 1.016699707248321e-05,
1618
+ "loss": 0.4764,
1619
+ "step": 1150
1620
+ },
1621
+ {
1622
+ "epoch": 1.5757162346521145,
1623
+ "grad_norm": 0.5080094592113272,
1624
+ "learning_rate": 1.0092555824715253e-05,
1625
+ "loss": 0.4769,
1626
+ "step": 1155
1627
+ },
1628
+ {
1629
+ "epoch": 1.5825375170532059,
1630
+ "grad_norm": 0.5443585549154739,
1631
+ "learning_rate": 1.001811499784584e-05,
1632
+ "loss": 0.4851,
1633
+ "step": 1160
1634
+ },
1635
+ {
1636
+ "epoch": 1.5893587994542973,
1637
+ "grad_norm": 0.5235939718294041,
1638
+ "learning_rate": 9.943678800808e-06,
1639
+ "loss": 0.479,
1640
+ "step": 1165
1641
+ },
1642
+ {
1643
+ "epoch": 1.5961800818553888,
1644
+ "grad_norm": 0.5192995671803651,
1645
+ "learning_rate": 9.869251442273002e-06,
1646
+ "loss": 0.4792,
1647
+ "step": 1170
1648
+ },
1649
+ {
1650
+ "epoch": 1.6030013642564802,
1651
+ "grad_norm": 0.5220734079896934,
1652
+ "learning_rate": 9.794837130412373e-06,
1653
+ "loss": 0.4748,
1654
+ "step": 1175
1655
+ },
1656
+ {
1657
+ "epoch": 1.6098226466575716,
1658
+ "grad_norm": 0.5564541153796387,
1659
+ "learning_rate": 9.720440072659974e-06,
1660
+ "loss": 0.4791,
1661
+ "step": 1180
1662
+ },
1663
+ {
1664
+ "epoch": 1.6166439290586632,
1665
+ "grad_norm": 0.5577158118879879,
1666
+ "learning_rate": 9.646064475474109e-06,
1667
+ "loss": 0.48,
1668
+ "step": 1185
1669
+ },
1670
+ {
1671
+ "epoch": 1.6234652114597545,
1672
+ "grad_norm": 0.5224801618763832,
1673
+ "learning_rate": 9.571714544099688e-06,
1674
+ "loss": 0.4791,
1675
+ "step": 1190
1676
+ },
1677
+ {
1678
+ "epoch": 1.630286493860846,
1679
+ "grad_norm": 0.5031867236711891,
1680
+ "learning_rate": 9.497394482330454e-06,
1681
+ "loss": 0.4762,
1682
+ "step": 1195
1683
+ },
1684
+ {
1685
+ "epoch": 1.6371077762619373,
1686
+ "grad_norm": 0.5239289401796862,
1687
+ "learning_rate": 9.423108492271307e-06,
1688
+ "loss": 0.4671,
1689
+ "step": 1200
1690
+ },
1691
+ {
1692
+ "epoch": 1.6439290586630286,
1693
+ "grad_norm": 0.533734708582627,
1694
+ "learning_rate": 9.348860774100707e-06,
1695
+ "loss": 0.4664,
1696
+ "step": 1205
1697
+ },
1698
+ {
1699
+ "epoch": 1.65075034106412,
1700
+ "grad_norm": 0.5274801312649562,
1701
+ "learning_rate": 9.274655525833208e-06,
1702
+ "loss": 0.4762,
1703
+ "step": 1210
1704
+ },
1705
+ {
1706
+ "epoch": 1.6575716234652114,
1707
+ "grad_norm": 0.5521396681696468,
1708
+ "learning_rate": 9.200496943082074e-06,
1709
+ "loss": 0.4746,
1710
+ "step": 1215
1711
+ },
1712
+ {
1713
+ "epoch": 1.6643929058663027,
1714
+ "grad_norm": 0.5495546262058293,
1715
+ "learning_rate": 9.126389218822074e-06,
1716
+ "loss": 0.4712,
1717
+ "step": 1220
1718
+ },
1719
+ {
1720
+ "epoch": 1.6712141882673943,
1721
+ "grad_norm": 0.5336658639008486,
1722
+ "learning_rate": 9.05233654315241e-06,
1723
+ "loss": 0.4805,
1724
+ "step": 1225
1725
+ },
1726
+ {
1727
+ "epoch": 1.6780354706684857,
1728
+ "grad_norm": 0.5160783443020055,
1729
+ "learning_rate": 8.978343103059792e-06,
1730
+ "loss": 0.4807,
1731
+ "step": 1230
1732
+ },
1733
+ {
1734
+ "epoch": 1.684856753069577,
1735
+ "grad_norm": 0.5495255465692145,
1736
+ "learning_rate": 8.904413082181721e-06,
1737
+ "loss": 0.4786,
1738
+ "step": 1235
1739
+ },
1740
+ {
1741
+ "epoch": 1.6916780354706686,
1742
+ "grad_norm": 0.5454256388360902,
1743
+ "learning_rate": 8.830550660569928e-06,
1744
+ "loss": 0.4741,
1745
+ "step": 1240
1746
+ },
1747
+ {
1748
+ "epoch": 1.69849931787176,
1749
+ "grad_norm": 0.544395530290513,
1750
+ "learning_rate": 8.756760014454036e-06,
1751
+ "loss": 0.4712,
1752
+ "step": 1245
1753
+ },
1754
+ {
1755
+ "epoch": 1.7053206002728514,
1756
+ "grad_norm": 0.526798744015,
1757
+ "learning_rate": 8.683045316005439e-06,
1758
+ "loss": 0.4799,
1759
+ "step": 1250
1760
+ },
1761
+ {
1762
+ "epoch": 1.7121418826739427,
1763
+ "grad_norm": 0.5479215362022039,
1764
+ "learning_rate": 8.609410733101398e-06,
1765
+ "loss": 0.466,
1766
+ "step": 1255
1767
+ },
1768
+ {
1769
+ "epoch": 1.718963165075034,
1770
+ "grad_norm": 0.5454292018927241,
1771
+ "learning_rate": 8.53586042908939e-06,
1772
+ "loss": 0.476,
1773
+ "step": 1260
1774
+ },
1775
+ {
1776
+ "epoch": 1.7257844474761255,
1777
+ "grad_norm": 0.5438400943234664,
1778
+ "learning_rate": 8.462398562551707e-06,
1779
+ "loss": 0.4692,
1780
+ "step": 1265
1781
+ },
1782
+ {
1783
+ "epoch": 1.7326057298772168,
1784
+ "grad_norm": 0.5079296040398578,
1785
+ "learning_rate": 8.389029287070331e-06,
1786
+ "loss": 0.4543,
1787
+ "step": 1270
1788
+ },
1789
+ {
1790
+ "epoch": 1.7394270122783082,
1791
+ "grad_norm": 0.5213514483175884,
1792
+ "learning_rate": 8.315756750992082e-06,
1793
+ "loss": 0.4755,
1794
+ "step": 1275
1795
+ },
1796
+ {
1797
+ "epoch": 1.7462482946793996,
1798
+ "grad_norm": 0.5497937661522996,
1799
+ "learning_rate": 8.242585097194073e-06,
1800
+ "loss": 0.4792,
1801
+ "step": 1280
1802
+ },
1803
+ {
1804
+ "epoch": 1.7530695770804912,
1805
+ "grad_norm": 0.5092044188417733,
1806
+ "learning_rate": 8.169518462849465e-06,
1807
+ "loss": 0.4713,
1808
+ "step": 1285
1809
+ },
1810
+ {
1811
+ "epoch": 1.7598908594815825,
1812
+ "grad_norm": 0.546378739229266,
1813
+ "learning_rate": 8.096560979193547e-06,
1814
+ "loss": 0.475,
1815
+ "step": 1290
1816
+ },
1817
+ {
1818
+ "epoch": 1.766712141882674,
1819
+ "grad_norm": 0.5527538137574878,
1820
+ "learning_rate": 8.023716771290165e-06,
1821
+ "loss": 0.4744,
1822
+ "step": 1295
1823
+ },
1824
+ {
1825
+ "epoch": 1.7735334242837655,
1826
+ "grad_norm": 0.4973152890197007,
1827
+ "learning_rate": 7.950989957798477e-06,
1828
+ "loss": 0.4703,
1829
+ "step": 1300
1830
+ },
1831
+ {
1832
+ "epoch": 1.7803547066848568,
1833
+ "grad_norm": 0.5220611124048806,
1834
+ "learning_rate": 7.87838465074008e-06,
1835
+ "loss": 0.4643,
1836
+ "step": 1305
1837
+ },
1838
+ {
1839
+ "epoch": 1.7871759890859482,
1840
+ "grad_norm": 0.5019643870616508,
1841
+ "learning_rate": 7.805904955266521e-06,
1842
+ "loss": 0.4727,
1843
+ "step": 1310
1844
+ },
1845
+ {
1846
+ "epoch": 1.7939972714870396,
1847
+ "grad_norm": 0.49177653695536644,
1848
+ "learning_rate": 7.733554969427188e-06,
1849
+ "loss": 0.4783,
1850
+ "step": 1315
1851
+ },
1852
+ {
1853
+ "epoch": 1.800818553888131,
1854
+ "grad_norm": 0.504298617992289,
1855
+ "learning_rate": 7.661338783937598e-06,
1856
+ "loss": 0.4679,
1857
+ "step": 1320
1858
+ },
1859
+ {
1860
+ "epoch": 1.8076398362892223,
1861
+ "grad_norm": 0.5215596545395237,
1862
+ "learning_rate": 7.58926048194811e-06,
1863
+ "loss": 0.4696,
1864
+ "step": 1325
1865
+ },
1866
+ {
1867
+ "epoch": 1.8144611186903137,
1868
+ "grad_norm": 0.5132453847393322,
1869
+ "learning_rate": 7.51732413881306e-06,
1870
+ "loss": 0.4674,
1871
+ "step": 1330
1872
+ },
1873
+ {
1874
+ "epoch": 1.821282401091405,
1875
+ "grad_norm": 0.5446618772064099,
1876
+ "learning_rate": 7.4455338218603355e-06,
1877
+ "loss": 0.471,
1878
+ "step": 1335
1879
+ },
1880
+ {
1881
+ "epoch": 1.8281036834924966,
1882
+ "grad_norm": 0.5207518211396172,
1883
+ "learning_rate": 7.3738935901614086e-06,
1884
+ "loss": 0.4675,
1885
+ "step": 1340
1886
+ },
1887
+ {
1888
+ "epoch": 1.834924965893588,
1889
+ "grad_norm": 0.5224235181988153,
1890
+ "learning_rate": 7.302407494301838e-06,
1891
+ "loss": 0.4871,
1892
+ "step": 1345
1893
+ },
1894
+ {
1895
+ "epoch": 1.8417462482946794,
1896
+ "grad_norm": 0.5162392641826139,
1897
+ "learning_rate": 7.231079576152233e-06,
1898
+ "loss": 0.4721,
1899
+ "step": 1350
1900
+ },
1901
+ {
1902
+ "epoch": 1.848567530695771,
1903
+ "grad_norm": 0.5144000331528833,
1904
+ "learning_rate": 7.1599138686397375e-06,
1905
+ "loss": 0.4652,
1906
+ "step": 1355
1907
+ },
1908
+ {
1909
+ "epoch": 1.8553888130968623,
1910
+ "grad_norm": 0.5124395398662621,
1911
+ "learning_rate": 7.088914395520003e-06,
1912
+ "loss": 0.4759,
1913
+ "step": 1360
1914
+ },
1915
+ {
1916
+ "epoch": 1.8622100954979537,
1917
+ "grad_norm": 0.4886410778434008,
1918
+ "learning_rate": 7.0180851711496625e-06,
1919
+ "loss": 0.4666,
1920
+ "step": 1365
1921
+ },
1922
+ {
1923
+ "epoch": 1.869031377899045,
1924
+ "grad_norm": 0.5015465903206748,
1925
+ "learning_rate": 6.9474302002594e-06,
1926
+ "loss": 0.4757,
1927
+ "step": 1370
1928
+ },
1929
+ {
1930
+ "epoch": 1.8758526603001364,
1931
+ "grad_norm": 0.5299208502083286,
1932
+ "learning_rate": 6.876953477727476e-06,
1933
+ "loss": 0.4811,
1934
+ "step": 1375
1935
+ },
1936
+ {
1937
+ "epoch": 1.8826739427012278,
1938
+ "grad_norm": 0.5042038155237952,
1939
+ "learning_rate": 6.806658988353873e-06,
1940
+ "loss": 0.4744,
1941
+ "step": 1380
1942
+ },
1943
+ {
1944
+ "epoch": 1.8894952251023192,
1945
+ "grad_norm": 0.5112099000483086,
1946
+ "learning_rate": 6.736550706634994e-06,
1947
+ "loss": 0.4758,
1948
+ "step": 1385
1949
+ },
1950
+ {
1951
+ "epoch": 1.8963165075034105,
1952
+ "grad_norm": 0.4905834706956801,
1953
+ "learning_rate": 6.666632596538943e-06,
1954
+ "loss": 0.4661,
1955
+ "step": 1390
1956
+ },
1957
+ {
1958
+ "epoch": 1.9031377899045019,
1959
+ "grad_norm": 0.5212386889650834,
1960
+ "learning_rate": 6.5969086112813914e-06,
1961
+ "loss": 0.48,
1962
+ "step": 1395
1963
+ },
1964
+ {
1965
+ "epoch": 1.9099590723055935,
1966
+ "grad_norm": 0.5123640864018402,
1967
+ "learning_rate": 6.527382693102068e-06,
1968
+ "loss": 0.47,
1969
+ "step": 1400
1970
+ },
1971
+ {
1972
+ "epoch": 1.9167803547066848,
1973
+ "grad_norm": 0.4981927898415894,
1974
+ "learning_rate": 6.458058773041857e-06,
1975
+ "loss": 0.481,
1976
+ "step": 1405
1977
+ },
1978
+ {
1979
+ "epoch": 1.9236016371077762,
1980
+ "grad_norm": 0.5146201615715856,
1981
+ "learning_rate": 6.38894077072054e-06,
1982
+ "loss": 0.4763,
1983
+ "step": 1410
1984
+ },
1985
+ {
1986
+ "epoch": 1.9304229195088678,
1987
+ "grad_norm": 0.5059290660714733,
1988
+ "learning_rate": 6.320032594115169e-06,
1989
+ "loss": 0.4612,
1990
+ "step": 1415
1991
+ },
1992
+ {
1993
+ "epoch": 1.9372442019099592,
1994
+ "grad_norm": 0.5154624574991551,
1995
+ "learning_rate": 6.251338139339119e-06,
1996
+ "loss": 0.4779,
1997
+ "step": 1420
1998
+ },
1999
+ {
2000
+ "epoch": 1.9440654843110505,
2001
+ "grad_norm": 0.520065682019996,
2002
+ "learning_rate": 6.182861290421796e-06,
2003
+ "loss": 0.4758,
2004
+ "step": 1425
2005
+ },
2006
+ {
2007
+ "epoch": 1.950886766712142,
2008
+ "grad_norm": 0.5041820103599076,
2009
+ "learning_rate": 6.114605919089017e-06,
2010
+ "loss": 0.4771,
2011
+ "step": 1430
2012
+ },
2013
+ {
2014
+ "epoch": 1.9577080491132333,
2015
+ "grad_norm": 0.5103932850618732,
2016
+ "learning_rate": 6.0465758845441204e-06,
2017
+ "loss": 0.4767,
2018
+ "step": 1435
2019
+ },
2020
+ {
2021
+ "epoch": 1.9645293315143246,
2022
+ "grad_norm": 0.4939939604899659,
2023
+ "learning_rate": 5.978775033249754e-06,
2024
+ "loss": 0.4745,
2025
+ "step": 1440
2026
+ },
2027
+ {
2028
+ "epoch": 1.971350613915416,
2029
+ "grad_norm": 0.5134387665081097,
2030
+ "learning_rate": 5.911207198710388e-06,
2031
+ "loss": 0.4654,
2032
+ "step": 1445
2033
+ },
2034
+ {
2035
+ "epoch": 1.9781718963165074,
2036
+ "grad_norm": 0.5268322175857719,
2037
+ "learning_rate": 5.843876201255586e-06,
2038
+ "loss": 0.4756,
2039
+ "step": 1450
2040
+ },
2041
+ {
2042
+ "epoch": 1.984993178717599,
2043
+ "grad_norm": 0.5317341013740563,
2044
+ "learning_rate": 5.776785847823971e-06,
2045
+ "loss": 0.4829,
2046
+ "step": 1455
2047
+ },
2048
+ {
2049
+ "epoch": 1.9918144611186903,
2050
+ "grad_norm": 0.5348011864192631,
2051
+ "learning_rate": 5.709939931748017e-06,
2052
+ "loss": 0.4777,
2053
+ "step": 1460
2054
+ },
2055
+ {
2056
+ "epoch": 1.9986357435197817,
2057
+ "grad_norm": 0.48497212839055176,
2058
+ "learning_rate": 5.643342232539524e-06,
2059
+ "loss": 0.4675,
2060
+ "step": 1465
2061
+ },
2062
+ {
2063
+ "epoch": 2.0054570259208733,
2064
+ "grad_norm": 0.5048513009902976,
2065
+ "learning_rate": 5.57699651567597e-06,
2066
+ "loss": 0.4293,
2067
+ "step": 1470
2068
+ },
2069
+ {
2070
+ "epoch": 2.0122783083219646,
2071
+ "grad_norm": 0.5741103214956879,
2072
+ "learning_rate": 5.510906532387582e-06,
2073
+ "loss": 0.4209,
2074
+ "step": 1475
2075
+ },
2076
+ {
2077
+ "epoch": 2.019099590723056,
2078
+ "grad_norm": 0.5130214085529198,
2079
+ "learning_rate": 5.445076019445228e-06,
2080
+ "loss": 0.4209,
2081
+ "step": 1480
2082
+ },
2083
+ {
2084
+ "epoch": 2.0259208731241474,
2085
+ "grad_norm": 0.520457444842433,
2086
+ "learning_rate": 5.379508698949177e-06,
2087
+ "loss": 0.4239,
2088
+ "step": 1485
2089
+ },
2090
+ {
2091
+ "epoch": 2.0327421555252387,
2092
+ "grad_norm": 0.5127234405027133,
2093
+ "learning_rate": 5.314208278118605e-06,
2094
+ "loss": 0.4297,
2095
+ "step": 1490
2096
+ },
2097
+ {
2098
+ "epoch": 2.03956343792633,
2099
+ "grad_norm": 0.519641634043921,
2100
+ "learning_rate": 5.249178449082015e-06,
2101
+ "loss": 0.4172,
2102
+ "step": 1495
2103
+ },
2104
+ {
2105
+ "epoch": 2.0463847203274215,
2106
+ "grad_norm": 0.49773115879028945,
2107
+ "learning_rate": 5.184422888668473e-06,
2108
+ "loss": 0.4082,
2109
+ "step": 1500
2110
+ },
2111
+ {
2112
+ "epoch": 2.053206002728513,
2113
+ "grad_norm": 0.4994269378088386,
2114
+ "learning_rate": 5.119945258199712e-06,
2115
+ "loss": 0.4201,
2116
+ "step": 1505
2117
+ },
2118
+ {
2119
+ "epoch": 2.060027285129604,
2120
+ "grad_norm": 0.5102610726202969,
2121
+ "learning_rate": 5.055749203283138e-06,
2122
+ "loss": 0.4184,
2123
+ "step": 1510
2124
+ },
2125
+ {
2126
+ "epoch": 2.0668485675306956,
2127
+ "grad_norm": 0.5179420279961003,
2128
+ "learning_rate": 4.991838353605678e-06,
2129
+ "loss": 0.4291,
2130
+ "step": 1515
2131
+ },
2132
+ {
2133
+ "epoch": 2.0736698499317874,
2134
+ "grad_norm": 0.5107427875348688,
2135
+ "learning_rate": 4.928216322728564e-06,
2136
+ "loss": 0.4148,
2137
+ "step": 1520
2138
+ },
2139
+ {
2140
+ "epoch": 2.0804911323328787,
2141
+ "grad_norm": 0.5416051556874933,
2142
+ "learning_rate": 4.864886707883039e-06,
2143
+ "loss": 0.4241,
2144
+ "step": 1525
2145
+ },
2146
+ {
2147
+ "epoch": 2.08731241473397,
2148
+ "grad_norm": 0.5165355592944862,
2149
+ "learning_rate": 4.8018530897669495e-06,
2150
+ "loss": 0.43,
2151
+ "step": 1530
2152
+ },
2153
+ {
2154
+ "epoch": 2.0941336971350615,
2155
+ "grad_norm": 0.5321827607278135,
2156
+ "learning_rate": 4.739119032342281e-06,
2157
+ "loss": 0.4273,
2158
+ "step": 1535
2159
+ },
2160
+ {
2161
+ "epoch": 2.100954979536153,
2162
+ "grad_norm": 0.5323598331118466,
2163
+ "learning_rate": 4.676688082633677e-06,
2164
+ "loss": 0.425,
2165
+ "step": 1540
2166
+ },
2167
+ {
2168
+ "epoch": 2.107776261937244,
2169
+ "grad_norm": 0.505907638881496,
2170
+ "learning_rate": 4.6145637705278586e-06,
2171
+ "loss": 0.4219,
2172
+ "step": 1545
2173
+ },
2174
+ {
2175
+ "epoch": 2.1145975443383356,
2176
+ "grad_norm": 0.5077386983402395,
2177
+ "learning_rate": 4.5527496085740715e-06,
2178
+ "loss": 0.4296,
2179
+ "step": 1550
2180
+ },
2181
+ {
2182
+ "epoch": 2.121418826739427,
2183
+ "grad_norm": 0.5140822781413095,
2184
+ "learning_rate": 4.4912490917854526e-06,
2185
+ "loss": 0.4296,
2186
+ "step": 1555
2187
+ },
2188
+ {
2189
+ "epoch": 2.1282401091405183,
2190
+ "grad_norm": 0.5019011818755275,
2191
+ "learning_rate": 4.43006569744145e-06,
2192
+ "loss": 0.4122,
2193
+ "step": 1560
2194
+ },
2195
+ {
2196
+ "epoch": 2.1350613915416097,
2197
+ "grad_norm": 0.5281323996139589,
2198
+ "learning_rate": 4.369202884891194e-06,
2199
+ "loss": 0.4216,
2200
+ "step": 1565
2201
+ },
2202
+ {
2203
+ "epoch": 2.141882673942701,
2204
+ "grad_norm": 0.5112203302750289,
2205
+ "learning_rate": 4.308664095357906e-06,
2206
+ "loss": 0.4287,
2207
+ "step": 1570
2208
+ },
2209
+ {
2210
+ "epoch": 2.148703956343793,
2211
+ "grad_norm": 0.5005308937211395,
2212
+ "learning_rate": 4.248452751744348e-06,
2213
+ "loss": 0.4191,
2214
+ "step": 1575
2215
+ },
2216
+ {
2217
+ "epoch": 2.155525238744884,
2218
+ "grad_norm": 0.5171516047636492,
2219
+ "learning_rate": 4.188572258439263e-06,
2220
+ "loss": 0.4246,
2221
+ "step": 1580
2222
+ },
2223
+ {
2224
+ "epoch": 2.1623465211459756,
2225
+ "grad_norm": 0.5060884195233146,
2226
+ "learning_rate": 4.129026001124905e-06,
2227
+ "loss": 0.4226,
2228
+ "step": 1585
2229
+ },
2230
+ {
2231
+ "epoch": 2.169167803547067,
2232
+ "grad_norm": 0.49964343776405634,
2233
+ "learning_rate": 4.069817346585614e-06,
2234
+ "loss": 0.4203,
2235
+ "step": 1590
2236
+ },
2237
+ {
2238
+ "epoch": 2.1759890859481583,
2239
+ "grad_norm": 0.503696770859411,
2240
+ "learning_rate": 4.010949642517433e-06,
2241
+ "loss": 0.4298,
2242
+ "step": 1595
2243
+ },
2244
+ {
2245
+ "epoch": 2.1828103683492497,
2246
+ "grad_norm": 0.49161182387090824,
2247
+ "learning_rate": 3.952426217338859e-06,
2248
+ "loss": 0.4301,
2249
+ "step": 1600
2250
+ },
2251
+ {
2252
+ "epoch": 2.189631650750341,
2253
+ "grad_norm": 0.5160224829314862,
2254
+ "learning_rate": 3.89425038000262e-06,
2255
+ "loss": 0.4152,
2256
+ "step": 1605
2257
+ },
2258
+ {
2259
+ "epoch": 2.1964529331514324,
2260
+ "grad_norm": 0.5197468302526228,
2261
+ "learning_rate": 3.836425419808615e-06,
2262
+ "loss": 0.4222,
2263
+ "step": 1610
2264
+ },
2265
+ {
2266
+ "epoch": 2.203274215552524,
2267
+ "grad_norm": 0.5256478132720807,
2268
+ "learning_rate": 3.7789546062179073e-06,
2269
+ "loss": 0.4294,
2270
+ "step": 1615
2271
+ },
2272
+ {
2273
+ "epoch": 2.210095497953615,
2274
+ "grad_norm": 0.49468837343517935,
2275
+ "learning_rate": 3.721841188667883e-06,
2276
+ "loss": 0.4288,
2277
+ "step": 1620
2278
+ },
2279
+ {
2280
+ "epoch": 2.2169167803547065,
2281
+ "grad_norm": 0.5288153592830379,
2282
+ "learning_rate": 3.66508839638853e-06,
2283
+ "loss": 0.4259,
2284
+ "step": 1625
2285
+ },
2286
+ {
2287
+ "epoch": 2.223738062755798,
2288
+ "grad_norm": 0.5197337908496411,
2289
+ "learning_rate": 3.608699438219835e-06,
2290
+ "loss": 0.4189,
2291
+ "step": 1630
2292
+ },
2293
+ {
2294
+ "epoch": 2.2305593451568897,
2295
+ "grad_norm": 0.503686422687,
2296
+ "learning_rate": 3.5526775024303743e-06,
2297
+ "loss": 0.4339,
2298
+ "step": 1635
2299
+ },
2300
+ {
2301
+ "epoch": 2.237380627557981,
2302
+ "grad_norm": 0.5056806299150596,
2303
+ "learning_rate": 3.4970257565370426e-06,
2304
+ "loss": 0.42,
2305
+ "step": 1640
2306
+ },
2307
+ {
2308
+ "epoch": 2.2442019099590724,
2309
+ "grad_norm": 0.5284748263602718,
2310
+ "learning_rate": 3.441747347125944e-06,
2311
+ "loss": 0.4239,
2312
+ "step": 1645
2313
+ },
2314
+ {
2315
+ "epoch": 2.251023192360164,
2316
+ "grad_norm": 0.5304486815572842,
2317
+ "learning_rate": 3.386845399674505e-06,
2318
+ "loss": 0.4283,
2319
+ "step": 1650
2320
+ },
2321
+ {
2322
+ "epoch": 2.257844474761255,
2323
+ "grad_norm": 0.5162041364746609,
2324
+ "learning_rate": 3.332323018374739e-06,
2325
+ "loss": 0.4231,
2326
+ "step": 1655
2327
+ },
2328
+ {
2329
+ "epoch": 2.2646657571623465,
2330
+ "grad_norm": 0.5172072565381528,
2331
+ "learning_rate": 3.278183285957741e-06,
2332
+ "loss": 0.4299,
2333
+ "step": 1660
2334
+ },
2335
+ {
2336
+ "epoch": 2.271487039563438,
2337
+ "grad_norm": 0.5070978563723317,
2338
+ "learning_rate": 3.2244292635193967e-06,
2339
+ "loss": 0.4161,
2340
+ "step": 1665
2341
+ },
2342
+ {
2343
+ "epoch": 2.2783083219645293,
2344
+ "grad_norm": 0.5025104999699348,
2345
+ "learning_rate": 3.1710639903472857e-06,
2346
+ "loss": 0.4191,
2347
+ "step": 1670
2348
+ },
2349
+ {
2350
+ "epoch": 2.2851296043656206,
2351
+ "grad_norm": 0.5301696910477306,
2352
+ "learning_rate": 3.1180904837488603e-06,
2353
+ "loss": 0.4269,
2354
+ "step": 1675
2355
+ },
2356
+ {
2357
+ "epoch": 2.291950886766712,
2358
+ "grad_norm": 0.5043851490911145,
2359
+ "learning_rate": 3.0655117388808225e-06,
2360
+ "loss": 0.4222,
2361
+ "step": 1680
2362
+ },
2363
+ {
2364
+ "epoch": 2.2987721691678034,
2365
+ "grad_norm": 0.5168822676176625,
2366
+ "learning_rate": 3.0133307285798013e-06,
2367
+ "loss": 0.4251,
2368
+ "step": 1685
2369
+ },
2370
+ {
2371
+ "epoch": 2.305593451568895,
2372
+ "grad_norm": 0.49344869094402655,
2373
+ "learning_rate": 2.961550403194247e-06,
2374
+ "loss": 0.4267,
2375
+ "step": 1690
2376
+ },
2377
+ {
2378
+ "epoch": 2.3124147339699865,
2379
+ "grad_norm": 0.4945480487576485,
2380
+ "learning_rate": 2.910173690417618e-06,
2381
+ "loss": 0.4198,
2382
+ "step": 1695
2383
+ },
2384
+ {
2385
+ "epoch": 2.319236016371078,
2386
+ "grad_norm": 0.4996549457910774,
2387
+ "learning_rate": 2.859203495122861e-06,
2388
+ "loss": 0.4172,
2389
+ "step": 1700
2390
+ },
2391
+ {
2392
+ "epoch": 2.3260572987721693,
2393
+ "grad_norm": 0.5080873281642481,
2394
+ "learning_rate": 2.8086426991981524e-06,
2395
+ "loss": 0.4341,
2396
+ "step": 1705
2397
+ },
2398
+ {
2399
+ "epoch": 2.3328785811732606,
2400
+ "grad_norm": 0.518725406601656,
2401
+ "learning_rate": 2.7584941613839577e-06,
2402
+ "loss": 0.4243,
2403
+ "step": 1710
2404
+ },
2405
+ {
2406
+ "epoch": 2.339699863574352,
2407
+ "grad_norm": 0.5005709253340304,
2408
+ "learning_rate": 2.708760717111409e-06,
2409
+ "loss": 0.4183,
2410
+ "step": 1715
2411
+ },
2412
+ {
2413
+ "epoch": 2.3465211459754434,
2414
+ "grad_norm": 0.5025661764368299,
2415
+ "learning_rate": 2.659445178341967e-06,
2416
+ "loss": 0.4124,
2417
+ "step": 1720
2418
+ },
2419
+ {
2420
+ "epoch": 2.3533424283765347,
2421
+ "grad_norm": 0.526802256910979,
2422
+ "learning_rate": 2.6105503334084543e-06,
2423
+ "loss": 0.4131,
2424
+ "step": 1725
2425
+ },
2426
+ {
2427
+ "epoch": 2.360163710777626,
2428
+ "grad_norm": 0.49232219642558783,
2429
+ "learning_rate": 2.5620789468573736e-06,
2430
+ "loss": 0.4189,
2431
+ "step": 1730
2432
+ },
2433
+ {
2434
+ "epoch": 2.3669849931787175,
2435
+ "grad_norm": 0.5031403656147391,
2436
+ "learning_rate": 2.51403375929263e-06,
2437
+ "loss": 0.4271,
2438
+ "step": 1735
2439
+ },
2440
+ {
2441
+ "epoch": 2.373806275579809,
2442
+ "grad_norm": 0.47932913781175096,
2443
+ "learning_rate": 2.4664174872205447e-06,
2444
+ "loss": 0.4131,
2445
+ "step": 1740
2446
+ },
2447
+ {
2448
+ "epoch": 2.3806275579809,
2449
+ "grad_norm": 0.5000820600467547,
2450
+ "learning_rate": 2.4192328228962844e-06,
2451
+ "loss": 0.4137,
2452
+ "step": 1745
2453
+ },
2454
+ {
2455
+ "epoch": 2.3874488403819916,
2456
+ "grad_norm": 0.5424591208082046,
2457
+ "learning_rate": 2.372482434171635e-06,
2458
+ "loss": 0.4261,
2459
+ "step": 1750
2460
+ },
2461
+ {
2462
+ "epoch": 2.3942701227830834,
2463
+ "grad_norm": 0.4861046037404662,
2464
+ "learning_rate": 2.326168964344147e-06,
2465
+ "loss": 0.4168,
2466
+ "step": 1755
2467
+ },
2468
+ {
2469
+ "epoch": 2.4010914051841747,
2470
+ "grad_norm": 0.496354816091883,
2471
+ "learning_rate": 2.2802950320076925e-06,
2472
+ "loss": 0.4276,
2473
+ "step": 1760
2474
+ },
2475
+ {
2476
+ "epoch": 2.407912687585266,
2477
+ "grad_norm": 0.5114949143831609,
2478
+ "learning_rate": 2.2348632309044124e-06,
2479
+ "loss": 0.421,
2480
+ "step": 1765
2481
+ },
2482
+ {
2483
+ "epoch": 2.4147339699863575,
2484
+ "grad_norm": 0.492463054273807,
2485
+ "learning_rate": 2.18987612977805e-06,
2486
+ "loss": 0.4191,
2487
+ "step": 1770
2488
+ },
2489
+ {
2490
+ "epoch": 2.421555252387449,
2491
+ "grad_norm": 0.5151725907315208,
2492
+ "learning_rate": 2.1453362722287322e-06,
2493
+ "loss": 0.4258,
2494
+ "step": 1775
2495
+ },
2496
+ {
2497
+ "epoch": 2.42837653478854,
2498
+ "grad_norm": 0.5089343428748886,
2499
+ "learning_rate": 2.1012461765691304e-06,
2500
+ "loss": 0.4233,
2501
+ "step": 1780
2502
+ },
2503
+ {
2504
+ "epoch": 2.4351978171896316,
2505
+ "grad_norm": 0.5093727780183989,
2506
+ "learning_rate": 2.057608335682089e-06,
2507
+ "loss": 0.4241,
2508
+ "step": 1785
2509
+ },
2510
+ {
2511
+ "epoch": 2.442019099590723,
2512
+ "grad_norm": 0.49745508628055146,
2513
+ "learning_rate": 2.014425216879672e-06,
2514
+ "loss": 0.4212,
2515
+ "step": 1790
2516
+ },
2517
+ {
2518
+ "epoch": 2.4488403819918143,
2519
+ "grad_norm": 0.487004125589352,
2520
+ "learning_rate": 1.9716992617636627e-06,
2521
+ "loss": 0.42,
2522
+ "step": 1795
2523
+ },
2524
+ {
2525
+ "epoch": 2.4556616643929057,
2526
+ "grad_norm": 0.4961490394906471,
2527
+ "learning_rate": 1.9294328860875033e-06,
2528
+ "loss": 0.4256,
2529
+ "step": 1800
2530
+ },
2531
+ {
2532
+ "epoch": 2.4624829467939975,
2533
+ "grad_norm": 0.47821507063944224,
2534
+ "learning_rate": 1.8876284796197221e-06,
2535
+ "loss": 0.4162,
2536
+ "step": 1805
2537
+ },
2538
+ {
2539
+ "epoch": 2.469304229195089,
2540
+ "grad_norm": 0.5163338399882206,
2541
+ "learning_rate": 1.8462884060087963e-06,
2542
+ "loss": 0.422,
2543
+ "step": 1810
2544
+ },
2545
+ {
2546
+ "epoch": 2.47612551159618,
2547
+ "grad_norm": 0.5295705164637108,
2548
+ "learning_rate": 1.8054150026495306e-06,
2549
+ "loss": 0.4142,
2550
+ "step": 1815
2551
+ },
2552
+ {
2553
+ "epoch": 2.4829467939972716,
2554
+ "grad_norm": 0.5009709028251212,
2555
+ "learning_rate": 1.7650105805508798e-06,
2556
+ "loss": 0.4268,
2557
+ "step": 1820
2558
+ },
2559
+ {
2560
+ "epoch": 2.489768076398363,
2561
+ "grad_norm": 0.4838767553323068,
2562
+ "learning_rate": 1.7250774242052967e-06,
2563
+ "loss": 0.4152,
2564
+ "step": 1825
2565
+ },
2566
+ {
2567
+ "epoch": 2.4965893587994543,
2568
+ "grad_norm": 0.49748189595154224,
2569
+ "learning_rate": 1.6856177914595588e-06,
2570
+ "loss": 0.4188,
2571
+ "step": 1830
2572
+ },
2573
+ {
2574
+ "epoch": 2.5034106412005457,
2575
+ "grad_norm": 0.5004689758885488,
2576
+ "learning_rate": 1.6466339133871056e-06,
2577
+ "loss": 0.4166,
2578
+ "step": 1835
2579
+ },
2580
+ {
2581
+ "epoch": 2.510231923601637,
2582
+ "grad_norm": 0.5040308226552292,
2583
+ "learning_rate": 1.6081279941619036e-06,
2584
+ "loss": 0.4279,
2585
+ "step": 1840
2586
+ },
2587
+ {
2588
+ "epoch": 2.5170532060027284,
2589
+ "grad_norm": 0.5041171513103451,
2590
+ "learning_rate": 1.570102210933806e-06,
2591
+ "loss": 0.4198,
2592
+ "step": 1845
2593
+ },
2594
+ {
2595
+ "epoch": 2.52387448840382,
2596
+ "grad_norm": 0.5422732550913443,
2597
+ "learning_rate": 1.5325587137054692e-06,
2598
+ "loss": 0.4255,
2599
+ "step": 1850
2600
+ },
2601
+ {
2602
+ "epoch": 2.530695770804911,
2603
+ "grad_norm": 0.5214024951095575,
2604
+ "learning_rate": 1.495499625210782e-06,
2605
+ "loss": 0.4363,
2606
+ "step": 1855
2607
+ },
2608
+ {
2609
+ "epoch": 2.5375170532060025,
2610
+ "grad_norm": 0.52314445884365,
2611
+ "learning_rate": 1.4589270407948413e-06,
2612
+ "loss": 0.4234,
2613
+ "step": 1860
2614
+ },
2615
+ {
2616
+ "epoch": 2.544338335607094,
2617
+ "grad_norm": 0.5140411799960605,
2618
+ "learning_rate": 1.4228430282954936e-06,
2619
+ "loss": 0.4251,
2620
+ "step": 1865
2621
+ },
2622
+ {
2623
+ "epoch": 2.5511596180081857,
2624
+ "grad_norm": 0.49968313162179895,
2625
+ "learning_rate": 1.3872496279264012e-06,
2626
+ "loss": 0.42,
2627
+ "step": 1870
2628
+ },
2629
+ {
2630
+ "epoch": 2.557980900409277,
2631
+ "grad_norm": 0.4952154436629377,
2632
+ "learning_rate": 1.352148852161704e-06,
2633
+ "loss": 0.4193,
2634
+ "step": 1875
2635
+ },
2636
+ {
2637
+ "epoch": 2.5648021828103684,
2638
+ "grad_norm": 0.5024039679399355,
2639
+ "learning_rate": 1.3175426856222196e-06,
2640
+ "loss": 0.4201,
2641
+ "step": 1880
2642
+ },
2643
+ {
2644
+ "epoch": 2.57162346521146,
2645
+ "grad_norm": 0.5090323431432741,
2646
+ "learning_rate": 1.2834330849632344e-06,
2647
+ "loss": 0.4248,
2648
+ "step": 1885
2649
+ },
2650
+ {
2651
+ "epoch": 2.578444747612551,
2652
+ "grad_norm": 0.5083201849990721,
2653
+ "learning_rate": 1.249821978763882e-06,
2654
+ "loss": 0.4192,
2655
+ "step": 1890
2656
+ },
2657
+ {
2658
+ "epoch": 2.5852660300136425,
2659
+ "grad_norm": 0.49117019077982144,
2660
+ "learning_rate": 1.2167112674180868e-06,
2661
+ "loss": 0.4172,
2662
+ "step": 1895
2663
+ },
2664
+ {
2665
+ "epoch": 2.592087312414734,
2666
+ "grad_norm": 0.4927717554184724,
2667
+ "learning_rate": 1.184102823027123e-06,
2668
+ "loss": 0.417,
2669
+ "step": 1900
2670
+ },
2671
+ {
2672
+ "epoch": 2.5989085948158253,
2673
+ "grad_norm": 0.5005358781417781,
2674
+ "learning_rate": 1.1519984892937682e-06,
2675
+ "loss": 0.4185,
2676
+ "step": 1905
2677
+ },
2678
+ {
2679
+ "epoch": 2.6057298772169166,
2680
+ "grad_norm": 0.5405574472133523,
2681
+ "learning_rate": 1.1204000814180418e-06,
2682
+ "loss": 0.4272,
2683
+ "step": 1910
2684
+ },
2685
+ {
2686
+ "epoch": 2.6125511596180084,
2687
+ "grad_norm": 0.49133740899149486,
2688
+ "learning_rate": 1.0893093859945979e-06,
2689
+ "loss": 0.4145,
2690
+ "step": 1915
2691
+ },
2692
+ {
2693
+ "epoch": 2.6193724420191,
2694
+ "grad_norm": 0.509067826909446,
2695
+ "learning_rate": 1.058728160911686e-06,
2696
+ "loss": 0.4145,
2697
+ "step": 1920
2698
+ },
2699
+ {
2700
+ "epoch": 2.626193724420191,
2701
+ "grad_norm": 0.5086329096510834,
2702
+ "learning_rate": 1.028658135251772e-06,
2703
+ "loss": 0.4318,
2704
+ "step": 1925
2705
+ },
2706
+ {
2707
+ "epoch": 2.6330150068212825,
2708
+ "grad_norm": 0.48990925278828223,
2709
+ "learning_rate": 9.99101009193777e-07,
2710
+ "loss": 0.4231,
2711
+ "step": 1930
2712
+ },
2713
+ {
2714
+ "epoch": 2.639836289222374,
2715
+ "grad_norm": 0.5132773338699153,
2716
+ "learning_rate": 9.70058453916936e-07,
2717
+ "loss": 0.4277,
2718
+ "step": 1935
2719
+ },
2720
+ {
2721
+ "epoch": 2.6466575716234653,
2722
+ "grad_norm": 0.49623523079302395,
2723
+ "learning_rate": 9.415321115063212e-07,
2724
+ "loss": 0.4265,
2725
+ "step": 1940
2726
+ },
2727
+ {
2728
+ "epoch": 2.6534788540245566,
2729
+ "grad_norm": 0.4997209001664598,
2730
+ "learning_rate": 9.135235948599877e-07,
2731
+ "loss": 0.4251,
2732
+ "step": 1945
2733
+ },
2734
+ {
2735
+ "epoch": 2.660300136425648,
2736
+ "grad_norm": 0.49789202327580734,
2737
+ "learning_rate": 8.860344875977853e-07,
2738
+ "loss": 0.4099,
2739
+ "step": 1950
2740
+ },
2741
+ {
2742
+ "epoch": 2.6671214188267394,
2743
+ "grad_norm": 0.49962306285337266,
2744
+ "learning_rate": 8.590663439718198e-07,
2745
+ "loss": 0.4283,
2746
+ "step": 1955
2747
+ },
2748
+ {
2749
+ "epoch": 2.6739427012278307,
2750
+ "grad_norm": 0.49059761628317505,
2751
+ "learning_rate": 8.326206887785666e-07,
2752
+ "loss": 0.4196,
2753
+ "step": 1960
2754
+ },
2755
+ {
2756
+ "epoch": 2.680763983628922,
2757
+ "grad_norm": 0.4878700198807422,
2758
+ "learning_rate": 8.066990172726674e-07,
2759
+ "loss": 0.4259,
2760
+ "step": 1965
2761
+ },
2762
+ {
2763
+ "epoch": 2.6875852660300135,
2764
+ "grad_norm": 0.4818193357568829,
2765
+ "learning_rate": 7.813027950823838e-07,
2766
+ "loss": 0.4229,
2767
+ "step": 1970
2768
+ },
2769
+ {
2770
+ "epoch": 2.694406548431105,
2771
+ "grad_norm": 0.4863471329457305,
2772
+ "learning_rate": 7.564334581267252e-07,
2773
+ "loss": 0.4169,
2774
+ "step": 1975
2775
+ },
2776
+ {
2777
+ "epoch": 2.701227830832196,
2778
+ "grad_norm": 0.49923983484265055,
2779
+ "learning_rate": 7.320924125342713e-07,
2780
+ "loss": 0.4187,
2781
+ "step": 1980
2782
+ },
2783
+ {
2784
+ "epoch": 2.708049113233288,
2785
+ "grad_norm": 0.5018790036089281,
2786
+ "learning_rate": 7.082810345636574e-07,
2787
+ "loss": 0.4207,
2788
+ "step": 1985
2789
+ },
2790
+ {
2791
+ "epoch": 2.7148703956343794,
2792
+ "grad_norm": 0.500405512783932,
2793
+ "learning_rate": 6.850006705257689e-07,
2794
+ "loss": 0.4247,
2795
+ "step": 1990
2796
+ },
2797
+ {
2798
+ "epoch": 2.7216916780354707,
2799
+ "grad_norm": 0.5025182540120037,
2800
+ "learning_rate": 6.622526367076129e-07,
2801
+ "loss": 0.4276,
2802
+ "step": 1995
2803
+ },
2804
+ {
2805
+ "epoch": 2.728512960436562,
2806
+ "grad_norm": 0.5026425276086048,
2807
+ "learning_rate": 6.400382192979011e-07,
2808
+ "loss": 0.4247,
2809
+ "step": 2000
2810
+ },
2811
+ {
2812
+ "epoch": 2.7353342428376535,
2813
+ "grad_norm": 0.5050638447625582,
2814
+ "learning_rate": 6.183586743143195e-07,
2815
+ "loss": 0.4317,
2816
+ "step": 2005
2817
+ },
2818
+ {
2819
+ "epoch": 2.742155525238745,
2820
+ "grad_norm": 0.49008521019733564,
2821
+ "learning_rate": 5.972152275325236e-07,
2822
+ "loss": 0.4152,
2823
+ "step": 2010
2824
+ },
2825
+ {
2826
+ "epoch": 2.748976807639836,
2827
+ "grad_norm": 0.4845822249557805,
2828
+ "learning_rate": 5.766090744168215e-07,
2829
+ "loss": 0.4097,
2830
+ "step": 2015
2831
+ },
2832
+ {
2833
+ "epoch": 2.7557980900409276,
2834
+ "grad_norm": 0.49381205885998286,
2835
+ "learning_rate": 5.565413800525883e-07,
2836
+ "loss": 0.428,
2837
+ "step": 2020
2838
+ },
2839
+ {
2840
+ "epoch": 2.762619372442019,
2841
+ "grad_norm": 0.4928613780812328,
2842
+ "learning_rate": 5.370132790803862e-07,
2843
+ "loss": 0.4258,
2844
+ "step": 2025
2845
+ },
2846
+ {
2847
+ "epoch": 2.7694406548431107,
2848
+ "grad_norm": 0.5121246868411271,
2849
+ "learning_rate": 5.180258756318177e-07,
2850
+ "loss": 0.4312,
2851
+ "step": 2030
2852
+ },
2853
+ {
2854
+ "epoch": 2.776261937244202,
2855
+ "grad_norm": 0.5054202294126638,
2856
+ "learning_rate": 4.995802432670913e-07,
2857
+ "loss": 0.4228,
2858
+ "step": 2035
2859
+ },
2860
+ {
2861
+ "epoch": 2.7830832196452935,
2862
+ "grad_norm": 0.5032311759948168,
2863
+ "learning_rate": 4.816774249143254e-07,
2864
+ "loss": 0.4224,
2865
+ "step": 2040
2866
+ },
2867
+ {
2868
+ "epoch": 2.789904502046385,
2869
+ "grad_norm": 0.4892276973817304,
2870
+ "learning_rate": 4.643184328105774e-07,
2871
+ "loss": 0.4172,
2872
+ "step": 2045
2873
+ },
2874
+ {
2875
+ "epoch": 2.796725784447476,
2876
+ "grad_norm": 0.5331616596228133,
2877
+ "learning_rate": 4.4750424844461485e-07,
2878
+ "loss": 0.4282,
2879
+ "step": 2050
2880
+ },
2881
+ {
2882
+ "epoch": 2.8035470668485676,
2883
+ "grad_norm": 0.48337446719708316,
2884
+ "learning_rate": 4.3123582250141833e-07,
2885
+ "loss": 0.4324,
2886
+ "step": 2055
2887
+ },
2888
+ {
2889
+ "epoch": 2.810368349249659,
2890
+ "grad_norm": 0.5113503747508404,
2891
+ "learning_rate": 4.1551407480843143e-07,
2892
+ "loss": 0.4145,
2893
+ "step": 2060
2894
+ },
2895
+ {
2896
+ "epoch": 2.8171896316507503,
2897
+ "grad_norm": 0.5152840097770183,
2898
+ "learning_rate": 4.0033989428354903e-07,
2899
+ "loss": 0.4187,
2900
+ "step": 2065
2901
+ },
2902
+ {
2903
+ "epoch": 2.8240109140518417,
2904
+ "grad_norm": 0.4978525864335864,
2905
+ "learning_rate": 3.857141388848656e-07,
2906
+ "loss": 0.4259,
2907
+ "step": 2070
2908
+ },
2909
+ {
2910
+ "epoch": 2.830832196452933,
2911
+ "grad_norm": 0.4928747536539546,
2912
+ "learning_rate": 3.716376355621544e-07,
2913
+ "loss": 0.4137,
2914
+ "step": 2075
2915
+ },
2916
+ {
2917
+ "epoch": 2.8376534788540244,
2918
+ "grad_norm": 0.4909595295192369,
2919
+ "learning_rate": 3.581111802101221e-07,
2920
+ "loss": 0.4209,
2921
+ "step": 2080
2922
+ },
2923
+ {
2924
+ "epoch": 2.844474761255116,
2925
+ "grad_norm": 0.47984589511153064,
2926
+ "learning_rate": 3.451355376233975e-07,
2927
+ "loss": 0.4218,
2928
+ "step": 2085
2929
+ },
2930
+ {
2931
+ "epoch": 2.851296043656207,
2932
+ "grad_norm": 0.5105759387938567,
2933
+ "learning_rate": 3.327114414533018e-07,
2934
+ "loss": 0.4218,
2935
+ "step": 2090
2936
+ },
2937
+ {
2938
+ "epoch": 2.8581173260572985,
2939
+ "grad_norm": 0.49939209059879885,
2940
+ "learning_rate": 3.2083959416635723e-07,
2941
+ "loss": 0.4245,
2942
+ "step": 2095
2943
+ },
2944
+ {
2945
+ "epoch": 2.8649386084583903,
2946
+ "grad_norm": 0.47798891953751205,
2947
+ "learning_rate": 3.095206670045719e-07,
2948
+ "loss": 0.4231,
2949
+ "step": 2100
2950
+ },
2951
+ {
2952
+ "epoch": 2.8717598908594817,
2953
+ "grad_norm": 0.4945209972567527,
2954
+ "learning_rate": 2.9875529994749423e-07,
2955
+ "loss": 0.4266,
2956
+ "step": 2105
2957
+ },
2958
+ {
2959
+ "epoch": 2.878581173260573,
2960
+ "grad_norm": 0.4988874560151567,
2961
+ "learning_rate": 2.885441016760146e-07,
2962
+ "loss": 0.4231,
2963
+ "step": 2110
2964
+ },
2965
+ {
2966
+ "epoch": 2.8854024556616644,
2967
+ "grad_norm": 0.4911745856432128,
2968
+ "learning_rate": 2.788876495379644e-07,
2969
+ "loss": 0.416,
2970
+ "step": 2115
2971
+ },
2972
+ {
2973
+ "epoch": 2.892223738062756,
2974
+ "grad_norm": 0.48889972081861915,
2975
+ "learning_rate": 2.697864895154621e-07,
2976
+ "loss": 0.4194,
2977
+ "step": 2120
2978
+ },
2979
+ {
2980
+ "epoch": 2.899045020463847,
2981
+ "grad_norm": 0.4971330108179286,
2982
+ "learning_rate": 2.6124113619404796e-07,
2983
+ "loss": 0.4199,
2984
+ "step": 2125
2985
+ },
2986
+ {
2987
+ "epoch": 2.9058663028649385,
2988
+ "grad_norm": 0.48260941084833,
2989
+ "learning_rate": 2.532520727335875e-07,
2990
+ "loss": 0.4223,
2991
+ "step": 2130
2992
+ },
2993
+ {
2994
+ "epoch": 2.91268758526603,
2995
+ "grad_norm": 0.5219836641359544,
2996
+ "learning_rate": 2.4581975084095385e-07,
2997
+ "loss": 0.4236,
2998
+ "step": 2135
2999
+ },
3000
+ {
3001
+ "epoch": 2.9195088676671213,
3002
+ "grad_norm": 0.4823945964798486,
3003
+ "learning_rate": 2.3894459074448735e-07,
3004
+ "loss": 0.4276,
3005
+ "step": 2140
3006
+ },
3007
+ {
3008
+ "epoch": 2.926330150068213,
3009
+ "grad_norm": 0.48925917113417233,
3010
+ "learning_rate": 2.3262698117023728e-07,
3011
+ "loss": 0.4147,
3012
+ "step": 2145
3013
+ },
3014
+ {
3015
+ "epoch": 2.9331514324693044,
3016
+ "grad_norm": 0.5007654215500961,
3017
+ "learning_rate": 2.26867279319979e-07,
3018
+ "loss": 0.4307,
3019
+ "step": 2150
3020
+ },
3021
+ {
3022
+ "epoch": 2.939972714870396,
3023
+ "grad_norm": 0.5009786173466161,
3024
+ "learning_rate": 2.2166581085102458e-07,
3025
+ "loss": 0.4203,
3026
+ "step": 2155
3027
+ },
3028
+ {
3029
+ "epoch": 2.946793997271487,
3030
+ "grad_norm": 0.4791013138028541,
3031
+ "learning_rate": 2.170228698578013e-07,
3032
+ "loss": 0.4207,
3033
+ "step": 2160
3034
+ },
3035
+ {
3036
+ "epoch": 2.9536152796725785,
3037
+ "grad_norm": 0.48997170955684427,
3038
+ "learning_rate": 2.1293871885523088e-07,
3039
+ "loss": 0.4174,
3040
+ "step": 2165
3041
+ },
3042
+ {
3043
+ "epoch": 2.96043656207367,
3044
+ "grad_norm": 0.48195875107231834,
3045
+ "learning_rate": 2.0941358876388138e-07,
3046
+ "loss": 0.4194,
3047
+ "step": 2170
3048
+ },
3049
+ {
3050
+ "epoch": 2.9672578444747613,
3051
+ "grad_norm": 0.4938777138358861,
3052
+ "learning_rate": 2.06447678896913e-07,
3053
+ "loss": 0.4241,
3054
+ "step": 2175
3055
+ },
3056
+ {
3057
+ "epoch": 2.9740791268758526,
3058
+ "grad_norm": 0.4779185403122718,
3059
+ "learning_rate": 2.0404115694881084e-07,
3060
+ "loss": 0.4178,
3061
+ "step": 2180
3062
+ },
3063
+ {
3064
+ "epoch": 2.980900409276944,
3065
+ "grad_norm": 0.48454827866065714,
3066
+ "learning_rate": 2.0219415898589985e-07,
3067
+ "loss": 0.4162,
3068
+ "step": 2185
3069
+ },
3070
+ {
3071
+ "epoch": 2.9877216916780354,
3072
+ "grad_norm": 0.4877667234275044,
3073
+ "learning_rate": 2.009067894386505e-07,
3074
+ "loss": 0.4169,
3075
+ "step": 2190
3076
+ },
3077
+ {
3078
+ "epoch": 2.9945429740791267,
3079
+ "grad_norm": 0.4921925700400261,
3080
+ "learning_rate": 2.001791210957813e-07,
3081
+ "loss": 0.4192,
3082
+ "step": 2195
3083
+ },
+ {
+ "epoch": 3.0,
+ "step": 2199,
+ "total_flos": 250577921507328.0,
+ "train_loss": 0.49602713585984115,
+ "train_runtime": 3019.3082,
+ "train_samples_per_second": 93.134,
+ "train_steps_per_second": 0.728
 }
 ],
 "logging_steps": 5,
+ "max_steps": 2199,
 "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
 "save_steps": 100,
 "stateful_callbacks": {
 "TrainerControl": {

 "attributes": {}
 }
 },
+ "total_flos": 250577921507328.0,
 "train_batch_size": 16,
 "trial_name": null,
 "trial_params": null