rakhman-llm committed · verified
Commit d903cab · 1 Parent(s): 33b004f

Training in progress, step 2000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a00b4752e5400013d03bdc3543638a739c5809565856c2e1fd067bc65223c01
+oid sha256:a43735b9df4b283992c9c4d45bee75fc6285185a63fdf5ab0ca479a5b3695e19
 size 891558696
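The pointer above only records a SHA-256 object ID and a byte size; the actual weights live in LFS storage. A minimal sketch for checking that a locally downloaded `model.safetensors` matches the new oid in this commit (the local path is an assumption, not part of the commit):

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 so the ~890 MB checkpoint never sits fully in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Assumed local download location; adjust to wherever the checkpoint was pulled.
weights = Path("last-checkpoint/model.safetensors")
expected = "a43735b9df4b283992c9c4d45bee75fc6285185a63fdf5ab0ca479a5b3695e19"  # new oid from this commit

actual = sha256_of(weights)
print("OK" if actual == expected else f"Mismatch: {actual}")
```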
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f45ec9bcb28283b1fb0e751097db73046047365cbfc45350295ba1f4757128a0
+oid sha256:bd5fe4f268d800b98de0ea4749a2562de05eb7ee03e087f4d9014c5c565df9cc
 size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ba4dabbcdc1ce4a55a172ce7079644c4c8c4a6b7506dc54cdcd4f9b26f6f954
+oid sha256:99f4372161e4aa7620062bb4cf560b7c8a1c037ae87066fa4a0856792df3e527
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d08a32404911dd432312864883a3240b833c1da247016f7258f9c59a4d0754c1
+oid sha256:4de77d65b6ff9759de02ce30e6b375e61b559a45ccc8d168326bc1d5f160b8b2
 size 1064
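`optimizer.pt`, `scheduler.pt`, and `rng_state.pth` are ordinary `torch.save` payloads, so their contents can be inspected directly once the checkpoint is pulled locally. A rough sketch, assuming the usual layout of these files as written by the Hugging Face `Trainer` (the path and the dict keys are assumptions about that layout, not something verified against this run):

```python
import torch

ckpt_dir = "last-checkpoint"  # assumed local path to the pulled checkpoint

# weights_only=False is needed on newer PyTorch versions, where torch.load
# defaults to weights-only deserialization and would reject these pickles.
load = lambda name: torch.load(f"{ckpt_dir}/{name}", map_location="cpu", weights_only=False)

# Optimizer state: param groups (lr, betas, weight decay) plus per-parameter moments.
optim_state = load("optimizer.pt")
print(optim_state["param_groups"][0].get("lr"))

# LR scheduler state: enough to restore the schedule exactly at step 2000.
sched_state = load("scheduler.pt")
print(sched_state)

# RNG state: generator states saved so that resumption is reproducible.
rng_state = load("rng_state.pth")
print(list(rng_state) if isinstance(rng_state, dict) else type(rng_state))
```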
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 0.08808805048465729,
-  "best_model_checkpoint": "./fine-tuned/checkpoint-1500",
-  "epoch": 0.24,
+  "best_metric": 0.08704760670661926,
+  "best_model_checkpoint": "./fine-tuned/checkpoint-2000",
+  "epoch": 0.32,
   "eval_steps": 500,
-  "global_step": 1500,
+  "global_step": 2000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -81,9 +81,9 @@
     {
       "epoch": 0.08,
       "eval_loss": 0.09235642850399017,
-      "eval_runtime": 116.7651,
-      "eval_samples_per_second": 17.128,
-      "eval_steps_per_second": 2.141,
+      "eval_runtime": 109.274,
+      "eval_samples_per_second": 18.303,
+      "eval_steps_per_second": 2.288,
       "step": 500
     },
     {
@@ -159,9 +159,9 @@
     {
       "epoch": 0.16,
       "eval_loss": 0.08949962258338928,
-      "eval_runtime": 116.7407,
-      "eval_samples_per_second": 17.132,
-      "eval_steps_per_second": 2.141,
+      "eval_runtime": 109.2536,
+      "eval_samples_per_second": 18.306,
+      "eval_steps_per_second": 2.288,
       "step": 1000
     },
     {
@@ -237,10 +237,88 @@
     {
       "epoch": 0.24,
       "eval_loss": 0.08808805048465729,
-      "eval_runtime": 116.8722,
-      "eval_samples_per_second": 17.113,
-      "eval_steps_per_second": 2.139,
+      "eval_runtime": 109.2355,
+      "eval_samples_per_second": 18.309,
+      "eval_steps_per_second": 2.289,
       "step": 1500
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 6751.6494140625,
+      "learning_rate": 2.752e-05,
+      "loss": 0.0781,
+      "step": 1550
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 5040.9033203125,
+      "learning_rate": 2.7439999999999998e-05,
+      "loss": 0.0686,
+      "step": 1600
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 8748.07421875,
+      "learning_rate": 2.7360000000000002e-05,
+      "loss": 0.0689,
+      "step": 1650
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 5971.705078125,
+      "learning_rate": 2.728e-05,
+      "loss": 0.0671,
+      "step": 1700
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 10833.1357421875,
+      "learning_rate": 2.72e-05,
+      "loss": 0.0734,
+      "step": 1750
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 10036.919921875,
+      "learning_rate": 2.712e-05,
+      "loss": 0.0715,
+      "step": 1800
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 7755.1669921875,
+      "learning_rate": 2.704e-05,
+      "loss": 0.0669,
+      "step": 1850
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 7584.822265625,
+      "learning_rate": 2.696e-05,
+      "loss": 0.0699,
+      "step": 1900
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 10103.142578125,
+      "learning_rate": 2.688e-05,
+      "loss": 0.07,
+      "step": 1950
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 5768.24267578125,
+      "learning_rate": 2.68e-05,
+      "loss": 0.0709,
+      "step": 2000
+    },
+    {
+      "epoch": 0.32,
+      "eval_loss": 0.08704760670661926,
+      "eval_runtime": 109.4348,
+      "eval_samples_per_second": 18.276,
+      "eval_steps_per_second": 2.284,
+      "step": 2000
     }
   ],
   "logging_steps": 50,
@@ -260,7 +338,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7307494686720000.0,
+  "total_flos": 9743326248960000.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null