rakhman-llm commited on
Commit
bc6c2a9
·
verified ·
1 Parent(s): 213fae5

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a43735b9df4b283992c9c4d45bee75fc6285185a63fdf5ab0ca479a5b3695e19
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d253ddcfd4df2acb84a6afc37ea897625fd4823975484b2722f1d732a807a42
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd5fe4f268d800b98de0ea4749a2562de05eb7ee03e087f4d9014c5c565df9cc
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80072666b3be47f53f317ccfe6322071cfa6706038da8de25f13ce6816f3172e
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99f4372161e4aa7620062bb4cf560b7c8a1c037ae87066fa4a0856792df3e527
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2667454d6cffb59aea7917c33b6757bc9e126470ce377cf6d3d3b895b28bce
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4de77d65b6ff9759de02ce30e6b375e61b559a45ccc8d168326bc1d5f160b8b2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9e7d6d8721d0efdf284e8ea86037aff29ca12a45581cd9384185af5e287f2cf
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08704760670661926,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-2000",
4
- "epoch": 0.32,
5
  "eval_steps": 500,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -81,9 +81,9 @@
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
- "eval_runtime": 116.7651,
85
- "eval_samples_per_second": 17.128,
86
- "eval_steps_per_second": 2.141,
87
  "step": 500
88
  },
89
  {
@@ -159,9 +159,9 @@
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
- "eval_runtime": 116.7407,
163
- "eval_samples_per_second": 17.132,
164
- "eval_steps_per_second": 2.141,
165
  "step": 1000
166
  },
167
  {
@@ -237,9 +237,9 @@
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
- "eval_runtime": 116.8722,
241
- "eval_samples_per_second": 17.113,
242
- "eval_steps_per_second": 2.139,
243
  "step": 1500
244
  },
245
  {
@@ -315,10 +315,88 @@
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
- "eval_runtime": 116.8362,
319
- "eval_samples_per_second": 17.118,
320
- "eval_steps_per_second": 2.14,
321
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  }
323
  ],
324
  "logging_steps": 50,
@@ -338,7 +416,7 @@
338
  "attributes": {}
339
  }
340
  },
341
- "total_flos": 9743326248960000.0,
342
  "train_batch_size": 8,
343
  "trial_name": null,
344
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08615937829017639,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-2500",
4
+ "epoch": 0.4,
5
  "eval_steps": 500,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
+ "eval_runtime": 109.274,
85
+ "eval_samples_per_second": 18.303,
86
+ "eval_steps_per_second": 2.288,
87
  "step": 500
88
  },
89
  {
 
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
+ "eval_runtime": 109.2536,
163
+ "eval_samples_per_second": 18.306,
164
+ "eval_steps_per_second": 2.288,
165
  "step": 1000
166
  },
167
  {
 
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
+ "eval_runtime": 109.2355,
241
+ "eval_samples_per_second": 18.309,
242
+ "eval_steps_per_second": 2.289,
243
  "step": 1500
244
  },
245
  {
 
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
+ "eval_runtime": 109.4348,
319
+ "eval_samples_per_second": 18.276,
320
+ "eval_steps_per_second": 2.284,
321
  "step": 2000
322
+ },
323
+ {
324
+ "epoch": 0.328,
325
+ "grad_norm": 6016.46826171875,
326
+ "learning_rate": 2.672e-05,
327
+ "loss": 0.0663,
328
+ "step": 2050
329
+ },
330
+ {
331
+ "epoch": 0.336,
332
+ "grad_norm": 6869.53076171875,
333
+ "learning_rate": 2.6640000000000002e-05,
334
+ "loss": 0.073,
335
+ "step": 2100
336
+ },
337
+ {
338
+ "epoch": 0.344,
339
+ "grad_norm": 6099.595703125,
340
+ "learning_rate": 2.656e-05,
341
+ "loss": 0.0667,
342
+ "step": 2150
343
+ },
344
+ {
345
+ "epoch": 0.352,
346
+ "grad_norm": 6923.919921875,
347
+ "learning_rate": 2.648e-05,
348
+ "loss": 0.0653,
349
+ "step": 2200
350
+ },
351
+ {
352
+ "epoch": 0.36,
353
+ "grad_norm": 8005.85595703125,
354
+ "learning_rate": 2.64e-05,
355
+ "loss": 0.0685,
356
+ "step": 2250
357
+ },
358
+ {
359
+ "epoch": 0.368,
360
+ "grad_norm": 6473.466796875,
361
+ "learning_rate": 2.632e-05,
362
+ "loss": 0.0678,
363
+ "step": 2300
364
+ },
365
+ {
366
+ "epoch": 0.376,
367
+ "grad_norm": 7177.6328125,
368
+ "learning_rate": 2.6240000000000003e-05,
369
+ "loss": 0.0637,
370
+ "step": 2350
371
+ },
372
+ {
373
+ "epoch": 0.384,
374
+ "grad_norm": 5574.75439453125,
375
+ "learning_rate": 2.616e-05,
376
+ "loss": 0.0698,
377
+ "step": 2400
378
+ },
379
+ {
380
+ "epoch": 0.392,
381
+ "grad_norm": 6910.39599609375,
382
+ "learning_rate": 2.608e-05,
383
+ "loss": 0.0645,
384
+ "step": 2450
385
+ },
386
+ {
387
+ "epoch": 0.4,
388
+ "grad_norm": 5913.9775390625,
389
+ "learning_rate": 2.6000000000000002e-05,
390
+ "loss": 0.068,
391
+ "step": 2500
392
+ },
393
+ {
394
+ "epoch": 0.4,
395
+ "eval_loss": 0.08615937829017639,
396
+ "eval_runtime": 109.2621,
397
+ "eval_samples_per_second": 18.305,
398
+ "eval_steps_per_second": 2.288,
399
+ "step": 2500
400
  }
401
  ],
402
  "logging_steps": 50,
 
416
  "attributes": {}
417
  }
418
  },
419
+ "total_flos": 1.21791578112e+16,
420
  "train_batch_size": 8,
421
  "trial_name": null,
422
  "trial_params": null