rakhman-llm commited on
Commit
ce46b78
·
verified ·
1 Parent(s): c3f9363

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:935c0c9e6c9bad61c25203243a2dced5424fa2c750c3b63d12e6a9555bd1d414
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19f937f8605011a6423fdcd193fb32164c380c1076f0eed9d33dcc3767b4df87
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b6424579b540963ca5dff78ef0b1bc07be8ee502bd08a007f4baf7bfc0962f2
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ffcdfd14da9f025b2c66271aa5f608d4bbfd5d88721668100d16d60c271d53f
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:994ecdd24463592bb7a469b126032a8d12a6fc0731753b561060d5f56a11ebf4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:831ec110a79680c26e00dbf10946e10090e2d0fde59d1cc953456efe8ddb0df7
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86f6fa01b61cfbdf0a99f96e2c8520500066cb2276b7fc4c693828106716e38a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c460047761054bd92ad62049610941db058ccd6f71ff714487704282f9a72984
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08466340601444244,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-4000",
4
- "epoch": 0.64,
5
  "eval_steps": 500,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -81,9 +81,9 @@
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
- "eval_runtime": 116.7651,
85
- "eval_samples_per_second": 17.128,
86
- "eval_steps_per_second": 2.141,
87
  "step": 500
88
  },
89
  {
@@ -159,9 +159,9 @@
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
- "eval_runtime": 116.7407,
163
- "eval_samples_per_second": 17.132,
164
- "eval_steps_per_second": 2.141,
165
  "step": 1000
166
  },
167
  {
@@ -237,9 +237,9 @@
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
- "eval_runtime": 116.8722,
241
- "eval_samples_per_second": 17.113,
242
- "eval_steps_per_second": 2.139,
243
  "step": 1500
244
  },
245
  {
@@ -315,9 +315,9 @@
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
- "eval_runtime": 116.8362,
319
- "eval_samples_per_second": 17.118,
320
- "eval_steps_per_second": 2.14,
321
  "step": 2000
322
  },
323
  {
@@ -393,9 +393,9 @@
393
  {
394
  "epoch": 0.4,
395
  "eval_loss": 0.08615937829017639,
396
- "eval_runtime": 116.9591,
397
- "eval_samples_per_second": 17.1,
398
- "eval_steps_per_second": 2.137,
399
  "step": 2500
400
  },
401
  {
@@ -471,9 +471,9 @@
471
  {
472
  "epoch": 0.48,
473
  "eval_loss": 0.08551913499832153,
474
- "eval_runtime": 116.545,
475
- "eval_samples_per_second": 17.161,
476
- "eval_steps_per_second": 2.145,
477
  "step": 3000
478
  },
479
  {
@@ -549,9 +549,9 @@
549
  {
550
  "epoch": 0.56,
551
  "eval_loss": 0.08540560305118561,
552
- "eval_runtime": 116.9131,
553
- "eval_samples_per_second": 17.107,
554
- "eval_steps_per_second": 2.138,
555
  "step": 3500
556
  },
557
  {
@@ -627,10 +627,88 @@
627
  {
628
  "epoch": 0.64,
629
  "eval_loss": 0.08466340601444244,
630
- "eval_runtime": 116.6411,
631
- "eval_samples_per_second": 17.147,
632
- "eval_steps_per_second": 2.143,
633
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  }
635
  ],
636
  "logging_steps": 50,
@@ -650,7 +728,7 @@
650
  "attributes": {}
651
  }
652
  },
653
- "total_flos": 1.948665249792e+16,
654
  "train_batch_size": 8,
655
  "trial_name": null,
656
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.0842796117067337,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-4500",
4
+ "epoch": 0.72,
5
  "eval_steps": 500,
6
+ "global_step": 4500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
+ "eval_runtime": 109.274,
85
+ "eval_samples_per_second": 18.303,
86
+ "eval_steps_per_second": 2.288,
87
  "step": 500
88
  },
89
  {
 
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
+ "eval_runtime": 109.2536,
163
+ "eval_samples_per_second": 18.306,
164
+ "eval_steps_per_second": 2.288,
165
  "step": 1000
166
  },
167
  {
 
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
+ "eval_runtime": 109.2355,
241
+ "eval_samples_per_second": 18.309,
242
+ "eval_steps_per_second": 2.289,
243
  "step": 1500
244
  },
245
  {
 
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
+ "eval_runtime": 109.4348,
319
+ "eval_samples_per_second": 18.276,
320
+ "eval_steps_per_second": 2.284,
321
  "step": 2000
322
  },
323
  {
 
393
  {
394
  "epoch": 0.4,
395
  "eval_loss": 0.08615937829017639,
396
+ "eval_runtime": 109.2621,
397
+ "eval_samples_per_second": 18.305,
398
+ "eval_steps_per_second": 2.288,
399
  "step": 2500
400
  },
401
  {
 
471
  {
472
  "epoch": 0.48,
473
  "eval_loss": 0.08551913499832153,
474
+ "eval_runtime": 109.2626,
475
+ "eval_samples_per_second": 18.305,
476
+ "eval_steps_per_second": 2.288,
477
  "step": 3000
478
  },
479
  {
 
549
  {
550
  "epoch": 0.56,
551
  "eval_loss": 0.08540560305118561,
552
+ "eval_runtime": 109.3641,
553
+ "eval_samples_per_second": 18.288,
554
+ "eval_steps_per_second": 2.286,
555
  "step": 3500
556
  },
557
  {
 
627
  {
628
  "epoch": 0.64,
629
  "eval_loss": 0.08466340601444244,
630
+ "eval_runtime": 109.2066,
631
+ "eval_samples_per_second": 18.314,
632
+ "eval_steps_per_second": 2.289,
633
  "step": 4000
634
+ },
635
+ {
636
+ "epoch": 0.648,
637
+ "grad_norm": 7243.01611328125,
638
+ "learning_rate": 2.3520000000000002e-05,
639
+ "loss": 0.0622,
640
+ "step": 4050
641
+ },
642
+ {
643
+ "epoch": 0.656,
644
+ "grad_norm": 7986.32568359375,
645
+ "learning_rate": 2.344e-05,
646
+ "loss": 0.0678,
647
+ "step": 4100
648
+ },
649
+ {
650
+ "epoch": 0.664,
651
+ "grad_norm": 9114.8974609375,
652
+ "learning_rate": 2.336e-05,
653
+ "loss": 0.0671,
654
+ "step": 4150
655
+ },
656
+ {
657
+ "epoch": 0.672,
658
+ "grad_norm": 8830.62109375,
659
+ "learning_rate": 2.328e-05,
660
+ "loss": 0.0679,
661
+ "step": 4200
662
+ },
663
+ {
664
+ "epoch": 0.68,
665
+ "grad_norm": 9311.2412109375,
666
+ "learning_rate": 2.32e-05,
667
+ "loss": 0.063,
668
+ "step": 4250
669
+ },
670
+ {
671
+ "epoch": 0.688,
672
+ "grad_norm": 31307.103515625,
673
+ "learning_rate": 2.3120000000000002e-05,
674
+ "loss": 0.0649,
675
+ "step": 4300
676
+ },
677
+ {
678
+ "epoch": 0.696,
679
+ "grad_norm": 9040.0126953125,
680
+ "learning_rate": 2.304e-05,
681
+ "loss": 0.0633,
682
+ "step": 4350
683
+ },
684
+ {
685
+ "epoch": 0.704,
686
+ "grad_norm": 7183.91650390625,
687
+ "learning_rate": 2.296e-05,
688
+ "loss": 0.0582,
689
+ "step": 4400
690
+ },
691
+ {
692
+ "epoch": 0.712,
693
+ "grad_norm": 6460.2998046875,
694
+ "learning_rate": 2.288e-05,
695
+ "loss": 0.0672,
696
+ "step": 4450
697
+ },
698
+ {
699
+ "epoch": 0.72,
700
+ "grad_norm": 6104.8671875,
701
+ "learning_rate": 2.2800000000000002e-05,
702
+ "loss": 0.0597,
703
+ "step": 4500
704
+ },
705
+ {
706
+ "epoch": 0.72,
707
+ "eval_loss": 0.0842796117067337,
708
+ "eval_runtime": 109.48,
709
+ "eval_samples_per_second": 18.268,
710
+ "eval_steps_per_second": 2.284,
711
+ "step": 4500
712
  }
713
  ],
714
  "logging_steps": 50,
 
728
  "attributes": {}
729
  }
730
  },
731
+ "total_flos": 2.192248406016e+16,
732
  "train_batch_size": 8,
733
  "trial_name": null,
734
  "trial_params": null