rakhman-llm committed on
Commit
d514e20
·
verified ·
1 Parent(s): 542ae9b

Training in progress, step 5500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11fbea9669742ba3ea39c635a607bbe423218198e460e6764bf73c6aab4c240d
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69d1609aa46c5839ce2f0e40574d9fbde5dd0b26b3c6b24672dc874df7022413
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbb7a8dea04c89ec3fc490bba9edeb6cf633b3c3f918dfcb1a7b5c8e0283a4a1
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3ba4184275341c1f83f8c7ea5cc044fa98a0d37f2482c3badd6c3656af1348b
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d740a2b255005c6e7a6893364a7dc3b77e12bd1c4d5968595e98dca1e18092e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98772bb230331f5cccf45e7b586be3da4127f0fa073760fa05315192ae7dcfa0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cc549119fdb555619567d1cb3e1f53a8d128bcc831c85f577b9c54b5021dbcc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2b96925e850cd8ef32abae6516af5134c66eaecf4e61ddbdb73aaa1b5b6a670
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08396206796169281,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-5000",
4
- "epoch": 0.8,
5
  "eval_steps": 500,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -81,9 +81,9 @@
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
- "eval_runtime": 116.7651,
85
- "eval_samples_per_second": 17.128,
86
- "eval_steps_per_second": 2.141,
87
  "step": 500
88
  },
89
  {
@@ -159,9 +159,9 @@
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
- "eval_runtime": 116.7407,
163
- "eval_samples_per_second": 17.132,
164
- "eval_steps_per_second": 2.141,
165
  "step": 1000
166
  },
167
  {
@@ -237,9 +237,9 @@
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
- "eval_runtime": 116.8722,
241
- "eval_samples_per_second": 17.113,
242
- "eval_steps_per_second": 2.139,
243
  "step": 1500
244
  },
245
  {
@@ -315,9 +315,9 @@
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
- "eval_runtime": 116.8362,
319
- "eval_samples_per_second": 17.118,
320
- "eval_steps_per_second": 2.14,
321
  "step": 2000
322
  },
323
  {
@@ -393,9 +393,9 @@
393
  {
394
  "epoch": 0.4,
395
  "eval_loss": 0.08615937829017639,
396
- "eval_runtime": 116.9591,
397
- "eval_samples_per_second": 17.1,
398
- "eval_steps_per_second": 2.137,
399
  "step": 2500
400
  },
401
  {
@@ -471,9 +471,9 @@
471
  {
472
  "epoch": 0.48,
473
  "eval_loss": 0.08551913499832153,
474
- "eval_runtime": 116.545,
475
- "eval_samples_per_second": 17.161,
476
- "eval_steps_per_second": 2.145,
477
  "step": 3000
478
  },
479
  {
@@ -549,9 +549,9 @@
549
  {
550
  "epoch": 0.56,
551
  "eval_loss": 0.08540560305118561,
552
- "eval_runtime": 116.9131,
553
- "eval_samples_per_second": 17.107,
554
- "eval_steps_per_second": 2.138,
555
  "step": 3500
556
  },
557
  {
@@ -627,9 +627,9 @@
627
  {
628
  "epoch": 0.64,
629
  "eval_loss": 0.08466340601444244,
630
- "eval_runtime": 116.6411,
631
- "eval_samples_per_second": 17.147,
632
- "eval_steps_per_second": 2.143,
633
  "step": 4000
634
  },
635
  {
@@ -705,9 +705,9 @@
705
  {
706
  "epoch": 0.72,
707
  "eval_loss": 0.0842796117067337,
708
- "eval_runtime": 116.9361,
709
- "eval_samples_per_second": 17.103,
710
- "eval_steps_per_second": 2.138,
711
  "step": 4500
712
  },
713
  {
@@ -783,10 +783,88 @@
783
  {
784
  "epoch": 0.8,
785
  "eval_loss": 0.08396206796169281,
786
- "eval_runtime": 116.8224,
787
- "eval_samples_per_second": 17.12,
788
- "eval_steps_per_second": 2.14,
789
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790
  }
791
  ],
792
  "logging_steps": 50,
@@ -806,7 +884,7 @@
806
  "attributes": {}
807
  }
808
  },
809
- "total_flos": 2.43583156224e+16,
810
  "train_batch_size": 8,
811
  "trial_name": null,
812
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08360794186592102,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-5500",
4
+ "epoch": 0.88,
5
  "eval_steps": 500,
6
+ "global_step": 5500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
+ "eval_runtime": 109.274,
85
+ "eval_samples_per_second": 18.303,
86
+ "eval_steps_per_second": 2.288,
87
  "step": 500
88
  },
89
  {
 
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
+ "eval_runtime": 109.2536,
163
+ "eval_samples_per_second": 18.306,
164
+ "eval_steps_per_second": 2.288,
165
  "step": 1000
166
  },
167
  {
 
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
+ "eval_runtime": 109.2355,
241
+ "eval_samples_per_second": 18.309,
242
+ "eval_steps_per_second": 2.289,
243
  "step": 1500
244
  },
245
  {
 
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
+ "eval_runtime": 109.4348,
319
+ "eval_samples_per_second": 18.276,
320
+ "eval_steps_per_second": 2.284,
321
  "step": 2000
322
  },
323
  {
 
393
  {
394
  "epoch": 0.4,
395
  "eval_loss": 0.08615937829017639,
396
+ "eval_runtime": 109.2621,
397
+ "eval_samples_per_second": 18.305,
398
+ "eval_steps_per_second": 2.288,
399
  "step": 2500
400
  },
401
  {
 
471
  {
472
  "epoch": 0.48,
473
  "eval_loss": 0.08551913499832153,
474
+ "eval_runtime": 109.2626,
475
+ "eval_samples_per_second": 18.305,
476
+ "eval_steps_per_second": 2.288,
477
  "step": 3000
478
  },
479
  {
 
549
  {
550
  "epoch": 0.56,
551
  "eval_loss": 0.08540560305118561,
552
+ "eval_runtime": 109.3641,
553
+ "eval_samples_per_second": 18.288,
554
+ "eval_steps_per_second": 2.286,
555
  "step": 3500
556
  },
557
  {
 
627
  {
628
  "epoch": 0.64,
629
  "eval_loss": 0.08466340601444244,
630
+ "eval_runtime": 109.2066,
631
+ "eval_samples_per_second": 18.314,
632
+ "eval_steps_per_second": 2.289,
633
  "step": 4000
634
  },
635
  {
 
705
  {
706
  "epoch": 0.72,
707
  "eval_loss": 0.0842796117067337,
708
+ "eval_runtime": 109.48,
709
+ "eval_samples_per_second": 18.268,
710
+ "eval_steps_per_second": 2.284,
711
  "step": 4500
712
  },
713
  {
 
783
  {
784
  "epoch": 0.8,
785
  "eval_loss": 0.08396206796169281,
786
+ "eval_runtime": 109.1825,
787
+ "eval_samples_per_second": 18.318,
788
+ "eval_steps_per_second": 2.29,
789
  "step": 5000
790
+ },
791
+ {
792
+ "epoch": 0.808,
793
+ "grad_norm": 6047.43359375,
794
+ "learning_rate": 2.192e-05,
795
+ "loss": 0.0673,
796
+ "step": 5050
797
+ },
798
+ {
799
+ "epoch": 0.816,
800
+ "grad_norm": 6286.21484375,
801
+ "learning_rate": 2.184e-05,
802
+ "loss": 0.0609,
803
+ "step": 5100
804
+ },
805
+ {
806
+ "epoch": 0.824,
807
+ "grad_norm": 6187.03369140625,
808
+ "learning_rate": 2.1760000000000002e-05,
809
+ "loss": 0.0628,
810
+ "step": 5150
811
+ },
812
+ {
813
+ "epoch": 0.832,
814
+ "grad_norm": 4476.73095703125,
815
+ "learning_rate": 2.1680000000000002e-05,
816
+ "loss": 0.0626,
817
+ "step": 5200
818
+ },
819
+ {
820
+ "epoch": 0.84,
821
+ "grad_norm": 6180.27490234375,
822
+ "learning_rate": 2.16e-05,
823
+ "loss": 0.061,
824
+ "step": 5250
825
+ },
826
+ {
827
+ "epoch": 0.848,
828
+ "grad_norm": 8477.626953125,
829
+ "learning_rate": 2.152e-05,
830
+ "loss": 0.0638,
831
+ "step": 5300
832
+ },
833
+ {
834
+ "epoch": 0.856,
835
+ "grad_norm": 11541.119140625,
836
+ "learning_rate": 2.144e-05,
837
+ "loss": 0.0602,
838
+ "step": 5350
839
+ },
840
+ {
841
+ "epoch": 0.864,
842
+ "grad_norm": 6183.49609375,
843
+ "learning_rate": 2.136e-05,
844
+ "loss": 0.0645,
845
+ "step": 5400
846
+ },
847
+ {
848
+ "epoch": 0.872,
849
+ "grad_norm": 7597.5810546875,
850
+ "learning_rate": 2.1280000000000003e-05,
851
+ "loss": 0.067,
852
+ "step": 5450
853
+ },
854
+ {
855
+ "epoch": 0.88,
856
+ "grad_norm": 8438.478515625,
857
+ "learning_rate": 2.12e-05,
858
+ "loss": 0.0628,
859
+ "step": 5500
860
+ },
861
+ {
862
+ "epoch": 0.88,
863
+ "eval_loss": 0.08360794186592102,
864
+ "eval_runtime": 109.3353,
865
+ "eval_samples_per_second": 18.292,
866
+ "eval_steps_per_second": 2.287,
867
+ "step": 5500
868
  }
869
  ],
870
  "logging_steps": 50,
 
884
  "attributes": {}
885
  }
886
  },
887
+ "total_flos": 2.679414718464e+16,
888
  "train_batch_size": 8,
889
  "trial_name": null,
890
  "trial_params": null