rakhman-llm commited on
Commit
de92dc2
·
verified ·
1 Parent(s): 233217b

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef77a091d99ff91eba75355ced068d82754fe09197e0fe3fb0024d4681e880cd
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8551f34b08110ae0409b8440d3c6aebe368ed8d3db002ee5351bf9102cc149c5
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13d127c8a77b3a6f121a1e1766bf489c05dc85af0fb609f9d0474c53a6bbc073
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd2de65b175040f5da9800eea2d0dc2b6a4d304b142893b8d0191497fd8ada1
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:657e8f4565cffd7e4abf75f56b2fdcd3ae235671ae9b5c722c485957c12a53d9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d27d7df73326e1c0aee717403c0adf323847184edf4276ba1b5631d4a53dd69
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41e2a369098bdc7bbbac32b35b2de9650fe514b2c352f9f563c5554da15cddf2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e235e460f5bdfcaf65cc8c31e1e3b2c2350fb2d33d9a604b0cf64e4a8cef95de
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08207839727401733,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-11500",
4
- "epoch": 1.8399999999999999,
5
  "eval_steps": 500,
6
- "global_step": 11500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1801,6 +1801,84 @@
1801
  "eval_samples_per_second": 17.13,
1802
  "eval_steps_per_second": 2.141,
1803
  "step": 11500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1804
  }
1805
  ],
1806
  "logging_steps": 50,
@@ -1820,7 +1898,7 @@
1820
  "attributes": {}
1821
  }
1822
  },
1823
- "total_flos": 5.602412593152e+16,
1824
  "train_batch_size": 8,
1825
  "trial_name": null,
1826
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08194975554943085,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-12000",
4
+ "epoch": 1.92,
5
  "eval_steps": 500,
6
+ "global_step": 12000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1801
  "eval_samples_per_second": 17.13,
1802
  "eval_steps_per_second": 2.141,
1803
  "step": 11500
1804
+ },
1805
+ {
1806
+ "epoch": 1.8479999999999999,
1807
+ "grad_norm": 5619.83984375,
1808
+ "learning_rate": 1.152e-05,
1809
+ "loss": 0.0563,
1810
+ "step": 11550
1811
+ },
1812
+ {
1813
+ "epoch": 1.8559999999999999,
1814
+ "grad_norm": 92426.8046875,
1815
+ "learning_rate": 1.144e-05,
1816
+ "loss": 0.0588,
1817
+ "step": 11600
1818
+ },
1819
+ {
1820
+ "epoch": 1.8639999999999999,
1821
+ "grad_norm": 7583.005859375,
1822
+ "learning_rate": 1.136e-05,
1823
+ "loss": 0.0559,
1824
+ "step": 11650
1825
+ },
1826
+ {
1827
+ "epoch": 1.8719999999999999,
1828
+ "grad_norm": 6395.92578125,
1829
+ "learning_rate": 1.128e-05,
1830
+ "loss": 0.0552,
1831
+ "step": 11700
1832
+ },
1833
+ {
1834
+ "epoch": 1.88,
1835
+ "grad_norm": 9939.912109375,
1836
+ "learning_rate": 1.1200000000000001e-05,
1837
+ "loss": 0.0523,
1838
+ "step": 11750
1839
+ },
1840
+ {
1841
+ "epoch": 1.888,
1842
+ "grad_norm": 5679.93212890625,
1843
+ "learning_rate": 1.112e-05,
1844
+ "loss": 0.0585,
1845
+ "step": 11800
1846
+ },
1847
+ {
1848
+ "epoch": 1.896,
1849
+ "grad_norm": 6536.05419921875,
1850
+ "learning_rate": 1.104e-05,
1851
+ "loss": 0.0533,
1852
+ "step": 11850
1853
+ },
1854
+ {
1855
+ "epoch": 1.904,
1856
+ "grad_norm": 7333.63330078125,
1857
+ "learning_rate": 1.096e-05,
1858
+ "loss": 0.0566,
1859
+ "step": 11900
1860
+ },
1861
+ {
1862
+ "epoch": 1.912,
1863
+ "grad_norm": 7345.85009765625,
1864
+ "learning_rate": 1.0880000000000001e-05,
1865
+ "loss": 0.0555,
1866
+ "step": 11950
1867
+ },
1868
+ {
1869
+ "epoch": 1.92,
1870
+ "grad_norm": 21337.044921875,
1871
+ "learning_rate": 1.08e-05,
1872
+ "loss": 0.0576,
1873
+ "step": 12000
1874
+ },
1875
+ {
1876
+ "epoch": 1.92,
1877
+ "eval_loss": 0.08194975554943085,
1878
+ "eval_runtime": 116.8029,
1879
+ "eval_samples_per_second": 17.123,
1880
+ "eval_steps_per_second": 2.14,
1881
+ "step": 12000
1882
  }
1883
  ],
1884
  "logging_steps": 50,
 
1898
  "attributes": {}
1899
  }
1900
  },
1901
+ "total_flos": 5.845995749376e+16,
1902
  "train_batch_size": 8,
1903
  "trial_name": null,
1904
  "trial_params": null