rakhman-llm commited on
Commit
b30a9fd
·
verified ·
1 Parent(s): 2932733

Training in progress, step 13000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3440c751a156d2ea034af88bf09257dd0a13e2135cfbb097e39dfddddef310fd
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84ffb6564333bec2f290dbb25cc2aa16322f854baa8d4d551f3c98e898121f1a
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1457c08b9b3534e02e583e1c8e42d69598ca892990b039f5a31fe9230fe0935
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55950639d5a01737cdced1ff8ae565003480c956e54966fe29dbcc5ee832bbe0
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f0a30224aa29ff1d82dc265e86d4dfad17d2e9441b1e7410af0f89b7ac502d4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ede19b1f06f575564a160b0c17fcb5315d8660261b38069d03c83f6d06084b12
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b03c363c915c0dcfc4a0726bfa268bae9d4c39d40263aa6bcf80af31957091aa
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ed905996f8c375ddf1e71a02110476c5d9bb4d922dca340182e086437e4a3a1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
- "epoch": 2.0,
5
  "eval_steps": 500,
6
- "global_step": 12500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1957,6 +1957,84 @@
1957
  "eval_samples_per_second": 17.125,
1958
  "eval_steps_per_second": 2.141,
1959
  "step": 12500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1960
  }
1961
  ],
1962
  "logging_steps": 50,
@@ -1976,7 +2054,7 @@
1976
  "attributes": {}
1977
  }
1978
  },
1979
- "total_flos": 6.0895789056e+16,
1980
  "train_batch_size": 8,
1981
  "trial_name": null,
1982
  "trial_params": null
 
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
+ "epoch": 2.08,
5
  "eval_steps": 500,
6
+ "global_step": 13000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1957
  "eval_samples_per_second": 17.125,
1958
  "eval_steps_per_second": 2.141,
1959
  "step": 12500
1960
+ },
1961
+ {
1962
+ "epoch": 2.008,
1963
+ "grad_norm": 6336.53076171875,
1964
+ "learning_rate": 9.92e-06,
1965
+ "loss": 0.0466,
1966
+ "step": 12550
1967
+ },
1968
+ {
1969
+ "epoch": 2.016,
1970
+ "grad_norm": 4880.88330078125,
1971
+ "learning_rate": 9.84e-06,
1972
+ "loss": 0.0531,
1973
+ "step": 12600
1974
+ },
1975
+ {
1976
+ "epoch": 2.024,
1977
+ "grad_norm": 6478.1640625,
1978
+ "learning_rate": 9.76e-06,
1979
+ "loss": 0.0516,
1980
+ "step": 12650
1981
+ },
1982
+ {
1983
+ "epoch": 2.032,
1984
+ "grad_norm": 6105.318359375,
1985
+ "learning_rate": 9.68e-06,
1986
+ "loss": 0.0492,
1987
+ "step": 12700
1988
+ },
1989
+ {
1990
+ "epoch": 2.04,
1991
+ "grad_norm": 6270.1318359375,
1992
+ "learning_rate": 9.600000000000001e-06,
1993
+ "loss": 0.0511,
1994
+ "step": 12750
1995
+ },
1996
+ {
1997
+ "epoch": 2.048,
1998
+ "grad_norm": 5914.5458984375,
1999
+ "learning_rate": 9.52e-06,
2000
+ "loss": 0.0522,
2001
+ "step": 12800
2002
+ },
2003
+ {
2004
+ "epoch": 2.056,
2005
+ "grad_norm": 6194.03076171875,
2006
+ "learning_rate": 9.44e-06,
2007
+ "loss": 0.0535,
2008
+ "step": 12850
2009
+ },
2010
+ {
2011
+ "epoch": 2.064,
2012
+ "grad_norm": 7986.248046875,
2013
+ "learning_rate": 9.36e-06,
2014
+ "loss": 0.0529,
2015
+ "step": 12900
2016
+ },
2017
+ {
2018
+ "epoch": 2.072,
2019
+ "grad_norm": 10384.2099609375,
2020
+ "learning_rate": 9.280000000000001e-06,
2021
+ "loss": 0.0471,
2022
+ "step": 12950
2023
+ },
2024
+ {
2025
+ "epoch": 2.08,
2026
+ "grad_norm": 8849.5703125,
2027
+ "learning_rate": 9.2e-06,
2028
+ "loss": 0.0502,
2029
+ "step": 13000
2030
+ },
2031
+ {
2032
+ "epoch": 2.08,
2033
+ "eval_loss": 0.08202869445085526,
2034
+ "eval_runtime": 117.0019,
2035
+ "eval_samples_per_second": 17.094,
2036
+ "eval_steps_per_second": 2.137,
2037
+ "step": 13000
2038
  }
2039
  ],
2040
  "logging_steps": 50,
 
2054
  "attributes": {}
2055
  }
2056
  },
2057
+ "total_flos": 6.333162061824e+16,
2058
  "train_batch_size": 8,
2059
  "trial_name": null,
2060
  "trial_params": null