rakhman-llm commited on
Commit
e94bbae
·
verified ·
1 Parent(s): 5790567

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b65f0032ace99ac31a5c4eaebab678922f9196855254a04fa04cd5b09838cd25
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a00b4752e5400013d03bdc3543638a739c5809565856c2e1fd067bc65223c01
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9542f526fbcdae270a597291ca95babaa9f3b9e2d72ac373956f44fc941c5cd5
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f45ec9bcb28283b1fb0e751097db73046047365cbfc45350295ba1f4757128a0
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c88b9beda061d1039d8d779edfe7f5a8ca6add672a561cd18d21ff3443068c38
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ba4dabbcdc1ce4a55a172ce7079644c4c8c4a6b7506dc54cdcd4f9b26f6f954
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:635e57a0f4914cbbb1c670a34423e0edc1f884281d003185692f1319d135cdd1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d08a32404911dd432312864883a3240b833c1da247016f7258f9c59a4d0754c1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08949962258338928,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-1000",
4
- "epoch": 0.16,
5
  "eval_steps": 500,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -81,9 +81,9 @@
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
- "eval_runtime": 116.7651,
85
- "eval_samples_per_second": 17.128,
86
- "eval_steps_per_second": 2.141,
87
  "step": 500
88
  },
89
  {
@@ -159,10 +159,88 @@
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
- "eval_runtime": 116.7407,
163
- "eval_samples_per_second": 17.132,
164
- "eval_steps_per_second": 2.141,
165
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  }
167
  ],
168
  "logging_steps": 50,
@@ -182,7 +260,7 @@
182
  "attributes": {}
183
  }
184
  },
185
- "total_flos": 4871663124480000.0,
186
  "train_batch_size": 8,
187
  "trial_name": null,
188
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08808805048465729,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-1500",
4
+ "epoch": 0.24,
5
  "eval_steps": 500,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
+ "eval_runtime": 109.274,
85
+ "eval_samples_per_second": 18.303,
86
+ "eval_steps_per_second": 2.288,
87
  "step": 500
88
  },
89
  {
 
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
+ "eval_runtime": 109.2536,
163
+ "eval_samples_per_second": 18.306,
164
+ "eval_steps_per_second": 2.288,
165
  "step": 1000
166
+ },
167
+ {
168
+ "epoch": 0.168,
169
+ "grad_norm": 8266.658203125,
170
+ "learning_rate": 2.832e-05,
171
+ "loss": 0.0767,
172
+ "step": 1050
173
+ },
174
+ {
175
+ "epoch": 0.176,
176
+ "grad_norm": 6160.548828125,
177
+ "learning_rate": 2.824e-05,
178
+ "loss": 0.067,
179
+ "step": 1100
180
+ },
181
+ {
182
+ "epoch": 0.184,
183
+ "grad_norm": 7343.408203125,
184
+ "learning_rate": 2.816e-05,
185
+ "loss": 0.0717,
186
+ "step": 1150
187
+ },
188
+ {
189
+ "epoch": 0.192,
190
+ "grad_norm": 5661.76318359375,
191
+ "learning_rate": 2.8080000000000002e-05,
192
+ "loss": 0.0733,
193
+ "step": 1200
194
+ },
195
+ {
196
+ "epoch": 0.2,
197
+ "grad_norm": 8678.46484375,
198
+ "learning_rate": 2.8e-05,
199
+ "loss": 0.0737,
200
+ "step": 1250
201
+ },
202
+ {
203
+ "epoch": 0.208,
204
+ "grad_norm": 6331.21533203125,
205
+ "learning_rate": 2.792e-05,
206
+ "loss": 0.0696,
207
+ "step": 1300
208
+ },
209
+ {
210
+ "epoch": 0.216,
211
+ "grad_norm": 10563.5400390625,
212
+ "learning_rate": 2.784e-05,
213
+ "loss": 0.0747,
214
+ "step": 1350
215
+ },
216
+ {
217
+ "epoch": 0.224,
218
+ "grad_norm": 7221.74365234375,
219
+ "learning_rate": 2.7760000000000002e-05,
220
+ "loss": 0.0716,
221
+ "step": 1400
222
+ },
223
+ {
224
+ "epoch": 0.232,
225
+ "grad_norm": 6486.46142578125,
226
+ "learning_rate": 2.768e-05,
227
+ "loss": 0.0711,
228
+ "step": 1450
229
+ },
230
+ {
231
+ "epoch": 0.24,
232
+ "grad_norm": 6838.505859375,
233
+ "learning_rate": 2.7600000000000003e-05,
234
+ "loss": 0.0703,
235
+ "step": 1500
236
+ },
237
+ {
238
+ "epoch": 0.24,
239
+ "eval_loss": 0.08808805048465729,
240
+ "eval_runtime": 109.2355,
241
+ "eval_samples_per_second": 18.309,
242
+ "eval_steps_per_second": 2.289,
243
+ "step": 1500
244
  }
245
  ],
246
  "logging_steps": 50,
 
260
  "attributes": {}
261
  }
262
  },
263
+ "total_flos": 7307494686720000.0,
264
  "train_batch_size": 8,
265
  "trial_name": null,
266
  "trial_params": null