rakhman-llm committed on
Commit
422abb1
·
verified ·
1 Parent(s): f282882

Training in progress, step 8500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fca6aeaf4fa7d75a66f7f67555261a5bad739ad8db5c61e5370422d2a64fa2fc
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:584527770bbf61e7d3b9f5f5d7f0c7a2aed0cb0bbeab1852ed5dfa972751a3a6
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b88e6f2dade8574d8f36a8bac2bf06f1878dfc285f0ec2718f4874fafc93ed3
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5913ee9284fa8aa307711a2e744c4df8b62a0ed34b254b5187e266b9a990b3a6
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c54a623ce5f15765fc85292580b0c84fff2141008e564ba5f42059bc2565398
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c0db314fbdae64c2adb9ae851d5b0fd06db20205c5b2a3393b31aa089ac9035
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5562a3e399d7179333c76429cef48051cde3665996849a423be292e48285e2bc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3e60a31f38c722952d958f95680b7f288aeccf8a56dab49290f6f451d5b392c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08285758644342422,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-8000",
4
- "epoch": 1.28,
5
  "eval_steps": 500,
6
- "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1255,6 +1255,84 @@
1255
  "eval_samples_per_second": 17.129,
1256
  "eval_steps_per_second": 2.141,
1257
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1258
  }
1259
  ],
1260
  "logging_steps": 50,
@@ -1274,7 +1352,7 @@
1274
  "attributes": {}
1275
  }
1276
  },
1277
- "total_flos": 3.897330499584e+16,
1278
  "train_batch_size": 8,
1279
  "trial_name": null,
1280
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08279111981391907,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-8500",
4
+ "epoch": 1.3599999999999999,
5
  "eval_steps": 500,
6
+ "global_step": 8500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1255
  "eval_samples_per_second": 17.129,
1256
  "eval_steps_per_second": 2.141,
1257
  "step": 8000
1258
+ },
1259
+ {
1260
+ "epoch": 1.288,
1261
+ "grad_norm": 6129.27783203125,
1262
+ "learning_rate": 1.712e-05,
1263
+ "loss": 0.0578,
1264
+ "step": 8050
1265
+ },
1266
+ {
1267
+ "epoch": 1.296,
1268
+ "grad_norm": 6502.31298828125,
1269
+ "learning_rate": 1.704e-05,
1270
+ "loss": 0.0513,
1271
+ "step": 8100
1272
+ },
1273
+ {
1274
+ "epoch": 1.304,
1275
+ "grad_norm": 10347.439453125,
1276
+ "learning_rate": 1.696e-05,
1277
+ "loss": 0.0527,
1278
+ "step": 8150
1279
+ },
1280
+ {
1281
+ "epoch": 1.312,
1282
+ "grad_norm": 7870.1796875,
1283
+ "learning_rate": 1.688e-05,
1284
+ "loss": 0.0565,
1285
+ "step": 8200
1286
+ },
1287
+ {
1288
+ "epoch": 1.32,
1289
+ "grad_norm": 7197.3447265625,
1290
+ "learning_rate": 1.6800000000000002e-05,
1291
+ "loss": 0.0538,
1292
+ "step": 8250
1293
+ },
1294
+ {
1295
+ "epoch": 1.328,
1296
+ "grad_norm": 5525.79931640625,
1297
+ "learning_rate": 1.672e-05,
1298
+ "loss": 0.0579,
1299
+ "step": 8300
1300
+ },
1301
+ {
1302
+ "epoch": 1.336,
1303
+ "grad_norm": 5812.7490234375,
1304
+ "learning_rate": 1.664e-05,
1305
+ "loss": 0.0543,
1306
+ "step": 8350
1307
+ },
1308
+ {
1309
+ "epoch": 1.3439999999999999,
1310
+ "grad_norm": 5728.1904296875,
1311
+ "learning_rate": 1.656e-05,
1312
+ "loss": 0.0572,
1313
+ "step": 8400
1314
+ },
1315
+ {
1316
+ "epoch": 1.3519999999999999,
1317
+ "grad_norm": 6965.53759765625,
1318
+ "learning_rate": 1.648e-05,
1319
+ "loss": 0.0535,
1320
+ "step": 8450
1321
+ },
1322
+ {
1323
+ "epoch": 1.3599999999999999,
1324
+ "grad_norm": 6986.52783203125,
1325
+ "learning_rate": 1.64e-05,
1326
+ "loss": 0.0549,
1327
+ "step": 8500
1328
+ },
1329
+ {
1330
+ "epoch": 1.3599999999999999,
1331
+ "eval_loss": 0.08279111981391907,
1332
+ "eval_runtime": 116.6629,
1333
+ "eval_samples_per_second": 17.143,
1334
+ "eval_steps_per_second": 2.143,
1335
+ "step": 8500
1336
  }
1337
  ],
1338
  "logging_steps": 50,
 
1352
  "attributes": {}
1353
  }
1354
  },
1355
+ "total_flos": 4.140913655808e+16,
1356
  "train_batch_size": 8,
1357
  "trial_name": null,
1358
  "trial_params": null