rakhman-llm commited on
Commit
7bfdff5
·
verified ·
1 Parent(s): de60bee

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e166df977f21f4e74668f730deac309df6a571f19acaa0dfa89c2e3c2819431
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eb8bf06dd538fcb262c2fed2f9e68d7952360b525571db7ca0f1430e447d9ed
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e99baec34b5da3301e09eb5c4f9d3ef873b304cf96feddd1cfe61690f168589
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942c2875abedaddb5c9436b198254369aa4f7b28a3b4b68fd6fcf41053e028fd
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91f8e7f84c942f83f5d877ebb335efd09f2d70c83c135e4e031bf84b0af23af9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a7ea66d1531ee601a7eaa0403a8186669a6c59db91f4cad349d74bc0115c72f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e8e6e071a0a58044a0e6d93d2bb93dd252e4dfa6d5810853dc9dadf39da6ab9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7631547026bec9cd7a6ea58b5f8fb2fb117b688cae951965f0f9ff628a1476de
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08252418041229248,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-9500",
4
- "epoch": 1.52,
5
  "eval_steps": 500,
6
- "global_step": 9500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1489,6 +1489,84 @@
1489
  "eval_samples_per_second": 17.137,
1490
  "eval_steps_per_second": 2.142,
1491
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1492
  }
1493
  ],
1494
  "logging_steps": 50,
@@ -1508,7 +1586,7 @@
1508
  "attributes": {}
1509
  }
1510
  },
1511
- "total_flos": 4.628079968256e+16,
1512
  "train_batch_size": 8,
1513
  "trial_name": null,
1514
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08249519765377045,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-10000",
4
+ "epoch": 1.6,
5
  "eval_steps": 500,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1489
  "eval_samples_per_second": 17.137,
1490
  "eval_steps_per_second": 2.142,
1491
  "step": 9500
1492
+ },
1493
+ {
1494
+ "epoch": 1.528,
1495
+ "grad_norm": 6762.00244140625,
1496
+ "learning_rate": 1.472e-05,
1497
+ "loss": 0.0529,
1498
+ "step": 9550
1499
+ },
1500
+ {
1501
+ "epoch": 1.536,
1502
+ "grad_norm": 7704.66748046875,
1503
+ "learning_rate": 1.464e-05,
1504
+ "loss": 0.0576,
1505
+ "step": 9600
1506
+ },
1507
+ {
1508
+ "epoch": 1.544,
1509
+ "grad_norm": 5400.18798828125,
1510
+ "learning_rate": 1.4560000000000001e-05,
1511
+ "loss": 0.0556,
1512
+ "step": 9650
1513
+ },
1514
+ {
1515
+ "epoch": 1.552,
1516
+ "grad_norm": 6167.47216796875,
1517
+ "learning_rate": 1.448e-05,
1518
+ "loss": 0.0547,
1519
+ "step": 9700
1520
+ },
1521
+ {
1522
+ "epoch": 1.56,
1523
+ "grad_norm": 5073.39892578125,
1524
+ "learning_rate": 1.44e-05,
1525
+ "loss": 0.0544,
1526
+ "step": 9750
1527
+ },
1528
+ {
1529
+ "epoch": 1.568,
1530
+ "grad_norm": 6849.08447265625,
1531
+ "learning_rate": 1.432e-05,
1532
+ "loss": 0.0571,
1533
+ "step": 9800
1534
+ },
1535
+ {
1536
+ "epoch": 1.576,
1537
+ "grad_norm": 6866.765625,
1538
+ "learning_rate": 1.4240000000000001e-05,
1539
+ "loss": 0.0518,
1540
+ "step": 9850
1541
+ },
1542
+ {
1543
+ "epoch": 1.584,
1544
+ "grad_norm": 8185.33740234375,
1545
+ "learning_rate": 1.416e-05,
1546
+ "loss": 0.0605,
1547
+ "step": 9900
1548
+ },
1549
+ {
1550
+ "epoch": 1.592,
1551
+ "grad_norm": 7759.45361328125,
1552
+ "learning_rate": 1.408e-05,
1553
+ "loss": 0.0581,
1554
+ "step": 9950
1555
+ },
1556
+ {
1557
+ "epoch": 1.6,
1558
+ "grad_norm": 5736.8740234375,
1559
+ "learning_rate": 1.4e-05,
1560
+ "loss": 0.0582,
1561
+ "step": 10000
1562
+ },
1563
+ {
1564
+ "epoch": 1.6,
1565
+ "eval_loss": 0.08249519765377045,
1566
+ "eval_runtime": 116.9496,
1567
+ "eval_samples_per_second": 17.101,
1568
+ "eval_steps_per_second": 2.138,
1569
+ "step": 10000
1570
  }
1571
  ],
1572
  "logging_steps": 50,
 
1586
  "attributes": {}
1587
  }
1588
  },
1589
+ "total_flos": 4.87166312448e+16,
1590
  "train_batch_size": 8,
1591
  "trial_name": null,
1592
  "trial_params": null