pere committed on
Commit
e466c9c
·
verified ·
1 Parent(s): 1a90f29

Saving train state of step 2000

Browse files
checkpoint-2000-epoch-3/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c74bc5c3109ae439029036150ad00022243260fa7e8821c38767aa4b7331cd4
3
  size 3025686376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3383ce8239fd9a5346296993f4068931faff75aafbb1e863f55802be68be183b
3
  size 3025686376
checkpoint-2000-epoch-3/model_1.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28897ec4b789c0dc382a6975366fcb16206be64b6b691a60b218831c8f6af1ea
3
- size 4361070048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56570ecf66e2cbf1e212810317afdc44b85396298beab22e66ff759a1116f26a
3
+ size 4361069272
checkpoint-2000-epoch-3/optimizer.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004272d1268cb87da3d4f93da3e6e5e2184d6627957db9c697c6c8c45823211e
3
  size 950951226
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69da6783a2bb6483a2623217bf874ce7cd7d99e80a36638e0ffcc67bf80de6e7
3
  size 950951226
checkpoint-2000-epoch-3/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2379f6d25b7b3c11f41a4f66cb6a54cec5f124966fe580594bb5386afcd0fca1
3
+ size 988
checkpoint-2000-epoch-3/scheduler.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e72412f426c58539f1dcfef4d31369e79764f60ce3a6e20df06cde830d8946e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dad6b56b74593b411aa2335a4636d028f73ce8d740f99b52582f884503cebaa
3
  size 1064
run_large_training.sh CHANGED
@@ -29,7 +29,7 @@ accelerate launch run_distillation.py \
29
  --dataloader_num_workers 8 \
30
  --preprocessing_num_workers 8 \
31
  --ddp_timeout 7200 \
32
- --dtype "bfloat16" \
33
  --attn_implementation "flash_attention_2" \
34
  --output_dir "./" \
35
  --do_train \
 
29
  --dataloader_num_workers 8 \
30
  --preprocessing_num_workers 8 \
31
  --ddp_timeout 7200 \
32
+ --dtype "float16" \
33
  --attn_implementation "flash_attention_2" \
34
  --output_dir "./" \
35
  --do_train \