AALF committed on
Commit
8661356
·
verified ·
1 Parent(s): bc28f1b

Delete train_openchat_starling.sh

Browse files
Files changed (1) hide show
  1. train_openchat_starling.sh +0 -54
train_openchat_starling.sh DELETED
#!/bin/bash
#
# Distillation fine-tuning: OpenChat-3.5-7B (student) with Starling-LM-7B-alpha
# (teacher) representations, via FSDP across 8 GPUs.
#
# Required layout (on the cluster):
#   ${PROJ_PATH}/train/train.py          — training entry point
#   ${PROJ_PATH}/representations/...     — precomputed teacher representations
#   /GLOBALFS/.../models/${BASE_MODEL_NAME} — base model weights
#
# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

readonly PROJ_PATH=/GLOBALFS/sysu_xjquan_2/yangzy/fusechat
readonly BASE_MODEL_NAME=openchat_3.5
readonly TEACHER_MODEL_NAME=starling
readonly MODEL_PATH="/GLOBALFS/sysu_xjquan_2/yangzy/models/${BASE_MODEL_NAME}"
readonly MODEL_SAVE_PATH="${PROJ_PATH}/model_ckpt/${BASE_MODEL_NAME}_${TEACHER_MODEL_NAME}_ckpt1"

# -p: idempotent — do not fail if the checkpoint dir already exists on re-run.
mkdir -p "${MODEL_SAVE_PATH}"

# Comma-separated list of representation shards consumed by train.py --data_path.
dataset_dir="${PROJ_PATH}/representations/openchat_starling_internlm_representation_fnan_split0,${PROJ_PATH}/representations/openchat_starling_internlm_representation_fnan_split1,${PROJ_PATH}/representations/openchat_starling_internlm_representation_fnan_split2,${PROJ_PATH}/representations/openchat_starling_internlm_representation_fnan_split3"

# OpenChat-3.5-7B <-> Starling-LM-7B-alpha
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --nproc_per_node=8 --master_port=20001 "${PROJ_PATH}/train/train.py" \
  --model_name_or_path "${MODEL_PATH}" \
  --data_path "${dataset_dir}" \
  --bf16 True \
  --output_dir "${MODEL_SAVE_PATH}" \
  --num_train_epochs 3 \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --gradient_accumulation_steps 2 \
  --evaluation_strategy "no" \
  --save_strategy "epoch" \
  --save_steps 10000 \
  --save_total_limit 5 \
  --learning_rate 5e-6 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --fsdp "full_shard auto_wrap" \
  --fsdp_transformer_layer_cls_to_wrap 'MistralDecoderLayer' \
  --tf32 True \
  --model_max_length 2048 \
  --gradient_checkpointing True \
  --conv_temp "openchat" \
  --lazy_preprocess True \
  --flash_attn_transformers True \
  --do_train \
  --do_distill \
  --distill_with_ref_model True \
  --distill_with_aligned_model_0 True \
  --distill_with_aligned_model_1 False \
  --distill_loss_type "ce" \
  --distill_teacher_temperature 1.0 \
  --lm_loss_weight 0.9 \
  --distill_greater_as_gt True \
  --distill_greater_as_gt_type hard \
  --dataloader_num_workers 8 \
  --remove_unused_columns False

# Give the launcher a moment to flush logs before tearing down the allocation.
sleep 60s

# NOTE(review): job id 2510 is hard-coded — presumably the scheduler allocation
# this script runs under; verify against the submission script (yhcancel is the
# Tianhe/Slurm-variant job-cancel command).
yhcancel 2510