Delete train_openchat_starling.sh
Browse files- train_openchat_starling.sh +0 -54
train_openchat_starling.sh
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
#!/bin/bash
#
# Launches train/train.py with torchrun (8 processes on one node) to train
# the openchat_3.5 base model with distillation enabled (--do_distill),
# reading four pre-computed representation splits and writing checkpoints
# under ${PROJ_PATH}/model_ckpt.
#
# NOTE: `set -e` is intentionally not enabled, so the trailing `yhcancel`
# still runs when the training command exits non-zero.

PROJ_PATH=/GLOBALFS/sysu_xjquan_2/yangzy/fusechat
BASE_MODEL_NAME=openchat_3.5
TEACHER_MODEL_NAME=starling
MODEL_PATH="/GLOBALFS/sysu_xjquan_2/yangzy/models/${BASE_MODEL_NAME}"
MODEL_SAVE_PATH="${PROJ_PATH}/model_ckpt/${BASE_MODEL_NAME}_${TEACHER_MODEL_NAME}_ckpt1"

# -p: create missing parent directories and do not fail when the checkpoint
# directory already exists (e.g. when the script is re-run).
mkdir -p -- "${MODEL_SAVE_PATH}"

# Comma-separated list of the four representation splits (split0..split3),
# passed to the trainer as a single --data_path value.
rep_prefix="${PROJ_PATH}/representations/openchat_starling_internlm_representation_fnan_split"
dataset_dir="${rep_prefix}0,${rep_prefix}1,${rep_prefix}2,${rep_prefix}3"

# OpenChat-3.5-7B <-> Starling-LM-7B-alpha
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --nproc_per_node=8 --master_port=20001 "${PROJ_PATH}/train/train.py" \
  --model_name_or_path "${MODEL_PATH}" \
  --data_path "${dataset_dir}" \
  --bf16 True \
  --output_dir "${MODEL_SAVE_PATH}" \
  --num_train_epochs 3 \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --gradient_accumulation_steps 2 \
  --evaluation_strategy "no" \
  --save_strategy "epoch" \
  --save_steps 10000 \
  --save_total_limit 5 \
  --learning_rate 5e-6 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --fsdp "full_shard auto_wrap" \
  --fsdp_transformer_layer_cls_to_wrap 'MistralDecoderLayer' \
  --tf32 True \
  --model_max_length 2048 \
  --gradient_checkpointing True \
  --conv_temp "openchat" \
  --lazy_preprocess True \
  --flash_attn_transformers True \
  --do_train \
  --do_distill \
  --distill_with_ref_model True \
  --distill_with_aligned_model_0 True \
  --distill_with_aligned_model_1 False \
  --distill_loss_type "ce" \
  --distill_teacher_temperature 1.0 \
  --lm_loss_weight 0.9 \
  --distill_greater_as_gt True \
  --distill_greater_as_gt_type hard \
  --dataloader_num_workers 8 \
  --remove_unused_columns False

# Pause for a minute after the training command returns.
sleep 60s

# NOTE(review): presumably cancels the scheduler job that hosts this run;
# 2510 is a hard-coded job id and is likely stale on any later submission —
# confirm before reuse.
yhcancel 2510
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|