set -x

METHOD=$1          # reinforce_baseline, reinforce, rloo
PRETRAIN_PATH=$2   # Qwen/Qwen2.5-7B
DATA_PATH=$3       # virtuoussy/Math-RLVR
REWARD_API=$4      # http://127.0.0.1:8000/get_reward

EXP_NAME=${METHOD} # experiment name used for the log/checkpoint directories; here it defaults to the chosen advantage estimator

working_dir=$(pwd)
LOG_PATH=${working_dir}/${EXP_NAME}/train.log
SAVE_PATH=${working_dir}/${EXP_NAME}/checkpoint
mkdir -p ${SAVE_PATH}

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

ray start --head --node-ip-address 0.0.0.0 --num-gpus 8

ray job submit --address="http://127.0.0.1:8265" -- python3 -m openrlhf.cli.train_ppo_ray \
  --ref_num_nodes 1 \
  --ref_num_gpus_per_node 8 \
  --actor_num_nodes 1 \
  --actor_num_gpus_per_node 8 \
  --vllm_num_engines 8 \
  --vllm_tensor_parallel_size 1 \
  --colocate_all_models \
  --vllm_gpu_memory_utilization 0.5 \
  --vllm_enable_sleep \
  --deepspeed_enable_sleep \
  --enforce_eager \
  --pretrain ${PRETRAIN_PATH} \
  --remote_rm_url ${REWARD_API} \
  --save_path ${SAVE_PATH} \
  --micro_train_batch_size 8 \
  --train_batch_size 128 \
  --micro_rollout_batch_size 16 \
  --rollout_batch_size 128 \
  --n_samples_per_prompt 4 \
  --max_samples 30000 \
  --max_epochs 1 \
  --prompt_max_len 1024 \
  --generate_max_len 1024 \
  --zero_stage 3 \
  --bf16 \
  --actor_learning_rate 5e-7 \
  --init_kl_coef 0.01 \
  --use_kl_loss \
  --advantage_estimator ${METHOD} \
  --prompt_data ${DATA_PATH} \
  --input_key query \
  --apply_chat_template \
  --packing_samples \
  --normalize_reward \
  --adam_offload \
  --flash_attn \
  --gradient_checkpointing \
  2>&1 | tee ${LOG_PATH}
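
# Example invocation (a sketch: the script filename is an assumption, and a reward API must
# already be serving at the URL passed as the fourth argument):
#   bash train_reinforce_ray.sh rloo Qwen/Qwen2.5-7B virtuoussy/Math-RLVR http://127.0.0.1:8000/get_reward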