Skip to content

Commit d786b83

Browse files
authored
add default lmdeploy/memory/ray log dir in RL scripts (#1374)
add default lmdeploy/memory/ray log dir in scripts
1 parent ac5552a commit d786b83

File tree

2 files changed

+35
-22
lines changed

2 files changed

+35
-22
lines changed

examples/v1/scripts/run_rl.sh

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
set -ex
2+
ray stop --force
23
# examples of usage:
34
# qwen3_8B_grpo_gsm8k training:
45
# bash examples/v1/scripts/run_rl.sh examples/v1/config/rl_qwen3_8B_grpo.py "sglang" $MODEL_PATH $DATA_PATH $EVAL_DATA_PATH
@@ -54,23 +55,33 @@ current_time=$(date "+%m%d%H")
5455
# 取模型路径的最后一级作为model_name,取数据路径的倒数第二级作为data_name
5556
model_dir_name=$(basename "$MODEL_PATH")
5657
data_dir_name=$(basename "$(dirname "$DATA_PATH")")
57-
export WORK_DIR="work_dirs/${model_dir_name}_${data_dir_name}_${infer_backend_lower}"
58-
58+
DIR=$(pwd)
59+
export WORK_DIR="${DIR}/work_dirs/${model_dir_name}_${data_dir_name}_${infer_backend_lower}"
60+
if [ ! -d "$WORK_DIR" ]; then
61+
mkdir -p "$WORK_DIR"
62+
fi
63+
export LMDEPLOY_LOG_FILE="${WORK_DIR}/lmdeploy_log_${current_time}.txt"
64+
export XTUNER_RL_MEM_DIR="${WORK_DIR}/mem_${current_time}"
5965

6066
# 2. Launch Ray cluster
6167
# 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM
6268
node_count=${NODE_COUNT:-1}
6369
total_cpus=$((node_count * 128))
6470

6571
if [ "$RAY_RANK" -eq 0 ]; then
72+
rm -rf /tmp/ray_log
73+
export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
74+
mkdir -p ${RAY_LOG_DIR}
75+
ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
6676
ray start --head \
6777
--node-ip-address="$RAY_MASTER_ADDR" \
6878
--port="$RAY_HEAD_PORT" \
6979
--dashboard-host=0.0.0.0 \
7080
--dashboard-port=$RAY_DASHBOARD_PORT \
7181
--include-dashboard=true \
7282
--disable-usage-stats \
73-
--num-cpus=$total_cpus
83+
--num-cpus=$total_cpus \
84+
--temp-dir="/tmp/ray_log/"
7485
else
7586
while true; do
7687
if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
@@ -95,11 +106,6 @@ while true; do
95106
fi
96107
done
97108

98-
# 3. start training job
99-
if [ ! -d "$WORK_DIR" ]; then
100-
mkdir -p "$WORK_DIR"
101-
fi
102-
103109
SCRIPT_NAME=$(basename "$0")
104110
cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"
105111
cp "$CONFIG_PATH" "${WORK_DIR}/config.py"

examples/v1/scripts/run_rl_submit.sh

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
set -ex
2+
ray stop --force
23
# examples of usage:
34
# qwen3_8B_grpo_gsm8k training: bash examples/v1/scripts/run_rl.sh examples/v1/config/rl_qwen3_8B_grpo.py "sglang" $MODEL_PATH $DATA_PATH $EVAL_DATA_PATH
45
# qwen2.5_7B_dapo_math training: bash examples/v1/scripts/run_rl.sh examples/v1/config/rl_qwen25_7B_dapo.py "sglang" $MODEL_PATH $DATA_PATH $EVAL_DATA_PATH
@@ -29,7 +30,7 @@ export DATA_PATH=$DATA_PATH
2930
export EVAL_DATA_PATH=$EVAL_DATA_PATH
3031
export XTUNER_USE_FA3=${XTUNER_USE_FA3:-1}
3132
export XTUNER_LOG_LEVEL=${XTUNER_LOG_LEVEL:-"INFO"}
32-
33+
export PYTHONUNBUFFERED=1
3334

3435
infer_backend_lower=$(echo "$INFER_BACKEND" | tr '[:upper:]' '[:lower:]')
3536
if [ "$infer_backend_lower" = "sglang" ]; then
@@ -48,20 +49,37 @@ else
4849
exit 1
4950
fi
5051

52+
current_time=$(date "+%m%d%H")
53+
# 取模型路径的最后一级作为model_name,取数据路径的倒数第二级作为data_name
54+
model_dir_name=$(basename "$MODEL_PATH")
55+
data_dir_name=$(basename "$(dirname "$DATA_PATH")")
56+
DIR=$(pwd)
57+
export WORK_DIR="${DIR}/work_dirs/${model_dir_name}_${data_dir_name}_${infer_backend_lower}"
58+
if [ ! -d "$WORK_DIR" ]; then
59+
mkdir -p "$WORK_DIR"
60+
fi
61+
export LMDEPLOY_LOG_FILE="${WORK_DIR}/lmdeploy_log_${current_time}.txt"
62+
export XTUNER_RL_MEM_DIR="${WORK_DIR}/mem_${current_time}"
63+
5164
# 2. Launch Ray cluster
5265
# 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM
5366
node_count=${NODE_COUNT:-1}
5467
total_cpus=$((node_count * 128))
5568

5669
if [ "$RAY_RANK" -eq 0 ]; then
70+
rm -rf /tmp/ray_log
71+
export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
72+
mkdir -p ${RAY_LOG_DIR}
73+
ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
5774
ray start --head \
5875
--node-ip-address="$RAY_MASTER_ADDR" \
5976
--port="$RAY_HEAD_PORT" \
6077
--dashboard-host=0.0.0.0 \
6178
--dashboard-port=$RAY_DASHBOARD_PORT \
6279
--include-dashboard=true \
6380
--disable-usage-stats \
64-
--num-cpus=$total_cpus
81+
--num-cpus=$total_cpus \
82+
--temp-dir="/tmp/ray_log/"
6583
else
6684
while true; do
6785
if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
@@ -86,23 +104,12 @@ while true; do
86104
fi
87105
done
88106

89-
# 3. Prepare work directory and log file
90-
current_time=$(date "+%m%d%H")
91-
# 取模型路径的最后一级作为model_name,取数据路径的倒数第二级作为data_name
92-
model_dir_name=$(basename "$MODEL_PATH")
93-
data_dir_name=$(basename "$(dirname "$DATA_PATH")")
94-
export WORK_DIR="work_dirs/${model_dir_name}_${data_dir_name}_${infer_backend_lower}"
95-
96-
if [ ! -d "$WORK_DIR" ]; then
97-
mkdir -p "$WORK_DIR"
98-
fi
99-
100107
SCRIPT_NAME=$(basename "$0")
101108
cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"
102109
cp "$CONFIG_PATH" "${WORK_DIR}/config.py"
103110
LOG_FILE="${WORK_DIR}/training_log_${current_time}.txt"
104111

105-
# 4. Submit training job on Head node
112+
# 3. Submit training job on Head node
106113
if [ "$RAY_RANK" -eq 0 ]; then
107114
RUNTIME_ENV_JSON="{
108115
\"env_vars\": {

0 commit comments

Comments (0)