Skip to content

Commit 81cedf6

Browse files
committed
Add ray head start timeout
Signed-off-by: Hemil Desai <[email protected]>
1 parent 38e0581 commit 81cedf6

File tree

3 files changed: +33 additions, −3 deletions

nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 11 additions & 1 deletion
@@ -47,6 +47,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR={{ cluster_dir }}
 mkdir -p $CLUSTER_DIR
@@ -208,9 +211,16 @@ EOF
 srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-  echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+  if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+    echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+    touch $LOG_DIR/ENDED
+    exit 1
+  fi
+  echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
   sleep 2
+  elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

test/core/execution/artifacts/expected_ray_cluster.sub

Lines changed: 11 additions & 1 deletion
@@ -49,6 +49,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster
 mkdir -p $CLUSTER_DIR
@@ -202,9 +205,16 @@ EOF
 srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-  echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+  if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+    echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+    touch $LOG_DIR/ENDED
+    exit 1
+  fi
+  echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
   sleep 2
+  elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

test/core/execution/artifacts/expected_ray_cluster_ssh.sub

Lines changed: 11 additions & 1 deletion
@@ -50,6 +50,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
 # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/
 RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
 
+# Timeout in seconds for Ray head node to start (default 10 minutes)
+RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600}
+
 # Directory setup
 export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
 mkdir -p $CLUSTER_DIR
@@ -207,9 +210,16 @@ EOF
 srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 
 # Wait for the head node container to start and for Ray to be ready
+elapsed_time=0
 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do
-  echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..."
+  if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then
+    echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..."
+    touch $LOG_DIR/ENDED
+    exit 1
+  fi
+  echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
   sleep 2
+  elapsed_time=$((elapsed_time + 2))
 done
 
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

Comments (0)