Skip to content

Commit d0ed9ac

Browse files
committed
fix ray templates by using --exclusive to launch ray nodes
Signed-off-by: Hemil Desai <[email protected]>
1 parent c13e4e4 commit d0ed9ac

File tree

2 files changed

+4
-4
lines changed

2 files changed

+4
-4
lines changed

nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -279,7 +279,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
+srun {{ common_srun_args }} --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Wait for the head node container to start and for Ray to be ready
@@ -367,7 +367,7 @@ EOF
 if [[ $i -eq 0 ]]; then
 OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
+srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done

nemo_run/run/ray/templates/ray_enroot.sub.j2

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -279,7 +279,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
+srun {{ common_srun_args }} --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -380,7 +380,7 @@ EOF
 if [[ $i -eq 0 ]]; then
 OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
+srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done

0 commit comments

Comments
 (0)