File tree Expand file tree Collapse file tree 2 files changed +4
-4
lines changed
nemo_run/run/ray/templates Expand file tree Collapse file tree 2 files changed +4
-4
lines changed Original file line number Diff line number Diff line change @@ -279,7 +279,7 @@ touch $LOG_DIR/ENDED
279279exit 1
280280EOF
281281)
282- srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
282+ srun {{ common_srun_args }} --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
283283SRUN_PIDS["ray-head"]=$!
284284
285285# Wait for the head node container to start and for Ray to be ready
367367 if [[ $i -eq 0 ]]; then
368368 OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
369369 fi
370- srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
370+ srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
371371 SRUN_PIDS["ray-worker-$i"]=$!
372372 sleep 3
373373done
Original file line number Diff line number Diff line change @@ -279,7 +279,7 @@ touch $LOG_DIR/ENDED
279279exit 1
280280EOF
281281)
282- srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
282+ srun {{ common_srun_args }} --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
283283SRUN_PIDS["ray-head"]=$!
284284
285285# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
380380 if [[ $i -eq 0 ]]; then
381381 OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
382382 fi
383- srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
383+ srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
384384 SRUN_PIDS["ray-worker-$i"]=$!
385385 sleep 3
386386done
You can’t perform that action at this time.
0 commit comments