@@ -285,12 +285,9 @@ SRUN_PIDS["ray-head"]=$!
# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
#
# Arguments:
#   $1 - Slurm node name to query.
#   Any further arguments (for example the container name that older call
#   sites passed) are accepted and ignored.
# Outputs:
#   Prints the second field of the first `enroot list -f` row after the header
#   whose second field is purely numeric (presumably the PID column); prints
#   nothing when no such row exists.
# Returns:
#   The exit status of the srun invocation.
get_container_pid() {
  local node=$1
  # The pipeline runs on the target node inside `bash -c`. The `$` signs that
  # belong to awk are escaped so this shell hands them through untouched.
  # Selecting by a numeric second column instead of by container name avoids
  # depending on the name, which Pyxis sometimes reports as "<unknown>".
  local -r find_pid="enroot list -f | awk 'NR>1 && \$2 ~ /^[0-9]+\$/ {print \$2; exit}'"
  srun --overlap --nodes=1 -w "$node" bash -c "$find_pid"
}
293- head_container_pid=$( get_container_pid " $head_node " " ray-head " )
290+ head_container_pid=$( get_container_pid " $head_node " )
294291
295292
296293# Wait for the head node container to start and for Ray to be ready
389386
390387extract_worker_units () {
391388 # Get the container PID for ray-head
392- head_container_pid=$( get_container_pid " $head_node " " ray-head " )
389+ head_container_pid=$( get_container_pid " $head_node " )
393390 if [[ -z " $head_container_pid " ]]; then
394391 echo 0
395392 return
@@ -467,7 +464,7 @@ COMMAND_WORKDIR={{ command_workdir | default('$CONTAINER_CWD') }}
467464
468465if [[ -n " $COMMAND " ]]; then
469466 # Get container PID and execute command inside it
470- head_container_pid=$( get_container_pid " $head_node " " ray-head " )
467+ head_container_pid=$( get_container_pid " $head_node " )
471468 srun --overlap --nodes=1 -w " $head_node " -o $LOG_DIR /{{ ray_log_prefix }}job.log enroot exec " $head_container_pid " bash -c " cd $COMMAND_WORKDIR && $COMMAND "
472469else
473470 echo " [INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
@@ -476,17 +473,11 @@ else
476473# Args 1-N launch on worker nodes (nodes 1 through N-1)
477474# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
478475
# Helper to get container PID.
# Kept in the generated script because the calls below need it; this script is
# meant to be run by hand on the slurm head node, so it presumably cannot see
# functions defined in the job script (confirm before removing).
# Takes the node name as its only argument and prints the first
# whitespace-preceded number in the `enroot list -f` output (expected to be the
# first container's PID). No container-name filter is applied.
get_container_pid() {
    local node=\\\$1
    srun --overlap --nodes=1 -w "\\\$node" --jobid $SLURM_JOB_ID bash -c "enroot list -f | grep -oP '\s+\K\d+' | head -1"
}
485476
486477WORKER_NUM=\\\$ {1:-}
487478if [[ -z "\\\$ WORKER_NUM" ]]; then
488479 # Empty means we are on the head node
489- HEAD_PID=\\\$ (get_container_pid "$head_node " "ray-head" )
480+ HEAD_PID=\\\$ (get_container_pid "$head_node ")
490481 if [[ -n "\\\$ {COMMAND:-}" ]]; then
491482 srun -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --nodes=1 -w "$head_node " --jobid $SLURM_JOB_ID enroot exec "\\\$ HEAD_PID" bash -c "cd $CONTAINER_CWD && \\\$ COMMAND"
492483 else
501492 fi
502493 nodes_array=($nodes )
503494 node="\\\$ {nodes_array[\\\$ WORKER_NUM]}"
504- WORKER_PID=\\\$ (get_container_pid "\\\$ node" "ray-worker- \\\$ WORKER_NUM" )
495+ WORKER_PID=\\\$ (get_container_pid "\\\$ node")
505496 if [[ -n "\\\$ {COMMAND:-}" ]]; then
506497 srun -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --nodes=1 -w "\\\$ node" --jobid $SLURM_JOB_ID enroot exec "\\\$ WORKER_PID" bash -c "cd $CONTAINER_CWD && \\\$ COMMAND"
507498 else
0 commit comments