Skip to content

Commit 3ab0ca9

Browse files
committed
fix
Signed-off-by: Hemil Desai <[email protected]>
1 parent 42865ab commit 3ab0ca9

File tree

2 files changed

+12
-30
lines changed

2 files changed

+12
-30
lines changed

nemo_run/run/ray/templates/ray_enroot.sub.j2

Lines changed: 6 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -285,12 +285,9 @@ SRUN_PIDS["ray-head"]=$!
285285
# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
286286
get_container_pid() {
287287
local node=$1
288-
local container_name=$2
289-
# Use --overlap WITHOUT --container-name, then use enroot to find the container
290-
# Note: Pyxis sometimes names containers as "<unknown>" so we look for the pattern in any field
291-
srun --overlap --nodes=1 -w "$node" bash -c "enroot list -f | grep -E '($container_name|<unknown>)' | grep -oP '\s+\K\d+' | head -1"
288+
srun --overlap --nodes=1 -w "$node" bash -c "enroot list -f | awk 'NR>1 && \$2 ~ /^[0-9]+\$/ {print \$2; exit}'"
292289
}
293-
head_container_pid=$(get_container_pid "$head_node" "ray-head")
290+
head_container_pid=$(get_container_pid "$head_node")
294291

295292

296293
# Wait for the head node container to start and for Ray to be ready
@@ -389,7 +386,7 @@ done
389386

390387
extract_worker_units() {
391388
# Get the container PID for ray-head
392-
head_container_pid=$(get_container_pid "$head_node" "ray-head")
389+
head_container_pid=$(get_container_pid "$head_node")
393390
if [[ -z "$head_container_pid" ]]; then
394391
echo 0
395392
return
@@ -467,7 +464,7 @@ COMMAND_WORKDIR={{ command_workdir | default('$CONTAINER_CWD') }}
467464

468465
if [[ -n "$COMMAND" ]]; then
469466
# Get container PID and execute command inside it
470-
head_container_pid=$(get_container_pid "$head_node" "ray-head")
467+
head_container_pid=$(get_container_pid "$head_node")
471468
srun --overlap --nodes=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}job.log enroot exec "$head_container_pid" bash -c "cd $COMMAND_WORKDIR && $COMMAND"
472469
else
473470
echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
@@ -476,17 +473,11 @@ else
476473
# Args 1-N launch on worker nodes (nodes 1 through N-1)
477474
# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
478475
479-
# Helper to get container PID
480-
get_container_pid() {
481-
local node=\\\$1
482-
local container_name=\\\$2
483-
srun --overlap --nodes=1 -w "\\\$node" --jobid $SLURM_JOB_ID bash -c "enroot list -f | grep '\\\$container_name' | grep -oP '\s+\K\d+' | head -1"
484-
}
485476
486477
WORKER_NUM=\\\${1:-}
487478
if [[ -z "\\\$WORKER_NUM" ]]; then
488479
# Empty means we are on the head node
489-
HEAD_PID=\\\$(get_container_pid "$head_node" "ray-head")
480+
HEAD_PID=\\\$(get_container_pid "$head_node")
490481
if [[ -n "\\\${COMMAND:-}" ]]; then
491482
srun -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --nodes=1 -w "$head_node" --jobid $SLURM_JOB_ID enroot exec "\\\$HEAD_PID" bash -c "cd $CONTAINER_CWD && \\\$COMMAND"
492483
else
@@ -501,7 +492,7 @@ else
501492
fi
502493
nodes_array=($nodes)
503494
node="\\\${nodes_array[\\\$WORKER_NUM]}"
504-
WORKER_PID=\\\$(get_container_pid "\\\$node" "ray-worker-\\\$WORKER_NUM")
495+
WORKER_PID=\\\$(get_container_pid "\\\$node")
505496
if [[ -n "\\\${COMMAND:-}" ]]; then
506497
srun -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --nodes=1 -w "\\\$node" --jobid $SLURM_JOB_ID enroot exec "\\\$WORKER_PID" bash -c "cd $CONTAINER_CWD && \\\$COMMAND"
507498
else

test/core/execution/artifacts/expected_ray_cluster_enroot.sub

Lines changed: 6 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -279,12 +279,9 @@ SRUN_PIDS["ray-head"]=$!
279279
# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
280280
get_container_pid() {
281281
local node=$1
282-
local container_name=$2
283-
# Use --overlap WITHOUT --container-name, then use enroot to find the container
284-
# Note: Pyxis sometimes names containers as "<unknown>" so we look for the pattern in any field
285-
srun --overlap --nodes=1 -w "$node" bash -c "enroot list -f | grep -E '($container_name|<unknown>)' | grep -oP '\s+\K\d+' | head -1"
282+
srun --overlap --nodes=1 -w "$node" bash -c "enroot list -f | awk 'NR>1 && \$2 ~ /^[0-9]+\$/ {print \$2; exit}'"
286283
}
287-
head_container_pid=$(get_container_pid "$head_node" "ray-head")
284+
head_container_pid=$(get_container_pid "$head_node")
288285

289286

290287
# Wait for the head node container to start and for Ray to be ready
@@ -379,7 +376,7 @@ done
379376

380377
extract_worker_units() {
381378
# Get the container PID for ray-head
382-
head_container_pid=$(get_container_pid "$head_node" "ray-head")
379+
head_container_pid=$(get_container_pid "$head_node")
383380
if [[ -z "$head_container_pid" ]]; then
384381
echo 0
385382
return
@@ -443,7 +440,7 @@ COMMAND_WORKDIR=/workspace
443440

444441
if [[ -n "$COMMAND" ]]; then
445442
# Get container PID and execute command inside it
446-
head_container_pid=$(get_container_pid "$head_node" "ray-head")
443+
head_container_pid=$(get_container_pid "$head_node")
447444
srun --overlap --nodes=1 -w "$head_node" -o $LOG_DIR/ray-job.log enroot exec "$head_container_pid" bash -c "cd $COMMAND_WORKDIR && $COMMAND"
448445
else
449446
echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
@@ -452,17 +449,11 @@ else
452449
# Args 1-N launch on worker nodes (nodes 1 through N-1)
453450
# Optional: set COMMAND='...' to run non-interactively instead of opening an interactive shell
454451
455-
# Helper to get container PID
456-
get_container_pid() {
457-
local node=\\\$1
458-
local container_name=\\\$2
459-
srun --overlap --nodes=1 -w "\\\$node" --jobid $SLURM_JOB_ID bash -c "enroot list -f | grep '\\\$container_name' | grep -oP '\s+\K\d+' | head -1"
460-
}
461452
462453
WORKER_NUM=\\\${1:-}
463454
if [[ -z "\\\$WORKER_NUM" ]]; then
464455
# Empty means we are on the head node
465-
HEAD_PID=\\\$(get_container_pid "$head_node" "ray-head")
456+
HEAD_PID=\\\$(get_container_pid "$head_node")
466457
if [[ -n "\\\${COMMAND:-}" ]]; then
467458
srun -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --nodes=1 -w "$head_node" --jobid $SLURM_JOB_ID enroot exec "\\\$HEAD_PID" bash -c "cd $CONTAINER_CWD && \\\$COMMAND"
468459
else
@@ -477,7 +468,7 @@ else
477468
fi
478469
nodes_array=($nodes)
479470
node="\\\${nodes_array[\\\$WORKER_NUM]}"
480-
WORKER_PID=\\\$(get_container_pid "\\\$node" "ray-worker-\\\$WORKER_NUM")
471+
WORKER_PID=\\\$(get_container_pid "\\\$node")
481472
if [[ -n "\\\${COMMAND:-}" ]]; then
482473
srun -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --nodes=1 -w "\\\$node" --jobid $SLURM_JOB_ID enroot exec "\\\$WORKER_PID" bash -c "cd $CONTAINER_CWD && \\\$COMMAND"
483474
else

0 commit comments

Comments
 (0)