Commit 65a7965

chore: ray-sub - improve robustness (#968)

Signed-off-by: Stanislav Kirdey <stan@inflection.ai>
Signed-off-by: Stan Kirdey <stan@inflection.ai>
Co-authored-by: Terry Kong <terrycurtiskong@gmail.com>

1 parent: 8d8f365
File tree: 1 file changed, +40 −4 lines

ray.sub (40 additions & 4 deletions)
@@ -31,8 +31,8 @@ maybe_gres_arg() {
     # Check if any nodes in the partition have GRES configured
     # Assumes a homogeneous allocation (not a heterogeneous job)
     if sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep -q "gpu:"; then
-        # Do a quick assert here that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user error if someone isn't using GPUS_PER_NODE=8 on our clusters if it supports --gres=gpu:8.
-        if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | cut -d: -f2) ]]; then
+        # Do a quick assert here that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user error if someone isn't using GPUS_PER_NODE=8 on our clusters if it supports --gres=gpu:8 or gpu:a100:8
+        if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | awk -F: '{print $NF}') ]]; then
             echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:") meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
             exit 1
         fi
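
Why the switch from cut to awk: a Slurm GRES string may carry a type field (e.g. gpu:a100:8), in which case the second colon-separated field is the type name rather than the count. A minimal sketch of the difference, using hypothetical GRES values:

    for gres in "gpu:8" "gpu:a100:8"; do
        echo "$gres -> cut: $(echo "$gres" | cut -d: -f2) | awk: $(echo "$gres" | awk -F: '{print $NF}')"
    done
    # gpu:8 -> cut: 8 | awk: 8
    # gpu:a100:8 -> cut: a100 | awk: 8   (awk's $NF always takes the last field, the count)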
@@ -124,7 +124,43 @@ nodes_array=($nodes)
 ip_addresses_array=()
 
 for node in $nodes; do
-    ip_address=$(host $node | awk '/has address/ { print $4 }')
+    # Try multiple methods to get IP address - ENHANCED VERSION v2.0
+    echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
+    ip_address=""
+
+    # Method 1: Try host command
+    echo "[DEBUG] Method 1: host command"
+    ip_address=$(host $node 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
+    echo "[DEBUG] host result: '$ip_address'"
+
+    # Method 2: If host fails, try getent
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 2: getent hosts"
+        ip_address=$(getent hosts $node 2>/dev/null | awk '{ print $1 }' | head -1 || true)
+        echo "[DEBUG] getent result: '$ip_address'"
+    fi
+
+    # Method 3: If getent fails, try nslookup
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 3: nslookup"
+        ip_address=$(nslookup $node 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
+        echo "[DEBUG] nslookup result: '$ip_address'"
+    fi
+
+    # Method 4: If all DNS methods fail, try ping to extract IP
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 4: ping"
+        ip_address=$(ping -c 1 $node 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
+        echo "[DEBUG] ping result: '$ip_address'"
+    fi
+
+    # If still no IP, use the hostname itself (might work if it's already an IP or resolvable)
+    if [[ -z "$ip_address" ]]; then
+        echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
+        ip_address=$node
+    fi
+
+    echo "[INFO] Node: $node -> IP: $ip_address"
     # Add the IP address to the array
     ip_addresses_array+=("$ip_address")
 done
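
Review note: the added block is one fallback chain, read top to bottom. A condensed sketch of the same logic as a standalone helper, assuming the standard host/getent/nslookup/ping tools are on PATH (the resolve_ip name is hypothetical, and the ping branch below anchors sed to the first parenthesized field of the "PING host (a.b.c.d)" banner):

    resolve_ip() {
        local node=$1 ip=""
        # DNS lookups first; each later step only runs if the previous found nothing
        ip=$(host "$node" 2>/dev/null | awk '/has address/ { print $4 }' | head -1)
        if [[ -z "$ip" ]]; then ip=$(getent hosts "$node" 2>/dev/null | awk '{ print $1 }' | head -1); fi
        if [[ -z "$ip" ]]; then ip=$(nslookup "$node" 2>/dev/null | awk '/^Address: / { print $2 }' | head -1); fi
        # Last resort before giving up: pull the IP out of ping's banner line
        if [[ -z "$ip" ]]; then ip=$(ping -c 1 "$node" 2>/dev/null | sed -n 's/^PING [^(]*(\([^)]*\)).*/\1/p' | head -1); fi
        echo "${ip:-$node}"    # fall back to the hostname itself
    }
    resolve_ip "$head_node"    # prints e.g. 10.0.0.12, or the hostname if nothing resolved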
@@ -338,7 +374,7 @@ echo "All workers connected!"
 # We can now launch a job on this cluster
 # We do so by launching a driver process on the physical node that the head node is on
 # This driver process is responsible for launching a job on the Ray cluster
-CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID --json | jq -r '.jobs[].current_working_directory')
+CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
 if [[ -n "$COMMAND" ]]; then
     srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
 else
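
The old form needed scontrol's --json output and jq, which may not be available on every cluster; the new form scrapes the plain-text record instead. A minimal sketch of what the pattern extracts, assuming GNU grep with -P support (the path shown is illustrative):

    # scontrol show job emits key=value pairs; WorkDir holds the submission directory
    line='   WorkDir=/home/user/project'          # illustrative scontrol output line
    echo "$line" | grep -oP 'WorkDir=\K[^ ]+'     # -> /home/user/project

The trailing head -1 guards against any additional WorkDir matches in the record.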
