Commit 65a7965

chore: ray-sub - improve robustness (#968)

Signed-off-by: Stanislav Kirdey <stan@inflection.ai>
Signed-off-by: Stan Kirdey <stan@inflection.ai>
Co-authored-by: Terry Kong <terrycurtiskong@gmail.com>

1 parent: 8d8f365
File tree: 1 file changed, +40 −4 lines

ray.sub (40 additions & 4 deletions)
@@ -31,8 +31,8 @@ maybe_gres_arg() {
     # Check if any nodes in the partition have GRES configured
     # Assumes a homogeneous allocation (not a heterogeneous job)
     if sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep -q "gpu:"; then
-        # Do a quick assert here that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user error if someone isn't using GPUS_PER_NODE=8 on our clusters if it supports --gres=gpu:8.
-        if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | cut -d: -f2) ]]; then
+        # Do a quick assert here that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user error if someone isn't using GPUS_PER_NODE=8 on our clusters if it supports --gres=gpu:8 or gpu:a100:8
+        if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | awk -F: '{print $NF}') ]]; then
             echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:") meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
             exit 1
         fi
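
Why the switch from cut to awk: a Slurm GRES string may carry a type field (e.g. gpu:a100:8), in which case the second colon-separated field is the type name rather than the count. A minimal sketch of the difference, using hypothetical GRES values:

    for gres in "gpu:8" "gpu:a100:8"; do
        echo "$gres -> cut: $(echo "$gres" | cut -d: -f2) | awk: $(echo "$gres" | awk -F: '{print $NF}')"
    done
    # gpu:8 -> cut: 8 | awk: 8
    # gpu:a100:8 -> cut: a100 | awk: 8   (awk's $NF always takes the last field, the count)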
@@ -124,7 +124,43 @@ nodes_array=($nodes)
 ip_addresses_array=()
 
 for node in $nodes; do
-    ip_address=$(host $node | awk '/has address/ { print $4 }')
+    # Try multiple methods to get IP address - ENHANCED VERSION v2.0
+    echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
+    ip_address=""
+
+    # Method 1: Try host command
+    echo "[DEBUG] Method 1: host command"
+    ip_address=$(host $node 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
+    echo "[DEBUG] host result: '$ip_address'"
+
+    # Method 2: If host fails, try getent
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 2: getent hosts"
+        ip_address=$(getent hosts $node 2>/dev/null | awk '{ print $1 }' | head -1 || true)
+        echo "[DEBUG] getent result: '$ip_address'"
+    fi
+
+    # Method 3: If getent fails, try nslookup
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 3: nslookup"
+        ip_address=$(nslookup $node 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
+        echo "[DEBUG] nslookup result: '$ip_address'"
+    fi
+
+    # Method 4: If all DNS methods fail, try ping to extract IP
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 4: ping"
+        ip_address=$(ping -c 1 $node 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
+        echo "[DEBUG] ping result: '$ip_address'"
+    fi
+
+    # If still no IP, use the hostname itself (might work if it's already an IP or resolvable)
+    if [[ -z "$ip_address" ]]; then
+        echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
+        ip_address=$node
+    fi
+
+    echo "[INFO] Node: $node -> IP: $ip_address"
     # Add the IP address to the array
     ip_addresses_array+=("$ip_address")
 done
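
Review note: the added block is one fallback chain, read top to bottom. A condensed sketch of the same logic as a standalone helper, assuming the standard host/getent/nslookup/ping tools are on PATH (the resolve_ip name is hypothetical, and the ping branch below anchors sed to the first parenthesized field of the "PING host (a.b.c.d)" banner):

    resolve_ip() {
        local node=$1 ip=""
        # DNS lookups first; each later step only runs if the previous found nothing
        ip=$(host "$node" 2>/dev/null | awk '/has address/ { print $4 }' | head -1)
        if [[ -z "$ip" ]]; then ip=$(getent hosts "$node" 2>/dev/null | awk '{ print $1 }' | head -1); fi
        if [[ -z "$ip" ]]; then ip=$(nslookup "$node" 2>/dev/null | awk '/^Address: / { print $2 }' | head -1); fi
        # Last resort before giving up: pull the IP out of ping's banner line
        if [[ -z "$ip" ]]; then ip=$(ping -c 1 "$node" 2>/dev/null | sed -n 's/^PING [^(]*(\([^)]*\)).*/\1/p' | head -1); fi
        echo "${ip:-$node}"    # fall back to the hostname itself
    }
    resolve_ip "$head_node"    # prints e.g. 10.0.0.12, or the hostname if nothing resolved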
@@ -338,7 +374,7 @@ echo "All workers connected!"
 # We can now launch a job on this cluster
 # We do so by launching a driver process on the physical node that the head node is on
 # This driver process is responsible for launching a job on the Ray cluster
-CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID --json | jq -r '.jobs[].current_working_directory')
+CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
 if [[ -n "$COMMAND" ]]; then
     srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
 else
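
The old form needed scontrol's --json output and jq, which may not be available on every cluster; the new form scrapes the plain-text record instead. A minimal sketch of what the pattern extracts, assuming GNU grep with -P support (the path shown is illustrative):

    # scontrol show job emits key=value pairs; WorkDir holds the submission directory
    line='   WorkDir=/home/user/project'          # illustrative scontrol output line
    echo "$line" | grep -oP 'WorkDir=\K[^ ]+'     # -> /home/user/project

The trailing head -1 guards against any additional WorkDir matches in the record.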
