You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
echo"[WARNING]: Cannot increase ulimit on file descriptors to 65535 according ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase, otherwise ray may crash unexpectedly."
54
+
fi
36
55
37
56
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
38
57
# (not including the other ports set by this script). So this range is chosen to be
39
58
# somewhere in the middle
40
59
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
41
-
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
60
+
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
42
61
43
62
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
44
63
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -82,13 +101,66 @@ gpus_per_node=8
82
101
83
102
num_retries={{ num_retries }}
84
103
104
+
# Track backgrounded srun client PIDs for head and workers
105
+
declare -A SRUN_PIDS
106
+
107
+
# Verify all backgrounded srun client processes are still alive; exit fast if any died.
#
# Globals:
#   SRUN_PIDS (read) - associative array mapping an srun name to its client PID
#   LOG_DIR   (read) - directory in which the ENDED marker file is created
# Outputs:
#   Writes an [ERROR] line to stderr for the first dead process found.
# Returns:
#   0 when every tracked PID still answers `kill -0`; otherwise exits the
#   current shell with status 1 after touching "$LOG_DIR/ENDED".
check_srun_processes() {
  # Keep the loop variables out of the caller's global namespace
  local name pid
  for name in "${!SRUN_PIDS[@]}"; do
    pid="${SRUN_PIDS[$name]}"
    # Check if the process is still running (kill -0 delivers no signal,
    # it only tests whether the PID can be signalled)
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[ERROR] Background srun '$name' died (pid=$pid). Could be a failure in startup or an issue with the node preventing the srun to start. Attempting to exit." >&2
      # Signal sidecars inside containers to terminate ASAP
      touch "$LOG_DIR/ENDED"
      exit 1
    fi
  done
}
120
+
85
121
# Getting the node names and IP addresses in the SLURM allocation
86
122
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
87
123
nodes_array=($nodes)
88
124
ip_addresses_array=()
89
125
90
126
fornodein$nodes;do
91
-
ip_address=$(getent hosts "$node"| awk '{print $1}'| head -n1)
127
+
# Try multiple methods to get IP address - ENHANCED VERSION v2.0
128
+
echo"[DEBUG] Resolving hostname: $node using enhanced resolution methods"
0 commit comments