Skip to content

Commit c13e4e4

Browse files
authored
Update ray template (#375)
* Update ray template Signed-off-by: Hemil Desai <[email protected]> * add ray enroot exec template Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> * fix Signed-off-by: Hemil Desai <[email protected]> --------- Signed-off-by: Hemil Desai <[email protected]>
1 parent de2d3cd commit c13e4e4

File tree

8 files changed

+1362
-34
lines changed

8 files changed

+1362
-34
lines changed

nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 88 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,36 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
2828
METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
2929

3030
# Ports for the head node
31-
PORT=${PORT:-6379}
31+
PORT=${PORT:-54514}
3232
RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
3333
#REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
3434
DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
3535
DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
36+
RAY_DEBUGGER_ARGS=
37+
if [ "${RAY_DEBUG:-}" = "legacy" ]; then
38+
RAY_DEBUGGER_ARGS="--ray-debugger-external"
39+
fi
40+
41+
# After ray>=2.47, this feature is enabled by default which creates uv venvs for any py_executable starting with `uv run`.
42+
# There are severe contention and performance issues with this enabled, considering our dependencies are so large and occasionally
43+
# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
44+
export RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
45+
46+
# Setting ulimit is recommended by ray best practices page
47+
# @ https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html
48+
# It's session based and won't affect the system outside the script
49+
# Ensure that the soft limit isn't above the hard limit
50+
# Raise the soft open-file-descriptor limit to 65535 for this shell session.
# The soft limit may never exceed the hard limit, so read the hard limit once
# and only raise the soft limit when the hard limit allows it.
hard_nofile_limit=$(ulimit -Hn)
if [[ "$hard_nofile_limit" == "unlimited" ]] || [[ "$hard_nofile_limit" -ge 65535 ]]; then
  # -ge rather than -gt: a hard limit of exactly 65535 still permits the
  # target value, so that case must raise the soft limit too.
  ulimit -Sn 65535
else
  # The hard limit is numeric and below 65535; it cannot be raised from here.
  echo "[WARNING]: Cannot increase ulimit on file descriptors to 65535 according ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase, otherwise ray may crash unexpectedly."
fi
3655

3756
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
3857
# (not including the other ports set by this script). So this range is chosen to be
3958
# somewhere in the middle
4059
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
41-
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
60+
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
4261

4362
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
4463
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -82,13 +101,66 @@ gpus_per_node=8
82101

83102
num_retries={{ num_retries }}
84103

104+
# Track backgrounded srun client PIDs for head and workers
105+
declare -A SRUN_PIDS
106+
107+
# Verify all backgrounded srun client processes are still alive; exit fast if any died
108+
#######################################
# Verify that every backgrounded srun client recorded in SRUN_PIDS is alive.
# Globals:
#   SRUN_PIDS (read) - associative array mapping a label to an srun client PID
#   LOG_DIR   (read) - directory in which the ENDED marker file is created
# Outputs:
#   Writes an [ERROR] line to stderr for the first dead process found.
# Returns:
#   0 when all tracked processes are alive; otherwise creates
#   "$LOG_DIR/ENDED" and exits the script with status 1.
#######################################
check_srun_processes() {
  # Keep the loop variables local so that calling this function from the wait
  # loops cannot clobber same-named variables elsewhere in the script.
  local name pid
  for name in "${!SRUN_PIDS[@]}"; do
    pid="${SRUN_PIDS[$name]}"
    # kill -0 delivers no signal; it only tests whether the PID still exists
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[ERROR] Background srun '$name' died (pid=$pid). Could be a failure in startup or an issue with the node preventing the srun to start. Attempting to exit." >&2
      # Signal sidecars inside containers to terminate ASAP
      touch "$LOG_DIR/ENDED"
      exit 1
    fi
  done
}
120+
85121
# Getting the node names and IP addresses in the SLURM allocation
86122
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
87123
nodes_array=($nodes)
88124
ip_addresses_array=()
89125

90126
for node in $nodes; do
91-
ip_address=$(getent hosts "$node" | awk '{print $1}' | head -n1)
127+
# Try multiple methods to get IP address - ENHANCED VERSION v2.0
128+
echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
129+
ip_address=""
130+
131+
# Method 1: Try host command
132+
echo "[DEBUG] Method 1: host command"
133+
ip_address=$(host $node 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
134+
echo "[DEBUG] host result: '$ip_address'"
135+
136+
# Method 2: If host fails, try getent
137+
if [[ -z "$ip_address" ]]; then
138+
echo "[DEBUG] Method 2: getent hosts"
139+
ip_address=$(getent hosts $node 2>/dev/null | awk '{ print $1 }' | head -1 || true)
140+
echo "[DEBUG] getent result: '$ip_address'"
141+
fi
142+
143+
# Method 3: If getent fails, try nslookup
144+
if [[ -z "$ip_address" ]]; then
145+
echo "[DEBUG] Method 3: nslookup"
146+
ip_address=$(nslookup $node 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
147+
echo "[DEBUG] nslookup result: '$ip_address'"
148+
fi
149+
150+
# Method 4: If all DNS methods fail, try ping to extract IP
151+
if [[ -z "$ip_address" ]]; then
152+
echo "[DEBUG] Method 4: ping"
153+
ip_address=$(ping -c 1 $node 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
154+
echo "[DEBUG] ping result: '$ip_address'"
155+
fi
156+
157+
# If still no IP, use the hostname itself (might work if it's already an IP or resolvable)
158+
if [[ -z "$ip_address" ]]; then
159+
echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
160+
ip_address=$node
161+
fi
162+
163+
echo "[INFO] Node: $node -> IP: $ip_address"
92164
# Add the IP address to the array
93165
ip_addresses_array+=("$ip_address")
94166
done
@@ -184,12 +256,13 @@ ray start --head \
184256
--ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
185257
--dashboard-port=${DASHBOARD_PORT} \
186258
\
187-
--node-manager-port=${NODE_MANAGER_PORT} \
188-
--object-manager-port=${OBJECT_MANAGER_PORT} \
189-
--runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
190-
--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
191-
--dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
192-
--metrics-export-port=${METRICS_EXPORT_PORT} \
259+
--node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
260+
--object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
261+
--runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
262+
--dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
263+
--dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
264+
--metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
265+
$RAY_DEBUGGER_ARGS \
193266
\
194267
--block
195268
EOFINNER
@@ -207,6 +280,7 @@ exit 1
207280
EOF
208281
)
209282
srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
283+
SRUN_PIDS["ray-head"]=$!
210284

211285
# Wait for the head node container to start and for Ray to be ready
212286
elapsed_time=0
@@ -217,6 +291,7 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
217291
exit 1
218292
fi
219293
echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
294+
check_srun_processes
220295
sleep 2
221296
elapsed_time=$((elapsed_time + 2))
222297
done
@@ -261,7 +336,6 @@ monitor-sidecar &
261336
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
262337
263338
cat <<EOFINNER | tee /launch-worker.sh
264-
sleep 5
265339
ray start --address "$ip_head" \
266340
--disable-usage-stats \
267341
--resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -274,6 +348,7 @@ ray start --address "$ip_head" \
274348
--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
275349
--dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
276350
--metrics-export-port=${METRICS_EXPORT_PORT} \
351+
$RAY_DEBUGGER_ARGS \
277352
\
278353
--block
279354
EOFINNER
@@ -293,6 +368,7 @@ EOF
293368
OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
294369
fi
295370
srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
371+
SRUN_PIDS["ray-worker-$i"]=$!
296372
sleep 3
297373
done
298374

@@ -316,9 +392,10 @@ extract_worker_units() {
316392
while true; do
317393
worker_units=$(extract_worker_units)
318394
echo "[INFO] Number of actors online: $worker_units/$NUM_ACTORS"
319-
if [ "$worker_units" -eq "$NUM_ACTORS" ]; then
395+
if [[ "$worker_units" -eq "$NUM_ACTORS" ]]; then
320396
break
321397
fi
398+
check_srun_processes
322399
sleep 2
323400
done
324401

0 commit comments

Comments
 (0)