Skip to content

Commit 1363dba

Browse files
authored
fix: improve port selection and exiting early from ray.sub (#272)
Signed-off-by: Terry Kong <terryk@nvidia.com>
1 parent 044f385 commit 1363dba

File tree

1 file changed

+102
-32
lines changed

1 file changed

+102
-32
lines changed

ray.sub

Lines changed: 102 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,26 @@ CONTAINER=$CONTAINER
1818
MOUNTS=$MOUNTS
1919
COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up.
2020
########################################################
21-
# Ray ports
22-
GCS_SERVER_PORT=${GCS_SERVER_PORT:-6379}
23-
DASHBOARD_PORT=${DASHBOARD_PORT:-8265}
24-
OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-8076}
25-
NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-8077}
26-
DASHBOARD_AGENT_PORT=${DASHBOARD_AGENT_PORT:-52365}
27-
DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-52366}
28-
METRICS_PORT=${METRICS_PORT:-9002}
29-
# On our clusters, the largest port range on an idle worker appeared between 52367-64607
21+
# Ports used on every node. Keep these odd: the head and worker[0] share a physical node, so workers bind these (odd) ports while the head binds each port + 1 to avoid collisions.
22+
NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-53001}
23+
OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-53003}
24+
RUNTIME_ENV_AGENT_PORT=${RUNTIME_ENV_AGENT_PORT:-53005}
25+
DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
26+
METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
27+
28+
# Ports for the head node
29+
PORT=${PORT:-6379}
30+
RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
31+
#REDIS_SHARD_PORTS=${REDIS_SHARD_PORTS:-"random"} ??
32+
DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367}
33+
DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
34+
DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
35+
36+
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
3037
# (not including the other ports set by this script). So this range is chosen to be
3138
# somewhere in the middle
32-
MIN_WORKER_PORT=${MIN_WORKER_PORT:-53001}
33-
MAX_WORKER_PORT=${MAX_WORKER_PORT:-53257}
39+
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
40+
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
3441
########################################################
3542

3643
# Defaults to placing uv cache inside the SLURM_SUBMIT_DIR
@@ -39,7 +46,8 @@ UV_CACHE_DIR="${UV_CACHE_DIR:-$SLURM_SUBMIT_DIR/uv_cache}"
3946
mkdir -p $UV_CACHE_DIR
4047

4148
# Create logs directory
42-
LOG_DIR="$SLURM_SUBMIT_DIR/$SLURM_JOB_ID-logs"
49+
BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR}
50+
LOG_DIR="$BASE_LOG_DIR/$SLURM_JOB_ID-logs"
4351
mkdir -p $LOG_DIR
4452

4553
COMMON_SRUN_ARGS=""
@@ -56,7 +64,7 @@ COMMON_SRUN_ARGS+=" --gres=gpu:8"
5664
# Number of GPUs per node
5765
gpus_per_node=8
5866

59-
num_retries=5
67+
num_retries=3
6068

6169
# Getting the node names and IP addresses in the SLURM allocation
6270
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
@@ -72,7 +80,7 @@ done
7280
head_node=${nodes_array[0]}
7381
head_node_ip=${ip_addresses_array[0]}
7482

75-
ip_head=$head_node_ip:$GCS_SERVER_PORT
83+
ip_head=$head_node_ip:$PORT
7684

7785
# First we start the head of the ray cluster on one of the physical nodes
7886
# Set GPU/CPU resources to 0 to avoid scheduling on the head node
@@ -82,30 +90,60 @@ head_cmd=$(cat <<EOF
8290
# Overlapping srun commands will check this file to determine if we can overlap a container command
8391
touch $LOG_DIR/STARTED_RAY_HEAD
8492
env
93+
94+
exit-dramatically() {
95+
# Send SIGTERM to child processes, then to the whole process group, so the srun step terminates
96+
pkill -P $$ || true
97+
kill -TERM 0 || true
98+
# As a last resort, exit with a non-zero code
99+
exit 1
100+
}
101+
export -f exit-dramatically
102+
103+
# Background process to check for ENDED file
104+
monitor-sidecar() {
105+
set +x
106+
while true; do
107+
sleep 60
108+
if [[ -f "$LOG_DIR/ENDED" ]]; then
109+
echo "Detected ENDED file, terminating..."
110+
exit-dramatically
111+
fi
112+
done
113+
}
114+
monitor-sidecar &
115+
85116
cat <<EOFINNER | tee /launch-head.sh
86117
ray start --head \
87-
--disable-usage-stats \
88-
--num-cpus=0 \
89-
--num-gpus=0 \
90-
--node-ip-address="$head_node_ip" \
91-
--port=${GCS_SERVER_PORT} \
92-
--dashboard-port=${DASHBOARD_PORT} \
93-
--object-manager-port=${OBJECT_MANAGER_PORT} \
94-
--node-manager-port=${NODE_MANAGER_PORT} \
95-
--metrics-export-port=${METRICS_PORT} \
96-
--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
97-
--dashboard-agent-listen-port=${DASHBOARD_AGENT_PORT} \
98-
--block
118+
--disable-usage-stats \
119+
--num-cpus=0 \
120+
--num-gpus=0 \
121+
--node-ip-address="$head_node_ip" \
122+
--port=${PORT} \
123+
--ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
124+
--dashboard-grpc-port=${DASHBOARD_GRPC_PORT} \
125+
--dashboard-port=${DASHBOARD_PORT} \
126+
\
127+
--node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
128+
--object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
129+
--runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
130+
--dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
131+
--dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
132+
--metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
133+
\
134+
--block
99135
EOFINNER
100136
chmod +x /launch-head.sh
101137
102138
count=0
103-
while true; do
139+
while [[ \$count -lt $num_retries ]]; do
104140
bash /launch-head.sh
105141
count=\$((count+1))
106-
echo "Head node failed \$count times, restarting..."
142+
echo "Head node failed \$count/$num_retries times, restarting in 5 seconds..."
143+
sleep 5
107144
done
108-
echo ret_code=\$?
145+
touch $LOG_DIR/ENDED
146+
exit 1
109147
EOF
110148
)
111149
srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
@@ -120,22 +158,54 @@ for ((i = 0; i < SLURM_JOB_NUM_NODES; i++)); do
120158

121159
worker_cmd=$(cat <<EOF
122160
env
161+
162+
exit-dramatically() {
163+
# Send SIGTERM to child processes, then to the whole process group, so the srun step terminates
164+
pkill -P $$ || true
165+
kill -TERM 0 || true
166+
# As a last resort, exit with a non-zero code
167+
exit 1
168+
}
169+
170+
# Background process to check for ENDED file
171+
monitor-sidecar() {
172+
set +x
173+
while true; do
174+
sleep 60
175+
if [[ -f "$LOG_DIR/ENDED" ]]; then
176+
echo "Detected ENDED file, terminating..."
177+
exit-dramatically
178+
fi
179+
done
180+
}
181+
monitor-sidecar &
182+
123183
cat <<EOFINNER | tee /launch-worker.sh
124184
ray start --address "$ip_head" \
125185
--disable-usage-stats \
126186
--resources="{\"worker_units\": $gpus_per_node}" \
127187
--min-worker-port=${MIN_WORKER_PORT} \
128188
--max-worker-port=${MAX_WORKER_PORT} \
189+
\
190+
--node-manager-port=${NODE_MANAGER_PORT} \
191+
--object-manager-port=${OBJECT_MANAGER_PORT} \
192+
--runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
193+
--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
194+
--dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
195+
--metrics-export-port=${METRICS_EXPORT_PORT} \
196+
\
129197
--block
130198
EOFINNER
131199
132200
count=0
133-
while true; do
201+
while [[ \$count -lt $num_retries ]]; do
134202
bash /launch-worker.sh
135203
count=\$((count+1))
136-
echo "Worker failed \$count times, restarting..."
204+
echo "Worker failed \$count/$num_retries times, restarting in 5 seconds..."
205+
sleep 5
137206
done
138-
echo ret_code=\$?
207+
touch $LOG_DIR/ENDED
208+
exit 1
139209
EOF
140210
)
141211
if [[ $i -eq 0 ]]; then

0 commit comments

Comments
 (0)