@@ -18,19 +18,26 @@ CONTAINER=$CONTAINER
1818MOUNTS=$MOUNTS
1919COMMAND=${COMMAND:- } # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up.
2020# #######################################################
21- # Ray ports
22- GCS_SERVER_PORT=${GCS_SERVER_PORT:- 6379}
23- DASHBOARD_PORT=${DASHBOARD_PORT:- 8265}
24- OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:- 8076}
25- NODE_MANAGER_PORT=${NODE_MANAGER_PORT:- 8077}
26- DASHBOARD_AGENT_PORT=${DASHBOARD_AGENT_PORT:- 52365}
27- DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:- 52366}
28- METRICS_PORT=${METRICS_PORT:- 9002}
29- # On our clusters, the largest port range on an idle worker appeared between 52367-64607
21+ # Ports used on every node. Defaults should be odd: the head and worker[0] share a node, so workers use these ports as-is and the head uses each port + 1.
22+ NODE_MANAGER_PORT=${NODE_MANAGER_PORT:- 53001}
23+ OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:- 53003}
24+ RUNTIME_ENV_AGENT_PORT=${RUNTIME_ENV_AGENT_PORT:- 53005}
25+ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:- 53007}
26+ METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:- 53009}
27+
28+ # Ports for the head node
29+ PORT=${PORT:- 6379}
30+ RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:- 10001}
31+ # REDIS_SHARD_PORTS=${REDIS_SHARD_PORTS:-"random"} ??
32+ DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:- 52367}
33+ DASHBOARD_PORT=${DASHBOARD_PORT:- 8265} # Also used by debugger
34+ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:- 52365}
35+
36+ # On our clusters, the largest port range on an idle worker appeared between 52369-64607
3037# (not including the other ports set by this script). So this range is chosen to be
3138# somewhere in the middle
32- MIN_WORKER_PORT=${MIN_WORKER_PORT:- 53001 }
33- MAX_WORKER_PORT=${MAX_WORKER_PORT:- 53257 }
39+ MIN_WORKER_PORT=${MIN_WORKER_PORT:- 54001 }
40+ MAX_WORKER_PORT=${MAX_WORKER_PORT:- 54257 }
3441# #######################################################
3542
3643# Defaults to placing uv cache inside the SLURM_SUBMIT_DIR
@@ -39,7 +46,8 @@ UV_CACHE_DIR="${UV_CACHE_DIR:-$SLURM_SUBMIT_DIR/uv_cache}"
3946mkdir -p $UV_CACHE_DIR
4047
4148# Create logs directory
42- LOG_DIR=" $SLURM_SUBMIT_DIR /$SLURM_JOB_ID -logs"
49+ BASE_LOG_DIR=${BASE_LOG_DIR:- $SLURM_SUBMIT_DIR }
50+ LOG_DIR=" $BASE_LOG_DIR /$SLURM_JOB_ID -logs"
4351mkdir -p $LOG_DIR
4452
4553COMMON_SRUN_ARGS=" "
@@ -56,7 +64,7 @@ COMMON_SRUN_ARGS+=" --gres=gpu:8"
5664# Number of GPUs per node
5765gpus_per_node=8
5866
59- num_retries=5
67+ num_retries=3
6068
6169# Getting the node names and IP addresses in the SLURM allocation
6270nodes=$( scontrol show hostnames " $SLURM_JOB_NODELIST " )
7280head_node=${nodes_array[0]}
7381head_node_ip=${ip_addresses_array[0]}
7482
75- ip_head=$head_node_ip :$GCS_SERVER_PORT
83+ ip_head=$head_node_ip :$PORT
7684
7785# First we start the head of the ray cluster on one of the physical nodes
7886# Set GPU/CPU resources to 0 to avoid scheduling on the head node
@@ -82,30 +90,60 @@ head_cmd=$(cat <<EOF
8290# Overlapping srun commands will check this file to determine if we can overlap a container command
8391touch $LOG_DIR /STARTED_RAY_HEAD
8492env
93+
94+ exit-dramatically() {
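95+ # Kill child processes, then send SIGTERM to the whole process group so the srun step terminates. NOTE(review): the shell-PID variable on the next line is not escaped in this heredoc, so it is expanded when the command string is built, not when it runs -- confirm this is intended.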
96+ pkill -P $$ || true
97+ kill -TERM 0 || true
98+ # As a last resort, exit with a non-zero code
99+ exit 1
100+ }
101+ export -f exit-dramatically
102+
103+ # Background process to check for ENDED file
104+ monitor-sidecar() {
105+ set +x
106+ while true; do
107+ sleep 60
108+ if [[ -f "$LOG_DIR /ENDED" ]]; then
109+ echo "Detected ENDED file, terminating..."
110+ exit-dramatically
111+ fi
112+ done
113+ }
114+ monitor-sidecar &
115+
85116cat <<EOFINNER | tee /launch-head.sh
86117ray start --head \
87- --disable-usage-stats \
88- --num-cpus=0 \
89- --num-gpus=0 \
90- --node-ip-address="$head_node_ip " \
91- --port=${GCS_SERVER_PORT} \
92- --dashboard-port=${DASHBOARD_PORT} \
93- --object-manager-port=${OBJECT_MANAGER_PORT} \
94- --node-manager-port=${NODE_MANAGER_PORT} \
95- --metrics-export-port=${METRICS_PORT} \
96- --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
97- --dashboard-agent-listen-port=${DASHBOARD_AGENT_PORT} \
98- --block
118+ --disable-usage-stats \
119+ --num-cpus=0 \
120+ --num-gpus=0 \
121+ --node-ip-address="$head_node_ip " \
122+ --port=${PORT} \
123+ --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
124+ --dashboard-grpc-port=${DASHBOARD_GRPC_PORT} \
125+ --dashboard-port=${DASHBOARD_PORT} \
126+ \
127+ --node-manager-port=$(( ${NODE_MANAGER_PORT} + 1 )) \
128+ --object-manager-port=$(( ${OBJECT_MANAGER_PORT} + 1 )) \
129+ --runtime-env-agent-port=$(( ${RUNTIME_ENV_AGENT_PORT} + 1 )) \
130+ --dashboard-agent-grpc-port=$(( ${DASHBOARD_AGENT_GRPC_PORT} + 1 )) \
131+ --dashboard-agent-listen-port=$(( ${DASHBOARD_AGENT_LISTEN_PORT} + 1 )) \
132+ --metrics-export-port=$(( ${METRICS_EXPORT_PORT} + 1 )) \
133+ \
134+ --block
99135EOFINNER
100136chmod +x /launch-head.sh
101137
102138count=0
103- while true ; do
139+ while [[ \$ count -lt $num_retries ]] ; do
104140 bash /launch-head.sh
105141 count=\$ ((count+1))
106- echo "Head node failed \$ count times, restarting..."
142+ echo "Head node failed \$ count/$num_retries times, restarting in 5 seconds..."
143+ sleep 5
107144done
108- echo ret_code=\$ ?
145+ touch $LOG_DIR /ENDED
146+ exit 1
109147EOF
110148)
111149srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 -w " $head_node " -o $LOG_DIR /ray-head.log bash -x -c " $head_cmd " &
@@ -120,22 +158,54 @@ for ((i = 0; i < SLURM_JOB_NUM_NODES; i++)); do
120158
121159 worker_cmd=$( cat << EOF
122160env
161+
162+ exit-dramatically() {
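163+ # Kill child processes, then send SIGTERM to the whole process group so the srun step terminates. NOTE(review): the shell-PID variable on the next line is not escaped in this heredoc, so it is expanded when the command string is built, not when it runs -- confirm this is intended.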
164+ pkill -P $$ || true
165+ kill -TERM 0 || true
166+ # As a last resort, exit with a non-zero code
167+ exit 1
168+ }
169+
170+ # Background process to check for ENDED file
171+ monitor-sidecar() {
172+ set +x
173+ while true; do
174+ sleep 60
175+ if [[ -f "$LOG_DIR /ENDED" ]]; then
176+ echo "Detected ENDED file, terminating..."
177+ exit-dramatically
178+ fi
179+ done
180+ }
181+ monitor-sidecar &
182+
123183cat <<EOFINNER | tee /launch-worker.sh
124184ray start --address "$ip_head " \
125185 --disable-usage-stats \
126186 --resources="{\"worker_units\": $gpus_per_node }" \
127187 --min-worker-port=${MIN_WORKER_PORT} \
128188 --max-worker-port=${MAX_WORKER_PORT} \
189+ \
190+ --node-manager-port=${NODE_MANAGER_PORT} \
191+ --object-manager-port=${OBJECT_MANAGER_PORT} \
192+ --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
193+ --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
194+ --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
195+ --metrics-export-port=${METRICS_EXPORT_PORT} \
196+ \
129197 --block
130198EOFINNER
131199
132200count=0
133- while true ; do
201+ while [[ \$ count -lt $num_retries ]] ; do
134202 bash /launch-worker.sh
135203 count=\$ ((count+1))
136- echo "Worker failed \$ count times, restarting..."
204+ echo "Worker failed \$ count/$num_retries times, restarting in 5 seconds..."
205+ sleep 5
137206done
138- echo ret_code=\$ ?
207+ touch $LOG_DIR /ENDED
208+ exit 1
139209EOF
140210)
141211 if [[ $i -eq 0 ]]; then
0 commit comments