Skip to content

Commit 5ff96f7

Browse files
authored
Add nsys patch in ray sub template (#318)
1 parent 213c399 commit 5ff96f7

File tree

3 files changed

+132
-3
lines changed

3 files changed

+132
-3
lines changed

nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
4141
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
4242
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
4343

44+
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
45+
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
46+
47+
# Interval in seconds between syncs of Ray logs from ${RAY_TEMP_DIR}/session_*/logs to $LOG_DIR/ray/ (unset or empty disables the log sync sidecar)
48+
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
49+
4450
# Directory setup
4551
export CLUSTER_DIR={{ cluster_dir }}
4652
mkdir -p $CLUSTER_DIR
@@ -132,10 +138,44 @@ monitor-sidecar() {
132138
}
133139
monitor-sidecar &
134140
141+
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
142+
log-sync-sidecar() {
143+
set +x
144+
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
145+
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
146+
return
147+
fi
148+
mkdir -p $LOG_DIR/ray
149+
while true; do
150+
sleep $RAY_LOG_SYNC_FREQUENCY
151+
if ls ${RAY_TEMP_DIR}/session_[0-9]* > /dev/null 2>&1; then
152+
for session_dir in ${RAY_TEMP_DIR}/session_[0-9]*/; do
153+
if [[ -d "\$session_dir/logs" ]]; then
154+
session_name=\$(basename "\$session_dir")
155+
mkdir -p "$LOG_DIR/ray/\$session_name"
156+
if command -v rsync > /dev/null 2>&1; then
157+
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
158+
else
159+
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
160+
fi
161+
fi
162+
done
163+
fi
164+
if [[ -f "$LOG_DIR/ENDED" ]]; then
165+
echo "Log sync sidecar terminating..."
166+
break
167+
fi
168+
done
169+
}
170+
log-sync-sidecar &
171+
172+
# Patch nsight.py before starting Ray head
173+
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
174+
135175
cat <<EOFINNER | tee /launch-head.sh
136176
ray start --head \
137177
--disable-usage-stats \
138-
--temp-dir=/ray-cluster \
178+
--temp-dir=${RAY_TEMP_DIR} \
139179
--resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
140180
--node-ip-address="$head_node_ip" \
141181
--port=${PORT} \
@@ -209,6 +249,9 @@ monitor-sidecar() {
209249
}
210250
monitor-sidecar &
211251
252+
# Patch nsight.py before starting Ray worker
253+
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
254+
212255
cat <<EOFINNER | tee /launch-worker.sh
213256
sleep 5
214257
ray start --address "$ip_head" \

test/core/execution/artifacts/expected_ray_cluster.sub

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
4343
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
4444
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
4545

46+
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
47+
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
48+
49+
# Interval in seconds between syncs of Ray logs from ${RAY_TEMP_DIR}/session_*/logs to $LOG_DIR/ray/ (unset or empty disables the log sync sidecar)
50+
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
51+
4652
# Directory setup
4753
export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster
4854
mkdir -p $CLUSTER_DIR
@@ -126,10 +132,44 @@ monitor-sidecar() {
126132
}
127133
monitor-sidecar &
128134
135+
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
136+
log-sync-sidecar() {
137+
set +x
138+
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
139+
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
140+
return
141+
fi
142+
mkdir -p $LOG_DIR/ray
143+
while true; do
144+
sleep $RAY_LOG_SYNC_FREQUENCY
145+
if ls ${RAY_TEMP_DIR}/session_[0-9]* > /dev/null 2>&1; then
146+
for session_dir in ${RAY_TEMP_DIR}/session_[0-9]*/; do
147+
if [[ -d "\$session_dir/logs" ]]; then
148+
session_name=\$(basename "\$session_dir")
149+
mkdir -p "$LOG_DIR/ray/\$session_name"
150+
if command -v rsync > /dev/null 2>&1; then
151+
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
152+
else
153+
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
154+
fi
155+
fi
156+
done
157+
fi
158+
if [[ -f "$LOG_DIR/ENDED" ]]; then
159+
echo "Log sync sidecar terminating..."
160+
break
161+
fi
162+
done
163+
}
164+
log-sync-sidecar &
165+
166+
# Patch nsight.py before starting Ray head
167+
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
168+
129169
cat <<EOFINNER | tee /launch-head.sh
130170
ray start --head \
131171
--disable-usage-stats \
132-
--temp-dir=/ray-cluster \
172+
--temp-dir=${RAY_TEMP_DIR} \
133173
--resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
134174
--node-ip-address="$head_node_ip" \
135175
--port=${PORT} \
@@ -199,6 +239,9 @@ monitor-sidecar() {
199239
}
200240
monitor-sidecar &
201241
242+
# Patch nsight.py before starting Ray worker
243+
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
244+
202245
cat <<EOFINNER | tee /launch-worker.sh
203246
sleep 5
204247
ray start --address "$ip_head" \

test/core/execution/artifacts/expected_ray_cluster_ssh.sub

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
4444
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
4545
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
4646

47+
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
48+
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
49+
50+
# Interval in seconds between syncs of Ray logs from ${RAY_TEMP_DIR}/session_*/logs to $LOG_DIR/ray/ (unset or empty disables the log sync sidecar)
51+
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
52+
4753
# Directory setup
4854
export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
4955
mkdir -p $CLUSTER_DIR
@@ -131,10 +137,44 @@ monitor-sidecar() {
131137
}
132138
monitor-sidecar &
133139
140+
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
141+
log-sync-sidecar() {
142+
set +x
143+
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
144+
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
145+
return
146+
fi
147+
mkdir -p $LOG_DIR/ray
148+
while true; do
149+
sleep $RAY_LOG_SYNC_FREQUENCY
150+
if ls ${RAY_TEMP_DIR}/session_[0-9]* > /dev/null 2>&1; then
151+
for session_dir in ${RAY_TEMP_DIR}/session_[0-9]*/; do
152+
if [[ -d "\$session_dir/logs" ]]; then
153+
session_name=\$(basename "\$session_dir")
154+
mkdir -p "$LOG_DIR/ray/\$session_name"
155+
if command -v rsync > /dev/null 2>&1; then
156+
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
157+
else
158+
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
159+
fi
160+
fi
161+
done
162+
fi
163+
if [[ -f "$LOG_DIR/ENDED" ]]; then
164+
echo "Log sync sidecar terminating..."
165+
break
166+
fi
167+
done
168+
}
169+
log-sync-sidecar &
170+
171+
# Patch nsight.py before starting Ray head
172+
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
173+
134174
cat <<EOFINNER | tee /launch-head.sh
135175
ray start --head \
136176
--disable-usage-stats \
137-
--temp-dir=/ray-cluster \
177+
--temp-dir=${RAY_TEMP_DIR} \
138178
--resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
139179
--node-ip-address="$head_node_ip" \
140180
--port=${PORT} \
@@ -206,6 +246,9 @@ monitor-sidecar() {
206246
}
207247
monitor-sidecar &
208248
249+
# Patch nsight.py before starting Ray worker
250+
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
251+
209252
cat <<EOFINNER | tee /launch-worker.sh
210253
sleep 5
211254
ray start --address "$ip_head" \

0 commit comments

Comments
 (0)