Skip to content

Commit 1f21bc5

Browse files
committed
sync logs
Signed-off-by: Hemil Desai <hemild@nvidia.com>
1 parent 7d3ba61 commit 1f21bc5

File tree

3 files changed

+102
-0
lines changed

3 files changed

+102
-0
lines changed

nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
4141
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
4242
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
4343

44+
# Interval in seconds between syncs of logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/; leave empty to disable syncing
45+
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
46+
4447
# Directory setup
4548
export CLUSTER_DIR={{ cluster_dir }}
4649
mkdir -p $CLUSTER_DIR
@@ -132,6 +135,37 @@ monitor-sidecar() {
132135
}
133136
monitor-sidecar &
134137
138+
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
139+
# log-sync-sidecar: periodically copies the logs directory of every
# /tmp/ray/session_N directory into LOG_DIR/ray/SESSION_NAME until the file
# LOG_DIR/ENDED exists. Takes no arguments. Reads RAY_LOG_SYNC_FREQUENCY and
# LOG_DIR. Returns immediately when RAY_LOG_SYNC_FREQUENCY is empty.
# NOTE(review): the dollar signs on session_dir and session_name are
# backslash-escaped while those on LOG_DIR and RAY_LOG_SYNC_FREQUENCY are not.
# This looks intended for text that is expanded once before it is executed,
# for example an unquoted heredoc. The enclosing context is not visible in
# this hunk, so confirm before changing any of the escaping.
log-sync-sidecar() {
140+
# Turn off xtrace inside this function, presumably to keep the periodic loop
# out of the trace output.
set +x
141+
# Syncing is opt-in: an empty frequency means do nothing.
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
142+
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
143+
return
144+
fi
145+
# Destination root for all synced sessions.
mkdir -p $LOG_DIR/ray
146+
while true; do
147+
# The sleep comes first, so the first copy happens one full interval after
# the sidecar starts.
sleep $RAY_LOG_SYNC_FREQUENCY
148+
# Guard: skip this pass when nothing matches the session glob. The digit
# class restricts the match to names that have a digit right after the
# session_ prefix. NOTE(review): presumably this skips non-numbered entries
# such as a session_latest link. TODO confirm.
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
149+
# The trailing slash limits the glob to directories.
for session_dir in /tmp/ray/session_[0-9]*/; do
150+
# Only sessions that already contain a logs directory are copied.
if [[ -d "\$session_dir/logs" ]]; then
151+
session_name=\$(basename "\$session_dir")
152+
mkdir -p "$LOG_DIR/ray/\$session_name"
153+
# Prefer rsync when it is installed and fall back to cp otherwise.
if command -v rsync > /dev/null 2>&1; then
154+
# Flags: archive mode, human-readable sizes, keep partial files and show
# progress. The trailing slash on the source copies the directory contents
# into the destination logs folder. Stderr is discarded and a failure is
# ignored, so a failed sync does not stop the loop.
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
155+
else
156+
# Fallback: recursive copy of the whole logs directory into the session
# folder. Unlike the rsync branch, errors are not suppressed here.
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
157+
fi
158+
fi
159+
done
160+
fi
161+
# The ENDED marker is tested after the sync step, so the pass in which the
# marker is first seen still copies logs before the loop exits.
if [[ -f "$LOG_DIR/ENDED" ]]; then
162+
echo "Log sync sidecar terminating..."
163+
break
164+
fi
165+
done
166+
}
167+
log-sync-sidecar &
168+
135169
# Patch nsight.py before starting Ray head
136170
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
137171

test/core/execution/artifacts/expected_ray_cluster.sub

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
4343
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
4444
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
4545

46+
# Interval in seconds between syncs of logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/; leave empty to disable syncing
47+
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
48+
4649
# Directory setup
4750
export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster
4851
mkdir -p $CLUSTER_DIR
@@ -126,6 +129,37 @@ monitor-sidecar() {
126129
}
127130
monitor-sidecar &
128131
132+
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
133+
# log-sync-sidecar: periodically copies the logs directory of every
# /tmp/ray/session_N directory into LOG_DIR/ray/SESSION_NAME until the file
# LOG_DIR/ENDED exists. Takes no arguments. Reads RAY_LOG_SYNC_FREQUENCY and
# LOG_DIR. Returns immediately when RAY_LOG_SYNC_FREQUENCY is empty.
# NOTE(review): the dollar signs on session_dir and session_name are
# backslash-escaped while those on LOG_DIR and RAY_LOG_SYNC_FREQUENCY are not.
# This looks intended for text that is expanded once before it is executed,
# for example an unquoted heredoc. The enclosing context is not visible in
# this hunk, so confirm before changing any of the escaping.
log-sync-sidecar() {
134+
# Turn off xtrace inside this function, presumably to keep the periodic loop
# out of the trace output.
set +x
135+
# Syncing is opt-in: an empty frequency means do nothing.
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
136+
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
137+
return
138+
fi
139+
# Destination root for all synced sessions.
mkdir -p $LOG_DIR/ray
140+
while true; do
141+
# The sleep comes first, so the first copy happens one full interval after
# the sidecar starts.
sleep $RAY_LOG_SYNC_FREQUENCY
142+
# Guard: skip this pass when nothing matches the session glob. The digit
# class restricts the match to names that have a digit right after the
# session_ prefix. NOTE(review): presumably this skips non-numbered entries
# such as a session_latest link. TODO confirm.
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
143+
# The trailing slash limits the glob to directories.
for session_dir in /tmp/ray/session_[0-9]*/; do
144+
# Only sessions that already contain a logs directory are copied.
if [[ -d "\$session_dir/logs" ]]; then
145+
session_name=\$(basename "\$session_dir")
146+
mkdir -p "$LOG_DIR/ray/\$session_name"
147+
# Prefer rsync when it is installed and fall back to cp otherwise.
if command -v rsync > /dev/null 2>&1; then
148+
# Flags: archive mode, human-readable sizes, keep partial files and show
# progress. The trailing slash on the source copies the directory contents
# into the destination logs folder. Stderr is discarded and a failure is
# ignored, so a failed sync does not stop the loop.
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
149+
else
150+
# Fallback: recursive copy of the whole logs directory into the session
# folder. Unlike the rsync branch, errors are not suppressed here.
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
151+
fi
152+
fi
153+
done
154+
fi
155+
# The ENDED marker is tested after the sync step, so the pass in which the
# marker is first seen still copies logs before the loop exits.
if [[ -f "$LOG_DIR/ENDED" ]]; then
156+
echo "Log sync sidecar terminating..."
157+
break
158+
fi
159+
done
160+
}
161+
log-sync-sidecar &
162+
129163
# Patch nsight.py before starting Ray head
130164
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
131165

test/core/execution/artifacts/expected_ray_cluster_ssh.sub

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
4444
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
4545
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
4646

47+
# Interval in seconds between syncs of logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/; leave empty to disable syncing
48+
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}
49+
4750
# Directory setup
4851
export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
4952
mkdir -p $CLUSTER_DIR
@@ -131,6 +134,37 @@ monitor-sidecar() {
131134
}
132135
monitor-sidecar &
133136
137+
# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds
138+
# log-sync-sidecar: periodically copies the logs directory of every
# /tmp/ray/session_N directory into LOG_DIR/ray/SESSION_NAME until the file
# LOG_DIR/ENDED exists. Takes no arguments. Reads RAY_LOG_SYNC_FREQUENCY and
# LOG_DIR. Returns immediately when RAY_LOG_SYNC_FREQUENCY is empty.
# NOTE(review): the dollar signs on session_dir and session_name are
# backslash-escaped while those on LOG_DIR and RAY_LOG_SYNC_FREQUENCY are not.
# This looks intended for text that is expanded once before it is executed,
# for example an unquoted heredoc. The enclosing context is not visible in
# this hunk, so confirm before changing any of the escaping.
log-sync-sidecar() {
139+
# Turn off xtrace inside this function, presumably to keep the periodic loop
# out of the trace output.
set +x
140+
# Syncing is opt-in: an empty frequency means do nothing.
if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
141+
echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
142+
return
143+
fi
144+
# Destination root for all synced sessions.
mkdir -p $LOG_DIR/ray
145+
while true; do
146+
# The sleep comes first, so the first copy happens one full interval after
# the sidecar starts.
sleep $RAY_LOG_SYNC_FREQUENCY
147+
# Guard: skip this pass when nothing matches the session glob. The digit
# class restricts the match to names that have a digit right after the
# session_ prefix. NOTE(review): presumably this skips non-numbered entries
# such as a session_latest link. TODO confirm.
if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then
148+
# The trailing slash limits the glob to directories.
for session_dir in /tmp/ray/session_[0-9]*/; do
149+
# Only sessions that already contain a logs directory are copied.
if [[ -d "\$session_dir/logs" ]]; then
150+
session_name=\$(basename "\$session_dir")
151+
mkdir -p "$LOG_DIR/ray/\$session_name"
152+
# Prefer rsync when it is installed and fall back to cp otherwise.
if command -v rsync > /dev/null 2>&1; then
153+
# Flags: archive mode, human-readable sizes, keep partial files and show
# progress. The trailing slash on the source copies the directory contents
# into the destination logs folder. Stderr is discarded and a failure is
# ignored, so a failed sync does not stop the loop.
rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
154+
else
155+
# Fallback: recursive copy of the whole logs directory into the session
# folder. Unlike the rsync branch, errors are not suppressed here.
cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/"
156+
fi
157+
fi
158+
done
159+
fi
160+
# The ENDED marker is tested after the sync step, so the pass in which the
# marker is first seen still copies logs before the loop exits.
if [[ -f "$LOG_DIR/ENDED" ]]; then
161+
echo "Log sync sidecar terminating..."
162+
break
163+
fi
164+
done
165+
}
166+
log-sync-sidecar &
167+
134168
# Patch nsight.py before starting Ray head
135169
sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
136170

0 commit comments

Comments
 (0)