Skip to content

Commit 09cda88

Browse files
committed
fix up some printing
1 parent db77fb9 commit 09cda88

File tree

2 files changed

+53
-19
lines changed

2 files changed

+53
-19
lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,26 +50,33 @@ while [ ! -f "$output_file" ]; do
5050
done
5151

5252
echo "=== Streaming output for job $job_id ==="
53-
# Stream output while job runs (explicitly redirect to ensure output visibility)
54-
# Use stdbuf for unbuffered output to ensure immediate display in CI logs
55-
stdbuf -oL -eL tail -f "$output_file" 2>&1 &
56-
tail_pid=$!
5753

58-
# Give tail a moment to start and show initial output
59-
sleep 2
54+
# Start tail and redirect its output to file descriptor 3 for multiplexing
55+
# This allows us to stream tail output while also printing heartbeat messages
56+
exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57+
tail_pid=$!
6058

61-
# Wait for job to complete with retry logic for transient squeue failures
59+
# Monitor job status and stream output simultaneously
6260
squeue_failures=0
63-
heartbeat_counter=0
61+
last_heartbeat=$(date +%s)
62+
6463
while true; do
65-
if squeue -j "$job_id" &>/dev/null; then
66-
squeue_failures=0
67-
# Print heartbeat every 60 seconds (12 iterations * 5 sec)
68-
heartbeat_counter=$((heartbeat_counter + 1))
69-
if [ $((heartbeat_counter % 12)) -eq 0 ]; then
70-
echo "[$(date +%H:%M:%S)] Job $job_id still running..."
64+
# Try to read from tail output (bounded wait: read -t 0.1 blocks at most 0.1s per attempt)
65+
# Read multiple lines if available to avoid falling behind
66+
lines_read=0
67+
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
68+
echo "$line"
69+
lines_read=$((lines_read + 1))
70+
last_heartbeat=$(date +%s)
71+
# Limit burst reads to avoid starving the status check
72+
if [ $lines_read -ge 100 ]; then
73+
break
7174
fi
72-
else
75+
done
76+
77+
# Check job status
78+
current_time=$(date +%s)
79+
if ! squeue -j "$job_id" &>/dev/null; then
7380
squeue_failures=$((squeue_failures + 1))
7481
# Check if job actually completed using sacct (if available)
7582
if [ $squeue_failures -ge 3 ]; then
@@ -87,14 +94,41 @@ while true; do
8794
;;
8895
esac
8996
else
90-
# No sacct: avoid false positive by doing an extra check cycle
91-
squeue_failures=2
97+
# No sacct: assume job completed after 3 failures
98+
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
99+
break
92100
fi
93101
fi
102+
else
103+
squeue_failures=0
104+
# Print heartbeat if no output for 60 seconds
105+
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
106+
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
107+
last_heartbeat=$current_time
108+
fi
109+
fi
110+
111+
# Sleep briefly between status checks
112+
sleep 1
113+
done
114+
115+
# Drain any remaining output from tail after job completes
116+
echo "Draining remaining output..."
117+
drain_count=0
118+
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
119+
echo "$line"
120+
drain_count=$((drain_count + 1))
121+
# Safety limit to avoid infinite loop
122+
if [ $drain_count -ge 10000 ]; then
123+
echo "Warning: Truncating remaining output after 10000 lines"
124+
break
94125
fi
95-
sleep 5
96126
done
97127

128+
# Close the file descriptor and kill tail
129+
exec 3<&-
130+
kill "${tail_pid}" 2>/dev/null || true
131+
98132
# Wait for output file to finish growing (stabilize); the fd 3 tail stream was already closed above
99133
if [ -f "$output_file" ]; then
100134
last_size=-1

.github/workflows/phoenix/submit-bench.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ sbatch <<EOT
4242
#SBATCH --account=gts-sbryngelson3 # charge account
4343
#SBATCH -N1 # Number of nodes required
4444
$sbatch_device_opts
45-
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
45+
#SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins)
4646
#SBATCH -q embers # QOS Name
4747
#SBATCH -o$job_slug.out # Combined output and error messages file
4848
#SBATCH -W # Do not exit until the submitted job terminates.

0 commit comments

Comments
 (0)