@@ -50,26 +50,33 @@ while [ ! -f "$output_file" ]; do
5050done
5151
5252echo " === Streaming output for job $job_id ==="
53- # Stream output while job runs (explicitly redirect to ensure output visibility)
54- # Use stdbuf for unbuffered output to ensure immediate display in CI logs
55- stdbuf -oL -eL tail -f " $output_file " 2>&1 &
56- tail_pid=$!
5753
58- # Give tail a moment to start and show initial output
59- sleep 2
54+ # Run tail in a process substitution and read its stdout/stderr via file descriptor 3, so the loop below can interleave job output with heartbeat messages
55+ # NOTE(review): tail_pid is $! taken right after the exec - confirm it is the PID of the tail process, since it is what gets killed once streaming ends
56+ exec 3< <( stdbuf -oL -eL tail -f " $output_file " 2>&1 )
57+ tail_pid=$!
6058
61- # Wait for job to complete with retry logic for transient squeue failures
59+ # Monitor job status and stream output simultaneously
6260squeue_failures=0
63- heartbeat_counter=0
61+ last_heartbeat=$( date +%s)
62+
6463while true ; do
65- if squeue -j " $job_id " & > /dev/null; then
66- squeue_failures=0
67- # Print heartbeat every 60 seconds (12 iterations * 5 sec)
68- heartbeat_counter=$(( heartbeat_counter + 1 ))
69- if [ $(( heartbeat_counter % 12 )) -eq 0 ]; then
70- echo " [$( date +%H:%M:%S) ] Job $job_id still running..."
64+ # Echo whatever tail has produced on fd 3, waiting at most 0.1s per read so a quiet job never blocks the status check; each echoed line also resets last_heartbeat (one date call per line)
65+ # NOTE(review): when a read times out the loop ends without echoing $line, so any text read before the timeout is not printed - confirm partial lines cannot occur or handle them
66+ lines_read=0
67+ while IFS= read -r -t 0.1 line < & 3 2> /dev/null; do
68+ echo " $line "
69+ lines_read=$(( lines_read + 1 ))
70+ last_heartbeat=$( date +%s)
71+ # Cap each burst at 100 lines so a very chatty job cannot postpone the squeue status check that follows this loop
72+ if [ $lines_read -ge 100 ]; then
73+ break
7174 fi
72- else
75+ done
76+
77+ # Check job status
78+ current_time=$( date +%s)
79+ if ! squeue -j " $job_id " & > /dev/null; then
7380 squeue_failures=$(( squeue_failures + 1 ))
7481 # Check if job actually completed using sacct (if available)
7582 if [ $squeue_failures -ge 3 ]; then
@@ -87,14 +94,41 @@ while true; do
8794 ;;
8895 esac
8996 else
90- # No sacct: avoid false positive by doing an extra check cycle
91- squeue_failures=2
97+ # No sacct: assume job completed after 3 failures
98+ echo " [$( date +%H:%M:%S) ] Job $job_id no longer in queue"
99+ break
92100 fi
93101 fi
102+ else
103+ squeue_failures=0
104+ # Print a heartbeat once 60 seconds have passed since the last streamed output line or the previous heartbeat (both reset last_heartbeat)
105+ if [ $(( current_time - last_heartbeat)) -ge 60 ]; then
106+ echo " [$( date +%H:%M:%S) ] Job $job_id still running (no new output for 60s)..."
107+ last_heartbeat=$current_time
108+ fi
109+ fi
110+
111+ # Sleep briefly between status checks
112+ sleep 1
113+ done
114+
115+ # After the monitor loop exits, keep echoing lines from fd 3 until a read waits 0.5s without receiving a complete line
116+ echo " Draining remaining output..."
117+ drain_count=0
118+ while IFS= read -r -t 0.5 line < & 3 2> /dev/null; do
119+ echo " $line "
120+ drain_count=$(( drain_count + 1 ))
121+ # tail -f never closes its output, so besides the read timeout this 10000-line cap is the only way out if lines keep arriving
122+ if [ $drain_count -ge 10000 ]; then
123+ echo " Warning: Truncating remaining output after 10000 lines"
124+ break
94125 fi
95- sleep 5
96126done
97127
128+ # Close fd 3 and stop the tail started earlier; kill's stderr is discarded and its status forced to success, so a failed kill is silent and non-fatal
129+ exec 3< & -
130+ kill " ${tail_pid} " 2> /dev/null || true
131+
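98132# Wait for output file to finish growing (stabilize). NOTE(review): tail has already been killed just above, so output written during this wait is no longer streamed - confirm the ordering is intended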
99133if [ -f " $output_file " ]; then
100134 last_size=-1
0 commit comments