Skip to content

Commit 32c80f7

Browse files
committed
Fix infinite loop
1 parent 8d804e3 commit 32c80f7

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,12 @@ tail_pid=$!
4848

4949
# Wait for job to complete with retry logic for transient squeue failures
5050
squeue_failures=0
51+
no_sacct_retries=0
52+
max_no_sacct_retries=5
5153
while true; do
5254
if squeue -j "$job_id" &>/dev/null; then
5355
squeue_failures=0
56+
no_sacct_retries=0
5457
else
5558
squeue_failures=$((squeue_failures + 1))
5659
# Check if job actually completed using sacct (if available)
@@ -68,7 +71,14 @@ while true; do
6871
;;
6972
esac
7073
else
71-
# No sacct: avoid false positive by doing an extra check cycle
74+
# No sacct: track retries to avoid infinite loop
75+
no_sacct_retries=$((no_sacct_retries + 1))
76+
if [ $no_sacct_retries -ge $max_no_sacct_retries ]; then
77+
echo "Warning: sacct unavailable and job not in squeue after $((no_sacct_retries * 5)) seconds"
78+
echo "Assuming job completed (output file exists and squeue consistently fails)"
79+
break
80+
fi
81+
# Set squeue_failures back to 2 so the check is retried; no_sacct_retries (capped at max_no_sacct_retries) bounds the retries and prevents an infinite loop
7282
squeue_failures=2
7383
fi
7484
fi

0 commit comments

Comments
 (0)