File tree Expand file tree Collapse file tree 1 file changed +11
-1
lines changed
Expand file tree Collapse file tree 1 file changed +11
-1
lines changed Original file line number Diff line number Diff line change @@ -48,9 +48,12 @@ tail_pid=$!
4848
4949# Wait for job to complete with retry logic for transient squeue failures
5050squeue_failures=0
51+ no_sacct_retries=0
52+ max_no_sacct_retries=5
5153while true; do
5254 if squeue -j "$job_id" &>/dev/null; then
5355 squeue_failures=0
56+ no_sacct_retries=0
5457 else
5558 squeue_failures=$(( squeue_failures + 1 ))
5659 # Check if job actually completed using sacct (if available)
@@ -68,7 +71,14 @@ while true; do
6871 ;;
6972 esac
7073 else
71- # No sacct: avoid false positive by doing an extra check cycle
74+ # No sacct: track retries to avoid infinite loop
75+ no_sacct_retries=$(( no_sacct_retries + 1 ))
76+ if [ $no_sacct_retries -ge $max_no_sacct_retries ]; then
77+ echo " Warning: sacct unavailable and job not in squeue after $(( no_sacct_retries * 5 )) seconds"
78+ echo " Assuming job completed (output file exists and squeue consistently fails)"
79+ break
80+ fi
81+ # Reset squeue_failures to retry (but track no_sacct_retries to prevent infinite loop)
7282 squeue_failures=2
7383 fi
7484 fi
You can’t perform that action at this time.
0 commit comments