 logger = logging.getLogger(__name__)

 # From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
-translate_table = {
+sacct_translate_table = {
     'PENDING': JobState.PENDING,
     'RUNNING': JobState.RUNNING,
     'CANCELLED': JobState.CANCELLED,
     'REQUEUED': JobState.PENDING
 }

+squeue_translate_table = {
+    'PD': JobState.PENDING,
+    'R': JobState.RUNNING,
+    'CA': JobState.CANCELLED,
+    'CF': JobState.PENDING,  # (configuring),
+    'CG': JobState.RUNNING,  # (completing),
+    'CD': JobState.COMPLETED,
+    'F': JobState.FAILED,  # (failed),
+    'TO': JobState.TIMEOUT,  # (timeout),
+    'NF': JobState.FAILED,  # (node failure),
+    'RV': JobState.FAILED,  # (revoked) and
+    'SE': JobState.FAILED  # (special exit state)
+}
+

 class SlurmProvider(ClusterProvider, RepresentationMixin):
     """Slurm Execution Provider
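For context: sacct reports long-form state names while squeue reports two-letter codes, which is why the provider now needs two separate translation tables. Below is a minimal, self-contained sketch of the lookup behaviour, using a stand-in enum instead of parsl's JobState and trimmed-down tables:

from enum import Enum, auto


class State(Enum):  # stand-in for parsl's JobState, for illustration only
    PENDING = auto()
    RUNNING = auto()
    COMPLETED = auto()
    UNKNOWN = auto()


sacct_table = {'PENDING': State.PENDING, 'RUNNING': State.RUNNING, 'COMPLETED': State.COMPLETED}
squeue_table = {'PD': State.PENDING, 'R': State.RUNNING, 'CD': State.COMPLETED}

# Unrecognized codes fall back to UNKNOWN, mirroring the .get() call in _status().
print(sacct_table.get('COMPLETED', State.UNKNOWN))  # State.COMPLETED
print(squeue_table.get('CG', State.UNKNOWN))        # State.UNKNOWN (not in the trimmed table)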
@@ -155,6 +169,23 @@ def __init__(self,

         self.regex_job_id = regex_job_id
         self.worker_init = worker_init + '\n'
+        # Check if sacct works and, if not, fall back to squeue
+        cmd = "sacct -X"
+        logger.debug("Executing %s", cmd)
+        retcode, stdout, stderr = self.execute_wait(cmd)
+        # If sacct fails it should return retcode=1 stderr="Slurm accounting storage is disabled"
+        logger.debug(f"sacct returned retcode={retcode} stderr={stderr}")
+        if retcode == 0:
+            logger.debug("using sacct to get job status")
+            # Using state%20 to get enough characters to not truncate output
+            # of the state. Without it, output can look like "<job_id> CANCELLED+"
+            self._cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
+            self._translate_table = sacct_translate_table
+        else:
+            logger.debug(f"sacct failed with retcode={retcode}")
+            logger.debug("falling back to using squeue to get job status")
+            self._cmd = "squeue --noheader --format='%i %t' --job '{0}'"
+            self._translate_table = squeue_translate_table

     def _status(self):
         '''Returns the status list for a list of job_ids
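To make the template-and-format split concrete, here is a small sketch (with hypothetical job IDs) of how the command string stored in __init__ is expanded later in _status():

# The two templates chosen in __init__; '{0}' is filled with a comma-separated job list.
sacct_cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
squeue_cmd = "squeue --noheader --format='%i %t' --job '{0}'"

job_id_list = "1000001,1000002"  # hypothetical Slurm job IDs

print(sacct_cmd.format(job_id_list))
# sacct -X --noheader --format=jobid,state%20 --job '1000001,1000002'
print(squeue_cmd.format(job_id_list))
# squeue --noheader --format='%i %t' --job '1000001,1000002'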
@@ -172,16 +203,14 @@ def _status(self):
             logger.debug('No active jobs, skipping status update')
             return

-        # Using state%20 to get enough characters to not truncate output
-        # of the state. Without output can look like "<job_id> CANCELLED+"
-        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
+        cmd = self._cmd.format(job_id_list)
         logger.debug("Executing %s", cmd)
         retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("sacct returned %s %s", stdout, stderr)
+        logger.debug("sacct/squeue returned %s %s", stdout, stderr)

         # Execute_wait failed. Do not update
         if retcode != 0:
-            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
+            logger.warning("sacct/squeue failed with non-zero exit code {}".format(retcode))
             return

         jobs_missing = set(self.resources.keys())
@@ -193,19 +222,20 @@ def _status(self):
             # For example "<job_id> CANCELLED by <user_id>"
             # This splits and ignores anything past the first two unpacked values
             job_id, slurm_state, *ignore = line.split()
-            if slurm_state not in translate_table:
+            if slurm_state not in self._translate_table:
                 logger.warning(f"Slurm status {slurm_state} is not recognized")
-            status = translate_table.get(slurm_state, JobState.UNKNOWN)
+            status = self._translate_table.get(slurm_state, JobState.UNKNOWN)
             logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
             self.resources[job_id]['status'] = JobStatus(status,
                                                          stdout_path=self.resources[job_id]['job_stdout_path'],
                                                          stderr_path=self.resources[job_id]['job_stderr_path'])
             jobs_missing.remove(job_id)

         # sacct can get job info after jobs have completed so this path shouldn't be hit
-        # log a warning if there are missing jobs for some reason
+        # squeue does not report on jobs that are no longer in the queue, so we fill in
+        # the blanks for missing jobs; this may lose some information about why a job failed.
         for missing_job in jobs_missing:
-            logger.warning("Updating missing job {} to completed status".format(missing_job))
+            logger.debug("Updating missing job {} to completed status".format(missing_job))
             self.resources[missing_job]['status'] = JobStatus(
                 JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
                 stderr_path=self.resources[missing_job]['job_stderr_path'])
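Finally, a short sketch of why the existing line parsing works unchanged for both tools: the starred unpacking keeps only the first two tokens, so hypothetical sacct output such as "1000001 CANCELLED by 501" and squeue output such as "1000003 PD" both yield a (job_id, state) pair:

sample_lines = [
    "1000001 CANCELLED by 501",  # hypothetical sacct output
    "1000002 COMPLETED",         # hypothetical sacct output
    "1000003 PD",                # hypothetical squeue output
]

for line in sample_lines:
    # Same unpacking as _status(): keep the first two tokens, ignore the rest.
    job_id, slurm_state, *ignore = line.split()
    print(job_id, slurm_state)
# 1000001 CANCELLED
# 1000002 COMPLETED
# 1000003 PD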