Commit bdfbb26

Fallback to squeue when sacct is missing in SlurmProvider (#3591)
Adds an internal check to determine whether the Slurm provider should use the sacct or the squeue command. Some Slurm clusters do not run the accounting database that sacct relies on. With this change, clusters that have the database keep using sacct, which can be easier on the Slurm scheduler, while clusters without it fall back to squeue, which should work on all clusters. Fixes #3590
1 parent 73f6f65 commit bdfbb26
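
In outline, the commit probes sacct once when the provider is constructed and keeps whichever status command works. A minimal standalone sketch of that probe (a hypothetical helper for illustration only; the real code runs the probe through self.execute_wait inside SlurmProvider.__init__ and stores the chosen template on self._cmd, as the diff below shows):

import subprocess

# Command templates taken from the diff; '{0}' is filled with a comma-separated job id list.
SACCT_CMD = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
SQUEUE_CMD = "squeue --noheader --format='%i %t' --job '{0}'"

def pick_status_command() -> str:
    """Return the sacct template if accounting works, otherwise the squeue one."""
    try:
        probe = subprocess.run(["sacct", "-X"], capture_output=True, text=True)
    except FileNotFoundError:
        # sacct binary not installed at all
        return SQUEUE_CMD
    # Without an accounting database, sacct typically exits non-zero with
    # "Slurm accounting storage is disabled" on stderr.
    return SACCT_CMD if probe.returncode == 0 else SQUEUE_CMD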

1 file changed: parsl/providers/slurm/slurm.py (40 additions, 10 deletions)
@@ -20,7 +20,7 @@
 logger = logging.getLogger(__name__)

 # From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
-translate_table = {
+sacct_translate_table = {
     'PENDING': JobState.PENDING,
     'RUNNING': JobState.RUNNING,
     'CANCELLED': JobState.CANCELLED,
@@ -37,6 +37,20 @@
     'REQUEUED': JobState.PENDING
 }

+squeue_translate_table = {
+    'PD': JobState.PENDING,
+    'R': JobState.RUNNING,
+    'CA': JobState.CANCELLED,
+    'CF': JobState.PENDING,  # (configuring),
+    'CG': JobState.RUNNING,  # (completing),
+    'CD': JobState.COMPLETED,
+    'F': JobState.FAILED,  # (failed),
+    'TO': JobState.TIMEOUT,  # (timeout),
+    'NF': JobState.FAILED,  # (node failure),
+    'RV': JobState.FAILED,  # (revoked) and
+    'SE': JobState.FAILED  # (special exit state)
+}
+

 class SlurmProvider(ClusterProvider, RepresentationMixin):
     """Slurm Execution Provider
@@ -155,6 +169,23 @@ def __init__(self,

         self.regex_job_id = regex_job_id
         self.worker_init = worker_init + '\n'
+        # Check if sacct works and if not fall back to squeue
+        cmd = "sacct -X"
+        logger.debug("Executing %s", cmd)
+        retcode, stdout, stderr = self.execute_wait(cmd)
+        # If sacct fails it should return retcode=1 stderr="Slurm accounting storage is disabled"
+        logger.debug(f"sacct returned retcode={retcode} stderr={stderr}")
+        if retcode == 0:
+            logger.debug("using sacct to get job status")
+            # Using state%20 to get enough characters to not truncate output
+            # of the state. Without output can look like "<job_id> CANCELLED+"
+            self._cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
+            self._translate_table = sacct_translate_table
+        else:
+            logger.debug(f"sacct failed with retcode={retcode}")
+            logger.debug("falling back to using squeue to get job status")
+            self._cmd = "squeue --noheader --format='%i %t' --job '{0}'"
+            self._translate_table = squeue_translate_table

     def _status(self):
         '''Returns the status list for a list of job_ids
@@ -172,16 +203,14 @@ def _status(self):
             logger.debug('No active jobs, skipping status update')
             return

-        # Using state%20 to get enough characters to not truncate output
-        # of the state. Without output can look like "<job_id> CANCELLED+"
-        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
+        cmd = self._cmd.format(job_id_list)
         logger.debug("Executing %s", cmd)
         retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("sacct returned %s %s", stdout, stderr)
+        logger.debug("sacct/squeue returned %s %s", stdout, stderr)

         # Execute_wait failed. Do no update
         if retcode != 0:
-            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
+            logger.warning("sacct/squeue failed with non-zero exit code {}".format(retcode))
             return

         jobs_missing = set(self.resources.keys())
@@ -193,19 +222,20 @@ def _status(self):
             # For example "<job_id> CANCELLED by <user_id>"
             # This splits and ignores anything past the first two unpacked values
             job_id, slurm_state, *ignore = line.split()
-            if slurm_state not in translate_table:
+            if slurm_state not in self._translate_table:
                 logger.warning(f"Slurm status {slurm_state} is not recognized")
-            status = translate_table.get(slurm_state, JobState.UNKNOWN)
+            status = self._translate_table.get(slurm_state, JobState.UNKNOWN)
             logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
             self.resources[job_id]['status'] = JobStatus(status,
                                                          stdout_path=self.resources[job_id]['job_stdout_path'],
                                                          stderr_path=self.resources[job_id]['job_stderr_path'])
             jobs_missing.remove(job_id)

         # sacct can get job info after jobs have completed so this path shouldn't be hit
-        # log a warning if there are missing jobs for some reason
+        # squeue does not report on jobs that are not running. So we are filling in the
+        # blanks for missing jobs, we might lose some information about why the jobs failed.
         for missing_job in jobs_missing:
-            logger.warning("Updating missing job {} to completed status".format(missing_job))
+            logger.debug("Updating missing job {} to completed status".format(missing_job))
             self.resources[missing_job]['status'] = JobStatus(
                 JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
                 stderr_path=self.resources[missing_job]['job_stderr_path'])
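
For reference, _status parses both commands' output the same way: each line starts with the job id and a state token, which is looked up in whichever translate table was selected. A self-contained sketch of that lookup for squeue output (the JobState enum here is a stand-in for parsl's JobState, and the sample job ids are made up; the table contents are copied from the diff above):

from enum import Enum, auto

class JobState(Enum):
    # Stand-in for parsl's JobState enum, just for this sketch
    PENDING = auto()
    RUNNING = auto()
    CANCELLED = auto()
    COMPLETED = auto()
    FAILED = auto()
    TIMEOUT = auto()
    UNKNOWN = auto()

# squeue's compact state codes, as in the diff
squeue_translate_table = {
    'PD': JobState.PENDING,
    'R': JobState.RUNNING,
    'CA': JobState.CANCELLED,
    'CF': JobState.PENDING,    # configuring
    'CG': JobState.RUNNING,    # completing
    'CD': JobState.COMPLETED,
    'F': JobState.FAILED,
    'TO': JobState.TIMEOUT,
    'NF': JobState.FAILED,     # node failure
    'RV': JobState.FAILED,     # revoked
    'SE': JobState.FAILED,     # special exit state
}

# squeue --noheader --format='%i %t' prints one "<jobid> <state>" pair per line
sample_stdout = "12345 PD\n12346 R\n12347 CG\n"
for line in sample_stdout.strip().splitlines():
    job_id, slurm_state, *_ = line.split()
    status = squeue_translate_table.get(slurm_state, JobState.UNKNOWN)
    print(job_id, status)

The same loop handles sacct output, whose state column uses the longer names in sacct_translate_table.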
