Skip to content

Commit b107e47

Browse files
committed
Add logic to handle cases where the resource manager (RM) is not answering
1 parent 33b4a7b commit b107e47

File tree

1 file changed

+18
-3
lines changed

1 file changed

+18
-3
lines changed

batchspawner/batchspawner.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -273,8 +273,9 @@ async def read_job_state(self):
273273
format_template(self.batch_query_cmd, **subvars)))
274274
self.log.debug('Spawner querying job: ' + cmd)
275275
try:
276-
out = await self.run_command(cmd)
277-
self.job_status = out
276+
self.job_status = await self.run_command(cmd)
277+
except RuntimeError as e:
278+
self.job_status = e.args[0]
278279
except Exception as e:
279280
self.log.error('Error querying job ' + self.job_id)
280281
self.job_status = ''
@@ -326,6 +327,10 @@ def state_isrunning(self):
326327
"Return boolean indicating if job is running, likely by parsing self.job_status"
327328
raise NotImplementedError("Subclass must provide implementation")
328329

330+
def state_isunknown(self):
    """Return boolean indicating if job state retrieval failed because of the resource manager.

    Base-class default: the job state is never reported as unknown.
    Subclasses may override this, likely by parsing self.job_status.

    Returns:
        bool: always False in this default implementation.
    """
    # The default answer must be *returned*: `raise False` fails with
    # "TypeError: exceptions must derive from BaseException", which would
    # break poll() for every subclass that does not override this method.
    return False
333+
329334
def state_gethost(self):
330335
"Return string, hostname or addr of running job, likely by parsing self.job_status"
331336
raise NotImplementedError("Subclass must provide implementation")
@@ -334,7 +339,7 @@ async def poll(self):
334339
"""Poll the process"""
335340
if self.job_id is not None and len(self.job_id) > 0:
336341
await self.read_job_state()
337-
if self.state_isrunning() or self.state_ispending():
342+
if self.state_isrunning() or self.state_ispending() or self.state_isunknown():
338343
return None
339344
else:
340345
self.clear_state()
@@ -467,6 +472,8 @@ class BatchSpawnerRegexStates(BatchSpawnerBase):
467472
If this variable is set, the match object will be expanded using this string
468473
to obtain the notebook IP.
469474
See Python docs: re.match.expand""").tag(config=True)
475+
state_unknown_re = Unicode('^$',
476+
help="Regex that matches job_status if the resource manager is not answering").tag(config=True)
470477

471478
def state_ispending(self):
472479
assert self.state_pending_re, "Misconfigured: define state_running_re"
@@ -482,6 +489,13 @@ def state_isrunning(self):
482489
else:
483490
return False
484491

492+
def state_isunknown(self):
    """Report whether self.job_status matches the state_unknown_re pattern.

    Returns True only when self.job_status is non-empty and
    self.state_unknown_re is found in it (re.search); otherwise False.

    Raises:
        AssertionError: if state_unknown_re is empty/unset.
    """
    assert self.state_unknown_re, "Misconfigured: define state_unknown_re"
    status = self.job_status
    # An empty or missing status can never count as "unknown".
    if not status:
        return False
    return re.search(self.state_unknown_re, status) is not None
498+
485499
def state_gethost(self):
486500
assert self.state_exechost_re, "Misconfigured: define state_exechost_re"
487501
match = re.search(self.state_exechost_re, self.job_status)
@@ -634,6 +648,7 @@ class SlurmSpawner(UserEnvMixin,BatchSpawnerRegexStates):
634648
# RUNNING, COMPLETING = running
635649
state_pending_re = Unicode(r'^(?:PENDING|CONFIGURING)').tag(config=True)
636650
state_running_re = Unicode(r'^(?:RUNNING|COMPLETING)').tag(config=True)
651+
state_unknown_re = Unicode(r'^slurm_load_jobs error: (?:Socket timed out on send/recv)').tag(config=True)
637652
state_exechost_re = Unicode(r'\s+((?:[\w_-]+\.?)+)$').tag(config=True)
638653

639654
def parse_job_id(self, output):

0 commit comments

Comments
 (0)