@@ -273,8 +273,9 @@ async def read_job_state(self):
273
273
format_template (self .batch_query_cmd , ** subvars )))
274
274
self .log .debug ('Spawner querying job: ' + cmd )
275
275
try :
276
- out = await self .run_command (cmd )
277
- self .job_status = out
276
+ self .job_status = await self .run_command (cmd )
277
+ except RuntimeError as e :
278
+ self .job_status = e .args [0 ]
278
279
except Exception as e :
279
280
self .log .error ('Error querying job ' + self .job_id )
280
281
self .job_status = ''
@@ -326,6 +327,10 @@ def state_isrunning(self):
326
327
"Return boolean indicating if job is running, likely by parsing self.job_status"
327
328
raise NotImplementedError ("Subclass must provide implementation" )
328
329
330
def state_isunknown(self):
    """Return boolean indicating if job state retrieval failed because of the resource manager.

    This base implementation always answers False, i.e. it never treats a
    query result as "unknown". Subclasses that can recognise a resource
    manager outage in ``self.job_status`` should override it.

    Returns:
        bool: always False here.
    """
    # The previous body was ``raise False``, which itself fails with
    # "TypeError: exceptions must derive from BaseException". poll() calls
    # this method in its ``or`` chain, so it must return a plain boolean.
    return False
333
+
329
334
def state_gethost(self):
    """Return string, hostname or addr of running job, likely by parsing self.job_status.

    Abstract hook: this implementation only signals that no concrete
    behaviour exists at this level.

    Raises:
        NotImplementedError: always; subclasses are expected to override.
    """
    raise NotImplementedError("Subclass must provide implementation")
@@ -334,7 +339,7 @@ async def poll(self):
334
339
"""Poll the process"""
335
340
if self .job_id is not None and len (self .job_id ) > 0 :
336
341
await self .read_job_state ()
337
- if self .state_isrunning () or self .state_ispending ():
342
+ if self .state_isrunning () or self .state_ispending () or self . state_isunknown () :
338
343
return None
339
344
else :
340
345
self .clear_state ()
@@ -467,6 +472,8 @@ class BatchSpawnerRegexStates(BatchSpawnerBase):
467
472
If this variable is set, the match object will be expanded using this string
468
473
to obtain the notebook IP.
469
474
See Python docs: re.match.expand""" ).tag (config = True )
475
+ state_unknown_re = Unicode ('^$' ,
476
+ help = "Regex that matches job_status if the resource manager is not answering" ).tag (config = True )
470
477
471
478
def state_ispending (self ):
472
479
assert self .state_pending_re , "Misconfigured: define state_running_re"
@@ -482,6 +489,13 @@ def state_isrunning(self):
482
489
else :
483
490
return False
484
491
492
def state_isunknown(self):
    """Tell whether the last query output matches the "unknown state" regex.

    Returns True only when ``self.job_status`` is non-empty and
    ``self.state_unknown_re`` matches somewhere in it (``re.search``);
    returns False otherwise, including for an empty or None status.

    Raises:
        AssertionError: if ``self.state_unknown_re`` is empty/unset.
    """
    assert self.state_unknown_re, "Misconfigured: define state_unknown_re"
    status = self.job_status
    # An empty status is never classified as unknown, whatever the regex is.
    if not status:
        return False
    return re.search(self.state_unknown_re, status) is not None
498
+
485
499
def state_gethost (self ):
486
500
assert self .state_exechost_re , "Misconfigured: define state_exechost_re"
487
501
match = re .search (self .state_exechost_re , self .job_status )
@@ -634,6 +648,7 @@ class SlurmSpawner(UserEnvMixin,BatchSpawnerRegexStates):
634
648
# RUNNING, COMPLETING = running
635
649
state_pending_re = Unicode (r'^(?:PENDING|CONFIGURING)' ).tag (config = True )
636
650
state_running_re = Unicode (r'^(?:RUNNING|COMPLETING)' ).tag (config = True )
651
+ state_unknown_re = Unicode (r'^slurm_load_jobs error: (?:Socket timed out on send/recv)' ).tag (config = True )
637
652
state_exechost_re = Unicode (r'\s+((?:[\w_-]+\.?)+)$' ).tag (config = True )
638
653
639
654
def parse_job_id (self , output ):
0 commit comments