@@ -329,6 +329,9 @@ def __init__(self,
329329 launch_cmd = DEFAULT_LAUNCH_CMD
330330 self .launch_cmd = launch_cmd
331331
332+ self ._queue_management_thread_exit = threading .Event ()
333+ self ._queue_management_thread : Optional [threading .Thread ] = None
334+
332335 radio_mode = "htex"
333336
334337 def _warn_deprecated (self , old : str , new : str ):
@@ -450,9 +453,9 @@ def _queue_management_worker(self):
450453 """
451454 logger .debug ("Queue management worker starting" )
452455
453- while not self .bad_state_is_set :
456+ while not self .bad_state_is_set and not self . _queue_management_thread_exit . is_set () :
454457 try :
455- msgs = self .incoming_q .get ()
458+ msgs = self .incoming_q .get (timeout_ms = 1000 )
456459
457460 except IOError as e :
458461 logger .exception ("Caught broken queue with exception code {}: {}" .format (e .errno , e ))
@@ -465,57 +468,55 @@ def _queue_management_worker(self):
465468 else :
466469
467470 if msgs is None :
468- logger .debug ("Got None, exiting" )
469- return
471+ continue
472+
473+ for serialized_msg in msgs :
474+ try :
475+ msg = pickle .loads (serialized_msg )
476+ except pickle .UnpicklingError :
477+ raise BadMessage ("Message received could not be unpickled" )
470478
471- else :
472- for serialized_msg in msgs :
479+ if msg ['type' ] == 'heartbeat' :
480+ continue
481+ elif msg ['type' ] == 'result' :
473482 try :
474- msg = pickle .loads (serialized_msg )
475- except pickle .UnpicklingError :
476- raise BadMessage ("Message received could not be unpickled" )
483+ tid = msg ['task_id' ]
484+ except Exception :
485+ raise BadMessage ("Message received does not contain 'task_id' field" )
486+
487+ if tid == - 1 and 'exception' in msg :
488+ logger .warning ("Executor shutting down due to exception from interchange" )
489+ exception = deserialize (msg ['exception' ])
490+ self .set_bad_state_and_fail_all (exception )
491+ break
477492
478- if msg ['type' ] == 'heartbeat' :
479- continue
480- elif msg ['type' ] == 'result' :
493+ task_fut = self .tasks .pop (tid )
494+
495+ if 'result' in msg :
496+ result = deserialize (msg ['result' ])
497+ task_fut .set_result (result )
498+
499+ elif 'exception' in msg :
481500 try :
482- tid = msg ['task_id' ]
483- except Exception :
484- raise BadMessage ("Message received does not contain 'task_id' field" )
485-
486- if tid == - 1 and 'exception' in msg :
487- logger .warning ("Executor shutting down due to exception from interchange" )
488- exception = deserialize (msg ['exception' ])
489- self .set_bad_state_and_fail_all (exception )
490- break
491-
492- task_fut = self .tasks .pop (tid )
493-
494- if 'result' in msg :
495- result = deserialize (msg ['result' ])
496- task_fut .set_result (result )
497-
498- elif 'exception' in msg :
499- try :
500- s = deserialize (msg ['exception' ])
501- # s should be a RemoteExceptionWrapper... so we can reraise it
502- if isinstance (s , RemoteExceptionWrapper ):
503- try :
504- s .reraise ()
505- except Exception as e :
506- task_fut .set_exception (e )
507- elif isinstance (s , Exception ):
508- task_fut .set_exception (s )
509- else :
510- raise ValueError ("Unknown exception-like type received: {}" .format (type (s )))
511- except Exception as e :
512- # TODO could be a proper wrapped exception?
513- task_fut .set_exception (
514- DeserializationError ("Received exception, but handling also threw an exception: {}" .format (e )))
515- else :
516- raise BadMessage ("Message received is neither result or exception" )
501+ s = deserialize (msg ['exception' ])
502+ # s should be a RemoteExceptionWrapper... so we can reraise it
503+ if isinstance (s , RemoteExceptionWrapper ):
504+ try :
505+ s .reraise ()
506+ except Exception as e :
507+ task_fut .set_exception (e )
508+ elif isinstance (s , Exception ):
509+ task_fut .set_exception (s )
510+ else :
511+ raise ValueError ("Unknown exception-like type received: {}" .format (type (s )))
512+ except Exception as e :
513+ # TODO could be a proper wrapped exception?
514+ task_fut .set_exception (
515+ DeserializationError ("Received exception, but handling also threw an exception: {}" .format (e )))
517516 else :
518- raise BadMessage ("Message received with unknown type {}" .format (msg ['type' ]))
517+ raise BadMessage ("Message received is neither result or exception" )
518+ else :
519+ raise BadMessage ("Message received with unknown type {}" .format (msg ['type' ]))
519520
520521 logger .info ("Queue management worker finished" )
521522
@@ -815,13 +816,19 @@ def shutdown(self, timeout: float = 10.0):
815816
816817 logger .info ("Attempting HighThroughputExecutor shutdown" )
817818
819+ logger .info ("Terminating interchange and queue management thread" )
820+ self ._queue_management_thread_exit .set ()
818821 self .interchange_proc .terminate ()
819822 try :
820823 self .interchange_proc .wait (timeout = timeout )
821824 except subprocess .TimeoutExpired :
822- logger .info ("Unable to terminate Interchange process; sending SIGKILL" )
825+ logger .warning ("Unable to terminate Interchange process; sending SIGKILL" )
823826 self .interchange_proc .kill ()
824827
828+ logger .info ("Waiting for queue management thread exit" )
829+ if self ._queue_management_thread :
830+ self ._queue_management_thread .join ()
831+
825832 logger .info ("closing context sockets" )
826833 # this might block if there are outstanding messages (eg if the interchange
827834 # has gone away... probably something to do with zmq.LINGER sockopt to remove
0 commit comments