@@ -90,7 +90,10 @@ def _all_ready(self):
9090 """
9191 Checks if all futures are ready, success or done
9292 """
93- return all ([f .ready or f .success or f .done for f in self .futures ])
93+ try :
94+ return all (f .ready or f .success or f .done for f in self .futures )
95+ except Exception :
96+ return False
9497
9598 def _check_new_futures (self , call_status , f ):
9699 """Checks if a functions returned new futures to track"""
@@ -426,41 +429,54 @@ def _generate_tokens(self, callids_running, callids_done):
426429 self .callids_running_processed .update (callids_running_to_process )
427430 self .callids_done_processed .update (callids_done_to_process )
428431
432+ def _poll_and_process_job_status (self , previous_log , log_time ):
433+ """
434+ Polls the storage backend for job status, updates futures,
435+ and prints status logs.
436+
437+ Returns:
438+ new_callids_done (set): New callids that were marked as done.
439+ previous_log (str): Updated log message.
440+ log_time (float): Updated log time counter.
441+ """
442+ callids_running , callids_done = self .internal_storage .get_job_status (self .executor_id )
443+ new_callids_done = callids_done - self .callids_done_processed_status
444+
445+ self ._generate_tokens (callids_running , callids_done )
446+ self ._tag_future_as_running (callids_running )
447+ self ._tag_future_as_ready (callids_done )
448+
449+ previous_log , log_time = self ._print_status_log (previous_log , log_time )
450+
451+ return new_callids_done , previous_log , log_time
452+
429453 def run (self ):
430454 """
431- Run method
455+ Run method for the Storage job monitor thread.
432456 """
433457 logger .debug (f'ExecutorID { self .executor_id } - Starting Storage job monitor' )
434458
435459 wait_dur_sec = self .monitoring_interval
436460 previous_log = None
437461 log_time = 0
438462
439- def process_callids ():
440- nonlocal previous_log , log_time
441- callids_running , callids_done = self .internal_storage .get_job_status (self .executor_id )
442- # verify if there are new callids_done and reduce the sleep
443- new_callids_done = callids_done - self .callids_done_processed_status
444- # generate tokens and mark futures as running/done
445- self ._generate_tokens (callids_running , callids_done )
446- self ._tag_future_as_running (callids_running )
447- self ._tag_future_as_ready (callids_done )
448- previous_log , log_time = self ._print_status_log (previous_log , log_time )
449-
450- return new_callids_done
451-
452463 while not self ._all_ready ():
453464 time .sleep (wait_dur_sec )
454465 wait_dur_sec = self .monitoring_interval
455466 log_time += wait_dur_sec
456467
457468 if not self .should_run :
469+ logger .debug (f'ExecutorID { self .executor_id } - Monitor stopped externally' )
458470 break
459471
460- if len (process_callids ()) > 0 :
461- wait_dur_sec = self .monitoring_interval / 5
472+ try :
473+ new_callids_done , previous_log , log_time = self ._poll_and_process_job_status (previous_log , log_time )
474+ if new_callids_done :
475+ wait_dur_sec = self .monitoring_interval / 5
476+ except Exception as e :
477+ logger .error (f'ExecutorID { self .executor_id } - Error during monitor: { e } ' , exc_info = True )
462478
463- process_callids ( )
479+ self . _poll_and_process_job_status ( previous_log , log_time )
464480
465481 logger .debug (f'ExecutorID { self .executor_id } - Storage job monitor finished' )
466482
@@ -509,6 +525,9 @@ def start(self, fs, job_id=None, chunksize=None, generate_tokens=False):
509525 if not self .monitor .is_alive ():
510526 self .monitor .start ()
511527
528+ def is_alive (self ):
529+ self .monitor .is_alive ()
530+
512531 def remove (self , fs ):
513532 if self .monitor and self .monitor .is_alive ():
514533 self .monitor .remove_futures (fs )
0 commit comments