@@ -808,6 +808,35 @@ def wait_async(self, coro: Awaitable[T], timeout: Optional[int] = None) -> T:
808808 timeout = 60
809809 return self .event_loop .get_result (coro , timeout )
810810
def update_host_timeout_error(self, host: str, remove: bool = False) -> None:
    """Add ``host`` to, or remove it from, the CEPHADM_HOST_TIMEOUT_ERROR warning.

    The hosts currently listed are read from the ``detail`` entry of the
    existing ``CEPHADM_HOST_TIMEOUT_ERROR`` health check, if there is one.

    :param host: name of the host to add to / remove from the warning.
    :param remove: when False (default) the host is added to the warning;
        when True the host is removed from it.

    Nothing is done when the request would not change the host list
    (removing a host that is not listed, or adding one that already is).
    Otherwise the warning is re-issued through ``set_health_warning`` with
    the updated host list, or dropped through ``remove_health_warning``
    once the list becomes empty.
    """
    check_name = 'CEPHADM_HOST_TIMEOUT_ERROR'

    # Work on a copy: the list stored in self.health_checks must not be
    # mutated behind the back of whoever owns it. The stored state only
    # changes through set_health_warning / remove_health_warning below.
    if check_name in self.health_checks:
        hosts = list(self.health_checks[check_name].get('detail') or [])
    else:
        hosts = []

    already_listed = host in hosts
    if remove:
        if not already_listed:
            # The host isn't in the list, so removing it cannot empty the
            # list; the warning (if any) stays exactly as it is.
            return
        hosts.remove(host)
    else:
        if already_listed:
            # The warning already exists and already names this host.
            return
        hosts.append(host)

    if not hosts:
        # Last affected host is gone: clear the warning entirely.
        self.remove_health_warning(check_name)
    else:
        self.set_health_warning(
            check_name,
            f'SSH command execution failed with TimeoutError for {len(hosts)} hosts',
            len(hosts),
            hosts
        )
839+
811840 @contextmanager
812841 def async_timeout_handler (self , host : Optional [str ] = '' ,
813842 cmd : Optional [str ] = '' ,
@@ -819,6 +848,9 @@ def async_timeout_handler(self, host: Optional[str] = '',
819848 try :
820849 yield
821850 except (asyncio .TimeoutError , concurrent .futures .TimeoutError ):
851+ # raise health warning for timeout issue
852+ if host :
853+ self .update_host_timeout_error (host )
822854 err_str : str = ''
823855 if cmd :
824856 err_str = f'Command "{ cmd } " timed out '
@@ -841,6 +873,9 @@ def async_timeout_handler(self, host: Optional[str] = '',
841873 err_str += f'on host { host } '
842874 err_str += f' - { str (e )} '
843875 raise OrchestratorError (err_str )
876+ else :
877+ if host :
878+ self .update_host_timeout_error (host , remove = True )
844879
845880 def set_container_image (self , entity : str , image : str ) -> None :
846881 self .check_mon_command ({
0 commit comments