Skip to content

Commit 217b071

Browse files
mgr/cephadm: set a health warning for host SSH timeout
Fixes: https://tracker.ceph.com/issues/72345 Signed-off-by: Shweta Bhosale <[email protected]>
1 parent deeb835 commit 217b071

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

src/pybind/mgr/cephadm/module.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,35 @@ def wait_async(self, coro: Awaitable[T], timeout: Optional[int] = None) -> T:
808808
timeout = 60
809809
return self.event_loop.get_result(coro, timeout)
810810

811+
def update_host_timeout_error(self, host: str, remove: bool = False) -> None:
    """Add ``host`` to, or remove it from, the CEPHADM_HOST_TIMEOUT_ERROR warning.

    The hosts currently listed are read from the ``detail`` entry of the
    existing health check (if any). The warning is then re-raised with the
    updated host list, or cleared once the list becomes empty.

    :param host: name of the host whose SSH command timed out (or recovered)
    :param remove: when True drop ``host`` from the warning, otherwise add it
    """
    check_name = 'CEPHADM_HOST_TIMEOUT_ERROR'

    # Copy so the list held inside self.health_checks is never mutated in
    # place; the updated list is handed to set_health_warning below.
    hosts = list(self.health_checks.get(check_name, {}).get('detail', []))

    if remove:
        if host not in hosts:
            # Host is not listed, so we cannot be removing the last entry:
            # nothing to update and nothing to clear.
            return
        hosts.remove(host)
    else:
        if host in hosts:
            # Warning already exists and already names this host.
            return
        hosts.append(host)

    if not hosts:
        self.remove_health_warning(check_name)
        return

    self.set_health_warning(
        check_name,
        f'SSH command execution failed with TimeoutError for {len(hosts)} hosts',
        len(hosts),
        hosts
    )
839+
811840
@contextmanager
812841
def async_timeout_handler(self, host: Optional[str] = '',
813842
cmd: Optional[str] = '',
@@ -819,6 +848,9 @@ def async_timeout_handler(self, host: Optional[str] = '',
819848
try:
820849
yield
821850
except (asyncio.TimeoutError, concurrent.futures.TimeoutError):
851+
# raise health warning for timeout issue
852+
if host:
853+
self.update_host_timeout_error(host)
822854
err_str: str = ''
823855
if cmd:
824856
err_str = f'Command "{cmd}" timed out '
@@ -841,6 +873,9 @@ def async_timeout_handler(self, host: Optional[str] = '',
841873
err_str += f'on host {host} '
842874
err_str += f' - {str(e)}'
843875
raise OrchestratorError(err_str)
876+
else:
877+
if host:
878+
self.update_host_timeout_error(host, remove=True)
844879

845880
def set_container_image(self, entity: str, image: str) -> None:
846881
self.check_mon_command({

0 commit comments

Comments
 (0)