
Commit 9190d05

Merge pull request ceph#60096 from adk3798/maintenance-mode-dead-host
mgr/cephadm: offline host handling for maintenance exit command

Reviewed-by: John Mulligan <[email protected]>
2 parents 4365808 + 565bb14 commit 9190d05

File tree: 5 files changed, 82 additions (+) and 36 deletions (-)

doc/cephadm/host-management.rst
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py


doc/cephadm/host-management.rst

Lines changed: 10 additions & 5 deletions
@@ -233,11 +233,16 @@ Place a host in and out of maintenance mode (stops all Ceph daemons on host):
 .. prompt:: bash #
 
    ceph orch host maintenance enter <hostname> [--force] [--yes-i-really-mean-it]
-   ceph orch host maintenance exit <hostname>
-
-The ``--force`` flag allows the user to bypass warnings (but not alerts). The ``--yes-i-really-mean-it``
-flag bypasses all safety checks and will attempt to force the host into maintenance mode no
-matter what.
+   ceph orch host maintenance exit <hostname> [--force] [--offline]
+
+The ``--force`` flag on the ``enter`` command allows the user to bypass warnings (but not alerts).
+The ``--yes-i-really-mean-it`` flag bypasses all safety checks and will attempt to force the
+host into maintenance mode no matter what. The ``--force`` and ``--offline`` flags to the ``exit`` command
+can be used to have cephadm mark a host that is in maintenance mode and offline as no longer
+in maintenance mode. Note that in this case, if the host comes online, the Ceph daemons
+on the host will remain in the stopped state. The ``--force`` and ``--offline`` flags to the ``exit``
+command are intended to be run for hosts in maintenance mode that are permanently offline
+before removing the host entirely from cephadm management using the ``ceph orch host rm`` command.
 
 .. warning:: Using the --yes-i-really-mean-it flag to force the host to enter maintenance
    mode can potentially cause loss of data availability, the mon quorum to break down due
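
The documentation change above describes the intended workflow for a host that is permanently offline while in maintenance mode: force it out of maintenance, then remove it from cephadm management. As a rough illustration of that sequence (the hostname host01 is hypothetical, and the flags that ceph orch host rm accepts for an offline host should be checked against the cephadm host-management docs):

    ceph orch host maintenance exit host01 --offline --force
    ceph orch host rm host01 --offline --force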

src/pybind/mgr/cephadm/module.py

Lines changed: 68 additions & 28 deletions
@@ -1903,7 +1903,7 @@ def _add_host(self, spec):
         self.inventory.add_host(spec)
         self.offline_hosts_remove(spec.hostname)
         if spec.status == 'maintenance':
-            self._set_maintenance_healthcheck()
+            self.set_maintenance_healthcheck()
         self.event.set()  # refresh stray health check
         self.log.info('Added host %s' % spec.hostname)
         return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr)
@@ -2074,6 +2074,7 @@ def run_cmd(cmd_args: dict) -> None:
         self.ssh.reset_con(host)
         # if host was in offline host list, we should remove it now.
         self.offline_hosts_remove(host)
+        self.set_maintenance_healthcheck()
         self.event.set()  # refresh stray health check
         self.log.info('Removed host %s' % host)
         return "Removed {} host '{}'".format('offline' if offline else '', host)
@@ -2188,7 +2189,7 @@ def host_ok_to_stop(self, hostname: str) -> str:
         self.log.info(msg)
         return msg

-    def _set_maintenance_healthcheck(self) -> None:
+    def set_maintenance_healthcheck(self) -> None:
         """Raise/update or clear the maintenance health check as needed"""

         in_maintenance = self.inventory.get_host_with_state("maintenance")
@@ -2272,19 +2273,21 @@ def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_reall
         self.inventory._inventory[hostname] = tgt_host
         self.inventory.save()

-        self._set_maintenance_healthcheck()
+        self.set_maintenance_healthcheck()
         return f'Daemons for Ceph cluster {self._cluster_fsid} stopped on host {hostname}. Host {hostname} moved to maintenance mode'

     @handle_orch_error
     @host_exists()
-    def exit_host_maintenance(self, hostname: str) -> str:
+    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> str:
         """Exit maintenance mode and return a host to an operational state

         Returning from maintenance will enable the clusters systemd target and
         start it, and remove any noout that has been added for the host if the
         host has osd daemons

         :param hostname: (str) host name
+        :param force: (bool) force removal of the host from maintenance mode
+        :param offline: (bool) to remove hosts that are offline from maintenance mode

         :raises OrchestratorError: Unable to return from maintenance, or unset the
                 noout flag
@@ -2293,37 +2296,74 @@
         if tgt_host['status'] != "maintenance":
             raise OrchestratorError(f"Host {hostname} is not in maintenance mode")

-        with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
-            outs, errs, _code = self.wait_async(
-                CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
-                                                'host-maintenance', ['exit'], error_ok=True))
-        returned_msg = errs[0].split('\n')[-1]
-        if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
-            raise OrchestratorError(
-                f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
-
-        if "osd" in self.cache.get_daemon_types(hostname):
-            crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
-            rc, _out, _err = self.mon_command({
-                'prefix': 'osd unset-group',
-                'flags': 'noout',
-                'who': [crush_node],
-                'format': 'json'
-            })
-            if rc:
+        # Given we do not regularly check maintenance mode hosts for being offline,
+        # we have no idea at this point whether the host is online or not.
+        # Keep in mind this goes both ways, as users could have run
+        # "ceph cephadm check-host <hostname>" when the host was in maintenance
+        # mode and offline and the host could have since come online. This following
+        # "cephadm check-host" command is being run purely so we know if the host
+        # is online or offline, as those should be handled differently
+        try:
+            with self.async_timeout_handler(hostname, 'cephadm check-host'):
+                outs, errs, _code = self.wait_async(
+                    CephadmServe(self)._run_cephadm(
+                        hostname, cephadmNoImage,
+                        'check-host', [], error_ok=False
+                    )
+                )
+        except OrchestratorError:
+            pass
+
+        host_offline = hostname in self.offline_hosts
+
+        if host_offline and not offline:
+            raise OrchestratorValidationError(
+                f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode')
+
+        if not host_offline and offline:
+            raise OrchestratorValidationError(
+                f'{hostname} is online, please take host out of maintenance mode without --offline.')
+
+        if offline and not force:
+            raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force")
+
+        # no point trying these parts if we know the host is offline
+        if not host_offline:
+            with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
+                outs, errs, _code = self.wait_async(
+                    CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
+                                                    'host-maintenance', ['exit'], error_ok=True))
+            returned_msg = errs[0].split('\n')[-1]
+            if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')):
                 self.log.warning(
-                    f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
-                raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
-            else:
-                self.log.info(
-                    f"exit maintenance request has UNSET for the noout group on host {hostname}")
+                    f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+                if not force:
+                    raise OrchestratorError(
+                        f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+
+            if "osd" in self.cache.get_daemon_types(hostname):
+                crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
+                rc, _out, _err = self.mon_command({
+                    'prefix': 'osd unset-group',
+                    'flags': 'noout',
+                    'who': [crush_node],
+                    'format': 'json'
+                })
+                if rc:
+                    self.log.warning(
+                        f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
+                    if not force:
+                        raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
+                else:
+                    self.log.info(
+                        f"exit maintenance request has UNSET for the noout group on host {hostname}")

         # update the host record status
         tgt_host['status'] = ""
         self.inventory._inventory[hostname] = tgt_host
         self.inventory.save()

-        self._set_maintenance_healthcheck()
+        self.set_maintenance_healthcheck()

         return f"Ceph cluster {self._cluster_fsid} on {hostname} has exited maintenance mode"
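
The new exit path boils down to a small decision table over three booleans: whether cephadm currently sees the host as offline, and whether the user passed --offline and --force. As a standalone sketch of just that validation (validate_exit_flags is a made-up name for illustration, not a cephadm API):

    # Sketch of the flag validation performed by exit_host_maintenance above.
    def validate_exit_flags(host_offline: bool, offline: bool, force: bool) -> None:
        if host_offline and not offline:
            # an offline host must be acknowledged explicitly
            raise ValueError("host is offline, use --offline and --force")
        if not host_offline and offline:
            # --offline on a reachable host is rejected as misleading
            raise ValueError("host is online, drop --offline")
        if offline and not force:
            # skipping the normal exit steps additionally requires --force
            raise ValueError("--offline requires --force")

    # Accepted combinations: both flags for an offline host, neither for an online one.
    validate_exit_flags(host_offline=True, offline=True, force=True)
    validate_exit_flags(host_offline=False, offline=False, force=False)

When validation passes for an offline host, the handler skips the cephadm host-maintenance exit call and the noout unset, clears the maintenance status in the inventory, and refreshes the health check, which is why daemons on such a host stay stopped if it ever comes back online.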

src/pybind/mgr/cephadm/tests/test_cephadm.py

Lines changed: 1 addition & 0 deletions
@@ -2720,6 +2720,7 @@ def test_tuned_profiles_settings_validation(self, facts, settings, expected_valu
         cephadm_module.cache.facts = facts
         assert cephadm_module._validate_tunedprofile_settings(spec) == expected_value

+    @mock.patch("cephadm.CephadmOrchestrator.set_maintenance_healthcheck", lambda _: None)
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
     def test_tuned_profiles_validation(self, cephadm_module):
         with with_host(cephadm_module, 'test'):
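
The added decorator stubs out set_maintenance_healthcheck with a no-op lambda, presumably because host add/remove inside with_host now reaches that method and would otherwise touch the health-check machinery in an unrelated test. A minimal, self-contained illustration of that patching pattern with a toy class (not the cephadm test fixtures):

    # Toy demonstration of replacing a method with a lambda,
    # mirroring the @mock.patch(..., lambda _: None) decorator added above.
    from unittest import mock

    class Orch:
        def set_health(self) -> None:
            raise RuntimeError("would touch cluster state")

        def add_host(self) -> str:
            self.set_health()  # side effect we want to skip in tests
            return "added"

    # The lambda receives the bound instance ("self") and does nothing.
    with mock.patch.object(Orch, "set_health", lambda _self: None):
        assert Orch().add_host() == "added"

Because the patch target is the class attribute, every call made while the patch is active goes to the no-op version.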

src/pybind/mgr/orchestrator/_interface.py

Lines changed: 1 addition & 1 deletion
@@ -503,7 +503,7 @@ def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_reall
         """
         raise NotImplementedError()

-    def exit_host_maintenance(self, hostname: str) -> OrchResult:
+    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult:
         """
         Return a host from maintenance, restarting the clusters systemd target
         """

src/pybind/mgr/orchestrator/module.py

Lines changed: 2 additions & 2 deletions
@@ -799,11 +799,11 @@ def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_real
         return HandleCommandResult(stdout=completion.result_str())

     @_cli_write_command('orch host maintenance exit')
-    def _host_maintenance_exit(self, hostname: str) -> HandleCommandResult:
+    def _host_maintenance_exit(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult:
         """
         Return a host from maintenance, restarting all Ceph daemons (cephadm only)
         """
-        completion = self.exit_host_maintenance(hostname)
+        completion = self.exit_host_maintenance(hostname, force, offline)
         raise_if_exception(completion)

         return HandleCommandResult(stdout=completion.result_str())
