@@ -1903,7 +1903,7 @@ def _add_host(self, spec):
19031903 self .inventory .add_host (spec )
19041904 self .offline_hosts_remove (spec .hostname )
19051905 if spec .status == 'maintenance' :
1906- self ._set_maintenance_healthcheck ()
1906+ self .set_maintenance_healthcheck ()
19071907 self .event .set () # refresh stray health check
19081908 self .log .info ('Added host %s' % spec .hostname )
19091909 return "Added host '{}' with addr '{}'" .format (spec .hostname , spec .addr )
@@ -2074,6 +2074,7 @@ def run_cmd(cmd_args: dict) -> None:
20742074 self .ssh .reset_con (host )
20752075 # if host was in offline host list, we should remove it now.
20762076 self .offline_hosts_remove (host )
2077+ self .set_maintenance_healthcheck ()
20772078 self .event .set () # refresh stray health check
20782079 self .log .info ('Removed host %s' % host )
20792080 return "Removed {} host '{}'" .format ('offline' if offline else '' , host )
@@ -2188,7 +2189,7 @@ def host_ok_to_stop(self, hostname: str) -> str:
21882189 self .log .info (msg )
21892190 return msg
21902191
2191- def _set_maintenance_healthcheck (self ) -> None :
2192+ def set_maintenance_healthcheck (self ) -> None :
21922193 """Raise/update or clear the maintenance health check as needed"""
21932194
21942195 in_maintenance = self .inventory .get_host_with_state ("maintenance" )
@@ -2272,19 +2273,21 @@ def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_reall
22722273 self .inventory ._inventory [hostname ] = tgt_host
22732274 self .inventory .save ()
22742275
2275- self ._set_maintenance_healthcheck ()
2276+ self .set_maintenance_healthcheck ()
22762277 return f'Daemons for Ceph cluster { self ._cluster_fsid } stopped on host { hostname } . Host { hostname } moved to maintenance mode'
22772278
22782279 @handle_orch_error
22792280 @host_exists ()
2280- def exit_host_maintenance (self , hostname : str ) -> str :
2281+ def exit_host_maintenance (self , hostname : str , force : bool = False , offline : bool = False ) -> str :
22812282 """Exit maintenance mode and return a host to an operational state
22822283
22832284 Returning from maintenance will enable the cluster's systemd target and
22842285 start it, and remove any noout that has been added for the host if the
22852286 host has osd daemons
22862287
22872288 :param hostname: (str) host name
2289+ :param force: (bool) continue exiting maintenance mode even if a step fails
2290+ :param offline: (bool) take a host that is offline out of maintenance mode (requires force)
22882291
22892292 :raises OrchestratorError: Unable to return from maintenance, or unset the
22902293 noout flag
@@ -2293,37 +2296,74 @@ def exit_host_maintenance(self, hostname: str) -> str:
22932296 if tgt_host ['status' ] != "maintenance" :
22942297 raise OrchestratorError (f"Host { hostname } is not in maintenance mode" )
22952298
2296- with self .async_timeout_handler (hostname , 'cephadm host-maintenance exit' ):
2297- outs , errs , _code = self .wait_async (
2298- CephadmServe (self )._run_cephadm (hostname , cephadmNoImage ,
2299- 'host-maintenance' , ['exit' ], error_ok = True ))
2300- returned_msg = errs [0 ].split ('\n ' )[- 1 ]
2301- if returned_msg .startswith ('failed' ) or returned_msg .startswith ('ERROR' ):
2302- raise OrchestratorError (
2303- f"Failed to exit maintenance state for host { hostname } , cluster { self ._cluster_fsid } " )
2304-
2305- if "osd" in self .cache .get_daemon_types (hostname ):
2306- crush_node = hostname if '.' not in hostname else hostname .split ('.' )[0 ]
2307- rc , _out , _err = self .mon_command ({
2308- 'prefix' : 'osd unset-group' ,
2309- 'flags' : 'noout' ,
2310- 'who' : [crush_node ],
2311- 'format' : 'json'
2312- })
2313- if rc :
2299+ # Given we do not regularly check maintenance mode hosts for being offline,
2300+ # we have no idea at this point whether the host is online or not.
2301+ # Keep in mind this goes both ways, as users could have run
2302+ # "ceph cephadm check-host <hostname>" when the host was in maintenance
2303+ # mode and offline and the host could have since come online. The following
2304+ # "cephadm check-host" command is being run purely so we know if the host
2305+ # is online or offline, as those two cases should be handled differently.
2306+ try :
2307+ with self .async_timeout_handler (hostname , 'cephadm check-host' ):
2308+ outs , errs , _code = self .wait_async (
2309+ CephadmServe (self )._run_cephadm (
2310+ hostname , cephadmNoImage ,
2311+ 'check-host' , [], error_ok = False
2312+ )
2313+ )
2314+ except OrchestratorError :
2315+ pass
2316+
2317+ host_offline = hostname in self .offline_hosts
2318+
2319+ if host_offline and not offline :
2320+ raise OrchestratorValidationError (
2321+ f'{ hostname } is offline, please use --offline and --force to take this host out of maintenance mode' )
2322+
2323+ if not host_offline and offline :
2324+ raise OrchestratorValidationError (
2325+ f'{ hostname } is online, please take host out of maintenance mode without --offline.' )
2326+
2327+ if offline and not force :
2328+ raise OrchestratorValidationError ("Taking an offline host out of maintenance mode requires --force" )
2329+
2330+ # no point trying these parts if we know the host is offline
2331+ if not host_offline :
2332+ with self .async_timeout_handler (hostname , 'cephadm host-maintenance exit' ):
2333+ outs , errs , _code = self .wait_async (
2334+ CephadmServe (self )._run_cephadm (hostname , cephadmNoImage ,
2335+ 'host-maintenance' , ['exit' ], error_ok = True ))
2336+ returned_msg = errs [0 ].split ('\n ' )[- 1 ]
2337+ if (returned_msg .startswith ('failed' ) or returned_msg .startswith ('ERROR' )):
23142338 self .log .warning (
2315- f"exit maintenance request failed to UNSET the noout group for { hostname } , (rc={ rc } )" )
2316- raise OrchestratorError (f"Unable to set the osds on { hostname } to noout (rc={ rc } )" )
2317- else :
2318- self .log .info (
2319- f"exit maintenance request has UNSET for the noout group on host { hostname } " )
2339+ f"Failed to exit maintenance state for host { hostname } , cluster { self ._cluster_fsid } " )
2340+ if not force :
2341+ raise OrchestratorError (
2342+ f"Failed to exit maintenance state for host { hostname } , cluster { self ._cluster_fsid } " )
2343+
2344+ if "osd" in self .cache .get_daemon_types (hostname ):
2345+ crush_node = hostname if '.' not in hostname else hostname .split ('.' )[0 ]
2346+ rc , _out , _err = self .mon_command ({
2347+ 'prefix' : 'osd unset-group' ,
2348+ 'flags' : 'noout' ,
2349+ 'who' : [crush_node ],
2350+ 'format' : 'json'
2351+ })
2352+ if rc :
2353+ self .log .warning (
2354+ f"exit maintenance request failed to UNSET the noout group for { hostname } , (rc={ rc } )" )
2355+ if not force :
2356+ raise OrchestratorError (f"Unable to set the osds on { hostname } to noout (rc={ rc } )" )
2357+ else :
2358+ self .log .info (
2359+ f"exit maintenance request has UNSET for the noout group on host { hostname } " )
23202360
23212361 # update the host record status
23222362 tgt_host ['status' ] = ""
23232363 self .inventory ._inventory [hostname ] = tgt_host
23242364 self .inventory .save ()
23252365
2326- self ._set_maintenance_healthcheck ()
2366+ self .set_maintenance_healthcheck ()
23272367
23282368 return f"Ceph cluster { self ._cluster_fsid } on { hostname } has exited maintenance mode"
23292369
0 commit comments