@@ -2272,14 +2272,16 @@ def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_reall

     @handle_orch_error
     @host_exists()
-    def exit_host_maintenance(self, hostname: str) -> str:
+    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> str:
         """Exit maintenance mode and return a host to an operational state

         Returning from maintenance will enable the clusters systemd target and
         start it, and remove any noout that has been added for the host if the
         host has osd daemons

         :param hostname: (str) host name
+        :param force: (bool) force removal of the host from maintenance mode
+        :param offline: (bool) to remove hosts that are offline from maintenance mode

         :raises OrchestratorError: Unable to return from maintenance, or unset the
             noout flag
@@ -2288,30 +2290,67 @@ def exit_host_maintenance(self, hostname: str) -> str:
         if tgt_host['status'] != "maintenance":
             raise OrchestratorError(f"Host {hostname} is not in maintenance mode")

-        with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
-            outs, errs, _code = self.wait_async(
-                CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
-                                                'host-maintenance', ['exit'], error_ok=True))
-        returned_msg = errs[0].split('\n')[-1]
-        if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
-            raise OrchestratorError(
-                f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
-
-        if "osd" in self.cache.get_daemon_types(hostname):
-            crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
-            rc, _out, _err = self.mon_command({
-                'prefix': 'osd unset-group',
-                'flags': 'noout',
-                'who': [crush_node],
-                'format': 'json'
-            })
-            if rc:
+        # Given we do not regularly check maintenance mode hosts for being offline,
+        # we have no idea at this point whether the host is online or not.
+        # Keep in mind this goes both ways, as users could have run
+        # "ceph cephadm check-host <hostname>" when the host was in maintenance
+        # mode and offline and the host could have since come online. This following
+        # "cephadm check-host" command is being run purely so we know if the host
+        # is online or offline, as those should be handled differently
+        try:
+            with self.async_timeout_handler(hostname, 'cephadm check-host'):
+                outs, errs, _code = self.wait_async(
+                    CephadmServe(self)._run_cephadm(
+                        hostname, cephadmNoImage,
+                        'check-host', [], error_ok=False
+                    )
+                )
+        except OrchestratorError:
+            pass
+
+        host_offline = hostname in self.offline_hosts
+
+        if host_offline and not offline:
+            raise OrchestratorValidationError(
+                f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode')
+
+        if not host_offline and offline:
+            raise OrchestratorValidationError(
+                f'{hostname} is online, please take host out of maintenance mode without --offline.')
+
+        if offline and not force:
+            raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force")
+
+        # no point trying these parts if we know the host is offline
+        if not host_offline:
+            with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
+                outs, errs, _code = self.wait_async(
+                    CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
+                                                    'host-maintenance', ['exit'], error_ok=True))
+            returned_msg = errs[0].split('\n')[-1]
+            if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')):
                 self.log.warning(
-                    f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
-                raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
-            else:
-                self.log.info(
-                    f"exit maintenance request has UNSET for the noout group on host {hostname}")
+                    f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+                if not force:
+                    raise OrchestratorError(
+                        f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+
+            if "osd" in self.cache.get_daemon_types(hostname):
+                crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
+                rc, _out, _err = self.mon_command({
+                    'prefix': 'osd unset-group',
+                    'flags': 'noout',
+                    'who': [crush_node],
+                    'format': 'json'
+                })
+                if rc:
+                    self.log.warning(
+                        f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
+                    if not force:
+                        raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
+                else:
+                    self.log.info(
+                        f"exit maintenance request has UNSET for the noout group on host {hostname}")

         # update the host record status
         tgt_host['status'] = ""
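
For readers skimming the change, here is a minimal standalone sketch of the offline/force gating this hunk introduces, separated from the orchestrator plumbing. It assumes the host's reachability has already been determined (the diff derives it from self.offline_hosts after the check-host probe); MaintenanceExitError and the hostname node-3 are hypothetical stand-ins used only for illustration, not part of the cephadm module.

# Illustrative sketch of the validation order used in the diff: flags that
# do not match the host's actual state fail fast, before any
# host-maintenance exit or noout handling is attempted.

class MaintenanceExitError(Exception):
    """Stand-in for OrchestratorValidationError in this sketch."""
    pass


def validate_maintenance_exit(hostname: str, host_offline: bool,
                              force: bool = False, offline: bool = False) -> None:
    if host_offline and not offline:
        # An unreachable host only leaves maintenance when the caller opts in.
        raise MaintenanceExitError(
            f'{hostname} is offline, please use --offline and --force')
    if not host_offline and offline:
        # A reachable host should go through the normal exit path instead.
        raise MaintenanceExitError(
            f'{hostname} is online, please retry without --offline')
    if offline and not force:
        # Mirrors the diff: --offline always requires --force as well.
        raise MaintenanceExitError(
            'Taking an offline host out of maintenance mode requires --force')


# Example: an offline host passes validation only when both flags are supplied.
validate_maintenance_exit('node-3', host_offline=True, offline=True, force=True)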