Skip to content

Commit 5818305

Browse files
Adam Kingadk3798
authored andcommitted
cephadm: handle "systemctl start" failures during deployment better
Previously it was assumed when the deploy command fails whatever daemon we were trying to deploy does not exist on the host. However, in the specific case where deploy fails trying to start the daemon's systemd unit this is not the case. This leads us to both cleanup the keyring for the daemon and also causes us to not trigger a refresh of the daemons on the host which can make cephadm attempt to deploy another daemon instead of just reporting the existing one as failed. To get around this we need to handle that specific failure as a success in the mgr module's deploy workflow so that we refresh the daemons and report the failure as intended. https://tracker.ceph.com/issues/68536 Signed-off-by: Adam King <[email protected]>
1 parent bd0160d commit 5818305

File tree

6 files changed

+84
-4
lines changed

6 files changed

+84
-4
lines changed

src/cephadm/cephadm.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
LOG_DIR_MODE,
5858
SYSCTL_DIR,
5959
UNIT_DIR,
60+
DAEMON_FAILED_ERROR,
6061
)
6162
from cephadmlib.context import CephadmContext
6263
from cephadmlib.context_getters import (
@@ -72,6 +73,7 @@
7273
ClusterAlreadyExists,
7374
Error,
7475
UnauthorizedRegistryError,
76+
DaemonStartException,
7577
)
7678
from cephadmlib.exe_utils import find_executable, find_program
7779
from cephadmlib.call_wrappers import (
@@ -1246,7 +1248,11 @@ def deploy_daemon_units(
12461248
call_throws(ctx, ['systemctl', 'enable', unit_name])
12471249
if start:
12481250
clean_cgroup(ctx, ident.fsid, unit_name)
1249-
call_throws(ctx, ['systemctl', 'start', unit_name])
1251+
try:
1252+
call_throws(ctx, ['systemctl', 'start', unit_name])
1253+
except Exception as e:
1254+
logger.error(f'systemctl start failed for {unit_name}: {str(e)}')
1255+
raise DaemonStartException()
12501256

12511257

12521258
def _osd_unit_run_commands(
@@ -3046,7 +3052,10 @@ def get_deployment_type(
30463052
@deprecated_command
30473053
def command_deploy(ctx):
30483054
# type: (CephadmContext) -> None
3049-
_common_deploy(ctx)
3055+
try:
3056+
_common_deploy(ctx)
3057+
except DaemonStartException:
3058+
sys.exit(DAEMON_FAILED_ERROR)
30503059

30513060

30523061
def apply_deploy_config_to_ctx(
@@ -3089,7 +3098,10 @@ def command_deploy_from(ctx: CephadmContext) -> None:
30893098
config_data = read_configuration_source(ctx)
30903099
logger.debug('Loaded deploy configuration: %r', config_data)
30913100
apply_deploy_config_to_ctx(config_data, ctx)
3092-
_common_deploy(ctx)
3101+
try:
3102+
_common_deploy(ctx)
3103+
except DaemonStartException:
3104+
sys.exit(DAEMON_FAILED_ERROR)
30933105

30943106

30953107
def _common_deploy(ctx: CephadmContext) -> None:

src/cephadm/cephadmlib/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,4 @@
5454
NO_DEPRECATED = False
5555
UID_NOBODY = 65534
5656
GID_NOGROUP = 65534
57+
DAEMON_FAILED_ERROR = 17

src/cephadm/cephadmlib/exceptions.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error):
1919

2020
class PortOccupiedError(Error):
2121
pass
22+
23+
24+
class DaemonStartException(Exception):
25+
"""
26+
Special exception type we raise when the
27+
systemctl start command fails during daemon
28+
deployment. Necessary because the cephadm mgr module
29+
needs to handle this case differently than a failure
30+
earlier in the deploy process where no attempt was made
31+
to actually start the daemon
32+
"""
33+
34+
pass

src/pybind/mgr/cephadm/serve.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,8 +1436,24 @@ async def _create_daemon(self,
14361436
config_blobs=daemon_spec.final_config,
14371437
).dump_json_str(),
14381438
use_current_daemon_image=reconfig,
1439+
error_ok=True
14391440
)
14401441

1442+
# return number corresponding to DAEMON_FAILED_ERROR
1443+
# in src/cephadm/cephadmlib/constants.
1444+
# TODO: link these together so one cannot be changed without the other
1445+
if code == 17:
1446+
# daemon failed on systemctl start command, meaning while
1447+
# deployment failed the daemon is present and we should handle
1448+
# this as if the deploy command "succeeded" and mark the daemon
1449+
# as failed later when we fetch its status
1450+
self.mgr.log.error(f'Deployment of {daemon_spec.name()} failed during "systemctl start" command')
1451+
elif code:
1452+
# some other failure earlier in the deploy process. Just raise an exception
1453+
# the same as we would in _run_cephadm on a nonzero rc
1454+
raise OrchestratorError(
1455+
f'cephadm exited with an error code: {code}, stderr: {err}')
1456+
14411457
if daemon_spec.daemon_type == 'agent':
14421458
self.mgr.agent_cache.agent_timestamp[daemon_spec.host] = datetime_now()
14431459
self.mgr.agent_cache.agent_counter[daemon_spec.host] = 1

src/pybind/mgr/cephadm/tests/test_cephadm.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ async def _ceph_volume_list(s, host, entity, cmd, **kwargs):
136136
mock.call(host, 'osd', 'ceph-volume',
137137
['--', 'lvm', 'list', '--format', 'json'],
138138
no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
139-
mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False),
139+
mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, error_ok=True, use_current_daemon_image=False),
140140
mock.call(host, 'osd', 'ceph-volume',
141141
['--', 'raw', 'list', '--format', 'json'],
142142
no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
@@ -563,6 +563,7 @@ def test_daemon_check_extra_config(self, _run_cephadm, cephadm_module: CephadmOr
563563
},
564564
},
565565
}),
566+
error_ok=True,
566567
use_current_daemon_image=True,
567568
)
568569

@@ -618,6 +619,7 @@ def test_mon_crush_location_deployment(self, _run_cephadm, cephadm_module: Cepha
618619
"crush_location": "datacenter=a",
619620
},
620621
}),
622+
error_ok=True,
621623
use_current_daemon_image=False,
622624
)
623625

@@ -660,6 +662,7 @@ def test_extra_container_args(self, _run_cephadm, cephadm_module: CephadmOrchest
660662
"keyring": "[client.crash.test]\nkey = None\n",
661663
},
662664
}),
665+
error_ok=True,
663666
use_current_daemon_image=False,
664667
)
665668

@@ -702,6 +705,7 @@ def test_extra_entrypoint_args(self, _run_cephadm, cephadm_module: CephadmOrches
702705
},
703706
"config_blobs": {},
704707
}),
708+
error_ok=True,
705709
use_current_daemon_image=False,
706710
)
707711

@@ -752,6 +756,7 @@ def test_extra_entrypoint_and_container_args(self, _run_cephadm, cephadm_module:
752756
},
753757
"config_blobs": {},
754758
}),
759+
error_ok=True,
755760
use_current_daemon_image=False,
756761
)
757762

@@ -806,6 +811,7 @@ def test_extra_entrypoint_and_container_args_with_spaces(self, _run_cephadm, cep
806811
},
807812
"config_blobs": {},
808813
}),
814+
error_ok=True,
809815
use_current_daemon_image=False,
810816
)
811817

0 commit comments

Comments
 (0)