Skip to content

Commit 186b38e

Browse files
authored
Merge pull request ceph#60303 from adk3798/handle-failed-daemon-start
cephadm: handle "systemctl start" failures during deployment better Reviewed-by: John Mulligan <[email protected]>
2 parents 427cd39 + 5818305 commit 186b38e

File tree

6 files changed

+84
-4
lines changed

6 files changed

+84
-4
lines changed

src/cephadm/cephadm.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
LOG_DIR_MODE,
5858
SYSCTL_DIR,
5959
UNIT_DIR,
60+
DAEMON_FAILED_ERROR,
6061
)
6162
from cephadmlib.context import CephadmContext
6263
from cephadmlib.context_getters import (
@@ -72,6 +73,7 @@
7273
ClusterAlreadyExists,
7374
Error,
7475
UnauthorizedRegistryError,
76+
DaemonStartException,
7577
)
7678
from cephadmlib.exe_utils import find_executable, find_program
7779
from cephadmlib.call_wrappers import (
@@ -1246,7 +1248,11 @@ def deploy_daemon_units(
12461248
call_throws(ctx, ['systemctl', 'enable', unit_name])
12471249
if start:
12481250
clean_cgroup(ctx, ident.fsid, unit_name)
1249-
call_throws(ctx, ['systemctl', 'start', unit_name])
1251+
try:
1252+
call_throws(ctx, ['systemctl', 'start', unit_name])
1253+
except Exception as e:
1254+
logger.error(f'systemctl start failed for {unit_name}: {str(e)}')
1255+
raise DaemonStartException()
12501256

12511257

12521258
def _osd_unit_run_commands(
@@ -3050,7 +3056,10 @@ def get_deployment_type(
30503056
@deprecated_command
30513057
def command_deploy(ctx):
30523058
# type: (CephadmContext) -> None
3053-
_common_deploy(ctx)
3059+
try:
3060+
_common_deploy(ctx)
3061+
except DaemonStartException:
3062+
sys.exit(DAEMON_FAILED_ERROR)
30543063

30553064

30563065
def apply_deploy_config_to_ctx(
@@ -3093,7 +3102,10 @@ def command_deploy_from(ctx: CephadmContext) -> None:
30933102
config_data = read_configuration_source(ctx)
30943103
logger.debug('Loaded deploy configuration: %r', config_data)
30953104
apply_deploy_config_to_ctx(config_data, ctx)
3096-
_common_deploy(ctx)
3105+
try:
3106+
_common_deploy(ctx)
3107+
except DaemonStartException:
3108+
sys.exit(DAEMON_FAILED_ERROR)
30973109

30983110

30993111
def _common_deploy(ctx: CephadmContext) -> None:

src/cephadm/cephadmlib/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,4 @@
3636
NO_DEPRECATED = False
3737
UID_NOBODY = 65534
3838
GID_NOGROUP = 65534
39+
DAEMON_FAILED_ERROR = 17

src/cephadm/cephadmlib/exceptions.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error):
1919

2020
class PortOccupiedError(Error):
2121
pass
22+
23+
24+
class DaemonStartException(Exception):
25+
"""
26+
Special exception type we raise when the
27+
systemctl start command fails during daemon
28+
deployment. Necessary because the cephadm mgr module
29+
needs to handle this case differently than a failure
30+
earlier in the deploy process where no attempt was made
31+
to actually start the daemon
32+
"""
33+
34+
pass

src/pybind/mgr/cephadm/serve.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,8 +1436,24 @@ async def _create_daemon(self,
14361436
config_blobs=daemon_spec.final_config,
14371437
).dump_json_str(),
14381438
use_current_daemon_image=reconfig,
1439+
error_ok=True
14391440
)
14401441

1442+
# return number corresponding to DAEMON_FAILED_ERROR
1443+
# in src/cephadm/cephadmlib/constants.
1444+
# TODO: link these together so one cannot be changed without the other
1445+
if code == 17:
1446+
# daemon failed on systemctl start command, meaning while
1447+
# deployment failed the daemon is present and we should handle
1448+
# this as if the deploy command "succeeded" and mark the daemon
1449+
# as failed later when we fetch its status
1450+
self.mgr.log.error(f'Deployment of {daemon_spec.name()} failed during "systemctl start" command')
1451+
elif code:
1452+
# some other failure earlier in the deploy process. Just raise an exception
1453+
# the same as we would in _run_cephadm on a nonzero rc
1454+
raise OrchestratorError(
1455+
f'cephadm exited with an error code: {code}, stderr: {err}')
1456+
14411457
if daemon_spec.daemon_type == 'agent':
14421458
self.mgr.agent_cache.agent_timestamp[daemon_spec.host] = datetime_now()
14431459
self.mgr.agent_cache.agent_counter[daemon_spec.host] = 1

src/pybind/mgr/cephadm/tests/test_cephadm.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ async def _ceph_volume_list(s, host, entity, cmd, **kwargs):
136136
mock.call(host, 'osd', 'ceph-volume',
137137
['--', 'lvm', 'list', '--format', 'json'],
138138
no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
139-
mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False),
139+
mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, error_ok=True, use_current_daemon_image=False),
140140
mock.call(host, 'osd', 'ceph-volume',
141141
['--', 'raw', 'list', '--format', 'json'],
142142
no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
@@ -563,6 +563,7 @@ def test_daemon_check_extra_config(self, _run_cephadm, cephadm_module: CephadmOr
563563
},
564564
},
565565
}),
566+
error_ok=True,
566567
use_current_daemon_image=True,
567568
)
568569

@@ -618,6 +619,7 @@ def test_mon_crush_location_deployment(self, _run_cephadm, cephadm_module: Cepha
618619
"crush_location": "datacenter=a",
619620
},
620621
}),
622+
error_ok=True,
621623
use_current_daemon_image=False,
622624
)
623625

@@ -660,6 +662,7 @@ def test_extra_container_args(self, _run_cephadm, cephadm_module: CephadmOrchest
660662
"keyring": "[client.crash.test]\nkey = None\n",
661663
},
662664
}),
665+
error_ok=True,
663666
use_current_daemon_image=False,
664667
)
665668

@@ -702,6 +705,7 @@ def test_extra_entrypoint_args(self, _run_cephadm, cephadm_module: CephadmOrches
702705
},
703706
"config_blobs": {},
704707
}),
708+
error_ok=True,
705709
use_current_daemon_image=False,
706710
)
707711

@@ -752,6 +756,7 @@ def test_extra_entrypoint_and_container_args(self, _run_cephadm, cephadm_module:
752756
},
753757
"config_blobs": {},
754758
}),
759+
error_ok=True,
755760
use_current_daemon_image=False,
756761
)
757762

@@ -806,6 +811,7 @@ def test_extra_entrypoint_and_container_args_with_spaces(self, _run_cephadm, cep
806811
},
807812
"config_blobs": {},
808813
}),
814+
error_ok=True,
809815
use_current_daemon_image=False,
810816
)
811817

0 commit comments

Comments
 (0)