
Commit 36e5d17

Merge pull request ceph#61487 from adk3798/cephadm-rgw-exit-timeout-secs
mgr/cephadm: allow setting up RGW delaying shutdown to complete client connections

Reviewed-by: Shweta Bhosale <[email protected]>

2 parents 296e257 + b84bb72

File tree

9 files changed, +123 -0 lines changed

doc/cephadm/services/rgw.rst

Lines changed: 35 additions & 0 deletions

@@ -229,6 +229,41 @@ RGW daemons deployed for that RGW service. For example
 The daemon can still receive replication data unless it has been removed
 from the zonegroup and zone replication endpoints.
 
+Draining client connections on shutdown
+---------------------------------------
+
+When an RGW daemon is stopped for any reason, including during the cephadm upgrade process,
+RGW offers a setting that delays shutdown while the daemon attempts to complete ongoing
+client requests. This setting is off by default but may be activated manually, either by
+passing ``--stop-timeout=<timeout-in-seconds>`` to the RGW process or by setting the
+``rgw_exit_timeout_secs`` config option for the RGW daemon. The value may also be
+configured in the RGW service spec file via the ``rgw_exit_timeout_secs`` parameter.
+For example
+
+.. code-block:: yaml
+
+    service_type: rgw
+    service_id: foo
+    placement:
+      label: rgw
+    spec:
+      rgw_realm: myrealm
+      rgw_zone: myzone
+      rgw_zonegroup: myzg
+      rgw_exit_timeout_secs: 120
+
+would tell the RGW daemons cephadm deploys for the rgw.foo service to wait up to 120
+seconds for current client requests to complete. Note that the RGW daemon will refuse
+new client requests during this time.
+
+.. note:: In cephadm deployments this setting is on by default with a timeout of 120
+   seconds. To disable this feature, set ``rgw_exit_timeout_secs`` to 0 in the spec.
+
+.. note:: Modifications to this setting in the spec will not be picked up by the RGW daemons
+   in the service until they are redeployed using either the ``ceph orch redeploy <service-name>``
+   or ``ceph orch daemon redeploy <daemon-name>`` command.
+
 Service specification
 ---------------------
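For readers who want to sanity-check such a spec before applying it, the service-spec classes touched below in src/python-common can parse it directly. A minimal sketch; the use of yaml.safe_load plus ServiceSpec.from_json here is illustrative and not part of this commit:

    import textwrap

    import yaml

    from ceph.deployment.service_spec import RGWSpec, ServiceSpec

    SPEC = textwrap.dedent("""
        service_type: rgw
        service_id: foo
        placement:
          label: rgw
        spec:
          rgw_realm: myrealm
          rgw_zone: myzone
          rgw_zonegroup: myzg
          rgw_exit_timeout_secs: 120
    """)

    # from_json dispatches on service_type, so this yields an RGWSpec.
    spec = ServiceSpec.from_json(yaml.safe_load(SPEC))
    assert isinstance(spec, RGWSpec)
    assert spec.rgw_exit_timeout_secs == 120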

src/cephadm/cephadmlib/daemons/ceph.py

Lines changed: 4 additions & 0 deletions

@@ -90,6 +90,10 @@ def container(self, ctx: CephadmContext) -> CephContainer:
             # but that doesn't seem to persist in the object after it's passed
             # in further function calls
             ctr.args = ctr.args + ['--set-crush-location', c_loc]
+        if self.identity.daemon_type == 'rgw' and config_json is not None:
+            if 'rgw_exit_timeout_secs' in config_json:
+                stop_timeout = config_json['rgw_exit_timeout_secs']
+                ctr.args = ctr.args + [f'--stop-timeout={stop_timeout}']
         return ctr
 
     _uid_gid: Optional[Tuple[int, int]] = None
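Taken on its own, the hunk's effect is easy to demonstrate. The following standalone re-creation of the arg-injection logic is for illustration only; the helper function and plain dict stand in for the real CephContainer and DaemonIdentity machinery and are not part of this commit:

    from typing import Any, Dict, List, Optional

    def rgw_stop_timeout_args(
        daemon_type: str, config_json: Optional[Dict[str, Any]]
    ) -> List[str]:
        # Only RGW daemons whose deploy-time config carries
        # rgw_exit_timeout_secs get the extra process flag.
        if daemon_type == 'rgw' and config_json is not None:
            if 'rgw_exit_timeout_secs' in config_json:
                timeout = config_json['rgw_exit_timeout_secs']
                return [f'--stop-timeout={timeout}']
        return []

    assert rgw_stop_timeout_args('rgw', {'rgw_exit_timeout_secs': 120}) == ['--stop-timeout=120']
    assert rgw_stop_timeout_args('mon', {'rgw_exit_timeout_secs': 120}) == []
    assert rgw_stop_timeout_args('rgw', None) == []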

src/cephadm/tests/test_cephadm.py

Lines changed: 73 additions & 0 deletions

@@ -461,6 +461,79 @@ def _crush_location_checker(ctx, ident, container, uid, gid, **kwargs):
         _cephadm.command_deploy_from(ctx)
         _deploy_daemon.assert_called()
 
+    def test_rgw_exit_timeout(self, funkypatch):
+        """
+        test that rgw exit timeout secs is set properly
+        """
+        funkypatch.patch('cephadm.logger')
+        funkypatch.patch('cephadm.FileLock')
+        _deploy_daemon = funkypatch.patch('cephadm.deploy_daemon')
+        funkypatch.patch('cephadm.make_var_run')
+        funkypatch.patch('cephadmlib.file_utils.make_run_dir')
+        funkypatch.patch('os.mkdir')
+        _migrate_sysctl = funkypatch.patch('cephadm.migrate_sysctl_dir')
+        funkypatch.patch(
+            'cephadm.check_unit',
+            dest=lambda *args, **kwargs: (None, 'running', None),
+        )
+        funkypatch.patch(
+            'cephadm.get_unit_name',
+            dest=lambda *args, **kwargs: 'mon-unit-name',
+        )
+        funkypatch.patch(
+            'cephadm.extract_uid_gid', dest=lambda *args, **kwargs: (0, 0)
+        )
+        _get_container = funkypatch.patch('cephadm.get_container')
+        funkypatch.patch(
+            'cephadm.apply_deploy_config_to_ctx', dest=lambda d, c: None
+        )
+        _fetch_configs = funkypatch.patch(
+            'cephadmlib.context_getters.fetch_configs'
+        )
+        funkypatch.patch(
+            'cephadm.read_configuration_source', dest=lambda c: {}
+        )
+        funkypatch.patch('cephadm.fetch_custom_config_files')
+
+        ctx = _cephadm.CephadmContext()
+        ctx.name = 'rgw.foo.test.abcdef'
+        ctx.fsid = 'b66e5288-d8ea-11ef-b953-525400f9646d'
+        ctx.reconfig = False
+        ctx.container_engine = mock_docker()
+        ctx.allow_ptrace = True
+        ctx.config_json = '-'
+        ctx.osd_fsid = '0'
+        ctx.tcp_ports = '3300 6789'
+        _fetch_configs.return_value = {
+            'rgw_exit_timeout_secs': 200
+        }
+
+        _get_container.return_value = _cephadm.CephContainer.for_daemon(
+            ctx,
+            ident=_cephadm.DaemonIdentity(
+                fsid='b66e5288-d8ea-11ef-b953-525400f9646d',
+                daemon_type='rgw',
+                daemon_id='foo.test.abcdef',
+            ),
+            entrypoint='',
+            args=[],
+            container_args=[],
+            volume_mounts={},
+            bind_mounts=[],
+            envs=[],
+            privileged=False,
+            ptrace=False,
+            host_network=True,
+        )
+
+        def _exit_timeout_secs_checker(ctx, ident, container, uid, gid, **kwargs):
+            argval = ' '.join(container.args)
+            assert '--stop-timeout=200' in argval
+
+        _deploy_daemon.side_effect = _exit_timeout_secs_checker
+        _cephadm.command_deploy_from(ctx)
+        _deploy_daemon.assert_called()
+
     @mock.patch('cephadm.logger')
     @mock.patch('cephadm.fetch_custom_config_files')
     def test_write_custom_conf_files(self, _get_config, _logger, cephadm_fs):

src/pybind/mgr/cephadm/services/cephadmservice.py

Lines changed: 4 additions & 0 deletions

@@ -1319,6 +1319,10 @@ def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
     def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
         svc_spec = cast(RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
         config, parent_deps = super().generate_config(daemon_spec)
+
+        if hasattr(svc_spec, 'rgw_exit_timeout_secs') and svc_spec.rgw_exit_timeout_secs:
+            config['rgw_exit_timeout_secs'] = svc_spec.rgw_exit_timeout_secs
+
         rgw_deps = parent_deps + self.get_dependencies(self.mgr, svc_spec)
         return config, rgw_deps
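End to end, the mgr module copies the spec field into the per-daemon deploy config, and the cephadm binary (ceph.py hunk above) turns that config entry into the --stop-timeout process flag. A compressed sketch of the whole flow, with plain dicts standing in for the spec store and deploy config (an illustration, not the real classes):

    from typing import Any, Dict, List

    svc_spec: Dict[str, Any] = {'service_name': 'rgw.foo', 'rgw_exit_timeout_secs': 120}
    config: Dict[str, Any] = {}

    # mgr/cephadm side: forward the spec field into the deploy-time config ...
    if svc_spec.get('rgw_exit_timeout_secs'):
        config['rgw_exit_timeout_secs'] = svc_spec['rgw_exit_timeout_secs']

    # ... cephadm side: translate the config entry into the RGW process flag.
    args: List[str] = []
    if 'rgw_exit_timeout_secs' in config:
        args.append(f"--stop-timeout={config['rgw_exit_timeout_secs']}")

    assert args == ['--stop-timeout=120']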

src/pybind/mgr/cephadm/tests/test_cephadm.py

Lines changed: 1 addition & 0 deletions

@@ -273,6 +273,7 @@ def remove_id_events(dd):
         'service_id': 'r.z',
         'service_name': 'rgw.r.z',
         'service_type': 'rgw',
+        'spec': {'rgw_exit_timeout_secs': 120},
         'status': {'created': mock.ANY, 'running': 1, 'size': 1,
                    'ports': [80]},
     }

src/pybind/mgr/cephadm/tests/test_migration.py

Lines changed: 1 addition & 0 deletions

@@ -338,6 +338,7 @@ def test_migrate_rgw_spec(cephadm_module: CephadmOrchestrator, rgw_spec_store_en
                                  'rgw_thread_pool_size=512'],
             'rgw_frontend_port': '5000',
             'rgw_frontend_type': 'beast',
+            'rgw_exit_timeout_secs': 120,
         }}
     else:
         # in a real environment, we still expect the spec to be there,

src/pybind/mgr/cephadm/tests/test_spec.py

Lines changed: 1 addition & 0 deletions

@@ -118,6 +118,7 @@ def convert_to_old_style_json(j):
         j_c.pop('objectstore', None)
         j_c.pop('filter_logic', None)
         j_c.pop('anonymous_access', None)
+        j_c.pop('rgw_exit_timeout_secs', None)
         return j_c
 
     assert spec_json == convert_to_old_style_json(spec.to_json())

src/python-common/ceph/deployment/service_spec.py

Lines changed: 3 additions & 0 deletions

@@ -1237,6 +1237,7 @@ def __init__(self,
                  generate_cert: bool = False,
                  disable_multisite_sync_traffic: Optional[bool] = None,
                  wildcard_enabled: Optional[bool] = False,
+                 rgw_exit_timeout_secs: int = 120,
                  ):
         assert service_type == 'rgw', service_type

@@ -1296,6 +1297,8 @@ def __init__(self,
         self.wildcard_enabled = wildcard_enabled
         #: Attributes for <zone-name>.rgw.buckets.data pool created in rgw realm bootstrap command
         self.data_pool_attributes = data_pool_attributes
+        #: How long the RGW will wait to try and complete client requests when told to shut down
+        self.rgw_exit_timeout_secs = rgw_exit_timeout_secs
 
     def get_port_start(self) -> List[int]:
         ports = self.get_port()
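Because the new parameter defaults to 120 rather than None, every RGWSpec carries the field even when a user never sets it, which is what makes connection draining on by default under cephadm. A quick sketch, assuming the python-common package is importable and that RGWSpec accepts service_id as a keyword (a ServiceSpec field not shown in this diff):

    from ceph.deployment.service_spec import RGWSpec

    spec = RGWSpec(service_id='foo')
    assert spec.rgw_exit_timeout_secs == 120   # cephadm default: draining enabled

    spec_off = RGWSpec(service_id='foo', rgw_exit_timeout_secs=0)
    assert spec_off.rgw_exit_timeout_secs == 0  # 0 disables the shutdown delay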

src/python-common/ceph/tests/test_service_spec.py

Lines changed: 1 addition & 0 deletions

@@ -335,6 +335,7 @@ def test_osd_unmanaged():
     - 10.0.0.0/8
     - 192.168.0.0/16
 spec:
+  rgw_exit_timeout_secs: 60
   rgw_frontend_type: civetweb
   rgw_realm: default-rgw-realm
   rgw_zone: eu-central-1
