Commit 28e38e3

Merge pull request ceph#59483 from kamoltat/wip-ksirivad-exit-stretch-mode
mon [stretch mode]: support disable_stretch_mode

Reviewed-by: Nitzan Mordechai <[email protected]>
2 parents a91bcae + a7f3b7b commit 28e38e3

12 files changed: +945 additions, -2 deletions


doc/rados/operations/stretch-mode.rst

Lines changed: 28 additions & 0 deletions
@@ -247,6 +247,34 @@ possible, if needed).

.. _Changing Monitor elections: ../change-mon-elections

Exiting Stretch Mode
====================
To exit stretch mode, run the following command:

.. prompt:: bash $

   ceph mon disable_stretch_mode [{crush_rule}] --yes-i-really-mean-it

.. describe:: {crush_rule}

   The CRUSH rule that the user wants all pools to move back to. If this
   is not specified, the pools will move back to the default CRUSH rule.

   :Type: String
   :Required: No.

This command moves the cluster back to normal mode; the cluster will no
longer be in stretch mode. All pools will have their ``size`` and
``min_size`` reverted to the default values they started with. At this
point the user is responsible for scaling the cluster down to the desired
number of OSDs if they choose to operate with fewer OSDs.

Please note that the command will not execute while the cluster is in
``recovery stretch mode``. The command will only execute when the cluster
is in ``degraded stretch mode`` or ``healthy stretch mode``.

Limitations of Stretch Mode
===========================
When using stretch mode, OSDs must be located at exactly two sites.
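As a concrete illustration of the documented command (hypothetical, not part of this commit; ``replicated_rule`` is assumed here as the fallback CRUSH rule, since that is the usual default rule name), a teuthology-style invocation through the CephManager class extended later in this commit might look like:

    # Sketch only: exit stretch mode, falling back to an assumed rule name.
    # `manager` is assumed to be an initialized ceph_manager.CephManager.
    manager.raw_cluster_cmd(
        'mon', 'disable_stretch_mode',
        'replicated_rule',             # optional {crush_rule} argument (assumed name)
        '--yes-i-really-mean-it',
    )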
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
roles:
- - mon.a
  - mon.b
  - mgr.a
  - mgr.b
  - osd.0
  - osd.1
  - osd.2
  - osd.3
- - mon.c
  - mon.d
  - mgr.c
  - mgr.d
  - osd.4
  - osd.5
  - osd.6
  - osd.7
- - mon.e
- - client.0

openstack:
- volumes: # attached to each instance
    count: 3
    size: 10 # GB
overrides:
  ceph:
    conf:
      global:
        mon election default strategy: 3
        osd pool default size: 3
        osd pool default min size: 2
      mon:
        debug mon: 30
tasks:
- install:
- ceph:
    pre-mgr-commands:
      - sudo ceph config set mgr mgr_pool false --force
    log-ignorelist:
      - \(POOL_
      - \(CACHE_POOL_
      - overall HEALTH_
      - \(PG_AVAILABILITY\)
      - Reduced data availability
      - \(PG_DEGRADED\)
      - \(MON_DOWN\)
      - \(OSD_DATACENTER_DOWN\)
      - \(OSD_DOWN\)
      - \(OSD_HOST_DOWN\)

- workunit:
    clients:
      client.0:
        - mon/mon-stretch-mode-5-mons-8-osds.sh
- cephfs_test_runner:
    modules:
      - tasks.stretch_mode_disable_enable
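The suite file above runs a workunit plus the ``tasks.stretch_mode_disable_enable`` test module, which is not shown in this excerpt. As a rough, hypothetical sketch of how such a module might drive the exit path with the manager helpers added below (the flow and the ``stretch_mode`` field name in the mon dump JSON are assumptions, not taken from the actual module):

    # Assumed flow, for illustration only: disable stretch mode and confirm
    # the monmap no longer reports it.
    def exit_stretch_mode(manager):
        manager.raw_cluster_cmd('mon', 'disable_stretch_mode',
                                '--yes-i-really-mean-it')
        mon_dump = manager.get_mon_dump_json()
        assert not mon_dump.get('stretch_mode', False), \
            'cluster still reports stretch_mode in the monmap'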

qa/tasks/ceph_manager.py

Lines changed: 90 additions & 0 deletions
@@ -2796,6 +2796,59 @@ def _get_num_peered(self, pgs):
                num += 1
        return num

    def _print_not_active_clean_pg(self, pgs):
        """
        Print the PGs that are not active+clean.
        """
        for pg in pgs:
            if not (pg['state'].count('active') and
                    pg['state'].count('clean') and
                    not pg['state'].count('stale')):
                log.debug(
                    "PG %s is not active+clean, but %s",
                    pg['pgid'], pg['state']
                )

    def pg_all_active_clean(self):
        """
        Check if all pgs are active+clean
        return: True if all pgs are active+clean else False
        """
        pgs = self.get_pg_stats()
        result = self._get_num_active_clean(pgs) == len(pgs)
        if result:
            log.debug("All PGs are active+clean")
        else:
            log.debug("Not all PGs are active+clean")
            self._print_not_active_clean_pg(pgs)
        return result

    def _print_not_active_pg(self, pgs):
        """
        Print the PGs that are not active.
        """
        for pg in pgs:
            if not (pg['state'].count('active')
                    and not pg['state'].count('stale')):
                log.debug(
                    "PG %s is not active, but %s",
                    pg['pgid'], pg['state']
                )

    def pg_all_active(self):
        """
        Check if all pgs are active
        return: True if all pgs are active else False
        """
        pgs = self.get_pg_stats()
        result = self._get_num_active(pgs) == len(pgs)
        if result:
            log.debug("All PGs are active")
        else:
            log.debug("Not all PGs are active")
            self._print_not_active_pg(pgs)
        return result

    def is_clean(self):
        """
        True if all pgs are clean
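A minimal sketch (not part of this commit) of how a test might use the new ``pg_all_active_clean()`` helper to wait for the cluster to settle; the timeout and polling interval values are arbitrary assumptions:

    import time

    def wait_for_all_active_clean(manager, timeout=300, interval=10):
        # Poll the helper added in this commit until every PG is
        # active+clean or the timeout expires.
        start = time.time()
        while not manager.pg_all_active_clean():
            if time.time() - start > timeout:
                raise RuntimeError('timed out waiting for all PGs to be active+clean')
            time.sleep(interval)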
@@ -3237,6 +3290,26 @@ def revive_mgr(self, mgr):
        self.make_admin_daemon_dir(remote)
        self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()

    def get_crush_rule_id(self, crush_rule_name):
        """
        Get crush rule id by name
        :returns: int -- crush rule id
        """
        out = self.raw_cluster_cmd('osd', 'crush', 'rule', 'dump', '--format=json')
        j = json.loads('\n'.join(out.split('\n')[1:]))
        for rule in j:
            if rule['rule_name'] == crush_rule_name:
                return rule['rule_id']
        assert False, 'rule %s not found' % crush_rule_name

    def get_mon_dump_json(self):
        """
        mon dump --format=json converted to a python object
        :returns: the python object
        """
        out = self.raw_cluster_cmd('mon', 'dump', '--format=json')
        return json.loads('\n'.join(out.split('\n')[1:]))

    def get_mon_status(self, mon):
        """
        Extract all the monitor status information from the cluster
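A hypothetical check (not in this commit) showing how ``get_crush_rule_id()`` could be combined with the existing ``get_osd_dump_json()`` to verify that every pool moved back to the expected CRUSH rule after exiting stretch mode; the ``pools``, ``crush_rule``, and ``pool_name`` keys are assumed from the osd dump JSON layout:

    def assert_pools_use_rule(manager, rule_name):
        # Resolve the rule name to its numeric id, then compare against the
        # rule recorded for each pool in the osdmap.
        rule_id = manager.get_crush_rule_id(rule_name)
        osdmap = manager.get_osd_dump_json()
        for pool in osdmap['pools']:
            assert pool['crush_rule'] == rule_id, \
                'pool %s still uses crush rule %d' % (pool['pool_name'],
                                                      pool['crush_rule'])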
@@ -3340,6 +3413,23 @@ def get_service_task_status(self, service, status_key):
        self.log(task_status)
        return task_status

    # Stretch mode related functions
    def is_degraded_stretch_mode(self):
        """
        Return whether the cluster is in degraded stretch mode
        """
        try:
            osdmap = self.get_osd_dump_json()
            stretch_mode = osdmap.get('stretch_mode', {})
            degraded_stretch_mode = stretch_mode.get('degraded_stretch_mode', 0)
            self.log("is_degraded_stretch_mode: {0}".format(degraded_stretch_mode))
            return degraded_stretch_mode == 1
        except (TypeError, AttributeError) as e:
            # Log the error or handle it as needed
            self.log("Error accessing degraded_stretch_mode: {0}".format(e))
            return False


def utility_task(name):
    """
    Generate ceph_manager subtask corresponding to ceph_manager
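A sketch (assumption, not part of this commit) of how a test that takes one data center down might poll the new ``is_degraded_stretch_mode()`` helper before exercising later steps; the timeout and interval values are arbitrary:

    import time

    def wait_until_degraded_stretch_mode(manager, timeout=600, interval=15):
        # Poll until the monitors mark the cluster as being in degraded
        # stretch mode, or give up after the timeout.
        start = time.time()
        while not manager.is_degraded_stretch_mode():
            if time.time() - start > timeout:
                raise RuntimeError('cluster never entered degraded stretch mode')
            time.sleep(interval)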
