
Commit 7b41aff

qa/suites/rados: 3-az-stretch-cluster-netsplit test
Test the case where two DCs lose connectivity with each other in a 3-AZ stretch cluster with a stretch pool enabled. Check that the cluster remains accessible and that PGs return to active+clean after the DCs are reconnected. Signed-off-by: Kamoltat <[email protected]>
1 parent fb0011a commit 7b41aff
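The split itself is driven by the disconnect()/reconnect() helpers from qa/tasks/netsplit.py, which the new test imports but this commit does not modify. As a rough, hypothetical sketch only (the real helper may differ), cutting traffic between the two monitor hosts amounts to something like:

# hypothetical sketch of what tasks.netsplit.disconnect() boils down to for a
# pair like ["mon.a", "mon.d"]; the real implementation lives in qa/tasks/netsplit.py
from tasks.netsplit import get_ip_and_ports

def disconnect_sketch(ctx, config):
    (ip1, _) = get_ip_and_ports(ctx, config[0])
    (ip2, _) = get_ip_and_ports(ctx, config[1])
    (host1,) = ctx.cluster.only(config[0]).remotes.keys()
    (host2,) = ctx.cluster.only(config[1]).remotes.keys()
    # drop traffic in both directions so the two sites can no longer reach each other
    host1.run(args=["sudo", "iptables", "-A", "INPUT", "-s", ip2, "-j", "DROP"])
    host2.run(args=["sudo", "iptables", "-A", "INPUT", "-s", ip1, "-j", "DROP"])
    # reconnect() would remove these rules again (iptables -D ...)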

File tree

3 files changed: +354 −14 lines changed

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
roles:
- - mon.a
  - mon.b
  - mon.c
  - osd.0
  - osd.1
  - osd.2
  - mgr.a
  - mgr.b
- - mon.d
  - mon.e
  - mon.f
  - osd.3
  - osd.4
  - osd.5
  - mgr.c
  - mgr.d
- - mon.g
  - mon.h
  - mon.i
  - osd.6
  - osd.7
  - osd.8
  - mgr.e
  - mgr.f
- - client.0

openstack:
- volumes: # attached to each instance
    count: 3
    size: 10 # GB
overrides:
  ceph:
    conf:
      global:
        mon election default strategy: 3
      mon:
        client mount timeout: 60
        osd pool default size: 6
        osd_pool_default_min_size: 3
        osd_pool_default_pg_autoscale_mode: off
        debug mon: 30
tasks:
- install:
- ceph:
    pre-mgr-commands:
      - sudo ceph config set mgr mgr_pool false --force
    log-ignorelist:
      - overall HEALTH_
      - \(OSDMAP_FLAGS\)
      - \(OSD_
      - \(PG_
      - \(POOL_
      - \(CACHE_POOL_
      - \(OBJECT_
      - \(SLOW_OPS\)
      - \(REQUEST_SLOW\)
      - \(TOO_FEW_PGS\)
      - slow request
      - \(POOL_APP_NOT_ENABLED\)
      - overall HEALTH_
      - \(MGR_DOWN\)
      - \(MON_DOWN\)
      - \(PG_AVAILABILITY\)
      - \(SLOW_OPS\)
      - \[WRN\]
- workunit:
    clients:
      client.0:
        - mon/mon-stretch-pool.sh
- cephfs_test_runner:
    modules:
      - tasks.test_netsplit_3az_stretch_pool
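This suite lays out three datacenters, each with three mons, three OSDs and two mgrs, switches the monitors to the connectivity election strategy (mon election default strategy: 3), and sizes the pool at 6 replicas with min_size 3, i.e. two replicas per datacenter. The mon/mon-stretch-pool.sh workunit builds the CRUSH hierarchy and the replicated_rule_custom rule before cephfs_test_runner starts the netsplit test. The replica arithmetic, mirroring the class constants of the test added below:

# replica layout implied by the suite above (values mirror the test's class constants)
SIZE = 6                          # osd pool default size
MIN_SIZE = 3                      # osd_pool_default_min_size
PEERING_CRUSH_BUCKET_TARGET = 3   # number of 'datacenter' buckets
BUCKET_MAX = SIZE // PEERING_CRUSH_BUCKET_TARGET  # 6 // 3 = 2 replicas per datacenter
if (BUCKET_MAX * PEERING_CRUSH_BUCKET_TARGET) < SIZE:
    BUCKET_MAX += 1               # round up when size is not a multiple of the target
# with min_size 3 and 2 replicas per DC, a single surviving DC is not enough for I/O;
# at least two of the three DCs must remain reachable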

qa/tasks/ceph_test_case.py

Lines changed: 0 additions & 14 deletions
@@ -361,31 +361,17 @@ def wait_until_true_and_hold(cls, condition, timeout, success_hold_time, check_f
                else:
                    break
            if success_time_elapsed == success_hold_time:
-<<<<<<< HEAD
                log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
-=======
-                log.debug("wait_until_true: success for {0}s".format(success_hold_time))
->>>>>>> b8b8b268706 (qa/suites/rados/singleton/all: init mon-stretch-pool.yaml)
                return
            else:
                if elapsed >= timeout:
                    if check_fn and check_fn() and retry_count < 5:
                        elapsed = 0
                        retry_count += 1
-<<<<<<< HEAD
                        log.debug("wait_until_true_and_hold: making progress, waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
                    else:
                        raise TestTimeoutError("Timed out after {0}s and {1} retries".format(elapsed, retry_count))
                else:
                    log.debug("wait_until_true_and_hold waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
                time.sleep(period)
                elapsed += period
-=======
-                        log.debug("wait_until_true: making progress, waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
-                    else:
-                        raise TestTimeoutError("Timed out after {0}s and {1} retries".format(elapsed, retry_count))
-                else:
-                    log.debug("wait_until_true: waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
-                time.sleep(period)
-                elapsed += period
->>>>>>> b8b8b268706 (qa/suites/rados/singleton/all: init mon-stretch-pool.yaml)
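This hunk only drops merge-conflict markers (<<<<<<< HEAD / ======= / >>>>>>>) left over from an earlier rebase of the mon-stretch-pool work; the surviving wait_until_true_and_hold behaviour is unchanged. The new test below relies on it to require a condition to stay true for success_hold_time seconds, for example:

# excerpt from test_mon_netsplit (added below): PGs must report active+clean
# and hold that state for SUCCESS_HOLD_TIME (10s) within RECOVERY_PERIOD (60s)
self.wait_until_true_and_hold(
    lambda: self._pg_all_active_clean(),
    timeout=self.RECOVERY_PERIOD,
    success_hold_time=self.SUCCESS_HOLD_TIME
)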
Lines changed: 281 additions & 0 deletions
@@ -0,0 +1,281 @@
from tasks.ceph_test_case import CephTestCase
import logging
import json
from tasks.netsplit import disconnect, reconnect, get_ip_and_ports
import itertools
import time
from io import StringIO
log = logging.getLogger(__name__)


class TestNetSplit(CephTestCase):
    MON_LIST = ["mon.a", "mon.d", "mon.g"]
    CLIENT = "client.0"
    CLUSTER = "ceph"
    WRITE_PERIOD = 10
    READ_PERIOD = 10
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10
    PEERING_CRUSH_BUCKET_COUNT = 2
    PEERING_CRUSH_BUCKET_TARGET = 3
    PEERING_CRUSH_BUCKET_BARRIER = 'datacenter'
    POOL = 'pool_stretch'
    CRUSH_RULE = 'replicated_rule_custom'
    SIZE = 6
    MIN_SIZE = 3
    BUCKET_MAX = SIZE // PEERING_CRUSH_BUCKET_TARGET
    if (BUCKET_MAX * PEERING_CRUSH_BUCKET_TARGET) < SIZE:
        BUCKET_MAX += 1

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestNetSplit, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestNetSplit, self).tearDown()

    def _setup_pool(self, size=None, min_size=None, rule=None):
        """
        Create a pool and set its size.
        """
        self.mgr_cluster.mon_manager.create_pool(self.POOL, min_size=min_size)
        if size is not None:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'pool', 'set', self.POOL, 'size', str(size))
        if rule is not None:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'pool', 'set', self.POOL, 'crush_rule', rule)

    def _get_pg_stats(self):
        """
        Dump the cluster and get pg stats
        """
        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
        arg = ['ceph', 'pg', 'dump', '--format=json']
        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
        if proc.exitstatus != 0:
            log.error("pg dump failed")
            raise Exception("pg dump failed")
        out = proc.stdout.getvalue()
        j = json.loads('\n'.join(out.split('\n')[1:]))
        try:
            return j['pg_map']['pg_stats']
        except KeyError:
            return j['pg_stats']

    def _get_active_pg(self, pgs):
        """
        Get the number of active PGs
        """
        num_active = 0
        for pg in pgs:
            if pg['state'].count('active') and not pg['state'].count('stale'):
                num_active += 1
        return num_active

    def _print_not_active_clean_pg(self, pgs):
        """
        Print the PGs that are not active+clean.
        """
        for pg in pgs:
            if not (pg['state'].count('active') and
                    pg['state'].count('clean') and
                    not pg['state'].count('stale')):
                log.debug(
                    "PG %s is not active+clean, but %s",
                    pg['pgid'], pg['state']
                )

    def _print_not_active_pg(self, pgs):
        """
        Print the PGs that are not active.
        """
        for pg in pgs:
            if not (pg['state'].count('active') and
                    not pg['state'].count('stale')):
                log.debug(
                    "PG %s is not active, but %s",
                    pg['pgid'], pg['state']
                )

    def _pg_all_active(self):
        """
        Check if all pgs are active.
        """
        pgs = self._get_pg_stats()
        result = self._get_active_pg(pgs) == len(pgs)
        if result:
            log.debug("All PGs are active")
        else:
            log.debug("Not all PGs are active")
            self._print_not_active_pg(pgs)
        return result

    def _get_active_clean_pg(self, pgs):
        """
        Get the number of active+clean PGs
        """
        num_active_clean = 0
        for pg in pgs:
            if (pg['state'].count('active') and
                    pg['state'].count('clean') and
                    not pg['state'].count('stale') and
                    not pg['state'].count('laggy')):
                num_active_clean += 1
        return num_active_clean

    def _pg_all_active_clean(self):
        """
        Check if all pgs are active and clean.
        """
        pgs = self._get_pg_stats()
        result = self._get_active_clean_pg(pgs) == len(pgs)
        if result:
            log.debug("All PGs are active+clean")
        else:
            log.debug("Not all PGs are active+clean")
            self._print_not_active_clean_pg(pgs)
        return result

    def _disconnect_mons(self, config):
        """
        Disconnect the mons in the <config> list.
        """
        disconnect(self.ctx, config)

    def _reconnect_mons(self, config):
        """
        Reconnect the mons in the <config> list.
        """
        reconnect(self.ctx, config)

    def _reply_to_mon_command(self):
        """
        Check if the cluster is accessible.
        """
        try:
            self.mgr_cluster.mon_manager.raw_cluster_cmd('status')
            return True
        except Exception:
            return False

    def _check_if_disconnect(self, config):
        """
        Check if the mons in the <config> list are disconnected.
        """
        assert config[0].startswith('mon.')
        assert config[1].startswith('mon.')
        log.info("Checking if the {} and {} are disconnected".format(
            config[0], config[1]))
        (ip1, _) = get_ip_and_ports(self.ctx, config[0])
        (ip2, _) = get_ip_and_ports(self.ctx, config[1])
        (host1,) = self.ctx.cluster.only(config[0]).remotes.keys()
        (host2,) = self.ctx.cluster.only(config[1]).remotes.keys()
        assert host1 is not None
        assert host2 is not None
        # if the mons are disconnected, the ping should fail (exitstatus = 1)
        try:
            if (host1.run(args=["ping", "-c", "1", ip2]).exitstatus == 0 or
                    host2.run(args=["ping", "-c", "1", ip1]).exitstatus == 0):
                return False
        except Exception:
            return True

    def _check_if_connect(self, config):
        """
        Check if the mons in the <config> list are connected.
        """
        assert config[0].startswith('mon.')
        assert config[1].startswith('mon.')
        log.info("Checking if {} and {} are connected".format(
            config[0], config[1]))
        (ip1, _) = get_ip_and_ports(self.ctx, config[0])
        (ip2, _) = get_ip_and_ports(self.ctx, config[1])
        (host1,) = self.ctx.cluster.only(config[0]).remotes.keys()
        (host2,) = self.ctx.cluster.only(config[1]).remotes.keys()
        assert host1 is not None
        assert host2 is not None
        # if the mons are connected, the ping should succeed (exitstatus = 0)
        try:
            if (host1.run(args=["ping", "-c", "1", ip2]).exitstatus == 0 and
                    host2.run(args=["ping", "-c", "1", ip1]).exitstatus == 0):
                return True
        except Exception:
            return False

    def test_mon_netsplit(self):
        """
        Test the mon netsplit scenario, if cluster is actually accessible.
        """
        log.info("Running test_mon_netsplit")
        self._setup_pool(
            self.SIZE,
            min_size=self.MIN_SIZE,
            rule=self.CRUSH_RULE
        )
        # set the pool to stretch
        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'pool', 'stretch', 'set',
            self.POOL, str(self.PEERING_CRUSH_BUCKET_COUNT),
            str(self.PEERING_CRUSH_BUCKET_TARGET),
            self.PEERING_CRUSH_BUCKET_BARRIER,
            self.CRUSH_RULE, str(self.SIZE), str(self.MIN_SIZE))
        # check if all the mons are connected
        self.wait_until_true(
            lambda: all(
                [
                    self._check_if_connect([mon1, mon2])
                    for mon1, mon2 in itertools.combinations(self.MON_LIST, 2)
                ]
            ),
            timeout=self.RECOVERY_PERIOD,
        )

        # wait for all PGs to become active
        self.wait_until_true_and_hold(
            lambda: self._pg_all_active(),
            timeout=self.RECOVERY_PERIOD,
            success_hold_time=self.SUCCESS_HOLD_TIME
        )

        # Scenario 1: disconnect Site 1 and Site 2
        # Site 3 is still connected to both Site 1 and Site 2
        config = ["mon.a", "mon.d"]
        # disconnect the mons
        self._disconnect_mons(config)
        # wait for the mons to be disconnected
        time.sleep(self.RECOVERY_PERIOD)
        # check if the mons are disconnected
        self.wait_until_true(
            lambda: self._check_if_disconnect(config),
            timeout=self.RECOVERY_PERIOD,
        )
        # check the cluster is accessible
        self.wait_until_true_and_hold(
            lambda: self._reply_to_mon_command(),
            timeout=self.RECOVERY_PERIOD * 5,
            success_hold_time=self.SUCCESS_HOLD_TIME
        )
        # reconnect the mons
        self._reconnect_mons(config)
        # wait for the mons to be reconnected
        time.sleep(self.RECOVERY_PERIOD)
        # check if the mons are reconnected
        self.wait_until_true(
            lambda: self._check_if_connect(config),
            timeout=self.RECOVERY_PERIOD,
        )
        # wait for the PGs to recover
        time.sleep(self.RECOVERY_PERIOD)
        # check if all PGs are active+clean
        self.wait_until_true_and_hold(
            lambda: self._pg_all_active_clean(),
            timeout=self.RECOVERY_PERIOD,
            success_hold_time=self.SUCCESS_HOLD_TIME
        )
        log.info("test_mon_netsplit passed!")
