Commit abfa5a3

qa/suites/rados: white list + add MON_NETSPLIT tests
Some existing netsplit tests in 3az + stretch mode needed whitelisting and a check for netsplit details.

Make qa/tasks/mon_thrash.py set mon_netsplit_grace_period to 30 seconds when we try to freeze monitors instead of killing them.

Make qa/tasks/stretch_mode_disable_enable.py set mon_netsplit_grace_period to 30 seconds during the `teardown` phase only.

Fixes: https://tracker.ceph.com/issues/71344
Signed-off-by: Kamoltat Sirivadhna <[email protected]>
1 parent df1105c commit abfa5a3
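
Both task changes raise the grace period through the same manager call; the pattern looks roughly like this (a minimal sketch — `manager` stands in for the CephManager handle each QA task already holds, and only the option name and the 30-second value come from this commit):

    def relax_netsplit_grace(manager, seconds=30):
        # Give monitors a longer window to re-establish connections after a
        # freeze or mass restart before a MON_NETSPLIT warning is raised.
        manager.raw_cluster_cmd(
            'config', 'set', 'mon', 'mon_netsplit_grace_period', str(seconds))

From a shell, the equivalent is `ceph config set mon mon_netsplit_grace_period 30`.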

9 files changed, +167 -6 lines changed

qa/suites/netsplit/ceph.yaml

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ overrides:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 tasks:
 - install:
 - ceph:

qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ tasks:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 - workunit:
     clients:
       client.0:

qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml

Lines changed: 0 additions & 2 deletions
@@ -46,8 +46,6 @@ tasks:
       - \(OSD_DATACENTER_DOWN\)
       - \(OSD_DOWN\)
       - \(OSD_HOST_DOWN\)
-
-
 - workunit:
     clients:
       client.0:

qa/suites/rados/singleton/msgr-failures/few.yaml

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)

qa/suites/rados/singleton/msgr-failures/many.yaml

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)

qa/tasks/mon_thrash.py

Lines changed: 10 additions & 1 deletion
@@ -64,7 +64,7 @@ class MonitorThrasher(Thrasher):
     task to run with as many as just one single monitor.
     (default: True)
     freeze_mon_probability: how often to freeze the mon instead of killing it,
-                            in % (default: 0)
+                            in % (default: 10)
     freeze_mon_duration: how many seconds to freeze the mon (default: 15)
     scrub Scrub after each iteration (default: True)
     check_mds_failover Check if mds failover happened (default: False)
@@ -128,6 +128,15 @@ def __init__(self, ctx, manager, config, name, logger):
         self.scrub = self.config.get('scrub', True)

         self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        # In some cases where many monitors froze at once and revived
+        # after a long time might cause the connection to take more time to establish.
+        # Therefore, we increase the netsplit grace period to 30 seconds.
+        # This is to avoid false positives in the netsplit test, while still keeping
+        # the integrity of the test.
+        if self.freeze_mon_probability > 0:
+            self.manager.raw_cluster_cmd(
+                'config', 'set', 'mon', 'mon_netsplit_grace_period', '30')
+
         self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

         assert self.max_killable() > 0, \
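
The docstring fix above reflects the actual default (10%). As a rough illustration of what a percentage-style `freeze_mon_probability` means per thrashing round, a simplified sketch (not the exact logic in mon_thrash.py) would be:

    import random

    def should_freeze(freeze_mon_probability=10):
        # Freeze (rather than kill) the chosen mon in roughly
        # freeze_mon_probability percent of thrashing rounds.
        return random.uniform(0, 100) < freeze_mon_probability

With the default of 10, about one round in ten freezes a monitor instead of killing it, which is why the grace-period bump above is applied whenever the probability is non-zero.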

qa/tasks/stretch_mode_disable_enable.py

Lines changed: 21 additions & 3 deletions
@@ -122,13 +122,21 @@ def _bring_back_mon(self, mon):
         """
         Bring back the mon.
         """
+        log.debug("_bring_back_mon %s", mon)
+        # If the mon is already up, do nothing
+        quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+        if mon in quorum_names:
+            log.debug("mon.%s is already up", mon)
+            return
+        # If the mon is not up, try to bring it back
+        log.debug("Bringing back mon.%s", mon)
         try:
             self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
         except Exception:
             log.error("Failed to bring back mon.{}".format(str(mon)))
             pass

-    def _get_host(self, osd):
+    def _get_osd_host(self, osd):
         """
         Get the host of the osd.
         """
@@ -142,7 +150,7 @@ def _move_osd_back_to_host(self, osd):
         """
         Move the osd back to the host.
         """
-        host = self._get_host(osd)
+        host = self._get_osd_host(osd)
         assert host is not None, "The host of osd {} is not found.".format(osd)
         log.debug("Moving osd.%d back to %s", osd, host)
         self.mgr_cluster.mon_manager.raw_cluster_cmd(
@@ -155,6 +163,7 @@ def tearDown(self):
         Clean up the cluster after the test.
         """
         # Remove the pool
+        log.debug("Tear down the test")
         if self.POOL in self.mgr_cluster.mon_manager.pools:
             self.mgr_cluster.mon_manager.remove_pool(self.POOL)

@@ -168,7 +177,14 @@ def tearDown(self):
             if osd['up'] == 0:
                 self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
                 self._move_osd_back_to_host(osd['osd'])
-
+
+        # Set the mon_netsplit_grace_period to 30 seconds.
+        # Sometimes when many mons restart at the same time
+        # it can take longer for the monitors to establish
+        # a connection.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'
+        )
         # Bring back all the mons
         mons = self._get_all_mons_from_all_dc()
         for mon in mons:
@@ -359,6 +375,7 @@ def _stretch_mode_enabled_correctly(self):
             self.TIEBREAKER_MON_NAME,
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is enabled correctly.")

     def _stretch_mode_disabled_correctly(self):
         """
@@ -445,6 +462,7 @@ def _stretch_mode_disabled_correctly(self):
             "",
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is disabled correctly.")

     def test_disable_stretch_mode(self):
         """

qa/tasks/test_netsplit.py

Lines changed: 70 additions & 0 deletions
@@ -193,6 +193,50 @@ def _check_if_connect(self, config):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_netsplit_dc1_dc2(self):
         """
         Test Netsplit between dc1 and dc2
@@ -220,6 +264,13 @@ def test_netsplit_dc1_dc2(self):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is raised we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -263,6 +314,12 @@ def test_netsplit_dc1_dc2(self):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
@@ -305,6 +362,13 @@ def test_netsplit_arbiter_dc1_and_dc1_dc2(self):
             lambda: self._check_if_disconnect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is raised we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -352,6 +416,12 @@ def test_netsplit_arbiter_dc1_and_dc1_dc2(self):
             lambda: self._check_if_connect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is not raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
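
For reference, the two helpers above only read a small slice of the `ceph health detail --format=json` output. An illustrative payload (hand-written to match the fields the helpers access, not captured from a live cluster) that would satisfy both checks:

    import json

    example = json.loads("""
    {
      "checks": {
        "MON_NETSPLIT": {
          "detail": [
            {"message": "Netsplit detected between dc1 and dc2"}
          ]
        }
      }
    }
    """)
    checks = example.get("checks", {})
    assert "MON_NETSPLIT" in checks
    assert any("Netsplit detected between dc1 and dc2" in d.get("message", "")
               for d in checks["MON_NETSPLIT"].get("detail", []))

Real output carries more keys (severity, summary, and so on); the helpers deliberately ignore everything except `checks`, the `MON_NETSPLIT` entry, and its `detail` messages.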

qa/tasks/test_netsplit_3az_stretch_pool.py

Lines changed: 62 additions & 0 deletions
@@ -209,6 +209,51 @@ def _check_if_connect(self, config):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        log.debug("checks: {}".format(checks))
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_mon_netsplit(self):
         """
         Test the mon netsplit scenario, if cluster is actually accessible.
@@ -247,6 +292,7 @@ def test_mon_netsplit(self):
         # Scenario 1: disconnect Site 1 and Site 2
         # Site 3 is still connected to both Site 1 and Site 2
         config = ["mon.a", "mon.d"]
+        location = ["dc1", "dc2"]
         # disconnect the mons
         self._disconnect_mons(config)
         # wait for the mons to be disconnected
@@ -256,6 +302,16 @@ def test_mon_netsplit(self):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if location level MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: self._check_mon_netsplit_warning_raised(
+                "Netsplit detected between {} and {}".format(
+                    location[0], location[1]
+                ),
+            ),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -271,6 +327,12 @@ def test_mon_netsplit(self):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is cleared
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # wait for the PGs to recover
         time.sleep(self.RECOVERY_PERIOD)
         # check if all PGs are active+clean
