Commit 214d614
qa/cephfs: add tests failing MDS and FS when MDS is unhealthy
Add tests to verify that the confirmation flag is mandatory for running the commands "ceph mds fail" and "ceph fs fail" when the MDS has one of these two health warnings: MDS_CACHE_OVERSIZED or MDS_TRIM. Also, add MDS_CACHE_OVERSIZED and MDS_TRIM to the ignorelist for test_admin.py so that QA jobs know this is an expected failure.

Signed-off-by: Rishabh Dave <ridave@redhat.com>
1 parent a1af1bf commit 214d614
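For context, here is a minimal sketch of the behaviour these tests assert, assuming a running test cluster with the "ceph" CLI on PATH; the fs_fail() helper and the filesystem name "cephfs" below are hypothetical, for illustration only:

    # A minimal sketch, not part of this commit: exercise "ceph fs fail"
    # with and without the confirmation flag while an MDS health warning
    # (MDS_TRIM or MDS_CACHE_OVERSIZED) is active.
    import subprocess

    def fs_fail(fs_name, confirm=False):
        '''Run "ceph fs fail", optionally with the confirmation flag.'''
        cmd = ['ceph', 'fs', 'fail', fs_name]
        if confirm:
            cmd.append('--yes-i-really-mean-it')
        return subprocess.run(cmd, capture_output=True, text=True)

    # While the health warning is raised for an MDS of the filesystem,
    # the unconfirmed command should be rejected ...
    assert fs_fail('cephfs').returncode != 0
    # ... and the confirmed command should succeed.
    assert fs_fail('cephfs', confirm=True).returncode == 0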

File tree

2 files changed: +197 lines, -0 lines

qa/suites/fs/functional/tasks/admin.yaml

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ overrides:
         lockdep: true
     log-ignorelist:
       - missing required features
+      - \(MDS_CACHE_OVERSIZED\)
+      - \(MDS_TRIM\)
 tasks:
 - cephfs_test_runner:
     fail_on_skip: false
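The two new ignorelist entries are regular expressions; the parentheses are escaped because the warning names appear inside literal parentheses in cluster log lines. A quick standalone check (the log line below is fabricated, in the usual health-check format):

    # Standalone sketch: the new ignorelist patterns match the warning
    # names as they appear in cluster log lines. The log line is made up.
    import re

    patterns = [r'\(MDS_CACHE_OVERSIZED\)', r'\(MDS_TRIM\)']
    log_line = ('cluster [WRN] Health check failed: '
                '1 MDSs behind on trimming (MDS_TRIM)')
    assert any(re.search(p, log_line) for p in patterns)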

qa/tasks/cephfs/test_admin.py

Lines changed: 195 additions & 0 deletions
@@ -91,6 +91,38 @@ def setup_ec_pools(self, n, metadata=True, overwrites=True):
         if overwrites:
             self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
 
+    def _get_unhealthy_mds_id(self, health_report, health_warn):
+        '''
+        Return the ID of the MDS for which the health warning in
+        "health_warn" was generated.
+        '''
+        # the variable "msg" should hold a string like this -
+        # 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
+        # num_segments: 865'
+        msg = health_report['checks'][health_warn]['detail'][0]['message']
+        mds_id = msg.split('(')[0]
+        mds_id = mds_id.replace('mds.', '')
+        return mds_id
+
+    def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
+                              tries=10):
+        errmsg = (f'Expected health warning "{health_warn}" to eventually '
+                  'show up in output of command "ceph health detail". Tried '
+                  f'{tries} times with interval of {sleep} seconds but the '
+                  'health warning didn\'t turn up.')
+
+        with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed:
+            while proceed():
+                self.get_ceph_cmd_stdout(
+                    f'tell mds.{active_mds_id} cache status')
+
+                health_report = json.loads(self.get_ceph_cmd_stdout(
+                    'health detail --format json'))
+
+                if health_warn in health_report['checks']:
+                    return
+
+
 @classhook('_add_valid_tell')
 class TestValidTell(TestAdminCommands):
     @classmethod
@@ -2154,3 +2186,166 @@ def test_fs_authorize(self):
             args=(f'fs authorize {self.fs.name} {self.CLIENT_NAME} / '
                   f'{wrong_perm}'), retval=self.EXPECTED_ERRNO,
             errmsgs=self.EXPECTED_ERRMSG)
+
+
+class TestFSFail(TestAdminCommands):
+
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+
+    def test_with_health_warn_oversize_cache(self):
+        '''
+        Test that, when the health warning MDS_CACHE_OVERSIZED is present
+        for an MDS, the command "ceph fs fail" fails without the
+        confirmation flag and passes when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
+
+    def test_with_health_warn_trim(self):
+        '''
+        Test that, when the health warning MDS_TRIM is present for an MDS,
+        the command "ceph fs fail" fails without the confirmation flag and
+        passes when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_TRIM'
+        # for generating health warning MDS_TRIM
+        self.config_set('mds', 'mds_debug_subtrees', 'true')
+        # this will slow down the trimming considerably, so that MDS_TRIM
+        # stays around for longer.
+        self.config_set('mds', 'mds_log_trim_decay_rate', '60')
+        self.config_set('mds', 'mds_log_trim_threshold', '1')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_trim'
+        self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
+
+    def test_with_health_warn_with_2_active_MDSs(self):
+        '''
+        Test that, when a CephFS has 2 active MDSs and one of them has
+        either the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running
+        "ceph fs fail" fails without the confirmation flag and passes when
+        the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.fs.set_max_mds(2)
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        self.fs.wait_for_daemons()
+        mds1_id, mds2_id = self.fs.get_active_names()
+
+        self.mount_a.open_n_background('.', 400)
+        # wait till the health warning shows up for one of the two MDSs.
+        self.wait_till_health_warn(health_warn, mds1_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
+
+
+class TestMDSFail(TestAdminCommands):
+
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+
+    def test_with_health_warn_oversize_cache(self):
+        '''
+        Test that, when the health warning MDS_CACHE_OVERSIZED is present
+        for an MDS, the command "ceph mds fail" fails without the
+        confirmation flag and passes when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
+
+    def test_with_health_warn_trim(self):
+        '''
+        Test that, when the health warning MDS_TRIM is present for an MDS,
+        the command "ceph mds fail" fails without the confirmation flag and
+        passes when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_TRIM'
+        # for generating health warning MDS_TRIM
+        self.config_set('mds', 'mds_debug_subtrees', 'true')
+        # this will slow down the trimming considerably, so that MDS_TRIM
+        # stays around for longer.
+        self.config_set('mds', 'mds_log_trim_decay_rate', '60')
+        self.config_set('mds', 'mds_log_trim_threshold', '1')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now...
+        errmsg = 'mds_trim'
+        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
+
+    def test_with_health_warn_with_2_active_MDSs(self):
+        '''
+        Test that, when a CephFS has 2 active MDSs and one of them has
+        either the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running
+        "ceph mds fail" fails for both MDSs without the confirmation flag
+        and passes for both when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.fs.set_max_mds(2)
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        self.fs.wait_for_daemons()
+        mds1_id, mds2_id = self.fs.get_active_names()
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, mds1_id)
+
+        health_report = json.loads(self.get_ceph_cmd_stdout('health detail '
+                                                            '--format json'))
+        # MDS ID for which the health warning has been generated.
+        hw_mds_id = self._get_unhealthy_mds_id(health_report, health_warn)
+        if mds1_id == hw_mds_id:
+            non_hw_mds_id = mds2_id
+        elif mds2_id == hw_mds_id:
+            non_hw_mds_id = mds1_id
+        else:
+            raise RuntimeError('There are only 2 MDSs right now but '
+                               'apparently the health warning was raised '
+                               'for an MDS other than these two. This is '
+                               'definitely an error.')
+
+        # actual testing begins now...
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'mds fail {non_hw_mds_id}', retval=1,
+                              errmsgs=errmsg)
+        self.negtest_ceph_cmd(args=f'mds fail {hw_mds_id}', retval=1,
+                              errmsgs=errmsg)
+        self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
+        self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')
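As a standalone illustration of the MDS ID parsing done by _get_unhealthy_mds_id() above, with a fabricated health report in the shape returned by "ceph health detail --format json":

    # Minimal sketch of the parsing in _get_unhealthy_mds_id(); the
    # health_report dict is fabricated sample data.
    health_report = {
        'checks': {
            'MDS_TRIM': {
                'detail': [{'message': 'mds.b(mds.0): Behind on trimming '
                                       '(865/10) max_segments: 10, '
                                       'num_segments: 865'}],
            },
        },
    }

    msg = health_report['checks']['MDS_TRIM']['detail'][0]['message']
    mds_id = msg.split('(')[0].replace('mds.', '')
    print(mds_id)  # prints: b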
