@@ -91,6 +91,38 @@ def setup_ec_pools(self, n, metadata=True, overwrites=True):
9191 if overwrites :
9292 self .run_ceph_cmd ('osd' , 'pool' , 'set' , n + "-data" , 'allow_ec_overwrites' , 'true' )
9393
def _get_unhealthy_mds_id(self, health_report, health_warn):
    '''
    Return ID of the MDS for which the health warning "health_warn" has
    been generated.

    "health_report" is expected to be a dict shaped like
    health_report['checks'][health_warn]['detail'][0]['message']; only
    the first "detail" entry of the given health check is inspected.
    A KeyError/IndexError propagates if that path does not exist.
    '''
    # variable "msg" should hold a string something like this -
    # 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
    # num_segments: 86'
    msg = health_report['checks'][health_warn]['detail'][0]['message']

    # the daemon name is everything before the first "(", e.g. "mds.b".
    daemon_name = msg.partition('(')[0]

    # strip only the leading "mds." so that an ID which itself contains
    # "mds." (e.g. "ceph-mds.host1") is returned intact.
    prefix = 'mds.'
    if daemon_name.startswith(prefix):
        return daemon_name[len(prefix):]
    return daemon_name
106+
def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
                          tries=10):
    '''
    Poll "ceph health detail" until the health warning "health_warn"
    shows up under "checks" in its JSON output, then return.

    Before every poll "cache status" is requested from the MDS
    "active_mds_id". Pacing is delegated to safe_while() using "sleep"
    and "tries"; "errmsg" is passed to it as the description of the
    action being waited for.
    '''
    errmsg = (f'Expected health warning "{health_warn}" to eventually '
              'show up in output of command "ceph health detail". Tried '
              f'{tries} times with interval of {sleep} seconds but the '
              'health warning didn\'t turn up.')

    with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed:
        while proceed():
            # NOTE(review): the output of this command is discarded;
            # presumably it is only issued to poke the MDS - confirm.
            self.get_ceph_cmd_stdout(
                f'tell mds.{active_mds_id} cache status')

            health_report = json.loads(self.get_ceph_cmd_stdout(
                'health detail --format json'))

            if health_warn in health_report['checks']:
                return
124+
125+
94126@classhook ('_add_valid_tell' )
95127class TestValidTell (TestAdminCommands ):
96128 @classmethod
@@ -2154,3 +2186,166 @@ def test_fs_authorize(self):
21542186 args = (f'fs authorize { self .fs .name } { self .CLIENT_NAME } / '
21552187 f'{ wrong_perm } ' ), retval = self .EXPECTED_ERRNO ,
21562188 errmsgs = self .EXPECTED_ERRMSG )
2189+
2190+
class TestFSFail(TestAdminCommands):
    '''
    Checks that "ceph fs fail" is rejected while an MDS health warning is
    present, unless the confirmation flag is passed.
    '''

    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 1

    def _shrink_mds_cache_limit(self):
        '''
        Apply the two MDS cache settings that the MDS_CACHE_OVERSIZED
        tests set before opening files.
        '''
        for opt, val in (('mds_cache_memory_limit', '1K'),
                         ('mds_health_cache_threshold', '1.00000')):
            self.config_set('mds', opt, val)

    def _open_files_and_wait(self, warn_code, mds_id):
        '''
        Open 400 files in the background on the first mount and wait for
        health warning "warn_code" to turn up.
        '''
        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(warn_code, mds_id)

    def _fs_fail_without_then_with_flag(self, expected_err):
        '''
        Run "fs fail" without the confirmation flag, expecting exit
        status 1 and "expected_err" in the error output, then run it
        again with the flag.
        '''
        cmd = f'fs fail {self.fs.name}'
        self.negtest_ceph_cmd(args=cmd, retval=1, errmsgs=expected_err)
        self.run_ceph_cmd(f'{cmd} --yes-i-really-mean-it')

    def test_with_health_warn_oversize_cache(self):
        '''
        With health warning MDS_CACHE_OVERSIZED present for an MDS,
        "ceph fs fail" must fail without the confirmation flag and pass
        with it.
        '''
        self._shrink_mds_cache_limit()
        mds_id = self.fs.get_active_names()[0]
        self._open_files_and_wait('MDS_CACHE_OVERSIZED', mds_id)

        # actual testing begins now.
        self._fs_fail_without_then_with_flag('mds_cache_oversized')

    def test_with_health_warn_trim(self):
        '''
        With health warning MDS_TRIM present for an MDS, "ceph fs fail"
        must fail without the confirmation flag and pass with it.
        '''
        trim_opts = (
            # for generating health warning MDS_TRIM
            ('mds_debug_subtrees', 'true'),
            # these two slow the trimming down a lot, so that MDS_TRIM
            # stays around for longer.
            ('mds_log_trim_decay_rate', '60'),
            ('mds_log_trim_threshold', '1'),
        )
        for opt, val in trim_opts:
            self.config_set('mds', opt, val)
        mds_id = self.fs.get_active_names()[0]
        self._open_files_and_wait('MDS_TRIM', mds_id)

        # actual testing begins now.
        self._fs_fail_without_then_with_flag('mds_trim')

    def test_with_health_warn_with_2_active_MDSs(self):
        '''
        With 2 active MDSs, one of which has health warning
        MDS_CACHE_OVERSIZED, "ceph fs fail" must fail without the
        confirmation flag and pass with it.
        '''
        self.fs.set_max_mds(2)
        self._shrink_mds_cache_limit()
        self.fs.wait_for_daemons()
        # unpacking into two names also insists on exactly 2 active MDSs;
        # only the first ID is needed afterwards.
        first_mds_id, _ = self.fs.get_active_names()
        self._open_files_and_wait('MDS_CACHE_OVERSIZED', first_mds_id)

        # actual testing begins now.
        self._fs_fail_without_then_with_flag('mds_cache_oversized')
2263+
2264+
class TestMDSFail(TestAdminCommands):
    '''
    Checks that "ceph mds fail" is rejected while an MDS health warning is
    present, unless the confirmation flag is passed.
    '''

    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 1

    def test_with_health_warn_oversize_cache(self):
        '''
        Test that, when health warning MDS_CACHE_OVERSIZED is present for
        an MDS, command "ceph mds fail" fails without confirmation flag
        and passes when confirmation flag is passed.
        '''
        health_warn = 'MDS_CACHE_OVERSIZED'
        self.config_set('mds', 'mds_cache_memory_limit', '1K')
        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
        active_mds_id = self.fs.get_active_names()[0]

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(health_warn, active_mds_id)

        # actual testing begins now.
        errmsg = 'mds_cache_oversized'
        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
                              retval=1, errmsgs=errmsg)
        # the confirmed command has to name the same MDS that was just
        # rejected; "mds fail" takes an MDS, not the file system name.
        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')

    def test_with_health_warn_trim(self):
        '''
        Test that, when health warning MDS_TRIM is present for an MDS,
        command "ceph mds fail" fails without confirmation flag and passes
        when confirmation flag is passed.
        '''
        health_warn = 'MDS_TRIM'
        # for generating health warning MDS_TRIM
        self.config_set('mds', 'mds_debug_subtrees', 'true')
        # this will really really slow the trimming, so that MDS_TRIM stays
        # for longer.
        self.config_set('mds', 'mds_log_trim_decay_rate', '60')
        self.config_set('mds', 'mds_log_trim_threshold', '1')
        active_mds_id = self.fs.get_active_names()[0]

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(health_warn, active_mds_id)

        # actual testing begins now...
        errmsg = 'mds_trim'
        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
                              retval=1, errmsgs=errmsg)
        # target the same MDS as above, not the file system name.
        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')

    def test_with_health_warn_with_2_active_MDSs(self):
        '''
        Test that, when a CephFS has 2 active MDSs and one of them has
        health warning MDS_CACHE_OVERSIZED, running "ceph mds fail" fails
        for both MDSs without confirmation flag and passes for both when
        confirmation flag is passed.
        '''
        health_warn = 'MDS_CACHE_OVERSIZED'
        self.fs.set_max_mds(2)
        self.config_set('mds', 'mds_cache_memory_limit', '1K')
        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
        self.fs.wait_for_daemons()
        mds1_id, mds2_id = self.fs.get_active_names()

        self.mount_a.open_n_background('.', 400)
        self.wait_till_health_warn(health_warn, mds1_id)

        health_report = json.loads(self.get_ceph_cmd_stdout('health detail '
                                                            '--format json'))
        # MDS ID for which health warning has been generated.
        hw_mds_id = self._get_unhealthy_mds_id(health_report, health_warn)
        if mds1_id == hw_mds_id:
            non_hw_mds_id = mds2_id
        elif mds2_id == hw_mds_id:
            non_hw_mds_id = mds1_id
        else:
            raise RuntimeError('There are only 2 MDSs right now but '
                               'apparently health warning was raised for '
                               'an MDS other than these two. This is '
                               'definitely an error.')

        # actual testing begins now...
        errmsg = 'mds_cache_oversized'
        self.negtest_ceph_cmd(args=f'mds fail {non_hw_mds_id}', retval=1,
                              errmsgs=errmsg)
        self.negtest_ceph_cmd(args=f'mds fail {hw_mds_id}', retval=1,
                              errmsgs=errmsg)
        # these must be f-strings; otherwise the literal text "mds1_id" /
        # "mds2_id" is sent as the MDS ID.
        self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
        self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')
0 commit comments