Skip to content

Commit 2784e22

Browse files
qa/cephfs: test that counters are not printed for SR MDS
- Add tests to verify that inode and stray counters are not replayed/included in the health warnings printed for the standby-replay MDS. - Add the "MDS_CACHE_OVERSIZED" health warning to the ignorelist in failover.yaml. - Add a helper method to qa.tasks.cephfs.filesystem.Filesystem to get the names of the standby-replay MDS daemons. Signed-off-by: Rishabh Dave <[email protected]>
1 parent 03dcdc1 commit 2784e22

File tree

3 files changed

+74
-1
lines changed

3 files changed

+74
-1
lines changed

qa/suites/fs/multifs/tasks/failover.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ overrides:
77
- \(MDS_UP_LESS_THAN_MAX\)
88
- \(MDS_DAMAGE\)
99
- \(FS_DEGRADED\)
10+
- \(MDS_CACHE_OVERSIZED\)
1011
ceph-fuse:
1112
disabled: true
1213
tasks:

qa/tasks/cephfs/filesystem.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,9 @@ def get_active_names(self, status=None):
11431143
"""
11441144
return self.get_daemon_names("up:active", status=status)
11451145

1146+
def get_standby_replay_names(self, status=None):
    """
    Return the names of the MDS daemons that are in the
    'up:standby-replay' state.

    This is the standby-replay counterpart of get_active_names(); it
    simply delegates to get_daemon_names() with the matching state string.

    :param status: optional status object forwarded unchanged to
                   get_daemon_names(); None lets that helper decide what
                   to use.
    :return: whatever get_daemon_names() returns for the
             'up:standby-replay' state.
    """
    return self.get_daemon_names('up:standby-replay', status=status)
1148+
11461149
def get_all_mds_rank(self, status=None):
11471150
mdsmap = self.get_mds_map(status)
11481151
result = []

qa/tasks/cephfs/test_failover.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
import logging
44
import operator
55
from random import randint, choice
6+
from json import loads as json_loads
67

78
from tasks.cephfs.cephfs_test_case import CephFSTestCase
89
from teuthology.exceptions import CommandFailedError
10+
from teuthology.contextutil import safe_while
911
from tasks.cephfs.fuse_mount import FuseMount
1012

1113
log = logging.getLogger(__name__)
@@ -520,7 +522,8 @@ def test_connect_bootstrapping(self):
520522

521523

522524
class TestStandbyReplay(CephFSTestCase):
523-
CLIENTS_REQUIRED = 0
525+
526+
CLIENTS_REQUIRED = 1
524527
MDSS_REQUIRED = 4
525528

526529
def _confirm_no_replay(self):
@@ -706,6 +709,72 @@ def test_rank_stopped(self):
706709
status = self._confirm_single_replay()
707710
self.assertTrue(standby_count, len(list(status.get_standbys())))
708711

712+
def test_health_warn_oversize_cache_has_no_counters(self):
    '''
    Test that when MDS cache size crosses the limit, the health warning
    printed for the standby-replay MDS doesn't include inode and stray
    counters.

    Tests: https://tracker.ceph.com/issues/63514
    '''
    # reduce the MDS cache limit; the default limit is too high and
    # reaching it would unnecessarily consume too many resources and too
    # much time.
    self.config_set('mds', 'mds_cache_memory_limit', '1K')
    # the health warning for crossing the MDS cache size limit isn't raised
    # until a threshold is crossed. the default threshold is too high, so
    # lower it for the same reason as above.
    self.config_set('mds', 'mds_health_cache_threshold', '1.000001')
    # keep a single active MDS so that the health report holds exactly two
    # warnings -- one for the active MDS and one for its standby-replay
    # MDS -- which is what the sanity check further below expects.
    self.fs.set_max_mds(1)
    self.fs.set_allow_standby_replay(True)
    self._confirm_single_replay()
    self.fs.wait_for_daemons()
    # The call above (to self.fs.wait_for_daemons()) should ensure we have
    # only 1 active MDS on cluster
    active_mds_id = self.fs.get_active_names()[0]
    sr_mds_id = self.fs.get_standby_replay_names()[0]

    # this should generate more than enough MDS cache to trigger health
    # warning MDS_CACHE_OVERSIZED.
    self.mount_a.open_n_background(".", 400)

    # actual test begins now...
    # NOTE(review): safe_while is presumably what fails the test when the
    # warning never shows up within the allotted tries -- confirm against
    # teuthology.contextutil.
    with safe_while(sleep=3, tries=10) as proceed:
        while proceed():
            # logging cache generated so far for the sake of easy
            # debugging in future.
            self.get_ceph_cmd_stdout(f'tell mds.{active_mds_id} cache '
                                     'status')

            health_report = json_loads(self.get_ceph_cmd_stdout(
                'health detail --format json'))
            checks = health_report['checks']
            if 'MDS_CACHE_OVERSIZED' not in checks:
                log.debug('warning hasn\'t appeared in health report yet. '
                          'trying again after some sleep...')
                continue

            cache_warn = checks['MDS_CACHE_OVERSIZED']['detail']
            log.debug('cache_warn - %s', cache_warn)
            # sanity check: "ceph health detail" output should've 2
            # warnings -- one for active MDS and other for standby-replay
            # MDS.
            if len(cache_warn) != 2:
                log.debug('expected 2 warnings but instead found '
                          '%d warnings; trying again after some sleep...',
                          len(cache_warn))
                continue

            sr_msgs = [cw['message'] for cw in cache_warn
                       if f'mds.{sr_mds_id}' in cw['message']]
            # without this check the test would pass vacuously when none
            # of the warnings belongs to the standby-replay MDS.
            self.assertTrue(
                sr_msgs,
                f'no MDS_CACHE_OVERSIZED warning mentions mds.{sr_mds_id}: '
                f'{cache_warn}')
            for msg in sr_msgs:
                self.assertNotIn('inodes in use by clients', msg)
                self.assertNotIn('stray files', msg)
            return
777+
709778

710779
class TestMultiFilesystems(CephFSTestCase):
711780
CLIENTS_REQUIRED = 2

0 commit comments

Comments
 (0)