Skip to content

Commit 0b71779

Browse files
Merge pull request ceph#56065 from rishabh-d-dave/mds-counters
mds: don't add counters in warning for standby-replay MDS Reviewed-by: Venky Shankar <[email protected]> Reviewed-by: Dhairya Parmar <[email protected]>
2 parents 18c7799 + 2784e22 commit 0b71779

File tree

4 files changed

+83
-4
lines changed

4 files changed

+83
-4
lines changed

qa/suites/fs/multifs/tasks/failover.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ overrides:
77
- \(MDS_UP_LESS_THAN_MAX\)
88
- \(MDS_DAMAGE\)
99
- \(FS_DEGRADED\)
10+
- \(MDS_CACHE_OVERSIZED\)
1011
ceph-fuse:
1112
disabled: true
1213
tasks:

qa/tasks/cephfs/filesystem.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,9 @@ def get_active_names(self, status=None):
11551155
"""
11561156
return self.get_daemon_names("up:active", status=status)
11571157

1158+
def get_standby_replay_names(self, status=None):
1159+
return self.get_daemon_names('up:standby-replay', status=status)
1160+
11581161
def get_all_mds_rank(self, status=None):
11591162
mdsmap = self.get_mds_map(status)
11601163
result = []

qa/tasks/cephfs/test_failover.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
import logging
44
import operator
55
from random import randint, choice
6+
from json import loads as json_loads
67

78
from tasks.cephfs.cephfs_test_case import CephFSTestCase
89
from teuthology.exceptions import CommandFailedError
10+
from teuthology.contextutil import safe_while
911
from tasks.cephfs.fuse_mount import FuseMount
1012

1113
log = logging.getLogger(__name__)
@@ -520,7 +522,8 @@ def test_connect_bootstrapping(self):
520522

521523

522524
class TestStandbyReplay(CephFSTestCase):
523-
CLIENTS_REQUIRED = 0
525+
526+
CLIENTS_REQUIRED = 1
524527
MDSS_REQUIRED = 4
525528

526529
def _confirm_no_replay(self):
@@ -706,6 +709,72 @@ def test_rank_stopped(self):
706709
status = self._confirm_single_replay()
707710
self.assertTrue(standby_count, len(list(status.get_standbys())))
708711

712+
def test_health_warn_oversize_cache_has_no_counters(self):
713+
'''
714+
Test that when MDS cache size crosses the limit, health warning
715+
printed for standby-replay MDS doesn't include inode and stray
716+
counters.
717+
718+
Tests: https://tracker.ceph.com/issues/63514
719+
'''
720+
# reduce MDS cache limit, default MDS cache limit is too high which
721+
# will unnecessarily consume too many resources and too much time.
722+
self.config_set('mds', 'mds_cache_memory_limit', '1K')
723+
# health warning for crossing MDS cache size limit won't be raised
724+
# until a threshold. default threshold is too high. it will
725+
# unnecessarily consume so much time and resources.
726+
self.config_set('mds', 'mds_health_cache_threshold', '1.000001')
727+
# so that there is only 1 active MDS and only 1 health warning is
728+
# produced. presence of 2 warning should cause this test to fail
729+
self.fs.set_max_mds(1)
730+
self.fs.set_allow_standby_replay(True)
731+
self._confirm_single_replay()
732+
self.fs.wait_for_daemons()
733+
# The call above (to self.fs.wait_for_daemons()) should ensure we have
734+
# only 1 active MDS on cluster
735+
active_mds_id = self.fs.get_active_names()[0]
736+
sr_mds_id = self.fs.get_standby_replay_names()[0]
737+
738+
# this should generate more than enough MDS cache to trigger health
739+
# warning MDS_CACHE_OVERSIZED.
740+
self.mount_a.open_n_background(".", 400)
741+
742+
# actual test begins now...
743+
with safe_while(sleep=3, tries=10) as proceed:
744+
while proceed():
745+
# logging cache generated so far for the sake of easy
746+
# debugging in future.
747+
self.get_ceph_cmd_stdout(f'tell mds.{active_mds_id} cache '
748+
'status')
749+
750+
health_report = self.get_ceph_cmd_stdout('health detail '
751+
'--format json')
752+
health_report = json_loads(health_report)
753+
if 'MDS_CACHE_OVERSIZED' not in health_report['checks']:
754+
log.debug('warning hasn\'t appeared in health report yet.'
755+
'trying again after some sleep...')
756+
continue
757+
758+
cache_warn = health_report['checks']['MDS_CACHE_OVERSIZED']\
759+
['detail']
760+
log.debug(f'cache_warn - {cache_warn}')
761+
# sanity check: "ceph health detail" output should've 2
762+
# warnings -- one for active MDS and other for standby-replay
763+
# MDS.
764+
if len(cache_warn) != 2:
765+
log.debug('expected 2 warnings but instead found '
766+
f'{len(cache_warn)} warnings; trying again '
767+
'after some sleep...')
768+
continue
769+
770+
for cw in cache_warn:
771+
msg = cw['message']
772+
if f'mds.{sr_mds_id}' not in cw['message']:
773+
continue
774+
self.assertNotIn('inodes in use by clients', msg)
775+
self.assertNotIn('stray files', msg)
776+
return
777+
709778

710779
class TestMultiFilesystems(CephFSTestCase):
711780
CLIENTS_REQUIRED = 2

src/mds/Beacon.cc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -490,9 +490,15 @@ void Beacon::notify_health(MDSRank const *mds)
490490
if (mds->mdcache->cache_overfull()) {
491491
CachedStackStringStream css;
492492
*css << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
493-
<< "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
494-
<< mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
495-
<< mds->mdcache->get_num_strays() << " stray files";
493+
<< "/" << bytes2str(mds->mdcache->cache_limit_memory()) << ")";
494+
// Don't include inode and stray counters in the report for standby-replay
495+
// MDSs. Since it is standby-replay, both will be zero, which might
496+
// confuse users.
497+
if (!mds->is_standby_replay()) {
498+
*css << "; " << mds->mdcache->num_inodes_with_caps << " inodes in "
499+
<< "use by clients, " << mds->mdcache->get_num_strays()
500+
<< " stray files";
501+
}
496502

497503
MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, css->strv());
498504
health.metrics.push_back(m);

0 commit comments

Comments
 (0)