Merge pull request ceph#56065 from rishabh-d-dave/mds-counters

rishabh-d-dave · web-flow · commit 0b7177941c52 · 2024-05-03T13:26:49.000+05:30
mds: don't add counters in warning for standby-replay MDS

Reviewed-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Dhairya Parmar <dparmar@redhat.com>
diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml
@@ -7,6 +7,7 @@ overrides:
       - \(MDS_UP_LESS_THAN_MAX\)
       - \(MDS_DAMAGE\)
       - \(FS_DEGRADED\)
+      - \(MDS_CACHE_OVERSIZED\)
   ceph-fuse:
     disabled: true
 tasks:
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
@@ -1155,6 +1155,9 @@ def get_active_names(self, status=None):
         """
         return self.get_daemon_names("up:active", status=status)
 
+    def get_standby_replay_names(self, status=None):
+        return self.get_daemon_names('up:standby-replay', status=status)
+
     def get_all_mds_rank(self, status=None):
         mdsmap = self.get_mds_map(status)
         result = []
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
@@ -3,9 +3,11 @@
 import logging
 import operator
 from random import randint, choice
+from json import loads as json_loads
 
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while
 from tasks.cephfs.fuse_mount import FuseMount
 
 log = logging.getLogger(__name__)
@@ -520,7 +522,8 @@ def test_connect_bootstrapping(self):
 
 
 class TestStandbyReplay(CephFSTestCase):
-    CLIENTS_REQUIRED = 0
+
+    CLIENTS_REQUIRED = 1
     MDSS_REQUIRED = 4
 
     def _confirm_no_replay(self):
@@ -706,6 +709,72 @@ def test_rank_stopped(self):
         status = self._confirm_single_replay()
         self.assertTrue(standby_count, len(list(status.get_standbys())))
 
+    def test_health_warn_oversize_cache_has_no_counters(self):
+        '''
+        Test that when MDS cache size crosses the limit, health warning
+        printed for standby-replay MDS doesn't include inode and stray
+        counters.
+
+        Tests: https://tracker.ceph.com/issues/63514
+        '''
+        # reduce MDS cache limit, default MDS cache limit is too high which
+        # will unnecessarily consume too many resources and too much time.
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        # health warning for crossing MDS cache size limit won't be raised
+        # until a threshold is crossed. the default threshold is too high; it
+        # would unnecessarily consume a lot of time and resources.
+        self.config_set('mds', 'mds_health_cache_threshold', '1.000001')
+        # so that there is only one active MDS; with its standby-replay MDS
+        # exactly 2 health warnings (one per MDS) are expected below.
+        self.fs.set_max_mds(1)
+        self.fs.set_allow_standby_replay(True)
+        self._confirm_single_replay()
+        self.fs.wait_for_daemons()
+        # The call above (to self.fs.wait_for_daemons()) should ensure we have
+        # only 1 active MDS on cluster
+        active_mds_id = self.fs.get_active_names()[0]
+        sr_mds_id = self.fs.get_standby_replay_names()[0]
+
+        # this should generate more than enough MDS cache to trigger health
+        # warning MDS_CACHE_OVERSIZED.
+        self.mount_a.open_n_background(".", 400)
+
+        # actual test begins now...
+        with safe_while(sleep=3, tries=10) as proceed:
+            while proceed():
+                # logging cache generated so far for the sake of easy
+                # debugging in future.
+                self.get_ceph_cmd_stdout(f'tell mds.{active_mds_id} cache '
+                                          'status')
+
+                health_report = self.get_ceph_cmd_stdout('health detail '
+                                                         '--format json')
+                health_report = json_loads(health_report)
+                if 'MDS_CACHE_OVERSIZED' not in health_report['checks']:
+                    log.debug('warning hasn\'t appeared in health report yet.'
+                             'trying again after some sleep...')
+                    continue
+
+                cache_warn = health_report['checks']['MDS_CACHE_OVERSIZED']\
+                        ['detail']
+                log.debug(f'cache_warn - {cache_warn}')
+                # sanity check: "ceph health detail" output should've 2
+                # warnings -- one for active MDS and other for standby-replay
+                # MDS.
+                if len(cache_warn) != 2:
+                    log.debug('expected 2 warnings but instead found '
+                              f'{len(cache_warn)} warnings; trying again '
+                               'after some sleep...')
+                    continue
+
+                for cw in cache_warn:
+                    msg = cw['message']
+                    if f'mds.{sr_mds_id}' not in cw['message']:
+                        continue
+                    self.assertNotIn('inodes in use by clients', msg)
+                    self.assertNotIn('stray files', msg)
+                    return
+
 
 class TestMultiFilesystems(CephFSTestCase):
     CLIENTS_REQUIRED = 2
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
@@ -490,9 +490,15 @@ void Beacon::notify_health(MDSRank const *mds)
   if (mds->mdcache->cache_overfull()) {
     CachedStackStringStream css;
     *css << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
-        << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
-        << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
-        << mds->mdcache->get_num_strays() << " stray files";
+        << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << ")";
+    // Don't include inode and stray counters in the report for standby-replay
+    // MDSs. Since it is standby-replay, both will be zero, which might
+    // confuse users.
+    if (!mds->is_standby_replay()) {
+	*css << "; " << mds->mdcache->num_inodes_with_caps << " inodes in "
+	     << "use by clients, " << mds->mdcache->get_num_strays()
+	     << " stray files";
+    }
 
     MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, css->strv());
     health.metrics.push_back(m);