Commit abfa5a3

qa/suites/rados: white list + add MON_NETSPLIT tests
Some existing netsplit tests in 3az + stretch mode needed whitelisting and a check for netsplit details.

Make qa/tasks/mon_thrash.py set mon_netsplit_grace_period to 30 seconds when we try to freeze monitors instead of killing them.

Make qa/tasks/stretch_mode_disable_enable.py set mon_netsplit_grace_period to 30 seconds during the `teardown` phase only.

Fixes: https://tracker.ceph.com/issues/71344
Signed-off-by: Kamoltat Sirivadhna <[email protected]>
1 parent df1105c commit abfa5a3
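
Both task changes raise the grace period through the same manager call; the pattern looks roughly like this (a minimal sketch — `manager` stands in for the CephManager handle each QA task already holds, and only the option name and the 30-second value come from this commit):

    def relax_netsplit_grace(manager, seconds=30):
        # Give monitors a longer window to re-establish connections after a
        # freeze or mass restart before a MON_NETSPLIT warning is raised.
        manager.raw_cluster_cmd(
            'config', 'set', 'mon', 'mon_netsplit_grace_period', str(seconds))

From a shell, the equivalent is `ceph config set mon mon_netsplit_grace_period 30`.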

9 files changed, +167 -6 lines changed

qa/suites/netsplit/ceph.yaml

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ overrides:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 tasks:
 - install:
 - ceph:

qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ tasks:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 - workunit:
     clients:
       client.0:

qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml

Lines changed: 0 additions & 2 deletions
@@ -46,8 +46,6 @@ tasks:
       - \(OSD_DATACENTER_DOWN\)
       - \(OSD_DOWN\)
       - \(OSD_HOST_DOWN\)
-
-
 - workunit:
     clients:
       client.0:

qa/suites/rados/singleton/msgr-failures/few.yaml

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)

qa/suites/rados/singleton/msgr-failures/many.yaml

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)

qa/tasks/mon_thrash.py

Lines changed: 10 additions & 1 deletion
@@ -64,7 +64,7 @@ class MonitorThrasher(Thrasher):
     task to run with as many as just one single monitor.
     (default: True)
     freeze_mon_probability: how often to freeze the mon instead of killing it,
-                            in % (default: 0)
+                            in % (default: 10)
     freeze_mon_duration: how many seconds to freeze the mon (default: 15)
     scrub Scrub after each iteration (default: True)
     check_mds_failover Check if mds failover happened (default: False)
@@ -128,6 +128,15 @@ def __init__(self, ctx, manager, config, name, logger):
         self.scrub = self.config.get('scrub', True)

         self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        # In some cases where many monitors froze at once and revived
+        # after a long time might cause the connection to take more time to establish.
+        # Therefore, we increase the netsplit grace period to 30 seconds.
+        # This is to avoid false positives in the netsplit test, while still keeping
+        # the integrity of the test.
+        if self.freeze_mon_probability > 0:
+            self.manager.raw_cluster_cmd(
+                'config', 'set', 'mon', 'mon_netsplit_grace_period', '30')
+
         self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

         assert self.max_killable() > 0, \
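
The docstring fix above reflects the actual default (10%). As a rough illustration of what a percentage-style `freeze_mon_probability` means per thrashing round, a simplified sketch (not the exact logic in mon_thrash.py) would be:

    import random

    def should_freeze(freeze_mon_probability=10):
        # Freeze (rather than kill) the chosen mon in roughly
        # freeze_mon_probability percent of thrashing rounds.
        return random.uniform(0, 100) < freeze_mon_probability

With the default of 10, about one round in ten freezes a monitor instead of killing it, which is why the grace-period bump above is applied whenever the probability is non-zero.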

qa/tasks/stretch_mode_disable_enable.py

Lines changed: 21 additions & 3 deletions
@@ -122,13 +122,21 @@ def _bring_back_mon(self, mon):
         """
         Bring back the mon.
         """
+        log.debug("_bring_back_mon %s", mon)
+        # If the mon is already up, do nothing
+        quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+        if mon in quorum_names:
+            log.debug("mon.%s is already up", mon)
+            return
+        # If the mon is not up, try to bring it back
+        log.debug("Bringing back mon.%s", mon)
         try:
             self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
         except Exception:
             log.error("Failed to bring back mon.{}".format(str(mon)))
             pass

-    def _get_host(self, osd):
+    def _get_osd_host(self, osd):
         """
         Get the host of the osd.
         """
@@ -142,7 +150,7 @@ def _move_osd_back_to_host(self, osd):
         """
         Move the osd back to the host.
         """
-        host = self._get_host(osd)
+        host = self._get_osd_host(osd)
         assert host is not None, "The host of osd {} is not found.".format(osd)
         log.debug("Moving osd.%d back to %s", osd, host)
         self.mgr_cluster.mon_manager.raw_cluster_cmd(
@@ -155,6 +163,7 @@ def tearDown(self):
         Clean up the cluster after the test.
         """
         # Remove the pool
+        log.debug("Tear down the test")
         if self.POOL in self.mgr_cluster.mon_manager.pools:
             self.mgr_cluster.mon_manager.remove_pool(self.POOL)

@@ -168,7 +177,14 @@ def tearDown(self):
             if osd['up'] == 0:
                 self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
                 self._move_osd_back_to_host(osd['osd'])
-
+
+        # Set the mon_netsplit_grace_period to 30 seconds.
+        # Sometimes when many mons restart at the same time
+        # it can take longer for the monitors to establish
+        # a connection.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'
+        )
         # Bring back all the mons
         mons = self._get_all_mons_from_all_dc()
         for mon in mons:
@@ -359,6 +375,7 @@ def _stretch_mode_enabled_correctly(self):
             self.TIEBREAKER_MON_NAME,
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is enabled correctly.")

     def _stretch_mode_disabled_correctly(self):
         """
@@ -445,6 +462,7 @@ def _stretch_mode_disabled_correctly(self):
             "",
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is disabled correctly.")

     def test_disable_stretch_mode(self):
         """

qa/tasks/test_netsplit.py

Lines changed: 70 additions & 0 deletions
@@ -193,6 +193,50 @@ def _check_if_connect(self, config):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_netsplit_dc1_dc2(self):
         """
         Test Netsplit between dc1 and dc2
@@ -220,6 +264,13 @@ def test_netsplit_dc1_dc2(self):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is raised we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -263,6 +314,12 @@ def test_netsplit_dc1_dc2(self):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
@@ -305,6 +362,13 @@ def test_netsplit_arbiter_dc1_and_dc1_dc2(self):
             lambda: self._check_if_disconnect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is raised we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -352,6 +416,12 @@ def test_netsplit_arbiter_dc1_and_dc1_dc2(self):
             lambda: self._check_if_connect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is not raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
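
For reference, the two helpers above only read a small slice of the `ceph health detail --format=json` output. An illustrative payload (hand-written to match the fields the helpers access, not captured from a live cluster) that would satisfy both checks:

    import json

    example = json.loads("""
    {
      "checks": {
        "MON_NETSPLIT": {
          "detail": [
            {"message": "Netsplit detected between dc1 and dc2"}
          ]
        }
      }
    }
    """)
    checks = example.get("checks", {})
    assert "MON_NETSPLIT" in checks
    assert any("Netsplit detected between dc1 and dc2" in d.get("message", "")
               for d in checks["MON_NETSPLIT"].get("detail", []))

Real output carries more keys (severity, summary, and so on); the helpers deliberately ignore everything except `checks`, the `MON_NETSPLIT` entry, and its `detail` messages.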

qa/tasks/test_netsplit_3az_stretch_pool.py

Lines changed: 62 additions & 0 deletions
@@ -209,6 +209,51 @@ def _check_if_connect(self, config):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        log.debug("checks: {}".format(checks))
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_mon_netsplit(self):
         """
         Test the mon netsplit scenario, if cluster is actually accessible.
@@ -247,6 +292,7 @@ def test_mon_netsplit(self):
         # Scenario 1: disconnect Site 1 and Site 2
         # Site 3 is still connected to both Site 1 and Site 2
         config = ["mon.a", "mon.d"]
+        location = ["dc1", "dc2"]
         # disconnect the mons
         self._disconnect_mons(config)
         # wait for the mons to be disconnected
@@ -256,6 +302,16 @@ def test_mon_netsplit(self):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if location level MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: self._check_mon_netsplit_warning_raised(
+                "Netsplit detected between {} and {}".format(
+                    location[0], location[1]
+                ),
+            ),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -271,6 +327,12 @@ def test_mon_netsplit(self):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is cleared
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # wait for the PGs to recover
         time.sleep(self.RECOVERY_PERIOD)
         # check if all PGs are active+clean
