Commit 80feaec

Merge pull request ceph#52891 from kamoltat/wip-ksirivad-fix-59172

qa/tasks/ceph_manager.py: Rewrite test_pool_min_size

Reviewed-by: Samuel Just <[email protected]>

2 parents 9e91f53 + 9762656, commit 80feaec

4 files changed: +117 -88 lines changed

qa/suites/rados/thrash-erasure-code/thrashers/minsize_recovery.yaml

Lines changed: 4 additions & 1 deletion

@@ -13,7 +13,10 @@ overrides:
         osd scrub min interval: 60
         osd scrub max interval: 120
         osd max backfills: 2
+  rados:
+    wait_for_all_active_clean_pgs: true
+
 tasks:
 - thrashosds:
     timeout: 1200
-    chance_test_min_size: 3
+    chance_test_min_size: 3
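
For context, chance_test_min_size: 3 is the knob that weights how often the thrasher runs test_pool_min_size relative to its other actions. The sketch below is a rough, hypothetical illustration of that kind of weighted selection; choose_action() and the action names are illustrative stand-ins, not the actual OSDThrasher internals.

import random

# Illustrative sketch only: roughly how a thrasher could turn "chance_*"
# weights from the yaml into a weighted random pick of its next action.
# choose_action() and the action names are hypothetical, not OSDThrasher code.
def choose_action(config):
    actions = [
        ("kill_osd", config.get("chance_down", 0.4)),
        ("test_pool_min_size", config.get("chance_test_min_size", 0)),
    ]
    total = sum(weight for _, weight in actions)
    roll = random.uniform(0, total)
    for name, weight in actions:
        if roll < weight:
            return name
        roll -= weight
    return actions[-1][0]

# With chance_test_min_size: 3 (as in this yaml) the min_size test is picked
# far more often than with the default weight of 0.
print(choose_action({"chance_down": 0.4, "chance_test_min_size": 3}))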

qa/tasks/ceph_manager.py

Lines changed: 97 additions & 87 deletions

@@ -891,115 +891,123 @@ def fix_pgp_num(self, pool=None):
         if self.ceph_manager.set_pool_pgpnum(pool, force):
             self.pools_to_fix_pgp_num.discard(pool)
 
+    def get_rand_pg_acting_set(self, pool_id=None):
+        """
+        Return an acting set of a random PG, you
+        have the option to specify which pool you
+        want the PG from.
+        """
+        pgs = self.ceph_manager.get_pg_stats()
+        if not pgs:
+            self.log('No pgs; doing nothing')
+            return
+        if pool_id:
+            pgs_in_pool = [pg for pg in pgs if int(pg['pgid'].split('.')[0]) == pool_id]
+            pg = random.choice(pgs_in_pool)
+        else:
+            pg = random.choice(pgs)
+        self.log('Choosing PG {id} with acting set {act}'.format(id=pg['pgid'],act=pg['acting']))
+        return pg['acting']
+
+    def get_k_m_ec_pool(self, pool, pool_json):
+        """
+        Returns k and m
+        """
+        k = 0
+        m = 99
+        try:
+            ec_profile = self.ceph_manager.get_pool_property(pool, 'erasure_code_profile')
+            ec_profile = pool_json['erasure_code_profile']
+            ec_profile_json = self.ceph_manager.raw_cluster_cmd(
+                'osd',
+                'erasure-code-profile',
+                'get',
+                ec_profile,
+                '--format=json')
+            ec_json = json.loads(ec_profile_json)
+            local_k = int(ec_json['k'])
+            local_m = int(ec_json['m'])
+            self.log("pool {pool} local_k={k} local_m={m}".format(pool=pool,
+                     k=local_k, m=local_m))
+            if local_k > k:
+                self.log("setting k={local_k} from previous {k}".format(local_k=local_k, k=k))
+                k = local_k
+            if local_m < m:
+                self.log("setting m={local_m} from previous {m}".format(local_m=local_m, m=m))
+                m = local_m
+        except CommandFailedError:
+            self.log("failed to read erasure_code_profile. %s was likely removed", pool)
+            return None, None
+
+        return k, m
+
     def test_pool_min_size(self):
         """
-        Loop to selectively push PGs below their min_size and test that recovery
-        still occurs.
+        Loop to selectively push PGs to their min_size and test that recovery
+        still occurs. We achieve this by randomly picking a PG and fail the OSDs
+        according to the PG's acting set.
         """
         self.log("test_pool_min_size")
         self.all_up()
         time.sleep(60) # buffer time for recovery to start.
         self.ceph_manager.wait_for_recovery(
             timeout=self.config.get('timeout')
         )
-        minout = int(self.config.get("min_out", 1))
-        minlive = int(self.config.get("min_live", 2))
-        mindead = int(self.config.get("min_dead", 1))
         self.log("doing min_size thrashing")
         self.ceph_manager.wait_for_clean(timeout=180)
         assert self.ceph_manager.is_clean(), \
             'not clean before minsize thrashing starts'
-        while not self.stopping:
+        start = time.time()
+        while time.time() - start < self.config.get("test_min_size_duration", 1800):
             # look up k and m from all the pools on each loop, in case it
             # changes as the cluster runs
-            k = 0
-            m = 99
-            has_pools = False
             pools_json = self.ceph_manager.get_osd_dump_json()['pools']
-
+            if len(pools_json) == 0:
+                self.log("No pools yet, waiting")
+                time.sleep(5)
+                continue
             for pool_json in pools_json:
                 pool = pool_json['pool_name']
-                has_pools = True
+                pool_id = pool_json['pool']
                 pool_type = pool_json['type'] # 1 for rep, 3 for ec
                 min_size = pool_json['min_size']
                 self.log("pool {pool} min_size is {min_size}".format(pool=pool,min_size=min_size))
-                try:
-                    ec_profile = self.ceph_manager.get_pool_property(pool, 'erasure_code_profile')
-                    if pool_type != PoolType.ERASURE_CODED:
-                        continue
-                    ec_profile = pool_json['erasure_code_profile']
-                    ec_profile_json = self.ceph_manager.raw_cluster_cmd(
-                        'osd',
-                        'erasure-code-profile',
-                        'get',
-                        ec_profile,
-                        '--format=json')
-                    ec_json = json.loads(ec_profile_json)
-                    local_k = int(ec_json['k'])
-                    local_m = int(ec_json['m'])
-                    self.log("pool {pool} local_k={k} local_m={m}".format(pool=pool,
-                             k=local_k, m=local_m))
-                    if local_k > k:
-                        self.log("setting k={local_k} from previous {k}".format(local_k=local_k, k=k))
-                        k = local_k
-                    if local_m < m:
-                        self.log("setting m={local_m} from previous {m}".format(local_m=local_m, m=m))
-                        m = local_m
-                except CommandFailedError:
-                    self.log("failed to read erasure_code_profile. %s was likely removed", pool)
+                if pool_type != PoolType.ERASURE_CODED:
                     continue
-
-            if has_pools :
-                self.log("using k={k}, m={m}".format(k=k,m=m))
-            else:
-                self.log("No pools yet, waiting")
-                time.sleep(5)
-                continue
-
-            if minout > len(self.out_osds): # kill OSDs and mark out
-                self.log("forced to out an osd")
-                self.kill_osd(mark_out=True)
-                continue
-            elif mindead > len(self.dead_osds): # kill OSDs but force timeout
-                self.log("forced to kill an osd")
-                self.kill_osd()
-                continue
-            else: # make mostly-random choice to kill or revive OSDs
-                minup = max(minlive, k)
-                rand_val = random.uniform(0, 1)
-                self.log("choosing based on number of live OSDs and rand val {rand}".\
-                         format(rand=rand_val))
-                if len(self.live_osds) > minup+1 and rand_val < 0.5:
-                    # chose to knock out as many OSDs as we can w/out downing PGs
-
-                    most_killable = min(len(self.live_osds) - minup, m)
-                    self.log("chose to kill {n} OSDs".format(n=most_killable))
-                    for i in range(1, most_killable):
-                        self.kill_osd(mark_out=True)
-                    time.sleep(10)
-                    # try a few times since there might be a concurrent pool
-                    # creation or deletion
-                    with safe_while(
-                            sleep=25, tries=5,
-                            action='check for active or peered') as proceed:
-                        while proceed():
-                            if self.ceph_manager.all_active_or_peered():
-                                break
-                            self.log('not all PGs are active or peered')
-                else: # chose to revive OSDs, bring up a random fraction of the dead ones
-                    self.log("chose to revive osds")
-                    for i in range(1, int(rand_val * len(self.dead_osds))):
-                        self.revive_osd(i)
-
-            # let PGs repair themselves or our next knockout might kill one
-            self.ceph_manager.wait_for_clean(timeout=self.config.get('timeout'))
-
-        # / while not self.stopping
-        self.all_up_in()
-
-        self.ceph_manager.wait_for_recovery(
-            timeout=self.config.get('timeout')
-        )
+                else:
+                    k, m = self.get_k_m_ec_pool(pool, pool_json)
+                    if k == None and m == None:
+                        continue
+                self.log("using k={k}, m={m}".format(k=k,m=m))
+
+                self.log("dead_osds={d}, live_osds={ld}".format(d=self.dead_osds, ld=self.live_osds))
+                minup = max(min_size, k)
+                # Choose a random PG and kill OSDs until only min_size remain
+                most_killable = min(len(self.live_osds) - minup, m)
+                self.log("chose to kill {n} OSDs".format(n=most_killable))
+                acting_set = self.get_rand_pg_acting_set(pool_id)
+                assert most_killable < len(acting_set)
+                for i in range(0, most_killable):
+                    self.kill_osd(osd=acting_set[i], mark_out=True)
+                self.log("dead_osds={d}, live_osds={ld}".format(d=self.dead_osds, ld=self.live_osds))
+                with safe_while(
+                        sleep=25, tries=5,
+                        action='check for active or peered') as proceed:
+                    while proceed():
+                        if self.ceph_manager.all_active_or_peered():
+                            break
+                        self.log('not all PGs are active or peered')
+                self.all_up_in() # revive all OSDs
+                # let PGs repair themselves or our next knockout might kill one
+                # wait_for_recovery since some workloads won't be able to go clean
+                self.ceph_manager.wait_for_recovery(
+                    timeout=self.config.get('timeout')
+                )
+        # while not self.stopping
+        self.all_up_in() # revive all OSDs
+
+        # Wait until all PGs are active+clean after we have revived all the OSDs
+        self.ceph_manager.wait_for_clean(timeout=self.config.get('timeout'))
 
     def inject_pause(self, conf_key, duration, check_after, should_be_down):
         """

@@ -2952,8 +2960,10 @@ def all_active_or_peered(self):
         """
         Wrapper to check if all PGs are active or peered
        """
+        self.log("checking for active or peered")
         pgs = self.get_pg_stats()
         if self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs):
+            self.log("all pgs are active or peered!")
             return True
         else:
             self.dump_pgs_not_active_peered(pgs)
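
The core of the rewrite is the kill budget: for each erasure-coded pool the loop keeps at least max(min_size, k) OSDs up, never downs more than m shards, and kills exactly that many OSDs straight from one PG's acting set. Below is a standalone sketch of that arithmetic with made-up example numbers, not output from the test itself.

# Standalone sketch of the kill-count arithmetic used by the rewritten
# test_pool_min_size; the values below are made-up examples.
def most_killable_osds(live_osds, min_size, k, m):
    """How many OSDs from a PG's acting set can be killed while keeping
    at least max(min_size, k) OSDs up and losing at most m shards."""
    minup = max(min_size, k)
    return min(len(live_osds) - minup, m)

# Example: a 2+2 EC pool (k=2, m=2) with min_size=3 and 6 live OSDs:
# minup = max(3, 2) = 3, so min(6 - 3, 2) = 2 OSDs may be killed.
assert most_killable_osds(live_osds=list(range(6)), min_size=3, k=2, m=2) == 2

# Example: a 4+2 EC pool (k=4, m=2) with min_size=5 and 6 live OSDs:
# minup = max(5, 4) = 5, so only min(6 - 5, 2) = 1 OSD may be killed.
assert most_killable_osds(live_osds=list(range(6)), min_size=5, k=4, m=2) == 1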

qa/tasks/rados.py

Lines changed: 13 additions & 0 deletions

@@ -130,6 +130,12 @@ def task(ctx, config):
     assert isinstance(config, dict), \
         "please list clients to run on"
 
+    log.info("config is {config}".format(config=str(config)))
+    overrides = ctx.config.get('overrides', {})
+    log.info("overrides is {overrides}".format(overrides=str(overrides)))
+    teuthology.deep_merge(config, overrides.get('rados', {}))
+    log.info("config is {config}".format(config=str(config)))
+
     object_size = int(config.get('object_size', 4000000))
     op_weights = config.get('op_weights', {})
     testdir = teuthology.get_testdir(ctx)

@@ -272,6 +278,13 @@ def thread():
                     )
                 tests[id_] = proc
             run.wait(tests.values())
+            wait_for_all_active_clean_pgs = config.get("wait_for_all_active_clean_pgs", False)
+            # usually set when we do min_size testing.
+            if wait_for_all_active_clean_pgs:
+                # Make sure we finish the test first before deleting the pool.
+                # Mainly used for test_pool_min_size
+                manager.wait_for_clean()
+                manager.wait_for_all_osds_up(timeout=1800)
 
             for pool in created_pools:
                 manager.wait_snap_trimming_complete(pool);
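
The first hunk makes the rados task honor suite-level "overrides: rados:" settings (such as wait_for_all_active_clean_pgs from the yaml above) by folding them into the task config with teuthology.deep_merge. A minimal sketch of the intended merge behavior follows; deep_merge_sketch is a local stand-in, since the real helper lives in teuthology and may differ in detail.

# Minimal sketch of the override merge, assuming a recursive dict merge.
# deep_merge_sketch is a local stand-in, not the real teuthology.deep_merge.
def deep_merge_sketch(config, overrides):
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(config.get(key), dict):
            deep_merge_sketch(config[key], value)
        else:
            config[key] = value
    return config

task_config = {"clients": ["client.0"], "op_weights": {"read": 100}}
suite_overrides = {"rados": {"wait_for_all_active_clean_pgs": True}}

deep_merge_sketch(task_config, suite_overrides.get("rados", {}))
assert task_config["wait_for_all_active_clean_pgs"] is True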

qa/tasks/thrashosds.py

Lines changed: 3 additions & 0 deletions

@@ -63,6 +63,9 @@ def task(ctx, config):
     - kills that osd
     - revives all other osds
     - verifies that the osds fully recover
+
+    test_min_size_duration: (1800) the number of seconds for
+      test_pool_min_size to last.
 
     timeout: (360) the number of seconds to wait for the cluster
       to become clean after each cluster change. If this doesn't
