Skip to content

Commit 66b42c3

Browse files
committed
suites: host thrasher should check min_in before thrashing host
We need to check whether taking a host out would cause the total number of in OSDs to be less than min_in. Fixes: https://tracker.ceph.com/issues/66657 Signed-off-by: Nitzan Mordechai <[email protected]>
1 parent bdbef73 commit 66b42c3

File tree

1 file changed

+35
-14
lines changed

1 file changed

+35
-14
lines changed

qa/tasks/ceph_manager.py

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -569,23 +569,45 @@ def revive_osd(self, osd=None, skip_admin_check=False):
569569

570570
def out_host(self, host=None):
571571
"""
572-
Make all osds on a host out
572+
Make all OSDs on a host out, provided at least min_in OSDs remain in afterwards.
573573
:param host: Host to be marked.
574574
"""
575-
# check that all osd remotes have a valid console
575+
# Check that all OSD remotes have a valid console
576576
osds = self.ceph_manager.ctx.cluster.only(teuthology.is_type('osd', self.ceph_manager.cluster))
577-
if host is None:
578-
host = random.choice(list(osds.remotes.keys()))
579-
self.log("Removing all osds in host %s" % (host,))
580-
581-
for role in osds.remotes[host]:
582-
if not role.startswith("osd."):
583-
continue
584-
osdid = int(role.split('.')[1])
585-
if self.in_osds.count(osdid) == 0:
586-
continue
587-
self.out_osd(osdid)
577+
all_hosts = list(osds.remotes.keys())
578+
min_in = self.minin
579+
580+
if host is not None:
581+
all_hosts = [host] if host in all_hosts else []
582+
583+
random.shuffle(all_hosts) # Shuffle the list to pick hosts randomly
584+
585+
for host in all_hosts:
586+
self.log("Checking the number of in OSDs in host %s" % (host,))
587+
588+
# Count the number of in OSDs in the host
589+
in_host_osd_count = 0
590+
for role in osds.remotes[host]:
591+
if role.startswith("osd."):
592+
osdid = int(role.split('.')[1])
593+
if osdid in self.in_osds:
594+
in_host_osd_count += 1
595+
596+
# Check whether taking out that host would cause the number
597+
# of in OSDs to be less than min_in
598+
if len(self.in_osds) - in_host_osd_count >= min_in:
599+
self.log("Removing all OSDs in host %s" % (host,))
600+
# Proceed to take out OSDs
601+
for role in osds.remotes[host]:
602+
if role.startswith("osd."):
603+
osdid = int(role.split('.')[1])
604+
if osdid in self.in_osds:
605+
self.out_osd(osdid)
606+
return
607+
else:
608+
self.log("Host %s can't be trashed as it will left %d OSDs in" % (host, len(self.in_osds) - in_host_osd_count))
588609

610+
self.log("No suitable host found to thrash")
589611

590612
def out_osd(self, osd=None):
591613
"""
@@ -1254,7 +1276,6 @@ def choose_action(self):
12541276
(minin, minout, minlive, mindead, chance_down))
12551277
actions = []
12561278
if thrash_hosts:
1257-
self.log("check thrash_hosts")
12581279
if len(self.in_osds) > minin:
12591280
self.log("check thrash_hosts: in_osds > minin")
12601281
actions.append((self.out_host, 1.0,))

0 commit comments

Comments
 (0)