Skip to content

Commit 39a09a3

Browse files
authored
Merge pull request ceph#58275 from NitzanMordhai/wip-nitzn-host-thraser-fix-min-in-checks
suites: host thrasher should check min_in before thrashing host
2 parents 24ae2c9 + 89d695f commit 39a09a3

File tree

8 files changed

+150
-17
lines changed

8 files changed

+150
-17
lines changed

qa/suites/rados/thrash/thrashers/careful.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ tasks:
2121
- thrashosds:
2222
timeout: 1200
2323
min_in: 2
24-
thrash_hosts: true
2524
chance_pgnum_grow: 1
2625
chance_pgnum_shrink: 1
2726
chance_pgpnum_fix: 1
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
overrides:
2+
ceph:
3+
log-ignorelist:
4+
- but it is still running
5+
- objects unfound and apparently lost
6+
conf:
7+
osd:
8+
osd debug reject backfill probability: .3
9+
osd scrub min interval: 60
10+
osd scrub max interval: 120
11+
osd max backfills: 3
12+
osd snap trim sleep: 2
13+
mon:
14+
mon min osdmap epochs: 50
15+
paxos service trim min: 10
16+
# prune full osdmaps regularly
17+
mon osdmap full prune min: 15
18+
mon osdmap full prune interval: 2
19+
mon osdmap full prune txsize: 2
20+
tasks:
21+
- thrashosds:
22+
timeout: 1200
23+
min_in: 2
24+
thrash_hosts: true
25+
chance_pgnum_grow: 1
26+
chance_pgnum_shrink: 1
27+
chance_pgpnum_fix: 1
28+
aggressive_pg_num_changes: false

qa/suites/rados/thrash/thrashers/default.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ tasks:
2222
- thrashosds:
2323
timeout: 1200
2424
min_in: 2
25-
thrash_hosts: true
2625
chance_pgnum_grow: 1
2726
chance_pgnum_shrink: 1
2827
chance_pgpnum_fix: 1
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
overrides:
2+
ceph:
3+
log-ignorelist:
4+
- but it is still running
5+
- objects unfound and apparently lost
6+
conf:
7+
osd:
8+
osd debug reject backfill probability: .3
9+
osd scrub min interval: 60
10+
osd scrub max interval: 120
11+
osd max backfills: 3
12+
osd snap trim sleep: 2
13+
osd delete sleep: 1
14+
mon:
15+
mon min osdmap epochs: 50
16+
paxos service trim min: 10
17+
# prune full osdmaps regularly
18+
mon osdmap full prune min: 15
19+
mon osdmap full prune interval: 2
20+
mon osdmap full prune txsize: 2
21+
tasks:
22+
- thrashosds:
23+
timeout: 1200
24+
min_in: 2
25+
thrash_hosts: true
26+
chance_pgnum_grow: 1
27+
chance_pgnum_shrink: 1
28+
chance_pgpnum_fix: 1
29+
chance_bluestore_reshard: 1
30+
bluestore_new_sharding: random
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
overrides:
2+
ceph:
3+
log-ignorelist:
4+
- but it is still running
5+
- objects unfound and apparently lost
6+
- osd_map_cache_size
7+
conf:
8+
mon:
9+
mon min osdmap epochs: 50
10+
paxos service trim min: 10
11+
# prune full osdmaps regularly
12+
mon osdmap full prune min: 15
13+
mon osdmap full prune interval: 2
14+
mon osdmap full prune txsize: 2
15+
osd:
16+
osd map cache size: 1
17+
osd scrub min interval: 60
18+
osd scrub max interval: 120
19+
osd scrub during recovery: false
20+
osd max backfills: 6
21+
osd beacon report interval: 30
22+
tasks:
23+
- thrashosds:
24+
timeout: 1800
25+
min_in: 2
26+
thrash_hosts: true
27+
chance_pgnum_grow: 0.25
28+
chance_pgnum_shrink: 0.25
29+
chance_pgpnum_fix: 0.25
30+
chance_test_map_discontinuity: 2
31+
map_discontinuity_sleep_time: 200

qa/suites/rados/thrash/thrashers/pggrow.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,5 @@ tasks:
2121
- thrashosds:
2222
timeout: 1200
2323
min_in: 2
24-
thrash_hosts: true
2524
chance_pgnum_grow: 2
2625
chance_pgpnum_fix: 1
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
overrides:
2+
ceph:
3+
log-ignorelist:
4+
- but it is still running
5+
- objects unfound and apparently lost
6+
conf:
7+
osd:
8+
osd scrub min interval: 60
9+
osd scrub max interval: 120
10+
filestore odsync write: true
11+
osd max backfills: 2
12+
osd snap trim sleep: .5
13+
mon:
14+
mon min osdmap epochs: 50
15+
paxos service trim min: 10
16+
# prune full osdmaps regularly
17+
mon osdmap full prune min: 15
18+
mon osdmap full prune interval: 2
19+
mon osdmap full prune txsize: 2
20+
tasks:
21+
- thrashosds:
22+
timeout: 1200
23+
min_in: 2
24+
thrash_hosts: true
25+
chance_pgnum_grow: 2
26+
chance_pgpnum_fix: 1

qa/tasks/ceph_manager.py

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -569,23 +569,45 @@ def revive_osd(self, osd=None, skip_admin_check=False):
569569

570570
def out_host(self, host=None):
571571
"""
572-
Make all osds on a host out
572+
Make all OSDs on a host out, provided at least min_in OSDs remain in afterwards.
573573
:param host: Host to be marked.
574574
"""
575-
# check that all osd remotes have a valid console
575+
# Check that all OSD remotes have a valid console
576576
osds = self.ceph_manager.ctx.cluster.only(teuthology.is_type('osd', self.ceph_manager.cluster))
577-
if host is None:
578-
host = random.choice(list(osds.remotes.keys()))
579-
self.log("Removing all osds in host %s" % (host,))
580-
581-
for role in osds.remotes[host]:
582-
if not role.startswith("osd."):
583-
continue
584-
osdid = int(role.split('.')[1])
585-
if self.in_osds.count(osdid) == 0:
586-
continue
587-
self.out_osd(osdid)
577+
all_hosts = list(osds.remotes.keys())
578+
min_in = self.minin
579+
580+
if host is not None:
581+
all_hosts = [host] if host in all_hosts else []
582+
583+
random.shuffle(all_hosts) # Shuffle the list to pick hosts randomly
584+
585+
for host in all_hosts:
586+
self.log("Checking the number of in OSDs in host %s" % (host,))
587+
588+
# Count the number of in OSDs in the host
589+
in_host_osd_count = 0
590+
for role in osds.remotes[host]:
591+
if role.startswith("osd."):
592+
osdid = int(role.split('.')[1])
593+
if osdid in self.in_osds:
594+
in_host_osd_count += 1
595+
596+
# Check whether taking out that host would cause the number
597+
# of in OSDs to be less than min_in
598+
if len(self.in_osds) - in_host_osd_count >= min_in:
599+
self.log("Removing all OSDs in host %s" % (host,))
600+
# Proceed to take out OSDs
601+
for role in osds.remotes[host]:
602+
if role.startswith("osd."):
603+
osdid = int(role.split('.')[1])
604+
if osdid in self.in_osds:
605+
self.out_osd(osdid)
606+
return
607+
else:
608+
self.log("Host %s can't be trashed as it will left %d OSDs in" % (host, len(self.in_osds) - in_host_osd_count))
588609

610+
self.log("No suitable host found to thrash")
589611

590612
def out_osd(self, osd=None):
591613
"""
@@ -1254,7 +1276,6 @@ def choose_action(self):
12541276
(minin, minout, minlive, mindead, chance_down))
12551277
actions = []
12561278
if thrash_hosts:
1257-
self.log("check thrash_hosts")
12581279
if len(self.in_osds) > minin:
12591280
self.log("check thrash_hosts: in_osds > minin")
12601281
actions.append((self.out_host, 1.0,))

0 commit comments

Comments
 (0)