Skip to content

Commit bc2bb38

Browse files
authored
Merge pull request ceph#57906 from kamoltat/wip-ksirivad-stretch-mode-netsplit-test
mon/ElectionLogic: tie-breaker mon ignore proposal from marked down mon Reviewed-by: Greg Farnum <[email protected]>
2 parents 130491f + 7a90d9d commit bc2bb38

File tree

14 files changed

+613
-45
lines changed

14 files changed

+613
-45
lines changed

qa/suites/netsplit/ceph.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,27 @@ overrides:
1010
mon osdmap full prune min: 15
1111
mon osdmap full prune interval: 2
1212
mon osdmap full prune txsize: 2
13+
debug mon: 30
1314
# thrashing monitors may make mgr have trouble w/ its keepalive
1415
log-ignorelist:
16+
- overall HEALTH_
17+
- \(OSDMAP_FLAGS\)
18+
- \(OSD_
19+
- \(PG_
20+
- \(POOL_
21+
- \(CACHE_POOL_
22+
- \(OBJECT_
23+
- \(SLOW_OPS\) # slow mons -> slow peering -> PG_AVAILABILITY
24+
- \(REQUEST_SLOW\)
25+
- \(TOO_FEW_PGS\)
26+
- slow request
27+
- \(POOL_APP_NOT_ENABLED\)
1528
- overall HEALTH_
1629
- \(MGR_DOWN\)
1730
- \(MON_DOWN\)
18-
# slow mons -> slow peering -> PG_AVAILABILITY
1931
- \(PG_AVAILABILITY\)
2032
- \(SLOW_OPS\)
33+
- \[WRN\]
2134
tasks:
2235
- install:
2336
- ceph:

qa/suites/netsplit/cluster.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
roles:
2-
- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3]
3-
- [mon.b, mgr.y, osd.4, osd.5, osd.6, osd.7, client.0]
4-
- [mon.c]
2+
- [mon.a, mon.b, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
3+
- [mon.c, mon.d, mgr.y, mds.b, osd.4, osd.5, osd.6, osd.7]
4+
- [mon.e, mgr.z, mds.c]
5+
- [client.0]
56
openstack:
67
- volumes: # attached to each instance
78
count: 4
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
overrides:
2+
ceph:
3+
conf:
4+
global:
5+
mon election default strategy: 3
6+
tasks:
7+
- workunit:
8+
clients:
9+
client.0:
10+
- mon/setup_stretch_cluster.sh
11+
- cephfs_test_runner:
12+
modules:
13+
- tasks.test_netsplit

qa/suites/netsplit/tests/mon_pool_ops.yaml

Lines changed: 0 additions & 21 deletions
This file was deleted.

qa/tasks/ceph_test_case.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,3 +341,37 @@ def wait_until_true(cls, condition, timeout, check_fn=None, period=5):
341341
log.debug("wait_until_true: waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
342342
time.sleep(period)
343343
elapsed += period
344+
345+
@classmethod
def wait_until_true_and_hold(cls, condition, timeout, success_hold_time, check_fn=None, period=5):
    """
    Wait until condition() becomes true and then stays true for
    success_hold_time consecutive one-second polls.

    :param condition: zero-argument callable; truthy once the awaited
                      state has been reached.
    :param timeout: seconds of accumulated waiting allowed before giving
                    up (or before consulting check_fn).
    :param success_hold_time: seconds the condition must keep holding;
                              must be strictly less than timeout.
    :param check_fn: optional zero-argument callable consulted when the
                     timeout is hit; while it returns a truthy value the
                     elapsed time is reset, at most 5 times.
    :param period: seconds to sleep between polls while the condition is
                   not (or no longer) holding.
    :raises AssertionError: if success_hold_time >= timeout.
    :raises TestTimeoutError: if the timeout expires and check_fn is
                              absent, reports no progress, or the retry
                              budget is used up.
    """
    # The comparison is strict, so the message must say "less than".
    assert success_hold_time < timeout, "success_hold_time should be less than timeout"
    elapsed = 0
    retry_count = 0
    while True:
        if condition():
            held = 0
            # Re-check once per second; the hold ends as soon as the
            # condition turns false.
            while held < success_hold_time and condition():
                held += 1
                time.sleep(1)
                elapsed += 1
            # '>=' instead of '==' so a fractional hold time still
            # terminates instead of never matching exactly.
            if held >= success_hold_time:
                log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
                return
            # The hold was broken: fall through to the timeout accounting
            # below so a flapping condition cannot spin here forever.
        if elapsed >= timeout:
            if check_fn and check_fn() and retry_count < 5:
                # Progress is being made: grant another full timeout window.
                elapsed = 0
                retry_count += 1
                log.debug("wait_until_true_and_hold: making progress, waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
            else:
                raise TestTimeoutError("Timed out after {0}s and {1} retries".format(elapsed, retry_count))
        else:
            log.debug("wait_until_true_and_hold waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
            time.sleep(period)
            elapsed += period

qa/tasks/netsplit.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@
1212

1313
log = logging.getLogger(__name__)
1414

15+
1516
def get_ip_and_ports(ctx, daemon):
17+
"""
18+
Get the IP and port list for the <daemon>.
19+
"""
1620
assert daemon.startswith('mon.')
1721
addr = ctx.ceph['ceph'].mons['{a}'.format(a=daemon)]
1822
ips = re.findall("[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[:[0-9]*]*", addr)
@@ -27,11 +31,16 @@ def get_ip_and_ports(ctx, daemon):
2731
port_list.append(port_str)
2832
return (plain_ip, port_list)
2933

34+
3035
def disconnect(ctx, config):
31-
assert len(config) == 2 # we can only disconnect pairs right now
36+
"""
37+
Disconnect the mons in the <config> list.
38+
"""
39+
assert len(config) == 2 # we can only disconnect pairs right now
3240
# and we can only disconnect mons right now
3341
assert config[0].startswith('mon.')
3442
assert config[1].startswith('mon.')
43+
log.info("Disconnecting {a} and {b}".format(a=config[0], b=config[1]))
3544
(ip1, _) = get_ip_and_ports(ctx, config[0])
3645
(ip2, _) = get_ip_and_ports(ctx, config[1])
3746

@@ -40,21 +49,26 @@ def disconnect(ctx, config):
4049
assert host1 is not None
4150
assert host2 is not None
4251

43-
host1.run(
44-
args = ["sudo", "iptables", "-A", "INPUT", "-p", "tcp", "-s",
45-
ip2, "-j", "DROP"]
46-
)
47-
host2.run(
48-
args = ["sudo", "iptables", "-A", "INPUT", "-p", "tcp", "-s",
49-
ip1, "-j", "DROP"]
50-
)
52+
host1.run(args=["sudo", "iptables", "-A", "INPUT",
53+
"-s", ip2, "-j", "DROP"])
54+
host1.run(args=["sudo", "iptables", "-A", "OUTPUT",
55+
"-d", ip2, "-j", "DROP"])
56+
57+
host2.run(args=["sudo", "iptables", "-A", "INPUT",
58+
"-s", ip1, "-j", "DROP"])
59+
host2.run(args=["sudo", "iptables", "-A", "OUTPUT",
60+
"-d", ip1, "-j", "DROP"])
61+
5162

5263
def reconnect(ctx, config):
53-
assert len(config) == 2 # we can only disconnect pairs right now
64+
"""
65+
Reconnect the mons in the <config> list.
66+
"""
67+
assert len(config) == 2 # we can only disconnect pairs right now
5468
# and we can only disconnect mons right now
5569
assert config[0].startswith('mon.')
5670
assert config[1].startswith('mon.')
57-
71+
log.info("Reconnecting {a} and {b}".format(a=config[0], b=config[1]))
5872
(ip1, _) = get_ip_and_ports(ctx, config[0])
5973
(ip2, _) = get_ip_and_ports(ctx, config[1])
6074

@@ -63,11 +77,12 @@ def reconnect(ctx, config):
6377
assert host1 is not None
6478
assert host2 is not None
6579

66-
host1.run(
67-
args = ["sudo", "iptables", "-D", "INPUT", "-p", "tcp", "-s",
68-
ip2, "-j", "DROP"]
69-
)
70-
host2.run(
71-
args = ["sudo", "iptables", "-D", "INPUT", "-p", "tcp", "-s",
72-
ip1, "-j", "DROP"]
73-
)
80+
host1.run(args=["sudo", "iptables", "-D", "INPUT",
81+
"-s", ip2, "-j", "DROP"])
82+
host1.run(args=["sudo", "iptables", "-D", "OUTPUT",
83+
"-d", ip2, "-j", "DROP"])
84+
85+
host2.run(args=["sudo", "iptables", "-D", "INPUT",
86+
"-s", ip1, "-j", "DROP"])
87+
host2.run(args=["sudo", "iptables", "-D", "OUTPUT",
88+
"-d", ip1, "-j", "DROP"])

0 commit comments

Comments
 (0)