Skip to content

Commit abfff2b

Browse files
authored
Merge pull request ceph#57146 from kamoltat/wip-ksirivad-fix-connection-score-json
src/mon/ConnectionTracker.cc: Fix dump function Reviewed-by Kamoltat Sirivadhna <[email protected]>
2 parents c1dddbf + ed7f4e8 commit abfff2b

File tree

4 files changed

+143
-11
lines changed

4 files changed

+143
-11
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
roles:
2+
- - mon.a
3+
- mon.b
4+
- mon.c
5+
- osd.0
6+
- osd.1
7+
- osd.2
8+
- mgr.x
9+
- client.0
10+
11+
openstack:
12+
- volumes: # attached to each instance
13+
count: 3
14+
size: 10 # GB
15+
tasks:
16+
- install:
17+
- ceph:
18+
pre-mgr-commands:
19+
- sudo ceph config set mgr mgr_pool false --force
20+
log-ignorelist:
21+
- overall HEALTH_
22+
- \(OSDMAP_FLAGS\)
23+
- \(OSD_
24+
- \(PG_
25+
- \(POOL_
26+
- \(CACHE_POOL_
27+
- \(OBJECT_
28+
- \(SLOW_OPS\)
29+
- \(REQUEST_SLOW\)
30+
- \(TOO_FEW_PGS\)
31+
- slow request
32+
- \(POOL_APP_NOT_ENABLED\)
33+
- overall HEALTH_
34+
- \(MGR_DOWN\)
35+
- \(MON_DOWN\)
36+
- \(PG_AVAILABILITY\)
37+
- \(SLOW_OPS\)
38+
- cephfs_test_runner:
39+
modules:
40+
- tasks.mon_connection_score

qa/tasks/ceph_test_case.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -353,13 +353,10 @@ def wait_until_true_and_hold(cls, condition, timeout, success_hold_time, check_f
353353
while True:
354354
if condition():
355355
success_time_elapsed = 0
356-
while success_time_elapsed < success_hold_time:
357-
if condition():
358-
success_time_elapsed += 1
359-
time.sleep(1)
360-
elapsed += 1
361-
else:
362-
break
356+
while success_time_elapsed < success_hold_time and condition():
357+
success_time_elapsed += 1
358+
time.sleep(1)
359+
elapsed += 1
363360
if success_time_elapsed == success_hold_time:
364361
log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
365362
return

qa/tasks/mon_connection_score.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from tasks.ceph_test_case import CephTestCase
2+
import json
3+
import logging
4+
log = logging.getLogger(__name__)
5+
6+
7+
class TestStretchClusterNew(CephTestCase):
    """
    Verify that every monitor's ``connection scores dump`` output is
    well-formed JSON and describes a fully connected three-monitor quorum.
    """

    CLUSTER = "ceph"
    # Monitor name -> attributes the test expects the monitor to report.
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        }
    }
    WRITE_PERIOD = 10
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super().setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super().tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.

        For each monitor in ``MONS`` the admin-socket command
        ``connection scores dump`` is run and its JSON output is checked:

        * the dumped ``rank`` matches the rank expected for that monitor;
        * ``reports`` holds one entry per monitor;
        * every peer listed in a report is alive;
        * every report covers each monitor rank exactly once.

        :return: True when every monitor passes every check, False as soon
                 as one check fails (the failure is logged).
        """
        # Loop-invariant: the ranks every report is expected to cover.
        expected_ranks = sorted(data["rank"] for data in self.MONS.values())

        for mon, data in self.MONS.items():
            # get the connection score
            cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            # parse the connection score
            cscore = json.loads(cscore)
            # check if the current mon rank is correct
            if cscore["rank"] != data["rank"]:
                log.error(
                    "Rank mismatch {} != {}".format(
                        cscore["rank"], data["rank"]
                    )
                )
                return False
            # check if current mon have all the peer reports and ourself
            if len(cscore['reports']) != len(self.MONS):
                log.error(
                    "Reports count mismatch {}".format(cscore['reports'])
                )
                return False

            for report in cscore["reports"]:
                report_rank = []
                for peer in report["peer_scores"]:
                    # check if the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer {} is not alive".format(peer))
                        return False
                    report_rank.append(peer["peer_rank"])

                # NOTE(review): a report presumably does not score its own
                # rank (the tracker appears to drop self-reports) -- confirm
                # against ConnectionTracker. Treat the reporter's own rank as
                # implicitly covered so the check holds either way.
                if report["rank"] not in report_rank:
                    report_rank.append(report["rank"])

                # check if current mon has all the ranks and no duplicates.
                # list.sort() returns None, so sorted copies are compared;
                # comparing the .sort() results would always be equal.
                if sorted(report_rank) != expected_ranks:
                    log.error("Rank mismatch in report {}".format(report))
                    return False

        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        """
        Wait for a full quorum, then require the connection scores to become
        clean and stay clean for the hold period.
        """
        # check if all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check if all connection scores reflect this
        self.wait_until_true_and_hold(
            self._check_connection_score,
            # Wait for 4 minutes for the connection score to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # Hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )

src/mon/ConnectionTracker.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -325,13 +325,13 @@ void ConnectionReport::dump(ceph::Formatter *f) const
325325
f->dump_int("rank", rank);
326326
f->dump_int("epoch", epoch);
327327
f->dump_int("version", epoch_version);
328-
f->open_object_section("peer_scores");
328+
f->open_array_section("peer_scores");
329329
for (auto i : history) {
330330
f->open_object_section("peer");
331331
f->dump_int("peer_rank", i.first);
332332
f->dump_float("peer_score", i.second);
333333
f->dump_bool("peer_alive", current.find(i.first)->second);
334-
f->close_section();
334+
f->close_section(); // peer
335335
}
336336
f->close_section(); // peer scores
337337
}
@@ -354,11 +354,11 @@ void ConnectionTracker::dump(ceph::Formatter *f) const
354354
f->dump_int("version", version);
355355
f->dump_float("half_life", half_life);
356356
f->dump_int("persist_interval", persist_interval);
357-
f->open_object_section("reports");
357+
f->open_array_section("reports");
358358
for (const auto& i : peer_reports) {
359359
f->open_object_section("report");
360360
i.second.dump(f);
361-
f->close_section();
361+
f->close_section(); // report
362362
}
363363
f->close_section(); // reports
364364
}

0 commit comments

Comments
 (0)