Skip to content

Commit 6c37c2b

Browse files
authored
Merge pull request ceph#63413 from kamoltat/wip-ksirivad-fix-71344
[DNM] ProtocolV2: make handle_existing_connection check for cookie mismatch first
Reviewed-by: Ronen Friedman <[email protected]>
2 parents 6aa6c77 + 52b9429 commit 6c37c2b

File tree

5 files changed

+25
-14
lines changed

5 files changed

+25
-14
lines changed

qa/config/rados.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ overrides:
1111
osd mclock profile: high_recovery_ops
1212
mon:
1313
mon scrub interval: 300
14+
debug mon: 30

qa/suites/rados/monthrash/ceph.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ overrides:
1313
mon osdmap full prune txsize: 2
1414
mon scrub inject crc mismatch: 0.01
1515
mon scrub inject missing keys: 0.05
16+
debug ms: 20
1617
# thrashing monitors may make mgr have trouble w/ its keepalive
1718
log-ignorelist:
1819
- ScrubResult

qa/tasks/mon_thrash.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,11 +354,13 @@ def _do_thrash(self):
354354

355355
if mons_to_freeze:
356356
for mon in mons_to_freeze:
357+
self.log('freezing mon.{m}'.format(m=mon))
357358
self.freeze_mon(mon)
358359
self.log('waiting for {delay} secs to unfreeze mons'.format(
359360
delay=self.freeze_mon_duration))
360361
time.sleep(self.freeze_mon_duration)
361362
for mon in mons_to_freeze:
363+
self.log('unfreezing mon.{m}'.format(m=mon))
362364
self.unfreeze_mon(mon)
363365

364366
if self.maintain_quorum:
@@ -382,15 +384,18 @@ def _do_thrash(self):
382384
self.switch_task()
383385

384386
for mon in mons_to_kill:
387+
self.log('reviving mon.{m}'.format(m=mon))
385388
self.revive_mon(mon)
386389
# do more freezes
387390
if mons_to_freeze:
388391
for mon in mons_to_freeze:
392+
self.log('freezing mon.{m}'.format(m=mon))
389393
self.freeze_mon(mon)
390394
self.log('waiting for {delay} secs to unfreeze mons'.format(
391395
delay=self.freeze_mon_duration))
392396
time.sleep(self.freeze_mon_duration)
393397
for mon in mons_to_freeze:
398+
self.log('unfreezing mon.{m}'.format(m=mon))
394399
self.unfreeze_mon(mon)
395400

396401
self.manager.wait_for_mon_quorum_size(len(mons))

src/msg/async/ProtocolV2.cc

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,7 @@ CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload)
18941894
}
18951895

18961896
CtPtr ProtocolV2::finish_client_auth() {
1897+
ldout(cct, 20) << __func__ << dendl;
18971898
if (HAVE_MSGR2_FEATURE(peer_supported_features, COMPRESSION)) {
18981899
return send_compression_request();
18991900
}
@@ -1902,6 +1903,7 @@ CtPtr ProtocolV2::finish_client_auth() {
19021903
}
19031904

19041905
CtPtr ProtocolV2::finish_server_auth() {
1906+
ldout(cct, 20) << __func__ << dendl;
19051907
// server had sent AuthDone and client responded with correct pre-auth
19061908
// signature.
19071909
// We can start conditioanl msgr protocol
@@ -1918,10 +1920,12 @@ CtPtr ProtocolV2::finish_server_auth() {
19181920

19191921
CtPtr ProtocolV2::start_session_connect() {
19201922
if (!server_cookie) {
1923+
ldout(cct, 20) << __func__ << " starting a new session" << dendl;
19211924
ceph_assert(connect_seq == 0);
19221925
state = SESSION_CONNECTING;
19231926
return send_client_ident();
19241927
} else { // reconnecting to previous session
1928+
ldout(cct, 20) << __func__ << " reconnecting to session" << dendl;
19251929
state = SESSION_RECONNECTING;
19261930
ceph_assert(connect_seq > 0);
19271931
return send_reconnect();
@@ -2671,6 +2675,19 @@ CtPtr ProtocolV2::handle_existing_connection(const AsyncConnectionRef& existing)
26712675
return WRITE(wait, "wait", read_frame);
26722676
}
26732677

2678+
if (exproto->server_cookie && exproto->client_cookie &&
2679+
exproto->client_cookie != client_cookie) {
2680+
// Found previous session
2681+
// peer has reseted and we're going to reuse the existing connection
2682+
// by replacing the communication socket
2683+
ldout(cct, 1) << __func__ << " found previous session existing=" << existing
2684+
<< ", peer must have reseted." << dendl;
2685+
if (connection->policy.resetcheck) {
2686+
exproto->reset_session();
2687+
}
2688+
return reuse_connection(existing, exproto);
2689+
}
2690+
26742691
if (exproto->peer_global_seq > peer_global_seq) {
26752692
ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq="
26762693
<< peer_global_seq
@@ -2693,19 +2710,6 @@ CtPtr ProtocolV2::handle_existing_connection(const AsyncConnectionRef& existing)
26932710
return send_server_ident();
26942711
}
26952712

2696-
if (exproto->server_cookie && exproto->client_cookie &&
2697-
exproto->client_cookie != client_cookie) {
2698-
// Found previous session
2699-
// peer has reseted and we're going to reuse the existing connection
2700-
// by replacing the communication socket
2701-
ldout(cct, 1) << __func__ << " found previous session existing=" << existing
2702-
<< ", peer must have reseted." << dendl;
2703-
if (connection->policy.resetcheck) {
2704-
exproto->reset_session();
2705-
}
2706-
return reuse_connection(existing, exproto);
2707-
}
2708-
27092713
if (exproto->client_cookie == client_cookie) {
27102714
// session establishment interrupted between client_ident and server_ident,
27112715
// continuing...

src/vstart.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1622,7 +1622,7 @@ else
16221622
debug echo "** going verbose **"
16231623
CMONDEBUG='
16241624
debug osd = 20
1625-
debug mon = 20
1625+
debug mon = 30
16261626
debug osd = 20
16271627
debug paxos = 20
16281628
debug auth = 20

0 commit comments

Comments
 (0)