Skip to content

Commit eadcb8c

Browse files
committed
mon: Track and process pending pings after election
Problem: Monitors stop pinging each other when quorum_mon_feature flag is empty. This happens when the monitor freshly starts up and never formed a quorum before, or when you restart the monitors in the cluster. Basically, Monitor startups. This problem can easily be reproduced everytime. Steps to reproduce: 1. Start 3 MONs with `connectivity` election strategy 2. Fail 1 mon. 3. Restart all the monitors (including the down monitor) 4. Observe that the connection scores of each monitor will tell you that not all monitors are alive. Which is not true because all 3 Monitors are in quorum. What happened was during monitor startups, quorum_mon_feature is empty and although they all participated in the election, when they hit the function begin_peer_ping, some monitors if not all will not send ping because of the emptry quorum_mon_feature flag. Therefore, after the election the monitors will have the wrong connection score. However, this will get resolved in the next election because now that quorum_mon_feature is populated they will start pinging each other again, hence, correct connectivity score. Solution: In begin_peer_ping, instead of just returning out of the function when quorum_mon_feature is empty, we keep track of the peers that we should ping once the election is finished. In Monitor::win_election and Monitor::lose_elections, we process the pending pings by calling begin_peer_ping on each of the peers (both peons and leader) Additionally: Improved loggings in Elector class such that debugging the pinging process gets easier. Fixes: https://tracker.ceph.com/issues/70587 Signed-off-by: Kamoltat Sirivadhna <[email protected]> sd Signed-off-by: Kamoltat Sirivadhna <[email protected]>
1 parent 54f7236 commit eadcb8c

File tree

3 files changed

+74
-16
lines changed

3 files changed

+74
-16
lines changed

src/mon/Elector.cc

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -453,22 +453,37 @@ void Elector::handle_nak(MonOpRequestRef op)
453453

454454
void Elector::begin_peer_ping(int peer)
455455
{
456-
dout(20) << __func__ << " against " << peer << dendl;
456+
dout(20) << __func__ << " with " << peer << dendl;
457457
if (live_pinging.count(peer)) {
458458
dout(20) << peer << " already in live_pinging ... return " << dendl;
459459
return;
460460
}
461-
462-
if (!mon->get_quorum_mon_features().contains_all(
461+
// Check if quorum feature is not set and we are in
462+
// STATE_INIT, STATE_PROBING, STATE_SYNCHRONIZING or STATE_ELECTING
463+
if (mon->get_quorum_mon_features().empty() &&
464+
(!mon->is_leader() && !mon->is_peon() && !mon->is_shutdown())) {
465+
dout(10) << "quorum mon feature is not yet set, "
466+
<< " we might need to wait until we form a quorum"
467+
<< dendl;
468+
pending_pings.insert(peer);
469+
return;
470+
} else if (!mon->get_quorum_mon_features().contains_all(
463471
ceph::features::mon::FEATURE_PINGING)) {
472+
dout(10) << "mon quorum does not support pinging .. return" << dendl;
464473
return;
465474
}
466475

476+
pending_pings.erase(peer);
467477
peer_tracker.report_live_connection(peer, 0); // init this peer as existing
468478
live_pinging.insert(peer);
469479
dead_pinging.erase(peer);
470480
peer_acked_ping[peer] = ceph_clock_now();
471-
if (!send_peer_ping(peer)) return;
481+
if (!send_peer_ping(peer)) {
482+
dout(20) << "send_peer_ping failed ..."
483+
<< " no need to schedule ping_check" << dendl;
484+
return;
485+
}
486+
dout(30) << "schedule ping_check against peer: " << peer << dendl;
472487
mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
473488
new C_MonContext{mon, [this, peer](int) {
474489
ping_check(peer);
@@ -497,32 +512,55 @@ bool Elector::send_peer_ping(int peer, const utime_t *n)
497512
MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl());
498513
mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer));
499514
peer_sent_ping[peer] = now;
515+
dout(20) << " sent ping successfully to peer: " << peer << dendl;
500516
return true;
501517
}
502518

519+
void Elector::process_pending_pings()
520+
{
521+
dout(10) << __func__ << " processing "
522+
<< pending_pings.size() << " pending pings" << dendl;
523+
524+
// Make a copy since begin_peer_ping will modify the set
525+
std::set<int> peers_to_ping = pending_pings;
526+
for (int peer : peers_to_ping) {
527+
begin_peer_ping(peer);
528+
}
529+
}
530+
503531
void Elector::ping_check(int peer)
504532
{
505-
dout(20) << __func__ << " to peer " << peer << dendl;
533+
dout(20) << __func__ << "ing peer " << peer << dendl;
506534

507535
if (!live_pinging.count(peer) &&
508536
!dead_pinging.count(peer)) {
509-
dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl;
537+
dout(20) << peer << " is no longer marked for pinging ... return" << dendl;
510538
return;
511539
}
512540
utime_t now = ceph_clock_now();
513541
utime_t& acked_ping = peer_acked_ping[peer];
514542
utime_t& newest_ping = peer_sent_ping[peer];
515543
if (!acked_ping.is_zero() && acked_ping < now - ping_timeout) {
544+
dout(20) << "peer " << peer << " has not acked a ping in "
545+
<< now - acked_ping << " seconds" << dendl;
516546
peer_tracker.report_dead_connection(peer, now - acked_ping);
517547
acked_ping = now;
518548
begin_dead_ping(peer);
519549
return;
520550
}
521551

522552
if (acked_ping == newest_ping) {
523-
if (!send_peer_ping(peer, &now)) return;
553+
dout(20) << "peer " << peer
554+
<< " has not acked the newest ping"
555+
<< " .. sending another ping" << dendl;
556+
if (!send_peer_ping(peer, &now)) {
557+
dout(20) << "send_peer_ping failed ..."
558+
<< " no need to schedule " << __func__ << dendl;
559+
return;
560+
}
524561
}
525562

563+
dout(30) << "schedule " << __func__ << " against peer: "<< peer << dendl;
526564
mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
527565
new C_MonContext{mon, [this, peer](int) {
528566
ping_check(peer);
@@ -533,11 +571,13 @@ void Elector::begin_dead_ping(int peer)
533571
{
534572
dout(20) << __func__ << " to peer " << peer << dendl;
535573
if (dead_pinging.count(peer)) {
574+
dout(20) << peer << " already in dead_pinging ... return" << dendl;
536575
return;
537576
}
538577

539578
live_pinging.erase(peer);
540579
dead_pinging.insert(peer);
580+
dout(30) << "schedule dead_ping against peer: " << peer << dendl;
541581
mon->timer.add_event_after(ping_timeout,
542582
new C_MonContext{mon, [this, peer](int) {
543583
dead_ping(peer);
@@ -558,6 +598,7 @@ void Elector::dead_ping(int peer)
558598

559599
peer_tracker.report_dead_connection(peer, now - acked_ping);
560600
acked_ping = now;
601+
dout(30) << "schedule " << __func__ << " against peer: " << peer << dendl;
561602
mon->timer.add_event_after(ping_timeout,
562603
new C_MonContext{mon, [this, peer](int) {
563604
dead_ping(peer);
@@ -574,13 +615,15 @@ void Elector::handle_ping(MonOpRequestRef op)
574615
switch(m->op) {
575616
case MMonPing::PING:
576617
{
618+
dout(30) << "recieved PING from "
619+
<< prank << ", sending PING_REPLY back!" << dendl;
577620
MMonPing *reply = new MMonPing(MMonPing::PING_REPLY, m->stamp, peer_tracker.get_encoded_bl());
578621
m->get_connection()->send_message(reply);
579622
}
580623
break;
581624

582625
case MMonPing::PING_REPLY:
583-
626+
dout(30) << "recieved PING_REPLY from " << prank << dendl;
584627
const utime_t& previous_acked = peer_acked_ping[prank];
585628
const utime_t& newest = peer_sent_ping[prank];
586629

@@ -591,17 +634,23 @@ void Elector::handle_ping(MonOpRequestRef op)
591634
}
592635

593636
if (m->stamp > previous_acked) {
594-
dout(20) << "m->stamp > previous_acked" << dendl;
637+
dout(30) << "recieved good PING_REPLY!" << dendl;
595638
peer_tracker.report_live_connection(prank, m->stamp - previous_acked);
596639
peer_acked_ping[prank] = m->stamp;
597-
} else{
598-
dout(20) << "m->stamp <= previous_acked .. we don't report_live_connection" << dendl;
640+
} else {
641+
dout(30) << "recieved bad PING_REPLY! it's the same or older "
642+
<< "than the most recent ack we got." << dendl;
599643
}
600644
utime_t now = ceph_clock_now();
601-
dout(30) << "now: " << now << " m->stamp: " << m->stamp << " ping_timeout: "
602-
<< ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl;
603645
if (now - m->stamp > ping_timeout / PING_DIVISOR) {
604-
if (!send_peer_ping(prank, &now)) return;
646+
dout(30) << "peer " << prank << " has not acked a ping in "
647+
<< now - m->stamp << " seconds, which is more than the "
648+
<< ping_timeout / PING_DIVISOR << " seconds limit."
649+
<< " Sending another ping ..." << dendl;
650+
if (!send_peer_ping(prank, &now)) {
651+
dout(10) << "send_peer_ping failed ..." << dendl;
652+
return;
653+
}
605654
}
606655
break;
607656
}

src/mon/Elector.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ class Elector : public ElectionOwner, RankProvider {
3939
* @defgroup Elector_h_class Elector
4040
* @{
4141
*/
42+
43+
private:
44+
std::set<int> pending_pings; // Monitors waiting for quorum features to be established
45+
4246
ElectionLogic logic;
4347
// connectivity validation and scoring
4448
ConnectionTracker peer_tracker;
@@ -406,6 +410,11 @@ class Elector : public ElectionOwner, RankProvider {
406410
disallowed_leaders = dl;
407411
return true;
408412
}
413+
/**
414+
* process all pending pings when quorum is established
415+
*
416+
*/
417+
void process_pending_pings();
409418
void dump_connection_scores(Formatter *f) {
410419
f->open_object_section("connection scores");
411420
peer_tracker.dump(f);

src/mon/Monitor.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2296,7 +2296,7 @@ void Monitor::win_election(epoch_t epoch, const set<int>& active, uint64_t featu
22962296
encode(m, bl);
22972297
t->put(MONITOR_STORE_PREFIX, "last_metadata", bl);
22982298
}
2299-
2299+
elector.process_pending_pings();
23002300
finish_election();
23012301
if (monmap->size() > 1 &&
23022302
monmap->get_epoch() > 0) {
@@ -2349,7 +2349,7 @@ void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
23492349
_finish_svc_election();
23502350

23512351
logger->inc(l_mon_election_lose);
2352-
2352+
elector.process_pending_pings();
23532353
finish_election();
23542354
}
23552355

0 commit comments

Comments
 (0)