Skip to content

Commit c29d6ff

Browse files
mon/Elector.cc: prevent assertion failure when receiving pings from removed monitors
When a monitor is removed from the cluster, there can be a race condition in which the removed monitor is still running and sending ping messages to other monitors, while those monitors have already updated their monmap and no longer recognize the removed monitor's address. As a result, MonMap::get_rank() returns -1 for the removed monitor's address; that value is then passed to MonMap::get_addrs(unsigned), causing an assertion failure because -1 converted to unsigned becomes UINT_MAX.

Add defensive checks in three places to handle this scenario:
1. In begin_peer_ping(): return early if peer < 0.
2. In send_peer_ping(): check both peer < 0 and peer >= ranks.size().
3. In handle_ping(): drop messages from unknown senders (rank < 0).

This prevents the assertion failure and provides better logging for diagnosing such race conditions.

Fixes: https://tracker.ceph.com/issues/71259
Signed-off-by: chungfengz <[email protected]>
1 parent b69aef5 commit c29d6ff

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

src/mon/Elector.cc

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -453,6 +453,10 @@ void Elector::handle_nak(MonOpRequestRef op)
453453
void Elector::begin_peer_ping(int peer)
454454
{
455455
dout(20) << __func__ << " with " << peer << dendl;
456+
if (peer < 0) {
457+
dout(20) << __func__ << " ignoring negative peer " << peer << dendl;
458+
return;
459+
}
456460
if (live_pinging.count(peer)) {
457461
dout(20) << peer << " already in live_pinging ... return " << dendl;
458462
return;
@@ -492,7 +496,7 @@ void Elector::begin_peer_ping(int peer)
492496
bool Elector::send_peer_ping(int peer, const utime_t *n)
493497
{
494498
dout(10) << __func__ << " to peer " << peer << dendl;
495-
if (peer >= ssize(mon->monmap->ranks)) {
499+
if (peer < 0 || peer >= ssize(mon->monmap->ranks)) {
496500
// Monitor no longer exists in the monmap,
497501
// therefore, we shouldn't ping this monitor
498502
// since we cannot lookup the address!
@@ -609,6 +613,11 @@ void Elector::handle_ping(MonOpRequestRef op)
609613
MMonPing *m = static_cast<MMonPing*>(op->get_req());
610614
int prank = mon->monmap->get_rank(m->get_source_addr());
611615
dout(20) << __func__ << " from: " << prank << dendl;
616+
if (prank < 0) {
617+
dout(5) << __func__ << " from unknown addr " << m->get_source_addr()
618+
<< " mapped to rank " << prank << " (likely removed monitor) - dropping message" << dendl;
619+
return;
620+
}
612621
begin_peer_ping(prank);
613622
assimilate_connection_reports(m->tracker_bl);
614623
switch(m->op) {

0 commit comments

Comments
 (0)