Skip to content

Commit 8202e72

Browse files
author
shreyanshjain7174
committed
mon: stuck peering since warning is misleading
When OSDs restart or are manually marked down, it is common to see a HEALTH_WARN claiming that PGs have been stuck peering for a while, even though they were active. The warning should be issued only if they really have been stuck peering for longer than 60s. Fixes: https://tracker.ceph.com/issues/51688 Signed-off-by: shreyanshjain7174 <[email protected]>
1 parent 4b343d6 commit 8202e72

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

src/mon/PGMap.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1939,6 +1939,10 @@ void PGMap::get_stuck_stats(
19391939
val = i->second.last_unstale;
19401940
}
19411941

1942+
if ((types & STUCK_PEERING) && (i->second.state & PG_STATE_PEERING)) {
1943+
if (i->second.last_peered < val)
1944+
val = i->second.last_peered;
1945+
}
19421946
// val is now the earliest any of the requested stuck states began
19431947
if (val < cutoff) {
19441948
stuck_pgs[i->first] = i->second;
@@ -1989,6 +1993,8 @@ int PGMap::dump_stuck_pg_stats(
19891993
stuck_types |= PGMap::STUCK_DEGRADED;
19901994
else if (*i == "stale")
19911995
stuck_types |= PGMap::STUCK_STALE;
1996+
else if (*i == "peering")
1997+
stuck_types |= PGMap::STUCK_PEERING;
19921998
else {
19931999
ds << "Unknown type: " << *i << std::endl;
19942000
return -EINVAL;
@@ -3850,6 +3856,33 @@ static void _try_mark_pg_stale(
38503856
newstat->state |= PG_STATE_STALE;
38513857
newstat->last_unstale = ceph_clock_now();
38523858
}
3859+
3860+
if ((cur.state & PG_STATE_PEERING) == 0 &&
3861+
cur.acting_primary != -1 &&
3862+
osdmap.is_down(cur.acting_primary)) {
3863+
pg_stat_t *newstat;
3864+
auto q = pending_inc->pg_stat_updates.find(pgid);
3865+
if (q != pending_inc->pg_stat_updates.end()) {
3866+
if ((q->second.acting_primary == cur.acting_primary) ||
3867+
((q->second.state & PG_STATE_PEERING) == 0 &&
3868+
q->second.acting_primary != -1 &&
3869+
osdmap.is_down(q->second.acting_primary))) {
3870+
newstat = &q->second;
3871+
} else {
3872+
// pending update is no longer down or already stale
3873+
return;
3874+
}
3875+
} else {
3876+
newstat = &pending_inc->pg_stat_updates[pgid];
3877+
*newstat = cur;
3878+
}
3879+
dout(10) << __func__ << " marking pg " << pgid
3880+
<< " stale (acting_primary " << newstat->acting_primary
3881+
<< ")" << dendl;
3882+
newstat->state |= PG_STATE_PEERING;
3883+
newstat->last_peered = ceph_clock_now();
3884+
}
3885+
38533886
}
38543887

38553888
void PGMapUpdater::check_down_pgs(

src/mon/PGMap.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,8 @@ class PGMap : public PGMapDigest {
359359
static const int STUCK_UNDERSIZED = (1<<2);
360360
static const int STUCK_DEGRADED = (1<<3);
361361
static const int STUCK_STALE = (1<<4);
362-
362+
static const int STUCK_PEERING = (1<<5);
363+
363364
PGMap()
364365
: version(0),
365366
last_osdmap_epoch(0), last_pg_scan(0)

0 commit comments

Comments
 (0)