Skip to content

Commit 5f9fa43

Browse files
authored
Merge pull request ceph#55103 from kamoltat/wip-ksirivad-fix-63861
src/mon/OSDMonitor.cc: [Stretch Mode] WRN non-existent CRUSH location assigned to MON Reviewed-by: Ronen Friedman <[email protected]> Reviewed-by: Greg Farnum <[email protected]> Reviewed-by: Anthony D'Atri <[email protected]>
2 parents 6f43298 + 97b815c commit 5f9fa43

File tree

4 files changed

+51
-1
lines changed

4 files changed

+51
-1
lines changed

doc/rados/operations/health-checks.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1645,6 +1645,19 @@ We encourage you to fix this by making the weights even on both dividing buckets
16451645
This can be done by making sure the combined weight of the OSDs on each dividing
16461646
bucket is the same.
16471647

1648+
NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE
1649+
______________________________________
1650+
1651+
The CRUSH location specified for the monitor must belong to one of the dividing
1652+
buckets when stretch mode is enabled. The ``tiebreaker`` monitor is the
1653+
only exception.
1654+
1655+
This warning indicates that one or more monitors have a CRUSH location that does
1656+
not belong to any of the dividing buckets in stretch mode.
1657+
1658+
We encourage you to fix this by making sure the CRUSH location of the monitor
1659+
belongs to one of the dividing buckets.
1660+
16481661
NVMeoF Gateway
16491662
--------------
16501663

src/mon/HealthMonitor.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "mon/Monitor.h"
2727
#include "mon/HealthMonitor.h"
28+
#include "mon/OSDMonitor.h"
2829

2930
#include "messages/MMonHealthChecks.h"
3031

@@ -740,6 +741,8 @@ bool HealthMonitor::check_leader_health()
740741
if (g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled")) {
741742
check_if_msgr2_enabled(&next);
742743
}
744+
// STRETCH MODE
745+
check_mon_crush_loc_stretch_mode(&next);
743746

744747
if (next != leader_checks) {
745748
changed = true;
@@ -885,3 +888,33 @@ void HealthMonitor::check_if_msgr2_enabled(health_check_map_t *checks)
885888
}
886889
}
887890
}
891+
892+
/**
 * Raise NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE when stretch mode is enabled
 * and a non-tiebreaker monitor carries a CRUSH location whose name is not
 * present in the CRUSH map of the current OSDMap.
 *
 * One detail line is emitted per nonexistent location, while the summary and
 * the check's count report the number of affected monitors, so a monitor with
 * several bad locations is not counted more than once.
 *
 * @param checks health check map that receives the warning; it is left
 *               untouched when stretch mode is disabled or when every
 *               location name exists.
 */
void HealthMonitor::check_mon_crush_loc_stretch_mode(health_check_map_t *checks)
{
  // Nothing to verify unless stretch mode is enabled.
  if (!mon.monmap->stretch_mode_enabled) {
    return;
  }
  // The CRUSH map does not change while we iterate; look it up once.
  const auto& crush = mon.osdmon()->osdmap.crush;
  list<string> details;
  size_t affected_mons = 0;
  for (const auto& i : mon.monmap->mon_info) {
    // Skip the tiebreaker monitor
    if (i.second.name == mon.monmap->tiebreaker_mon) {
      continue;
    }
    bool has_missing_loc = false;
    for (const auto& pair : i.second.crush_loc) {
      if (!crush->name_exists(pair.second)) {
        ostringstream ds;
        // Name the monitor so the operator knows which one to fix.
        ds << "mon." << i.second.name << " CRUSH location "
           << pair.second << " does not exist";
        details.push_back(ds.str());
        has_missing_loc = true;
      }
    }
    if (has_missing_loc) {
      ++affected_mons;
    }
  }
  // WARN in ceph -s if the CRUSH location does not exist
  if (!details.empty()) {
    ostringstream ss;
    ss << affected_mons << " monitor(s) have nonexistent CRUSH location";
    auto &d = checks->add("NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE", HEALTH_WARN, ss.str(),
                          affected_mons);
    d.detail.swap(details);
  }
}

src/mon/HealthMonitor.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class HealthMonitor : public PaxosService
6666
void check_for_older_version(health_check_map_t *checks);
6767
void check_for_mon_down(health_check_map_t *checks);
6868
void check_for_clock_skew(health_check_map_t *checks);
69+
void check_mon_crush_loc_stretch_mode(health_check_map_t *checks);
6970
void check_if_msgr2_enabled(health_check_map_t *checks);
7071
bool check_leader_health();
7172
bool check_member_health();

src/mon/OSDMonitor.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15270,7 +15270,10 @@ bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_
1527015270
bool really_down = false;
1527115271
for (auto dbi : dead_buckets) {
1527215272
const string& bucket_name = dbi.first;
15273-
ceph_assert(osdmap.crush->name_exists(bucket_name));
15273+
if (!osdmap.crush->name_exists(bucket_name)) {
15274+
dout(10) << "CRUSH bucket " << bucket_name << " does not exist" << dendl;
15275+
continue;
15276+
}
1527415277
int bucket_id = osdmap.crush->get_item_id(bucket_name);
1527515278
dout(20) << "Checking " << bucket_name << " id " << bucket_id
1527615279
<< " to see if OSDs are also down" << dendl;

0 commit comments

Comments
 (0)