Skip to content

Commit a0368dc

Browse files
authored
Merge pull request ceph#61127 from VallariAg/wip-nvmeof-delete-healthcheck
mon/NVMeofGwMap: add healthcheck warning NVMEOF_GATEWAY_DELETING
2 parents b84d9fe + 56cf512 commit a0368dc

File tree

5 files changed

+60
-2
lines changed

5 files changed

+60
-2
lines changed

doc/rados/operations/health-checks.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1665,6 +1665,14 @@ Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has
16651665
crashed, the daemon log file (found at ``/var/log/ceph/``) may contain
16661666
troubleshooting information.
16671667

1668+
NVMEOF_GATEWAY_DELETING
1669+
_______________________
1670+
1671+
Some of the gateways are in the GW_DELETING state. They will stay in this
1672+
state until all the namespaces under the gateway's load balancing group are
1673+
moved to another load balancing group ID. This is done automatically by the
1674+
load balancing process. If this alert persists for a long time, there might
1675+
be an issue with that process.
16681676

16691677
Miscellaneous
16701678
-------------

src/common/options/mon.yaml.in

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,13 @@ options:
9191
default: 1000
9292
services:
9393
- mon
94+
- name: mon_nvmeofgw_delete_grace
95+
type: secs
96+
level: advanced
97+
desc: Issue NVMEOF_GATEWAY_DELETING health warning after this amount of time has elapsed
98+
default: 15_min
99+
services:
100+
- mon
94101
- name: mon_mgr_inactive_grace
95102
type: int
96103
level: advanced

src/mon/NVMeofGwMap.cc

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ int NVMeofGwMap::cfg_delete_gw(
171171
<< state.availability << " Resulting GW availability: "
172172
<< state.availability << dendl;
173173
state.subsystems.clear();//ignore subsystems of this GW
174+
utime_t now = ceph_clock_now();
175+
mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now;
174176
return 0;
175177
}
176178
}
@@ -895,10 +897,12 @@ struct CMonRequestProposal : public Context {
895897
}
896898
};
897899

898-
void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
900+
void NVMeofGwMap::get_health_checks(health_check_map_t *checks)
899901
{
900902
list<string> singleGatewayDetail;
901903
list<string> gatewayDownDetail;
904+
list<string> gatewayInDeletingDetail;
905+
int deleting_gateways = 0;
902906
for (const auto& created_map_pair: created_gws) {
903907
const auto& group_key = created_map_pair.first;
904908
auto& group = group_key.second;
@@ -915,9 +919,37 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
915919
ostringstream ss;
916920
ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ;
917921
gatewayDownDetail.push_back(ss.str());
922+
} else if (gw_created.availability == gw_availability_t::GW_DELETING) {
923+
deleting_gateways++;
924+
utime_t now = ceph_clock_now();
925+
bool found_deleting_time = false;
926+
auto gws_deleting_time = mon->nvmegwmon()->gws_deleting_time;
927+
auto group_it = gws_deleting_time.find(group_key);
928+
if (group_it != gws_deleting_time.end()) {
929+
auto& gw_map = group_it->second;
930+
auto gw_it = gw_map.find(gw_id);
931+
if (gw_it != gw_map.end()) {
932+
found_deleting_time = true;
933+
utime_t delete_time = gw_it->second;
934+
if ((now - delete_time) > g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_delete_grace").count()) {
935+
ostringstream ss;
936+
ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state.";
937+
gatewayInDeletingDetail.push_back(ss.str());
938+
}
939+
}
940+
}
941+
if (!found_deleting_time) {
942+
// DELETING gateway not found in gws_deleting_time, set timeout now
943+
mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now;
944+
}
918945
}
919946
}
920947
}
948+
if (deleting_gateways == 0) {
949+
// no gateway in GW_DELETING state currently, flush old gws_deleting_time
950+
mon->nvmegwmon()->gws_deleting_time.clear();
951+
}
952+
921953
if (!singleGatewayDetail.empty()) {
922954
ostringstream ss;
923955
ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway"
@@ -934,6 +966,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
934966
ss.str(), gatewayDownDetail.size());
935967
d.detail.swap(gatewayDownDetail);
936968
}
969+
if (!gatewayInDeletingDetail.empty()) {
970+
ostringstream ss;
971+
ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state"
972+
<< "; namespaces are automatically balanced across remaining gateways, "
973+
<< "this should take a few minutes.";
974+
auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN,
975+
ss.str(), gatewayInDeletingDetail.size());
976+
d.detail.swap(gatewayInDeletingDetail);
977+
}
937978
}
938979

939980
int NVMeofGwMap::blocklist_gw(

src/mon/NVMeofGwMap.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ class NVMeofGwMap
144144
DECODE_FINISH(bl);
145145
}
146146

147-
void get_health_checks(health_check_map_t *checks) const;
147+
void get_health_checks(health_check_map_t *checks);
148148
};
149149

150150
#include "NVMeofGwSerialize.h"

src/mon/NVMeofGwMon.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ class NVMeofGwMon: public PaxosService,
8282
void check_subs(bool type);
8383
void check_sub(Subscription *sub);
8484

85+
std::map<NvmeGroupKey, std::map<NvmeGwId, utime_t>> gws_deleting_time;
86+
8587
private:
8688
void synchronize_last_beacon();
8789
void process_gw_down(const NvmeGwId &gw_id,

0 commit comments

Comments
 (0)