
Commit 3b05ba1

nvmeofgw: fix host issue during redeploy, improve previous redeploy fix
This commit fixes an issue where, during redeploy, hosts might get stuck due to many failovers/failbacks within a very short time frame. It improves the previous redeploy fix, since it allows using a short beacon timeout with no impact on failover time.

Signed-off-by: Leonid Chernin <[email protected]>
1 parent e0958b5 commit 3b05ba1

File tree

4 files changed: +38 −4 lines changed

src/common/options/mon.yaml.in
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwTypes.h

src/common/options/mon.yaml.in

Lines changed: 3 additions & 2 deletions
@@ -77,14 +77,15 @@ options:
   level: advanced
   desc: Period in seconds from last beacon to monitor marking a NVMeoF gateway as
     failed
-  default: 15
+  default: 10
   services:
   - mon
 - name: mon_nvmeofgw_skip_failovers_interval
   type: secs
   level: advanced
   desc: Period in seconds in which no failovers are performed in GW's pool-group
-  default: 12
+    this is equal to max GW redeploy interval
+  default: 16
   services:
   - mon
 - name: mon_nvmeofgw_set_group_id_retry
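Note how the two defaults interact: mon_nvmeofgw_beacon_grace drops from 15s to 10s (faster failover detection), while mon_nvmeofgw_skip_failovers_interval grows from 12s to 16s to cover the maximum per-GW redeploy gap. A minimal sketch of the intended relation, with illustrative local constants rather than actual Ceph code:

#include <cassert>
#include <chrono>

int main() {
  using namespace std::chrono_literals;
  const auto beacon_grace   = 10s;  // mon_nvmeofgw_beacon_grace (new default)
  const auto skip_failovers = 16s;  // mon_nvmeofgw_skip_failovers_interval (new default)
  // The monitor only notices a GW loss beacon_grace after its last beacon,
  // so the skip window must exceed the grace period for the "down then
  // quickly up again" redeploy pattern to be detectable at all.
  assert(skip_failovers > beacon_grace);
  return 0;
}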

src/mon/NVMeofGwMap.cc

Lines changed: 26 additions & 0 deletions
@@ -234,9 +234,34 @@ int NVMeofGwMap::do_delete_gw(
 void NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
     const NvmeGroupKey& group_key, bool &propose_pending)
 {
+  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
   dout(4) << "GW performed the full startup " << gw_id << dendl;
   propose_pending = true;
   increment_gw_epoch( group_key);
+  auto &st = created_gws[group_key][gw_id];
+  const auto skip_failovers_sec = g_conf().get_val<std::chrono::seconds>
+    ("mon_nvmeofgw_skip_failovers_interval");
+  const auto beacon_grace_sec =
+    g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace");
+  /*
+    This is a heuristic meant to identify a "cephadm redeploy" of the nvmeof GWs.
+    We would like to identify that a redeploy is going on, because it helps us
+    prevent redundant failover and failback actions.
+    It is very important to minimize fo/fb during redeploy: during redeploy all
+    GWs go down and come up again, and the amount of fo/fb that could be driven
+    by that is big, which also triggers a lot of changes on the hosts that are
+    nvmeof-connected to the GWs, even up to the point that a host will get stuck.
+    This heuristic assumes that if a GW disappears and shows up again in less
+    than REDEPLOY_TIMEOUT seconds, then a redeploy might have started, so we
+    will do a failover for this GW, but will not do a failover for the next
+    REDEPLOY_TIMEOUT. Then again for the next GW that disappears, and so on.
+    If it works as designed, then regardless of the number of GWs, a redeploy
+    will only cause one fo/fb. */
+  if ((now - (st.last_gw_down_ts - beacon_grace_sec)) < skip_failovers_sec) {
+    skip_failovers_for_group(group_key);
+    dout(4) << "startup: set skip-failovers for group " << gw_id << " group "
+      << group_key << dendl;
+  }
 }

 void NVMeofGwMap::increment_gw_epoch(const NvmeGroupKey& group_key)
@@ -332,6 +357,7 @@ int NVMeofGwMap::process_gw_map_gw_down(
   dout(10) << "GW down " << gw_id << dendl;
   auto& st = gw_state->second;
   st.set_unavailable_state();
+  st.set_last_gw_down_ts();
   for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
     fsm_handle_gw_down(
       gw_id, group_key, state_itr.second,
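Restated outside the monitor code, the condition above compensates for detection lag: the GW actually stopped beaconing roughly beacon_grace before the monitor marked it down, so the check subtracts the grace period before comparing against the skip window. A self-contained sketch with simplified types (looks_like_redeploy is a hypothetical name, not a Ceph function):

#include <chrono>

using Clock = std::chrono::system_clock;

// Mirrors the check in gw_performed_startup(): estimate when the GW really
// went down, then test whether it came back inside the skip window.
bool looks_like_redeploy(Clock::time_point now,
                         Clock::time_point last_gw_down_ts,
                         std::chrono::seconds beacon_grace,
                         std::chrono::seconds skip_window) {
  const auto estimated_real_down = last_gw_down_ts - beacon_grace;
  return (now - estimated_real_down) < skip_window;
}

When the predicate holds, skip_failovers_for_group() suppresses failovers for the whole pool-group, so the remaining GWs of a redeploy can bounce without each one triggering another fo/fb.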

src/mon/NVMeofGwMon.cc

Lines changed: 4 additions & 2 deletions
@@ -158,7 +158,7 @@ version_t NVMeofGwMon::get_trim_to() const
 * function called during new paxos epochs
 * function called to restore in pending map all data that is not serialized
 * to paxos peons. Otherwise it would be overridden in "pending_map = map"
-* currently just "allow_failovers_ts" variable is restored
+* currently the "allow_failovers_ts" and "last_gw_down_ts" variables are restored
 */
 void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
   std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -173,6 +173,8 @@ void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
       pending_map.created_gws[group_key][gw_id].allow_failovers_ts =
         gw_created_pair.second.allow_failovers_ts;
       }
+      pending_map.created_gws[group_key][gw_id].last_gw_down_ts =
+        gw_created_pair.second.last_gw_down_ts;
     }
   }
 }
@@ -671,7 +673,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
         << gw_id << dendl;
       process_gw_down(gw_id, group_key, gw_propose, avail);
       pending_map.skip_failovers_for_group(group_key);
-      dout(4) << "set skip-failovers for gw's group " << gw_id << " group "
+      dout(4) << "fast_reboot: set skip-failovers for group " << gw_id << " group "
         << group_key << dendl;
     } else if (
       pending_map.created_gws[group_key][gw_id].performed_full_startup ==
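The restore step exists because these timestamps are kept only in monitor memory: they are not serialized into the paxos map, so the assignment "pending_map = map" would otherwise reset them. A simplified sketch of the pattern, using hypothetical stand-in types rather than the actual Ceph structures:

#include <chrono>
#include <map>
#include <string>

struct GwState {  // stand-in for the non-serialized fields of NvmeGwMonState
  std::chrono::system_clock::time_point allow_failovers_ts;
  std::chrono::system_clock::time_point last_gw_down_ts;
};

// Copy non-serialized fields from the previous in-memory map into the
// freshly decoded pending map so they survive the paxos epoch change.
void restore_pending(std::map<std::string, GwState>& pending,
                     const std::map<std::string, GwState>& previous) {
  for (const auto& [gw_id, old_state] : previous) {
    auto it = pending.find(gw_id);
    if (it == pending.end()) continue;  // GW deleted in the meantime
    it->second.allow_failovers_ts = old_state.allow_failovers_ts;
    it->second.last_gw_down_ts    = old_state.last_gw_down_ts;
  }
}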

src/mon/NVMeofGwTypes.h

Lines changed: 5 additions & 0 deletions
@@ -153,6 +153,8 @@ struct NvmeGwMonState {
   */
   std::chrono::system_clock::time_point allow_failovers_ts =
     std::chrono::system_clock::now();
+  std::chrono::system_clock::time_point last_gw_down_ts =
+    std::chrono::system_clock::now() - std::chrono::seconds(30);
   NvmeGwMonState(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {}

   NvmeGwMonState(NvmeAnaGrpId id)
@@ -174,6 +176,9 @@ struct NvmeGwMonState {
     sm_state[grpid] = gw_states_per_group_t::GW_ACTIVE_STATE;
     blocklist_data[grpid].osd_epoch = 0;
   }
+  void set_last_gw_down_ts() {
+    last_gw_down_ts = std::chrono::system_clock::now();
+  }
 };

 struct NqnState {
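The default initializer places last_gw_down_ts 30 seconds in the past, which reads as a guard: a freshly created GW entry must not look like it just went down, or its first startup would falsely trip the redeploy heuristic. A quick check of that reading against the new defaults (a sketch, not Ceph code):

#include <chrono>
#include <iostream>

int main() {
  using namespace std::chrono;
  using namespace std::chrono_literals;
  const auto now = system_clock::now();
  const auto last_gw_down_ts = now - 30s;  // NvmeGwMonState default
  const auto beacon_grace = 10s;           // mon_nvmeofgw_beacon_grace
  const auto skip_window  = 16s;           // mon_nvmeofgw_skip_failovers_interval
  // The same condition gw_performed_startup() evaluates:
  const bool skip = (now - (last_gw_down_ts - beacon_grace)) < skip_window;
  std::cout << std::boolalpha << skip << '\n';  // false: 40s is not < 16s
  return 0;
}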
