Skip to content

Commit 720590f

Browse files
Merge pull request ceph#62205 from leonidc/wip-leonidc-redeploy-fix
nvmeofgw* :do not allow failover for gws during redeploy (fast-reboot)
2 parents 49225d3 + abdb740 commit 720590f

File tree

6 files changed

+87
-8
lines changed

6 files changed

+87
-8
lines changed

src/common/options/mon.yaml.in

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,14 @@ options:
7777
level: advanced
7878
desc: Period in seconds from last beacon to monitor marking a NVMeoF gateway as
7979
failed
80-
default: 10
80+
default: 15
81+
services:
82+
- mon
83+
- name: mon_nvmeofgw_skip_failovers_interval
84+
type: secs
85+
level: advanced
86+
desc: Period in seconds in which no failovers are performed in GW's pool-group
87+
default: 12
8188
services:
8289
- mon
8390
- name: mon_nvmeofgw_set_group_id_retry

src/mon/NVMeofGwMap.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,16 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
285285
}
286286
}
287287

288+
void NVMeofGwMap::skip_failovers_for_group(const NvmeGroupKey& group_key)
289+
{
290+
const auto skip_failovers = g_conf().get_val<std::chrono::seconds>
291+
("mon_nvmeofgw_skip_failovers_interval");
292+
for (auto& gw_created: created_gws[group_key]) {
293+
gw_created.second.allow_failovers_ts = std::chrono::system_clock::now()
294+
+ skip_failovers;
295+
}
296+
}
297+
288298
int NVMeofGwMap::process_gw_map_gw_no_subsys_no_listeners(
289299
const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending)
290300
{
@@ -523,9 +533,14 @@ void NVMeofGwMap::find_failover_candidate(
523533
#define MIN_NUM_ANA_GROUPS 0xFFF
524534
int min_num_ana_groups_in_gw = 0;
525535
int current_ana_groups_in_gw = 0;
536+
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
526537
NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID;
527538
auto& gws_states = created_gws[group_key];
528539
auto gw_state = gws_states.find(gw_id);
540+
if (gw_state->second.allow_failovers_ts > now) {
541+
dout(4) << "gw " << gw_id << " skip-failovers is set " << dendl;
542+
return;
543+
}
529544

530545
// this GW may handle several ANA groups and for each
531546
// of them need to found the candidate GW

src/mon/NVMeofGwMap.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ class NVMeofGwMap
8787
const NvmeGroupKey& group_key, bool &map_modified);
8888
void gw_performed_startup(const NvmeGwId &gw_id,
8989
const NvmeGroupKey& group_key, bool &propose_pending);
90+
void skip_failovers_for_group(const NvmeGroupKey& group_key);
9091
private:
9192
int do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
9293
int do_erase_gw_id(const NvmeGwId &gw_id,

src/mon/NVMeofGwMon.cc

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,35 @@ version_t NVMeofGwMon::get_trim_to() const
153153
return 0;
154154
}
155155

156+
/**
157+
* restore_pending_map_info
158+
* function called during new paxos epochs
159+
* restores into the pending map all data that is not serialized
160+
* to paxos peons. Otherwise it would be overridden in "pending_map = map"
161+
* currently just "allow_failovers_ts" variable is restored
162+
*/
163+
void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
164+
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
165+
for (auto& created_map_pair: tmp_map.created_gws) {
166+
auto group_key = created_map_pair.first;
167+
NvmeGwMonStates& gw_created_map = created_map_pair.second;
168+
for (auto& gw_created_pair: gw_created_map) {
169+
auto gw_id = gw_created_pair.first;
170+
if (gw_created_pair.second.allow_failovers_ts > now) {
171+
// restore not persistent information upon new epochs
172+
dout(10) << " restore skip-failovers timeout for gw " << gw_id << dendl;
173+
pending_map.created_gws[group_key][gw_id].allow_failovers_ts =
174+
gw_created_pair.second.allow_failovers_ts;
175+
}
176+
}
177+
}
178+
}
179+
156180
void NVMeofGwMon::create_pending()
157181
{
182+
NVMeofGwMap tmp_map = pending_map;
158183
pending_map = map;// deep copy of the object
184+
restore_pending_map_info(tmp_map);
159185
pending_map.epoch++;
160186
dout(10) << " pending " << pending_map << dendl;
161187
}
@@ -641,11 +667,12 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
641667
if (pending_map.created_gws[group_key][gw_id].availability ==
642668
gw_availability_t::GW_AVAILABLE) {
643669
dout(1) << " Warning :GW marked as Available in the NVmeofGwMon "
644-
<< "database, performed full startup - Apply GW!"
670+
<< "database, performed full startup - Apply it but don't allow failover!"
645671
<< gw_id << dendl;
646672
process_gw_down(gw_id, group_key, gw_propose, avail);
647-
LastBeacon lb = {gw_id, group_key};
648-
last_beacon[lb] = now; //Update last beacon
673+
pending_map.skip_failovers_for_group(group_key);
674+
dout(4) << "set skip-failovers for gw's group " << gw_id << " group "
675+
<< group_key << dendl;
649676
} else if (
650677
pending_map.created_gws[group_key][gw_id].performed_full_startup ==
651678
false) {
@@ -654,6 +681,8 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
654681
pending_map.created_gws[group_key][gw_id].addr_vect =
655682
entity_addrvec_t(con->get_peer_addr());
656683
}
684+
LastBeacon lb = {gw_id, group_key};
685+
last_beacon[lb] = now; //Update last beacon
657686
goto set_propose;
658687
}
659688
// gw already created
@@ -774,15 +803,19 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
774803
% beacons_till_ack) == 0))|| (!apply_ack_logic) ) {
775804
send_ack = true;
776805
if (apply_ack_logic) {
777-
dout(10) << "ack sent: beacon index "
806+
dout(20) << "ack sent: beacon index "
778807
<< pending_map.created_gws[group_key][gw_id].beacon_index
779808
<< " gw " << gw_id <<dendl;
780809
}
781810
}
782811
if (send_ack && ((!gw_propose && epoch_filter_enabled) ||
783-
(propose && !epoch_filter_enabled)) ) {
784-
// if epoch-filter-bit: send ack to beacon in case no propose
785-
//or if changed something not relevant to gw-epoch
812+
(propose && !epoch_filter_enabled) ||
813+
(avail == gw_availability_t::GW_CREATED)) ) {
814+
/* always send beacon ack to gw in Created state,
815+
* it should be temporary state
816+
* if epoch-filter-bit: send ack to beacon in case no propose
817+
* or if changed something not relevant to gw-epoch
818+
*/
786819
if (gw_created) {
787820
// respond with a map slice correspondent to the same GW
788821
ack_map.created_gws[group_key][gw_id] = map.created_gws[group_key][gw_id];

src/mon/NVMeofGwMon.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ class NVMeofGwMon: public PaxosService,
9898
NvmeGwId &gw_id, NvmeGroupKey& group_key);
9999
epoch_t get_ack_map_epoch(bool gw_created, const NvmeGroupKey& group_key);
100100
void recreate_gw_epoch();
101+
void restore_pending_map_info(NVMeofGwMap & tmp_map);
101102
};
102103

103104
#endif /* MON_NVMEGWMONITOR_H_ */

src/mon/NVMeofGwTypes.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,28 @@ struct NvmeGwMonState {
131131
//ceph entity address allocated for the GW-client that represents this GW-id
132132
entity_addrvec_t addr_vect;
133133
uint16_t beacon_index = 0;
134+
/**
135+
* during a redeploy action, and maybe other emergency use-cases, the gw performs a scenario
136+
* that we call fast-reboot. It quickly reboots (e.g. due to redeploy) and sends the
137+
* first beacon to the monitor in "Created" state while according to the monitor FSM it
138+
* still appears "Available".
139+
* This loss of synchronization with the GW is detected by the monitor. After fast reboot, the monitor
140+
* still considers this GW as not eligible for owning any ANA group until it becomes
141+
* in Available state (sends the next beacon that includes the subsystems information).
142+
* In this specific fast-reboot case, we prefer to avoid failing over the ANA groups
143+
* that were owned by this GW for a short time frame, assuming that this GW will be
144+
* in Available state in a few seconds. Doing too many failovers and failbacks
145+
* in a very short time frame, on many GWs, is causing a lot of pain to the
146+
* initiators, up to the point that they might get stuck.
147+
*
148+
* So it was decided to add a new GW timeout variable "allow_failovers_ts" - no failovers
149+
* are performed to GW's pool-group during 12 seconds from the time it is set.
150+
* This variable is not persistent - not serialized to peers, so we need to prevent
151+
* it from being overridden by new epochs in monitor's function create_pending -
152+
* function restore_pending_map_info is called for this purpose
153+
*/
154+
std::chrono::system_clock::time_point allow_failovers_ts =
155+
std::chrono::system_clock::now();
134156
NvmeGwMonState(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {}
135157

136158
NvmeGwMonState(NvmeAnaGrpId id)

0 commit comments

Comments
 (0)