Skip to content

Commit 8d402e3

Browse files
committed
mon/NVMeofGw*: fixing bugs - handle gw fast-reboot, proper handle of gw delete scenarios
Signed-off-by: Leonid Chernin <[email protected]>
1 parent 7d2cade commit 8d402e3

File tree

4 files changed

+188
-6
lines changed

4 files changed

+188
-6
lines changed

NVMeofGwMap.h

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2+
// vim: ts=8 sw=2 smarttab
3+
/*
4+
* Ceph - scalable distributed file system
5+
*
6+
* Copyright (C) 2023 IBM, Inc.
7+
*
8+
* This is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public
10+
* License version 2.1, as published by the Free Software
11+
* Foundation. See file COPYING.
12+
*/
13+
14+
#ifndef MON_NVMEOFGWMAP_H_
15+
#define MON_NVMEOFGWMAP_H_
16+
#include <map>
17+
#include <iostream>
18+
#include "include/encoding.h"
19+
#include "include/utime.h"
20+
#include "common/Formatter.h"
21+
#include "common/ceph_releases.h"
22+
#include "common/version.h"
23+
#include "common/options.h"
24+
#include "common/Clock.h"
25+
#include "msg/Message.h"
26+
#include "common/ceph_time.h"
27+
#include "NVMeofGwTypes.h"
28+
#define dout_context g_ceph_context
29+
#define dout_subsys ceph_subsys_mon
30+
#undef dout_prefix
31+
#define MODULE_PREFFIX "nvmeofgw "
32+
#define dout_prefix *_dout << MODULE_PREFFIX << __PRETTY_FUNCTION__ << " "
33+
34+
35+
static const version_t STRUCT_VERSION = 2;
36+
static const version_t OLD_STRUCT_VERSION = 1;
37+
38+
using ceph::coarse_mono_clock;
39+
class Monitor;
40+
/*-------------------*/
41+
class NVMeofGwMap
42+
{
43+
public:
44+
Monitor* mon = NULL;
45+
epoch_t epoch = 0; // epoch is for Paxos synchronization mechanizm
46+
bool delay_propose = false;
47+
std::map<entity_addrvec_t , uint32_t> peer_addr_2_version;
48+
std::map<NvmeGroupKey, NvmeGwMonStates> created_gws;
49+
std::map<NvmeGroupKey, NvmeGwTimers> fsm_timers;// map that handles timers started by all Gateway FSMs
50+
void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
51+
52+
int cfg_add_gw (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
53+
int cfg_delete_gw (const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
54+
void process_gw_map_ka (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch, bool &propose_pending);
55+
int process_gw_map_gw_down (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending);
56+
void update_active_timers (bool &propose_pending);
57+
void handle_abandoned_ana_groups (bool &propose_pending);
58+
void handle_removed_subsystems (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const std::vector<NvmeNqnId> &current_subsystems, bool &propose_pending);
59+
void start_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid, uint8_t value);
60+
private:
61+
void add_grp_id (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid);
62+
void remove_grp_id(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid);
63+
void fsm_handle_gw_down (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, gw_states_per_group_t state, NvmeAnaGrpId grpid, bool &map_modified);
64+
void fsm_handle_gw_delete (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, gw_states_per_group_t state, NvmeAnaGrpId grpid, bool &map_modified);
65+
void fsm_handle_gw_alive (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeGwMonState & gw_state, gw_states_per_group_t state,
66+
NvmeAnaGrpId grpid, epoch_t& last_osd_epoch, bool &map_modified);
67+
void fsm_handle_to_expired (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &map_modified);
68+
69+
void find_failover_candidate(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending);
70+
void find_failback_gw (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending);
71+
void set_failover_gw_for_ANA_group (const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id,
72+
NvmeAnaGrpId groupid);
73+
74+
75+
int get_timer (const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
76+
void cancel_timer(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid);
77+
void validate_gw_map(const NvmeGroupKey& group_key);
78+
79+
public:
80+
int blocklist_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId ANA_groupid, epoch_t &epoch, bool failover);
81+
82+
void encode(ceph::buffer::list &bl, uint64_t features) const {
83+
uint8_t version;
84+
if (HAVE_FEATURE(features, SERVER_SQUID)) version = STRUCT_VERSION;
85+
else version = OLD_STRUCT_VERSION;
86+
ENCODE_START(version, 1, bl);
87+
dout(4) << "encode1 version " << (uint64_t)version << version << " features " << features << dendl;
88+
using ceph::encode;
89+
encode(epoch, bl);// global map epoch
90+
if (version == STRUCT_VERSION) {
91+
//encode(peer_addr_2_version, bl);
92+
}
93+
encode(created_gws, bl, features); //Encode created GWs
94+
encode(fsm_timers, bl, features);
95+
ENCODE_FINISH(bl);
96+
}
97+
98+
void decode(ceph::buffer::list::const_iterator &bl) {
99+
using ceph::decode;
100+
epoch_t struct_version = 0;
101+
DECODE_START(STRUCT_VERSION, bl);
102+
DECODE_OLDEST(1);
103+
struct_version = struct_v;
104+
dout(4) << "decode version " << struct_version << dendl;
105+
decode(epoch, bl);
106+
if (struct_version == STRUCT_VERSION) {
107+
//dout(4) << "Decode peer_2_addr " << dendl;
108+
//decode(peer_addr_2_version, bl);
109+
}
110+
decode(created_gws, bl);
111+
decode(fsm_timers, bl);
112+
DECODE_FINISH(bl);
113+
}
114+
};
115+
116+
#include "NVMeofGwSerialize.h"
117+
118+
#endif /* MON_NVMEOFGWMAP_H_ */

src/mon/NVMeofGwMap.cc

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,61 @@ void NVMeofGwMap::find_failover_candidate(
424424
}
425425
}
426426

427+
void NVMeofGwMap::handle_gw_performing_fast_reboot(const NvmeGwId &gw_id,
428+
const NvmeGroupKey& group_key, bool &map_modified)
429+
{
430+
for (auto& state_itr: created_gws[group_key][gw_id].sm_state ) {
431+
fsm_handle_gw_fast_reboot(gw_id,group_key, state_itr.first, map_modified);
432+
}
433+
}
434+
435+
void NVMeofGwMap::fsm_handle_gw_fast_reboot(const NvmeGwId &gw_id,
436+
const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &map_modified)
437+
{
438+
// GW that appears in the internal map as Available, performed reboot,
439+
// need to re-apply this GW: to load proper states for all active ANA groups
440+
auto& gw_state = created_gws[group_key][gw_id];
441+
map_modified = true;
442+
gw_states_per_group_t state = gw_state.sm_state[grpid];
443+
dout(10) << "GW " << gw_id << " ANA groupId: " << grpid << " state "
444+
<< state << dendl;
445+
switch (state){
446+
case gw_states_per_group_t::GW_IDLE_STATE:
447+
case gw_states_per_group_t::GW_STANDBY_STATE:
448+
case gw_states_per_group_t::GW_ACTIVE_STATE:
449+
break;
450+
451+
case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
452+
{
453+
//restart timeout
454+
start_timer(gw_id, group_key, grpid, 3);
455+
}
456+
break;
457+
458+
case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED:
459+
{
460+
// since owner was reseted for this group, wait for the background process
461+
// to choose it again
462+
gw_state.standby_state(grpid);
463+
}
464+
break;
465+
466+
case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
467+
{
468+
//restart timer
469+
// The blocklist was started, need to wait for the epoch in the GW
470+
start_timer(gw_id, group_key, grpid, 30);
471+
}
472+
break;
473+
474+
default:
475+
{
476+
dout(4) << "Warning: GW " << gw_id << " Invalid state " << state << dendl;
477+
}
478+
}
479+
validate_gw_map(group_key);
480+
}
481+
427482
void NVMeofGwMap::fsm_handle_gw_alive(
428483
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
429484
NvmeGwMonState & gw_state, gw_states_per_group_t state,

src/mon/NVMeofGwMap.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ class NVMeofGwMap
6161
void start_timer(
6262
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
6363
NvmeAnaGrpId anagrpid, uint8_t value);
64+
void handle_gw_performing_fast_reboot(const NvmeGwId &gw_id,
65+
const NvmeGroupKey& group_key, bool &map_modified);
6466
private:
6567
void add_grp_id(
6668
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
@@ -81,7 +83,9 @@ class NVMeofGwMap
8183
void fsm_handle_to_expired(
8284
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
8385
NvmeAnaGrpId grpid, bool &map_modified);
84-
86+
void fsm_handle_gw_fast_reboot(const NvmeGwId &gw_id,
87+
const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,
88+
bool &map_modified);
8589
void find_failover_candidate(
8690
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
8791
NvmeAnaGrpId grpid, bool &propose_pending);

src/mon/NVMeofGwMon.cc

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,10 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
394394
err = 0;
395395
sstrm.str("");
396396
}
397+
if (rc == 0) {
398+
LastBeacon lb = {id, group_key};
399+
last_beacon.erase(lb);
400+
}
397401
}
398402
// propose pending would be generated by the PaxosService
399403
if ((rc != -EEXIST) && (rc != -EINVAL)) {
@@ -450,6 +454,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
450454
auto& group_gws = map.created_gws[group_key];
451455
auto gw = group_gws.find(gw_id);
452456
const BeaconSubsystems& sub = m->get_subsystems();
457+
auto now = ceph::coarse_mono_clock::now();
453458

454459
if (avail == gw_availability_t::GW_CREATED) {
455460
if (gw == group_gws.end()) {
@@ -466,17 +471,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
466471
if (pending_map.created_gws[group_key][gw_id].availability ==
467472
gw_availability_t::GW_AVAILABLE) {
468473
dout(4) << " Warning :GW marked as Available in the NVmeofGwMon "
469-
<< "database, performed full startup - Force gw to exit!"
474+
<< "database, performed full startup - Apply GW!"
470475
<< gw_id << dendl;
471-
avail = gw_availability_t::GW_UNAVAILABLE;
472-
// Monitor performs Force Failover for this GW in process_gw_map_gw_down
476+
pending_map.handle_gw_performing_fast_reboot(gw_id, group_key, propose);
477+
LastBeacon lb = {gw_id, group_key};
478+
last_beacon[lb] = now; //Update last beacon
473479
} else if (
474480
pending_map.created_gws[group_key][gw_id].performed_full_startup ==
475481
false) {
476482
pending_map.created_gws[group_key][gw_id].performed_full_startup = true;
477483
propose = true;
478-
goto set_propose;
479484
}
485+
goto set_propose;
480486
}
481487
// gw already created
482488
} else {
@@ -542,7 +548,6 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
542548
<< " beacon_epoch " << m->get_last_gwmap_epoch() << dendl;
543549
}
544550
if (avail == gw_availability_t::GW_AVAILABLE) {
545-
auto now = ceph::coarse_mono_clock::now();
546551
// check pending_map.epoch vs m->get_version() -
547552
// if different - drop the beacon
548553

0 commit comments

Comments
 (0)