
Commit 3b05ba1

nvmeofgw: fix host issue during redeploy, improve previous redeploy fix
This commit fixes an issue where, during redeploy, hosts might get stuck due to many failovers/failbacks within a very short time frame. It improves the previous redeploy fix, since it allows using a short beacon timeout with no impact on failover time.

Signed-off-by: Leonid Chernin <[email protected]>
1 parent e0958b5 commit 3b05ba1

File tree

4 files changed: +38 −4 lines changed

src/common/options/mon.yaml.in
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwTypes.h

src/common/options/mon.yaml.in

Lines changed: 3 additions & 2 deletions
@@ -77,14 +77,15 @@ options:
   level: advanced
   desc: Period in seconds from last beacon to monitor marking a NVMeoF gateway as
     failed
-  default: 15
+  default: 10
   services:
   - mon
 - name: mon_nvmeofgw_skip_failovers_interval
   type: secs
   level: advanced
   desc: Period in seconds in which no failovers are performed in GW's pool-group
-  default: 12
+    this is equal to max GW redeploy interval
+  default: 16
   services:
   - mon
 - name: mon_nvmeofgw_set_group_id_retry
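Note how the two defaults interact: mon_nvmeofgw_beacon_grace drops from 15s to 10s (faster failover detection), while mon_nvmeofgw_skip_failovers_interval grows from 12s to 16s to cover the maximum per-GW redeploy gap. A minimal sketch of the intended relation, with illustrative local constants rather than actual Ceph code:

#include <cassert>
#include <chrono>

int main() {
  using namespace std::chrono_literals;
  const auto beacon_grace   = 10s;  // mon_nvmeofgw_beacon_grace (new default)
  const auto skip_failovers = 16s;  // mon_nvmeofgw_skip_failovers_interval (new default)
  // The monitor only notices a GW loss beacon_grace after its last beacon,
  // so the skip window must exceed the grace period for the "down then
  // quickly up again" redeploy pattern to be detectable at all.
  assert(skip_failovers > beacon_grace);
  return 0;
}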

src/mon/NVMeofGwMap.cc

Lines changed: 26 additions & 0 deletions
@@ -234,9 +234,34 @@ int NVMeofGwMap::do_delete_gw(
 void NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
     const NvmeGroupKey& group_key, bool &propose_pending)
 {
+  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
   dout(4) << "GW performed the full startup " << gw_id << dendl;
   propose_pending = true;
   increment_gw_epoch( group_key);
+  auto &st = created_gws[group_key][gw_id];
+  const auto skip_failovers_sec = g_conf().get_val<std::chrono::seconds>
+    ("mon_nvmeofgw_skip_failovers_interval");
+  const auto beacon_grace_sec =
+    g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace");
+  /*
+    This is a heuristic meant to identify a "cephadm redeploy" of the nvmeof GWs.
+    We would like to identify that a redeploy is going on, because it helps us
+    prevent redundant failover and failback actions.
+    It is very important to minimize fo/fb during redeploy: during redeploy all
+    GWs go down and come up again, and the amount of fo/fb that could be driven
+    by that is big, which also triggers a lot of changes on the hosts that are
+    nvmeof-connected to the GWs, even up to the point that a host will get stuck.
+    This heuristic assumes that if a GW disappears and shows up again in less
+    than REDEPLOY_TIMEOUT seconds, then a redeploy might have started, so we
+    will do a failover for this GW, but will not do a failover for the next
+    REDEPLOY_TIMEOUT. Then again for the next GW that disappears, and so on.
+    If it works as designed, then regardless of the number of GWs, a redeploy
+    will only cause one fo/fb. */
+  if ((now - (st.last_gw_down_ts - beacon_grace_sec)) < skip_failovers_sec) {
+    skip_failovers_for_group(group_key);
+    dout(4) << "startup: set skip-failovers for group " << gw_id << " group "
+      << group_key << dendl;
+  }
 }

 void NVMeofGwMap::increment_gw_epoch(const NvmeGroupKey& group_key)
@@ -332,6 +357,7 @@ int NVMeofGwMap::process_gw_map_gw_down(
   dout(10) << "GW down " << gw_id << dendl;
   auto& st = gw_state->second;
   st.set_unavailable_state();
+  st.set_last_gw_down_ts();
   for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
     fsm_handle_gw_down(
       gw_id, group_key, state_itr.second,
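Restated outside the monitor code, the condition above compensates for detection lag: the GW actually stopped beaconing roughly beacon_grace before the monitor marked it down, so the check subtracts the grace period before comparing against the skip window. A self-contained sketch with simplified types (looks_like_redeploy is a hypothetical name, not a Ceph function):

#include <chrono>

using Clock = std::chrono::system_clock;

// Mirrors the check in gw_performed_startup(): estimate when the GW really
// went down, then test whether it came back inside the skip window.
bool looks_like_redeploy(Clock::time_point now,
                         Clock::time_point last_gw_down_ts,
                         std::chrono::seconds beacon_grace,
                         std::chrono::seconds skip_window) {
  const auto estimated_real_down = last_gw_down_ts - beacon_grace;
  return (now - estimated_real_down) < skip_window;
}

When the predicate holds, skip_failovers_for_group() suppresses failovers for the whole pool-group, so the remaining GWs of a redeploy can bounce without each one triggering another fo/fb.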

src/mon/NVMeofGwMon.cc

Lines changed: 4 additions & 2 deletions
@@ -158,7 +158,7 @@ version_t NVMeofGwMon::get_trim_to() const
 * function called during new paxos epochs
 * function called to restore in pending map all data that is not serialized
 * to paxos peons. Otherwise it would be overridden in "pending_map = map"
-* currently just "allow_failovers_ts" variable is restored
+* currently the "allow_failovers_ts" and "last_gw_down_ts" variables are restored
 */
 void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
   std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -173,6 +173,8 @@ void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
       pending_map.created_gws[group_key][gw_id].allow_failovers_ts =
         gw_created_pair.second.allow_failovers_ts;
       }
+      pending_map.created_gws[group_key][gw_id].last_gw_down_ts =
+        gw_created_pair.second.last_gw_down_ts;
     }
   }
 }
@@ -671,7 +673,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
         << gw_id << dendl;
       process_gw_down(gw_id, group_key, gw_propose, avail);
       pending_map.skip_failovers_for_group(group_key);
-      dout(4) << "set skip-failovers for gw's group " << gw_id << " group "
+      dout(4) << "fast_reboot: set skip-failovers for group " << gw_id << " group "
         << group_key << dendl;
     } else if (
       pending_map.created_gws[group_key][gw_id].performed_full_startup ==
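The restore step exists because these timestamps are kept only in monitor memory: they are not serialized into the paxos map, so the assignment "pending_map = map" would otherwise reset them. A simplified sketch of the pattern, using hypothetical stand-in types rather than the actual Ceph structures:

#include <chrono>
#include <map>
#include <string>

struct GwState {  // stand-in for the non-serialized fields of NvmeGwMonState
  std::chrono::system_clock::time_point allow_failovers_ts;
  std::chrono::system_clock::time_point last_gw_down_ts;
};

// Copy non-serialized fields from the previous in-memory map into the
// freshly decoded pending map so they survive the paxos epoch change.
void restore_pending(std::map<std::string, GwState>& pending,
                     const std::map<std::string, GwState>& previous) {
  for (const auto& [gw_id, old_state] : previous) {
    auto it = pending.find(gw_id);
    if (it == pending.end()) continue;  // GW deleted in the meantime
    it->second.allow_failovers_ts = old_state.allow_failovers_ts;
    it->second.last_gw_down_ts    = old_state.last_gw_down_ts;
  }
}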

src/mon/NVMeofGwTypes.h

Lines changed: 5 additions & 0 deletions
@@ -153,6 +153,8 @@ struct NvmeGwMonState {
   */
   std::chrono::system_clock::time_point allow_failovers_ts =
     std::chrono::system_clock::now();
+  std::chrono::system_clock::time_point last_gw_down_ts =
+    std::chrono::system_clock::now() - std::chrono::seconds(30);
   NvmeGwMonState(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {}

   NvmeGwMonState(NvmeAnaGrpId id)
@@ -174,6 +176,9 @@ struct NvmeGwMonState {
     sm_state[grpid] = gw_states_per_group_t::GW_ACTIVE_STATE;
     blocklist_data[grpid].osd_epoch = 0;
   }
+  void set_last_gw_down_ts() {
+    last_gw_down_ts = std::chrono::system_clock::now();
+  }
 };

 struct NqnState {
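The default initializer places last_gw_down_ts 30 seconds in the past, which reads as a guard: a freshly created GW entry must not look like it just went down, or its first startup would falsely trip the redeploy heuristic. A quick check of that reading against the new defaults (a sketch, not Ceph code):

#include <chrono>
#include <iostream>

int main() {
  using namespace std::chrono;
  using namespace std::chrono_literals;
  const auto now = system_clock::now();
  const auto last_gw_down_ts = now - 30s;  // NvmeGwMonState default
  const auto beacon_grace = 10s;           // mon_nvmeofgw_beacon_grace
  const auto skip_window  = 16s;           // mon_nvmeofgw_skip_failovers_interval
  // The same condition gw_performed_startup() evaluates:
  const bool skip = (now - (last_gw_down_ts - beacon_grace)) < skip_window;
  std::cout << std::boolalpha << skip << '\n';  // false: 40s is not < 16s
  return 0;
}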
