Bug#37163647 SR recovery issues

frazerclement · frazerclement · commit f0b665227c96 · 2024-12-03T23:07:45.000Z
Backport to 7.6

Test

testSystemRestart is extended with a new test GCPSaveLagLcpSR
which exercises a multi-NG system with :
 - All nodes but one suffering GCP_SAVE lag
 - A subsequent LCP being triggered
 - A subsequent System Restart

This exposes the problem mentioned in the bug
with CopyGCIReq copying state which leads to
the SR being unrecoverable.

The symptom is that the SR does not complete.

Problem

The DIH block in the Master/President role controls
the GCP_SAVE and COPY_GCI protocols.

The GCP_SAVE protocol results in updates to each
participating node's lastCompletedGCI metadata,
and the COPY_GCI protocol propagates this information
to all nodes.

System Restart code effectively assumes that :
 - The newestRestorableGCI is the max of the stored
   per-node lastCompletedGCIs
 - The max of the lastCompletedGCIs is restorable

The recoverability-robustness of the system is improved
by moving the checks of these assumptions back from
the distributed System Restart phase to the CopyGCIREQ
propagation phase where a live President instructs each
node to write new state to disk.

Improvement 1

If there is some logic problem that could threaten
future recoverability of the cluster, it causes the
President to halt immediately.

While this can cause an immediate service outage, it
surfaces + avoids the risk of unrecoverability.

The situation where a non recoverable set of GCI info is
distributed should be rare.  However if a running cluster is
upgraded to a version containing these checks then there is a
chance that a new version President will fail as a result of
inheriting inconsistent data from the previous old version
President.

This scenario is risky because it is precisely the situation in
which the cluster may not be SR recoverable, so we do not want
to risk needing an SR.

For this reason, as part of Master GCP takeover, all nodes will
align their inherited GCI info to ensure that it does not result
in an immediate failure.

Improvement 2

The President's logic in GCP_SAVEREQ is modified to avoid
directly updating the in-memory 'SYSFILE' as part of processing
GCP_SAVECONF signals from participating nodes.

The set of nodes which sent a CONF is instead stored in
a bitmap, leaving the lastCompletedGCI values intact.

When the GCP_SAVEREQ round is complete, the bitmap is used
to update the lastCompletedGCI values atomically with the
newestRestorableGCI, so that any subsequent CopyGCIREQ
invocation will propagate them together.

This avoids an intermediate CopyGCIREQ (e.g. triggered
by the start of an LCP) attempting to propagate values
which would not be recoverable.

Change-Id: Ib2d5bf9dee5ae9c05670d02488adb39678ef3ac8
diff --git a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
@@ -1523,6 +1523,8 @@ class Dbdih: public SimulatedBlock {
   void copyTabReq_complete(Signal* signal, TabRecordPtr tabPtr);
 
   void gcpcommitreqLab(Signal *);
+  void validateCopyGci(Signal *);
+  void upgradeAlignCopyGci();
   void copyGciLab(Signal *, CopyGCIReq::CopyReason reason);
   void storeNewLcpIdLab(Signal *);
   void startLcpRoundLoopLab(Signal *, Uint32 startTableId, Uint32 startFragId);
@@ -2162,6 +2164,7 @@ class Dbdih: public SimulatedBlock {
       Uint32 m_new_gci;
       Uint32 m_time_between_gcp;   /* Delay between global checkpoints */
       NDB_TICKS m_start_time;
+      NdbNodeBitmask m_saveConfNodes;
     } m_master;
   } m_gcp_save;
 
diff --git a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
@@ -10523,6 +10523,14 @@ bool Dbdih::handle_master_take_over_copy_gci(Signal *signal, NodeId new_master_n
                         signal, 10, MasterGCPReq::SignalLength);
     return true;
   }
+
+  /**
+   * We are more strict than older versions, align inherited
+   * info at Master takeover before the info has a chance
+   * to be checked if we become new Master
+   */
+  upgradeAlignCopyGci();
+
   c_handled_master_take_over_copy_gci = new_master_node_id;
   return false;
 }
@@ -10920,6 +10928,7 @@ void Dbdih::MASTER_GCPhandling(Signal* signal, Uint32 failedNodeId)
       /**
        * Restart GCP_SAVE_REQ
        */
+      m_gcp_save.m_master.m_saveConfNodes.clear();
       sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
       break;
     }
@@ -17100,6 +17109,7 @@ void Dbdih::execGCP_NODEFINISH(Signal* signal)
   Uint32 saveGCI = old_hi;
   m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
   m_gcp_save.m_master.m_new_gci = saveGCI;
+  m_gcp_save.m_master.m_saveConfNodes.clear();
   
 #ifdef ERROR_INSERT
   if (ERROR_INSERTED(7188))
@@ -17223,9 +17233,14 @@ void Dbdih::execGCP_SAVECONF(Signal* signal)
     return;
   }
 
+  /* Master */
   ndbrequire(saveConf->gci == m_gcp_save.m_master.m_new_gci);
   ndbrequire(saveConf->nodeId == saveConf->dihPtr);
-  SYSFILE->lastCompletedGCI[saveConf->nodeId] = saveConf->gci;  
+
+  /* Record CONF received in this round */
+  ndbrequire(!m_gcp_save.m_master.m_saveConfNodes.get(saveConf->nodeId));
+  m_gcp_save.m_master.m_saveConfNodes.set(saveConf->nodeId);
+
   GCP_SAVEhandling(signal, saveConf->nodeId);
 }//Dbdih::execGCP_SAVECONF()
 
@@ -17248,6 +17263,7 @@ void Dbdih::execGCP_SAVEREF(Signal* signal)
 
   ndbrequire(saveRef->gci == m_gcp_save.m_master.m_new_gci);
   ndbrequire(saveRef->nodeId == saveRef->dihPtr);
+  ndbrequire(!m_gcp_save.m_master.m_saveConfNodes.get(saveRef->nodeId));
 
   /**
    * Only allow reason not to save
@@ -17275,6 +17291,22 @@ void Dbdih::GCP_SAVEhandling(Signal* signal, Uint32 nodeId)
    * RESTART.
    *------------------------------------------------------------------------*/
   SYSFILE->newestRestorableGCI = m_gcp_save.m_gci;
+
+  /**
+   * Set lastCompletedGci values for all CONFed participants in
+   * the GCP Save round atomically with the newestRestorableGci
+   * now.
+   * This avoids any intermediate CopyGCIReq rounds propagating
+   * transition states prior to a GCI being fully restorable.
+   */
+  {
+    Uint32 participant = 0;
+    while ((participant = m_gcp_save.m_master.m_saveConfNodes.find_next(
+                participant + 1)) != BitmaskImpl::NotFound) {
+      SYSFILE->lastCompletedGCI[participant] = m_gcp_save.m_gci;
+    }
+    m_gcp_save.m_master.m_saveConfNodes.clear();
+  }
   if(Sysfile::getInitialStartOngoing(SYSFILE->systemRestartBits) &&
      getNodeState().startLevel == NodeState::SL_STARTED){
     jam();
@@ -17765,7 +17797,90 @@ void Dbdih::execDIHNDBTAMPER(Signal* signal)
 /*****************************************************************************/
 /* **********     FILE HANDLING MODULE                           *************/
 /*****************************************************************************/
-void Dbdih::copyGciLab(Signal* signal, CopyGCIReq::CopyReason reason) 
+/**
+ * Sanity check the Master's GCI info before it is copied to other nodes.
+ * No per-node lastCompletedGCI may exceed newestRestorableGCI; on a
+ * violation the state is logged and ndbrequire() fails.
+ */
+void Dbdih::validateCopyGci(Signal *signal) {
+  jam();
+  const Uint32 newestRestorableGCI = SYSFILE->newestRestorableGCI;
+  bool newestRestorableGCIIsMax = true;
+
+  for (Uint32 i = 0; i < MAX_NDB_NODES; i++) {
+    if (SYSFILE->lastCompletedGCI[i] > newestRestorableGCI) {
+      jam();
+      newestRestorableGCIIsMax = false;
+      break;
+    }
+  }
+
+  if (unlikely(!newestRestorableGCIIsMax)) {
+    jam();
+    g_eventLogger->error("DIH : newestRestorableGCI %u", newestRestorableGCI);
+    for (Uint32 i = 0; i < MAX_NDB_NODES; i++) {
+      /* Log defined nodes, plus any offending entry even if undefined */
+      if (Sysfile::getNodeStatus(i, SYSFILE->nodeStatus) !=
+              Sysfile::NS_NotDefined ||
+          SYSFILE->lastCompletedGCI[i] > newestRestorableGCI) {
+        g_eventLogger->error("DIH : Node %u lastCompletedGCI %u", i,
+                             SYSFILE->lastCompletedGCI[i]);
+      }
+    }
+
+    /**
+     * newestRestorableGCI must be the max of the per-node lastCompletedGCI
+     * values. Otherwise a node with a higher value can be chosen as
+     * President in a subsequent System Restart, making it unrecoverable.
+     */
+    g_eventLogger->error(
+        "DIH : Invalid CopyGCIREQ attempted, newestRestorableGCI is not max");
+
+    ndbrequire(newestRestorableGCIIsMax);
+  }
+}
+
+/**
+ * Align GCI info inherited from the previous Master so that it passes
+ * the stricter checks made by new code.
+ *
+ * New code requires that newestRestorableGCI is the highest GCI recorded
+ * in the per-node lastCompletedGCI array.  This may not have been the
+ * case previously, so inherited info is filtered here to avoid e.g. a
+ * cascading failure as part of RR, which would in fact require a
+ * subsequent SR exactly when the SR is problematic.
+ *
+ * Assumption : any lastCompletedGCI > newestRestorableGCI is due to
+ * exposure of a transient state prior to GCP_SAVE round completion,
+ * which can be safely treated as though the round were not completed.
+ *
+ * This realignment logic can be removed when we are no longer likely to
+ * upgrade from a system which may send invalid CopyGci data.
+ */
+void Dbdih::upgradeAlignCopyGci() {
+  jam();
+
+  /* Loop-invariant : nothing below modifies newestRestorableGCI */
+  const Uint32 restorableGci = SYSFILE->newestRestorableGCI;
+  for (Uint32 nodeId = 0; nodeId < MAX_NDB_NODES; nodeId++) {
+    const Uint32 nodeGci = SYSFILE->lastCompletedGCI[nodeId];
+    if (nodeGci <= restorableGci) {
+      continue; /* Already consistent */
+    }
+
+    jam();
+    g_eventLogger->warning(
+        "DIH : Aligning lastCompletedGCI of node %u from %u to %u", nodeId,
+        nodeGci, restorableGci);
+
+    /* This is only intended for one specific upgrade scenario */
+    ndbrequire(nodeGci == restorableGci + 1);
+    /* Treat the incomplete round as though it never completed */
+    SYSFILE->lastCompletedGCI[nodeId] = restorableGci;
+  }
+}
+
+void Dbdih::copyGciLab(Signal* signal, CopyGCIReq::CopyReason reason)
 {
   if(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE)
   {
@@ -17836,6 +17951,9 @@ void Dbdih::copyGciLab(Signal* signal, CopyGCIReq::CopyReason reason)
     }
   }
 
+  /* Check integrity of GCI info before propagating */
+  validateCopyGci(signal);
+
   sendLoopMacro(COPY_GCIREQ, sendCOPY_GCIREQ, RNIL);
 
 }//Dbdih::copyGciLab()
diff --git a/storage/ndb/test/ndbapi/testSystemRestart.cpp b/storage/ndb/test/ndbapi/testSystemRestart.cpp
@@ -4323,6 +4323,106 @@ int runCrashNodeDuringSR(NDBT_Context *ctx, NDBT_Step *step) {
   return result;
 }
 
+/* Initializer : skip the testcase unless there are >= 2 nodegroups */
+int checkMultipleNGs(NDBT_Context *ctx, NDBT_Step *step) {
+  NdbRestarter restarter;
+  const bool multipleNGs = (restarter.getNumNodeGroups() >= 2);
+  if (multipleNGs) {
+    return NDBT_OK;
+  }
+  g_err << "Need multiple nodegroups for test" << endl;
+  return NDBT_SKIPPED;
+}
+
+/**
+ * Test behaviour with sequence (needs multiple nodegroups) :
+ *    GCP Save stalled on every node except one
+ *    LCP triggered
+ *    System failure, then System restart
+ *
+ * This exercises code which copies the DIH 'recoverable GCI'
+ * metadata around the system.  A partially complete GCP is not
+ * yet restorable - the execution of an LCP should not cause a
+ * subsequent SR to attempt to restore it.
+ */
+int runGCPSaveLagLcpSR(NDBT_Context *ctx, NDBT_Step *step) {
+  NdbRestarter restarter;
+
+  const Uint32 startLcpDelaySecs = 10;
+  const Uint32 postLcpDelaySecs = 5;
+  const Uint32 stallGCPSaveEICode = 7237;
+
+  const int node = restarter.getNode(NdbRestarter::NS_RANDOM);
+  if (node == -1) {
+    g_err << "Failed to choose a node to exempt from the stall" << endl;
+    return NDBT_FAILED;
+  }
+
+  /* Inject error on all nodes other than the node */
+  g_err << "Injecting error " << stallGCPSaveEICode
+        << " to stall GCP_SAVE on all nodes "
+        << " other than " << node << endl;
+  if (restarter.insertErrorInAllNodes(stallGCPSaveEICode) != 0) {
+    g_err << "Error inserting error in all" << endl;
+    restarter.insertErrorInAllNodes(0);
+    return NDBT_FAILED;
+  }
+  if (restarter.insertErrorInNode(node, 0) != 0) {
+    g_err << "Error clearing error in node " << node << endl;
+    restarter.insertErrorInAllNodes(0);
+    return NDBT_FAILED;
+  }
+
+  g_err << "Delay " << startLcpDelaySecs << " seconds before triggering LCP"
+        << endl;
+  NdbSleep_SecSleep(startLcpDelaySecs);
+
+  g_err << "Triggering LCP" << endl;
+  const int startLcpDumpCode = 7099;
+  if (restarter.dumpStateAllNodes(&startLcpDumpCode, 1) != 0) {
+    g_err << "Failed to trigger LCP" << endl;
+    restarter.insertErrorInAllNodes(0);
+    return NDBT_FAILED;
+  }
+
+  g_err << "Waiting for " << postLcpDelaySecs
+        << " seconds after triggering Lcp." << endl;
+  NdbSleep_SecSleep(postLcpDelaySecs);
+
+  g_err << "Triggering immediate System Restart" << endl;
+  if (restarter.restartAll(false,  // initial
+                           true,   // nostart
+                           true,   // abort
+                           false)  // force
+      != 0) {
+    restarter.insertErrorInAllNodes(0);
+    g_err << "Triggering SR failed" << endl;
+    return NDBT_FAILED;
+  }
+
+  g_err << "Waiting for NoStart state" << endl;
+  if (restarter.waitClusterNoStart() != 0) {
+    g_err << "Failed waiting for nodes to enter NoStart state" << endl;
+    return NDBT_FAILED;
+  }
+
+  g_err << "Starting cluster" << endl;
+  if (restarter.startAll() != 0) {
+    g_err << "Failed to request start of all nodes" << endl;
+    return NDBT_FAILED;
+  }
+
+  g_err << "Waiting for cluster to recover" << endl;
+  if (restarter.waitClusterStarted() != 0) {
+    g_err << "Cluster failed to start" << endl;
+    return NDBT_FAILED;
+  }
+
+  g_err << "Cluster recovered successfully" << endl;
+
+  return NDBT_OK;
+}
+
 /**************************************************************************/
 
 NDBT_TESTSUITE(testSystemRestart);
@@ -4856,6 +4956,13 @@ TESTCASE("SystemDownDuringSR", "Check recoverability when system goes down "
                                "during system restart") {
   STEP(runCrashNodeDuringSR);
 }
+TESTCASE("GCPSaveLagLcpSR",
+         "Check recoverability when system fails during "
+         "GCP Save lag with an intermediate LCP start") {
+  INITIALIZER(checkMultipleNGs);
+  INITIALIZER(runWaitStarted);
+  STEP(runGCPSaveLagLcpSR);
+}
 
 NDBT_TESTSUITE_END(testSystemRestart);
 
diff --git a/storage/ndb/test/run-test/daily-devel--07-tests.txt b/storage/ndb/test/run-test/daily-devel--07-tests.txt
@@ -308,3 +308,7 @@ max-time: 180
 cmd: testScan
 args: -n ScanErrorHandling T1
 max-time: 480
+
+cmd: testSystemRestart
+args: -n GCPSaveLagLcpSR T1
+max-time: 240
diff --git a/storage/ndb/test/src/NdbRestarter.cpp b/storage/ndb/test/src/NdbRestarter.cpp
@@ -219,6 +219,7 @@ NdbRestarter::getNodeGroups(Vector<int>& node_groups, int * max_alive_replicas_p
     return -1;
   }
 
+  int n_groups = 0;
   Vector<int> node_group_replicas;
   for (unsigned i = 0; i < ndbNodes.size(); i++)
   {
@@ -238,6 +239,7 @@ NdbRestarter::getNodeGroups(Vector<int>& node_groups, int * max_alive_replicas_p
     if (node_group_replicas[node_group] == 0)
     {
       node_groups.push_back(node_group);
+      n_groups++;
     }
 
     node_group_replicas[node_group]++;
@@ -256,7 +258,7 @@ NdbRestarter::getNodeGroups(Vector<int>& node_groups, int * max_alive_replicas_p
     }
     *max_alive_replicas_ptr = max_alive_replicas;
   }
-  return 0;
+  return n_groups;
 }
 
 int NdbRestarter::getNumNodeGroups() {

Original file line number	Diff line number	Diff line change
`@@ -219,6 +219,7 @@ NdbRestarter::getNodeGroups(Vector<int>& node_groups, int * max_alive_replicas_p`
`219`	`219`	`return -1;`
`220`	`220`	`}`
`221`	`221`
	`222`	`+ int n_groups = 0;`
`222`	`223`	`Vector<int> node_group_replicas;`
`223`	`224`	`for (unsigned i = 0; i < ndbNodes.size(); i++)`
`224`	`225`	`{`
`@@ -238,6 +239,7 @@ NdbRestarter::getNodeGroups(Vector<int>& node_groups, int * max_alive_replicas_p`
`238`	`239`	`if (node_group_replicas[node_group] == 0)`
`239`	`240`	`{`
`240`	`241`	`node_groups.push_back(node_group);`
	`242`	`+ n_groups++;`
`241`	`243`	`}`
`242`	`244`
`243`	`245`	`node_group_replicas[node_group]++;`
`@@ -256,7 +258,7 @@ NdbRestarter::getNodeGroups(Vector<int>& node_groups, int * max_alive_replicas_p`
`256`	`258`	`}`
`257`	`259`	`*max_alive_replicas_ptr = max_alive_replicas;`
`258`	`260`	`}`
`259`		`- return 0;`
	`261`	`+ return n_groups;`
`260`	`262`	`}`
`261`	`263`
`262`	`264`	`int NdbRestarter::getNumNodeGroups() {`