Bug#37275404 AT: testNodeRestart -n MultiCrashTest T1 fails occasionally in .4node4rpl

vinc13e · vinc13e · commit 9a3e30298436 · 2024-12-12T23:55:11.000Z
Context:
runMultiCrashTest crashes a subset of the running data nodes in
a cluster, either in a rolling restart fashion or in parallel, to
check whether the cluster survives or dies as expected in each
situation.

Problem:
In one of the cases (4 replicas), the test gracefully crashes 3 of
the 4 replicas in a node group and expects the 4th replica to die
as well (via a crash insertion in QMGR). The test then checks that
all nodes expected to be dead are dead and that the remaining
nodes are alive, and finally the test starts all nodes again. The
problem is that, when the nodes are started (via mgmapi), the 4th
node is sometimes not yet connected to the cluster and therefore
the 'start' command fails.

Solution:
Ensure that, before starting the nodes via mgmapi, all the nodes
are already connected to the cluster.

Change-Id: Ib2d4265f4816bc6b975570f69aebfe3952e9bb96
diff --git a/storage/ndb/test/ndbapi/testNodeRestart.cpp b/storage/ndb/test/ndbapi/testNodeRestart.cpp
@@ -1156,34 +1156,74 @@ int runMultiCrashTest(NDBT_Context *ctx, NDBT_Step *step) {
     }
     NdbSleep_SecSleep(2);
   }
+
   if (restarter.startNodes(dead_nodes, num_dead_nodes) != 0) return NDBT_FAILED;
   if (restarter.waitClusterStarted()) return NDBT_FAILED;
 
   if (num_replicas == 2) return NDBT_OK;
 
   ndbout_c("Crash two nodes per node group");
   if (num_replicas == 3) {
+    // Inject error 644 in all nodes. It will eventually hit in one node
+    // in Qmgr::stateArbitCrash.
     prepare_all_nodes_for_death(restarter);
+    int val[] = {DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1};
+    if (restarter.dumpStateAllNodes(val, 2)) {
+      return NDBT_FAILED;
+    }
   }
+  /*
+   * Restart 2 nodes in nostart mode via error insert 1006, in a 3 replica
+   * configuration 3rd node will eventually crash as well. In a 4 replica
+   * configuration remaining nodes will survive.
+   */
   crash_x_nodes_per_node_group(restarter, dead_nodes, num_dead_nodes, 2);
   if (num_replicas == 3) {
     set_all_dead(restarter, dead_nodes, num_dead_nodes);
   }
   if (!restarter.checkClusterState(dead_nodes, num_dead_nodes)) {
     return NDBT_FAILED;
   }
-  NdbSleep_SecSleep(3);
+
+  if (num_replicas == 3) {
+    /*
+     * In 3 replica setup all 3 nodes are restarted, 2 via EI 1006 1 via EI 644.
+     * Wait until all nodes enter the NOSTART state, then we can start all nodes
+     * again.
+     */
+    if (restarter.waitClusterNoStart()) {
+      return NDBT_FAILED;
+    }
+  }
   if (restarter.startNodes(dead_nodes, num_dead_nodes) != 0) return NDBT_FAILED;
   if (restarter.waitClusterStarted()) return NDBT_FAILED;
 
   if (num_replicas == 4) {
     ndbout_c("Crash three nodes per node group");
+
+    int val[] = {DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1};
+    if (restarter.dumpStateAllNodes(val, 2)) {
+      return NDBT_FAILED;
+    }
     prepare_all_nodes_for_death(restarter);
+
+    /*
+     * Restart 3 nodes in nostart mode via error insert 1006, the remaining node
+     * will eventually crash as well.
+     */
     crash_x_nodes_per_node_group(restarter, dead_nodes, num_dead_nodes, 3);
     set_all_dead(restarter, dead_nodes, num_dead_nodes);
     if (!restarter.checkClusterState(dead_nodes, num_dead_nodes)) {
       return NDBT_FAILED;
     }
+
+    /*
+     * All 4 nodes are restarted, 3 via EI 1006 1 via EI 644. Wait until all
+     * nodes enter the NOSTART state, then we can start all nodes again.
+     */
+    if (restarter.waitClusterNoStart()) {
+      return NDBT_FAILED;
+    }
     if (restarter.startNodes(dead_nodes, num_dead_nodes) != 0)
       return NDBT_FAILED;
     if (restarter.waitClusterStarted()) return NDBT_FAILED;