Skip to content

Commit 9a3e302

Browse files
committed
Bug#37275404 AT: testNodeRestart -n MultiCrashTest T1 fails occasionally in .4node4rpl
Context: runMultiCrashTest crashes a subset of the running data nodes in a cluster, either in a rolling-restart fashion or in parallel, to check whether the situations where the cluster survives or the cluster dies are as expected. Problem: In one of the cases (4 replicas), the test gracefully crashes 3 of the 4 replicas in a node group and expects the 4th replica to die as well (via a crash insertion in QMGR). The test then checks that all nodes expected to be dead are dead and that the remaining nodes are alive; finally, the test starts all nodes again. The problem is that, when the nodes are started (via mgmapi), sometimes the 4th node is not yet connected to the cluster and therefore the 'start' command fails. Solution: Ensure that, before starting the nodes via mgmapi, all the nodes are already connected to the cluster. Change-Id: Ib2d4265f4816bc6b975570f69aebfe3952e9bb96
1 parent 334389a commit 9a3e302

File tree

1 file changed

+41
-1
lines changed

1 file changed

+41
-1
lines changed

storage/ndb/test/ndbapi/testNodeRestart.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1156,34 +1156,74 @@ int runMultiCrashTest(NDBT_Context *ctx, NDBT_Step *step) {
11561156
}
11571157
NdbSleep_SecSleep(2);
11581158
}
1159+
11591160
if (restarter.startNodes(dead_nodes, num_dead_nodes) != 0) return NDBT_FAILED;
11601161
if (restarter.waitClusterStarted()) return NDBT_FAILED;
11611162

11621163
if (num_replicas == 2) return NDBT_OK;
11631164

11641165
ndbout_c("Crash two nodes per node group");
11651166
if (num_replicas == 3) {
1167+
// Inject error 644 in all nodes. It will eventually hit in one node
1168+
// in Qmgr::stateArbitCrash.
11661169
prepare_all_nodes_for_death(restarter);
1170+
int val[] = {DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1};
1171+
if (restarter.dumpStateAllNodes(val, 2)) {
1172+
return NDBT_FAILED;
1173+
}
11671174
}
1175+
/*
1176+
* Restart 2 nodes in nostart mode via error insert 1006, in a 3 replica
1177+
* configuration 3rd node will eventually crash as well. In a 4 replica
1178+
* configuration remaining nodes will survive.
1179+
*/
11681180
crash_x_nodes_per_node_group(restarter, dead_nodes, num_dead_nodes, 2);
11691181
if (num_replicas == 3) {
11701182
set_all_dead(restarter, dead_nodes, num_dead_nodes);
11711183
}
11721184
if (!restarter.checkClusterState(dead_nodes, num_dead_nodes)) {
11731185
return NDBT_FAILED;
11741186
}
1175-
NdbSleep_SecSleep(3);
1187+
1188+
if (num_replicas == 3) {
1189+
/*
1190+
* In 3 replica setup all 3 nodes are restarted, 2 via EI 1006 1 via EI 644.
1191+
* Wait until all nodes enter the NOSTART state, then we can start all nodes
1192+
* again.
1193+
*/
1194+
if (restarter.waitClusterNoStart()) {
1195+
return NDBT_FAILED;
1196+
}
1197+
}
11761198
if (restarter.startNodes(dead_nodes, num_dead_nodes) != 0) return NDBT_FAILED;
11771199
if (restarter.waitClusterStarted()) return NDBT_FAILED;
11781200

11791201
if (num_replicas == 4) {
11801202
ndbout_c("Crash three nodes per node group");
1203+
1204+
int val[] = {DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1};
1205+
if (restarter.dumpStateAllNodes(val, 2)) {
1206+
return NDBT_FAILED;
1207+
}
11811208
prepare_all_nodes_for_death(restarter);
1209+
1210+
/*
1211+
* Restart 3 nodes in nostart mode via error insert 1006, the remaining node
1212+
* will eventually crash as well.
1213+
*/
11821214
crash_x_nodes_per_node_group(restarter, dead_nodes, num_dead_nodes, 3);
11831215
set_all_dead(restarter, dead_nodes, num_dead_nodes);
11841216
if (!restarter.checkClusterState(dead_nodes, num_dead_nodes)) {
11851217
return NDBT_FAILED;
11861218
}
1219+
1220+
/*
1221+
* All 4 nodes are restarted, 3 via EI 1006 1 via EI 644. Wait until all
1222+
* nodes enter the NOSTART state, then we can start all nodes again.
1223+
*/
1224+
if (restarter.waitClusterNoStart()) {
1225+
return NDBT_FAILED;
1226+
}
11871227
if (restarter.startNodes(dead_nodes, num_dead_nodes) != 0)
11881228
return NDBT_FAILED;
11891229
if (restarter.waitClusterStarted()) return NDBT_FAILED;

0 commit comments

Comments
 (0)