Skip to content

Commit 2cba3bf

Browse files
smani87 authored and evergreen committed
SERVER-42602 Guarantees that the unconditional step down does not happen due to slow node restarts in rollback_fuzzer_[un]clean_shutdowns suites.
1 parent 79fd215 commit 2cba3bf

File tree

1 file changed

+42
-2
lines changed

1 file changed

+42
-2
lines changed

jstests/replsets/libs/rollback_test.js

Lines changed: 42 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
6161
const SIGTERM = 15;
6262
const kNumDataBearingNodes = 3;
6363
const kElectableNodes = 2;
64+
const kForeverSecs = 24 * 60 * 60;
6465

6566
let rst;
6667
let curPrimary;
@@ -236,6 +237,9 @@ function RollbackTest(name = "RollbackTest", replSet) {
236237
`may prevent a rollback here.`);
237238
}
238239

240+
// Unfreeze the node if it was previously frozen, so that it can run for the election.
241+
assert.commandWorked(curSecondary.adminCommand({replSetFreeze: 0}));
242+
239243
// Ensure that the tiebreaker node is connected to the other nodes. We must do this after
240244
// we are sure that rollback has completed on the rollback node.
241245
tiebreakerNode.reconnect([curPrimary, curSecondary]);
@@ -369,6 +373,12 @@ function RollbackTest(name = "RollbackTest", replSet) {
369373
this.transitionToSyncSourceOperationsDuringRollback = function() {
370374
transitionIfAllowed(State.kSyncSourceOpsDuringRollback);
371375

376+
// If the nodes are restarted after the rollback node is able to rollback successfully and
377+
// catch up to curPrimary's oplog, then the rollback node can become the new primary.
378+
// If so, it can lead to unplanned state transitions, like unconditional step down, during
379+
// the test. To avoid those problems, prevent rollback node from starting an election.
380+
assert.commandWorked(curSecondary.adminCommand({replSetFreeze: kForeverSecs}));
381+
372382
log(`Reconnecting the secondary ${curSecondary.host} so it'll go into rollback`);
373383
// Reconnect the rollback node to the current primary, which is the node we want to sync
374384
// from. If we reconnect to both the current primary and the tiebreaker node, the rollback
@@ -431,9 +441,39 @@ function RollbackTest(name = "RollbackTest", replSet) {
431441
log(`Restarting node ${hostName}`);
432442
rst.start(nodeId, startOptions, true /* restart */);
433443

434-
// Ensure that the primary is ready to take operations before continuing. If both nodes are
435-
// connected to the tiebreaker node, the primary may switch.
444+
// Freeze the node if the restarted node is the rollback node.
445+
if (curState === State.kSyncSourceOpsDuringRollback &&
446+
rst.getNodeId(curSecondary) === nodeId) {
447+
assert.soon(() => {
448+
// Try stepping down the rollback node if it became the primary after its
449+
// restart, as it might have caught up with the original primary.
450+
curSecondary.adminCommand({"replSetStepDown": kForeverSecs, "force": true});
451+
try {
452+
// Prevent rollback node from running election. There is a chance that this
453+
// node might have started running election or became primary after
454+
// 'replSetStepDown' cmd, so 'replSetFreeze' cmd can fail.
455+
assert.commandWorked(
456+
curSecondary.adminCommand({"replSetFreeze": kForeverSecs}));
457+
return true;
458+
} catch (e) {
459+
if (e.code === ErrorCodes.NotSecondary) {
460+
return false;
461+
}
462+
throw e;
463+
}
464+
}, `Failed to run replSetFreeze cmd on ${curSecondary.host}`);
465+
}
466+
467+
const oldPrimary = curPrimary;
468+
// Wait for the new primary to be elected and ready to take operations before continuing.
436469
curPrimary = rst.getPrimary();
470+
471+
// The primary can change after node restarts only if all the 3 nodes are connected to each
472+
// other.
473+
if (curState !== State.kSteadyStateOps) {
474+
assert.eq(curPrimary, oldPrimary);
475+
}
476+
437477
curSecondary = rst.getSecondary();
438478
assert.neq(curPrimary, curSecondary);
439479
};

0 commit comments

Comments
 (0)