Skip to content

Commit b3b494a

Browse files
XueruiFaevergreen
authored andcommitted
SERVER-42825 Log and track metrics.repl.stateTransition counters after stopped killing user operation
1 parent fbdd571 commit b3b494a

22 files changed

+356
-87
lines changed

jstests/replsets/drop_databases_two_phase.js

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
load('jstests/replsets/libs/two_phase_drops.js'); // For TwoPhaseDropCollectionTest.
2121
load('jstests/libs/check_log.js');
22+
load("jstests/replsets/rslib.js");
2223

2324
// Returns a list of all collections in a given database. Use 'args' as the
2425
// 'listCollections' command arguments.
@@ -44,11 +45,6 @@ function listCollectionNames(database, args) {
4445
return listCollections(database, args).map(c => c.name);
4546
}
4647

47-
// Sets a fail point on a specified node.
48-
function setFailPoint(node, failpoint, mode) {
49-
assert.commandWorked(node.adminCommand({configureFailPoint: failpoint, mode: mode}));
50-
}
51-
5248
var dbNameToDrop = 'dbToDrop';
5349
var replTest = new ReplSetTest({nodes: [{}, {}, {arbiter: true}]});
5450

@@ -72,7 +68,7 @@ assert.eq(1, collToDrop.find().itcount());
7268
// Pause application on secondary so that commit point doesn't advance, meaning that a dropped
7369
// database on the primary will remain in 'drop-pending' state.
7470
jsTestLog("Pausing oplog application on the secondary node.");
75-
setFailPoint(secondary, "rsSyncApplyStop", "alwaysOn");
71+
setFailPoint(secondary, "rsSyncApplyStop");
7672

7773
// Make sure the collection was created.
7874
assert.contains(collNameToDrop,
@@ -139,7 +135,7 @@ assert.commandFailedWithCode(dbToDrop.adminCommand('restartCatalog'),
139135
// Let the secondary apply the collection drop operation, so that the replica set commit point
140136
// will advance, and the 'Database' phase of the database drop will complete on the primary.
141137
jsTestLog("Restarting oplog application on the secondary node.");
142-
setFailPoint(secondary, "rsSyncApplyStop", "off");
138+
clearFailPoint(secondary, "rsSyncApplyStop");
143139

144140
jsTestLog("Waiting for collection drop operation to replicate to all nodes.");
145141
replTest.awaitReplication();

jstests/replsets/force_sync_source_candidate.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
(function() {
88
"use strict";
99

10+
load("jstests/replsets/rslib.js");
11+
1012
const failpointName = "forceSyncSourceCandidate";
1113

1214
const rst = new ReplSetTest({
@@ -18,15 +20,13 @@ const rst = new ReplSetTest({
1820
});
1921
const nodes = rst.startSet();
2022

21-
function setFailPoint(node, syncSource) {
22-
const dataObj = {hostAndPort: syncSource.host};
23-
assert.commandWorked(
24-
node.adminCommand({configureFailPoint: failpointName, mode: "alwaysOn", data: dataObj}));
23+
function getDataObj(syncSource) {
24+
return {hostAndPort: syncSource.host};
2525
}
2626

27-
setFailPoint(nodes[1], nodes[0]);
28-
setFailPoint(nodes[2], nodes[1]);
29-
setFailPoint(nodes[3], nodes[2]);
27+
setFailPoint(nodes[1], failpointName, getDataObj(nodes[0]));
28+
setFailPoint(nodes[2], failpointName, getDataObj(nodes[1]));
29+
setFailPoint(nodes[3], failpointName, getDataObj(nodes[2]));
3030

3131
rst.initiate();
3232
const primary = rst.getPrimary();

jstests/replsets/kill_reads_with_prepare_conflicts_during_step_down.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ load("jstests/core/txns/libs/prepare_helpers.js");
1212
load("jstests/libs/check_log.js");
1313

1414
// Start one of the nodes with priority: 0 to avoid elections.
15-
var rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0}}]});
15+
const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0}}]});
1616
rst.startSet();
1717
rst.initiate();
1818

@@ -84,7 +84,8 @@ rst.waitForState(primary, ReplSetTest.State.SECONDARY);
8484

8585
// Validate that the read operation got killed during step down.
8686
let replMetrics = assert.commandWorked(primaryAdmin.adminCommand({serverStatus: 1})).metrics.repl;
87-
assert.eq(replMetrics.stepDown.userOperationsKilled, 1);
87+
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepDown");
88+
assert.eq(replMetrics.stateTransition.userOperationsKilled, 1);
8889

8990
// Allow the primary to be re-elected, and wait for it.
9091
assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0}));

jstests/replsets/kills_reads_with_prepare_conflicts_during_stepup.js renamed to jstests/replsets/kill_reads_with_prepare_conflicts_during_step_up.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
load("jstests/core/txns/libs/prepare_helpers.js");
1212
load("jstests/libs/check_log.js");
1313

14-
var rst = new ReplSetTest({nodes: 2});
14+
const rst = new ReplSetTest({nodes: 2});
1515
rst.startSet();
1616

1717
const config = rst.getReplSetConfig();
@@ -108,6 +108,11 @@ rst.waitForState(primary, ReplSetTest.State.SECONDARY);
108108

109109
primary = rst.getPrimary();
110110

111+
// Validate that the read operation got killed during step up.
112+
let replMetrics = assert.commandWorked(primary.adminCommand({serverStatus: 1})).metrics.repl;
113+
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepUp");
114+
assert.eq(replMetrics.stateTransition.userOperationsKilled, 1);
115+
111116
// Make sure we can successfully commit the prepared transaction.
112117
jsTestLog("Restoring shell session state");
113118
session = PrepareHelpers.createSessionWithGivenId(primary, sessionID);

jstests/replsets/no_disconnect_on_stepdown.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,10 @@ function runStepDownTest({description, failpoint, operation, errorCode}) {
6969

7070
// Validate the number of operations killed on step down and number of failed unacknowledged
7171
// writes resulted in network disconnection.
72-
let replMetrics =
72+
const replMetrics =
7373
assert.commandWorked(primaryAdmin.adminCommand({serverStatus: 1})).metrics.repl;
74-
assert.eq(replMetrics.stepDown.userOperationsKilled, 1);
74+
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepDown");
75+
assert.eq(replMetrics.stateTransition.userOperationsKilled, 1);
7576
assert.eq(replMetrics.network.notMasterUnacknowledgedWrites, 0);
7677

7778
// Allow the primary to be re-elected, and wait for it.

jstests/replsets/read_operations_during_rollback.js

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,16 @@
11
/*
22
* This test makes sure 'find' and 'getMore' commands fail correctly during rollback.
3+
* @tags: [requires_majority_read_concern]
34
*/
45
(function() {
56
"use strict";
67

78
load("jstests/replsets/libs/rollback_test.js");
9+
load("jstests/replsets/rslib.js");
810

911
const dbName = "test";
1012
const collName = "coll";
1113

12-
let setFailPoint = (node, failpoint) => {
13-
jsTestLog("Setting fail point " + failpoint);
14-
assert.commandWorked(node.adminCommand({configureFailPoint: failpoint, mode: "alwaysOn"}));
15-
};
16-
17-
let clearFailPoint = (node, failpoint) => {
18-
jsTestLog("Clearing fail point " + failpoint);
19-
assert.commandWorked(node.adminCommand({configureFailPoint: failpoint, mode: "off"}));
20-
};
21-
2214
// Set up Rollback Test.
2315
let rollbackTest = new RollbackTest();
2416

@@ -91,6 +83,15 @@ clearFailPoint(rollbackNode, "rollbackHangAfterTransitionToRollback");
9183

9284
rollbackTest.transitionToSteadyStateOperations();
9385

86+
const replMetrics = assert.commandWorked(rollbackNode.adminCommand({serverStatus: 1})).metrics.repl;
87+
assert.eq(replMetrics.stateTransition.lastStateTransition, "rollback");
88+
assert(replMetrics.stateTransition.userOperationsRunning,
89+
() => "Response should have a 'stateTransition.userOperationsRunning' field: " +
90+
tojson(replMetrics));
91+
assert(replMetrics.stateTransition.userOperationsKilled,
92+
() => "Response should have a 'stateTransition.userOperationsKilled' field: " +
93+
tojson(replMetrics));
94+
9495
// Check the replica set.
9596
rollbackTest.stop();
9697
}());

jstests/replsets/read_operations_during_step_down.js

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
load("jstests/libs/check_log.js");
66
load('jstests/libs/parallelTester.js');
77
load("jstests/libs/curop_helpers.js"); // for waitForCurOpByFailPoint().
8+
load("jstests/replsets/rslib.js");
89

910
(function() {
1011

@@ -80,17 +81,13 @@ checkLog.contains(primary, "Starting to kill user operations");
8081

8182
// Enable "waitAfterCommandFinishesExecution" fail point to make sure the find and get more
8283
// commands on database 'test' does not complete before step down.
83-
assert.commandWorked(primaryAdmin.runCommand({
84-
configureFailPoint: "waitAfterCommandFinishesExecution",
85-
data: {ns: primaryColl.getFullName(), commands: ["find", "getMore"]},
86-
mode: "alwaysOn"
87-
}));
84+
setFailPoint(primaryAdmin,
85+
"waitAfterCommandFinishesExecution",
86+
{ns: collNss, commands: ["find", "getMore"]});
8887

8988
jsTestLog("4. Disable fail points");
90-
assert.commandWorked(
91-
primaryAdmin.runCommand({configureFailPoint: "waitInFindBeforeMakingBatch", mode: "off"}));
92-
assert.commandWorked(primaryAdmin.runCommand(
93-
{configureFailPoint: "waitAfterPinningCursorBeforeGetMoreBatch", mode: "off"}));
89+
clearFailPoint(primaryAdmin, "waitInFindBeforeMakingBatch");
90+
clearFailPoint(primaryAdmin, "waitAfterPinningCursorBeforeGetMoreBatch");
9491

9592
// Wait until the primary transitioned to SECONDARY state.
9693
joinStepDownThread();
@@ -99,8 +96,7 @@ rst.waitForState(primary, ReplSetTest.State.SECONDARY);
9996
// We don't want to check if we have reached "waitAfterCommandFinishesExecution" fail point
10097
// because we already know that the primary has stepped down successfully. This implies that
10198
// the find and get more commands are still running even after the node stepped down.
102-
assert.commandWorked(primaryAdmin.runCommand(
103-
{configureFailPoint: "waitAfterCommandFinishesExecution", mode: "off"}));
99+
clearFailPoint(primaryAdmin, "waitAfterCommandFinishesExecution");
104100

105101
// Wait for find & getmore thread to join.
106102
joinGetMoreThread();
@@ -113,10 +109,11 @@ assert.docEq([{_id: 0}], getMoreRes.cursor.nextBatch);
113109

114110
// Validate that no operations got killed on step down and no network disconnection happened due
115111
// to failed unacknowledged operations.
116-
let replMetrics = assert.commandWorked(primaryAdmin.adminCommand({serverStatus: 1})).metrics.repl;
117-
assert.eq(replMetrics.stepDown.userOperationsKilled, 0);
112+
const replMetrics = assert.commandWorked(primaryAdmin.adminCommand({serverStatus: 1})).metrics.repl;
113+
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepDown");
114+
assert.eq(replMetrics.stateTransition.userOperationsKilled, 0);
118115
// Should account for find and getmore commands issued before step down.
119-
assert.gte(replMetrics.stepDown.userOperationsRunning, 2);
116+
assert.gte(replMetrics.stateTransition.userOperationsRunning, 2);
120117
assert.eq(replMetrics.network.notMasterUnacknowledgedWrites, 0);
121118

122119
rst.stopSet();
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/*
2+
* Test that the read operations are not killed and their connections are also not
3+
* closed during step up.
4+
*/
5+
load("jstests/libs/check_log.js");
6+
load('jstests/libs/parallelTester.js');
7+
load("jstests/libs/curop_helpers.js"); // for waitForCurOpByFailPoint().
8+
load("jstests/replsets/rslib.js");
9+
10+
(function() {
11+
12+
"use strict";
13+
14+
const testName = jsTestName();
15+
const dbName = "test";
16+
const collName = "coll";
17+
18+
const rst = new ReplSetTest({name: testName, nodes: 2});
19+
rst.startSet();
20+
rst.initiateWithHighElectionTimeout();
21+
22+
const primary = rst.getPrimary();
23+
const primaryDB = primary.getDB(dbName);
24+
const primaryColl = primaryDB[collName];
25+
26+
const secondary = rst.getSecondary();
27+
const secondaryDB = secondary.getDB(dbName);
28+
const secondaryAdmin = secondary.getDB("admin");
29+
const secondaryColl = secondaryDB[collName];
30+
const secondaryCollNss = secondaryColl.getFullName();
31+
32+
TestData.dbName = dbName;
33+
TestData.collName = collName;
34+
35+
jsTestLog("1. Do a document write");
36+
assert.commandWorked(
37+
        primaryColl.insert({_id: 0}, {"writeConcern": {"w": "majority"}}));
38+
rst.awaitReplication();
39+
40+
// Open a cursor on secondary.
41+
const cursorIdToBeReadAfterStepUp =
42+
assert.commandWorked(secondaryDB.runCommand({"find": collName, batchSize: 0})).cursor.id;
43+
44+
jsTestLog("2. Start blocking getMore cmd before step up");
45+
const joinGetMoreThread = startParallelShell(() => {
46+
// Open another cursor on secondary before step up.
47+
secondaryDB = db.getSiblingDB(TestData.dbName);
48+
secondaryDB.getMongo().setSlaveOk(true);
49+
50+
const cursorIdToBeReadDuringStepUp =
51+
assert.commandWorked(secondaryDB.runCommand({"find": TestData.collName, batchSize: 0}))
52+
.cursor.id;
53+
54+
// Enable the fail point for get more cmd.
55+
assert.commandWorked(db.adminCommand(
56+
{configureFailPoint: "waitAfterPinningCursorBeforeGetMoreBatch", mode: "alwaysOn"}));
57+
58+
getMoreRes = assert.commandWorked(secondaryDB.runCommand(
59+
{"getMore": cursorIdToBeReadDuringStepUp, collection: TestData.collName}));
60+
assert.docEq([{_id: 0}], getMoreRes.cursor.nextBatch);
61+
}, secondary.port);
62+
63+
// Wait for getmore cmd to reach the fail point.
64+
waitForCurOpByFailPoint(
65+
secondaryAdmin, secondaryCollNss, "waitAfterPinningCursorBeforeGetMoreBatch");
66+
67+
jsTestLog("2. Start blocking find cmd before step up");
68+
const joinFindThread = startParallelShell(() => {
69+
secondaryDB = db.getSiblingDB(TestData.dbName);
70+
secondaryDB.getMongo().setSlaveOk(true);
71+
72+
// Enable the fail point for find cmd.
73+
assert.commandWorked(
74+
db.adminCommand({configureFailPoint: "waitInFindBeforeMakingBatch", mode: "alwaysOn"}));
75+
76+
const findRes = assert.commandWorked(secondaryDB.runCommand({"find": TestData.collName}));
77+
assert.docEq([{_id: 0}], findRes.cursor.firstBatch);
78+
}, secondary.port);
79+
80+
// Wait for find cmd to reach the fail point.
81+
waitForCurOpByFailPoint(secondaryAdmin, secondaryCollNss, "waitInFindBeforeMakingBatch");
82+
83+
jsTestLog("3. Make primary step up");
84+
const joinStepUpThread = startParallelShell(() => {
85+
assert.commandWorked(db.adminCommand({"replSetStepUp": 100, "force": true}));
86+
}, secondary.port);
87+
88+
// Wait until the step up has started to kill user operations.
89+
checkLog.contains(secondary, "Starting to kill user operations");
90+
91+
// Enable "waitAfterCommandFinishesExecution" fail point to make sure the find and get more
92+
// commands on database 'test' does not complete before step up.
93+
setFailPoint(secondaryAdmin,
94+
"waitAfterCommandFinishesExecution",
95+
{ns: secondaryCollNss, commands: ["find", "getMore"]});
96+
97+
jsTestLog("4. Disable fail points");
98+
clearFailPoint(secondaryAdmin, "waitInFindBeforeMakingBatch");
99+
clearFailPoint(secondaryAdmin, "waitAfterPinningCursorBeforeGetMoreBatch");
100+
101+
// Wait until the secondary transitioned to PRIMARY state.
102+
joinStepUpThread();
103+
rst.waitForState(secondary, ReplSetTest.State.PRIMARY);
104+
105+
// We don't want to check if we have reached "waitAfterCommandFinishesExecution" fail point
106+
// because we already know that the secondary has stepped up successfully. This implies that
107+
// the find and get more commands are still running even after the node stepped up.
108+
clearFailPoint(secondaryAdmin, "waitAfterCommandFinishesExecution");
109+
110+
// Wait for find & getmore thread to join.
111+
joinGetMoreThread();
112+
joinFindThread();
113+
114+
jsTestLog("5. Start get more cmd after step up");
115+
const getMoreRes = assert.commandWorked(
116+
secondaryDB.runCommand({"getMore": cursorIdToBeReadAfterStepUp, collection: collName}));
117+
assert.docEq([{_id: 0}], getMoreRes.cursor.nextBatch);
118+
119+
// Validate that no operations got killed on step up and no network disconnection happened due
120+
// to failed unacknowledged operations.
121+
const replMetrics =
122+
assert.commandWorked(secondaryAdmin.adminCommand({serverStatus: 1})).metrics.repl;
123+
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepUp");
124+
assert.eq(replMetrics.stateTransition.userOperationsKilled, 0);
125+
// Should account for find and getmore commands issued before step up.
126+
assert.gte(replMetrics.stateTransition.userOperationsRunning, 2);
127+
assert.eq(replMetrics.network.notMasterUnacknowledgedWrites, 0);
128+
129+
rst.stopSet();
130+
})();

jstests/replsets/rslib.js

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ var awaitRSClientHosts;
1515
var getLastOpTime;
1616
var setLogVerbosity;
1717
var stopReplicationAndEnforceNewPrimaryToCatchUp;
18+
var setFailPoint;
19+
var clearFailPoint;
1820

1921
(function() {
2022
"use strict";
@@ -488,4 +490,21 @@ stopReplicationAndEnforceNewPrimaryToCatchUp = function(rst, node) {
488490
latestOpOnNewPrimary: latestOpOnNewPrimary
489491
};
490492
};
493+
494+
/**
495+
* Sets the specified failpoint to 'alwaysOn' on the node.
496+
*/
497+
setFailPoint = function(node, failpoint, data = {}) {
498+
jsTestLog("Setting fail point " + failpoint);
499+
assert.commandWorked(
500+
node.adminCommand({configureFailPoint: failpoint, mode: "alwaysOn", data}));
501+
};
502+
503+
/**
504+
* Sets the specified failpoint to 'off' on the node.
505+
*/
506+
clearFailPoint = function(node, failpoint) {
507+
jsTestLog("Clearing fail point " + failpoint);
508+
assert.commandWorked(node.adminCommand({configureFailPoint: failpoint, mode: "off"}));
509+
};
491510
}());

jstests/replsets/step_down_on_secondary.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ joinReadThread();
124124

125125
// Validate that the read operation got killed during step down.
126126
const replMetrics = assert.commandWorked(primary.adminCommand({serverStatus: 1})).metrics.repl;
127-
assert.eq(replMetrics.stepDown.userOperationsKilled, 1, replMetrics);
127+
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepDown");
128+
assert.eq(replMetrics.stateTransition.userOperationsKilled, 1, replMetrics);
128129

129130
jsTestLog("Check nodes have correct data");
130131
assert.docEq(newPrimary.getDB(dbName)[collName].find({_id: 0}).toArray(), [{_id: 0, b: 1}]);

0 commit comments

Comments
 (0)