Skip to content

Commit 310951b

Browse files
craig[bot] and tbg
committed
107613: kvserver: avoid moving parts in closedts tests r=erikgrinaker a=tbg Many tests (two just this last week[^1][^2]) are flaky because the replicate queue can make rebalancing decisions that undermine the state the test is trying to set up. Often, this happens "accidentally" because ReplicationAuto is our default ReplicationMode. This PR improves the situation at least for closed timestamp integration tests by switching them all over to `ReplicationManual` (and preventing any new ones from accidentally using `ReplicationAuto` in the future). This can be seen as a small step towards cockroachdb#107528, which I am increasingly convinced is an ocean worth boiling. [^1]: cockroachdb#107179 [^2]: cockroachdb#101824 Epic: None Release note: None Co-authored-by: Tobias Grieger <[email protected]>
2 parents 57b6b70 + e3e9b77 commit 310951b

File tree

3 files changed

+56
-37
lines changed

3 files changed

+56
-37
lines changed

pkg/kv/kvserver/client_rangefeed_test.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,17 @@ func TestRangefeedIsRoutedToNonVoter(t *testing.T) {
221221
defer log.Scope(t).Close(t)
222222

223223
ctx := context.Background()
224-
clusterArgs := aggressiveResolvedTimestampClusterArgs
225-
// We want to manually add a non-voter to a range in this test, so disable
226-
// the replicateQueue to prevent it from disrupting the test.
227-
clusterArgs.ReplicationMode = base.ReplicationManual
224+
clusterArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
228225
// NB: setupClusterForClosedTSTesting sets a low closed timestamp target
229226
// duration.
227+
// NB: the replicate queue is disabled in this test, so we can manually add a
228+
// non-voter to a range without being disrupted.
230229
tc, _, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, clusterArgs, "cttest", "kv")
231230
defer tc.Stopper().Stop(ctx)
232-
tc.AddNonVotersOrFatal(t, desc.StartKey.AsRawKey(), tc.Target(1))
231+
// This test doesn't want the default voters on s2 and s3. We want only
232+
// the voter on s1 and a non-voter on s2.
233+
desc = tc.RemoveVotersOrFatal(t, desc.StartKey.AsRawKey(), tc.Target(1), tc.Target(2))
234+
desc = tc.AddNonVotersOrFatal(t, desc.StartKey.AsRawKey(), tc.Target(1))
233235

234236
db := tc.Server(1).DB()
235237
ds := tc.Server(1).DistSenderI().(*kvcoord.DistSender)
@@ -269,7 +271,8 @@ func TestRangefeedIsRoutedToNonVoter(t *testing.T) {
269271
}
270272
rangefeedCancel()
271273
require.Regexp(t, "context canceled", <-rangefeedErrChan)
272-
require.Regexp(t, "attempting to create a RangeFeed over replica.*2NON_VOTER", getRecAndFinish().String())
274+
require.Regexp(t, `attempting to create a RangeFeed over replica .n2,s2.:\dNON_VOTER`,
275+
getRecAndFinish().String())
273276
}
274277

275278
// TestRangefeedWorksOnLivenessRange ensures that a rangefeed works as expected

pkg/kv/kvserver/closed_timestamp_test.go

Lines changed: 43 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ import (
5656
"golang.org/x/sync/errgroup"
5757
)
5858

59-
var aggressiveResolvedTimestampClusterArgs = base.TestClusterArgs{
59+
var aggressiveResolvedTimestampManuallyReplicatedClusterArgs = base.TestClusterArgs{
60+
ReplicationMode: base.ReplicationManual,
6061
ServerArgs: base.TestServerArgs{
6162
RaftConfig: base.RaftConfig{
6263
// With expiration-based leases, we may be unable to maintain leases under
@@ -81,12 +82,13 @@ func TestClosedTimestampCanServe(t *testing.T) {
8182
testutils.RunTrueAndFalse(t, "withNonVoters", func(t *testing.T, withNonVoters bool) {
8283
ctx := context.Background()
8384
dbName, tableName := "cttest", "kv"
84-
clusterArgs := aggressiveResolvedTimestampClusterArgs
85-
// Disable the replicateQueue so that it doesn't interfere with replica
86-
// membership ranges.
87-
clusterArgs.ReplicationMode = base.ReplicationManual
85+
// NB: replicate queue is disabled here, so won't interfere with our manual
86+
// changes.
87+
clusterArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
8888
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, clusterArgs, dbName, tableName)
8989
defer tc.Stopper().Stop(ctx)
90+
// This test doesn't want the range 3x replicated. Move back to 1x.
91+
desc = tc.RemoveVotersOrFatal(t, desc.StartKey.AsRawKey(), tc.Target(1), tc.Target(2))
9092

9193
if _, err := db0.Exec(`INSERT INTO cttest.kv VALUES(1, $1)`, "foo"); err != nil {
9294
t.Fatal(err)
@@ -152,12 +154,14 @@ func TestClosedTimestampCanServeOnVoterIncoming(t *testing.T) {
152154

153155
ctx := context.Background()
154156
dbName, tableName := "cttest", "kv"
155-
clusterArgs := aggressiveResolvedTimestampClusterArgs
156-
clusterArgs.ReplicationMode = base.ReplicationManual
157+
clusterArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
157158
knobs, ltk := makeReplicationTestKnobs()
158159
clusterArgs.ServerArgs.Knobs = knobs
159160
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, clusterArgs, dbName, tableName)
160161
defer tc.Stopper().Stop(ctx)
162+
// This test actually doesn't want the range to be 3x replicated. Move it
163+
// back down to one replica.
164+
desc = tc.RemoveVotersOrFatal(t, desc.StartKey.AsRawKey(), tc.Target(1), tc.Target(2))
161165

162166
if _, err := db0.Exec(`INSERT INTO cttest.kv VALUES(1, $1)`, "foo"); err != nil {
163167
t.Fatal(err)
@@ -192,7 +196,8 @@ func TestClosedTimestampCanServeThroughoutLeaseTransfer(t *testing.T) {
192196
skip.UnderRace(t)
193197

194198
ctx := context.Background()
195-
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
199+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
200+
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
196201
defer tc.Stopper().Stop(ctx)
197202
repls := replsForRange(ctx, t, tc, desc)
198203

@@ -270,7 +275,8 @@ func TestClosedTimestampCantServeWithConflictingIntent(t *testing.T) {
270275
defer txnwait.TestingOverrideTxnLivenessThreshold(time.Hour)()
271276

272277
ctx := context.Background()
273-
tc, _, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
278+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
279+
tc, _, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
274280
defer tc.Stopper().Stop(ctx)
275281
repls := replsForRange(ctx, t, tc, desc)
276282
ds := tc.Server(0).DistSenderI().(*kvcoord.DistSender)
@@ -377,7 +383,8 @@ func TestClosedTimestampCanServeAfterSplitAndMerges(t *testing.T) {
377383
skip.UnderRace(t)
378384

379385
ctx := context.Background()
380-
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
386+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
387+
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
381388
repls := replsForRange(ctx, t, tc, desc)
382389
// Disable the automatic merging.
383390
if _, err := db0.Exec("SET CLUSTER SETTING kv.range_merge.queue_enabled = false"); err != nil {
@@ -457,7 +464,8 @@ func TestClosedTimestampCantServeBasedOnUncertaintyLimit(t *testing.T) {
457464
ctx := context.Background()
458465
// Set up the target duration to be very long and rely on lease transfers to
459466
// drive MaxClosed.
460-
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
467+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
468+
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
461469
defer tc.Stopper().Stop(ctx)
462470
repls := replsForRange(ctx, t, tc, desc)
463471

@@ -490,7 +498,8 @@ func TestClosedTimestampCanServeForWritingTransaction(t *testing.T) {
490498
skip.UnderRace(t)
491499

492500
ctx := context.Background()
493-
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
501+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
502+
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
494503
defer tc.Stopper().Stop(ctx)
495504
repls := replsForRange(ctx, t, tc, desc)
496505

@@ -537,7 +546,8 @@ func TestClosedTimestampCantServeForNonTransactionalReadRequest(t *testing.T) {
537546
skip.UnderRace(t)
538547

539548
ctx := context.Background()
540-
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
549+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
550+
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
541551
defer tc.Stopper().Stop(ctx)
542552
repls := replsForRange(ctx, t, tc, desc)
543553

@@ -579,7 +589,8 @@ func TestClosedTimestampCantServeForNonTransactionalBatch(t *testing.T) {
579589

580590
testutils.RunTrueAndFalse(t, "tsFromServer", func(t *testing.T, tsFromServer bool) {
581591
ctx := context.Background()
582-
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
592+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
593+
tc, db0, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
583594
defer tc.Stopper().Stop(ctx)
584595
repls := replsForRange(ctx, t, tc, desc)
585596

@@ -683,6 +694,7 @@ func TestClosedTimestampFrozenAfterSubsumption(t *testing.T) {
683694
kvserver.ExpirationLeasesOnly.Override(ctx, &cs.SV, false) // override metamorphism
684695

685696
clusterArgs := base.TestClusterArgs{
697+
ReplicationMode: base.ReplicationManual,
686698
ServerArgs: base.TestServerArgs{
687699
Settings: cs,
688700
RaftConfig: base.RaftConfig{
@@ -1158,9 +1170,13 @@ func pickRandomTarget(
11581170
tc serverutils.TestClusterInterface, lh roachpb.ReplicationTarget, desc roachpb.RangeDescriptor,
11591171
) (t roachpb.ReplicationTarget) {
11601172
for {
1161-
if t = tc.Target(rand.Intn(len(desc.InternalReplicas))); t != lh {
1173+
n := len(desc.InternalReplicas)
1174+
if t = tc.Target(rand.Intn(n)); t != lh {
11621175
return t
11631176
}
1177+
if n <= 1 {
1178+
panic(fmt.Sprintf("the only target in %v is the leaseholder %v", desc, lh))
1179+
}
11641180
}
11651181
}
11661182

@@ -1184,20 +1200,23 @@ func aggressiveResolvedTimestampPushKnobs() *kvserver.StoreTestingKnobs {
11841200
}
11851201
}
11861202

1187-
// setupClusterForClosedTSTesting creates a test cluster that is prepared to
1188-
// exercise follower reads. The returned test cluster has follower reads enabled
1189-
// using the given targetDuration and testingCloseFraction. In addition to the
1190-
// newly minted test cluster, this function returns a db handle to node 0, a
1191-
// range descriptor for the range used by the table `{dbName}.{tableName}`. It
1192-
// is the caller's responsibility to Stop the Stopper on the returned test
1193-
// cluster when done.
1203+
// setupClusterForClosedTSTesting creates a three node test cluster that is
1204+
// prepared to exercise follower reads. The returned test cluster has follower
1205+
// reads enabled using the given targetDuration and testingCloseFraction. In
1206+
// addition to the newly minted test cluster, this function returns a db handle
1207+
// to node 0, a range descriptor for the range used by the table
1208+
// `{dbName}.{tableName}`. It is the caller's responsibility to Stop the Stopper
1209+
// on the returned test cluster when done. The range represented by kvTableDesc
1210+
// will be fully replicated; other ranges won't be by default.
11941211
func setupClusterForClosedTSTesting(
11951212
ctx context.Context,
11961213
t *testing.T,
11971214
targetDuration, sideTransportInterval time.Duration,
11981215
clusterArgs base.TestClusterArgs,
11991216
dbName, tableName string,
12001217
) (tc serverutils.TestClusterInterface, db0 *gosql.DB, kvTableDesc roachpb.RangeDescriptor) {
1218+
require.Equal(t, base.ReplicationManual, clusterArgs.ReplicationMode)
1219+
12011220
const numNodes = 3
12021221
if sideTransportInterval == 0 {
12031222
sideTransportInterval = targetDuration / 4
@@ -1212,10 +1231,8 @@ SET CLUSTER SETTING kv.allocator.load_based_rebalancing = 'off';
12121231
`, targetDuration, sideTransportInterval),
12131232
";")...)
12141233

1215-
// Disable replicate queues to avoid errant lease transfers.
1216-
//
1217-
// See: https://github.com/cockroachdb/cockroach/issues/101824.
1218-
tc.ToggleReplicateQueues(false)
1234+
desc = tc.AddVotersOrFatal(t, desc.StartKey.AsRawKey(), tc.Target(1), tc.Target(2))
1235+
require.EqualValues(t, 3, numNodes) // reminder to update the AddVotersOrFatal if numNodes ever changed
12191236

12201237
return tc, tc.ServerConn(0), desc
12211238
}

pkg/kv/kvserver/replica_rangefeed_test.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,7 +1116,8 @@ func TestReplicaRangefeedPushesTransactions(t *testing.T) {
11161116
defer log.Scope(t).Close(t)
11171117

11181118
ctx := context.Background()
1119-
tc, db, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, aggressiveResolvedTimestampClusterArgs, "cttest", "kv")
1119+
cArgs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
1120+
tc, db, desc := setupClusterForClosedTSTesting(ctx, t, testingTargetDuration, 0, cArgs, "cttest", "kv")
11201121
defer tc.Stopper().Stop(ctx)
11211122
repls := replsForRange(ctx, t, tc, desc)
11221123

@@ -1245,8 +1246,7 @@ func TestRangefeedCheckpointsRecoverFromLeaseExpiration(t *testing.T) {
12451246
st := cluster.MakeTestingClusterSettings()
12461247
kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // override metamorphism
12471248

1248-
cargs := aggressiveResolvedTimestampClusterArgs
1249-
cargs.ReplicationMode = base.ReplicationManual
1249+
cargs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
12501250
manualClock := hlc.NewHybridManualClock()
12511251
cargs.ServerArgs = base.TestServerArgs{
12521252
Settings: st,
@@ -1430,8 +1430,7 @@ func TestNewRangefeedForceLeaseRetry(t *testing.T) {
14301430

14311431
var timeoutSimulated bool
14321432

1433-
cargs := aggressiveResolvedTimestampClusterArgs
1434-
cargs.ReplicationMode = base.ReplicationManual
1433+
cargs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
14351434
manualClock := hlc.NewHybridManualClock()
14361435
cargs.ServerArgs = base.TestServerArgs{
14371436
Knobs: base.TestingKnobs{

0 commit comments

Comments
 (0)