
Commit 9bf71dd

craig[bot] and tbg committed

156540: kvserver: deflake and improve obs in TestPromoteNonVoterInAddVoter r=tbg a=tbg

This attempts to finally put cockroachdb#156530 to rest. See individual commits, but the broad strokes are that we repeatedly force-process the split queue (and not only the replicate queue), to make sure the range can quickly have the span config applied to it. I also added a bunch of span config- and descriptor-related debug output that should make it clear where the problem is, should this test fail again.

Unfortunately, reproducing failures in this test seems to be difficult, though others seem to have managed.

Closes cockroachdb#156530.

Epic: none

Co-authored-by: Tobias Grieger <[email protected]>

2 parents 9bcc3fd + 28c661c, commit 9bf71dd
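In code, the gist of the fix is that every convergence check in the test now force-processes both queues before re-checking the replica counts. A rough sketch of that retry shape, using the helper introduced in the diff below (checkReplicaLayout is an illustrative placeholder, not a function from this commit):

// Sketch of the deflaked retry loop: force both the replicate and the split
// queue on every store, so a pending split (needed before the span config can
// apply to the range) cannot stall the replication changes the test waits for.
testutils.SucceedsSoon(t, func() error {
	if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
		return err
	}
	return checkReplicaLayout() // placeholder for the voter/non-voter assertions
})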

3 files changed: +67 −36 lines

pkg/kv/kvserver/lease_queue.go

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ func (lq *leaseQueue) process(
 
 	if transferOp, ok := change.Op.(plan.AllocationTransferLeaseOp); ok {
 		lease, _ := repl.GetLease()
-		log.KvDistribution.Infof(ctx, "transferring lease to s%d usage=%v, lease=[%v type=%v]", transferOp.Target, transferOp.Usage, lease, lease.Type())
+		log.KvDistribution.Infof(ctx, "transferring lease to %d usage=%v, lease=[%v type=%v]", transferOp.Target, transferOp.Usage, lease, lease.Type())
 		lq.lastLeaseTransfer.Store(timeutil.Now())
 		changeID := lq.as.NonMMAPreTransferLease(
 			ctx,

pkg/kv/kvserver/replicate_queue_test.go

Lines changed: 63 additions & 32 deletions
@@ -41,7 +41,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/spanconfig"
-	"github.com/cockroachdb/cockroach/pkg/spanconfig/spanconfigstore"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/bootstrap"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
@@ -618,7 +617,7 @@ func checkReplicaCount(
 	rangeDesc *roachpb.RangeDescriptor,
 	voterCount, nonVoterCount int,
 ) (bool, error) {
-	err := forceScanOnAllReplicationQueues(tc)
+	err := forceScanOnAllReplicationAndSplitQueues(tc)
 	if err != nil {
 		log.KvDistribution.Infof(ctx, "store.ForceReplicationScanAndProcess() failed with: %s", err)
 		return false, err
@@ -1544,7 +1543,7 @@ func TestReplicateQueueSwapVotersWithNonVoters(t *testing.T) {
 		" num_replicas=5, num_voters=1")
 	require.NoError(t, err)
 	testutils.SucceedsSoon(t, func() error {
-		if err := forceScanOnAllReplicationQueues(tc); err != nil {
+		if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
 			return err
 		}
 		scratchRange := tc.LookupRangeOrFatal(t, scratchKey)
@@ -1559,7 +1558,7 @@ func TestReplicateQueueSwapVotersWithNonVoters(t *testing.T) {
 
 	checkRelocated := func(t *testing.T, voterStores, nonVoterStores []roachpb.StoreID) {
 		testutils.SucceedsSoon(t, func() error {
-			if err := forceScanOnAllReplicationQueues(tc); err != nil {
+			if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
 				return err
 			}
 			scratchRange := tc.LookupRangeOrFatal(t, scratchKey)
@@ -1665,7 +1664,7 @@ func TestReplicateQueueShouldQueueNonVoter(t *testing.T) {
 	// Make sure that the range has conformed to the constraints we just set
 	// above.
 	require.Eventually(t, func() bool {
-		if err := forceScanOnAllReplicationQueues(tc); err != nil {
+		if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
 			log.KvDistribution.Warningf(ctx, "received error while forcing a replicateQueue scan: %s", err)
 			return false
 		}
@@ -1785,13 +1784,24 @@ func toggleReplicationQueues(tc *testcluster.TestCluster, active bool) {
 	}
 }
 
-func forceScanOnAllReplicationQueues(tc *testcluster.TestCluster) (err error) {
+func forceScanOnAllReplicationAndSplitQueues(tc *testcluster.TestCluster) (err error) {
+	// We force scans on the replication and split queues because sometimes splits
+	// are necessary to apply zone config changes, which then lead to further
+	// replication decisions. See #156530 for an example of this.
 	for _, s := range tc.Servers {
-		err = s.GetStores().(*kvserver.Stores).VisitStores(func(store *kvserver.Store) error {
-			return store.ForceReplicationScanAndProcess()
-		})
+		if err := s.GetStores().(*kvserver.Stores).VisitStores(func(store *kvserver.Store) error {
+			if err := store.ForceReplicationScanAndProcess(); err != nil {
+				return err
+			}
+			if err := store.ForceSplitScanAndProcess(); err != nil {
+				return err
+			}
+			return nil
+		}); err != nil {
+			return err
+		}
 	}
-	return err
+	return nil
 }
 
 func toggleSplitQueues(tc *testcluster.TestCluster, active bool) {
@@ -2189,20 +2199,13 @@ func TestPromoteNonVoterInAddVoter(t *testing.T) {
 	defer testutils.StartExecTrace(t, scope.GetDirectory()).Finish(t)
 
 	ctx := context.Background()
-	st := cluster.MakeTestingClusterSettings()
-	// NB: Ensure that tables created by the test start off on their own range.
-	// This ensures that any span config changes made by the test don't have to
-	// first induce a split, which is a known source of flakiness.
-	spanconfigstore.StorageCoalesceAdjacentSetting.Override(ctx, &st.SV, false)
-	spanconfigstore.TenantCoalesceAdjacentSetting.Override(ctx, &st.SV, false)
 
 	// Create 7 stores: 3 in Region 1, 2 in Region 2, and 2 in Region 3.
 	const numNodes = 7
 	serverArgs := make(map[int]base.TestServerArgs)
 	regions := [numNodes]int{1, 1, 1, 2, 2, 3, 3}
 	for i := 0; i < numNodes; i++ {
 		serverArgs[i] = base.TestServerArgs{
-			Settings: st,
 			Locality: roachpb.Locality{
 				Tiers: []roachpb.Tier{
 					{
@@ -2237,7 +2240,7 @@ func TestPromoteNonVoterInAddVoter(t *testing.T) {
 	setConstraintFn("RANGE meta", 7, 7, "")
 	setConstraintFn("RANGE default", 7, 7, "")
 	testutils.SucceedsSoon(t, func() error {
-		if err := forceScanOnAllReplicationQueues(tc); err != nil {
+		if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
 			return err
 		}
 		s, err := sqlutils.RowsToDataDrivenOutput(sqlutils.MakeSQLRunner(tc.Conns[0]).Query(t, `
@@ -2271,36 +2274,64 @@ SELECT * FROM (
 		t *testing.T,
 		tc *testcluster.TestCluster,
 		db *gosql.DB,
-	) (numVoters, numNonVoters int, err error) {
-		if err := forceScanOnAllReplicationQueues(tc); err != nil {
-			return 0, 0, err
+	) (numVoters, numNonVoters int, _ roachpb.RangeDescriptor, err error) {
+		if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
+			return 0, 0, roachpb.RangeDescriptor{}, err
 		}
 
-		var rangeID roachpb.RangeID
-		if err := db.QueryRow("SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1").Scan(&rangeID); err != nil {
-			return 0, 0, err
+		var key roachpb.Key
+		require.NoError(t,
+			db.QueryRow(`
+SELECT start_key from crdb_internal.ranges_no_leases WHERE range_id IN
+	(SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1);
+`).Scan(&key))
+		desc, err := tc.LookupRange(key)
+		if err != nil {
+			return 0, 0, roachpb.RangeDescriptor{}, err
 		}
+
 		iterateOverAllStores(t, tc, func(s *kvserver.Store) error {
-			if replica, err := s.GetReplica(rangeID); err == nil && replica.OwnsValidLease(ctx, replica.Clock().NowAsClockTimestamp()) {
+			if replica, err := s.GetReplica(desc.RangeID); err == nil && replica.OwnsValidLease(ctx,
+				replica.Clock().NowAsClockTimestamp()) {
 				desc := replica.Desc()
 				numVoters = len(desc.Replicas().VoterDescriptors())
 				numNonVoters = len(desc.Replicas().NonVoterDescriptors())
 			}
 			return nil
 		})
-		return numVoters, numNonVoters, nil
+		return numVoters, numNonVoters, desc, nil
 	}
 
 	// Ensure we are meeting our ZONE survival configuration.
+	logMore := time.After(15 * time.Second)
 	testutils.SucceedsSoon(t, func() error {
-		numVoters, numNonVoters, err := computeNumberOfReplicas(t, tc, db)
+		numVoters, numNonVoters, desc, err := computeNumberOfReplicas(t, tc, db)
+
 		require.NoError(t, err)
+		select {
+		default:
+		case <-logMore:
+			// If the retry loop has been stuck for a while, log the span config
+			// as seen by each store. For it to apply, the range's span needs to
+			// be contained in the start key's span config. If this is not the
			// case, it can explain why the replication changes are not being made.
+			iterateOverAllStores(t, tc, func(s *kvserver.Store) error {
+				cfg, sp, err := s.GetStoreConfig().SpanConfigSubscriber.GetSpanConfigForKey(ctx, desc.StartKey)
+				if err != nil {
+					return err
+				}
+				t.Logf("s%d: r%d %s -> span config %s %s", s.StoreID(), desc.RangeID, desc.RSpan(), sp, &cfg)
+				return nil
+			})
+		}
+
 		if numVoters != 3 {
-			return errors.Newf("expected 3 voters; got %d", numVoters)
+			return errors.Newf("expected 3 voters for r%d; got %d", desc.RangeID, numVoters)
 		}
 		if numNonVoters != 2 {
-			return errors.Newf("expected 2 non-voters; got %v", numNonVoters)
+			return errors.Newf("expected 2 non-voters for r%d; got %v", desc.RangeID, numNonVoters)
 		}
+		t.Logf("success: %d has %d voters and %d non-voters", desc.RangeID, numVoters, numNonVoters)
 		return nil
 	})
 
@@ -2316,13 +2347,13 @@ SELECT * FROM (
 
 	// Ensure we are meeting our REGION survival configuration.
 	testutils.SucceedsSoon(t, func() error {
-		numVoters, numNonVoters, err := computeNumberOfReplicas(t, tc, db)
+		numVoters, numNonVoters, desc, err := computeNumberOfReplicas(t, tc, db)
 		require.NoError(t, err)
 		if numVoters != 5 {
-			return errors.Newf("expected 5 voters; got %d", numVoters)
+			return errors.Newf("expected 5 voters for r%d; got %d", desc.RangeID, numVoters)
 		}
 		if numNonVoters != 0 {
-			return errors.Newf("expected 0 non-voters; got %v", numNonVoters)
+			return errors.Newf("expected 0 non-voters for r%d; got %v", desc.RangeID, numNonVoters)
 		}
 		return nil
 	})
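A note on the new logMore diagnostics above: time.After combined with a select that has a default case is a cheap way to stay silent while a retry loop is healthy and to dump extra state once it has been stuck past a deadline. A self-contained sketch of the pattern (standard library only; waitWithLateDiagnostics, check, and diagnose are illustrative names, not CockroachDB APIs):

package retrydiag

import (
	"log"
	"time"
)

// waitWithLateDiagnostics polls check until it returns nil. Once the loop has
// been stuck for longer than patience, diagnose runs a single time: after the
// timer channel fires and is drained, later iterations fall through to the
// default case again. This mirrors the logMore pattern added to
// TestPromoteNonVoterInAddVoter above.
func waitWithLateDiagnostics(check func() error, diagnose func(), patience time.Duration) {
	logMore := time.After(patience)
	for {
		err := check()
		select {
		default: // still within the patience window (or already diagnosed): stay quiet
		case <-logMore:
			diagnose()
		}
		if err == nil {
			return
		}
		log.Printf("condition not met yet, retrying: %v", err)
		time.Sleep(100 * time.Millisecond)
	}
}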

pkg/spanconfig/spanconfigstore/span_store.go

Lines changed: 3 additions & 3 deletions
@@ -19,10 +19,10 @@ import (
 	"github.com/cockroachdb/errors"
 )
 
-// TenantCoalesceAdjacentSetting is a public cluster setting that
+// tenantCoalesceAdjacentSetting is a public cluster setting that
 // controls whether we coalesce adjacent ranges across all secondary
 // tenant keyspaces if they have the same span config.
-var TenantCoalesceAdjacentSetting = settings.RegisterBoolSetting(
+var tenantCoalesceAdjacentSetting = settings.RegisterBoolSetting(
 	settings.SystemOnly,
 	"spanconfig.tenant_coalesce_adjacent.enabled",
 	`collapse adjacent ranges with the same span configs across all secondary tenant keyspaces`,
@@ -157,7 +157,7 @@ func (s *spanConfigStore) computeSplitKey(
 			return roachpb.RKey(match.span.Key), nil
 		}
 	} else {
-		if !TenantCoalesceAdjacentSetting.Get(&s.settings.SV) {
+		if !tenantCoalesceAdjacentSetting.Get(&s.settings.SV) {
 			return roachpb.RKey(match.span.Key), nil
 		}
 	}
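Since tenantCoalesceAdjacentSetting is now unexported, the override that TestPromoteNonVoterInAddVoter used to perform from outside the package (removed in the test diff above) is no longer possible; only code inside spanconfigstore can flip it. A minimal sketch of what such an in-package override would look like, reusing the Override call shown in the removed test lines (illustrative, not code from this commit):

// Hypothetical test snippet inside package spanconfigstore.
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
tenantCoalesceAdjacentSetting.Override(ctx, &st.SV, false) // disable coalescing for this test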
