Skip to content

Commit 3b1e9fa

Browse files
craig[bot]dodeca12
andcommitted
Merge #153373
153373: kvserver: resolve range-size backpressure preventing spanconfig updates r=iskettaneh a=dodeca12 This PR ensures that all spanconfig updates bypass the backpressure mechanism. This is done by excluding the `system.span_configurations` table from backpressurable spans by splitting the backpressurable range around it. #### Implementation ##### Commit 1: Reproduce the Issue - Added comprehensive test TestSpanConfigUpdatesBlockedByRangeSizeBackpressureOnDefaultRanges that reproduces the catch-22 scenario - The test repeatedly writes spanconfig updates for a single key until the range becomes too large to split, triggering backpressure - Demonstrates how spanconfig updates (including protected timestamp deletions) get blocked by backpressure: ```log W250910 14:26:03.302675 144058 kv/kvserver/replica_backpressure.go:263 [T1,Vsystem,n1,s1,r77/1:/Table/4{7/1/"\xf…-8}] 481 applying backpressure to limit range growth on batch Put [/Table/47/1/"\xfa"/0], [txn: cfbe5191], [can-forward-ts] I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 ERROR! BREAKING OUT OF LOOP, numWrites successful: 62690, error: upsert-span-cfgs: split failed while applying backpressure to Put [/Table/47/1/"\xfa"/0], [txn: cfbe5191], [can-forward-ts] on range r77:/Table/4{7/1/"\xfa"-8} [(n1,s1):1, next=2, gen=1]: could not find valid split key I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 +(1) attached stack trace I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + -- stack trace: I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | github.com/cockroachdb/cockroach/pkg/sql.(*InternalExecutor).execInternal.func1.1 I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | pkg/sql/internal.go:1260 I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | github.com/cockroachdb/cockroach/pkg/sql.(*rowsIterator).Close I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | pkg/sql/internal.go:608 I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | github.com/cockroachdb/cockroach/pkg/sql.(*rowsIterator).Next I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | pkg/sql/internal.go:584 I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | github.com/cockroachdb/cockroach/pkg/sql.(*InternalExecutor).execIEStmt I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | pkg/sql/internal.go:872 I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | github.com/cockroachdb/cockroach/pkg/sql.(*InternalExecutor).ExecEx I250910 14:26:03.304650 50 kv/kvserver_test/client_spanconfig_backpressure_test.go:209 [-] 482 + | pkg/sql/internal.go:820 . . . ``` ##### Commit 2: - **Modified `backpressurableSpans`** in `replica_backpressure.go`: Split the original backpressurable span `{SystemConfigTableDataMax, TableDataMax}` into two spans that exclude the span_configurations table: - `{SystemConfigTableDataMax, span_configurations_table_start}` - `{span_configurations_table_end, TableDataMax}` - **Added explanatory comments**: Documented that this prevents catch-22 situations where protected timestamp updates or garbage collection TTL updates are blocked by backpressure - **Updated test expectations**: Modified the aforementioned test to expect that spanconfig updates should succeed (bypass backpressure) rather than fail Expected behaviour - any span config updates now bypass the range-size backpressure mechanism. Fixes: #146982 Release note: None Co-authored-by: Swapneeth Gorantla <[email protected]>
2 parents f5a38d8 + 04e58b8 commit 3b1e9fa

File tree

3 files changed

+248
-3
lines changed

3 files changed

+248
-3
lines changed

pkg/keys/constants.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,10 @@ var (
374374
NamespaceTableMin = SystemSQLCodec.TablePrefix(NamespaceTableID)
375375
// NamespaceTableMax is the end key of system.namespace.
376376
NamespaceTableMax = SystemSQLCodec.TablePrefix(NamespaceTableID + 1)
377+
// SpanConfigTableMin is the start key of system.span_configurations.
378+
SpanConfigTableMin = SystemSQLCodec.TablePrefix(SpanConfigurationsTableID)
379+
// SpanConfigTableMax is the end key of system.span_configurations.
380+
SpanConfigTableMax = SystemSQLCodec.TablePrefix(SpanConfigurationsTableID + 1)
377381

378382
// 4. Non-system tenant SQL keys
379383
//

pkg/kv/kvserver/client_replica_backpressure_test.go

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,36 @@ package kvserver_test
88
import (
99
"context"
1010
"fmt"
11+
math "math"
1112
"net/url"
13+
"strings"
1214
"sync"
1315
"sync/atomic"
1416
"testing"
1517
"time"
1618

1719
"github.com/cockroachdb/cockroach/pkg/base"
1820
"github.com/cockroachdb/cockroach/pkg/keys"
21+
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
1922
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
2023
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
2124
"github.com/cockroachdb/cockroach/pkg/roachpb"
25+
"github.com/cockroachdb/cockroach/pkg/spanconfig"
26+
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
2227
"github.com/cockroachdb/cockroach/pkg/testutils"
2328
"github.com/cockroachdb/cockroach/pkg/testutils/pgurlutils"
29+
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
2430
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
2531
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
2632
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
33+
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2734
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
2835
"github.com/cockroachdb/cockroach/pkg/util/log"
36+
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
2937
"github.com/cockroachdb/cockroach/pkg/util/randutil"
3038
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
3139
"github.com/cockroachdb/errors"
40+
"github.com/dustin/go-humanize"
3241
"github.com/jackc/pgx/v5"
3342
"github.com/stretchr/testify/require"
3443
)
@@ -332,3 +341,233 @@ func TestBackpressureNotAppliedWhenReducingRangeSize(t *testing.T) {
332341
require.Error(t, <-upsertErrCh)
333342
})
334343
}
344+
345+
// TestSpanConfigUpdatesDoNotGetBlockByRangeSizeBackpressureOnDefaultRanges
346+
// verifies that spanconfig updates do not block by backpressure when the
347+
// `system.span_configurations` table range becomes full, showing the allowlist
348+
// is working.
349+
//
350+
// Test strategy:
351+
// 1. Configure `system.span_configurations` table range to be a small size (8 KiB).
352+
// 2. Write many large spanconfig records (2 KiB each) to fill up the range.
353+
// 3. Verify spanconfig updates do not fail due to backpressure when the range is full,
354+
// 4. This test recreates the scenario where spanconfig updates do not fail due to
355+
// backpressure.
356+
func TestSpanConfigUpdatesDoNotGetBlockByRangeSizeBackpressureOnDefaultRanges(t *testing.T) {
357+
defer leaktest.AfterTest(t)()
358+
defer log.Scope(t).Close(t)
359+
360+
ctx := context.Background()
361+
362+
const (
363+
overloadMaxRangeBytes = 8 << 10 // 8 KiB, a saner value than default 512 MiB for testing
364+
overloadMinRangeBytes = 2 << 10 // 2 KiB
365+
numWrites = 16 // enough to hit backpressure for 8 KiB range & 2 KiB spanconfig
366+
defaultMaxBytes = 512 << 20 // default max bytes for a range
367+
)
368+
s := serverutils.StartServerOnly(t, base.TestServerArgs{})
369+
defer s.Stopper().Stop(ctx)
370+
371+
store, err := s.GetStores().(*kvserver.Stores).GetStore(s.GetFirstStoreID())
372+
require.NoError(t, err)
373+
374+
waitForSpanConfig := func(t *testing.T, tc serverutils.TestServerInterface,
375+
tablePrefix roachpb.Key, expRangeMaxBytes int64) {
376+
testutils.SucceedsSoon(t, func() error {
377+
_, r := getFirstStoreReplica(t, tc, tablePrefix)
378+
conf, err := r.LoadSpanConfig(ctx)
379+
if err != nil {
380+
return err
381+
}
382+
if conf.RangeMaxBytes != expRangeMaxBytes {
383+
return fmt.Errorf("expected RangeMaxBytes %d, got %d",
384+
expRangeMaxBytes, conf.RangeMaxBytes)
385+
}
386+
return nil
387+
})
388+
}
389+
390+
spanConfigTablePrefix := keys.SystemSQLCodec.TablePrefix(
391+
keys.SpanConfigurationsTableID)
392+
393+
t.Logf("targeting span_configurations table at key: %s (table ID %d)\n",
394+
spanConfigTablePrefix, keys.SpanConfigurationsTableID)
395+
396+
scratchKey, err := s.ScratchRange()
397+
require.NoError(t, err)
398+
399+
testutils.SucceedsSoon(t, func() error {
400+
repl := store.LookupReplica(roachpb.RKey(scratchKey))
401+
if got := repl.GetMaxBytes(ctx); got != defaultMaxBytes {
402+
return errors.Errorf(
403+
"range max bytes values did not start at %d; got %d",
404+
defaultMaxBytes, got)
405+
}
406+
return nil
407+
})
408+
409+
systemSpanConfigurationsTableSpan := roachpb.Span{
410+
Key: spanConfigTablePrefix,
411+
EndKey: spanConfigTablePrefix.PrefixEnd(),
412+
}
413+
414+
target := spanconfig.MakeTargetFromSpan(systemSpanConfigurationsTableSpan)
415+
416+
systemSpanConfig := roachpb.SpanConfig{
417+
RangeMaxBytes: overloadMaxRangeBytes,
418+
RangeMinBytes: overloadMinRangeBytes,
419+
}
420+
421+
configBytessdfsdf, err := protoutil.Marshal(&systemSpanConfig)
422+
require.NoError(t, err)
423+
t.Logf("marshalled systemSpanConfig size: %d bytes", len(configBytessdfsdf))
424+
425+
record, err := spanconfig.MakeRecord(target, systemSpanConfig)
426+
require.NoError(t, err)
427+
428+
kvaccessor := s.SpanConfigKVAccessor().(spanconfig.KVAccessor)
429+
430+
err = kvaccessor.UpdateSpanConfigRecords(
431+
ctx, []spanconfig.Target{target},
432+
[]spanconfig.Record{record}, hlc.MinTimestamp, hlc.MaxTimestamp)
433+
require.NoError(t, err)
434+
435+
waitForSpanConfig(t, s, spanConfigTablePrefix, overloadMaxRangeBytes)
436+
437+
// Check if the range is using our custom config.
438+
repl := store.LookupReplica(keys.MustAddr(spanConfigTablePrefix))
439+
if repl != nil {
440+
conf, err := repl.LoadSpanConfig(ctx)
441+
require.NoError(t, err)
442+
t.Logf("current range config - RangeMaxBytes: %d bytes (%s), "+
443+
"RangeMinBytes: %d bytes (%s)",
444+
conf.RangeMaxBytes, humanize.Bytes(uint64(conf.RangeMaxBytes)),
445+
conf.RangeMinBytes, humanize.Bytes(uint64(conf.RangeMinBytes)))
446+
447+
}
448+
449+
t.Logf("targeting span_configurations table at key: %s (table ID %d)\n",
450+
spanConfigTablePrefix, keys.SpanConfigurationsTableID)
451+
452+
// Create a single target for the scratch range (this will be stored in system.span_configurations)
453+
scratchTarget := spanconfig.MakeTargetFromSpan(roachpb.Span{
454+
Key: scratchKey,
455+
EndKey: scratchKey.PrefixEnd(),
456+
})
457+
458+
// This is a large spanconfig for a scratch range with relevant fields set
459+
// to maximum int64 and int32 values. This is done to have a spanconfig that
460+
// is large enough to trigger backpressure without having to write a million
461+
// records.
462+
// We want this config to be relatively large - this is done via setting
463+
// values to have max values and multiple fields as this config gets
464+
// marshalled into a protobuf and protobuf uses variant encoding, which
465+
// means larger values take more bytes to encode.
466+
spanConfig2KiB := roachpb.SpanConfig{ // 2078 bytes ~ 2 KiB.
467+
RangeMaxBytes: math.MaxInt64,
468+
RangeMinBytes: math.MaxInt64,
469+
GCPolicy: roachpb.GCPolicy{
470+
TTLSeconds: math.MaxInt32,
471+
ProtectionPolicies: []roachpb.ProtectionPolicy{
472+
{
473+
ProtectedTimestamp: hlc.MaxTimestamp,
474+
},
475+
{
476+
ProtectedTimestamp: hlc.MaxTimestamp,
477+
},
478+
},
479+
},
480+
NumReplicas: math.MaxInt32,
481+
GlobalReads: true,
482+
NumVoters: math.MaxInt32,
483+
VoterConstraints: []roachpb.ConstraintsConjunction{
484+
{
485+
Constraints: []roachpb.Constraint{
486+
{Key: "max_key", Value: strings.Repeat("x", 1024)}, // very long constraint value
487+
},
488+
},
489+
},
490+
LeasePreferences: []roachpb.LeasePreference{
491+
{
492+
Constraints: []roachpb.Constraint{
493+
{Key: "max_key", Value: strings.Repeat("y", 1024)}, // very long constraint value
494+
},
495+
},
496+
},
497+
}
498+
499+
configBytes, err := protoutil.Marshal(&spanConfig2KiB)
500+
require.NoError(t, err)
501+
502+
require.GreaterOrEqual(t, len(configBytes), 2048,
503+
"spanConfig2KiB should be at least 2 KiB in size")
504+
505+
// Create a record with the span configuration.
506+
testRecord, err := spanconfig.MakeRecord(scratchTarget, spanConfig2KiB)
507+
require.NoError(t, err)
508+
509+
// Write span configurations using KVAccessor.
510+
// We expect this to fail due to backpressure.
511+
var i int
512+
for i = 0; i < numWrites; i++ {
513+
// Use KVAccessor to update span configurations.
514+
err = kvaccessor.UpdateSpanConfigRecords(ctx, nil,
515+
[]spanconfig.Record{testRecord}, hlc.MinTimestamp, hlc.MaxTimestamp)
516+
if err != nil {
517+
break
518+
}
519+
}
520+
521+
// Assert that the operation does not fail due to backpressure.
522+
require.NoError(t, err,
523+
"expected span config writes to not fail due to backpressure, but they did")
524+
525+
systemSpanConfigurationsTableSpanMVCCStats := roachpb.Span{
526+
Key: keys.SystemSQLCodec.TablePrefix(keys.SpanConfigurationsTableID),
527+
EndKey: keys.SystemSQLCodec.TablePrefix(keys.SpanConfigurationsTableID + 1),
528+
}
529+
530+
distSender := s.DistSenderI().(*kvcoord.DistSender)
531+
532+
// Track aggregate MVCC stats across all SpanConfigurationsTable ranges
533+
var aggregateStats enginepb.MVCCStats
534+
var rangeCount int
535+
536+
for key := systemSpanConfigurationsTableSpanMVCCStats.Key; key.Compare(systemSpanConfigurationsTableSpanMVCCStats.EndKey) < 0; {
537+
desc, err := distSender.RangeDescriptorCache().Lookup(ctx, keys.MustAddr(key))
538+
require.NoError(t, err)
539+
d := desc.Desc
540+
541+
rangeRepl := store.LookupReplica(d.StartKey)
542+
if rangeRepl != nil {
543+
stats := rangeRepl.GetMVCCStats()
544+
aggregateStats.Add(stats)
545+
rangeCount++
546+
}
547+
548+
// Move to next range.
549+
key = d.EndKey.AsRawKey()
550+
if key.Equal(roachpb.KeyMax) {
551+
break
552+
}
553+
}
554+
555+
require.Greater(t, aggregateStats.Total(), int64(overloadMaxRangeBytes))
556+
557+
smallSpanConfig := roachpb.SpanConfig{
558+
GCPolicy: roachpb.GCPolicy{
559+
TTLSeconds: 0,
560+
},
561+
}
562+
563+
smallSpanconfigRecord, err := spanconfig.MakeRecord(scratchTarget, smallSpanConfig)
564+
require.NoError(t, err)
565+
566+
smallSpanconfigRecordWriteErr := kvaccessor.UpdateSpanConfigRecords(ctx,
567+
[]spanconfig.Target{scratchTarget}, []spanconfig.Record{smallSpanconfigRecord},
568+
hlc.MinTimestamp, hlc.MaxTimestamp)
569+
570+
require.NoError(t, smallSpanconfigRecordWriteErr,
571+
"expected smallSpanconfigRecord write to not fail due to backpressure")
572+
573+
}

pkg/kv/kvserver/replica_backpressure.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,11 @@ var backpressureByteTolerance = settings.RegisterByteSizeSetting(
8888
// to be backpressured.
8989
var backpressurableSpans = []roachpb.Span{
9090
{Key: keys.TimeseriesPrefix, EndKey: keys.TimeseriesKeyMax},
91-
// Backpressure from the end of the system config forward instead of
92-
// over all table data to avoid backpressuring unsplittable ranges.
93-
{Key: keys.SystemConfigTableDataMax, EndKey: keys.TableDataMax},
91+
// Exclude the span_configurations table to avoid
92+
// catch-22 situations where protected timestamp updates or garbage
93+
// collection TTL updates are blocked by backpressure.
94+
{Key: keys.SystemConfigTableDataMax, EndKey: keys.SpanConfigTableMin},
95+
{Key: keys.SpanConfigTableMax, EndKey: keys.TableDataMax},
9496
}
9597

9698
// canBackpressureBatch returns whether the provided BatchRequest is eligible

0 commit comments

Comments
 (0)