Skip to content

Commit 16ea387

Browse files
authored
Merge pull request #147586 from cockroachdb/blathers/backport-release-24.3-146331
release-24.3: kvserver: introduce setting to periodically reset split samples
2 parents 4f779ed + 8c0716c commit 16ea387

File tree

4 files changed

+118
-4
lines changed

4 files changed

+118
-4
lines changed

pkg/kv/kvserver/asim/state/split_decider.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ func (lsc loadSplitConfig) StatThreshold(_ split.SplitObjective) float64 {
5858
return lsc.settings.SplitQPSThreshold
5959
}
6060

61+
// SampleResetDuration returns the duration that any sampling structure should
62+
// retain data for before resetting.
63+
func (lsc loadSplitConfig) SampleResetDuration() time.Duration {
64+
return 0 /* disabled */
65+
}
66+
6167
// SplitDecider implements the LoadSplitter interface.
6268
type SplitDecider struct {
6369
deciders map[RangeID]*split.Decider

pkg/kv/kvserver/replica_split_load.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,20 @@ var SplitByLoadCPUThreshold = settings.RegisterDurationSetting(
6464
settings.WithPublic,
6565
)
6666

67+
// SplitSampleResetDuration wraps "kv.range_split.load_sample_reset_duration".
68+
// This is the duration after which the load based split sampler will reset its
69+
// state, regardless of any split suggestions made. This is useful when the
70+
// load on a range is non-stationary.
71+
var SplitSampleResetDuration = settings.RegisterDurationSetting(
72+
settings.SystemOnly,
73+
"kv.range_split.load_sample_reset_duration",
74+
"the duration after which the load based split sampler will reset its state, "+
75+
"regardless of any split suggestions made, when zero, the sampler will "+
76+
"never reset",
77+
0, /* disabled */
78+
settings.DurationWithMinimumOrZeroDisable(10*time.Second),
79+
)
80+
6781
func (obj LBRebalancingObjective) ToSplitObjective() split.SplitObjective {
6882
switch obj {
6983
case LBRebalancingQueries:
@@ -121,6 +135,12 @@ func (c *replicaSplitConfig) StatThreshold(obj split.SplitObjective) float64 {
121135
}
122136
}
123137

138+
// SampleResetDuration returns the duration that any sampling structure should
139+
// retain data for before resetting.
140+
func (c *replicaSplitConfig) SampleResetDuration() time.Duration {
141+
return SplitSampleResetDuration.Get(&c.st.SV)
142+
}
143+
124144
// SplitByLoadEnabled returns whether load based splitting is enabled.
125145
// Although this is a method of *Replica, the configuration is really global,
126146
// shared across all stores.

pkg/kv/kvserver/split/decider.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ type LoadSplitConfig interface {
6060
// StatThreshold returns the threshold for load above which the range
6161
// should be considered split.
6262
StatThreshold(SplitObjective) float64
63+
// SampleResetDuration returns the duration that any sampling structure
64+
// should retain data for before resetting.
65+
SampleResetDuration() time.Duration
6366
}
6467

6568
type RandSource interface {
@@ -151,6 +154,7 @@ type Decider struct {
151154

152155
// Fields tracking split key suggestions.
153156
splitFinder LoadBasedSplitter // populated when engaged or decided
157+
splitFinderInitAt time.Time // when the split finder was initialized
154158
lastSplitSuggestion time.Time // last stipulation to client to carry out split
155159
suggestionsMade int // suggestions made since last reset
156160

@@ -236,6 +240,7 @@ func (d *Decider) recordLocked(
236240
if d.mu.lastStatVal >= d.config.StatThreshold(d.mu.objective) {
237241
if d.mu.splitFinder == nil {
238242
d.mu.splitFinder = d.config.NewLoadBasedSplitter(now, d.mu.objective)
243+
d.mu.splitFinderInitAt = now
239244
}
240245
} else {
241246
d.mu.splitFinder = nil
@@ -274,6 +279,15 @@ func (d *Decider) recordLocked(
274279
}
275280
}
276281
}
282+
// If the split finder has been initialized for longer than the sample
283+
// reset duration, then we discard the split finder and start over. This is
284+
// to prevent the split finder from being stuck in a state where it is not
285+
// finding a split key based on earlier sampled keys, but could find one if
286+
// it were to sample new keys with higher probability.
287+
if sampleResetDuration := d.config.SampleResetDuration(); sampleResetDuration != 0 &&
288+
now.Sub(d.mu.splitFinderInitAt) >= sampleResetDuration {
289+
d.mu.splitFinder = nil
290+
}
277291
}
278292
return false
279293
}
@@ -364,6 +378,7 @@ func (d *Decider) resetLocked(now time.Time) {
364378
d.mu.lastStatVal = 0
365379
d.mu.count = 0
366380
d.mu.maxStat.reset(now, d.config.StatRetention())
381+
d.mu.splitFinderInitAt = time.Time{}
367382
d.mu.splitFinder = nil
368383
d.mu.suggestionsMade = 0
369384
d.mu.lastSplitSuggestion = time.Time{}

pkg/kv/kvserver/split/decider_test.go

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ import (
2222
// testLoadSplitConfig implements the LoadSplitConfig interface and may be used
2323
// in testing.
2424
type testLoadSplitConfig struct {
25-
randSource RandSource
26-
useWeighted bool
27-
statRetention time.Duration
28-
statThreshold float64
25+
randSource RandSource
26+
useWeighted bool
27+
statRetention time.Duration
28+
statThreshold float64
29+
sampleResetDuration time.Duration
2930
}
3031

3132
// NewLoadBasedSplitter returns a new LoadBasedSplitter that may be used to
@@ -50,6 +51,12 @@ func (t *testLoadSplitConfig) StatThreshold(_ SplitObjective) float64 {
5051
return t.statThreshold
5152
}
5253

54+
// SampleResetDuration returns the duration that any sampling structure should
55+
// retain data for before resetting.
56+
func (t *testLoadSplitConfig) SampleResetDuration() time.Duration {
57+
return t.sampleResetDuration
58+
}
59+
5360
func ld(n int) func(SplitObjective) int {
5461
return func(_ SplitObjective) int {
5562
return n
@@ -448,3 +455,69 @@ func TestDeciderMetrics(t *testing.T) {
448455
assert.Equal(t, dAllInsufficientCounters.loadSplitterMetrics.PopularKeyCount.Count(), int64(0))
449456
assert.Equal(t, dAllInsufficientCounters.loadSplitterMetrics.NoSplitKeyCount.Count(), int64(0))
450457
}
458+
459+
// TestDeciderSampleReset tests the sample reset functionality of the decider,
460+
// when the sample reset duration is non-zero, the split finder should be reset
461+
// after the given duration. When the sample reset duration is zero, the split
462+
// finder should not be reset.
463+
func TestDeciderSampleReset(t *testing.T) {
464+
defer leaktest.AfterTest(t)()
465+
466+
rng := rand.New(rand.NewSource(12))
467+
loadSplitConfig := testLoadSplitConfig{
468+
randSource: rng,
469+
useWeighted: false,
470+
statRetention: 2 * time.Second,
471+
statThreshold: 1,
472+
sampleResetDuration: 10 * time.Second,
473+
}
474+
ctx := context.Background()
475+
tick := 0
476+
477+
var d Decider
478+
Init(&d, &loadSplitConfig, &LoadSplitterMetrics{
479+
PopularKeyCount: metric.NewCounter(metric.Metadata{}),
480+
NoSplitKeyCount: metric.NewCounter(metric.Metadata{}),
481+
}, SplitQPS)
482+
483+
require.Nil(t, d.mu.splitFinder)
484+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
485+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
486+
})
487+
// The split finder should be created as the second sample is recorded and
488+
// the stat remains above the threshold (1) each tick.
489+
for i := 0; i < 10; i++ {
490+
tick += 1000
491+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
492+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
493+
})
494+
require.NotNil(t, d.mu.splitFinder, (*lockedDecider)(&d))
495+
}
496+
497+
// Tick one more time, now the sample reset duration (10s) has passed and the
498+
// split finder should be reset.
499+
tick += 1000
500+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
501+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
502+
})
503+
require.Nil(t, d.mu.splitFinder, (*lockedDecider)(&d))
504+
505+
// Immediately following the last tick where the splitFinder was reset, it
506+
// should be recreated as the stat is still above the threshold.
507+
for i := 0; i < 10; i++ {
508+
tick += 1000
509+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
510+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
511+
})
512+
require.NotNil(t, d.mu.splitFinder, (*lockedDecider)(&d))
513+
}
514+
// Set the sample reset duration to 0, which should cause the split finder to
515+
// not be reset in the next tick, unlike before when the sample reset
516+
// duration was 10s.
517+
loadSplitConfig.sampleResetDuration = 0
518+
tick += 1000
519+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
520+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
521+
})
522+
require.NotNil(t, d.mu.splitFinder, (*lockedDecider)(&d))
523+
}

0 commit comments

Comments
 (0)