Skip to content

Commit 305c21b

Browse files
authored
Merge pull request #147588 from cockroachdb/blathers/backport-release-25.2-146331
release-25.2: kvserver: introduce setting to periodically reset split samples
2 parents ee29922 + 00f6d3c commit 305c21b

File tree

4 files changed

+115
-4
lines changed

4 files changed

+115
-4
lines changed

pkg/kv/kvserver/asim/state/split_decider.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ func (lsc loadSplitConfig) StatThreshold(_ split.SplitObjective) float64 {
5858
return lsc.settings.SplitQPSThreshold
5959
}
6060

61+
// SampleResetDuration returns the duration that any sampling structure should
62+
// retain data for before resetting.
63+
func (lsc loadSplitConfig) SampleResetDuration() time.Duration {
64+
return 0 /* disabled */
65+
}
66+
6167
// SplitDecider implements the LoadSplitter interface.
6268
type SplitDecider struct {
6369
deciders map[RangeID]*split.Decider

pkg/kv/kvserver/replica_split_load.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,20 @@ var SplitByLoadCPUThreshold = settings.RegisterDurationSetting(
6464
settings.WithPublic,
6565
)
6666

67+
// SplitSampleResetDuration wraps "kv.range_split.load_sample_reset_duration".
68+
// This is the duration after which the load based split sampler will reset its
69+
// state, regardless of any split suggestions made. This is useful when the
70+
// load on a range is non-stationary.
71+
var SplitSampleResetDuration = settings.RegisterDurationSetting(
72+
settings.SystemOnly,
73+
"kv.range_split.load_sample_reset_duration",
74+
"the duration after which the load based split sampler will reset its state, "+
75+
"regardless of any split suggestions made, when zero, the sampler will "+
76+
"never reset",
77+
0, /* disabled */
78+
settings.DurationWithMinimumOrZeroDisable(10*time.Second),
79+
)
80+
6781
func (obj LBRebalancingObjective) ToSplitObjective() split.SplitObjective {
6882
switch obj {
6983
case LBRebalancingQueries:
@@ -121,6 +135,12 @@ func (c *replicaSplitConfig) StatThreshold(obj split.SplitObjective) float64 {
121135
}
122136
}
123137

138+
// SampleResetDuration returns the duration that any sampling structure should
139+
// retain data for before resetting.
140+
func (c *replicaSplitConfig) SampleResetDuration() time.Duration {
141+
return SplitSampleResetDuration.Get(&c.st.SV)
142+
}
143+
124144
// SplitByLoadEnabled returns whether load based splitting is enabled.
125145
// Although this is a method of *Replica, the configuration is really global,
126146
// shared across all stores.

pkg/kv/kvserver/split/decider.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ type LoadSplitConfig interface {
7575
// StatThreshold returns the threshold for load above which the range
7676
// should be considered for splitting.
7777
StatThreshold(SplitObjective) float64
78+
// SampleResetDuration returns the duration that any sampling structure
79+
// should retain data for before resetting.
80+
SampleResetDuration() time.Duration
7881
}
7982

8083
type RandSource interface {
@@ -167,6 +170,7 @@ type Decider struct {
167170

168171
// Fields tracking split key suggestions.
169172
splitFinder LoadBasedSplitter // populated when engaged or decided
173+
splitFinderInitAt time.Time // when the split finder was initialized
170174
lastSplitSuggestion time.Time // last stipulation to client to carry out split
171175
suggestionsMade int // suggestions made since last reset
172176

@@ -252,6 +256,7 @@ func (d *Decider) recordLocked(
252256
if d.mu.lastStatVal >= d.config.StatThreshold(d.mu.objective) {
253257
if d.mu.splitFinder == nil {
254258
d.mu.splitFinder = d.config.NewLoadBasedSplitter(now, d.mu.objective)
259+
d.mu.splitFinderInitAt = now
255260
}
256261
} else {
257262
d.mu.splitFinder = nil
@@ -305,6 +310,15 @@ func (d *Decider) recordLocked(
305310
}
306311
}
307312
}
313+
// If the split finder has been initialized for longer than the sample
314+
// reset duration, then we discard the split finder and start over. This is
315+
// to prevent the split finder from being stuck in a state where it is not
316+
// finding a split key based on earlier sampled keys, but could find one if
317+
// it were to sample new keys with higher probability.
318+
if sampleResetDuration := d.config.SampleResetDuration(); sampleResetDuration != 0 &&
319+
now.Sub(d.mu.splitFinderInitAt) >= sampleResetDuration {
320+
d.mu.splitFinder = nil
321+
}
308322
}
309323
return false
310324
}
@@ -410,6 +424,7 @@ func (d *Decider) resetLocked(now time.Time) {
410424
d.mu.lastStatVal = 0
411425
d.mu.count = 0
412426
d.mu.maxStat.reset(now, d.config.StatRetention())
427+
d.mu.splitFinderInitAt = time.Time{}
413428
d.mu.splitFinder = nil
414429
d.mu.suggestionsMade = 0
415430
d.mu.lastSplitSuggestion = time.Time{}

pkg/kv/kvserver/split/decider_test.go

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ import (
2222
// testLoadSplitConfig implements the LoadSplitConfig interface and may be used
2323
// in testing.
2424
type testLoadSplitConfig struct {
25-
randSource RandSource
26-
useWeighted bool
27-
statRetention time.Duration
28-
statThreshold float64
25+
randSource RandSource
26+
useWeighted bool
27+
statRetention time.Duration
28+
statThreshold float64
29+
sampleResetDuration time.Duration
2930
}
3031

3132
// NewLoadBasedSplitter returns a new LoadBasedSplitter that may be used to
@@ -50,6 +51,12 @@ func (t *testLoadSplitConfig) StatThreshold(_ SplitObjective) float64 {
5051
return t.statThreshold
5152
}
5253

54+
// SampleResetDuration returns the duration that any sampling structure should
55+
// retain data for before resetting.
56+
func (t *testLoadSplitConfig) SampleResetDuration() time.Duration {
57+
return t.sampleResetDuration
58+
}
59+
5360
func ld(n int) func(SplitObjective) int {
5461
return func(_ SplitObjective) int {
5562
return n
@@ -561,3 +568,66 @@ func TestDeciderMetrics(t *testing.T) {
561568
assert.Equal(t, dAllInsufficientCounters.loadSplitterMetrics.ClearDirectionCount.Count(), int64(0))
562569

563570
}
571+
572+
// TestDeciderSampleReset tests the sample reset functionality of the decider,
573+
// when the sample reset duration is non-zero, the split finder should be reset
574+
// after the given duration. When the sample reset duration is zero, the split
575+
// finder should not be reset.
576+
func TestDeciderSampleReset(t *testing.T) {
577+
defer leaktest.AfterTest(t)()
578+
579+
rng := rand.New(rand.NewPCG(12, 12))
580+
loadSplitConfig := testLoadSplitConfig{
581+
randSource: rng,
582+
useWeighted: false,
583+
statRetention: 2 * time.Second,
584+
statThreshold: 1,
585+
sampleResetDuration: 10 * time.Second,
586+
}
587+
ctx := context.Background()
588+
tick := 0
589+
590+
var d Decider
591+
Init(&d, &loadSplitConfig, newSplitterMetrics(), SplitQPS)
592+
593+
require.Nil(t, d.mu.splitFinder)
594+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
595+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
596+
})
597+
// The split finder should be created as the second sample is recorded and
598+
// the stat remains above the threshold (1) each tick.
599+
for i := 0; i < 10; i++ {
600+
tick += 1000
601+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
602+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
603+
})
604+
require.NotNil(t, d.mu.splitFinder, (*lockedDecider)(&d))
605+
}
606+
607+
// Tick one more time; now the sample reset duration (10s) has passed and the
608+
// split finder should be reset.
609+
tick += 1000
610+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
611+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
612+
})
613+
require.Nil(t, d.mu.splitFinder, (*lockedDecider)(&d))
614+
615+
// Immediately following the last tick where the splitFinder was reset, it
616+
// should be recreated as the stat is still above the threshold.
617+
for i := 0; i < 10; i++ {
618+
tick += 1000
619+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
620+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
621+
})
622+
require.NotNil(t, d.mu.splitFinder, (*lockedDecider)(&d))
623+
}
624+
// Set the sample reset duration to 0, which should cause the split finder to
625+
// not be reset in the next tick, unlike before when the sample reset
626+
// duration was 10s.
627+
loadSplitConfig.sampleResetDuration = 0
628+
tick += 1000
629+
d.Record(ctx, ms(tick), ld(100), func() roachpb.Span {
630+
return roachpb.Span{Key: keys.SystemSQLCodec.TablePrefix(uint32(0))}
631+
})
632+
require.NotNil(t, d.mu.splitFinder, (*lockedDecider)(&d))
633+
}

0 commit comments

Comments
 (0)