Skip to content

Commit c9e7aad

Browse files
craig[bot]tbg
andcommitted
Merge #147913
147913: kvserver: don't subsume self-blocking latency in scheduler latency r=tbg a=tbg We noticed elevated p99s during scale testing that disappeared when disabling internal timeseries storage. The theory is that the timeseries replicas are quite busy, and the scheduler latency was (unintentionally, I argue) picking up raft **handling** latencies because it wasn't accounting for the case in which handling a replica was delayed by an inflight handling of that same replica. This commit addresses this issue, so that the scheduling latency metrics truly reflect only the scheduling latency. Fixes #147911 Informs #147758 Release note: None Co-authored-by: Tobias Grieger <[email protected]>
2 parents 2a2cb3f + 7d7ee80 commit c9e7aad

File tree

2 files changed

+112
-5
lines changed

2 files changed

+112
-5
lines changed

pkg/kv/kvserver/scheduler.go

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvflowcontrol/rac2"
1515
"github.com/cockroachdb/cockroach/pkg/roachpb"
1616
"github.com/cockroachdb/cockroach/pkg/util"
17+
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
1718
"github.com/cockroachdb/cockroach/pkg/util/log"
1819
"github.com/cockroachdb/cockroach/pkg/util/stop"
1920
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -22,6 +23,10 @@ import (
2223

2324
const rangeIDChunkSize = 1000
2425

26+
type testProcessorI interface {
27+
processTestEvent(roachpb.RangeID, *raftSchedulerShard, raftScheduleState)
28+
}
29+
2530
type rangeIDChunk struct {
2631
// Valid contents are buf[rd:wr], read at buf[rd], write at buf[wr].
2732
buf [rangeIDChunkSize]roachpb.RangeID
@@ -134,11 +139,14 @@ const (
134139
stateRaftTick
135140
stateRACv2PiggybackedAdmitted
136141
stateRACv2RangeController
142+
stateTestIntercept // used for testing, CrdbTestBuild only
137143
)
138144

139145
type raftScheduleState struct {
140146
flags raftScheduleFlags
141-
begin crtime.Mono
147+
// When this event was queued. This is set if and only if the item is present
148+
// in the raft scheduler shard's queue.
149+
queued crtime.Mono
142150

143151
// The number of ticks queued. Usually it's 0 or 1, but may go above if the
144152
// scheduling or processing is slow. It is limited by raftScheduler.maxTicks,
@@ -375,8 +383,12 @@ func (ss *raftSchedulerShard) worker(
375383
ss.state[id] = raftScheduleState{flags: stateQueued}
376384
ss.Unlock()
377385

386+
if util.RaceEnabled && state.queued == 0 {
387+
// See state.queued for the invariant being checked here.
388+
log.Fatalf(ctx, "raftSchedulerShard.worker called with zero queued: %+v", state)
389+
}
378390
// Record the scheduling latency for the range.
379-
lat := state.begin.Elapsed()
391+
lat := state.queued.Elapsed()
380392
metrics.RaftSchedulerLatency.RecordValue(int64(lat))
381393

382394
// Process requests first. This avoids a scenario where a tick and a
@@ -413,6 +425,9 @@ func (ss *raftSchedulerShard) worker(
413425
if state.flags&stateRACv2RangeController != 0 {
414426
processor.processRACv2RangeController(ctx, id)
415427
}
428+
if buildutil.CrdbTestBuild && state.flags&stateTestIntercept != 0 {
429+
processor.(testProcessorI).processTestEvent(id, ss, state)
430+
}
416431

417432
ss.Lock()
418433
state = ss.state[id]
@@ -442,6 +457,13 @@ func (ss *raftSchedulerShard) worker(
442457
// and the worker does not go back to sleep between the current
443458
// iteration and the next iteration, so no change to num_signals
444459
// is needed.
460+
//
461+
// NB: we overwrite state.begin unconditionally since the next processing
462+
// can not possibly happen before the current processing is done (i.e.
463+
// now). We do not want the scheduler latency to pick up the time spent
464+
// handling this replica.
465+
state.queued = crtime.NowMono()
466+
ss.state[id] = state
445467
ss.queue.Push(id)
446468
}
447469
}
@@ -473,11 +495,13 @@ func (ss *raftSchedulerShard) enqueue1Locked(
473495
if newState.flags&stateQueued == 0 {
474496
newState.flags |= stateQueued
475497
queued++
498+
if util.RaceEnabled && newState.queued != 0 {
499+
// See newState.queued for the invariant being checked here.
500+
log.Fatalf(context.Background(), "raftSchedulerShard.enqueue1Locked called with non-zero queued: %+v", newState)
501+
}
502+
newState.queued = now
476503
ss.queue.Push(id)
477504
}
478-
if newState.begin == 0 {
479-
newState.begin = now
480-
}
481505
ss.state[id] = newState
482506
return queued
483507
}

pkg/kv/kvserver/scheduler_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/cockroachdb/cockroach/pkg/util/stop"
2727
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2828
"github.com/cockroachdb/errors"
29+
"github.com/stretchr/testify/assert"
2930
"github.com/stretchr/testify/require"
3031
)
3132

@@ -135,15 +136,19 @@ type testProcessor struct {
135136
rac2RangeController map[roachpb.RangeID]int
136137
ready func(roachpb.RangeID)
137138
}
139+
testEventCh chan func(roachpb.RangeID, *raftSchedulerShard, raftScheduleState)
138140
}
139141

142+
var _ testProcessorI = (*testProcessor)(nil)
143+
140144
func newTestProcessor() *testProcessor {
141145
p := &testProcessor{}
142146
p.mu.raftReady = make(map[roachpb.RangeID]int)
143147
p.mu.raftRequest = make(map[roachpb.RangeID]int)
144148
p.mu.raftTick = make(map[roachpb.RangeID]int)
145149
p.mu.rac2PiggybackedAdmitted = make(map[roachpb.RangeID]int)
146150
p.mu.rac2RangeController = make(map[roachpb.RangeID]int)
151+
p.testEventCh = make(chan func(roachpb.RangeID, *raftSchedulerShard, raftScheduleState), 10)
147152
return p
148153
}
149154

@@ -192,6 +197,16 @@ func (p *testProcessor) processRACv2RangeController(_ context.Context, rangeID r
192197
p.mu.Unlock()
193198
}
194199

200+
func (p *testProcessor) processTestEvent(
201+
id roachpb.RangeID, ss *raftSchedulerShard, ev raftScheduleState,
202+
) {
203+
select {
204+
case fn := <-p.testEventCh:
205+
fn(id, ss, ev)
206+
default:
207+
}
208+
}
209+
195210
func (p *testProcessor) readyCount(rangeID roachpb.RangeID) int {
196211
p.mu.Lock()
197212
defer p.mu.Unlock()
@@ -352,6 +367,74 @@ func TestSchedulerBuffering(t *testing.T) {
352367
}
353368
}
354369

370+
func TestSchedulerEnqueueWhileProcessing(t *testing.T) {
371+
defer leaktest.AfterTest(t)()
372+
defer log.Scope(t).Close(t)
373+
skip.UnderNonTestBuild(t) // stateTestIntercept needs CrdbTestBuild
374+
375+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
376+
defer cancel()
377+
378+
stopper := stop.NewStopper()
379+
defer stopper.Stop(ctx)
380+
381+
m := newStoreMetrics(metric.TestSampleInterval)
382+
p := newTestProcessor()
383+
s := newRaftScheduler(log.MakeTestingAmbientContext(stopper.Tracer()), m, p, 1, 1, 1, 5)
384+
s.Start(stopper)
385+
386+
done := make(chan struct{})
387+
388+
// Inject code into the "middle" of event processing - after having consumed
389+
// from the queue, but before re-checking of overlapping enqueue calls.
390+
p.testEventCh <- func(id roachpb.RangeID, ss *raftSchedulerShard, ev raftScheduleState) {
391+
// First call into this method.
392+
//
393+
// The event calling into us must have `ev.queued` set; it was set when
394+
// enqueuing.
395+
assert.NotZero(t, ev.queued)
396+
397+
// Even though our event is currently being processed, there is a queued
398+
// and otherwise blank event in the scheduler state (which is how we have
399+
// concurrent enqueue calls coalesce onto the still pending processing of
400+
// the current event).
401+
ss.Lock()
402+
statePre := ss.state[id]
403+
ss.Unlock()
404+
405+
assert.Zero(t, statePre.queued)
406+
assert.Equal(t, stateQueued, statePre.flags)
407+
408+
// Simulate a concurrent actor that enqueues the same range again.
409+
// This will not trigger the interceptor again, since the done channel
410+
// is closed by that time.
411+
s.enqueue1(stateTestIntercept, 1)
412+
413+
// Seeing that there is an existing "queued" event, the enqueue call below
414+
// should not populate `queued`. Instead, this will be the job of our
415+
// caller when it *actually* pushes into the queue again after fully
416+
// having handled `ev`.
417+
ss.Lock()
418+
statePost := ss.state[id]
419+
ss.Unlock()
420+
421+
assert.Zero(t, statePost.queued)
422+
assert.Equal(t, stateQueued|stateTestIntercept, statePost.flags)
423+
close(done)
424+
}
425+
p.testEventCh <- func(id roachpb.RangeID, shard *raftSchedulerShard, ev raftScheduleState) {
426+
// Second call into this method, i.e. the overlappingly-enqeued event is
427+
// being processed. Check that `queued` is now set.
428+
assert.NotZero(t, ev.queued)
429+
}
430+
s.enqueue1(stateTestIntercept, 1) // will become 'ev' in the intercept
431+
select {
432+
case <-done:
433+
case <-ctx.Done():
434+
t.Fatal(ctx.Err())
435+
}
436+
}
437+
355438
func TestNewSchedulerShards(t *testing.T) {
356439
defer leaktest.AfterTest(t)()
357440
defer log.Scope(t).Close(t)

0 commit comments

Comments
 (0)