
Commit 5e7b3a2

kvserver: don't subsume self-blocking latency in scheduler latency
We noticed elevated p99s during scale testing that disappeared when disabling internal timeseries storage. The theory is that the timeseries replicas are quite busy, and the scheduler latency was (unintentionally, I argue) picking up raft **handling** latencies because it wasn't accounting for the case in which handling a replica was delayed by an in-flight handling of that same replica.

This commit addresses this issue, so that the scheduling latency metric truly reflects only the scheduling latency.

Fixes #147911

Release note: None
1 parent 8034bc5 commit 5e7b3a2
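
To make the accounting change concrete, here is a minimal, self-contained Go sketch (types, names, and timings are illustrative only, not the actual kvserver code): begin is stamped when a range is enqueued, the worker records the elapsed time since begin as the scheduling latency, and, after this commit, begin is reset when a range that was signalled during handling is pushed back onto the queue, so the handling time itself no longer leaks into the metric.

package main

import (
	"fmt"
	"time"
)

// state mimics the per-range bookkeeping: begin records when the range was
// last (re)enqueued.
type state struct {
	begin time.Time
}

func main() {
	// Enqueue: stamp begin.
	st := state{begin: time.Now()}

	// The range waits briefly in the queue, then a worker dequeues it and
	// records the scheduling latency -- only the queue wait so far.
	time.Sleep(2 * time.Millisecond)
	fmt.Println("scheduling latency:", time.Since(st.begin))

	// Raft handling of the range takes a while, and the range is signalled
	// again while it is still being handled (the self-blocking case).
	time.Sleep(20 * time.Millisecond)

	// Before this commit, begin was left untouched when the range was pushed
	// back onto the queue, so the next latency sample would have included the
	// ~20ms of handling above. With this commit, begin is reset at re-enqueue
	// time, and only the genuine queue wait is measured.
	st.begin = time.Now()
	time.Sleep(1 * time.Millisecond)
	fmt.Println("scheduling latency after re-enqueue:", time.Since(st.begin))
}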

File tree (2 files changed, +17 -7 lines)

  pkg/kv/kvserver/scheduler.go
  pkg/kv/kvserver/scheduler_test.go


pkg/kv/kvserver/scheduler.go

Lines changed: 16 additions & 5 deletions
@@ -395,6 +395,9 @@ func (ss *raftSchedulerShard) worker(
 		ss.Unlock()
 
 		// Record the scheduling latency for the range.
+		if buildutil.CrdbTestBuild && state.begin == 0 {
+			log.Fatalf(ctx, "raftSchedulerShard.worker called with zero begin: %+v", state)
+		}
 		lat := state.begin.Elapsed()
 		metrics.RaftSchedulerLatency.RecordValue(int64(lat))
 
@@ -437,8 +440,7 @@ func (ss *raftSchedulerShard) worker(
 		}
 
 		ss.Lock()
-		state = ss.state[id]
-		if state.flags == stateQueued {
+		if ss.state[id].flags == stateQueued {
 			// No further processing required by the range ID, clear it from the
 			// state map.
 			delete(ss.state, id)
@@ -464,6 +466,14 @@ func (ss *raftSchedulerShard) worker(
 			// and the worker does not go back to sleep between the current
 			// iteration and the next iteration, so no change to num_signals
 			// is needed.
+			//
+			// NB: we overwrite state.begin unconditionally since the next processing
+			// can not possibly happen before the current processing is done (i.e.
+			// now). We do not want the scheduler latency to pick up the time spent
+			// handling this replica.
+			state = ss.state[id]
+			state.begin = crtime.NowMono()
+			ss.state[id] = state
 			ss.queue.Push(id)
 		}
 	}
@@ -495,10 +505,11 @@ func (ss *raftSchedulerShard) enqueue1Locked(
 	if newState.flags&stateQueued == 0 {
 		newState.flags |= stateQueued
 		queued++
-		ss.queue.Push(id)
-	}
-	if newState.begin == 0 {
+		if buildutil.CrdbTestBuild && newState.begin != 0 {
+			log.Fatalf(context.Background(), "raftSchedulerShard.enqueue1Locked called with non-zero begin: %+v", newState)
+		}
 		newState.begin = now
+		ss.queue.Push(id)
 	}
 	ss.state[id] = newState
 	return queued

pkg/kv/kvserver/scheduler_test.go

Lines changed: 1 addition & 2 deletions
@@ -418,8 +418,7 @@ func TestSchedulerEnqueueWhileProcessing(t *testing.T) {
 		statePost := ss.state[id]
 		ss.Unlock()
 
-		// TODO(tbg): enable in follow-up commit.
-		// assert.Zero(t, statePost.begin)
+		assert.Zero(t, statePost.begin)
 		assert.Equal(t, stateQueued|stateTestIntercept, statePost.flags)
 		close(done)
 	}
