storeliveness: refactor channel-based sendQueue with slices of slpb.Message
Previously, outgoing messages in the transport layer were queued using a
buffered channel (`messages chan slpb.Message`). Messages were processed
by pulling them one-by-one from the channel and batching them using a
timer-based approach.
This patch refactors the queue implementation to use a mutex-protected
slice (`msgs []slpb.Message`) instead of a channel. This change
simplifies the batching logic by allowing all queued messages to be
drained atomically in a single operation, rather than pulling them
individually from a channel. With this refactor, we also increase the
queue capacity to 100,000 messages, up from the per-store receive queue
size of 10,000 messages, since the send queue is per-node and serves
multiple stores. The refactor also switches the batching mechanism to a
"sleep-then-drain" approach in place of the existing timer-based one.
The timer-based approach had a subtle issue
where `processQueue` would block in a select statement waiting
on `q.messages` while batching, and when a new message was enqueued
(which signals `q.messages`), it would immediately wake up the
blocked goroutine, causing spikes in runnable goroutines.
The new `sendQueue` struct provides `Append()` to add messages, `Drain()`
to atomically retrieve all messages, and `Size()` to track the total
byte size of queued messages. The `processQueue` method now drains all
messages at once and sleeps for the batch duration, rather than using
the previous timer-based batching approach. By sleeping
first and then draining all messages atomically, we avoid the
aforementioned wake-up spikes and achieve better pacing behaviour.
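The shape of the new queue and its drain loop can be sketched as follows. This is a minimal reconstruction assuming a plain `sync.Mutex` and a byte counter; the method names (`Append`, `Drain`, `Size`) follow the commit, but the bodies, the `maxQueueLen` constant name, and the `processQueue` signature are assumptions.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// message stands in for slpb.Message; size is a stand-in for the
// message's encoded byte size.
type message struct{ size int64 }

// maxQueueLen mirrors the per-node capacity of 100,000 messages
// described above (the actual constant name may differ).
const maxQueueLen = 100000

// sendQueue is a mutex-protected slice queue, replacing the old
// buffered channel.
type sendQueue struct {
	mu    sync.Mutex
	msgs  []message
	bytes int64
}

// Append adds a message, reporting false if the queue is full.
func (q *sendQueue) Append(m message) bool {
	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.msgs) >= maxQueueLen {
		return false
	}
	q.msgs = append(q.msgs, m)
	q.bytes += m.size
	return true
}

// Drain atomically takes all queued messages in a single operation.
func (q *sendQueue) Drain() []message {
	q.mu.Lock()
	defer q.mu.Unlock()
	msgs := q.msgs
	q.msgs = nil
	q.bytes = 0
	return msgs
}

// Size returns the total byte size of queued messages.
func (q *sendQueue) Size() int64 {
	q.mu.Lock()
	defer q.mu.Unlock()
	return q.bytes
}

// processQueue sketches the sleep-then-drain loop: sleep for the batch
// duration, then drain and send everything queued in the meantime.
// Enqueuing no longer wakes this goroutine, avoiding runnable spikes.
func processQueue(q *sendQueue, batchDur time.Duration,
	send func([]message), stop <-chan struct{}) {
	for {
		select {
		case <-stop:
			return
		case <-time.After(batchDur):
		}
		if batch := q.Drain(); len(batch) > 0 {
			send(batch)
		}
	}
}

func main() {
	q := &sendQueue{}
	q.Append(message{size: 8})
	q.Append(message{size: 8})
	fmt.Println(q.Size())       // total bytes while queued
	fmt.Println(len(q.Drain())) // all messages taken at once
	fmt.Println(q.Size())       // queue empty after drain
}
```

Note that `Drain` hands the whole backing slice to the sender and starts a fresh one, so the caller owns the batch without any further locking.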
Part of: #148210
Release note: None