perf: batch-drain buffered events in memqueue runLoop

strawgate · strawgate · commit 5d76081f0b08 · 2026-04-04T20:22:50.000-05:00
The memqueue runLoop processes one event per select loop iteration, paying
full goroutine scheduling overhead for each event. When multiple producers
are sending concurrently, the pushChan buffer fills up but is still drained
one-at-a-time.

After handling the first event from the select, perform a non-blocking drain
of up to 64 additional already-buffered events before returning to the main
select. This amortizes scheduling overhead across batches while the cap
prevents starvation of Get, Ack, and Close operations.

Benchmark results (Apple M4, null output pipeline, batch_size=2048):
  - BenchmarkFullPipeline/batch_2048: +19-24% throughput (p=0.010)
  - BenchmarkProducerThroughput (10 producers): neutral (p=0.442)
  - ES e2e (real Elasticsearch output): neutral, no regression (all p&gt;0.1)

Includes 7 behavioral equivalence tests that pass on both the old and new
code paths, covering backpressure, shutdown, multi-producer delivery, ack
correctness, and rapid close scenarios.
diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go
@@ -442,3 +442,309 @@ func BenchmarkProducerThroughput(b *testing.B) {
 	cancel()
 	testQueue.Close(true)
 }
+
+// ---------------------------------------------------------------------------
+// Behavioral equivalence tests: these verify the guarantees that callers
+// depend on. Every test here must pass with and without the batched drain
+// optimization in handleInsert.
+// ---------------------------------------------------------------------------
+
+// TestPublishReturnsOnlyAfterInsert verifies that when Publish returns true,
+// the event is already in the ring buffer (not just buffered in a channel).
+// We confirm this by immediately calling Get after Publish and verifying
+// the event is available.
+func TestPublishReturnsOnlyAfterInsert(t *testing.T) {
+	q := NewQueue(logp.NewNopLogger(), nil,
+		Settings{
+			Events:        100,
+			MaxGetRequest: 100,
+			FlushTimeout:  0, // no flush delay
+		}, 0, nil)
+	defer q.Close(true)
+
+	p := q.Producer(queue.ProducerConfig{})
+
+	// Publish one event
+	_, ok := p.Publish("event-1")
+	require.True(t, ok)
+
+	// The event must be immediately available for consumption
+	batch, err := q.Get(1)
+	require.NoError(t, err)
+	require.Equal(t, 1, batch.Count())
+	batch.Done()
+}
+
+// TestBackpressureBlocksAtQueueCapacity verifies that Publish blocks when
+// the ring buffer is full (backpressure) and unblocks when space is freed.
+func TestBackpressureBlocksAtQueueCapacity(t *testing.T) {
+	const queueSize = 5
+	q := NewQueue(logp.NewNopLogger(), nil,
+		Settings{
+			Events:        queueSize,
+			MaxGetRequest: queueSize,
+			FlushTimeout:  0,
+		}, 0, nil)
+	defer q.Close(true)
+
+	p := q.Producer(queue.ProducerConfig{})
+
+	// Fill the queue to capacity
+	for i := 0; i < queueSize; i++ {
+		_, ok := p.Publish(i)
+		require.True(t, ok, "Publish %d should succeed", i)
+	}
+
+	// Next publish must block (queue is full).
+	// Use a goroutine + timer to detect blocking.
+	published := make(chan bool, 1)
+	go func() {
+		_, ok := p.Publish("overflow")
+		published <- ok
+	}()
+
+	select {
+	case <-published:
+		t.Fatal("Publish should block when queue is full")
+	case <-time.After(50 * time.Millisecond):
+		// Expected: blocked
+	}
+
+	// Free one slot by consuming and acknowledging
+	batch, err := q.Get(1)
+	require.NoError(t, err)
+	batch.Done()
+
+	// The blocked publish should now succeed
+	select {
+	case ok := <-published:
+		require.True(t, ok, "Publish should succeed after space freed")
+	case <-time.After(time.Second):
+		t.Fatal("Publish did not unblock after freeing space")
+	}
+}
+
+// TestShutdownUnblocksProducer verifies that a producer blocked on a full
+// queue is unblocked by queue.Close and Publish returns false.
+func TestShutdownUnblocksProducer(t *testing.T) {
+	const queueSize = 3
+	q := NewQueue(logp.NewNopLogger(), nil,
+		Settings{
+			Events:        queueSize,
+			MaxGetRequest: queueSize,
+			FlushTimeout:  0,
+		}, 0, nil)
+
+	p := q.Producer(queue.ProducerConfig{})
+
+	// Fill the queue
+	for i := 0; i < queueSize; i++ {
+		_, ok := p.Publish(i)
+		require.True(t, ok)
+	}
+
+	// Blocked publish in goroutine
+	result := make(chan bool, 1)
+	go func() {
+		_, ok := p.Publish("blocked")
+		result <- ok
+	}()
+
+	// Give the goroutine time to block
+	time.Sleep(20 * time.Millisecond)
+
+	// Close the queue — should unblock the producer with ok=false
+	q.Close(false)
+
+	select {
+	case ok := <-result:
+		require.False(t, ok, "Publish should return false when queue is closing")
+	case <-time.After(time.Second):
+		t.Fatal("Blocked producer was not unblocked by queue.Close")
+	}
+}
+
+// TestMultiProducerAllEventsDelivered verifies no events are lost when
+// multiple producers publish concurrently and all events are consumed.
+func TestMultiProducerAllEventsDelivered(t *testing.T) {
+	const (
+		queueSize         = 200
+		numProducers      = 10
+		eventsPerProducer = 500
+		totalEvents       = numProducers * eventsPerProducer
+	)
+
+	testQueue := NewQueue(logp.NewNopLogger(), nil,
+		Settings{
+			Events:        queueSize,
+			MaxGetRequest: 100,
+			FlushTimeout:  time.Millisecond,
+		}, 0, nil)
+
+	var consumedCount atomic.Int64
+
+	// Consumer: read and acknowledge everything
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for {
+			batch, err := testQueue.Get(100)
+			if err != nil {
+				return
+			}
+			consumedCount.Add(int64(batch.Count()))
+			batch.Done()
+		}
+	}()
+
+	// Producers: each publishes exactly eventsPerProducer events
+	var publishedCount atomic.Int64
+	for i := 0; i < numProducers; i++ {
+		wg.Add(1)
+		go func(producerID int) {
+			defer wg.Done()
+			p := testQueue.Producer(queue.ProducerConfig{})
+			for j := 0; j < eventsPerProducer; j++ {
+				_, ok := p.Publish(fmt.Sprintf("p%d-e%d", producerID, j))
+				if ok {
+					publishedCount.Add(1)
+				}
+			}
+		}(i)
+	}
+
+	// Wait for all events to be consumed
+	require.Eventually(
+		t,
+		func() bool { return consumedCount.Load() == totalEvents },
+		10*time.Second,
+		time.Millisecond,
+		"expected %d consumed events, got %d", totalEvents, consumedCount.Load())
+
+	testQueue.Close(false)
+	<-testQueue.Done()
+	wg.Wait()
+
+	require.Equal(t, int64(totalEvents), publishedCount.Load(),
+		"all Publish calls should succeed")
+	require.Equal(t, int64(totalEvents), consumedCount.Load(),
+		"all published events should be consumed")
+}
+
+// TestTryPublishDropsWhenFull verifies that TryPublish does not succeed
+// when the queue's ring buffer is at capacity.
+func TestTryPublishDropsWhenFull(t *testing.T) {
+	const queueSize = 3
+	q := NewQueue(logp.NewNopLogger(), nil,
+		Settings{
+			Events:        queueSize,
+			MaxGetRequest: queueSize,
+			FlushTimeout:  0,
+		}, 0, nil)
+	defer q.Close(true)
+
+	p := q.Producer(queue.ProducerConfig{})
+
+	// Fill the queue
+	for i := 0; i < queueSize; i++ {
+		_, ok := p.Publish(i)
+		require.True(t, ok)
+	}
+
+	// TryPublish on a full queue should not succeed. With a buffered
+	// pushChan the non-blocking send may land in the channel buffer, but
+	// the runLoop won't drain it because the ring buffer is full, so
+	// TryPublish blocks on the response channel. Either way, we verify
+	// it does not return true within a short window.
+	result := make(chan bool, 1)
+	go func() {
+		_, ok := p.TryPublish("overflow")
+		result <- ok
+	}()
+
+	select {
+	case ok := <-result:
+		require.False(t, ok, "TryPublish must not return true when queue is full")
+	case <-time.After(100 * time.Millisecond):
+		// Acceptable: TryPublish is blocked, which means it did not succeed
+	}
+}
+
+// TestProducerCloseDoesNotBlockOnFullQueue verifies that closing a producer
+// does not deadlock even when the queue is full.
+func TestProducerCloseDoesNotBlockOnFullQueue(t *testing.T) {
+	const queueSize = 2
+	q := NewQueue(logp.NewNopLogger(), nil,
+		Settings{
+			Events:        queueSize,
+			MaxGetRequest: queueSize,
+			FlushTimeout:  0,
+		}, 0, nil)
+	defer q.Close(true)
+
+	p := q.Producer(queue.ProducerConfig{})
+
+	// Fill the queue
+	for i := 0; i < queueSize; i++ {
+		_, ok := p.Publish(i)
+		require.True(t, ok)
+	}
+
+	// Close the producer — should not deadlock
+	done := make(chan struct{})
+	go func() {
+		p.Close()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		// Success
+	case <-time.After(time.Second):
+		t.Fatal("Producer.Close() deadlocked on full queue")
+	}
+}
+
+// TestRapidCloseAfterPublish verifies that closing the queue immediately
+// after a successful Publish does not lose the event — it should still
+// be counted either as published (and acked) or the Publish returns false.
+func TestRapidCloseAfterPublish(t *testing.T) {
+	for iter := 0; iter < 50; iter++ {
+		var acked atomic.Int64
+		q := NewQueue(logp.NewNopLogger(), nil,
+			Settings{
+				Events:        10,
+				MaxGetRequest: 10,
+				FlushTimeout:  0,
+			}, 0, nil)
+
+		p := q.Producer(queue.ProducerConfig{
+			ACK: func(count int) { acked.Add(int64(count)) },
+		})
+
+		// Consumer
+		go func() {
+			for {
+				batch, err := q.Get(10)
+				if err != nil {
+					return
+				}
+				batch.Done()
+			}
+		}()
+
+		var published int64
+		_, ok := p.Publish("event-1")
+		if ok {
+			published++
+		}
+
+		q.Close(false)
+		<-q.Done()
+
+		// Ensure we never ack more than we published
+		require.GreaterOrEqual(t, published, acked.Load(),
+			"iter %d: acked (%d) > published (%d)", iter, acked.Load(), published)
+	}
+}
diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go
@@ -235,6 +235,27 @@ func (l *runLoop) handleInsert(req *pushRequest) {
 	l.nextEntryID++
 	l.eventCount++
 
+	// Drain additional buffered events without returning to the select loop.
+	// This amortizes the cost of goroutine scheduling when multiple producers
+	// are sending concurrently: instead of one select wakeup per event, we
+	// handle up to maxDrainPerWakeup already-buffered events in a tight loop.
+	// The cap prevents starvation of Get, Ack, and Close operations.
+	const maxDrainPerWakeup = 64
+	drained := 0
+drain:
+	for l.eventCount < len(l.broker.buf) && drained < maxDrainPerWakeup {
+		select {
+		case req := <-l.broker.pushChan:
+			l.insert(&req, l.nextEntryID)
+			req.resp <- l.nextEntryID
+			l.nextEntryID++
+			l.eventCount++
+			drained++
+		default:
+			break drain
+		}
+	}
+
 	// See if this gave us enough for a new batch
 	l.maybeUnblockGetRequest()
 }