
Commit a205890

fix: handle jitter correctly (#2278)
1 parent e7c05cb

File tree

4 files changed (+168, -47 lines)

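In short: the previous Poll used a fixed time.Ticker and slept for a random jitter inside the tick branch before invoking the handler. That sleep blocked context cancellation for up to maxJitter, and because each tick drew a fresh jitter, two consecutive handler runs could end up spaced less than the interval apart. The new Poll drops the ticker entirely: it arms a single time.Timer with interval + jitter and re-arms it with a fresh jitter after each handler run, guaranteeing at least the interval between runs. With the polling goroutine now ending on context cancellation, the Stop methods on Poll and ConfigPoller become redundant and are removed.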

router/core/router.go

Lines changed: 0 additions & 6 deletions

@@ -1399,12 +1399,6 @@ func (r *Router) Shutdown(ctx context.Context) error {
 		ctx = ctxWithTimer
 	}
 
-	if r.configPoller != nil {
-		if subErr := r.configPoller.Stop(ctx); subErr != nil {
-			err.Append(fmt.Errorf("failed to stop config poller: %w", subErr))
-		}
-	}
-
 	if r.httpServer != nil {
 		if subErr := r.httpServer.Shutdown(ctx); subErr != nil {
 			if errors.Is(subErr, context.DeadlineExceeded) {

router/pkg/controlplane/configpoller/config_poller.go

Lines changed: 0 additions & 7 deletions

@@ -26,8 +26,6 @@ type ConfigPoller interface {
 	// If the Config is nil, no new config is available and the current config should be used.
 	// and updates the latest router config version. This method is only used for the initial config
 	GetRouterConfig(ctx context.Context) (*routerconfig.Response, error)
-	// Stop stops the config poller. After calling stop, the config poller cannot be used again.
-	Stop(ctx context.Context) error
 }
 
 type configPoller struct {
@@ -65,11 +63,6 @@ func (c *configPoller) Version() string {
 	return c.latestRouterConfigVersion
 }
 
-// Stop stops the config poller
-func (c *configPoller) Stop(_ context.Context) error {
-	return c.poller.Stop()
-}
-
 func (c *configPoller) Subscribe(ctx context.Context, handler func(newConfig *nodev1.RouterConfig, _ string) error) {
 	c.poller.Subscribe(ctx, func() {
 		start := time.Now()
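With Stop gone from both the Poller and ConfigPoller interfaces, the poller's lifetime is governed entirely by the context handed to Subscribe — which is presumably why the router.go change above can drop the explicit Stop call from Shutdown. A minimal sketch of the new usage pattern, assuming the package is importable as github.com/wundergraph/cosmo/router/pkg/controlplane (the handler body is illustrative, not the router's actual wiring):

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/wundergraph/cosmo/router/pkg/controlplane"
)

func main() {
	// The context now owns the poller's lifecycle.
	ctx, cancel := context.WithCancel(context.Background())

	p := controlplane.NewPoll(200*time.Millisecond, 50*time.Millisecond)
	p.Subscribe(ctx, func() {
		fmt.Println("would fetch the latest router config here")
	})

	// Let a few polls run, then shut down by cancelling the context;
	// there is no longer a Stop(ctx) call to make.
	time.Sleep(time.Second)
	cancel()
}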

router/pkg/controlplane/poll.go

Lines changed: 22 additions & 28 deletions

@@ -11,58 +11,52 @@ type Poller interface {
 	// Subscribe subscribes to the poller with a handler function that will be invoked
 	// Must only be called once. If the handler is busy during a tick, the next tick will be skipped.
 	Subscribe(ctx context.Context, handler func())
-	// Stop stops the poller. That means no more events will be emitted.
-	Stop() error
 }
 
 type Poll struct {
-	ticker *time.Ticker
-
+	interval  time.Duration
 	maxJitter time.Duration
 }
 
 // NewPoll creates a new poller that emits events at the given interval
 // and executes the given handler function in a separate goroutine.
 func NewPoll(interval time.Duration, maxJitter time.Duration) *Poll {
-	p := &Poll{
-		maxJitter: maxJitter,
+	// interval must be positive
+	if interval <= 0 {
+		panic("non-positive interval")
 	}
 
-	// maxJitter must be positive, otherwise the random duration function will panic
+	// maxJitter must be non-negative, otherwise the random duration function will panic
 	if maxJitter < 0 {
 		panic("negative max jitter")
 	}
 
-	p.ticker = time.NewTicker(interval)
-
-	return p
-}
-
-// Stop stops the poller. That means no more events will be emitted.
-// After calling stop, the poller cannot be used again.
-func (c *Poll) Stop() error {
-	c.ticker.Stop()
-	return nil
+	return &Poll{
+		interval:  interval,
+		maxJitter: maxJitter,
+	}
 }
 
 func (c *Poll) Subscribe(ctx context.Context, handler func()) {
 	go func() {
+		// Calculate initial delay: interval + jitter
+		jitter := timex.RandomDuration(c.maxJitter)
+		timer := time.NewTimer(c.interval + jitter)
+		defer timer.Stop()
+
 		for {
 			select {
 			case <-ctx.Done():
-				c.ticker.Stop()
 				return
-			case <-c.ticker.C:
-				// If the current handler is still in progress
-				// the next tick will be skipped. This is how a timer
-				// is implemented in the standard library.
-
-				// Add jitter to the interval
-				// This is to prevent all clients from hitting the server at exactly the same time,
-				// which could cause a burst load issue
-				time.Sleep(timex.RandomDuration(c.maxJitter))
-
+			case <-timer.C:
+				// Execute handler
				handler()
+
+				// Calculate next execution time: interval + new jitter
+				// This ensures we always wait at least 'interval' time between executions
+				jitter := timex.RandomDuration(c.maxJitter)
+				nextDelay := c.interval + jitter
+				timer.Reset(nextDelay)
 			}
 		}
 	}()
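This hunk is the substance of the fix. The old loop ticked at a fixed rate and slept for the jitter inside the tick branch, which both blocked the select (so ctx.Done() could not be observed during the sleep) and let two handler runs land closer than the interval apart whenever a small jitter followed a large one. The new loop arms one time.Timer and re-arms it with interval plus a fresh jitter only after the handler returns. A self-contained sketch of the same pattern, with a local randomDuration standing in for the internal timex.RandomDuration helper:

package main

import (
	"context"
	"fmt"
	"math/rand"
	"time"
)

// randomDuration stands in for the internal timex.RandomDuration:
// a uniformly random duration in [0, max).
func randomDuration(max time.Duration) time.Duration {
	if max <= 0 {
		return 0
	}
	return time.Duration(rand.Int63n(int64(max)))
}

// pollWithJitter mirrors the fixed loop: a single timer, re-armed to
// interval + fresh jitter after each handler run, so consecutive runs
// are always spaced at least `interval` apart and cancellation is
// observed immediately between runs.
func pollWithJitter(ctx context.Context, interval, maxJitter time.Duration, handler func()) {
	go func() {
		timer := time.NewTimer(interval + randomDuration(maxJitter))
		defer timer.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-timer.C:
				handler()
				timer.Reset(interval + randomDuration(maxJitter))
			}
		}
	}()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	last := time.Now()
	pollWithJitter(ctx, 100*time.Millisecond, 50*time.Millisecond, func() {
		now := time.Now()
		fmt.Printf("ran %v after the previous run\n", now.Sub(last))
		last = now
	})
	<-ctx.Done()
}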

router/pkg/controlplane/poll_test.go

Lines changed: 146 additions & 6 deletions

@@ -1,10 +1,13 @@
 package controlplane
 
 import (
+	"context"
+	"sync/atomic"
 	"testing"
 	"time"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 func Test_Poller(t *testing.T) {
@@ -23,13 +26,150 @@ func Test_Poller(t *testing.T) {
 		})
 	})
 
-	// This is a guarunteed pass because Poll.Stop() always returns nil,
-	// but it's good to have a test for it should there be an error in the future
-	t.Run("stopping should work correctly", func(t *testing.T) {
-		p := NewPoll(1*time.Second, 0*time.Second)
+	t.Run("interval plus jitter timing should work correctly", func(t *testing.T) {
+		interval := 100 * time.Millisecond
+		maxJitter := 50 * time.Millisecond
+		expectedMinInterval := interval
+		expectedMaxInterval := interval + maxJitter
 
-		err := p.Stop()
+		p := NewPoll(interval, maxJitter)
 
-		assert.NoError(t, err)
+		// Record execution timestamps
+		var timestamps []time.Time
+		executionCount := 0
+		targetExecutions := 4
+
+		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+		defer cancel()
+
+		p.Subscribe(ctx, func() {
+			timestamps = append(timestamps, time.Now())
+			executionCount++
+
+			// Cancel after we have enough executions
+			if executionCount >= targetExecutions {
+				cancel()
+			}
+		})
+
+		// Wait for context to be cancelled or timeout
+		<-ctx.Done()
+
+		// We should have at least 2 executions to measure intervals
+		require.GreaterOrEqual(t, len(timestamps), 2, "should have at least 2 executions")
+
+		// Calculate intervals between executions
+		for i := 1; i < len(timestamps); i++ {
+			actualInterval := timestamps[i].Sub(timestamps[i-1])
+
+			// Each interval should be at least the minimum interval
+			assert.GreaterOrEqual(t, actualInterval, expectedMinInterval,
+				"execution %d: actual interval %v should be >= minimum interval %v",
+				i, actualInterval, expectedMinInterval)
+
+			// Each interval should be at most interval + maxJitter
+			assert.LessOrEqual(t, actualInterval, expectedMaxInterval,
+				"execution %d: actual interval %v should be <= maximum interval %v",
+				i, actualInterval, expectedMaxInterval)
+
+			t.Logf("execution %d: interval = %v (expected: %v to %v)",
+				i, actualInterval, expectedMinInterval, expectedMaxInterval)
+		}
+	})
+
+	t.Run("should not allow concurrent handler invocations", func(t *testing.T) {
+		interval := 50 * time.Millisecond
+		maxJitter := 0 * time.Millisecond // No jitter for predictable timing
+
+		p := NewPoll(interval, maxJitter)
+
+		var concurrentInvocations int32
+		var maxConcurrentInvocations int32
+		var totalInvocations int32
+
+		ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+		defer cancel()
+
+		p.Subscribe(ctx, func() {
+			// Increment concurrent counter
+			current := atomic.AddInt32(&concurrentInvocations, 1)
+
+			// Track the maximum concurrent invocations we've seen
+			for {
+				max := atomic.LoadInt32(&maxConcurrentInvocations)
+				if current <= max || atomic.CompareAndSwapInt32(&maxConcurrentInvocations, max, current) {
+					break
+				}
+			}
+
+			// Increment total invocations
+			atomic.AddInt32(&totalInvocations, 1)
+
+			// Simulate work that takes longer than the interval
+			// This should cause subsequent timer events to be skipped
+			time.Sleep(150 * time.Millisecond)
+
+			// Decrement concurrent counter
+			atomic.AddInt32(&concurrentInvocations, -1)
+		})
+
+		// Wait for context timeout
+		<-ctx.Done()
+
+		// Verify that we never had more than 1 concurrent invocation
+		maxConcurrent := atomic.LoadInt32(&maxConcurrentInvocations)
+		totalInvoked := atomic.LoadInt32(&totalInvocations)
+
+		assert.Equal(t, int32(1), maxConcurrent,
+			"should never have more than 1 concurrent handler invocation")
+
+		// We should have fewer invocations than if they were all allowed to run
+		// (500ms test duration / 50ms interval = 10 possible, but many should be skipped)
+		assert.Greater(t, int32(10), totalInvoked,
+			"some invocations should have been skipped due to handler still running")
+
+		// But we should have at least some invocations
+		assert.Greater(t, totalInvoked, int32(0),
+			"should have at least some handler invocations")
+
+		t.Logf("total invocations: %d, max concurrent: %d", totalInvoked, maxConcurrent)
+	})
+
+	t.Run("should stop polling when context is cancelled", func(t *testing.T) {
+		interval := 50 * time.Millisecond
+		maxJitter := 0 * time.Millisecond // No jitter for predictable timing
+
+		p := NewPoll(interval, maxJitter)
+
+		var executionCount int32
+
+		ctx, cancel := context.WithCancel(context.Background())
+
+		// Start polling
+		go p.Subscribe(ctx, func() {
+			atomic.AddInt32(&executionCount, 1)
+		})
+
+		// Let it run for a bit to ensure polling starts
+		time.Sleep(150 * time.Millisecond)
+		countBeforeCancel := atomic.LoadInt32(&executionCount)
+
+		// Cancel the context
+		cancel()
+
+		// Wait for any pending executions to complete and verify no new ones occur
+		time.Sleep(200 * time.Millisecond)
+		countAfterCancel := atomic.LoadInt32(&executionCount)
+
+		// Verify that we had some executions before cancellation
+		assert.Greater(t, countBeforeCancel, int32(0),
+			"should have had executions before context cancellation")
+
+		// Verify that no new executions occurred after cancellation
+		assert.Equal(t, countBeforeCancel, countAfterCancel,
+			"should not have new executions after context cancellation")
+
+		t.Logf("executions before cancel: %d, executions after cancel: %d",
+			countBeforeCancel, countAfterCancel)
 	})
 }
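One detail worth noting in the concurrency test above: the peak number of in-flight handlers is tracked with a lock-free compare-and-swap loop rather than a mutex. Isolated, the pattern looks like this (a standalone sketch; the names are illustrative, not from the repo):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// recordMax stores current into max only if it is larger, retrying
// until the CAS succeeds or another goroutine has stored a value at
// least as large. This is the same loop the test uses to track the
// peak number of concurrent handler invocations.
func recordMax(max *int32, current int32) {
	for {
		old := atomic.LoadInt32(max)
		if current <= old || atomic.CompareAndSwapInt32(max, old, current) {
			return
		}
	}
}

func main() {
	var inFlight, peak int32
	var wg sync.WaitGroup

	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			cur := atomic.AddInt32(&inFlight, 1)
			recordMax(&peak, cur)
			atomic.AddInt32(&inFlight, -1)
		}()
	}
	wg.Wait()

	fmt.Println("peak concurrent goroutines:", atomic.LoadInt32(&peak))
}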
