fix(shard-distributor): separate watch event processing from the cache refresh (#7670)

arzonus · web-flow · commit 3c8a7b82bfb4 · 2026-02-09T12:37:48.000+01:00
**What changed?** * Watch event processing in the `watch` function is now separated from the cache refresh: `watch` only signals a trigger channel, and `namespaceRefreshLoop` calls `refresh` when triggered  **Why?** * We observed that intensive watch event updates can cause a growing backlog both on the server side and in etcd, which may lead to OOMKills  **How did you test it?** * Unit tests * Ran on a dev cluster  **Potential risks** N/A  **Release notes** N/A  **Documentation Changes** N/A --- ## Reviewer Validation **PR Description Quality** (check these before reviewing code): - [ ] **"What changed"** provides a clear 1-2 line summary - [ ] Project Issue is linked - [ ] **"Why"** explains the full motivation with sufficient context - [ ] **Testing is documented:** - [ ] Unit test commands are included (with exact `go test` invocation) - [ ] Integration test setup/commands included (if integration tests were run) - [ ] Canary testing details included (if canary was mentioned) - [ ] **Potential risks** section is thoughtfully filled out (or legitimately N/A) - [ ] **Release notes** included if this completes a user-facing feature - [ ] **Documentation** needs are addressed (or noted if uncertain)
diff --git a/go.mod b/go.mod
@@ -76,6 +76,7 @@ require (
 	github.com/ncruces/go-sqlite3 v0.22.0
 	github.com/opensearch-project/opensearch-go/v4 v4.1.0
 	github.com/robfig/cron/v3 v3.0.1
+	go.etcd.io/etcd/api/v3 v3.5.5
 	go.uber.org/mock v0.5.0
 )
 
@@ -89,7 +90,6 @@ require (
 	github.com/tetratelabs/wazero v1.8.2 // indirect
 	github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
 	github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
-	go.etcd.io/etcd/api/v3 v3.5.5 // indirect
 	go.etcd.io/etcd/client/pkg/v3 v3.5.5 // indirect
 	google.golang.org/genproto v0.0.0-20231016165738-49dd2c1f3d0b // indirect
 	google.golang.org/genproto/googleapis/api v0.0.0-20231012201019-e917dd12ba7a // indirect
diff --git a/service/sharddistributor/store/etcd/executorstore/shardcache/namespaceshardcache.go b/service/sharddistributor/store/etcd/executorstore/shardcache/namespaceshardcache.go
@@ -122,22 +122,52 @@ func (n *namespaceShardToExecutor) Subscribe(ctx context.Context) (<-chan map[*s
 }
 
 func (n *namespaceShardToExecutor) namespaceRefreshLoop() {
+	triggerCh := n.runWatchLoop()
+
 	for {
-		if err := n.watch(); err != nil {
-			n.logger.Error("error watching in namespaceRefreshLoop, retrying...", tag.Error(err))
-			n.timeSource.Sleep(backoff.JitDuration(
-				namespaceRefreshLoopWatchRetryInterval,
-				namespaceRefreshLoopWatchJitterCoeff,
-			))
-			continue
-		}
+		select {
+		case <-n.stopCh:
+			n.logger.Info("stop channel closed, exiting namespaceRefreshLoop")
+			return
 
-		n.logger.Info("namespaceRefreshLoop is exiting")
-		return
+		case _, ok := <-triggerCh:
+			if !ok {
+				n.logger.Info("trigger channel closed, exiting namespaceRefreshLoop")
+				return
+			}
+
+			if err := n.refresh(context.Background()); err != nil {
+				n.logger.Error("failed to refresh namespace shard to executor", tag.Error(err))
+			}
+		}
 	}
 }
 
-func (n *namespaceShardToExecutor) watch() error {
+func (n *namespaceShardToExecutor) runWatchLoop() <-chan struct{} {
+	triggerCh := make(chan struct{}, 1)
+
+	go func() {
+		defer close(triggerCh)
+
+		for {
+			if err := n.watch(triggerCh); err != nil {
+				n.logger.Error("error watching in namespaceRefreshLoop, retrying...", tag.Error(err))
+				n.timeSource.Sleep(backoff.JitDuration(
+					namespaceRefreshLoopWatchRetryInterval,
+					namespaceRefreshLoopWatchJitterCoeff,
+				))
+				continue
+			}
+
+			n.logger.Info("namespaceRefreshLoop is exiting")
+			return
+		}
+	}()
+
+	return triggerCh
+}
+
+func (n *namespaceShardToExecutor) watch(triggerCh chan<- struct{}) error {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
@@ -151,6 +181,7 @@ func (n *namespaceShardToExecutor) watch() error {
 	for {
 		select {
 		case <-n.stopCh:
+			n.logger.Info("stop channel closed, exiting watch loop")
 			return nil
 
 		case watchResp, ok := <-watchChan:
@@ -170,10 +201,14 @@ func (n *namespaceShardToExecutor) watch() error {
 				}
 			}
 
-			if shouldRefresh {
-				if err := n.refresh(context.Background()); err != nil {
-					n.logger.Error("failed to refresh namespace shard to executor", tag.Error(err))
-				}
+			if !shouldRefresh {
+				continue
+			}
+
+			select {
+			case triggerCh <- struct{}{}:
+			default:
+				n.logger.Info("Cache is being refreshed, skipping trigger")
 			}
 		}
 	}
diff --git a/service/sharddistributor/store/etcd/executorstore/shardcache/namespaceshardcache_test.go b/service/sharddistributor/store/etcd/executorstore/shardcache/namespaceshardcache_test.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"go.etcd.io/etcd/api/v3/mvccpb"
 	clientv3 "go.etcd.io/etcd/client/v3"
 	"go.uber.org/goleak"
 	"go.uber.org/mock/gomock"
@@ -159,6 +160,8 @@ func TestNamespaceShardToExecutor_watch_watchChanErrors(t *testing.T) {
 	e, err := newNamespaceShardToExecutor(testPrefix, testNamespace, mockClient, stopCh, logger, clock.NewRealTimeSource())
 	require.NoError(t, err)
 
+	triggerChan := make(chan struct{}, 1)
+
 	// Test Case #1
 	// Test received compact revision error from watch channel
 	{
@@ -168,7 +171,7 @@ func TestNamespaceShardToExecutor_watch_watchChanErrors(t *testing.T) {
 			}
 		}()
 
-		err = e.watch()
+		err = e.watch(triggerChan)
 		require.Error(t, err)
 		assert.ErrorContains(t, err, "etcdserver: mvcc: required revision has been compacted")
 	}
@@ -177,12 +180,147 @@ func TestNamespaceShardToExecutor_watch_watchChanErrors(t *testing.T) {
 	// Test closed watch channel
 	{
 		close(watchChan)
-		err = e.watch()
+		err = e.watch(triggerChan)
 		require.Error(t, err)
 		assert.ErrorContains(t, err, "watch channel closed")
 	}
 }
 
+func TestNamespaceShardToExecutor_watch_triggerChBlocking(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	logger := testlogger.New(t)
+	mockClient := etcdclient.NewMockClient(ctrl)
+	stopCh := make(chan struct{})
+	testPrefix := "/test-prefix"
+	testNamespace := "test-namespace"
+
+	watchChan := make(chan clientv3.WatchResponse)
+	mockClient.EXPECT().
+		Watch(gomock.Any(), gomock.Any(), gomock.Any()).
+		Return(watchChan)
+
+	e, err := newNamespaceShardToExecutor(testPrefix, testNamespace, mockClient, stopCh, logger, clock.NewRealTimeSource())
+	require.NoError(t, err)
+
+	// Create a triggerCh with buffer size 1, but never read from it
+	triggerChan := make(chan struct{}, 1)
+
+	executorKey := etcdkeys.BuildExecutorKey(testPrefix, testNamespace, "executor-1", etcdkeys.ExecutorAssignedStateKey)
+
+	// Start watch in a goroutine
+	watchDone := make(chan error, 1)
+	go func() {
+		watchDone <- e.watch(triggerChan)
+	}()
+
+	// Send many events - the loop should not block even though triggerCh is full
+	for i := 0; i < 100; i++ {
+		select {
+		case watchChan <- clientv3.WatchResponse{
+			Events: []*clientv3.Event{
+				{
+					Type: clientv3.EventTypePut,
+					Kv: &mvccpb.KeyValue{
+						Key: []byte(executorKey),
+					},
+				},
+			},
+		}:
+		case <-time.After(100 * time.Millisecond):
+			t.Fatal("watch loop is stuck - could not send event to watchChan")
+		}
+	}
+
+	// Close stopCh to exit the watch loop
+	close(stopCh)
+
+	select {
+	case err := <-watchDone:
+		assert.NoError(t, err)
+	case <-time.After(1 * time.Second):
+		t.Fatal("watch loop did not exit after stopCh was closed")
+	}
+}
+
+func TestNamespaceShardToExecutor_namespaceRefreshLoop_triggersRefresh(t *testing.T) {
+	defer goleak.VerifyNone(t)
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	logger := testlogger.New(t)
+	mockClient := etcdclient.NewMockClient(ctrl)
+	timeSource := clock.NewMockedTimeSource()
+	stopCh := make(chan struct{})
+	testPrefix := "/test-prefix"
+	testNamespace := "test-namespace"
+	executorID := "executor-1"
+
+	watchChan := make(chan clientv3.WatchResponse)
+	mockClient.EXPECT().
+		Watch(gomock.Any(), gomock.Any(), gomock.Any()).
+		Return(watchChan)
+
+	executorPrefix := etcdkeys.BuildExecutorsPrefix(testPrefix, testNamespace)
+	executorKey := etcdkeys.BuildMetadataKey(
+		testPrefix,
+		testNamespace,
+		executorID,
+		"metadata-key",
+	)
+
+	// Mock Get call for refresh
+	mockClient.EXPECT().
+		Get(gomock.Any(), executorPrefix, gomock.Any()).
+		Return(
+			&clientv3.GetResponse{Kvs: []*mvccpb.KeyValue{
+				{
+					Key:   []byte(executorKey),
+					Value: []byte("metadata-value"),
+				},
+			}},
+			nil,
+		)
+
+	e, err := newNamespaceShardToExecutor(testPrefix, testNamespace, mockClient, stopCh, logger, timeSource)
+	require.NoError(t, err)
+
+	wg := sync.WaitGroup{}
+	wg.Add(1)
+
+	go func() {
+		defer wg.Done()
+		e.namespaceRefreshLoop()
+	}()
+
+	// Send a watch event for the executor's metadata key to trigger a refresh
+	go func() {
+		watchChan <- clientv3.WatchResponse{
+			Events: []*clientv3.Event{
+				{
+					Type: clientv3.EventTypePut,
+					Kv: &mvccpb.KeyValue{
+						Key: []byte(executorKey),
+					},
+				},
+			},
+		}
+	}()
+
+	require.Eventually(t, func() bool {
+		e.RLock()
+		defer e.RUnlock()
+		_, ok := e.shardOwners[executorID]
+		return ok
+	}, time.Second, 1*time.Millisecond, "expected executor to be added to shardOwners")
+
+	// Close stopCh to exit the loop
+	close(stopCh)
+	wg.Wait()
+}
+
 func TestNamespaceShardToExecutor_namespaceRefreshLoop_watchError(t *testing.T) {
 	defer goleak.VerifyNone(t)
 
@@ -209,9 +347,12 @@ func TestNamespaceShardToExecutor_namespaceRefreshLoop_watchError(t *testing.T)
 		Return(watchChanClosed)
 
 	// mock for third watch call that will be used when stopCh is closed
+	// it may or may not be called, depending on whether stopCh is closed before the retry interval elapses
 	mockClient.EXPECT().
 		Watch(gomock.Any(), gomock.Any(), gomock.Any()).
-		Return(make(chan clientv3.WatchResponse))
+		Return(make(chan clientv3.WatchResponse)).
+		MinTimes(0).
+		MaxTimes(1)
 
 	e, err := newNamespaceShardToExecutor(testPrefix, testNamespace, mockClient, stopCh, logger, timeSource)
 	require.NoError(t, err)