
Commit 849c153

Allow list items to be processed in parallel
Signed-off-by: Shady Rafehi <[email protected]>
1 parent cebed7e commit 849c153

File tree

4 files changed: +163 −52 lines


pkg/cache/cluster.go

Lines changed: 83 additions & 17 deletions
@@ -56,6 +56,8 @@ const (
     // Limit is required to avoid memory spikes during cache initialization.
     // The default limit of 50 is chosen based on experiments.
     defaultListSemaphoreWeight = 50
+    // defaultListItemSemaphoreWeight limits the amount of items to process in parallel for each k8s list.
+    defaultListItemSemaphoreWeight = int64(1)
     // defaultEventProcessingInterval is the default interval for processing events
     defaultEventProcessingInterval = 100 * time.Millisecond
 )
@@ -164,15 +166,16 @@ type ListRetryFunc func(err error) bool
 func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCache {
     log := textlogger.NewLogger(textlogger.NewConfig())
     cache := &clusterCache{
-        settings:           Settings{ResourceHealthOverride: &noopSettings{}, ResourcesFilter: &noopSettings{}},
-        apisMeta:           make(map[schema.GroupKind]*apiMeta),
-        eventMetaCh:        nil,
-        listPageSize:       defaultListPageSize,
-        listPageBufferSize: defaultListPageBufferSize,
-        listSemaphore:      semaphore.NewWeighted(defaultListSemaphoreWeight),
-        resources:          make(map[kube.ResourceKey]*Resource),
-        nsIndex:            make(map[string]map[kube.ResourceKey]*Resource),
-        config:             config,
+        settings:                Settings{ResourceHealthOverride: &noopSettings{}, ResourcesFilter: &noopSettings{}},
+        apisMeta:                make(map[schema.GroupKind]*apiMeta),
+        eventMetaCh:             nil,
+        listPageSize:            defaultListPageSize,
+        listPageBufferSize:      defaultListPageBufferSize,
+        listSemaphore:           semaphore.NewWeighted(defaultListSemaphoreWeight),
+        listItemSemaphoreWeight: defaultListItemSemaphoreWeight,
+        resources:               make(map[kube.ResourceKey]*Resource),
+        nsIndex:                 make(map[string]map[kube.ResourceKey]*Resource),
+        config:                  config,
         kubectl: &kube.KubectlCmd{
             Log:    log,
             Tracer: tracing.NopTracer{},
@@ -219,8 +222,9 @@ type clusterCache struct {
     // size of a page for list operations pager.
     listPageSize int64
     // number of pages to prefetch for list pager.
-    listPageBufferSize int32
-    listSemaphore      WeightedSemaphore
+    listPageBufferSize      int32
+    listSemaphore           WeightedSemaphore
+    listItemSemaphoreWeight int64

     // retry options for list operations
     listRetryLimit int32
@@ -262,6 +266,35 @@ type clusterCacheSync struct {
     resyncTimeout time.Duration
 }

+// listItemTaskLimiter limits the amount of list items to process in parallel.
+type listItemTaskLimiter struct {
+    sem WeightedSemaphore
+    wg  sync.WaitGroup
+}
+
+// Run executes the given task concurrently, blocking if the pool is at capacity.
+func (t *listItemTaskLimiter) Run(ctx context.Context, task func()) error {
+    t.wg.Add(1)
+    if err := t.sem.Acquire(ctx, 1); err != nil {
+        t.wg.Done()
+        return fmt.Errorf("failed to acquire semaphore: %w", err)
+    }
+
+    go func() {
+        defer t.wg.Done()
+        defer t.sem.Release(1)
+
+        task()
+    }()
+
+    return nil
+}
+
+// Wait blocks until all submitted tasks have completed.
+func (t *listItemTaskLimiter) Wait() {
+    t.wg.Wait()
+}
+
 // ListRetryFuncNever never retries on errors
 func ListRetryFuncNever(_ error) bool {
     return false
@@ -446,6 +479,13 @@ func (c *clusterCache) newResource(un *unstructured.Unstructured) *Resource {
     return resource
 }

+func (c *clusterCache) newListItemTaskLimiter() *listItemTaskLimiter {
+    return &listItemTaskLimiter{
+        sem: semaphore.NewWeighted(c.listItemSemaphoreWeight),
+        wg:  sync.WaitGroup{},
+    }
+}
+
 func (c *clusterCache) setNode(n *Resource) {
     key := n.ResourceKey()
     c.resources[key] = n
@@ -629,17 +669,33 @@ func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.Reso

 // loadInitialState loads the state of all the resources retrieved by the given resource client.
 func (c *clusterCache) loadInitialState(ctx context.Context, api kube.APIResourceInfo, resClient dynamic.ResourceInterface, ns string, lock bool) (string, error) {
-    var items []*Resource
+    var (
+        items    []*Resource
+        listLock = sync.Mutex{}
+        limiter  = c.newListItemTaskLimiter()
+    )
+
     resourceVersion, err := c.listResources(ctx, resClient, func(listPager *pager.ListPager) error {
         return listPager.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error {
             if un, ok := obj.(*unstructured.Unstructured); !ok {
                 return fmt.Errorf("object %s/%s has an unexpected type", un.GroupVersionKind().String(), un.GetName())
             } else {
-                items = append(items, c.newResource(un))
+                if err := limiter.Run(ctx, func() {
+                    newRes := c.newResource(un)
+                    listLock.Lock()
+                    items = append(items, newRes)
+                    listLock.Unlock()
+                }); err != nil {
+                    return fmt.Errorf("failed to process list item: %w", err)
+                }
             }
             return nil
         })
     })
+
+    // Wait until all items have completed processing.
+    limiter.Wait()
+
     if err != nil {
         return "", fmt.Errorf("failed to load initial state of resource %s: %w", api.GroupKind.String(), err)
     }
@@ -938,19 +994,29 @@ func (c *clusterCache) sync() error {
         lock.Unlock()

         return c.processApi(client, api, func(resClient dynamic.ResourceInterface, ns string) error {
+            limiter := c.newListItemTaskLimiter()
+
             resourceVersion, err := c.listResources(ctx, resClient, func(listPager *pager.ListPager) error {
                 return listPager.EachListItem(context.Background(), metav1.ListOptions{}, func(obj runtime.Object) error {
                     if un, ok := obj.(*unstructured.Unstructured); !ok {
                         return fmt.Errorf("object %s/%s has an unexpected type", un.GroupVersionKind().String(), un.GetName())
                     } else {
-                        newRes := c.newResource(un)
-                        lock.Lock()
-                        c.setNode(newRes)
-                        lock.Unlock()
+                        if err := limiter.Run(ctx, func() {
+                            newRes := c.newResource(un)
+                            lock.Lock()
+                            c.setNode(newRes)
+                            lock.Unlock()
+                        }); err != nil {
+                            return fmt.Errorf("failed to process list item: %w", err)
+                        }
                     }
                     return nil
                 })
             })
+
+            // Wait until all items have completed processing.
+            limiter.Wait()
+
             if err != nil {
                 if c.isRestrictedResource(err) {
                     keep := false
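
The listItemTaskLimiter added above is a small bounded-concurrency helper: Run acquires one unit of the weighted semaphore before spawning a goroutine for the task, the goroutine releases the unit when it finishes, and Wait blocks until every submitted task has drained. The following self-contained sketch (illustrative only, not part of this commit; the weight of 4 and the counter are made up) shows the same pattern using golang.org/x/sync/semaphore directly:

package main

import (
    "context"
    "fmt"
    "sync"

    "golang.org/x/sync/semaphore"
)

func main() {
    ctx := context.Background()
    sem := semaphore.NewWeighted(4) // analogous to listItemSemaphoreWeight
    var (
        wg        sync.WaitGroup
        mu        sync.Mutex
        processed int
    )

    for i := 0; i < 20; i++ {
        wg.Add(1)
        // Blocks while 4 tasks are already in flight, mirroring limiter.Run.
        if err := sem.Acquire(ctx, 1); err != nil {
            wg.Done()
            panic(err)
        }
        go func() {
            defer wg.Done()
            defer sem.Release(1)
            // Stand-in for c.newResource(un) plus the lock-protected append/setNode.
            mu.Lock()
            processed++
            mu.Unlock()
        }()
    }

    wg.Wait() // mirrors limiter.Wait before the list result is consumed
    fmt.Println("processed:", processed)
}

With the default weight of 1 (defaultListItemSemaphoreWeight) at most one task runs at a time, so item processing stays effectively sequential unless a consumer raises the weight.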

pkg/cache/cluster_test.go

Lines changed: 64 additions & 35 deletions
@@ -147,46 +147,75 @@ func getChildren(cluster *clusterCache, un *unstructured.Unstructured) []*Resour
     return hierarchy[1:]
 }

-// Benchmark_sync is meant to simulate cluster initialization when populateResourceInfoHandler does nontrivial work.
-func Benchmark_sync(t *testing.B) {
-    resources := []runtime.Object{}
-    for i := 0; i < 100; i++ {
-        resources = append(resources, &corev1.Pod{
-            ObjectMeta: metav1.ObjectMeta{
-                Name:      fmt.Sprintf("pod-%d", i),
-                Namespace: "default",
-            },
-        }, &appsv1.ReplicaSet{
-            ObjectMeta: metav1.ObjectMeta{
-                Name:      fmt.Sprintf("rs-%d", i),
-                Namespace: "default",
-            },
-        }, &appsv1.Deployment{
-            ObjectMeta: metav1.ObjectMeta{
-                Name:      fmt.Sprintf("deploy-%d", i),
-                Namespace: "default",
-            },
-        }, &appsv1.StatefulSet{
-            ObjectMeta: metav1.ObjectMeta{
-                Name:      fmt.Sprintf("sts-%d", i),
-                Namespace: "default",
-            },
-        })
-    }
+// BenchmarkSync benchmarks cluster initialization when populateResourceInfoHandler does nontrivial work.
+// The benchmark is executed using different list item semaphore weights.
+func BenchmarkSync(b *testing.B) {
+    run := func(bb *testing.B, weight int64, overhead time.Duration) {
+        resources := []runtime.Object{}
+        for i := 0; i < 100; i++ {
+            resources = append(resources, &corev1.Pod{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      fmt.Sprintf("pod-%d", i),
+                    Namespace: "default",
+                },
+            }, &appsv1.ReplicaSet{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      fmt.Sprintf("rs-%d", i),
+                    Namespace: "default",
+                },
+            }, &appsv1.Deployment{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      fmt.Sprintf("deploy-%d", i),
+                    Namespace: "default",
+                },
+            }, &appsv1.StatefulSet{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      fmt.Sprintf("sts-%d", i),
+                    Namespace: "default",
+                },
+            })
+        }

-    c := newCluster(t, resources...)
+        c := newCluster(bb, resources...)
+        c.listItemSemaphoreWeight = weight

-    c.populateResourceInfoHandler = func(_ *unstructured.Unstructured, _ bool) (info any, cacheManifest bool) {
-        time.Sleep(10 * time.Microsecond)
-        return nil, false
-    }
+        c.populateResourceInfoHandler = func(_ *unstructured.Unstructured, _ bool) (info any, cacheManifest bool) {
+            time.Sleep(overhead)
+            return nil, false
+        }

-    t.ResetTimer()
+        bb.ResetTimer()

-    for n := 0; n < t.N; n++ {
-        err := c.sync()
-        require.NoError(t, err)
+        for n := 0; n < bb.N; n++ {
+            err := c.sync()
+            require.NoError(bb, err)
+        }
     }
+
+    b.Run("weight=1,overhead=100μs", func(bb *testing.B) {
+        run(bb, 1, 100*time.Microsecond)
+    })
+    b.Run("weight=2,overhead=100μs", func(bb *testing.B) {
+        run(bb, 2, 100*time.Microsecond)
+    })
+    b.Run("weight=4,overhead=100μs", func(bb *testing.B) {
+        run(bb, 4, 100*time.Microsecond)
+    })
+    b.Run("weight=8,overhead=100μs", func(bb *testing.B) {
+        run(bb, 8, 100*time.Microsecond)
+    })
+    b.Run("weight=1,overhead=500μs", func(bb *testing.B) {
+        run(bb, 1, 500*time.Microsecond)
+    })
+    b.Run("weight=2,overhead=500μs", func(bb *testing.B) {
+        run(bb, 2, 500*time.Microsecond)
+    })
+    b.Run("weight=4,overhead=500μs", func(bb *testing.B) {
+        run(bb, 4, 500*time.Microsecond)
+    })
+    b.Run("weight=8,overhead=500μs", func(bb *testing.B) {
+        run(bb, 8, 500*time.Microsecond)
+    })
 }

 func TestEnsureSynced(t *testing.T) {
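
To compare the weights locally, the sub-benchmarks can be run with the standard Go tooling, for example go test -run '^$' -bench BenchmarkSync ./pkg/cache/ from the repository root (adjust the package path if needed); each weight/overhead combination then reports its own result.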

pkg/cache/settings.go

Lines changed: 7 additions & 0 deletions
@@ -102,6 +102,13 @@ func SetListSemaphore(listSemaphore WeightedSemaphore) UpdateSettingsFunc {
     }
 }

+// SetListItemSemaphoreWeight sets the weight to limit the amount of k8s list items to process in parallel.
+func SetListItemSemaphoreWeight(listItemSemaphoreWeight int64) UpdateSettingsFunc {
+    return func(cache *clusterCache) {
+        cache.listItemSemaphoreWeight = listItemSemaphoreWeight
+    }
+}
+
 // SetResyncTimeout updates cluster re-sync timeout
 func SetResyncTimeout(timeout time.Duration) UpdateSettingsFunc {
     return func(cache *clusterCache) {
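
A hedged usage sketch (written in-package, since NewClusterCache returns the unexported *clusterCache; the helper name newParallelListCache is illustrative and not part of this commit): the new option composes with the existing constructor like any other UpdateSettingsFunc.

package cache

import "k8s.io/client-go/rest"

// newParallelListCache is illustrative only: it builds a cluster cache that
// processes up to 8 items of each Kubernetes list in parallel.
func newParallelListCache(config *rest.Config) *clusterCache {
    return NewClusterCache(config, SetListItemSemaphoreWeight(8))
}

Consumers that keep the default weight of 1 continue to process list items one at a time.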

pkg/cache/settings_test.go

Lines changed: 9 additions & 0 deletions
@@ -72,3 +72,12 @@ func TestSetEventsProcessingInterval(t *testing.T) {
     cache.Invalidate(SetEventProcessingInterval(interval))
     assert.Equal(t, interval, cache.eventProcessingInterval)
 }
+
+func TestSetListItemSemaphoreWeight(t *testing.T) {
+    cache := NewClusterCache(&rest.Config{})
+    assert.Equal(t, defaultListItemSemaphoreWeight, cache.listItemSemaphoreWeight)
+
+    weight := int64(8)
+    cache.Invalidate(SetListItemSemaphoreWeight(weight))
+    assert.Equal(t, weight, cache.listItemSemaphoreWeight)
+}
