Skip to content

Commit 67f0428

Browse files
committed
DRA resourceslice controller: delay sync
When deleting a bunch of slices, the delete events queue the pool while it is being synced. It then got synced again immediately, while the deleted slices were still being removed from the informer cache. The obsolete slice in the cache caused the controller to delete it again, which fails with a "not found". That error is ignored, but this still caused extra API calls. Now syncing gets delayed with a configuration duration (default: 30 seconds) so the informer cache is more likely to be up-to-date when the pool gets synced again.
1 parent 99cf2d8 commit 67f0428

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

staging/src/k8s.io/dynamic-resource-allocation/resourceslice/resourceslicecontroller.go

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ const (
5858
//
5959
// To mitigate this, we use a TTL and check a pool again once added slices expire.
6060
defaultMutationCacheTTL = time.Minute
61+
62+
// defaultSyncDelay defines how long to wait between receiving the most recent
63+
// informer event and syncing again. This is long enough that the informer cache
64+
// should be up-to-date (matter mostly for deletes because an out-dated cache
65+
// causes redundant delete API calls) and not too long that a human mistake
66+
// doesn't get fixed while that human is waiting for it.
67+
defaultSyncDelay = 30 * time.Second
6168
)
6269

6370
// Controller synchronizes information about resources of one driver with
@@ -74,6 +81,7 @@ type Controller struct {
7481
queue workqueue.TypedRateLimitingInterface[string]
7582
sliceStore cache.MutationCache
7683
mutationCacheTTL time.Duration
84+
syncDelay time.Duration
7785

7886
// Must use atomic access...
7987
numCreates int64
@@ -194,6 +202,15 @@ type Options struct {
194202
// MutationCacheTTL can be used to change the default TTL of one minute.
195203
// See source code for details.
196204
MutationCacheTTL *time.Duration
205+
206+
// SyncDelay defines how long to wait between receiving the most recent
207+
// informer event and syncing again. The default is 30 seconds.
208+
//
209+
// This is long enough that the informer cache should be up-to-date
210+
// (matter mostly for deletes because an out-dated cache causes
211+
// redundant delete API calls) and not too long that a human mistake
212+
// doesn't get fixed while that human is waiting for it.
213+
SyncDelay *time.Duration
197214
}
198215

199216
// Stop cancels all background activity and blocks until the controller has stopped.
@@ -267,17 +284,15 @@ func newController(ctx context.Context, options Options) (*Controller, error) {
267284
owner: options.Owner.DeepCopy(),
268285
queue: options.Queue,
269286
resources: options.Resources,
270-
mutationCacheTTL: defaultMutationCacheTTL,
287+
mutationCacheTTL: ptr.Deref(options.MutationCacheTTL, defaultMutationCacheTTL),
288+
syncDelay: ptr.Deref(options.SyncDelay, defaultSyncDelay),
271289
}
272290
if c.queue == nil {
273291
c.queue = workqueue.NewTypedRateLimitingQueueWithConfig(
274292
workqueue.DefaultTypedControllerRateLimiter[string](),
275293
workqueue.TypedRateLimitingQueueConfig[string]{Name: "node_resource_slices"},
276294
)
277295
}
278-
if options.MutationCacheTTL != nil {
279-
c.mutationCacheTTL = *options.MutationCacheTTL
280-
}
281296
if err := c.initInformer(ctx); err != nil {
282297
return nil, err
283298
}
@@ -321,7 +336,7 @@ func (c *Controller) initInformer(ctx context.Context) error {
321336
return
322337
}
323338
logger.V(5).Info("ResourceSlice add", "slice", klog.KObj(slice))
324-
c.queue.Add(slice.Spec.Pool.Name)
339+
c.queue.AddAfter(slice.Spec.Pool.Name, c.syncDelay)
325340
},
326341
UpdateFunc: func(old, new any) {
327342
oldSlice, ok := old.(*resourceapi.ResourceSlice)
@@ -337,8 +352,8 @@ func (c *Controller) initInformer(ctx context.Context) error {
337352
} else {
338353
logger.V(5).Info("ResourceSlice update", "slice", klog.KObj(newSlice))
339354
}
340-
c.queue.Add(oldSlice.Spec.Pool.Name)
341-
c.queue.Add(newSlice.Spec.Pool.Name)
355+
c.queue.AddAfter(oldSlice.Spec.Pool.Name, c.syncDelay)
356+
c.queue.AddAfter(newSlice.Spec.Pool.Name, c.syncDelay)
342357
},
343358
DeleteFunc: func(obj any) {
344359
if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
@@ -349,7 +364,7 @@ func (c *Controller) initInformer(ctx context.Context) error {
349364
return
350365
}
351366
logger.V(5).Info("ResourceSlice delete", "slice", klog.KObj(slice))
352-
c.queue.Add(slice.Spec.Pool.Name)
367+
c.queue.AddAfter(slice.Spec.Pool.Name, c.syncDelay)
353368
},
354369
})
355370
if err != nil {
@@ -674,7 +689,7 @@ func (c *Controller) syncPool(ctx context.Context, poolName string) error {
674689
case err == nil:
675690
atomic.AddInt64(&c.numDeletes, 1)
676691
case apierrors.IsNotFound(err):
677-
// okay
692+
logger.V(5).Info("Resource slice was already deleted earlier", "slice", klog.KObj(slice))
678693
default:
679694
return fmt.Errorf("delete resource slice: %w", err)
680695
}

staging/src/k8s.io/dynamic-resource-allocation/resourceslice/resourceslicecontroller_test.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"strconv"
2323
"sync"
2424
"testing"
25+
"time"
2526

2627
"github.com/stretchr/testify/assert"
2728
"github.com/stretchr/testify/require"
@@ -87,6 +88,7 @@ func TestControllerSyncPool(t *testing.T) {
8788
)
8889

8990
testCases := map[string]struct {
91+
syncDelay *time.Duration
9092
// nodeUID is empty if not a node-local.
9193
nodeUID types.UID
9294
// noOwner completely disables setting an owner.
@@ -142,7 +144,8 @@ func TestControllerSyncPool(t *testing.T) {
142144
},
143145
},
144146
"remove-pool": {
145-
nodeUID: nodeUID,
147+
nodeUID: nodeUID,
148+
syncDelay: ptr.To(time.Duration(0)),
146149
initialObjects: []runtime.Object{
147150
MakeResourceSlice().Name(resourceSlice1).UID(resourceSlice1).
148151
NodeOwnerReferences(ownerName, string(nodeUID)).NodeName(ownerName).
@@ -653,6 +656,7 @@ func TestControllerSyncPool(t *testing.T) {
653656
Owner: owner,
654657
Resources: test.inputDriverResources,
655658
Queue: &queue,
659+
SyncDelay: test.syncDelay,
656660
})
657661
defer ctrl.Stop()
658662
require.NoError(t, err, "unexpected controller creation error")
@@ -673,15 +677,14 @@ func TestControllerSyncPool(t *testing.T) {
673677

674678
assert.Equal(t, test.expectedStats, ctrl.GetStats())
675679

676-
// The informer might have added a work item after ctrl.run returned.
680+
// The informer might have added a work item before or after ctrl.run returned,
681+
// therefore we cannot compare the `Later` field. It's either defaultMutationCacheTTL
682+
// (last AddAfter call was after a create) or defaultSyncDelay (last AddAfter was
683+
// from informer event handler).
677684
actualState := queue.State()
678-
actualState.Ready = nil
685+
actualState.Later = nil
679686
var expectState workqueue.MockState[string]
680-
if test.expectedStats.NumCreates > 0 {
681-
expectState.Later = []workqueue.MockDelayedItem[string]{{Item: poolName, Duration: defaultMutationCacheTTL}}
682-
}
683687
assert.Equal(t, expectState, actualState)
684-
685688
})
686689
}
687690
}

0 commit comments

Comments
 (0)