@@ -20,6 +20,7 @@ import (
    "context"
    "errors"
    "fmt"
+   "sort"
    "sync"
    "time"

@@ -44,6 +45,10 @@ const (
    // resyncPeriod for informer
    // TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
    resyncPeriod = time.Duration(10 * time.Minute)
+
+   // poolNameIndex is the name of the ResourceSlice store's index function,
+   // which indexes by ResourceSlice.Spec.Pool.Name.
+   poolNameIndex = "poolName"
)

// Controller synchronizes information about resources of one driver with
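For readers who have not used client-go indexers: the sketch below is not part of the patch, but it shows how an index function registered under a name such as poolNameIndex lets a cache.Indexer return every ResourceSlice of one pool through ByIndex instead of scanning the whole store. It assumes the v1alpha3 resource API used in this diff; the store contents and object names are invented.

```go
package main

import (
	"fmt"

	resourceapi "k8s.io/api/resource/v1alpha3"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/cache"
)

const poolNameIndex = "poolName"

func main() {
	// An Indexer is a Store plus named index functions. Each function maps an
	// object to the index keys under which it should be listed.
	store := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{
		poolNameIndex: func(obj interface{}) ([]string, error) {
			slice, ok := obj.(*resourceapi.ResourceSlice)
			if !ok {
				return nil, nil
			}
			return []string{slice.Spec.Pool.Name}, nil
		},
	})

	// Two invented slices in different pools.
	_ = store.Add(&resourceapi.ResourceSlice{
		ObjectMeta: metav1.ObjectMeta{Name: "slice-a"},
		Spec:       resourceapi.ResourceSliceSpec{Pool: resourceapi.ResourcePool{Name: "pool-1"}},
	})
	_ = store.Add(&resourceapi.ResourceSlice{
		ObjectMeta: metav1.ObjectMeta{Name: "slice-b"},
		Spec:       resourceapi.ResourceSliceSpec{Pool: resourceapi.ResourcePool{Name: "pool-2"}},
	})

	// ByIndex returns only the objects whose index function produced "pool-1".
	objs, err := store.ByIndex(poolNameIndex, "pool-1")
	fmt.Println(len(objs), err) // 1 <nil>
}
```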
@@ -58,7 +63,7 @@ type Controller struct {
    wg sync.WaitGroup
    // The queue is keyed with the pool name that needs work.
    queue workqueue.TypedRateLimitingInterface[string]
-   sliceStore cache.Store
+   sliceStore cache.Indexer

    mutex sync.RWMutex

@@ -109,24 +114,11 @@ type Owner struct {
// the controller is inactive. This can happen when kubelet is run stand-alone
// without an apiserver. In that case we can't and don't need to publish
// ResourceSlices.
-func StartController(ctx context.Context, kubeClient kubernetes.Interface, driver string, owner Owner, resources *DriverResources) *Controller {
-   if kubeClient == nil {
-       return nil
-   }
-
+func StartController(ctx context.Context, kubeClient kubernetes.Interface, driver string, owner Owner, resources *DriverResources) (*Controller, error) {
    logger := klog.FromContext(ctx)
-   ctx, cancel := context.WithCancelCause(ctx)
-
-   c := &Controller{
-       cancel:     cancel,
-       kubeClient: kubeClient,
-       driver:     driver,
-       owner:      owner,
-       queue: workqueue.NewTypedRateLimitingQueueWithConfig(
-           workqueue.DefaultTypedControllerRateLimiter[string](),
-           workqueue.TypedRateLimitingQueueConfig[string]{Name: "node_resource_slices"},
-       ),
-       resources: resources,
+   c, err := newController(ctx, kubeClient, driver, owner, resources)
+   if err != nil {
+       return nil, fmt.Errorf("create controller: %w", err)
    }

    logger.V(3).Info("Starting")
@@ -142,7 +134,7 @@ func StartController(ctx context.Context, kubeClient kubernetes.Interface, drive
        c.queue.Add(poolName)
    }

-   return c
+   return c, nil
}

// Stop cancels all background activity and blocks until the controller has stopped.
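Since StartController now returns (*Controller, error) instead of silently returning nil, callers are expected to check the error. A minimal sketch of a hypothetical caller follows; the import path, package name, and helper function are assumptions for illustration, and only StartController, Owner, DriverResources, Controller, and Stop come from the patched package.

```go
package example

import (
	"context"
	"fmt"

	"k8s.io/client-go/kubernetes"
	"k8s.io/dynamic-resource-allocation/resourceslice"
)

// startPublishing is a hypothetical helper inside a DRA driver plugin.
func startPublishing(ctx context.Context, kubeClient kubernetes.Interface, driverName string, owner resourceslice.Owner, resources *resourceslice.DriverResources) (*resourceslice.Controller, error) {
	c, err := resourceslice.StartController(ctx, kubeClient, driverName, owner, resources)
	if err != nil {
		// The controller no longer degrades silently when kubeClient is nil or
		// the informer cannot be set up; surface the error to the caller.
		return nil, fmt.Errorf("start ResourceSlice controller: %w", err)
	}
	// c.Stop() cancels the background sync when the driver shuts down.
	return c, nil
}
```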
@@ -175,20 +167,53 @@ func (c *Controller) Update(resources *DriverResources) {
    }
}

-// run is running in the background. It handles blocking initialization (like
-// syncing the informer) and then syncs the actual with the desired state.
-func (c *Controller) run(ctx context.Context) {
+// newController creates a new controller.
+func newController(ctx context.Context, kubeClient kubernetes.Interface, driver string, owner Owner, resources *DriverResources) (*Controller, error) {
+   if kubeClient == nil {
+       return nil, fmt.Errorf("kubeClient is nil")
+   }
+
+   ctx, cancel := context.WithCancelCause(ctx)
+
+   c := &Controller{
+       cancel:     cancel,
+       kubeClient: kubeClient,
+       driver:     driver,
+       owner:      owner,
+       queue: workqueue.NewTypedRateLimitingQueueWithConfig(
+           workqueue.DefaultTypedControllerRateLimiter[string](),
+           workqueue.TypedRateLimitingQueueConfig[string]{Name: "node_resource_slices"},
+       ),
+       resources: resources,
+   }
+
+   if err := c.initInformer(ctx); err != nil {
+       return nil, err
+   }
+   return c, nil
+}
+
+// initInformer initializes the informer used to watch for changes to ResourceSlices.
+func (c *Controller) initInformer(ctx context.Context) error {
    logger := klog.FromContext(ctx)

    // We always filter by driver name, by node name only for node-local resources.
    selector := fields.Set{resourceapi.ResourceSliceSelectorDriver: c.driver}
    if c.owner.APIVersion == "v1" && c.owner.Kind == "Node" {
        selector[resourceapi.ResourceSliceSelectorNodeName] = c.owner.Name
    }
-   informer := resourceinformers.NewFilteredResourceSliceInformer(c.kubeClient, resyncPeriod, nil, func(options *metav1.ListOptions) {
+   informer := resourceinformers.NewFilteredResourceSliceInformer(c.kubeClient, resyncPeriod, cache.Indexers{
+       poolNameIndex: func(obj interface{}) ([]string, error) {
+           slice, ok := obj.(*resourceapi.ResourceSlice)
+           if !ok {
+               return []string{}, nil
+           }
+           return []string{slice.Spec.Pool.Name}, nil
+       },
+   }, func(options *metav1.ListOptions) {
        options.FieldSelector = selector.String()
    })
-   c.sliceStore = informer.GetStore()
+   c.sliceStore = informer.GetIndexer()
    handler, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj any) {
            slice, ok := obj.(*resourceapi.ResourceSlice)
@@ -228,10 +253,8 @@ func (c *Controller) run(ctx context.Context) {
        },
    })
    if err != nil {
-       logger.Error(err, "Registering event handler on the ResourceSlice informer failed, disabling resource monitoring")
-       return
+       return fmt.Errorf("registering event handler on the ResourceSlice informer: %w", err)
    }
-
    // Start informer and wait for our cache to be populated.
    logger.V(3).Info("Starting ResourceSlice informer and waiting for it to sync")
    c.wg.Add(1)
@@ -245,13 +268,15 @@ func (c *Controller) run(ctx context.Context) {
        select {
        case <-time.After(time.Second):
        case <-ctx.Done():
-           return
+           return fmt.Errorf("sync ResourceSlice informer: %w", context.Cause(ctx))
        }
    }
    logger.V(3).Info("ResourceSlice informer has synced")
+   return nil
+}

-   // Seed the
-
+// run runs in the background, processing items from the work queue.
+func (c *Controller) run(ctx context.Context) {
    for c.processNextWorkItem(ctx) {
    }
}
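The sync loop above wraps context.Cause(ctx) rather than ctx.Err() so that the reason for cancellation reaches the caller. A standalone sketch of that difference (not part of the patch), with an invented cancellation cause:

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	// WithCancelCause lets the canceller record why it cancelled.
	ctx, cancel := context.WithCancelCause(context.Background())
	cancel(errors.New("kubelet shutting down")) // invented cause

	fmt.Println(ctx.Err())          // context canceled
	fmt.Println(context.Cause(ctx)) // kubelet shutting down
}
```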
@@ -295,10 +320,13 @@ func (c *Controller) syncPool(ctx context.Context, poolName string) error {
    logger := klog.FromContext(ctx)

    // Gather information about the actual and desired state.
-   // TODO: index by pool name.
    var slices []*resourceapi.ResourceSlice
-   for _, obj := range c.sliceStore.List() {
-       if slice, ok := obj.(*resourceapi.ResourceSlice); ok && slice.Spec.Pool.Name == poolName {
+   objs, err := c.sliceStore.ByIndex(poolNameIndex, poolName)
+   if err != nil {
+       return fmt.Errorf("retrieve ResourceSlice objects: %w", err)
+   }
+   for _, obj := range objs {
+       if slice, ok := obj.(*resourceapi.ResourceSlice); ok {
            slices = append(slices, slice)
        }
    }
@@ -346,6 +374,11 @@ func (c *Controller) syncPool(ctx context.Context, poolName string) error {
    }
    slices = currentSlices

+   // Sort by name to ensure that keeping only the first slice is deterministic.
+   sort.Slice(slices, func(i, j int) bool {
+       return slices[i].Name < slices[j].Name
+   })
+
    if pool, ok := resources.Pools[poolName]; ok {
        if pool.Generation > generation {
            generation = pool.Generation
@@ -397,6 +430,8 @@ func (c *Controller) syncPool(ctx context.Context, poolName string) error {

        logger.V(5).Info("Removing resource slices after pool removal", "obsoleteSlices", klog.KObjSlice(obsoleteSlices), "slices", klog.KObjSlice(slices), "numDevices", len(pool.Devices))
        obsoleteSlices = append(obsoleteSlices, slices...)
+       // No need to create or update the slices.
+       slices = nil
    }

    // Remove stale slices.
@@ -420,7 +455,7 @@ func (c *Controller) syncPool(ctx context.Context, poolName string) error {
        // TODO: switch to SSA once unit testing supports it.
        logger.V(5).Info("Updating existing resource slice", "slice", klog.KObj(slice))
        if _, err := c.kubeClient.ResourceV1alpha3().ResourceSlices().Update(ctx, slice, metav1.UpdateOptions{}); err != nil {
-           return fmt.Errorf("delete resource slice: %w", err)
+           return fmt.Errorf("update resource slice: %w", err)
        }
    }
