@@ -22,7 +22,7 @@ import (
22
22
"fmt"
23
23
"time"
24
24
25
- "k8s.io/api/core/v1"
25
+ v1 "k8s.io/api/core/v1"
26
26
apierrors "k8s.io/apimachinery/pkg/api/errors"
27
27
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28
28
"k8s.io/apimachinery/pkg/types"
@@ -32,9 +32,11 @@ import (
32
32
clientset "k8s.io/client-go/kubernetes"
33
33
"k8s.io/client-go/kubernetes/scheme"
34
34
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
35
+ corelisters "k8s.io/client-go/listers/core/v1"
35
36
"k8s.io/client-go/tools/cache"
36
37
"k8s.io/client-go/tools/record"
37
38
clientretry "k8s.io/client-go/util/retry"
39
+ "k8s.io/client-go/util/workqueue"
38
40
cloudprovider "k8s.io/cloud-provider"
39
41
cloudproviderapi "k8s.io/cloud-provider/api"
40
42
cloudnodeutil "k8s.io/cloud-provider/node/helpers"
@@ -84,6 +86,7 @@ var UpdateNodeSpecBackoff = wait.Backoff{
84
86
Jitter : 1.0 ,
85
87
}
86
88
89
+ // CloudNodeController is the controller implementation for Node resources
87
90
type CloudNodeController struct {
88
91
nodeInformer coreinformers.NodeInformer
89
92
kubeClient clientset.Interface
@@ -92,6 +95,10 @@ type CloudNodeController struct {
92
95
cloud cloudprovider.Interface
93
96
94
97
nodeStatusUpdateFrequency time.Duration
98
+
99
+ nodesLister corelisters.NodeLister
100
+ nodesSynced cache.InformerSynced
101
+ workqueue workqueue.RateLimitingInterface
95
102
}
96
103
97
104
// NewCloudNodeController creates a CloudNodeController object
@@ -120,38 +127,112 @@ func NewCloudNodeController(
120
127
recorder : recorder ,
121
128
cloud : cloud ,
122
129
nodeStatusUpdateFrequency : nodeStatusUpdateFrequency ,
130
+ nodesLister : nodeInformer .Lister (),
131
+ nodesSynced : nodeInformer .Informer ().HasSynced ,
132
+ workqueue : workqueue .NewNamedRateLimitingQueue (workqueue .DefaultControllerRateLimiter (), "Nodes" ),
123
133
}
124
134
125
135
// Use shared informer to listen to add/update of nodes. Note that any nodes
126
136
// that exist before node controller starts will show up in the update method
127
137
cnc .nodeInformer .Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
128
- AddFunc : func ( obj interface {}) { cnc .AddCloudNode ( context . TODO (), obj ) } ,
129
- UpdateFunc : func (oldObj , newObj interface {}) { cnc .UpdateCloudNode ( context . TODO (), oldObj , newObj ) },
138
+ AddFunc : cnc .enqueueNode ,
139
+ UpdateFunc : func (oldObj , newObj interface {}) { cnc .enqueueNode ( newObj ) },
130
140
})
131
141
132
142
return cnc , nil
133
143
}
134
144
145
+ // Run will sync informer caches and start workers.
135
146
// This controller updates newly registered nodes with information
136
147
// from the cloud provider. This call is blocking so should be called
137
148
// via a goroutine
138
149
func (cnc * CloudNodeController ) Run (stopCh <- chan struct {}) {
139
150
defer utilruntime .HandleCrash ()
151
+ defer cnc .workqueue .ShutDown ()
152
+
153
+ // Wait for the caches to be synced before starting workers
154
+ klog .Info ("Waiting for informer caches to sync" )
155
+ if ok := cache .WaitForCacheSync (stopCh , cnc .nodesSynced ); ! ok {
156
+ klog .Errorf ("failed to wait for caches to sync" )
157
+ return
158
+ }
140
159
141
- // The following loops run communicate with the APIServer with a worst case complexity
160
+ // The periodic loop for updateNodeStatus communicates with the APIServer with a worst case complexity
142
161
// of O(num_nodes) per cycle. These functions are justified here because these events fire
143
162
// very infrequently. DO NOT MODIFY this to perform frequent operations.
163
+ go wait .Until (func () { cnc .UpdateNodeStatus (context .TODO ()) }, cnc .nodeStatusUpdateFrequency , stopCh )
164
+ go wait .Until (cnc .runWorker , time .Second , stopCh )
165
+
166
+ <- stopCh
167
+ }
168
+
169
+ // runWorker is a long-running function that will continually call the
170
+ // processNextWorkItem function in order to read and process a message on the
171
+ // workqueue.
172
+ func (cnc * CloudNodeController ) runWorker () {
173
+ for cnc .processNextWorkItem () {
174
+ }
175
+ }
176
+
177
+ // processNextWorkItem will read a single work item off the workqueue and
178
+ // attempt to process it, by calling the syncHandler.
179
+ func (cnc * CloudNodeController ) processNextWorkItem () bool {
180
+ obj , shutdown := cnc .workqueue .Get ()
181
+ if shutdown {
182
+ return false
183
+ }
184
+
185
+ // We wrap this block in a func so we can defer cnc.workqueue.Done.
186
+ err := func (obj interface {}) error {
187
+ defer cnc .workqueue .Done (obj )
188
+
189
+ var key string
190
+ var ok bool
191
+ if key , ok = obj .(string ); ! ok {
192
+ cnc .workqueue .Forget (obj )
193
+ utilruntime .HandleError (fmt .Errorf ("expected string in workqueue but got %#v" , obj ))
194
+ return nil
195
+ }
196
+
197
+ // Run the syncHandler, passing it the key of the
198
+ // Node resource to be synced.
199
+ if err := cnc .syncHandler (key ); err != nil {
200
+ // Put the item back on the workqueue to handle any transient errors.
201
+ cnc .workqueue .AddRateLimited (key )
202
+ return fmt .Errorf ("error syncing '%s': %s, requeuing" , key , err .Error ())
203
+ }
204
+
205
+ // Finally, if no error occurs we Forget this item so it does not
206
+ // get queued again until another change happens.
207
+ cnc .workqueue .Forget (obj )
208
+ return nil
209
+ }(obj )
210
+
211
+ if err != nil {
212
+ utilruntime .HandleError (err )
213
+ return true
214
+ }
215
+
216
+ return true
217
+ }
218
+
219
+ // syncHandler implements the logic of the controller.
220
+ func (cnc * CloudNodeController ) syncHandler (key string ) error {
221
+ _ , name , err := cache .SplitMetaNamespaceKey (key )
222
+ if err != nil {
223
+ utilruntime .HandleError (fmt .Errorf ("invalid resource key: %s" , key ))
224
+ return nil
225
+ }
144
226
145
- // Start a loop to periodically update the node addresses obtained from the cloud
146
- wait .Until (func () { cnc .UpdateNodeStatus (context .TODO ()) }, cnc .nodeStatusUpdateFrequency , stopCh )
227
+ return cnc .syncNode (context .TODO (), name )
147
228
}
148
229
149
230
// UpdateNodeStatus updates the node status, such as node addresses
150
- func (cnc * CloudNodeController ) UpdateNodeStatus (ctx context.Context ) {
231
+ func (cnc * CloudNodeController ) UpdateNodeStatus (ctx context.Context ) error {
151
232
nodes , err := cnc .kubeClient .CoreV1 ().Nodes ().List (context .TODO (), metav1.ListOptions {ResourceVersion : "0" })
152
233
if err != nil {
153
234
klog .Errorf ("Error monitoring node status: %v" , err )
154
- return
235
+ return err
155
236
}
156
237
157
238
for i := range nodes .Items {
@@ -169,6 +250,20 @@ func (cnc *CloudNodeController) UpdateNodeStatus(ctx context.Context) {
169
250
klog .Errorf ("Error reconciling node labels for node %q, err: %v" , node .Name , err )
170
251
}
171
252
}
253
+
254
+ return nil
255
+ }
256
+
257
+ // enqueueNode takes a Node resource and converts it into a key
258
+ // string which is then put onto the work queue.
259
+ func (cnc * CloudNodeController ) enqueueNode (obj interface {}) {
260
+ var key string
261
+ var err error
262
+ if key , err = cache .MetaNamespaceKeyFunc (obj ); err != nil {
263
+ utilruntime .HandleError (err )
264
+ return
265
+ }
266
+ cnc .workqueue .Add (key )
172
267
}
173
268
174
269
// reconcileNodeLabels reconciles node labels transitioning from beta to GA
@@ -273,122 +368,98 @@ func (cnc *CloudNodeController) updateNodeAddress(ctx context.Context, node *v1.
273
368
// in a retry-if-conflict loop.
274
369
type nodeModifier func (* v1.Node )
275
370
276
- func (cnc * CloudNodeController ) UpdateCloudNode (ctx context.Context , _ , newObj interface {}) {
277
- node , ok := newObj .(* v1.Node )
278
- if ! ok {
279
- utilruntime .HandleError (fmt .Errorf ("unexpected object type: %v" , newObj ))
280
- return
281
- }
282
-
283
- cloudTaint := getCloudTaint (node .Spec .Taints )
284
- if cloudTaint == nil {
285
- // The node has already been initialized so nothing to do.
286
- return
287
- }
288
-
289
- cnc .initializeNode (ctx , node )
290
- }
291
-
292
- // AddCloudNode handles initializing new nodes registered with the cloud taint.
293
- func (cnc * CloudNodeController ) AddCloudNode (ctx context.Context , obj interface {}) {
294
- node := obj .(* v1.Node )
295
-
296
- cloudTaint := getCloudTaint (node .Spec .Taints )
297
- if cloudTaint == nil {
298
- klog .V (2 ).Infof ("This node %s is registered without the cloud taint. Will not process." , node .Name )
299
- return
300
- }
301
-
302
- cnc .initializeNode (ctx , node )
303
- }
304
-
305
- // This processes nodes that were added into the cluster, and cloud initialize them if appropriate
306
- func (cnc * CloudNodeController ) initializeNode (ctx context.Context , node * v1.Node ) {
307
- klog .Infof ("Initializing node %s with cloud provider" , node .Name )
308
-
309
- err := clientretry .RetryOnConflict (UpdateNodeSpecBackoff , func () error {
310
- // TODO(wlan0): Move this logic to the route controller using the node taint instead of condition
311
- // Since there are node taints, do we still need this?
312
- // This condition marks the node as unusable until routes are initialized in the cloud provider
313
- if cnc .cloud .ProviderName () == "gce" {
314
- if err := cloudnodeutil .SetNodeCondition (cnc .kubeClient , types .NodeName (node .Name ), v1.NodeCondition {
315
- Type : v1 .NodeNetworkUnavailable ,
316
- Status : v1 .ConditionTrue ,
317
- Reason : "NoRouteCreated" ,
318
- Message : "Node created without a route" ,
319
- LastTransitionTime : metav1 .Now (),
320
- }); err != nil {
321
- return err
322
- }
323
- }
324
- return nil
325
- })
371
+ // syncNode handles updating existing nodes registered with the cloud taint
372
+ // and processes nodes that were added into the cluster, initializing them with the cloud provider if appropriate.
373
+ func (cnc * CloudNodeController ) syncNode (ctx context.Context , nodeName string ) error {
374
+ curNode , err := cnc .nodeInformer .Lister ().Get (nodeName )
326
375
if err != nil {
327
- utilruntime . HandleError (err )
328
- return
329
- }
376
+ if apierrors . IsNotFound (err ) {
377
+ return nil
378
+ }
330
379
331
- curNode , err := cnc .kubeClient .CoreV1 ().Nodes ().Get (context .TODO (), node .Name , metav1.GetOptions {})
332
- if err != nil {
333
- utilruntime .HandleError (fmt .Errorf ("failed to get node %s: %v" , node .Name , err ))
334
- return
380
+ return err
335
381
}
336
382
337
383
cloudTaint := getCloudTaint (curNode .Spec .Taints )
338
384
if cloudTaint == nil {
339
385
// Node object received from event had the cloud taint but was outdated,
340
- // the node has actually already been initialized.
341
- return
386
+ // the node has actually already been initialized, so this sync event can be ignored.
387
+ return nil
342
388
}
343
389
344
- providerID , err := cnc .getProviderID (ctx , curNode )
390
+ klog .Infof ("Initializing node %s with cloud provider" , nodeName )
391
+
392
+ copyNode := curNode .DeepCopy ()
393
+ providerID , err := cnc .getProviderID (ctx , copyNode )
345
394
if err != nil {
346
- utilruntime .HandleError (fmt .Errorf ("failed to get provider ID for node %s at cloudprovider: %v" , node .Name , err ))
347
- return
395
+ return fmt .Errorf ("failed to get provider ID for node %s at cloudprovider: %v" , nodeName , err )
348
396
}
349
397
350
- instanceMetadata , err := cnc .getInstanceMetadata (ctx , providerID , curNode )
398
+ instanceMetadata , err := cnc .getInstanceMetadata (ctx , providerID , copyNode )
351
399
if err != nil {
352
- utilruntime .HandleError (fmt .Errorf ("failed to get instance metadata for node %s: %v" , node .Name , err ))
353
- return
400
+ return fmt .Errorf ("failed to get instance metadata for node %s: %v" , nodeName , err )
354
401
}
355
402
356
- nodeModifiers , err := cnc .getNodeModifiersFromCloudProvider (ctx , providerID , curNode , instanceMetadata )
403
+ nodeModifiers , err := cnc .getNodeModifiersFromCloudProvider (ctx , providerID , copyNode , instanceMetadata )
357
404
if err != nil {
358
- utilruntime .HandleError (fmt .Errorf ("failed to initialize node %s at cloudprovider: %v" , node .Name , err ))
359
- return
405
+ return fmt .Errorf ("failed to get node modifiers from cloud provider: %v" , err )
360
406
}
361
407
362
408
nodeModifiers = append (nodeModifiers , func (n * v1.Node ) {
363
409
n .Spec .Taints = excludeCloudTaint (n .Spec .Taints )
364
410
})
365
411
366
412
err = clientretry .RetryOnConflict (UpdateNodeSpecBackoff , func () error {
367
- curNode , err := cnc .kubeClient .CoreV1 ().Nodes ().Get (context .TODO (), node .Name , metav1.GetOptions {})
368
- if err != nil {
369
- return err
413
+ var curNode * v1.Node
414
+ if cnc .cloud .ProviderName () == "gce" {
415
+ // TODO(wlan0): Move this logic to the route controller using the node taint instead of condition
416
+ // Since there are node taints, do we still need this?
417
+ // This condition marks the node as unusable until routes are initialized in the cloud provider
418
+ if err := cloudnodeutil .SetNodeCondition (cnc .kubeClient , types .NodeName (nodeName ), v1.NodeCondition {
419
+ Type : v1 .NodeNetworkUnavailable ,
420
+ Status : v1 .ConditionTrue ,
421
+ Reason : "NoRouteCreated" ,
422
+ Message : "Node created without a route" ,
423
+ LastTransitionTime : metav1 .Now (),
424
+ }); err != nil {
425
+ return err
426
+ }
427
+
428
+ // fetch latest node from API server since GCE-specific condition was set and informer cache may be stale
429
+ curNode , err = cnc .kubeClient .CoreV1 ().Nodes ().Get (context .TODO (), nodeName , metav1.GetOptions {})
430
+ if err != nil {
431
+ return err
432
+ }
433
+ } else {
434
+ curNode , err = cnc .nodeInformer .Lister ().Get (nodeName )
435
+ if err != nil {
436
+ return err
437
+ }
370
438
}
371
439
440
+ newNode := curNode .DeepCopy ()
372
441
for _ , modify := range nodeModifiers {
373
- modify (curNode )
442
+ modify (newNode )
374
443
}
375
444
376
- _ , err = cnc .kubeClient .CoreV1 ().Nodes ().Update (context .TODO (), curNode , metav1.UpdateOptions {})
445
+ _ , err = cnc .kubeClient .CoreV1 ().Nodes ().Update (context .TODO (), newNode , metav1.UpdateOptions {})
377
446
if err != nil {
378
447
return err
379
448
}
380
449
381
450
// After adding, call UpdateNodeAddress to set the CloudProvider provided IPAddresses
382
451
// So that users do not see any significant delay in IP addresses being filled into the node
383
- cnc .updateNodeAddress (ctx , curNode , instanceMetadata )
452
+ cnc .updateNodeAddress (ctx , newNode , instanceMetadata )
384
453
385
- klog .Infof ("Successfully initialized node %s with cloud provider" , node . Name )
454
+ klog .Infof ("Successfully initialized node %s with cloud provider" , nodeName )
386
455
return nil
387
456
})
388
457
if err != nil {
389
- utilruntime .HandleError (err )
390
- return
458
+ return err
391
459
}
460
+
461
+ cnc .recorder .Event (copyNode , v1 .EventTypeNormal , "Synced" , "Node synced successfully" )
462
+ return nil
392
463
}
393
464
394
465
// getNodeModifiersFromCloudProvider returns a slice of nodeModifiers that update
0 commit comments