@@ -54,19 +54,30 @@ const (
 	healthCheckPollInterval       = 10 * time.Second
 	healthCheckRequestTimeout     = 5 * time.Second
 	healthCheckUnhealthyThreshold = 10
+	initialCacheSyncTimeout       = 5 * time.Minute
 	clusterCacheControllerName    = "cluster-cache-tracker"
 )
 
+// ErrClusterLocked is returned in methods that require cluster-level locking
+// if the cluster is already locked by another concurrent call.
+var ErrClusterLocked = errors.New("cluster is locked already")
+
 // ClusterCacheTracker manages client caches for workload clusters.
 type ClusterCacheTracker struct {
 	log                   logr.Logger
 	clientUncachedObjects []client.Object
 	client                client.Client
 	scheme                *runtime.Scheme
 
-	lock             sync.RWMutex
+	// clusterAccessorsLock is used to synchronize access to the clusterAccessors map.
+	clusterAccessorsLock sync.RWMutex
+	// clusterAccessors is the map of clusterAccessors by cluster.
 	clusterAccessors map[client.ObjectKey]*clusterAccessor
-	indexes          []Index
+	// clusterLock is a per-cluster lock used whenever we're locking for a specific cluster.
+	// E.g. for actions like creating a client or adding watches.
+	clusterLock *keyedMutex
+
+	indexes []Index
 
 	// controllerPodMetadata is the Pod metadata of the controller using this ClusterCacheTracker.
 	// This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set.
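The `keyedMutex` behind the new `clusterLock` field is not part of this diff. As a rough sketch, a minimal implementation consistent with the `newKeyedMutex()`, `TryLock`, and `Unlock` calls used below could look like this (the actual type in the repository may differ):

```go
// Hypothetical sketch of keyedMutex: a non-blocking, per-key lock.
// Assumes the package already imports "sync" and
// "sigs.k8s.io/controller-runtime/pkg/client".
type keyedMutex struct {
	locksMtx sync.Mutex
	locks    map[client.ObjectKey]struct{}
}

func newKeyedMutex() *keyedMutex {
	return &keyedMutex{locks: make(map[client.ObjectKey]struct{})}
}

// TryLock acquires the lock for the key if it is not already held
// and reports whether it succeeded. It never blocks.
func (k *keyedMutex) TryLock(key client.ObjectKey) bool {
	k.locksMtx.Lock()
	defer k.locksMtx.Unlock()

	if _, held := k.locks[key]; held {
		return false
	}
	k.locks[key] = struct{}{}
	return true
}

// Unlock releases the lock for the key.
func (k *keyedMutex) Unlock(key client.ObjectKey) {
	k.locksMtx.Lock()
	defer k.locksMtx.Unlock()

	delete(k.locks, key)
}
```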
@@ -129,16 +140,14 @@ func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOpt
 		client:           manager.GetClient(),
 		scheme:           manager.GetScheme(),
 		clusterAccessors: make(map[client.ObjectKey]*clusterAccessor),
+		clusterLock:      newKeyedMutex(),
 		indexes:          options.Indexes,
 	}, nil
 }
 
 // GetClient returns a cached client for the given cluster.
 func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) {
-	t.lock.Lock()
-	defer t.lock.Unlock()
-
-	accessor, err := t.getClusterAccessorLH(ctx, cluster, t.indexes...)
+	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
 	if err != nil {
 		return nil, err
 	}
@@ -148,10 +157,7 @@ func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.Obje
 
 // GetRESTConfig returns a cached REST config for the given cluster.
 func (t *ClusterCacheTracker) GetRESTConfig(ctc context.Context, cluster client.ObjectKey) (*rest.Config, error) {
-	t.lock.Lock()
-	defer t.lock.Unlock()
-
-	accessor, err := t.getClusterAccessorLH(ctc, cluster, t.indexes...)
+	accessor, err := t.getClusterAccessor(ctc, cluster, t.indexes...)
 	if err != nil {
 		return nil, err
 	}
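Both getters now surface `ErrClusterLocked` when another goroutine is concurrently initializing the accessor for the same cluster. A hypothetical caller (the reconciler shape, `r.Tracker` field, and log message are illustrative, not part of this diff) would treat this as transient and requeue:

```go
// Hypothetical reconciler snippet: ErrClusterLocked is transient,
// so requeue instead of failing the reconcile.
remoteClient, err := r.Tracker.GetClient(ctx, client.ObjectKeyFromObject(cluster))
if err != nil {
	if errors.Is(err, remote.ErrClusterLocked) {
		log.V(5).Info("Requeueing because another worker has the lock on the ClusterCacheTracker")
		return ctrl.Result{Requeue: true}, nil
	}
	return ctrl.Result{}, err
}
```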
@@ -169,29 +175,68 @@ type clusterAccessor struct {
 
 // clusterAccessorExists returns true if a clusterAccessor exists for cluster.
 func (t *ClusterCacheTracker) clusterAccessorExists(cluster client.ObjectKey) bool {
-	t.lock.RLock()
-	defer t.lock.RUnlock()
+	t.clusterAccessorsLock.RLock()
+	defer t.clusterAccessorsLock.RUnlock()
 
 	_, exists := t.clusterAccessors[cluster]
 	return exists
 }
 
-// getClusterAccessorLH first tries to return an already-created clusterAccessor for cluster, falling back to creating a
-// new clusterAccessor if needed. Note, this method requires t.lock to already be held (LH=lock held).
-func (t *ClusterCacheTracker) getClusterAccessorLH(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
-	a := t.clusterAccessors[cluster]
-	if a != nil {
-		return a, nil
+// loadAccessor loads a clusterAccessor.
+func (t *ClusterCacheTracker) loadAccessor(cluster client.ObjectKey) (*clusterAccessor, bool) {
+	t.clusterAccessorsLock.RLock()
+	defer t.clusterAccessorsLock.RUnlock()
+
+	accessor, ok := t.clusterAccessors[cluster]
+	return accessor, ok
+}
+
+// storeAccessor stores a clusterAccessor.
+func (t *ClusterCacheTracker) storeAccessor(cluster client.ObjectKey, accessor *clusterAccessor) {
+	t.clusterAccessorsLock.Lock()
+	defer t.clusterAccessorsLock.Unlock()
+
+	t.clusterAccessors[cluster] = accessor
+}
+
+// getClusterAccessor returns a clusterAccessor for cluster.
+// It first tries to return an already-created clusterAccessor.
+// It then falls back to creating a new clusterAccessor if needed.
+// If another goroutine is already trying to create a clusterAccessor
+// for the same cluster, an error is returned.
+func (t *ClusterCacheTracker) getClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
+	log := ctrl.LoggerFrom(ctx, "cluster", klog.KRef(cluster.Namespace, cluster.Name))
+
+	// If the clusterAccessor already exists, return early.
+	if accessor, ok := t.loadAccessor(cluster); ok {
+		return accessor, nil
 	}
 
-	a, err := t.newClusterAccessor(ctx, cluster, indexes...)
-	if err != nil {
-		return nil, errors.Wrap(err, "error creating client and cache for remote cluster")
+	// The clusterAccessor doesn't exist yet, so we might have to initialize one.
+	// Lock on the cluster to ensure only one clusterAccessor is initialized
+	// for the cluster at the same time.
+	// Return an error if another goroutine is already trying to create a clusterAccessor.
+	if ok := t.clusterLock.TryLock(cluster); !ok {
+		return nil, errors.Wrapf(ErrClusterLocked, "failed to create cluster accessor: failed to get lock for cluster")
 	}
+	defer t.clusterLock.Unlock(cluster)
 
-	t.clusterAccessors[cluster] = a
+	// A different goroutine might have initialized the clusterAccessor for this
+	// cluster between our first check and acquiring the cluster lock. If so, return it.
+	if accessor, ok := t.loadAccessor(cluster); ok {
+		return accessor, nil
+	}
+
+	// We are the goroutine responsible for initializing the clusterAccessor.
+	log.V(4).Info("Creating new cluster accessor")
+	accessor, err := t.newClusterAccessor(ctx, cluster, indexes...)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to create cluster accessor")
+	}
 
-	return a, nil
+	log.V(4).Info("Storing new cluster accessor")
+	t.storeAccessor(cluster, accessor)
+	return accessor, nil
 }
 
 // newClusterAccessor creates a new clusterAccessor.
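`getClusterAccessor` is now a double-checked lock: a read-locked lookup, a non-blocking `TryLock` on the cluster key, and a second lookup before the expensive initialization. The non-blocking step is the behavioral change: a concurrent caller no longer queues behind a tracker-wide mutex while a slow accessor is created; it fails fast. A small sketch against the hypothetical `keyedMutex` above:

```go
// Sketch: under contention the loser of TryLock fails fast instead of
// blocking; this is what surfaces to callers as ErrClusterLocked.
func tryLockContention() {
	km := newKeyedMutex()
	key := client.ObjectKey{Namespace: "default", Name: "cluster-1"}

	first := km.TryLock(key)  // true: the key was unlocked
	second := km.TryLock(key) // false: the key is already held
	fmt.Println(first, second)

	km.Unlock(key)
}
```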
@@ -265,7 +310,12 @@ func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster cl
 
 	// Start the cache!!!
 	go cache.Start(cacheCtx) //nolint:errcheck
-	if !cache.WaitForCacheSync(cacheCtx) {
+
+	// Wait until the cache is initially synced
+	cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, initialCacheSyncTimeout)
+	defer cacheSyncCtxCancel()
+	if !cache.WaitForCacheSync(cacheSyncCtx) {
+		cache.Stop()
 		return nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
 	}
 
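Previously the sync wait used `cacheCtx`, which is only cancelled when the accessor is deleted, so an unreachable workload cluster could block here indefinitely. The new code bounds the wait at `initialCacheSyncTimeout` and stops the cache on timeout so its goroutines don't leak. A distilled sketch of the pattern, with an illustrative `syncable` interface standing in for the concrete cache type:

```go
// Sketch of the bounded-wait-and-cleanup pattern used above.
type syncable interface {
	WaitForCacheSync(ctx context.Context) bool
	Stop()
}

func waitForInitialSync(ctx context.Context, c syncable, timeout time.Duration) error {
	syncCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	if !c.WaitForCacheSync(syncCtx) {
		// Stop the cache so its goroutines and informers don't keep
		// running after we give up on it.
		c.Stop()
		return fmt.Errorf("timed out waiting for cache sync: %w", syncCtx.Err())
	}
	return nil
}
```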
@@ -337,8 +387,8 @@ func (t *ClusterCacheTracker) createClient(config *rest.Config, cluster client.O
 
 // deleteAccessor stops a clusterAccessor's cache and removes the clusterAccessor from the tracker.
 func (t *ClusterCacheTracker) deleteAccessor(_ context.Context, cluster client.ObjectKey) {
-	t.lock.Lock()
-	defer t.lock.Unlock()
+	t.clusterAccessorsLock.Lock()
+	defer t.clusterAccessorsLock.Unlock()
 
 	a, exists := t.clusterAccessors[cluster]
 	if !exists {
@@ -387,25 +437,30 @@ func (t *ClusterCacheTracker) Watch(ctx context.Context, input WatchInput) error
 		return errors.New("input.Name is required")
 	}
 
-	t.lock.Lock()
-	defer t.lock.Unlock()
-
-	a, err := t.getClusterAccessorLH(ctx, input.Cluster, t.indexes...)
+	accessor, err := t.getClusterAccessor(ctx, input.Cluster, t.indexes...)
 	if err != nil {
-		return err
+		return errors.Wrapf(err, "failed to add %s watch on cluster %s", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
+	}
+
+	// We have to lock the cluster so that the watch is not created multiple times in parallel.
+	ok := t.clusterLock.TryLock(input.Cluster)
+	if !ok {
+		return errors.Wrapf(ErrClusterLocked, "failed to add %s watch on cluster %s: failed to get lock for cluster", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
 	}
+	defer t.clusterLock.Unlock(input.Cluster)
 
-	if a.watches.Has(input.Name) {
-		t.log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
+	if accessor.watches.Has(input.Name) {
+		log := ctrl.LoggerFrom(ctx)
+		log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
 		return nil
 	}
 
 	// Need to create the watch
-	if err := input.Watcher.Watch(source.NewKindWithCache(input.Kind, a.cache), input.EventHandler, input.Predicates...); err != nil {
-		return errors.Wrap(err, "error creating watch")
+	if err := input.Watcher.Watch(source.NewKindWithCache(input.Kind, accessor.cache), input.EventHandler, input.Predicates...); err != nil {
+		return errors.Wrapf(err, "failed to add %s watch on cluster %s: failed to create watch", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
 	}
 
-	a.watches.Insert(input.Name)
+	accessor.watches.Insert(input.Name)
 
 	return nil
 }
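`Watch` now has the same contract as the getters: it can return `ErrClusterLocked` and is safe to retry. A hypothetical registration of a watch on workload-cluster Nodes (the input values, `r.controller`, and the mapping function are illustrative, not taken from this diff):

```go
// Hypothetical caller: add a named watch through the tracker and
// treat ErrClusterLocked as a signal to requeue.
err := r.Tracker.Watch(ctx, remote.WatchInput{
	Name:         "machine-watchNodes",
	Cluster:      client.ObjectKeyFromObject(cluster),
	Watcher:      r.controller,
	Kind:         &corev1.Node{},
	EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachine),
})
if errors.Is(err, remote.ErrClusterLocked) {
	return ctrl.Result{Requeue: true}, nil
}
if err != nil {
	return ctrl.Result{}, err
}
```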
@@ -472,7 +527,7 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
 			return false, nil
 		}
 
-		if !t.clusterAccessorExists(in.cluster) {
+		if _, ok := t.loadAccessor(in.cluster); !ok {
 			// Cache for this cluster has already been cleaned up.
 			// Nothing to do, so return true.
 			return true, nil
@@ -505,7 +560,7 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
 	// An error returned implies the health check has failed a sufficient number of
 	// times for the cluster to be considered unhealthy
 	// NB. we are ignoring ErrWaitTimeout because this error happens when the channel is close, that in this case
-	// happens when the cache is explicitly stopped.F
+	// happens when the cache is explicitly stopped.
 	if err != nil && err != wait.ErrWaitTimeout {
 		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
 		t.deleteAccessor(ctx, in.cluster)