@@ -654,14 +654,18 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
654
654
}
655
655
656
656
err := wait .PollUntilContextCancel (ctx , in .interval , true , runHealthCheckWithThreshold )
657
- // An error returned implies the health check has failed a sufficient number of
658
- // times for the cluster to be considered unhealthy
659
- // NB. we are ignoring ErrWaitTimeout because this error happens when the channel is close, that in this case
660
- // happens when the cache is explicitly stopped.
661
- if err != nil && ! wait .Interrupted (err ) {
657
+ // An error returned implies the health check has failed a sufficient number of times for the cluster
658
+ // to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
659
+ // cache context into wait.PollUntilContextCancel).
660
+ // NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
661
+ // when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
662
+ if err != nil {
662
663
t .log .Error (err , "Error health checking cluster" , "Cluster" , klog .KRef (in .cluster .Namespace , in .cluster .Name ))
663
- t .deleteAccessor (ctx , in .cluster )
664
664
}
665
+ // Ensure in any case that the accessor is deleted (even if it is a no-op).
666
+ // NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
667
+ // cluster is reachable again.
668
+ t .deleteAccessor (ctx , in .cluster )
665
669
}
666
670
667
671
// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
0 commit comments