@@ -654,14 +654,18 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
654654 }
655655
656656 err := wait .PollUntilContextCancel (ctx , in .interval , true , runHealthCheckWithThreshold )
657- // An error returned implies the health check has failed a sufficient number of
658- // times for the cluster to be considered unhealthy
659- // NB. we are ignoring ErrWaitTimeout because this error happens when the channel is closed, which in this case
660- // happens when the cache is explicitly stopped.
661- 	if err != nil && !wait.Interrupted(err) {
657+ // An error returned implies the health check has failed a sufficient number of times for the cluster
658+ // to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
659+ // cache context into wait.PollUntilContextCancel).
660+ // NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
661+ // when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
662+ if err != nil {
662663 		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
663- 		t.deleteAccessor(ctx, in.cluster)
664664 }
665+ // Ensure in any case that the accessor is deleted (even if it is a no-op).
666+ // NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
667+ // cluster is reachable again.
668+ 	t.deleteAccessor(ctx, in.cluster)
665669}
666670
667671// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
0 commit comments