Skip to content

Commit 6762aed

Browse files
authored
Merge pull request #9025 from sbueringer/pr-fix-cct-healthcheck
🌱 ClusterCacheTracker: fix accessor deletion on health check failure
2 parents 2cd922b + 1505143 commit 6762aed

File tree

1 file changed

+10
-6
lines changed

1 file changed

+10
-6
lines changed

controllers/remote/cluster_cache_tracker.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -654,14 +654,18 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
654654
}
655655

656656
err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold)
657-
// An error returned implies the health check has failed a sufficient number of
658-
// times for the cluster to be considered unhealthy
659-
// NB. we are ignoring ErrWaitTimeout because this error happens when the channel is close, that in this case
660-
// happens when the cache is explicitly stopped.
661-
if err != nil && !wait.Interrupted(err) {
657+
// An error returned implies the health check has failed a sufficient number of times for the cluster
658+
// to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
659+
// cache context into wait.PollUntilContextCancel).
660+
// NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
661+
// when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
662+
if err != nil {
662663
t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
663-
t.deleteAccessor(ctx, in.cluster)
664664
}
665+
// Ensure in any case that the accessor is deleted (even if it is a no-op).
666+
// NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
667+
// cluster is reachable again
668+
t.deleteAccessor(ctx, in.cluster)
665669
}
666670

667671
// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.

0 commit comments

Comments
 (0)