pkg/cvo/availableupdates: Preserve update advice on update-service failures

wking · wking · commit d2b092a4518b · 2024-10-30T13:49:38.000-07:00
RemoteFailed, ResponseFailed, and ResponseInvalid reasons are all server-side issues. This commit makes clusters more resilient to OpenShift Update Service (OSUS) issues by preserving the cache of previously-retrieved advice for up to 24 hours, while we wait for OSUS to recover (or for proxies or other network configuration between the cluster and its OSUS to be fixed). OSUS advice does not change often, and the only risk of acting on stale advice is that you might not hear about recently-declared Conditional Update risks [1]. The RetrievedUpdates condition should be displayed in the update-selection user interfaces, and the CannotRetrieveUpdates alert will be firing, so cluster administrators will be aware of the risk of stale data, and can decide whether to wait for OSUS to recover, or to initiate an update based on the stale information (which they can supplement with additional checks like "any new risks declared in graph-data recently?" [2]). At the moment, restarting the cluster-version operator will also clear the cache. We could reload it from ClusterVersion status, but I'm deferring that for future work. [1]: https://docs.openshift.com/container-platform/4.17/updating/understanding_updates/understanding-update-channels-release.html#conditional-updates-overview_understanding-update-channels-releases [2]: https://github.com/openshift/cincinnati-graph-data/commits/master/blocked-edges
diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go
@@ -57,6 +57,8 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 	// updates are only checked at most once per minimumUpdateCheckInterval or if the generation changes
 	optrAvailableUpdates := optr.getAvailableUpdates()
 	needFreshFetch := true
+	preserveCacheOnFailure := false
+	maximumCacheInterval := 24 * time.Hour
 	if optrAvailableUpdates == nil {
 		klog.V(2).Info("First attempt to retrieve available updates")
 		optrAvailableUpdates = &availableUpdates{}
@@ -66,14 +68,18 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 		for i := range config.Status.ConditionalUpdates {
 			optrAvailableUpdates.ConditionalUpdates = append(optrAvailableUpdates.ConditionalUpdates, *config.Status.ConditionalUpdates[i].DeepCopy())
 		}
-	} else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) {
-		klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
 	} else if channel != optrAvailableUpdates.Channel {
 		klog.V(2).Infof("Retrieving available updates again, because the channel has changed from %q to %q", optrAvailableUpdates.Channel, channel)
 	} else if desiredArch != optrAvailableUpdates.Architecture {
 		klog.V(2).Infof("Retrieving available updates again, because the architecture has changed from %q to %q", optrAvailableUpdates.Architecture, desiredArch)
+	} else if !optrAvailableUpdates.RecentlyChanged(maximumCacheInterval) {
+		klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since last change at %s.  Will clear the cache if this fails.", maximumCacheInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
+	} else if !optrAvailableUpdates.RecentlyAttempted(optr.minimumUpdateCheckInterval) {
+		klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since last attempt at %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
+		preserveCacheOnFailure = true
 	} else if updateService == optrAvailableUpdates.UpdateService || (updateService == defaultUpdateService && optrAvailableUpdates.UpdateService == "") {
 		needsConditionalUpdateEval := false
+		preserveCacheOnFailure = true
 		for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates {
 			if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil {
 				needsConditionalUpdateEval = true
@@ -126,11 +132,19 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 		optrAvailableUpdates.UpdateService = updateService
 		optrAvailableUpdates.Channel = channel
 		optrAvailableUpdates.Architecture = desiredArch
-		optrAvailableUpdates.Current = current
-		optrAvailableUpdates.Updates = updates
-		optrAvailableUpdates.ConditionalUpdates = conditionalUpdates
 		optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry
 		optrAvailableUpdates.Condition = condition
+
+		responseFailed := (condition.Type == configv1.RetrievedUpdates &&
+			condition.Status == configv1.ConditionFalse &&
+			(condition.Reason == "RemoteFailed" ||
+				condition.Reason == "ResponseFailed" ||
+				condition.Reason == "ResponseInvalid"))
+		if !responseFailed || (responseFailed && !preserveCacheOnFailure) {
+			optrAvailableUpdates.Current = current
+			optrAvailableUpdates.Updates = updates
+			optrAvailableUpdates.ConditionalUpdates = conditionalUpdates
+		}
 	}
 
 	optrAvailableUpdates.evaluateConditionalUpdates(ctx)
@@ -183,10 +197,14 @@ type availableUpdates struct {
 	Condition configv1.ClusterOperatorStatusCondition
 }
 
-func (u *availableUpdates) RecentlyChanged(interval time.Duration) bool {
+func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
 	return u.LastAttempt.After(time.Now().Add(-interval))
 }
 
+func (u *availableUpdates) RecentlyChanged(interval time.Duration) bool {
+	return u.LastSyncOrConfigChange.After(time.Now().Add(-interval))
+}
+
 func (u *availableUpdates) NeedsUpdate(original *configv1.ClusterVersion) *configv1.ClusterVersion {
 	if u == nil {
 		return nil
diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go
@@ -2675,17 +2675,18 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
 			},
 		},
 		{
-			name: "if last check time was too recent, do nothing",
+			name: "if last successful check time was too recent, do nothing",
 			handler: func(w http.ResponseWriter, req *http.Request) {
 				http.Error(w, "bad things", http.StatusInternalServerError)
 			},
 			optr: &Operator{
 				updateService:              "http://localhost:8080/graph",
 				minimumUpdateCheckInterval: 1 * time.Minute,
 				availableUpdates: &availableUpdates{
-					UpdateService: "http://localhost:8080/graph",
-					Channel:       "fast",
-					LastAttempt:   time.Now(),
+					UpdateService:          "http://localhost:8080/graph",
+					Channel:                "fast",
+					LastAttempt:            time.Now(),
+					LastSyncOrConfigChange: time.Now(),
 				},
 				release: configv1.Release{
 					Version: "4.0.1",