Skip to content

Commit d2b092a

Browse files
committed
pkg/cvo/availableupdates: Preserve update advice on update-service failures
RemoteFailed, ResponseFailed, and ResponseInvalid reasons are all server-side issues. This commit makes clusters more resilient to OpenShift Update Service (OSUS) issues by preserving the cache of previously-retrieved advice for up to 24 hours, while we wait for OSUS to recover (or proxies or other network configuration between the cluster and its OSUS to be fixed). OSUS advice does not change often, and the only risk of acting on stale advice is that you might not hear about recently-declared Conditional Update risks [1]. The RetrievedUpdates condition should be displayed in the update-selection user interfaces, and the CannotRetrieveUpdates alert will be firing, so cluster administrators will be aware of the risk of stale data, and can decide whether to wait for OSUS to recover, or to initiate an update based on the stale information (which they can supplement with additional checks like [any new risks declared in graph-data recently? [2]). At the moment, restarting the cluster-version operator will also clear the cache. We could reload it from ClusterVersion status, but I'm deferring that for future work. [1]: https://docs.openshift.com/container-platform/4.17/updating/understanding_updates/understanding-update-channels-release.html#conditional-updates-overview_understanding-update-channels-releases [2]: https://github.com/openshift/cincinnati-graph-data/commits/master/blocked-edges
1 parent adf0cf5 commit d2b092a

File tree

2 files changed

+29
-10
lines changed

2 files changed

+29
-10
lines changed

pkg/cvo/availableupdates.go

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
5757
// updates are only checked at most once per minimumUpdateCheckInterval or if the generation changes
5858
optrAvailableUpdates := optr.getAvailableUpdates()
5959
needFreshFetch := true
60+
preserveCacheOnFailure := false
61+
maximumCacheInterval := 24 * time.Hour
6062
if optrAvailableUpdates == nil {
6163
klog.V(2).Info("First attempt to retrieve available updates")
6264
optrAvailableUpdates = &availableUpdates{}
@@ -66,14 +68,18 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
6668
for i := range config.Status.ConditionalUpdates {
6769
optrAvailableUpdates.ConditionalUpdates = append(optrAvailableUpdates.ConditionalUpdates, *config.Status.ConditionalUpdates[i].DeepCopy())
6870
}
69-
} else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) {
70-
klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
7171
} else if channel != optrAvailableUpdates.Channel {
7272
klog.V(2).Infof("Retrieving available updates again, because the channel has changed from %q to %q", optrAvailableUpdates.Channel, channel)
7373
} else if desiredArch != optrAvailableUpdates.Architecture {
7474
klog.V(2).Infof("Retrieving available updates again, because the architecture has changed from %q to %q", optrAvailableUpdates.Architecture, desiredArch)
75+
} else if !optrAvailableUpdates.RecentlyChanged(maximumCacheInterval) {
76+
klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since last change at %s. Will clear the cache if this fails.", maximumCacheInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
77+
} else if !optrAvailableUpdates.RecentlyAttempted(optr.minimumUpdateCheckInterval) {
78+
klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since last attempt at %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
79+
preserveCacheOnFailure = true
7580
} else if updateService == optrAvailableUpdates.UpdateService || (updateService == defaultUpdateService && optrAvailableUpdates.UpdateService == "") {
7681
needsConditionalUpdateEval := false
82+
preserveCacheOnFailure = true
7783
for _, conditionalUpdate := range optrAvailableUpdates.ConditionalUpdates {
7884
if recommended := findRecommendedCondition(conditionalUpdate.Conditions); recommended == nil {
7985
needsConditionalUpdateEval = true
@@ -126,11 +132,19 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
126132
optrAvailableUpdates.UpdateService = updateService
127133
optrAvailableUpdates.Channel = channel
128134
optrAvailableUpdates.Architecture = desiredArch
129-
optrAvailableUpdates.Current = current
130-
optrAvailableUpdates.Updates = updates
131-
optrAvailableUpdates.ConditionalUpdates = conditionalUpdates
132135
optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry
133136
optrAvailableUpdates.Condition = condition
137+
138+
responseFailed := (condition.Type == configv1.RetrievedUpdates &&
139+
condition.Status == configv1.ConditionFalse &&
140+
(condition.Reason == "RemoteFailed" ||
141+
condition.Reason == "ResponseFailed" ||
142+
condition.Reason == "ResponseInvalid"))
143+
if !responseFailed || (responseFailed && !preserveCacheOnFailure) {
144+
optrAvailableUpdates.Current = current
145+
optrAvailableUpdates.Updates = updates
146+
optrAvailableUpdates.ConditionalUpdates = conditionalUpdates
147+
}
134148
}
135149

136150
optrAvailableUpdates.evaluateConditionalUpdates(ctx)
@@ -183,10 +197,14 @@ type availableUpdates struct {
183197
Condition configv1.ClusterOperatorStatusCondition
184198
}
185199

186-
func (u *availableUpdates) RecentlyChanged(interval time.Duration) bool {
200+
func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
187201
return u.LastAttempt.After(time.Now().Add(-interval))
188202
}
189203

204+
func (u *availableUpdates) RecentlyChanged(interval time.Duration) bool {
205+
return u.LastSyncOrConfigChange.After(time.Now().Add(-interval))
206+
}
207+
190208
func (u *availableUpdates) NeedsUpdate(original *configv1.ClusterVersion) *configv1.ClusterVersion {
191209
if u == nil {
192210
return nil

pkg/cvo/cvo_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2675,17 +2675,18 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
26752675
},
26762676
},
26772677
{
2678-
name: "if last check time was too recent, do nothing",
2678+
name: "if last successful check time was too recent, do nothing",
26792679
handler: func(w http.ResponseWriter, req *http.Request) {
26802680
http.Error(w, "bad things", http.StatusInternalServerError)
26812681
},
26822682
optr: &Operator{
26832683
updateService: "http://localhost:8080/graph",
26842684
minimumUpdateCheckInterval: 1 * time.Minute,
26852685
availableUpdates: &availableUpdates{
2686-
UpdateService: "http://localhost:8080/graph",
2687-
Channel: "fast",
2688-
LastAttempt: time.Now(),
2686+
UpdateService: "http://localhost:8080/graph",
2687+
Channel: "fast",
2688+
LastAttempt: time.Now(),
2689+
LastSyncOrConfigChange: time.Now(),
26892690
},
26902691
release: configv1.Release{
26912692
Version: "4.0.1",

0 commit comments

Comments
 (0)