diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 62994ca22..ffda2bf1b 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -327,6 +327,14 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status failingCondition.Reason = failingReason failingCondition.Message = failingMessage } + if failure != nil && + strings.HasPrefix(progressReason, slowCOUpdatePrefix) { + failingCondition.Status = configv1.ConditionUnknown + failingCondition.Reason = "SlowClusterOperator" + failingCondition.Message = progressMessage + } + progressReason = strings.TrimPrefix(progressReason, slowCOUpdatePrefix) + resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition) // update progressing @@ -537,6 +545,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus, } } +const slowCOUpdatePrefix = "Slow::" + // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as // still making internal progress. The general error we try to suppress is an operator or operators still being // progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines @@ -555,28 +565,67 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin case payload.UpdateEffectReport: return uErr.Reason, uErr.Error(), false case payload.UpdateEffectNone: - return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true + return convertErrorToProgressingForUpdateEffectNone(uErr, now) case payload.UpdateEffectFail: return "", "", false case payload.UpdateEffectFailAfterInterval: - var exceeded []string - threshold := now.Add(-(40 * time.Minute)) - names := uErr.Names - if len(names) == 0 { - names = []string{uErr.Name} - } - for _, name := range names { - if payload.COUpdateStartTimesGet(name).Before(threshold) { + return convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr, now) + } + return "", "", false +} + +func convertErrorToProgressingForUpdateEffectNone(uErr *payload.UpdateError, now time.Time) (string, string, bool) { + var exceeded []string + names := uErr.Names + if len(names) == 0 { + names = []string{uErr.Name} + } + var machineConfig bool + for _, name := range names { + m := 30 * time.Minute + // It takes longer to upgrade MCO + if name == "machine-config" { + m = 3 * m + } + t := payload.COUpdateStartTimesGet(name) + if (!t.IsZero()) && t.Before(now.Add(-(m))) { + if name == "machine-config" { + machineConfig = true + } else { exceeded = append(exceeded, name) } } - if len(exceeded) > 0 { - return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false - } else { - return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true + } + // returns true in those slow cases because it is still only a suspicion + if len(exceeded) > 0 && !machineConfig { + return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true + } + if len(exceeded) > 0 && machineConfig { + return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true + } + if len(exceeded) == 0 && machineConfig { + return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true + } + return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true +} + +func convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr *payload.UpdateError, now time.Time) (string, string, bool) { + var exceeded []string + threshold := now.Add(-(40 * time.Minute)) + names := uErr.Names + if len(names) == 0 { + names = []string{uErr.Name} + } + for _, name := range names { + if payload.COUpdateStartTimesGet(name).Before(threshold) { + exceeded = append(exceeded, name) } } - return "", "", false + if len(exceeded) > 0 { + return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false + } else { + return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true + } } // syncFailingStatus handles generic errors in the cluster version. It tries to preserve diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index c2aae33af..e70557556 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -5,6 +5,7 @@ import ( "fmt" "reflect" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -358,12 +359,27 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t type args struct { syncWorkerStatus *SyncWorkerStatus } + payload.COUpdateStartTimesEnsure("co-not-timeout") + payload.COUpdateStartTimesEnsure("co-bar-not-timeout") + payload.COUpdateStartTimesEnsure("co-foo-not-timeout") + payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute)) + payload.COUpdateStartTimesAt("co-bar-timeout", time.Now().Add(-60*time.Minute)) + defer func() { + payload.COUpdateStartTimesRemove("co-not-timeout") + payload.COUpdateStartTimesRemove("co-bar-not-timeout") + payload.COUpdateStartTimesRemove("co-foo-not-timeout") + payload.COUpdateStartTimesRemove("co-timeout") + payload.COUpdateStartTimesRemove("co-bar-timeout") + }() + tests := []struct { name string args args shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool expectedConditionNotModified *configv1.ClusterOperatorStatusCondition expectedConditionModified *configv1.ClusterOperatorStatusCondition + machineConfigTimeout bool + expectedProgressingCondition *configv1.ClusterOperatorStatusCondition }{ { name: "no errors are present", @@ -394,10 +410,12 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t name: "single UpdateEffectNone error", args: args{ syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, Failure: &payload.UpdateError{ UpdateEffect: payload.UpdateEffectNone, Reason: "ClusterOperatorUpdating", Message: "Cluster operator A is updating", + Name: "co-not-timeout", }, }, }, @@ -412,6 +430,110 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Type: ClusterStatusFailing, Status: configv1.ConditionFalse, }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on co-not-timeout", + }, + }, + { + name: "single UpdateEffectNone error and timeout", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + Name: "co-timeout", + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout over 30 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected", + }, + }, + { + name: "single UpdateEffectNone error and machine-config", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + Name: "machine-config", + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionFalse, + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on machine-config", + }, + }, + { + name: "single UpdateEffectNone error and machine-config timeout", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + Name: "machine-config", + }, + }, + }, + machineConfigTimeout: true, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on machine-config over 90 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on machine-config over 90 minutes which is longer than expected", + }, }, { name: "single condensed UpdateEffectFail UpdateError", @@ -587,6 +709,137 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator C is degraded", }, }, + { + name: "MultipleErrors: all updating and none slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-not-timeout", "co-bar-not-timeout", "co-foo-not-timeout"}, + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionFalse, + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-not-timeout, co-bar-not-timeout, co-foo-not-timeout", + }, + }, + { + name: "MultipleErrors: all updating and one slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-timeout", "co-bar-not-timeout", "co-foo-not-timeout"}, + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout over 30 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected", + }, + }, + { + name: "MultipleErrors: all updating and some slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-timeout", "co-bar-timeout", "co-foo-not-timeout"}, + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected", + }, + }, + { + name: "MultipleErrors: all updating and all slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-timeout", "co-bar-timeout", "machine-config"}, + }, + }, + }, + machineConfigTimeout: true, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected", + }, + }, } for _, tc := range tests { tc := tc @@ -606,6 +859,11 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t {true, false}, {true, true}, } + if tc.machineConfigTimeout { + payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-120*time.Minute)) + } else { + payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute)) + } for _, c := range combinations { tc.args.syncWorkerStatus.Reconciling = c.isReconciling cvStatus := &configv1.ClusterVersionStatus{} @@ -621,7 +879,15 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" { t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff) } + + if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty { + progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing) + if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" { + t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff) + } + } } + payload.COUpdateStartTimesRemove("machine-config") }) } } diff --git a/pkg/payload/task.go b/pkg/payload/task.go index 1ac592a56..a260573af 100644 --- a/pkg/payload/task.go +++ b/pkg/payload/task.go @@ -47,9 +47,15 @@ func InitCOUpdateStartTimes() { // COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to // current time if name does not already exist in map. func COUpdateStartTimesEnsure(name string) { + COUpdateStartTimesAt(name, time.Now()) +} + +// COUpdateStartTimesAt adds name to clusterOperatorUpdateStartTimes map and sets to +// t if name does not already exist in map. +func COUpdateStartTimesAt(name string, t time.Time) { clusterOperatorUpdateStartTimes.lock.Lock() if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok { - clusterOperatorUpdateStartTimes.m[name] = time.Now() + clusterOperatorUpdateStartTimes.m[name] = t } clusterOperatorUpdateStartTimes.lock.Unlock() }