Skip to content

Commit ba7f4f7

Browse files
committed
OCPBUGS-23514: Failing=Unknown upon long CO updating
When it takes too long (90m+ for machine-config and 30m+ for others) to upgrade a cluster operator, clusterversion shows a message with the indication that the upgrade might hit some issue. This will cover the case in the related OCPBUGS-23538: for some reason, the pod under the deployment that manages the CO hit CrashLoopBackOff. Deployment controller does not give useful conditions in this situation [1]. Otherwise, checkDeploymentHealth [2] would detect it. Instead of CVO's figuring out the underlying pod's CrashLoopBackOff which might be better to be implemented by deployment controller, it is expected that our cluster admin starts to dig into the cluster when such a message pops up. In addition to the condition's message. We propagate Fail=Unknown to make it available for other automations, such as update-status command. [1]. kubernetes/kubernetes#106054 [2]. https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
1 parent ff3778c commit ba7f4f7

File tree

3 files changed

+316
-2
lines changed

3 files changed

+316
-2
lines changed

pkg/cvo/status.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,15 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
327327
failingCondition.Reason = failingReason
328328
failingCondition.Message = failingMessage
329329
}
330+
if failure != nil &&
331+
skipFailure &&
332+
(progressReason == "ClusterOperatorUpdating" || progressReason == "ClusterOperatorsUpdating") &&
333+
strings.HasPrefix(progressMessage, slowCOUpdatePrefix) {
334+
progressMessage = strings.TrimPrefix(progressMessage, slowCOUpdatePrefix)
335+
failingCondition.Status = configv1.ConditionUnknown
336+
failingCondition.Reason = "SlowClusterOperator"
337+
failingCondition.Message = progressMessage
338+
}
330339
resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
331340

332341
// update progressing
@@ -537,6 +546,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus,
537546
}
538547
}
539548

549+
const slowCOUpdatePrefix = "Slow::"
550+
540551
// convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
541552
// still making internal progress. The general error we try to suppress is an operator or operators still being
542553
// progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines
@@ -555,7 +566,38 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
555566
case payload.UpdateEffectReport:
556567
return uErr.Reason, uErr.Error(), false
557568
case payload.UpdateEffectNone:
558-
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
569+
var exceeded []string
570+
names := uErr.Names
571+
if len(names) == 0 {
572+
names = []string{uErr.Name}
573+
}
574+
var machineConfig bool
575+
for _, name := range names {
576+
m := 30 * time.Minute
577+
// It takes longer to upgrade MCO
578+
if name == "machine-config" {
579+
m = 3 * m
580+
}
581+
t := payload.COUpdateStartTimesGet(name)
582+
if (!t.IsZero()) && t.Before(now.Add(-(m))) {
583+
if name == "machine-config" {
584+
machineConfig = true
585+
} else {
586+
exceeded = append(exceeded, name)
587+
}
588+
}
589+
}
590+
// returns true in those slow cases because it is still only a suspicion
591+
if len(exceeded) > 0 && !machineConfig {
592+
return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
593+
}
594+
if len(exceeded) > 0 && machineConfig {
595+
return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
596+
}
597+
if len(exceeded) == 0 && machineConfig {
598+
return uErr.Reason, fmt.Sprintf("%swaiting on machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix), true
599+
}
600+
return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
559601
case payload.UpdateEffectFail:
560602
return "", "", false
561603
case payload.UpdateEffectFailAfterInterval:

pkg/cvo/status_test.go

Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"reflect"
77
"testing"
8+
"time"
89

910
"github.com/google/go-cmp/cmp"
1011
"github.com/google/go-cmp/cmp/cmpopts"
@@ -358,12 +359,27 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
358359
type args struct {
359360
syncWorkerStatus *SyncWorkerStatus
360361
}
362+
payload.COUpdateStartTimesEnsure("co-not-timeout")
363+
payload.COUpdateStartTimesEnsure("co-bar-not-timeout")
364+
payload.COUpdateStartTimesEnsure("co-foo-not-timeout")
365+
payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute))
366+
payload.COUpdateStartTimesAt("co-bar-timeout", time.Now().Add(-60*time.Minute))
367+
defer func() {
368+
payload.COUpdateStartTimesRemove("co-not-timeout")
369+
payload.COUpdateStartTimesRemove("co-bar-not-timeout")
370+
payload.COUpdateStartTimesRemove("co-foo-not-timeout")
371+
payload.COUpdateStartTimesRemove("co-timeout")
372+
payload.COUpdateStartTimesRemove("co-bar-timeout")
373+
}()
374+
361375
tests := []struct {
362376
name string
363377
args args
364378
shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
365379
expectedConditionNotModified *configv1.ClusterOperatorStatusCondition
366380
expectedConditionModified *configv1.ClusterOperatorStatusCondition
381+
machineConfigTimeout bool
382+
expectedProgressingCondition *configv1.ClusterOperatorStatusCondition
367383
}{
368384
{
369385
name: "no errors are present",
@@ -394,10 +410,12 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
394410
name: "single UpdateEffectNone error",
395411
args: args{
396412
syncWorkerStatus: &SyncWorkerStatus{
413+
Actual: configv1.Release{Version: "1.2.3"},
397414
Failure: &payload.UpdateError{
398415
UpdateEffect: payload.UpdateEffectNone,
399416
Reason: "ClusterOperatorUpdating",
400417
Message: "Cluster operator A is updating",
418+
Name: "co-not-timeout",
401419
},
402420
},
403421
},
@@ -412,6 +430,110 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
412430
Type: ClusterStatusFailing,
413431
Status: configv1.ConditionFalse,
414432
},
433+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
434+
Type: configv1.OperatorProgressing,
435+
Status: configv1.ConditionTrue,
436+
Reason: "ClusterOperatorUpdating",
437+
Message: "Working towards 1.2.3: waiting on co-not-timeout",
438+
},
439+
},
440+
{
441+
name: "single UpdateEffectNone error and timeout",
442+
args: args{
443+
syncWorkerStatus: &SyncWorkerStatus{
444+
Actual: configv1.Release{Version: "1.2.3"},
445+
Failure: &payload.UpdateError{
446+
UpdateEffect: payload.UpdateEffectNone,
447+
Reason: "ClusterOperatorUpdating",
448+
Message: "Cluster operator A is updating",
449+
Name: "co-timeout",
450+
},
451+
},
452+
},
453+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
454+
Type: ClusterStatusFailing,
455+
Status: configv1.ConditionTrue,
456+
Reason: "ClusterOperatorUpdating",
457+
Message: "Cluster operator A is updating",
458+
},
459+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
460+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
461+
Type: ClusterStatusFailing,
462+
Status: configv1.ConditionUnknown,
463+
Reason: "SlowClusterOperator",
464+
Message: "waiting on co-timeout over 30 minutes which is longer than expected",
465+
},
466+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
467+
Type: configv1.OperatorProgressing,
468+
Status: configv1.ConditionTrue,
469+
Reason: "ClusterOperatorUpdating",
470+
Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
471+
},
472+
},
473+
{
474+
name: "single UpdateEffectNone error and machine-config",
475+
args: args{
476+
syncWorkerStatus: &SyncWorkerStatus{
477+
Actual: configv1.Release{Version: "1.2.3"},
478+
Failure: &payload.UpdateError{
479+
UpdateEffect: payload.UpdateEffectNone,
480+
Reason: "ClusterOperatorUpdating",
481+
Message: "Cluster operator A is updating",
482+
Name: "machine-config",
483+
},
484+
},
485+
},
486+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
487+
Type: ClusterStatusFailing,
488+
Status: configv1.ConditionTrue,
489+
Reason: "ClusterOperatorUpdating",
490+
Message: "Cluster operator A is updating",
491+
},
492+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
493+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
494+
Type: ClusterStatusFailing,
495+
Status: configv1.ConditionFalse,
496+
},
497+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
498+
Type: configv1.OperatorProgressing,
499+
Status: configv1.ConditionTrue,
500+
Reason: "ClusterOperatorUpdating",
501+
Message: "Working towards 1.2.3: waiting on machine-config",
502+
},
503+
},
504+
{
505+
name: "single UpdateEffectNone error and machine-config timeout",
506+
args: args{
507+
syncWorkerStatus: &SyncWorkerStatus{
508+
Actual: configv1.Release{Version: "1.2.3"},
509+
Failure: &payload.UpdateError{
510+
UpdateEffect: payload.UpdateEffectNone,
511+
Reason: "ClusterOperatorUpdating",
512+
Message: "Cluster operator A is updating",
513+
Name: "machine-config",
514+
},
515+
},
516+
},
517+
machineConfigTimeout: true,
518+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
519+
Type: ClusterStatusFailing,
520+
Status: configv1.ConditionTrue,
521+
Reason: "ClusterOperatorUpdating",
522+
Message: "Cluster operator A is updating",
523+
},
524+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
525+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
526+
Type: ClusterStatusFailing,
527+
Status: configv1.ConditionUnknown,
528+
Reason: "SlowClusterOperator",
529+
Message: "waiting on machine-config over 90 minutes which is longer than expected",
530+
},
531+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
532+
Type: configv1.OperatorProgressing,
533+
Status: configv1.ConditionTrue,
534+
Reason: "ClusterOperatorUpdating",
535+
Message: "Working towards 1.2.3: waiting on machine-config over 90 minutes which is longer than expected",
536+
},
415537
},
416538
{
417539
name: "single condensed UpdateEffectFail UpdateError",
@@ -587,6 +709,137 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
587709
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator C is degraded",
588710
},
589711
},
712+
{
713+
name: "MultipleErrors: all updating and none slow",
714+
args: args{
715+
syncWorkerStatus: &SyncWorkerStatus{
716+
Actual: configv1.Release{Version: "1.2.3"},
717+
Failure: &payload.UpdateError{
718+
UpdateEffect: payload.UpdateEffectNone,
719+
Reason: "ClusterOperatorsUpdating",
720+
Message: "some-message",
721+
Names: []string{"co-not-timeout", "co-bar-not-timeout", "co-foo-not-timeout"},
722+
},
723+
},
724+
},
725+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
726+
Type: ClusterStatusFailing,
727+
Status: configv1.ConditionTrue,
728+
Reason: "ClusterOperatorsUpdating",
729+
Message: "some-message",
730+
},
731+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
732+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
733+
Type: ClusterStatusFailing,
734+
Status: configv1.ConditionFalse,
735+
},
736+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
737+
Type: configv1.OperatorProgressing,
738+
Status: configv1.ConditionTrue,
739+
Reason: "ClusterOperatorsUpdating",
740+
Message: "Working towards 1.2.3: waiting on co-not-timeout, co-bar-not-timeout, co-foo-not-timeout",
741+
},
742+
},
743+
{
744+
name: "MultipleErrors: all updating and one slow",
745+
args: args{
746+
syncWorkerStatus: &SyncWorkerStatus{
747+
Actual: configv1.Release{Version: "1.2.3"},
748+
Failure: &payload.UpdateError{
749+
UpdateEffect: payload.UpdateEffectNone,
750+
Reason: "ClusterOperatorsUpdating",
751+
Message: "some-message",
752+
Names: []string{"co-timeout", "co-bar-not-timeout", "co-foo-not-timeout"},
753+
},
754+
},
755+
},
756+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
757+
Type: ClusterStatusFailing,
758+
Status: configv1.ConditionTrue,
759+
Reason: "ClusterOperatorsUpdating",
760+
Message: "some-message",
761+
},
762+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
763+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
764+
Type: ClusterStatusFailing,
765+
Status: configv1.ConditionUnknown,
766+
Reason: "SlowClusterOperator",
767+
Message: "waiting on co-timeout over 30 minutes which is longer than expected",
768+
},
769+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
770+
Type: configv1.OperatorProgressing,
771+
Status: configv1.ConditionTrue,
772+
Reason: "ClusterOperatorsUpdating",
773+
Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
774+
},
775+
},
776+
{
777+
name: "MultipleErrors: all updating and some slow",
778+
args: args{
779+
syncWorkerStatus: &SyncWorkerStatus{
780+
Actual: configv1.Release{Version: "1.2.3"},
781+
Failure: &payload.UpdateError{
782+
UpdateEffect: payload.UpdateEffectNone,
783+
Reason: "ClusterOperatorsUpdating",
784+
Message: "some-message",
785+
Names: []string{"co-timeout", "co-bar-timeout", "co-foo-not-timeout"},
786+
},
787+
},
788+
},
789+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
790+
Type: ClusterStatusFailing,
791+
Status: configv1.ConditionTrue,
792+
Reason: "ClusterOperatorsUpdating",
793+
Message: "some-message",
794+
},
795+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
796+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
797+
Type: ClusterStatusFailing,
798+
Status: configv1.ConditionUnknown,
799+
Reason: "SlowClusterOperator",
800+
Message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
801+
},
802+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
803+
Type: configv1.OperatorProgressing,
804+
Status: configv1.ConditionTrue,
805+
Reason: "ClusterOperatorsUpdating",
806+
Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
807+
},
808+
},
809+
{
810+
name: "MultipleErrors: all updating and all slow",
811+
args: args{
812+
syncWorkerStatus: &SyncWorkerStatus{
813+
Actual: configv1.Release{Version: "1.2.3"},
814+
Failure: &payload.UpdateError{
815+
UpdateEffect: payload.UpdateEffectNone,
816+
Reason: "ClusterOperatorsUpdating",
817+
Message: "some-message",
818+
Names: []string{"co-timeout", "co-bar-timeout", "machine-config"},
819+
},
820+
},
821+
},
822+
machineConfigTimeout: true,
823+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
824+
Type: ClusterStatusFailing,
825+
Status: configv1.ConditionTrue,
826+
Reason: "ClusterOperatorsUpdating",
827+
Message: "some-message",
828+
},
829+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
830+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
831+
Type: ClusterStatusFailing,
832+
Status: configv1.ConditionUnknown,
833+
Reason: "SlowClusterOperator",
834+
Message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
835+
},
836+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
837+
Type: configv1.OperatorProgressing,
838+
Status: configv1.ConditionTrue,
839+
Reason: "ClusterOperatorsUpdating",
840+
Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
841+
},
842+
},
590843
}
591844
for _, tc := range tests {
592845
tc := tc
@@ -606,6 +859,11 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
606859
{true, false},
607860
{true, true},
608861
}
862+
if tc.machineConfigTimeout {
863+
payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-120*time.Minute))
864+
} else {
865+
payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute))
866+
}
609867
for _, c := range combinations {
610868
tc.args.syncWorkerStatus.Reconciling = c.isReconciling
611869
cvStatus := &configv1.ClusterVersionStatus{}
@@ -621,7 +879,15 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
621879
if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
622880
t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
623881
}
882+
883+
if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
884+
progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
885+
if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
886+
t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
887+
}
888+
}
624889
}
890+
payload.COUpdateStartTimesRemove("machine-config")
625891
})
626892
}
627893
}

0 commit comments

Comments
 (0)