Skip to content

Commit 3c3b82b

Browse files
hongkailiuopenshift-cherrypick-robot
authored andcommitted
OCPBUGS-23514: Failing=Unknown upon long CO updating
When it takes too long (90m+ for machine-config and 30m+ for others) to upgrade a cluster operator, clusterversion shows a message with the indication that the upgrade might hit some issue. This will cover the case in the related OCPBUGS-23538: for some reason, the pod under the deployment that manages the CO hit CrashLoopBackOff. Deployment controller does not give useful conditions in this situation [1]. Otherwise, checkDeploymentHealth [2] would detect it. Instead of CVO's figuring out the underlying pod's CrashLoopBackOff which might be better to be implemented by deployment controller, it is expected that our cluster admin starts to dig into the cluster when such a message pops up. In addition to the condition's message. We propagate Fail=Unknown to make it available for other automations, such as update-status command. [1]. kubernetes/kubernetes#106054 [2]. https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
1 parent b06c462 commit 3c3b82b

File tree

3 files changed

+316
-2
lines changed

3 files changed

+316
-2
lines changed

pkg/cvo/status.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,15 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
321321
failingCondition.Reason = failingReason
322322
failingCondition.Message = failingMessage
323323
}
324+
if failure != nil &&
325+
skipFailure &&
326+
(progressReason == "ClusterOperatorUpdating" || progressReason == "ClusterOperatorsUpdating") &&
327+
strings.HasPrefix(progressMessage, slowCOUpdatePrefix) {
328+
progressMessage = strings.TrimPrefix(progressMessage, slowCOUpdatePrefix)
329+
failingCondition.Status = configv1.ConditionUnknown
330+
failingCondition.Reason = "SlowClusterOperator"
331+
failingCondition.Message = progressMessage
332+
}
324333
resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
325334

326335
// update progressing
@@ -531,6 +540,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus,
531540
}
532541
}
533542

543+
const slowCOUpdatePrefix = "Slow::"
544+
534545
// convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
535546
// still making internal progress. The general error we try to suppress is an operator or operators still being
536547
// progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines
@@ -549,7 +560,38 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
549560
case payload.UpdateEffectReport:
550561
return uErr.Reason, uErr.Error(), false
551562
case payload.UpdateEffectNone:
552-
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
563+
var exceeded []string
564+
names := uErr.Names
565+
if len(names) == 0 {
566+
names = []string{uErr.Name}
567+
}
568+
var machineConfig bool
569+
for _, name := range names {
570+
m := 30 * time.Minute
571+
// It takes longer to upgrade MCO
572+
if name == "machine-config" {
573+
m = 3 * m
574+
}
575+
t := payload.COUpdateStartTimesGet(name)
576+
if (!t.IsZero()) && t.Before(now.Add(-(m))) {
577+
if name == "machine-config" {
578+
machineConfig = true
579+
} else {
580+
exceeded = append(exceeded, name)
581+
}
582+
}
583+
}
584+
// returns true in those slow cases because it is still only a suspicion
585+
if len(exceeded) > 0 && !machineConfig {
586+
return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
587+
}
588+
if len(exceeded) > 0 && machineConfig {
589+
return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
590+
}
591+
if len(exceeded) == 0 && machineConfig {
592+
return uErr.Reason, fmt.Sprintf("%swaiting on machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix), true
593+
}
594+
return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
553595
case payload.UpdateEffectFail:
554596
return "", "", false
555597
case payload.UpdateEffectFailAfterInterval:

pkg/cvo/status_test.go

Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"reflect"
77
"testing"
8+
"time"
89

910
"github.com/google/go-cmp/cmp"
1011
"github.com/google/go-cmp/cmp/cmpopts"
@@ -353,12 +354,27 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
353354
type args struct {
354355
syncWorkerStatus *SyncWorkerStatus
355356
}
357+
payload.COUpdateStartTimesEnsure("co-not-timeout")
358+
payload.COUpdateStartTimesEnsure("co-bar-not-timeout")
359+
payload.COUpdateStartTimesEnsure("co-foo-not-timeout")
360+
payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute))
361+
payload.COUpdateStartTimesAt("co-bar-timeout", time.Now().Add(-60*time.Minute))
362+
defer func() {
363+
payload.COUpdateStartTimesRemove("co-not-timeout")
364+
payload.COUpdateStartTimesRemove("co-bar-not-timeout")
365+
payload.COUpdateStartTimesRemove("co-foo-not-timeout")
366+
payload.COUpdateStartTimesRemove("co-timeout")
367+
payload.COUpdateStartTimesRemove("co-bar-timeout")
368+
}()
369+
356370
tests := []struct {
357371
name string
358372
args args
359373
shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
360374
expectedConditionNotModified *configv1.ClusterOperatorStatusCondition
361375
expectedConditionModified *configv1.ClusterOperatorStatusCondition
376+
machineConfigTimeout bool
377+
expectedProgressingCondition *configv1.ClusterOperatorStatusCondition
362378
}{
363379
{
364380
name: "no errors are present",
@@ -389,10 +405,12 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
389405
name: "single UpdateEffectNone error",
390406
args: args{
391407
syncWorkerStatus: &SyncWorkerStatus{
408+
Actual: configv1.Release{Version: "1.2.3"},
392409
Failure: &payload.UpdateError{
393410
UpdateEffect: payload.UpdateEffectNone,
394411
Reason: "ClusterOperatorUpdating",
395412
Message: "Cluster operator A is updating",
413+
Name: "co-not-timeout",
396414
},
397415
},
398416
},
@@ -407,6 +425,110 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
407425
Type: ClusterStatusFailing,
408426
Status: configv1.ConditionFalse,
409427
},
428+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
429+
Type: configv1.OperatorProgressing,
430+
Status: configv1.ConditionTrue,
431+
Reason: "ClusterOperatorUpdating",
432+
Message: "Working towards 1.2.3: waiting on co-not-timeout",
433+
},
434+
},
435+
{
436+
name: "single UpdateEffectNone error and timeout",
437+
args: args{
438+
syncWorkerStatus: &SyncWorkerStatus{
439+
Actual: configv1.Release{Version: "1.2.3"},
440+
Failure: &payload.UpdateError{
441+
UpdateEffect: payload.UpdateEffectNone,
442+
Reason: "ClusterOperatorUpdating",
443+
Message: "Cluster operator A is updating",
444+
Name: "co-timeout",
445+
},
446+
},
447+
},
448+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
449+
Type: ClusterStatusFailing,
450+
Status: configv1.ConditionTrue,
451+
Reason: "ClusterOperatorUpdating",
452+
Message: "Cluster operator A is updating",
453+
},
454+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
455+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
456+
Type: ClusterStatusFailing,
457+
Status: configv1.ConditionUnknown,
458+
Reason: "SlowClusterOperator",
459+
Message: "waiting on co-timeout over 30 minutes which is longer than expected",
460+
},
461+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
462+
Type: configv1.OperatorProgressing,
463+
Status: configv1.ConditionTrue,
464+
Reason: "ClusterOperatorUpdating",
465+
Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
466+
},
467+
},
468+
{
469+
name: "single UpdateEffectNone error and machine-config",
470+
args: args{
471+
syncWorkerStatus: &SyncWorkerStatus{
472+
Actual: configv1.Release{Version: "1.2.3"},
473+
Failure: &payload.UpdateError{
474+
UpdateEffect: payload.UpdateEffectNone,
475+
Reason: "ClusterOperatorUpdating",
476+
Message: "Cluster operator A is updating",
477+
Name: "machine-config",
478+
},
479+
},
480+
},
481+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
482+
Type: ClusterStatusFailing,
483+
Status: configv1.ConditionTrue,
484+
Reason: "ClusterOperatorUpdating",
485+
Message: "Cluster operator A is updating",
486+
},
487+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
488+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
489+
Type: ClusterStatusFailing,
490+
Status: configv1.ConditionFalse,
491+
},
492+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
493+
Type: configv1.OperatorProgressing,
494+
Status: configv1.ConditionTrue,
495+
Reason: "ClusterOperatorUpdating",
496+
Message: "Working towards 1.2.3: waiting on machine-config",
497+
},
498+
},
499+
{
500+
name: "single UpdateEffectNone error and machine-config timeout",
501+
args: args{
502+
syncWorkerStatus: &SyncWorkerStatus{
503+
Actual: configv1.Release{Version: "1.2.3"},
504+
Failure: &payload.UpdateError{
505+
UpdateEffect: payload.UpdateEffectNone,
506+
Reason: "ClusterOperatorUpdating",
507+
Message: "Cluster operator A is updating",
508+
Name: "machine-config",
509+
},
510+
},
511+
},
512+
machineConfigTimeout: true,
513+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
514+
Type: ClusterStatusFailing,
515+
Status: configv1.ConditionTrue,
516+
Reason: "ClusterOperatorUpdating",
517+
Message: "Cluster operator A is updating",
518+
},
519+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
520+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
521+
Type: ClusterStatusFailing,
522+
Status: configv1.ConditionUnknown,
523+
Reason: "SlowClusterOperator",
524+
Message: "waiting on machine-config over 90 minutes which is longer than expected",
525+
},
526+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
527+
Type: configv1.OperatorProgressing,
528+
Status: configv1.ConditionTrue,
529+
Reason: "ClusterOperatorUpdating",
530+
Message: "Working towards 1.2.3: waiting on machine-config over 90 minutes which is longer than expected",
531+
},
410532
},
411533
{
412534
name: "single condensed UpdateEffectFail UpdateError",
@@ -582,6 +704,137 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
582704
Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator C is degraded",
583705
},
584706
},
707+
{
708+
name: "MultipleErrors: all updating and none slow",
709+
args: args{
710+
syncWorkerStatus: &SyncWorkerStatus{
711+
Actual: configv1.Release{Version: "1.2.3"},
712+
Failure: &payload.UpdateError{
713+
UpdateEffect: payload.UpdateEffectNone,
714+
Reason: "ClusterOperatorsUpdating",
715+
Message: "some-message",
716+
Names: []string{"co-not-timeout", "co-bar-not-timeout", "co-foo-not-timeout"},
717+
},
718+
},
719+
},
720+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
721+
Type: ClusterStatusFailing,
722+
Status: configv1.ConditionTrue,
723+
Reason: "ClusterOperatorsUpdating",
724+
Message: "some-message",
725+
},
726+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
727+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
728+
Type: ClusterStatusFailing,
729+
Status: configv1.ConditionFalse,
730+
},
731+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
732+
Type: configv1.OperatorProgressing,
733+
Status: configv1.ConditionTrue,
734+
Reason: "ClusterOperatorsUpdating",
735+
Message: "Working towards 1.2.3: waiting on co-not-timeout, co-bar-not-timeout, co-foo-not-timeout",
736+
},
737+
},
738+
{
739+
name: "MultipleErrors: all updating and one slow",
740+
args: args{
741+
syncWorkerStatus: &SyncWorkerStatus{
742+
Actual: configv1.Release{Version: "1.2.3"},
743+
Failure: &payload.UpdateError{
744+
UpdateEffect: payload.UpdateEffectNone,
745+
Reason: "ClusterOperatorsUpdating",
746+
Message: "some-message",
747+
Names: []string{"co-timeout", "co-bar-not-timeout", "co-foo-not-timeout"},
748+
},
749+
},
750+
},
751+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
752+
Type: ClusterStatusFailing,
753+
Status: configv1.ConditionTrue,
754+
Reason: "ClusterOperatorsUpdating",
755+
Message: "some-message",
756+
},
757+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
758+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
759+
Type: ClusterStatusFailing,
760+
Status: configv1.ConditionUnknown,
761+
Reason: "SlowClusterOperator",
762+
Message: "waiting on co-timeout over 30 minutes which is longer than expected",
763+
},
764+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
765+
Type: configv1.OperatorProgressing,
766+
Status: configv1.ConditionTrue,
767+
Reason: "ClusterOperatorsUpdating",
768+
Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
769+
},
770+
},
771+
{
772+
name: "MultipleErrors: all updating and some slow",
773+
args: args{
774+
syncWorkerStatus: &SyncWorkerStatus{
775+
Actual: configv1.Release{Version: "1.2.3"},
776+
Failure: &payload.UpdateError{
777+
UpdateEffect: payload.UpdateEffectNone,
778+
Reason: "ClusterOperatorsUpdating",
779+
Message: "some-message",
780+
Names: []string{"co-timeout", "co-bar-timeout", "co-foo-not-timeout"},
781+
},
782+
},
783+
},
784+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
785+
Type: ClusterStatusFailing,
786+
Status: configv1.ConditionTrue,
787+
Reason: "ClusterOperatorsUpdating",
788+
Message: "some-message",
789+
},
790+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
791+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
792+
Type: ClusterStatusFailing,
793+
Status: configv1.ConditionUnknown,
794+
Reason: "SlowClusterOperator",
795+
Message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
796+
},
797+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
798+
Type: configv1.OperatorProgressing,
799+
Status: configv1.ConditionTrue,
800+
Reason: "ClusterOperatorsUpdating",
801+
Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
802+
},
803+
},
804+
{
805+
name: "MultipleErrors: all updating and all slow",
806+
args: args{
807+
syncWorkerStatus: &SyncWorkerStatus{
808+
Actual: configv1.Release{Version: "1.2.3"},
809+
Failure: &payload.UpdateError{
810+
UpdateEffect: payload.UpdateEffectNone,
811+
Reason: "ClusterOperatorsUpdating",
812+
Message: "some-message",
813+
Names: []string{"co-timeout", "co-bar-timeout", "machine-config"},
814+
},
815+
},
816+
},
817+
machineConfigTimeout: true,
818+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
819+
Type: ClusterStatusFailing,
820+
Status: configv1.ConditionTrue,
821+
Reason: "ClusterOperatorsUpdating",
822+
Message: "some-message",
823+
},
824+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
825+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
826+
Type: ClusterStatusFailing,
827+
Status: configv1.ConditionUnknown,
828+
Reason: "SlowClusterOperator",
829+
Message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
830+
},
831+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
832+
Type: configv1.OperatorProgressing,
833+
Status: configv1.ConditionTrue,
834+
Reason: "ClusterOperatorsUpdating",
835+
Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
836+
},
837+
},
585838
}
586839
for _, tc := range tests {
587840
tc := tc
@@ -601,6 +854,11 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
601854
{true, false},
602855
{true, true},
603856
}
857+
if tc.machineConfigTimeout {
858+
payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-120*time.Minute))
859+
} else {
860+
payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute))
861+
}
604862
for _, c := range combinations {
605863
tc.args.syncWorkerStatus.Reconciling = c.isReconciling
606864
cvStatus := &configv1.ClusterVersionStatus{}
@@ -616,7 +874,15 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
616874
if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
617875
t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
618876
}
877+
878+
if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
879+
progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
880+
if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
881+
t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
882+
}
883+
}
619884
}
885+
payload.COUpdateStartTimesRemove("machine-config")
620886
})
621887
}
622888
}

0 commit comments

Comments
 (0)