Skip to content

Commit b855c68

Browse files
Merge pull request #1212 from openshift-cherrypick-robot/cherry-pick-1165-to-release-4.17
[release-4.17] OCPBUGS-58451: Failing=Unknown upon long CO updating
2 parents b06c462 + b0f13d9 commit b855c68

File tree

3 files changed

+335
-14
lines changed

3 files changed

+335
-14
lines changed

pkg/cvo/status.go

Lines changed: 62 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,14 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
321321
failingCondition.Reason = failingReason
322322
failingCondition.Message = failingMessage
323323
}
324+
if failure != nil &&
325+
strings.HasPrefix(progressReason, slowCOUpdatePrefix) {
326+
failingCondition.Status = configv1.ConditionUnknown
327+
failingCondition.Reason = "SlowClusterOperator"
328+
failingCondition.Message = progressMessage
329+
}
330+
progressReason = strings.TrimPrefix(progressReason, slowCOUpdatePrefix)
331+
324332
resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
325333

326334
// update progressing
@@ -531,6 +539,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus,
531539
}
532540
}
533541

542+
// slowCOUpdatePrefix is prepended to the progressing reason by
// convertErrorToProgressingForUpdateEffectNone when an operator has been
// updating for longer than expected. updateClusterVersionStatus checks for the
// prefix to set the Failing condition to Unknown with reason
// "SlowClusterOperator", and then trims the prefix from the progressing reason.
const slowCOUpdatePrefix = "Slow::"
543+
534544
// convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
535545
// still making internal progress. The general error we try to suppress is an operator or operators still being
536546
// progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines
@@ -549,28 +559,67 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
549559
case payload.UpdateEffectReport:
550560
return uErr.Reason, uErr.Error(), false
551561
case payload.UpdateEffectNone:
552-
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
562+
return convertErrorToProgressingForUpdateEffectNone(uErr, now)
553563
case payload.UpdateEffectFail:
554564
return "", "", false
555565
case payload.UpdateEffectFailAfterInterval:
556-
var exceeded []string
557-
threshold := now.Add(-(40 * time.Minute))
558-
names := uErr.Names
559-
if len(names) == 0 {
560-
names = []string{uErr.Name}
566+
return convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr, now)
567+
}
568+
return "", "", false
569+
}
570+
571+
// convertErrorToProgressingForUpdateEffectNone builds the progressing reason and
// message for an update error whose UpdateEffect is payload.UpdateEffectNone.
//
// It inspects uErr.Names (falling back to the single uErr.Name when Names is
// empty) and, for each operator, compares the start time reported by
// payload.COUpdateStartTimesGet against now. An operator is considered slow when
// its start time is non-zero and older than 30 minutes, or older than 90 minutes
// for "machine-config".
//
// Returns:
//   - reason: uErr.Reason, prefixed with slowCOUpdatePrefix when at least one
//     operator is slow;
//   - message: a "waiting on ..." description naming the slow operators, or all
//     inspected names when none is slow;
//   - a bool that is true on every return path of this function, i.e. the error
//     is always treated as still progressing.
func convertErrorToProgressingForUpdateEffectNone(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
572+
var exceeded []string
573+
names := uErr.Names
574+
if len(names) == 0 {
575+
names = []string{uErr.Name}
576+
}
577+
var machineConfig bool
578+
for _, name := range names {
579+
m := 30 * time.Minute
580+
// machine-config (MCO) takes longer to upgrade, so allow it three times the default window.
581+
if name == "machine-config" {
582+
m = 3 * m
561583
}
562-
for _, name := range names {
563-
if payload.COUpdateStartTimesGet(name).Before(threshold) {
584+
t := payload.COUpdateStartTimesGet(name)
585+
// A zero start time is skipped, so it never counts as slow.
if (!t.IsZero()) && t.Before(now.Add(-(m))) {
586+
if name == "machine-config" {
587+
machineConfig = true
588+
} else {
564589
exceeded = append(exceeded, name)
565590
}
566591
}
567-
if len(exceeded) > 0 {
568-
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
569-
} else {
570-
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
592+
}
593+
// The slow cases below still return true (progressing): slowness is only a suspicion, not a confirmed failure.
594+
if len(exceeded) > 0 && !machineConfig {
595+
return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
596+
}
597+
if len(exceeded) > 0 && machineConfig {
598+
return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
599+
}
600+
if len(exceeded) == 0 && machineConfig {
601+
return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true
602+
}
603+
// No operator is slow: report a plain wait on every inspected name, without the slow prefix.
return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
604+
}
605+
606+
// convertErrorToProgressingForUpdateEffectFailAfterInterval builds the
// progressing reason and message for an update error whose UpdateEffect is
// payload.UpdateEffectFailAfterInterval.
//
// It inspects uErr.Names (falling back to the single uErr.Name when Names is
// empty) and collects every operator whose start time, as reported by
// payload.COUpdateStartTimesGet, is more than 40 minutes before now.
//
// Returns uErr.Reason in both cases, together with:
//   - a "wait has exceeded 40 minutes" message and false when at least one
//     operator exceeded the threshold;
//   - a "waiting up to 40 minutes" message and true otherwise.
//
// NOTE(review): unlike convertErrorToProgressingForUpdateEffectNone, there is no
// IsZero guard here; if COUpdateStartTimesGet can return the zero time for an
// unknown operator, that operator would count as exceeded — confirm.
func convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
607+
var exceeded []string
608+
threshold := now.Add(-(40 * time.Minute))
609+
names := uErr.Names
610+
if len(names) == 0 {
611+
names = []string{uErr.Name}
612+
}
613+
for _, name := range names {
614+
if payload.COUpdateStartTimesGet(name).Before(threshold) {
615+
exceeded = append(exceeded, name)
571616
}
572617
}
573-
return "", "", false
618+
if len(exceeded) > 0 {
619+
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
620+
} else {
621+
// NOTE(review): this message uses uErr.Name rather than the joined names inspected above — confirm that is intended when uErr.Names is set.
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
622+
}
574623
}
575624

576625
// syncFailingStatus handles generic errors in the cluster version. It tries to preserve

0 commit comments

Comments
 (0)