Skip to content

Commit 68520e0

Browse files
committed
OCPBUGS-62627: cluster operator ingress reported Progressing=True with reason=Reconciling for a node reboot
Signed-off-by: Davide Salerno <[email protected]>
1 parent 0cac97a commit 68520e0

File tree

3 files changed

+349
-48
lines changed

3 files changed

+349
-48
lines changed

pkg/operator/controller/ingress/status.go

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"errors"
88
"fmt"
99
"reflect"
10+
"slices"
1011
"sort"
1112
"strings"
1213
"time"
@@ -47,6 +48,38 @@ type expectedCondition struct {
4748
// or else the condition is not checked.
4849
ifConditionsTrue []string
4950
gracePeriod time.Duration
51+
// ignoreReasons specifies condition reasons that should be ignored when
52+
// determining if the expected condition is met. This allows distinguishing
53+
// between significant state changes (configuration rollouts) and transient
54+
// infrastructure events (node reboots, pod restarts).
55+
// ignoreReasons are only consulted when the condition exists and its Status
56+
// differs from expected.status; they are not consulted if the condition is
57+
// entirely missing
58+
ignoreReasons []string
59+
}
60+
61+
// DeploymentRollingOutConditionReason enum to map the different reasons a deployment could be in rolling out condition
62+
// since several functions are using this condition's reason for setting the Progressing condition to true or false
63+
type DeploymentRollingOutConditionReason int
64+
65+
const (
66+
ReasonDeploymentRollingOut DeploymentRollingOutConditionReason = iota
67+
ReasonNotRollingOut
68+
ReasonPodsStarting
69+
ReasonReplicasStabilizing
70+
)
71+
72+
var deploymentRollingOutConditionReason = map[DeploymentRollingOutConditionReason]string{
73+
ReasonDeploymentRollingOut: "DeploymentRollingOut",
74+
ReasonNotRollingOut: "DeploymentNotRollingOut",
75+
ReasonPodsStarting: "PodsStarting",
76+
ReasonReplicasStabilizing: "ReplicasStabilizing",
77+
}
78+
79+
const deploymentNewRSAvailableReason = "NewReplicaSetAvailable"
80+
81+
func (ss DeploymentRollingOutConditionReason) String() string {
82+
return deploymentRollingOutConditionReason[ss]
5083
}
5184

5285
// syncIngressControllerStatus computes the current status of ic and
@@ -330,7 +363,7 @@ func checkConditions(expectedConds []expectedCondition, conditions []operatorv1.
330363
if !haveCondition {
331364
continue
332365
}
333-
if condition.Status == expected.status {
366+
if condition.Status == expected.status || slices.Contains(expected.ignoreReasons, condition.Reason) {
334367
continue
335368
}
336369
failedPredicates := false
@@ -497,40 +530,56 @@ func computeDeploymentReplicasAllAvailableCondition(deployment *appsv1.Deploymen
497530
}
498531

499532
// computeDeploymentRollingOutCondition computes the ingress controller's
500-
// "DeploymentRollingOut" status condition by examining the number of updated
501-
// replicas reported in the deployment's status. The "DeploymentRollingOut"
533+
// "ReasonDeploymentRollingOut" status condition by examining the number of updated
534+
// replicas reported in the deployment's status. The "ReasonDeploymentRollingOut"
502535
// condition is true if the number of updated replicas is not equal to the number
503536
// of expected or available replicas.
504537
// See Reference: https://github.com/kubernetes/kubectl/blob/master/pkg/polymorphichelpers/rollout_status.go
505538
func computeDeploymentRollingOutCondition(deployment *appsv1.Deployment) operatorv1.OperatorCondition {
506-
// If have replicas is less than want replicas, then we are waiting for replicas to be updated.
539+
// Check if the deployment has the "NewReplicaSetAvailable" condition.
540+
// This indicates that Kubernetes has successfully completed a rollout.
541+
hasNewReplicaSetAvailable := false
542+
for _, cond := range deployment.Status.Conditions {
543+
if cond.Type == appsv1.DeploymentProgressing &&
544+
cond.Status == corev1.ConditionTrue &&
545+
cond.Reason == deploymentNewRSAvailableReason {
546+
hasNewReplicaSetAvailable = true
547+
548+
break
549+
}
550+
}
551+
552+
// If have replicas is less than wantGrace replicas, then we are waiting for replicas to be updated.
507553
if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
554+
reason := computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable, deployment)
508555
return operatorv1.OperatorCondition{
509556
Type: IngressControllerDeploymentRollingOutConditionType,
510557
Status: operatorv1.ConditionTrue,
511-
Reason: "DeploymentRollingOut",
558+
Reason: reason,
512559
Message: fmt.Sprintf(
513560
"Waiting for router deployment rollout to finish: %d out of %d new replica(s) have been updated...\n",
514561
deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas),
515562
}
516563
}
517564
// If have replicas greater than updated replicas, then we are waiting for old replicas to terminate.
518565
if deployment.Status.Replicas > deployment.Status.UpdatedReplicas {
566+
reason := computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable, deployment)
519567
return operatorv1.OperatorCondition{
520568
Type: IngressControllerDeploymentRollingOutConditionType,
521569
Status: operatorv1.ConditionTrue,
522-
Reason: "DeploymentRollingOut",
570+
Reason: reason,
523571
Message: fmt.Sprintf(
524572
"Waiting for router deployment rollout to finish: %d old replica(s) are pending termination...\n",
525573
deployment.Status.Replicas-deployment.Status.UpdatedReplicas),
526574
}
527575
}
528576
// If available replicas less than updated replicas, then we are waiting for updated replicas to become available.
529577
if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas {
578+
reason := computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable, deployment)
530579
return operatorv1.OperatorCondition{
531580
Type: IngressControllerDeploymentRollingOutConditionType,
532581
Status: operatorv1.ConditionTrue,
533-
Reason: "DeploymentRollingOut",
582+
Reason: reason,
534583
Message: fmt.Sprintf(
535584
"Waiting for router deployment rollout to finish: %d of %d updated replica(s) are available...\n",
536585
deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas),
@@ -540,11 +589,49 @@ func computeDeploymentRollingOutCondition(deployment *appsv1.Deployment) operato
540589
return operatorv1.OperatorCondition{
541590
Type: IngressControllerDeploymentRollingOutConditionType,
542591
Status: operatorv1.ConditionFalse,
543-
Reason: "DeploymentNotRollingOut",
592+
Reason: ReasonNotRollingOut.String(),
544593
Message: "Deployment is not actively rolling out",
545594
}
546595
}
547596

597+
// computeDeploymentRollingOutStatusAndReason computes the ingresscontroller's Status and Reason for DeploymentRollingOut
598+
// condition type checking if the deployment has the "NewReplicaSetAvailable" condition and the status of the pod replicas
599+
// and the generation metadata, in order to detect if the deployment is rolling out for am ingresscontroller configuration
600+
// change or due to an infrastructure change (i.e. node reboot due to update or scale up or down)
601+
func computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable bool, deployment *appsv1.Deployment) string {
602+
var (
603+
wantReplicas = int32(0)
604+
haveReplicas = deployment.Status.Replicas
605+
updatedReplicas = deployment.Status.UpdatedReplicas
606+
availableReplicas = deployment.Status.AvailableReplicas
607+
readyReplicas = deployment.Status.ReadyReplicas
608+
)
609+
610+
if deployment.Spec.Replicas != nil {
611+
wantReplicas = *deployment.Spec.Replicas
612+
}
613+
614+
if hasNewReplicaSetAvailable && deployment.Status.ObservedGeneration == deployment.Generation &&
615+
updatedReplicas == wantReplicas {
616+
if haveReplicas != wantReplicas {
617+
return ReasonReplicasStabilizing.String()
618+
}
619+
if readyReplicas < wantReplicas || availableReplicas < wantReplicas {
620+
return ReasonPodsStarting.String()
621+
}
622+
}
623+
624+
// Additional safety check: Even without NewReplicaSetAvailable, if everything
625+
// matches except readiness, treat it as pods starting up
626+
if deployment.Status.ObservedGeneration == deployment.Generation &&
627+
updatedReplicas == wantReplicas &&
628+
haveReplicas == wantReplicas &&
629+
(readyReplicas < wantReplicas || availableReplicas < wantReplicas) {
630+
return ReasonPodsStarting.String()
631+
}
632+
return ReasonDeploymentRollingOut.String()
633+
}
634+
548635
// computeIngressDegradedCondition computes the ingresscontroller's "Degraded"
549636
// status condition, which aggregates other status conditions that can indicate
550637
// a degraded state. In addition, computeIngressDegradedCondition returns a
@@ -593,12 +680,7 @@ func computeIngressDegradedCondition(conditions []operatorv1.OperatorCondition,
593680
// Only check the default ingress controller for the canary
594681
// success status condition.
595682
if icName == manifests.DefaultIngressControllerName {
596-
canaryCond := struct {
597-
condition string
598-
status operatorv1.ConditionStatus
599-
ifConditionsTrue []string
600-
gracePeriod time.Duration
601-
}{
683+
canaryCond := expectedCondition{
602684
condition: IngressControllerCanaryCheckSuccessConditionType,
603685
status: operatorv1.ConditionTrue,
604686
gracePeriod: time.Second * 60,
@@ -765,6 +847,12 @@ func computeIngressProgressingCondition(conditions []operatorv1.OperatorConditio
765847
{
766848
condition: IngressControllerDeploymentRollingOutConditionType,
767849
status: operatorv1.ConditionFalse,
850+
// Ignore infrastructure-driven deployment rollouts when computing
851+
// the IngressController's Progressing condition.
852+
ignoreReasons: []string{
853+
ReasonReplicasStabilizing.String(), // Node reboots, pod evictions
854+
ReasonPodsStarting.String(), // Pods restarting after infrastructure events
855+
},
768856
},
769857
}
770858

0 commit comments

Comments
 (0)