Skip to content

Commit 6ab3796

Browse files
committed
OCPBUGS-62627: cluster operator ingress reported Progressing=True with reason=Reconciling for a node reboot
Signed-off-by: Davide Salerno <[email protected]>
1 parent 0cac97a commit 6ab3796

File tree

3 files changed

+203
-11
lines changed

3 files changed

+203
-11
lines changed

pkg/operator/controller/ingress/status.go

Lines changed: 93 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"errors"
88
"fmt"
99
"reflect"
10+
"slices"
1011
"sort"
1112
"strings"
1213
"time"
@@ -47,6 +48,33 @@ type expectedCondition struct {
4748
// or else the condition is not checked.
4849
ifConditionsTrue []string
4950
gracePeriod time.Duration
51+
// ignoreReasons specifies condition reasons that should be ignored when
52+
// determining if the expected condition is met. This allows distinguishing
53+
// between significant state changes (configuration rollouts) and transient
54+
// infrastructure events (node reboots, pod restarts).
55+
ignoreReasons []string
56+
}
57+
58+
// DeploymentRollingOutConditionReason enum to map the different reasons a deployment could be in rolling out condition
59+
// since several functions are using this condition's reason for setting the Progressing condition to true or false
60+
type DeploymentRollingOutConditionReason int
61+
62+
const (
63+
DeploymentRollingOut DeploymentRollingOutConditionReason = iota
64+
DeploymentNotRollingOut
65+
PodsStarting
66+
ReplicasStabilizing
67+
)
68+
69+
var deploymentRollingOutConditionReason = map[DeploymentRollingOutConditionReason]string{
70+
DeploymentRollingOut: "DeploymentRollingOut",
71+
DeploymentNotRollingOut: "DeploymentNotRollingOut",
72+
PodsStarting: "PodsStarting",
73+
ReplicasStabilizing: "ReplicasStabilizing",
74+
}
75+
76+
func (ss DeploymentRollingOutConditionReason) String() string {
77+
return deploymentRollingOutConditionReason[ss]
5078
}
5179

5280
// syncIngressControllerStatus computes the current status of ic and
@@ -330,7 +358,7 @@ func checkConditions(expectedConds []expectedCondition, conditions []operatorv1.
330358
if !haveCondition {
331359
continue
332360
}
333-
if condition.Status == expected.status {
361+
if condition.Status == expected.status || slices.Contains(expected.ignoreReasons, condition.Reason) {
334362
continue
335363
}
336364
failedPredicates := false
@@ -503,34 +531,49 @@ func computeDeploymentReplicasAllAvailableCondition(deployment *appsv1.Deploymen
503531
// of expected or available replicas.
504532
// See Reference: https://github.com/kubernetes/kubectl/blob/master/pkg/polymorphichelpers/rollout_status.go
505533
func computeDeploymentRollingOutCondition(deployment *appsv1.Deployment) operatorv1.OperatorCondition {
534+
// Check if the deployment has the "NewReplicaSetAvailable" condition.
535+
// This indicates that Kubernetes has successfully completed a rollout.
536+
hasNewReplicaSetAvailable := false
537+
for _, cond := range deployment.Status.Conditions {
538+
if cond.Type == appsv1.DeploymentProgressing &&
539+
cond.Status == corev1.ConditionTrue &&
540+
cond.Reason == "NewReplicaSetAvailable" {
541+
hasNewReplicaSetAvailable = true
542+
break
543+
}
544+
}
545+
506546
// If have replicas is less than want replicas, then we are waiting for replicas to be updated.
507547
if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
548+
reason := computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable, deployment)
508549
return operatorv1.OperatorCondition{
509550
Type: IngressControllerDeploymentRollingOutConditionType,
510551
Status: operatorv1.ConditionTrue,
511-
Reason: "DeploymentRollingOut",
552+
Reason: reason,
512553
Message: fmt.Sprintf(
513554
"Waiting for router deployment rollout to finish: %d out of %d new replica(s) have been updated...\n",
514555
deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas),
515556
}
516557
}
517558
// If have replicas greater than updated replicas, then we are waiting for old replicas to terminate.
518559
if deployment.Status.Replicas > deployment.Status.UpdatedReplicas {
560+
reason := computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable, deployment)
519561
return operatorv1.OperatorCondition{
520562
Type: IngressControllerDeploymentRollingOutConditionType,
521563
Status: operatorv1.ConditionTrue,
522-
Reason: "DeploymentRollingOut",
564+
Reason: reason,
523565
Message: fmt.Sprintf(
524566
"Waiting for router deployment rollout to finish: %d old replica(s) are pending termination...\n",
525567
deployment.Status.Replicas-deployment.Status.UpdatedReplicas),
526568
}
527569
}
528570
// If available replicas less than updated replicas, then we are waiting for updated replicas to become available.
529571
if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas {
572+
reason := computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable, deployment)
530573
return operatorv1.OperatorCondition{
531574
Type: IngressControllerDeploymentRollingOutConditionType,
532575
Status: operatorv1.ConditionTrue,
533-
Reason: "DeploymentRollingOut",
576+
Reason: reason,
534577
Message: fmt.Sprintf(
535578
"Waiting for router deployment rollout to finish: %d of %d updated replica(s) are available...\n",
536579
deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas),
@@ -540,11 +583,49 @@ func computeDeploymentRollingOutCondition(deployment *appsv1.Deployment) operato
540583
return operatorv1.OperatorCondition{
541584
Type: IngressControllerDeploymentRollingOutConditionType,
542585
Status: operatorv1.ConditionFalse,
543-
Reason: "DeploymentNotRollingOut",
586+
Reason: DeploymentNotRollingOut.String(),
544587
Message: "Deployment is not actively rolling out",
545588
}
546589
}
547590

591+
// computeDeploymentRollingOutStatusAndReason computes the ingresscontroller's Status and Reason for DeploymentRollingOut
592+
// condition type checking if the deployment has the "NewReplicaSetAvailable" condition and the status of the pod replicas
593+
// and the generation metadata, in order to detect if the deployment is rolling out for am ingresscontroller configuration
594+
// change or due to an infrastructure change (i.e. node reboot due to update or scale up or down)
595+
func computeDeploymentRollingOutStatusAndReason(hasNewReplicaSetAvailable bool, deployment *appsv1.Deployment) string {
596+
var (
597+
wantReplicas = int32(0)
598+
haveReplicas = deployment.Status.Replicas
599+
updatedReplicas = deployment.Status.UpdatedReplicas
600+
availableReplicas = deployment.Status.AvailableReplicas
601+
readyReplicas = deployment.Status.ReadyReplicas
602+
)
603+
604+
if deployment.Spec.Replicas != nil {
605+
wantReplicas = *deployment.Spec.Replicas
606+
}
607+
608+
if hasNewReplicaSetAvailable && deployment.Status.ObservedGeneration == deployment.Generation &&
609+
updatedReplicas == wantReplicas {
610+
if haveReplicas != wantReplicas {
611+
return ReplicasStabilizing.String()
612+
}
613+
if readyReplicas < wantReplicas || availableReplicas < wantReplicas {
614+
return PodsStarting.String()
615+
}
616+
}
617+
618+
// Additional safety check: Even without NewReplicaSetAvailable, if everything
619+
// matches except readiness, treat it as pods starting up
620+
if deployment.Status.ObservedGeneration == deployment.Generation &&
621+
updatedReplicas == wantReplicas &&
622+
haveReplicas == wantReplicas &&
623+
(readyReplicas < wantReplicas || availableReplicas < wantReplicas) {
624+
return PodsStarting.String()
625+
}
626+
return DeploymentRollingOut.String()
627+
}
628+
548629
// computeIngressDegradedCondition computes the ingresscontroller's "Degraded"
549630
// status condition, which aggregates other status conditions that can indicate
550631
// a degraded state. In addition, computeIngressDegradedCondition returns a
@@ -593,12 +674,7 @@ func computeIngressDegradedCondition(conditions []operatorv1.OperatorCondition,
593674
// Only check the default ingress controller for the canary
594675
// success status condition.
595676
if icName == manifests.DefaultIngressControllerName {
596-
canaryCond := struct {
597-
condition string
598-
status operatorv1.ConditionStatus
599-
ifConditionsTrue []string
600-
gracePeriod time.Duration
601-
}{
677+
canaryCond := expectedCondition{
602678
condition: IngressControllerCanaryCheckSuccessConditionType,
603679
status: operatorv1.ConditionTrue,
604680
gracePeriod: time.Second * 60,
@@ -765,6 +841,12 @@ func computeIngressProgressingCondition(conditions []operatorv1.OperatorConditio
765841
{
766842
condition: IngressControllerDeploymentRollingOutConditionType,
767843
status: operatorv1.ConditionFalse,
844+
// Ignore infrastructure-driven deployment rollouts when computing
845+
// the IngressController's Progressing condition.
846+
ignoreReasons: []string{
847+
"ReplicasStabilizing", // Node reboots, pod evictions
848+
"PodsStarting", // Pods restarting after infrastructure events
849+
},
768850
},
769851
}
770852

pkg/operator/controller/ingress/status_test.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,9 @@ func Test_computeDeploymentRollingOutCondition(t *testing.T) {
503503
replicasAvailable *int32
504504
expectStatus operatorv1.ConditionStatus
505505
expectMessageContains string
506+
expectReasonContains string
507+
differentGeneration bool
508+
hasNewReplicaSet bool
506509
}{
507510
{
508511
name: "Router pod replicas not rolling out",
@@ -576,6 +579,42 @@ func Test_computeDeploymentRollingOutCondition(t *testing.T) {
576579
replicasAvailable: pointer.Int32(2),
577580
expectMessageContains: "Deployment is not actively rolling out",
578581
},
582+
{
583+
name: "Router pods replicas available < updated and generation is equal to observedGeneration",
584+
expectStatus: operatorv1.ConditionTrue,
585+
replicasHave: pointer.Int32(4),
586+
replicasWanted: pointer.Int32(4),
587+
replicasUpdated: pointer.Int32(4),
588+
replicasAvailable: pointer.Int32(1),
589+
expectMessageContains: "1 of 4 updated replica(s) are available",
590+
expectReasonContains: PodsStarting.String(),
591+
differentGeneration: false,
592+
hasNewReplicaSet: true,
593+
},
594+
{
595+
name: "Router pod replicas have > updated, generation is equal to observedGeneration and NewReplicaSet created",
596+
expectStatus: operatorv1.ConditionTrue,
597+
replicasHave: pointer.Int32(3),
598+
replicasWanted: pointer.Int32(1),
599+
replicasUpdated: pointer.Int32(1),
600+
replicasAvailable: pointer.Int32(1),
601+
expectMessageContains: "2 old replica(s) are pending termination",
602+
expectReasonContains: ReplicasStabilizing.String(),
603+
differentGeneration: false,
604+
hasNewReplicaSet: true,
605+
},
606+
{
607+
name: "Router pod replicas have/updated < want, generation is equal to observedGeneration and NewReplicaSet created",
608+
expectStatus: operatorv1.ConditionTrue,
609+
replicasHave: pointer.Int32(4),
610+
replicasWanted: pointer.Int32(4),
611+
replicasUpdated: pointer.Int32(4),
612+
replicasAvailable: pointer.Int32(1),
613+
expectMessageContains: "Waiting for router deployment rollout to finish: 1 of 4 updated replica(s) are available...\n",
614+
expectReasonContains: PodsStarting.String(),
615+
differentGeneration: false,
616+
hasNewReplicaSet: true,
617+
},
579618
}
580619
for _, test := range tests {
581620
t.Run(test.name, func(t *testing.T) {
@@ -589,13 +628,29 @@ func Test_computeDeploymentRollingOutCondition(t *testing.T) {
589628
UpdatedReplicas: *test.replicasUpdated,
590629
},
591630
}
631+
if !test.differentGeneration {
632+
routerDeploy.ObjectMeta.Generation = 1
633+
routerDeploy.Status.ObservedGeneration = 1
634+
}
635+
if test.hasNewReplicaSet {
636+
routerDeploy.Status.Conditions = []appsv1.DeploymentCondition{
637+
{
638+
Type: appsv1.DeploymentProgressing,
639+
Status: corev1.ConditionTrue,
640+
Reason: "NewReplicaSetAvailable",
641+
},
642+
}
643+
}
592644
actual := computeDeploymentRollingOutCondition(routerDeploy)
593645
if actual.Status != test.expectStatus {
594646
t.Errorf("expected status to be %s, got %s", test.expectStatus, actual.Status)
595647
}
596648
if len(test.expectMessageContains) != 0 && !strings.Contains(actual.Message, test.expectMessageContains) {
597649
t.Errorf("expected message to include %q, got %q", test.expectMessageContains, actual.Message)
598650
}
651+
if len(test.expectReasonContains) != 0 && !strings.Contains(actual.Reason, test.expectReasonContains) {
652+
t.Errorf("expected reason to include %q, got %q", test.expectReasonContains, actual.Reason)
653+
}
599654
})
600655
}
601656
}
@@ -1701,6 +1756,60 @@ func Test_computeIngressProgressingCondition(t *testing.T) {
17011756
},
17021757
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionTrue},
17031758
},
1759+
{
1760+
description: "load balancer is not progressing, but unmanaged load balancer type and router deployment is rolling out due to node reboot (pods starting)",
1761+
conditions: []operatorv1.OperatorCondition{
1762+
{Type: IngressControllerLoadBalancerProgressingConditionType, Status: operatorv1.ConditionFalse},
1763+
{Type: IngressControllerDeploymentRollingOutConditionType, Status: operatorv1.ConditionTrue, Reason: PodsStarting.String()},
1764+
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionFalse},
1765+
},
1766+
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionFalse},
1767+
},
1768+
{
1769+
description: "load balancer is progressing, but unmanaged load balancer type and router deployment is rolling out due to node reboot (pods starting)",
1770+
conditions: []operatorv1.OperatorCondition{
1771+
{Type: IngressControllerLoadBalancerProgressingConditionType, Status: operatorv1.ConditionTrue},
1772+
{Type: IngressControllerDeploymentRollingOutConditionType, Status: operatorv1.ConditionTrue, Reason: PodsStarting.String()},
1773+
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionFalse},
1774+
},
1775+
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionFalse},
1776+
},
1777+
{
1778+
description: "load balancer and unmanaged load balancer are progressing, but router deployment is rolling out due to node reboot (pods starting)",
1779+
conditions: []operatorv1.OperatorCondition{
1780+
{Type: IngressControllerLoadBalancerProgressingConditionType, Status: operatorv1.ConditionTrue},
1781+
{Type: IngressControllerDeploymentRollingOutConditionType, Status: operatorv1.ConditionTrue, Reason: PodsStarting.String()},
1782+
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionTrue},
1783+
},
1784+
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionTrue},
1785+
},
1786+
{
1787+
description: "load balancer is not progressing, but unmanaged load balancer type and router deployment is rolling out due to node reboot (replicas stabilizing)",
1788+
conditions: []operatorv1.OperatorCondition{
1789+
{Type: IngressControllerLoadBalancerProgressingConditionType, Status: operatorv1.ConditionFalse},
1790+
{Type: IngressControllerDeploymentRollingOutConditionType, Status: operatorv1.ConditionTrue, Reason: ReplicasStabilizing.String()},
1791+
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionFalse},
1792+
},
1793+
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionFalse},
1794+
},
1795+
{
1796+
description: "load balancer is progressing, but unmanaged load balancer type and router deployment is rolling out due to node reboot (replicas stabilizing)",
1797+
conditions: []operatorv1.OperatorCondition{
1798+
{Type: IngressControllerLoadBalancerProgressingConditionType, Status: operatorv1.ConditionTrue},
1799+
{Type: IngressControllerDeploymentRollingOutConditionType, Status: operatorv1.ConditionTrue, Reason: ReplicasStabilizing.String()},
1800+
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionFalse},
1801+
},
1802+
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionFalse},
1803+
},
1804+
{
1805+
description: "load balancer and unmanaged load balancer are progressing, but router deployment is rolling out due to node reboot (replicas stabilizing)",
1806+
conditions: []operatorv1.OperatorCondition{
1807+
{Type: IngressControllerLoadBalancerProgressingConditionType, Status: operatorv1.ConditionTrue},
1808+
{Type: IngressControllerDeploymentRollingOutConditionType, Status: operatorv1.ConditionTrue, Reason: ReplicasStabilizing.String()},
1809+
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionTrue},
1810+
},
1811+
expect: operatorv1.OperatorCondition{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionTrue},
1812+
},
17041813
{
17051814
description: "load balancer is not progressing and router deployment is rolling out",
17061815
conditions: []operatorv1.OperatorCondition{

test/e2e/operator_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ var (
7878
{Type: operatorv1.IngressControllerAvailableConditionType, Status: operatorv1.ConditionTrue},
7979
{Type: operatorv1.LoadBalancerManagedIngressConditionType, Status: operatorv1.ConditionFalse},
8080
{Type: operatorv1.DNSManagedIngressConditionType, Status: operatorv1.ConditionFalse},
81+
{Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionFalse},
8182
}
8283
availableConditionsForIngressControllerWithNodePort = []operatorv1.OperatorCondition{
8384
{Type: operatorv1.IngressControllerAvailableConditionType, Status: operatorv1.ConditionTrue},

0 commit comments

Comments
 (0)