Skip to content

Commit fc703a7

Browse files
Merge pull request #1855 from atiratree/fix-workload-conditions
OCPBUGS-23435: Fix Progressing and Degraded condition during workload rollouts.
2 parents 9b8048f + 8989fea commit fc703a7

File tree

2 files changed

+149
-5
lines changed

2 files changed

+149
-5
lines changed

pkg/operator/apiserver/controller/workload/workload.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,9 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
290290

291291
// If the workload is up to date, then we are no longer progressing
292292
workloadAtHighestGeneration := workload.ObjectMeta.Generation == workload.Status.ObservedGeneration
293-
workloadIsBeingUpdated := workload.Status.UpdatedReplicas < desiredReplicas
293+
// Update is done when all pods have been updated to the latest revision
294+
// and the deployment controller has reported NewReplicaSetAvailable
295+
workloadIsBeingUpdated := !workloadAtHighestGeneration || !hasDeploymentProgressed(workload.Status)
294296
workloadIsBeingUpdatedTooLong, err := isUpdatingTooLong(previousStatus, *deploymentProgressingCondition.Type)
295297
if !workloadAtHighestGeneration {
296298
deploymentProgressingCondition = deploymentProgressingCondition.
@@ -301,8 +303,15 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
301303
deploymentProgressingCondition = deploymentProgressingCondition.
302304
WithStatus(operatorv1.ConditionTrue).
303305
WithReason("PodsUpdating").
304-
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest generation", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas))
306+
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest generation and %d/%d pods are available", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas, workload.Status.AvailableReplicas, desiredReplicas))
305307
} else {
308+
// Terminating pods don't account for any of the other status fields but
309+
// still can exist in a state when they are accepting connections and would
310+
// contribute to unexpected behavior when we report Progressing=False.
311+
// The case of too many pods might occur for example if `TerminationGracePeriodSeconds` is set
312+
//
313+
// The workload should ensure this does not happen by using for example EnsureAtMostOnePodPerNode
314+
// so that the old pods terminate before the new ones are started.
306315
deploymentProgressingCondition = deploymentProgressingCondition.
307316
WithStatus(operatorv1.ConditionFalse).
308317
WithReason("AsExpected")
@@ -354,6 +363,17 @@ func isUpdatingTooLong(operatorStatus *operatorv1.OperatorStatus, progressingCon
354363
return progressing != nil && progressing.Status == operatorv1.ConditionTrue && time.Now().After(progressing.LastTransitionTime.Add(15*time.Minute)), nil
355364
}
356365

366+
// hasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
367+
// via the DeploymentProgressing condition
368+
func hasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
369+
for _, cond := range status.Conditions {
370+
if cond.Type == appsv1.DeploymentProgressing {
371+
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
372+
}
373+
}
374+
return false
375+
}
376+
357377
// EnsureAtMostOnePodPerNode updates the deployment spec to prevent more than
358378
// one pod of a given replicaset from landing on a node. It accomplishes this
359379
// by adding a label on the template and updates the pod anti-affinity term to include that label.

pkg/operator/apiserver/controller/workload/workload_test.go

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
141141
},
142142
Status: appsv1.DeploymentStatus{
143143
AvailableReplicas: 0,
144+
Conditions: []appsv1.DeploymentCondition{
145+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionFalse, LastUpdateTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)), LastTransitionTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)), Reason: "ProgressDeadlineExceeded", Message: "timed out"},
146+
},
144147
},
145148
},
146149
pods: []*corev1.Pod{
@@ -193,7 +196,7 @@ func TestUpdateOperatorStatus(t *testing.T) {
193196
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
194197
Status: operatorv1.ConditionTrue,
195198
Reason: "PodsUpdating",
196-
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation",
199+
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation and 0/3 pods are available",
197200
},
198201
}
199202
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
@@ -212,6 +215,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
212215
},
213216
Status: appsv1.DeploymentStatus{
214217
AvailableReplicas: 0,
218+
Conditions: []appsv1.DeploymentCondition{
219+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "ReplicaSetUpdated", Message: "progressing"},
220+
},
215221
},
216222
},
217223
pods: []*corev1.Pod{
@@ -263,7 +269,7 @@ func TestUpdateOperatorStatus(t *testing.T) {
263269
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
264270
Status: operatorv1.ConditionTrue,
265271
Reason: "PodsUpdating",
266-
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation",
272+
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation and 0/3 pods are available",
267273
},
268274
}
269275
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
@@ -283,6 +289,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
283289
Status: appsv1.DeploymentStatus{
284290
AvailableReplicas: 2,
285291
UpdatedReplicas: 3,
292+
Conditions: []appsv1.DeploymentCondition{
293+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
294+
},
286295
},
287296
},
288297
pods: []*corev1.Pod{
@@ -346,6 +355,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
346355
Status: appsv1.DeploymentStatus{
347356
AvailableReplicas: 3,
348357
UpdatedReplicas: 3,
358+
Conditions: []appsv1.DeploymentCondition{
359+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
360+
},
349361
},
350362
},
351363
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
@@ -388,8 +400,65 @@ func TestUpdateOperatorStatus(t *testing.T) {
388400
Replicas: ptr.To[int32](3),
389401
},
390402
Status: appsv1.DeploymentStatus{
403+
Replicas: 3,
404+
ReadyReplicas: 3,
391405
AvailableReplicas: 3,
406+
UpdatedReplicas: 3,
407+
ObservedGeneration: 99,
408+
Conditions: []appsv1.DeploymentCondition{
409+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
410+
},
411+
},
412+
},
413+
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
414+
expectedConditions := []operatorv1.OperatorCondition{
415+
{
416+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeAvailable),
417+
Status: operatorv1.ConditionTrue,
418+
Reason: "AsExpected",
419+
Message: "",
420+
},
421+
{
422+
Type: fmt.Sprintf("%sWorkloadDegraded", defaultControllerName),
423+
Status: operatorv1.ConditionFalse,
424+
},
425+
{
426+
Type: fmt.Sprintf("%sDeploymentDegraded", defaultControllerName),
427+
Status: operatorv1.ConditionFalse,
428+
Reason: "AsExpected",
429+
Message: "",
430+
},
431+
{
432+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
433+
Status: operatorv1.ConditionTrue,
434+
Reason: "NewGeneration",
435+
Message: "deployment/apiserver.openshift-apiserver: observed generation is 99, desired generation is 100.",
436+
},
437+
}
438+
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
439+
},
440+
},
441+
442+
{
443+
name: "scenario: rare case when we have an outdated (generation) workload and one old replica failing is but it will be picked up soon by the new rollout thus we are available and we are progressing",
444+
workload: &appsv1.Deployment{
445+
ObjectMeta: metav1.ObjectMeta{
446+
Name: "apiserver",
447+
Namespace: "openshift-apiserver",
448+
Generation: 100,
449+
},
450+
Spec: appsv1.DeploymentSpec{
451+
Replicas: ptr.To[int32](3),
452+
},
453+
Status: appsv1.DeploymentStatus{
454+
Replicas: 3,
455+
ReadyReplicas: 2,
456+
AvailableReplicas: 2,
457+
UpdatedReplicas: 3,
392458
ObservedGeneration: 99,
459+
Conditions: []appsv1.DeploymentCondition{
460+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
461+
},
393462
},
394463
},
395464
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
@@ -470,6 +539,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
470539
AvailableReplicas: 2,
471540
UpdatedReplicas: 1,
472541
ObservedGeneration: 2,
542+
Conditions: []appsv1.DeploymentCondition{
543+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "ReplicaSetUpdated", Message: "progressing"},
544+
},
473545
},
474546
},
475547
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
@@ -494,7 +566,7 @@ func TestUpdateOperatorStatus(t *testing.T) {
494566
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
495567
Status: operatorv1.ConditionTrue,
496568
Reason: "PodsUpdating",
497-
Message: "deployment/apiserver.openshift-apiserver: 1/3 pods have been updated to the latest generation",
569+
Message: "deployment/apiserver.openshift-apiserver: 1/3 pods have been updated to the latest generation and 2/3 pods are available",
498570
},
499571
}
500572
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
@@ -513,6 +585,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
513585
Status: appsv1.DeploymentStatus{
514586
AvailableReplicas: 3,
515587
UpdatedReplicas: 3,
588+
Conditions: []appsv1.DeploymentCondition{
589+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
590+
},
516591
},
517592
},
518593
previousConditions: []operatorv1.OperatorCondition{
@@ -548,6 +623,55 @@ func TestUpdateOperatorStatus(t *testing.T) {
548623
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
549624
},
550625
},
626+
{
627+
name: "some pods rolled out and waiting for old terminating pod before we can progress further",
628+
workload: &appsv1.Deployment{
629+
ObjectMeta: metav1.ObjectMeta{
630+
Name: "apiserver",
631+
Namespace: "openshift-apiserver",
632+
},
633+
Spec: appsv1.DeploymentSpec{
634+
Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
635+
Template: corev1.PodTemplateSpec{ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"foo": "bar"}}},
636+
Replicas: ptr.To[int32](3),
637+
},
638+
Status: appsv1.DeploymentStatus{
639+
AvailableReplicas: 2,
640+
ReadyReplicas: 2,
641+
UpdatedReplicas: 3,
642+
Conditions: []appsv1.DeploymentCondition{
643+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "ReplicaSetUpdated", Message: "progressing"},
644+
},
645+
},
646+
},
647+
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
648+
expectedConditions := []operatorv1.OperatorCondition{
649+
{
650+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeAvailable),
651+
Status: operatorv1.ConditionTrue,
652+
Reason: "AsExpected",
653+
Message: "",
654+
},
655+
{
656+
Type: fmt.Sprintf("%sWorkloadDegraded", defaultControllerName),
657+
Status: operatorv1.ConditionFalse,
658+
},
659+
{
660+
Type: fmt.Sprintf("%sDeploymentDegraded", defaultControllerName),
661+
Status: operatorv1.ConditionFalse,
662+
Reason: "AsExpected",
663+
Message: "",
664+
},
665+
{
666+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
667+
Status: operatorv1.ConditionTrue,
668+
Reason: "PodsUpdating",
669+
Message: "deployment/apiserver.openshift-apiserver: 3/3 pods have been updated to the latest generation and 2/3 pods are available",
670+
},
671+
}
672+
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
673+
},
674+
},
551675
}
552676

553677
for _, scenario := range scenarios {

0 commit comments

Comments
 (0)