Skip to content

Commit 8989fea

Browse files
atiratreestlaz
andcommitted
Fix Progressing and Degraded condition during workload rollouts.
Deployment rollout should depend on the Deployment Progressing condition to asses the state of the deployment as there can be unavailable pods during the deployment. This is capped by a 15 minute deadline. As a result - Degraded condition should be False during the deployment rollout. - Progressing condition should be True during the deployment rollout. Co-authored-by: Standa Láznička <[email protected]>
1 parent 4c1f8b4 commit 8989fea

File tree

2 files changed

+149
-5
lines changed

2 files changed

+149
-5
lines changed

pkg/operator/apiserver/controller/workload/workload.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,9 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
289289

290290
// If the workload is up to date, then we are no longer progressing
291291
workloadAtHighestGeneration := workload.ObjectMeta.Generation == workload.Status.ObservedGeneration
292-
workloadIsBeingUpdated := workload.Status.UpdatedReplicas < desiredReplicas
292+
// Update is done when all pods have been updated to the latest revision
293+
// and the deployment controller has reported NewReplicaSetAvailable
294+
workloadIsBeingUpdated := !workloadAtHighestGeneration || !hasDeploymentProgressed(workload.Status)
293295
workloadIsBeingUpdatedTooLong, err := isUpdatingTooLong(previousStatus, *deploymentProgressingCondition.Type)
294296
if !workloadAtHighestGeneration {
295297
deploymentProgressingCondition = deploymentProgressingCondition.
@@ -300,8 +302,15 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
300302
deploymentProgressingCondition = deploymentProgressingCondition.
301303
WithStatus(operatorv1.ConditionTrue).
302304
WithReason("PodsUpdating").
303-
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest generation", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas))
305+
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest generation and %d/%d pods are available", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas, workload.Status.AvailableReplicas, desiredReplicas))
304306
} else {
307+
// Terminating pods don't account for any of the other status fields but
308+
// still can exist in a state when they are accepting connections and would
309+
// contribute to unexpected behavior when we report Progressing=False.
310+
// The case of too many pods might occur for example if `TerminationGracePeriodSeconds` is set
311+
//
312+
// The workload should ensure this does not happen by using for example EnsureAtMostOnePodPerNode
313+
// so that the old pods terminate before the new ones are started.
305314
deploymentProgressingCondition = deploymentProgressingCondition.
306315
WithStatus(operatorv1.ConditionFalse).
307316
WithReason("AsExpected")
@@ -353,6 +362,17 @@ func isUpdatingTooLong(operatorStatus *operatorv1.OperatorStatus, progressingCon
353362
return progressing != nil && progressing.Status == operatorv1.ConditionTrue && time.Now().After(progressing.LastTransitionTime.Add(15*time.Minute)), nil
354363
}
355364

365+
// hasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
366+
// via the DeploymentProgressing condition
367+
func hasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
368+
for _, cond := range status.Conditions {
369+
if cond.Type == appsv1.DeploymentProgressing {
370+
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
371+
}
372+
}
373+
return false
374+
}
375+
356376
// EnsureAtMostOnePodPerNode updates the deployment spec to prevent more than
357377
// one pod of a given replicaset from landing on a node. It accomplishes this
358378
// by adding a label on the template and updates the pod anti-affinity term to include that label.

pkg/operator/apiserver/controller/workload/workload_test.go

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
140140
},
141141
Status: appsv1.DeploymentStatus{
142142
AvailableReplicas: 0,
143+
Conditions: []appsv1.DeploymentCondition{
144+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionFalse, LastUpdateTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)), LastTransitionTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)), Reason: "ProgressDeadlineExceeded", Message: "timed out"},
145+
},
143146
},
144147
},
145148
pods: []*corev1.Pod{
@@ -192,7 +195,7 @@ func TestUpdateOperatorStatus(t *testing.T) {
192195
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
193196
Status: operatorv1.ConditionTrue,
194197
Reason: "PodsUpdating",
195-
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation",
198+
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation and 0/3 pods are available",
196199
},
197200
}
198201
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
@@ -211,6 +214,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
211214
},
212215
Status: appsv1.DeploymentStatus{
213216
AvailableReplicas: 0,
217+
Conditions: []appsv1.DeploymentCondition{
218+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "ReplicaSetUpdated", Message: "progressing"},
219+
},
214220
},
215221
},
216222
pods: []*corev1.Pod{
@@ -262,7 +268,7 @@ func TestUpdateOperatorStatus(t *testing.T) {
262268
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
263269
Status: operatorv1.ConditionTrue,
264270
Reason: "PodsUpdating",
265-
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation",
271+
Message: "deployment/apiserver.openshift-apiserver: 0/3 pods have been updated to the latest generation and 0/3 pods are available",
266272
},
267273
}
268274
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
@@ -282,6 +288,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
282288
Status: appsv1.DeploymentStatus{
283289
AvailableReplicas: 2,
284290
UpdatedReplicas: 3,
291+
Conditions: []appsv1.DeploymentCondition{
292+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
293+
},
285294
},
286295
},
287296
pods: []*corev1.Pod{
@@ -345,6 +354,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
345354
Status: appsv1.DeploymentStatus{
346355
AvailableReplicas: 3,
347356
UpdatedReplicas: 3,
357+
Conditions: []appsv1.DeploymentCondition{
358+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
359+
},
348360
},
349361
},
350362
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
@@ -387,8 +399,65 @@ func TestUpdateOperatorStatus(t *testing.T) {
387399
Replicas: ptr.To[int32](3),
388400
},
389401
Status: appsv1.DeploymentStatus{
402+
Replicas: 3,
403+
ReadyReplicas: 3,
390404
AvailableReplicas: 3,
405+
UpdatedReplicas: 3,
406+
ObservedGeneration: 99,
407+
Conditions: []appsv1.DeploymentCondition{
408+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
409+
},
410+
},
411+
},
412+
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
413+
expectedConditions := []operatorv1.OperatorCondition{
414+
{
415+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeAvailable),
416+
Status: operatorv1.ConditionTrue,
417+
Reason: "AsExpected",
418+
Message: "",
419+
},
420+
{
421+
Type: fmt.Sprintf("%sWorkloadDegraded", defaultControllerName),
422+
Status: operatorv1.ConditionFalse,
423+
},
424+
{
425+
Type: fmt.Sprintf("%sDeploymentDegraded", defaultControllerName),
426+
Status: operatorv1.ConditionFalse,
427+
Reason: "AsExpected",
428+
Message: "",
429+
},
430+
{
431+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
432+
Status: operatorv1.ConditionTrue,
433+
Reason: "NewGeneration",
434+
Message: "deployment/apiserver.openshift-apiserver: observed generation is 99, desired generation is 100.",
435+
},
436+
}
437+
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
438+
},
439+
},
440+
441+
{
442+
name: "scenario: rare case when we have an outdated (generation) workload and one old replica failing is but it will be picked up soon by the new rollout thus we are available and we are progressing",
443+
workload: &appsv1.Deployment{
444+
ObjectMeta: metav1.ObjectMeta{
445+
Name: "apiserver",
446+
Namespace: "openshift-apiserver",
447+
Generation: 100,
448+
},
449+
Spec: appsv1.DeploymentSpec{
450+
Replicas: ptr.To[int32](3),
451+
},
452+
Status: appsv1.DeploymentStatus{
453+
Replicas: 3,
454+
ReadyReplicas: 2,
455+
AvailableReplicas: 2,
456+
UpdatedReplicas: 3,
391457
ObservedGeneration: 99,
458+
Conditions: []appsv1.DeploymentCondition{
459+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
460+
},
392461
},
393462
},
394463
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
@@ -469,6 +538,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
469538
AvailableReplicas: 2,
470539
UpdatedReplicas: 1,
471540
ObservedGeneration: 2,
541+
Conditions: []appsv1.DeploymentCondition{
542+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "ReplicaSetUpdated", Message: "progressing"},
543+
},
472544
},
473545
},
474546
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
@@ -493,7 +565,7 @@ func TestUpdateOperatorStatus(t *testing.T) {
493565
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
494566
Status: operatorv1.ConditionTrue,
495567
Reason: "PodsUpdating",
496-
Message: "deployment/apiserver.openshift-apiserver: 1/3 pods have been updated to the latest generation",
568+
Message: "deployment/apiserver.openshift-apiserver: 1/3 pods have been updated to the latest generation and 2/3 pods are available",
497569
},
498570
}
499571
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
@@ -512,6 +584,9 @@ func TestUpdateOperatorStatus(t *testing.T) {
512584
Status: appsv1.DeploymentStatus{
513585
AvailableReplicas: 3,
514586
UpdatedReplicas: 3,
587+
Conditions: []appsv1.DeploymentCondition{
588+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "NewReplicaSetAvailable", Message: "has successfully progressed"},
589+
},
515590
},
516591
},
517592
previousConditions: []operatorv1.OperatorCondition{
@@ -547,6 +622,55 @@ func TestUpdateOperatorStatus(t *testing.T) {
547622
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
548623
},
549624
},
625+
{
626+
name: "some pods rolled out and waiting for old terminating pod before we can progress further",
627+
workload: &appsv1.Deployment{
628+
ObjectMeta: metav1.ObjectMeta{
629+
Name: "apiserver",
630+
Namespace: "openshift-apiserver",
631+
},
632+
Spec: appsv1.DeploymentSpec{
633+
Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
634+
Template: corev1.PodTemplateSpec{ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"foo": "bar"}}},
635+
Replicas: ptr.To[int32](3),
636+
},
637+
Status: appsv1.DeploymentStatus{
638+
AvailableReplicas: 2,
639+
ReadyReplicas: 2,
640+
UpdatedReplicas: 3,
641+
Conditions: []appsv1.DeploymentCondition{
642+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), LastTransitionTime: metav1.Now(), Reason: "ReplicaSetUpdated", Message: "progressing"},
643+
},
644+
},
645+
},
646+
validateOperatorStatus: func(actualStatus *operatorv1.OperatorStatus) error {
647+
expectedConditions := []operatorv1.OperatorCondition{
648+
{
649+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeAvailable),
650+
Status: operatorv1.ConditionTrue,
651+
Reason: "AsExpected",
652+
Message: "",
653+
},
654+
{
655+
Type: fmt.Sprintf("%sWorkloadDegraded", defaultControllerName),
656+
Status: operatorv1.ConditionFalse,
657+
},
658+
{
659+
Type: fmt.Sprintf("%sDeploymentDegraded", defaultControllerName),
660+
Status: operatorv1.ConditionFalse,
661+
Reason: "AsExpected",
662+
Message: "",
663+
},
664+
{
665+
Type: fmt.Sprintf("%sDeployment%s", defaultControllerName, operatorv1.OperatorStatusTypeProgressing),
666+
Status: operatorv1.ConditionTrue,
667+
Reason: "PodsUpdating",
668+
Message: "deployment/apiserver.openshift-apiserver: 3/3 pods have been updated to the latest generation and 2/3 pods are available",
669+
},
670+
}
671+
return areCondidtionsEqual(expectedConditions, actualStatus.Conditions)
672+
},
673+
},
550674
}
551675

552676
for _, scenario := range scenarios {

0 commit comments

Comments
 (0)