Skip to content

Commit d4d6b17

Browse files
authored
Update maxUnavailable calculation for leader StatefulSet (#781)
* Correctly set maxUnavailable on the leader StatefulSet
* Fix sts.maxUnavailable calculation edge cases: cap the maxSurge calculation at lws.replicas, and evaluate percentages against sts.replicas rather than lws.replicas. Also add unit tests to prevent regressions.
1 parent 7a4e3c1 commit d4d6b17

File tree

7 files changed

+402
-9
lines changed

7 files changed

+402
-9
lines changed

pkg/controllers/leaderworkerset_controller.go

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,14 @@ func (r *LeaderWorkerSetReconciler) Reconcile(ctx context.Context, req ctrl.Requ
166166
r.Record.Eventf(lws, revision, corev1.EventTypeNormal, GroupsProgressing, Create, fmt.Sprintf("Created leader statefulset %s", lws.Name))
167167
} else if !lwsUpdated && partition != *leaderSts.Spec.UpdateStrategy.RollingUpdate.Partition {
168168
// An event is logged to track update progress.
169-
r.Record.Eventf(lws, revision, corev1.EventTypeNormal, GroupsUpdating, Update, fmt.Sprintf("Updating replicas %d to %d", *leaderSts.Spec.UpdateStrategy.RollingUpdate.Partition, partition))
169+
oldPartition := *leaderSts.Spec.UpdateStrategy.RollingUpdate.Partition
170+
var updateMsg string
171+
if oldPartition-1 == partition {
172+
updateMsg = fmt.Sprintf("Updating replica %d", partition)
173+
} else {
174+
updateMsg = fmt.Sprintf("Updating replicas %d to %d (inclusive)", partition, oldPartition-1)
175+
}
176+
r.Record.Eventf(lws, revision, corev1.EventTypeNormal, GroupsUpdating, Update, updateMsg)
170177
}
171178

172179
// Create headless service if it does not exist.
@@ -782,6 +789,27 @@ func constructLeaderStatefulSetApplyConfiguration(lws *leaderworkerset.LeaderWor
782789

783790
podTemplateApplyConfiguration.WithAnnotations(podAnnotations)
784791

792+
lwsReplicas := int(*lws.Spec.Replicas)
793+
lwsMaxUnavailable, err := intstr.GetScaledValueFromIntOrPercent(&lws.Spec.RolloutStrategy.RollingUpdateConfiguration.MaxUnavailable, lwsReplicas, false)
794+
if err != nil {
795+
return nil, err
796+
}
797+
lwsMaxSurge, err := intstr.GetScaledValueFromIntOrPercent(&lws.Spec.RolloutStrategy.RollingUpdateConfiguration.MaxSurge, lwsReplicas, true)
798+
if err != nil {
799+
return nil, err
800+
}
801+
if lwsMaxSurge > lwsReplicas {
802+
lwsMaxSurge = lwsReplicas
803+
}
804+
stsMaxUnavailableInt := int32(lwsMaxUnavailable + lwsMaxSurge)
805+
// lwsMaxUnavailable=0 and lwsMaxSurge=0 together should be blocked by webhook,
806+
// but just in case, we'll make sure that stsMaxUnavailable is at least 1.
807+
// This also handles the case when lws.Spec.Replicas is 0.
808+
if stsMaxUnavailableInt < 1 {
809+
stsMaxUnavailableInt = 1
810+
}
811+
stsMaxUnavailable := intstr.FromInt32(stsMaxUnavailableInt)
812+
785813
// construct statefulset apply configuration
786814
statefulSetConfig := appsapplyv1.StatefulSet(lws.Name, lws.Namespace).
787815
WithSpec(appsapplyv1.StatefulSetSpec().
@@ -790,7 +818,7 @@ func constructLeaderStatefulSetApplyConfiguration(lws *leaderworkerset.LeaderWor
790818
WithPodManagementPolicy(appsv1.ParallelPodManagement).
791819
WithTemplate(&podTemplateApplyConfiguration).
792820
WithUpdateStrategy(appsapplyv1.StatefulSetUpdateStrategy().WithType(appsv1.StatefulSetUpdateStrategyType(lws.Spec.RolloutStrategy.Type)).WithRollingUpdate(
793-
appsapplyv1.RollingUpdateStatefulSetStrategy().WithMaxUnavailable(lws.Spec.RolloutStrategy.RollingUpdateConfiguration.MaxUnavailable).WithPartition(partition),
821+
appsapplyv1.RollingUpdateStatefulSetStrategy().WithMaxUnavailable(stsMaxUnavailable).WithPartition(partition),
794822
)).
795823
WithSelector(metaapplyv1.LabelSelector().
796824
WithMatchLabels(map[string]string{

pkg/controllers/leaderworkerset_controller_test.go

Lines changed: 216 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
6969
revisionKey string
7070
lws *leaderworkerset.LeaderWorkerSet
7171
wantApplyConfig *appsapplyv1.StatefulSetApplyConfiguration
72+
stsReplicas *int32
7273
}{
7374
{
7475
name: "1 replica, size 1, with empty leader template, exclusive placement disabled",
@@ -336,7 +337,76 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
336337
PodManagementPolicy: ptr.To[appsv1.PodManagementPolicyType](appsv1.ParallelPodManagement),
337338
UpdateStrategy: appsapplyv1.StatefulSetUpdateStrategy().
338339
WithType(appsv1.RollingUpdateStatefulSetStrategyType).
339-
WithRollingUpdate(appsapplyv1.RollingUpdateStatefulSetStrategy().WithPartition(0).WithMaxUnavailable(intstr.FromInt32(2))),
340+
WithRollingUpdate(appsapplyv1.RollingUpdateStatefulSetStrategy().WithPartition(0).WithMaxUnavailable(intstr.FromInt32(3))),
341+
},
342+
},
343+
},
344+
{
345+
name: "0 maxUnavailable, 2 maxSurge, with empty leader template, exclusive placement disabled",
346+
revisionKey: revisionKey2,
347+
lws: wrappers.BuildBasicLeaderWorkerSet("test-sample", "default").
348+
Replica(1).
349+
RolloutStrategy(leaderworkerset.RolloutStrategy{
350+
Type: leaderworkerset.RollingUpdateStrategyType,
351+
RollingUpdateConfiguration: &leaderworkerset.RollingUpdateConfiguration{
352+
MaxUnavailable: intstr.FromInt32(0),
353+
MaxSurge: intstr.FromInt32(2),
354+
},
355+
}).
356+
WorkerTemplateSpec(wrappers.MakeWorkerPodSpec()).
357+
Size(1).
358+
RestartPolicy(leaderworkerset.RecreateGroupOnPodRestart).Obj(),
359+
wantApplyConfig: &appsapplyv1.StatefulSetApplyConfiguration{
360+
TypeMetaApplyConfiguration: metaapplyv1.TypeMetaApplyConfiguration{
361+
Kind: ptr.To[string]("StatefulSet"),
362+
APIVersion: ptr.To[string]("apps/v1"),
363+
},
364+
ObjectMetaApplyConfiguration: &metaapplyv1.ObjectMetaApplyConfiguration{
365+
Name: ptr.To[string]("test-sample"),
366+
Namespace: ptr.To[string]("default"),
367+
Labels: map[string]string{
368+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
369+
"leaderworkerset.sigs.k8s.io/template-revision-hash": revisionKey2,
370+
},
371+
Annotations: map[string]string{"leaderworkerset.sigs.k8s.io/replicas": "1"},
372+
},
373+
Spec: &appsapplyv1.StatefulSetSpecApplyConfiguration{
374+
Replicas: ptr.To[int32](1),
375+
Selector: &metaapplyv1.LabelSelectorApplyConfiguration{
376+
MatchLabels: map[string]string{
377+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
378+
"leaderworkerset.sigs.k8s.io/worker-index": "0",
379+
},
380+
},
381+
Template: &coreapplyv1.PodTemplateSpecApplyConfiguration{
382+
ObjectMetaApplyConfiguration: &metaapplyv1.ObjectMetaApplyConfiguration{
383+
Labels: map[string]string{
384+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
385+
"leaderworkerset.sigs.k8s.io/worker-index": "0",
386+
"leaderworkerset.sigs.k8s.io/template-revision-hash": revisionKey2,
387+
},
388+
Annotations: map[string]string{
389+
"leaderworkerset.sigs.k8s.io/size": "1",
390+
},
391+
},
392+
Spec: &coreapplyv1.PodSpecApplyConfiguration{
393+
Containers: []coreapplyv1.ContainerApplyConfiguration{
394+
{
395+
Name: ptr.To[string]("worker"),
396+
Image: ptr.To[string]("docker.io/nginxinc/nginx-unprivileged:1.27"),
397+
Ports: []coreapplyv1.ContainerPortApplyConfiguration{{ContainerPort: ptr.To[int32](8080), Protocol: ptr.To[corev1.Protocol](corev1.ProtocolTCP)}},
398+
Resources: &coreapplyv1.ResourceRequirementsApplyConfiguration{},
399+
},
400+
},
401+
},
402+
},
403+
ServiceName: ptr.To[string]("test-sample"),
404+
PodManagementPolicy: ptr.To[appsv1.PodManagementPolicyType](appsv1.ParallelPodManagement),
405+
UpdateStrategy: appsapplyv1.StatefulSetUpdateStrategy().
406+
WithType(appsv1.RollingUpdateStatefulSetStrategyType).
407+
// maxSurge is capped at 1 (the value of lwsReplicas),
408+
// so stsMaxUnavailableInt = 0 (lwsMaxUnavailable) + 1 (capped maxSurge) = 1.
409+
WithRollingUpdate(appsapplyv1.RollingUpdateStatefulSetStrategy().WithPartition(0).WithMaxUnavailable(intstr.FromInt32(1))),
340410
},
341411
},
342412
},
@@ -526,11 +596,155 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
526596
},
527597
},
528598
},
599+
{
600+
name: "0 replica, 0 maxUnavailable, 0 maxSurge, with empty leader template, exclusive placement disabled",
601+
revisionKey: revisionKey2,
602+
lws: wrappers.BuildBasicLeaderWorkerSet("test-sample", "default").
603+
Replica(0).
604+
RolloutStrategy(leaderworkerset.RolloutStrategy{
605+
Type: leaderworkerset.RollingUpdateStrategyType,
606+
RollingUpdateConfiguration: &leaderworkerset.RollingUpdateConfiguration{
607+
MaxUnavailable: intstr.FromInt32(0),
608+
MaxSurge: intstr.FromInt32(0),
609+
},
610+
}).
611+
WorkerTemplateSpec(wrappers.MakeWorkerPodSpec()).
612+
Size(1).
613+
RestartPolicy(leaderworkerset.RecreateGroupOnPodRestart).Obj(),
614+
wantApplyConfig: &appsapplyv1.StatefulSetApplyConfiguration{
615+
TypeMetaApplyConfiguration: metaapplyv1.TypeMetaApplyConfiguration{
616+
Kind: ptr.To[string]("StatefulSet"),
617+
APIVersion: ptr.To[string]("apps/v1"),
618+
},
619+
ObjectMetaApplyConfiguration: &metaapplyv1.ObjectMetaApplyConfiguration{
620+
Name: ptr.To[string]("test-sample"),
621+
Namespace: ptr.To[string]("default"),
622+
Labels: map[string]string{
623+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
624+
"leaderworkerset.sigs.k8s.io/template-revision-hash": revisionKey2,
625+
},
626+
Annotations: map[string]string{"leaderworkerset.sigs.k8s.io/replicas": "0"},
627+
},
628+
Spec: &appsapplyv1.StatefulSetSpecApplyConfiguration{
629+
Replicas: ptr.To[int32](0),
630+
Selector: &metaapplyv1.LabelSelectorApplyConfiguration{
631+
MatchLabels: map[string]string{
632+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
633+
"leaderworkerset.sigs.k8s.io/worker-index": "0",
634+
},
635+
},
636+
Template: &coreapplyv1.PodTemplateSpecApplyConfiguration{
637+
ObjectMetaApplyConfiguration: &metaapplyv1.ObjectMetaApplyConfiguration{
638+
Labels: map[string]string{
639+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
640+
"leaderworkerset.sigs.k8s.io/worker-index": "0",
641+
"leaderworkerset.sigs.k8s.io/template-revision-hash": revisionKey2,
642+
},
643+
Annotations: map[string]string{
644+
"leaderworkerset.sigs.k8s.io/size": "1",
645+
},
646+
},
647+
Spec: &coreapplyv1.PodSpecApplyConfiguration{
648+
Containers: []coreapplyv1.ContainerApplyConfiguration{
649+
{
650+
Name: ptr.To[string]("worker"),
651+
Image: ptr.To[string]("docker.io/nginxinc/nginx-unprivileged:1.27"),
652+
Ports: []coreapplyv1.ContainerPortApplyConfiguration{{ContainerPort: ptr.To[int32](8080), Protocol: ptr.To[corev1.Protocol](corev1.ProtocolTCP)}},
653+
Resources: &coreapplyv1.ResourceRequirementsApplyConfiguration{},
654+
},
655+
},
656+
},
657+
},
658+
ServiceName: ptr.To[string]("test-sample"),
659+
PodManagementPolicy: ptr.To[appsv1.PodManagementPolicyType](appsv1.ParallelPodManagement),
660+
UpdateStrategy: appsapplyv1.StatefulSetUpdateStrategy().
661+
WithType(appsv1.RollingUpdateStatefulSetStrategyType).
662+
// Sts maxUnavailable is forced to be at least 1,
663+
// even if lws maxUnavailable=0 and lws maxSurge=0.
664+
WithRollingUpdate(appsapplyv1.RollingUpdateStatefulSetStrategy().WithPartition(0).WithMaxUnavailable(intstr.FromInt32(1))),
665+
},
666+
},
667+
},
668+
{
669+
// Validates maxSurge uses lws replicas, not sts replicas.
670+
name: "1 maxUnavailable, 50% maxSurge, 2 lws replicas, 3 sts replicas currently",
671+
revisionKey: revisionKey2,
672+
stsReplicas: ptr.To[int32](3),
673+
lws: wrappers.BuildBasicLeaderWorkerSet("test-sample", "default").
674+
Replica(2).
675+
RolloutStrategy(leaderworkerset.RolloutStrategy{
676+
Type: leaderworkerset.RollingUpdateStrategyType,
677+
RollingUpdateConfiguration: &leaderworkerset.RollingUpdateConfiguration{
678+
MaxUnavailable: intstr.FromInt32(1),
679+
MaxSurge: intstr.FromString("50%"),
680+
},
681+
}).
682+
WorkerTemplateSpec(wrappers.MakeWorkerPodSpec()).
683+
Size(1).
684+
RestartPolicy(leaderworkerset.RecreateGroupOnPodRestart).Obj(),
685+
wantApplyConfig: &appsapplyv1.StatefulSetApplyConfiguration{
686+
TypeMetaApplyConfiguration: metaapplyv1.TypeMetaApplyConfiguration{
687+
Kind: ptr.To[string]("StatefulSet"),
688+
APIVersion: ptr.To[string]("apps/v1"),
689+
},
690+
ObjectMetaApplyConfiguration: &metaapplyv1.ObjectMetaApplyConfiguration{
691+
Name: ptr.To[string]("test-sample"),
692+
Namespace: ptr.To[string]("default"),
693+
Labels: map[string]string{
694+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
695+
"leaderworkerset.sigs.k8s.io/template-revision-hash": revisionKey2,
696+
},
697+
Annotations: map[string]string{"leaderworkerset.sigs.k8s.io/replicas": "2"},
698+
},
699+
Spec: &appsapplyv1.StatefulSetSpecApplyConfiguration{
700+
Replicas: ptr.To[int32](3), // using stsReplicas
701+
Selector: &metaapplyv1.LabelSelectorApplyConfiguration{
702+
MatchLabels: map[string]string{
703+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
704+
"leaderworkerset.sigs.k8s.io/worker-index": "0",
705+
},
706+
},
707+
Template: &coreapplyv1.PodTemplateSpecApplyConfiguration{
708+
ObjectMetaApplyConfiguration: &metaapplyv1.ObjectMetaApplyConfiguration{
709+
Labels: map[string]string{
710+
"leaderworkerset.sigs.k8s.io/name": "test-sample",
711+
"leaderworkerset.sigs.k8s.io/worker-index": "0",
712+
"leaderworkerset.sigs.k8s.io/template-revision-hash": revisionKey2,
713+
},
714+
Annotations: map[string]string{
715+
"leaderworkerset.sigs.k8s.io/size": "1",
716+
},
717+
},
718+
Spec: &coreapplyv1.PodSpecApplyConfiguration{
719+
Containers: []coreapplyv1.ContainerApplyConfiguration{
720+
{
721+
Name: ptr.To[string]("worker"),
722+
Image: ptr.To[string]("docker.io/nginxinc/nginx-unprivileged:1.27"),
723+
Ports: []coreapplyv1.ContainerPortApplyConfiguration{{ContainerPort: ptr.To[int32](8080), Protocol: ptr.To[corev1.Protocol](corev1.ProtocolTCP)}},
724+
Resources: &coreapplyv1.ResourceRequirementsApplyConfiguration{},
725+
},
726+
},
727+
},
728+
},
729+
ServiceName: ptr.To[string]("test-sample"),
730+
PodManagementPolicy: ptr.To[appsv1.PodManagementPolicyType](appsv1.ParallelPodManagement),
731+
UpdateStrategy: appsapplyv1.StatefulSetUpdateStrategy().
732+
WithType(appsv1.RollingUpdateStatefulSetStrategyType).
733+
// maxUnavailable=1, maxSurge=50% of 2 replicas (lwsReplicas) = 1.
734+
// So stsMaxUnavailableInt = 1 + 1 = 2
735+
WithRollingUpdate(appsapplyv1.RollingUpdateStatefulSetStrategy().WithPartition(0).WithMaxUnavailable(intstr.FromInt32(2))),
736+
},
737+
},
738+
},
529739
}
530740

531741
for _, tc := range tests {
532742
t.Run(tc.name, func(t *testing.T) {
533-
stsApplyConfig, err := constructLeaderStatefulSetApplyConfiguration(tc.lws, 0, *tc.lws.Spec.Replicas, tc.revisionKey)
743+
stsReplicas := *tc.lws.Spec.Replicas
744+
if tc.stsReplicas != nil {
745+
stsReplicas = *tc.stsReplicas
746+
}
747+
stsApplyConfig, err := constructLeaderStatefulSetApplyConfiguration(tc.lws, 0, stsReplicas, tc.revisionKey)
534748
if err != nil {
535749
t.Errorf("failed with error: %s", err.Error())
536750
}

pkg/controllers/pod_controller.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,9 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
190190
return ctrl.Result{}, err
191191
}
192192
if err = r.Create(ctx, workerStatefulSet); err != nil {
193-
r.Record.Eventf(&leaderWorkerSet, &pod, corev1.EventTypeWarning, FailedCreate, Create, fmt.Sprintf("Failed to create worker statefulset for leader pod %s", pod.Name))
193+
if client.IgnoreAlreadyExists(err) != nil {
194+
r.Record.Eventf(&leaderWorkerSet, &pod, corev1.EventTypeWarning, FailedCreate, Create, fmt.Sprintf("Failed to create worker statefulset for leader pod %s", pod.Name))
195+
}
194196
return ctrl.Result{}, client.IgnoreAlreadyExists(err)
195197
}
196198
r.Record.Eventf(&leaderWorkerSet, &pod, corev1.EventTypeNormal, GroupsProgressing, Create, fmt.Sprintf("Created worker statefulset for leader pod %s", pod.Name))

site/content/en/docs/concepts/rollout-strategy/_index.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ description: >
77

88
Rolling update is vital to online services with zero downtime. For LLM inference services, this is particularly important, which helps to mitigate stockout. Two different configurations are supported in LWS, `maxUnavailable` and `maxSurge`:
99

10-
- `MaxUnavailable`: Indicates how many replicas are allowed to be unavailable during the update, the unavailable number is based on the spec.replicas. Defaults to 1. Note that only values >= 1 are supported.
11-
- `MaxSurge`: Indicates how many extra replicas can be deployed during the update. Defaults to 0.
10+
- `maxUnavailable`: Indicates how many replicas are allowed to be unavailable during the update, the unavailable number is based on the spec.replicas. Defaults to 1.
11+
- `maxSurge`: Indicates how many extra replicas can be deployed during the update. Defaults to 0.
12+
13+
Note that `maxSurge` and `maxUnavailable` cannot both be zero at the same time.
1214

1315
Here's a leaderWorkerSet configured with rollout strategy, you can find the example [here](https://github.com/kubernetes-sigs/lws/blob/main/docs/examples/sample/lws-rollout-strategy.yaml):
1416

test/e2e/e2e_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,25 @@ var _ = ginkgo.Describe("leaderWorkerSet e2e tests", func() {
164164
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
165165
})
166166

167+
ginkgo.It("Can perform a rolling update with maxUnavailable zero and maxSurge set", func() {
168+
lws := wrappers.BuildLeaderWorkerSet(ns.Name).Replica(4).MaxSurge(1).MaxUnavailable(0).Obj()
169+
testing.MustCreateLws(ctx, k8sClient, lws)
170+
171+
// Wait for leaderWorkerSet to be ready then update it.
172+
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
173+
testing.UpdateWorkerTemplate(ctx, k8sClient, lws)
174+
175+
// Happens during the rolling update: MaxSurge=1, so we expect up to 5 replicas.
176+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 5)
177+
178+
// Rolling update completes.
179+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 4)
180+
testing.ExpectValidWorkerStatefulSets(ctx, lws, k8sClient, true)
181+
testing.ExpectValidPods(ctx, k8sClient, lws, &corev1.PodList{})
182+
// Wait for leaderWorkerSet to be ready again.
183+
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
184+
})
185+
167186
ginkgo.It("Can perform a rolling update even if old lws not ready", func() {
168187
// Create lws with not exist image.
169188
lws := wrappers.BuildLeaderWorkerSet(ns.Name).LeaderTemplate(nil).Size(1).Replica(2).MaxSurge(1).MaxUnavailable(0).Obj()

0 commit comments

Comments
 (0)