Skip to content

Commit ca057f0

Browse files
committed
fixup! fix: resolve deadlock when maxSurge>0 rolling update on single-replica LWS
Revert the wantReplicas condition change (<= back to <): the original <= was correct for multi-replica cases (e.g. replicas=4, maxSurge=1: when only 1 replica is still unready we want to start reclaiming the surge slot). The condition change broke existing tests. The bug is fully fixed by the Case 2 change alone (returning burstReplicas directly instead of wantReplicas(lwsReplicas)). Also correct the new integration test expectations to match the actual controller behaviour: after the surge pod (index=1) becomes ready the controller immediately shrinks replicas back to 1 and sets partition=0 so that group-0 starts being replaced.
1 parent 6bd4412 commit ca057f0

File tree

2 files changed

+24
-19
lines changed

2 files changed

+24
-19
lines changed

pkg/controllers/leaderworkerset_controller.go

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -293,10 +293,10 @@ func (r *LeaderWorkerSetReconciler) rollingUpdateParameters(ctx context.Context,
293293
// unready (not-yet-updated) replicas. It is only called after the initial
294294
// surge expansion has already happened (i.e. stsReplicas == burstReplicas).
295295
wantReplicas := func(unreadyReplicas int32) int32 {
296-
if unreadyReplicas < int32(maxSurge) {
297-
// We have fewer unready replicas than the surge budget, meaning some
298-
// surge replicas have already been replaced. Release one surge slot
299-
// per newly-ready replica so we converge on lwsReplicas.
296+
if unreadyReplicas <= int32(maxSurge) {
297+
// We have at most maxSurge unready replicas, meaning all or most of the
298+
// surge replicas have been replaced. Release one surge slot per
299+
// newly-ready replica so we converge on lwsReplicas.
300300
finalReplicas := lwsReplicas + utils.NonZeroValue(int32(unreadyReplicas)-1)
301301
r.Record.Eventf(lws, nil, corev1.EventTypeNormal, GroupsProgressing, Delete, fmt.Sprintf("deleting surge replica %s-%d", lws.Name, finalReplicas))
302302
return finalReplicas
@@ -310,8 +310,8 @@ func (r *LeaderWorkerSetReconciler) rollingUpdateParameters(ctx context.Context,
310310
// so we must first expand to burstReplicas before the rolling partition
311311
// can start advancing. Calling wantReplicas(lwsReplicas) here was wrong:
312312
// with replicas=1 and maxSurge=1 it satisfied the shrink condition
313-
// (1 <= 1) and immediately returned lwsReplicas, preventing the surge
314-
// replica from ever being created.
313+
// (unreadyReplicas(1) <= maxSurge(1)) and immediately returned lwsReplicas,
314+
// preventing the surge replica from ever being created.
315315
if leaderWorkerSetUpdated {
316316
// Processing scaling up/down first prior to rolling update.
317317
return min(lwsReplicas, stsReplicas), burstReplicas, nil

test/integration/controllers/leaderworkerset_test.go

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -2299,9 +2299,9 @@ var _ = ginkgo.Describe("LeaderWorkerSet controller", func() {
22992299
}),
23002300

23012301
ginkgo.Entry("rolling update with maxSurge=1 and single replica creates surge before rolling", &testCase{
2302-
// Regression test for: with replicas=1 and maxSurge=1 the controller
2303-
// was entering the shrink path of wantReplicas immediately on update
2304-
// (condition was unreadyReplicas<=maxSurge i.e. 1<=1), so it returned
2302+
// Regression test for: with replicas=1 and maxSurge=1, calling
2303+
// wantReplicas(lwsReplicas) in Case 2 satisfied the shrink condition
2304+
// (unreadyReplicas(1) <= maxSurge(1)), so the controller returned
23052305
// replicas=1 and emitted a spurious "deleting surge replica" event
23062306
// without ever creating the surge pod, leaving the update stuck forever.
23072307
makeLeaderWorkerSet: func(nsName string) *wrappers.LeaderWorkerSetWrapper {
@@ -2323,39 +2323,44 @@ var _ = ginkgo.Describe("LeaderWorkerSet controller", func() {
23232323
},
23242324
{
23252325
// Trigger rolling update by changing worker template.
2326-
// The controller must expand to burstReplicas (2) before
2327-
// advancing the partition -- it must NOT stay at replicas=1.
2326+
// The controller must expand to burstReplicas (2) with partition=1
2327+
// so the surge pod gets the new template. Before the fix, Case 2
2328+
// returned replicas=1 and the STS never grew.
23282329
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
23292330
testing.UpdateWorkerTemplate(ctx, k8sClient, lws)
23302331
},
23312332
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2332-
// replicas must be 2 (1 original + 1 surge) to prove
2333-
// that the surge was actually created.
2333+
// replicas must be 2 (1 original + 1 surge) to prove the surge
2334+
// was actually created. partition=1 means pod-0 (old) is held,
2335+
// pod-1 (surge, new template) is created first.
23342336
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 2)
23352337
testing.ExpectLeaderWorkerSetProgressing(ctx, k8sClient, lws, "Replicas are progressing")
23362338
testing.ExpectLeaderWorkerSetUpgradeInProgress(ctx, k8sClient, lws, "Rolling Upgrade is in progress")
2337-
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2339+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 1)
23382340
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 0)
23392341
},
23402342
},
23412343
{
2342-
// Create the surge leader pod and mark it ready, which allows
2343-
// the rollout to proceed.
2344+
// Create the surge leader pod and mark it ready. Once pod-1 is
2345+
// ready the controller releases the surge (replicas=1, partition=0)
2346+
// and pod-0 starts being replaced with the new template.
23442347
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
23452348
var leaderSts appsv1.StatefulSet
23462349
testing.GetLeaderStatefulset(ctx, lws, k8sClient, &leaderSts)
23472350
gomega.Expect(testing.CreateLeaderPods(ctx, leaderSts, k8sClient, lws, 1, 2)).To(gomega.Succeed())
23482351
testing.SetPodGroupToReady(ctx, k8sClient, lws.Name+"-1", lws)
23492352
},
23502353
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2351-
// Surge replica-1 is ready; replicas should shrink back toward lwsReplicas.
2354+
// Surge released: STS back to replicas=1, partition=0.
2355+
// pod-0 is being replaced (Progressing, not yet Available).
2356+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 1)
2357+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
23522358
testing.ExpectLeaderWorkerSetProgressing(ctx, k8sClient, lws, "Replicas are progressing")
23532359
testing.ExpectLeaderWorkerSetUpgradeInProgress(ctx, k8sClient, lws, "Rolling Upgrade is in progress")
23542360
},
23552361
},
23562362
{
2357-
// Mark group-0 (the original replica) as ready on the new template;
2358-
// the rollout should now complete and replicas return to 1.
2363+
// Mark group-0 ready on the new template; the rollout completes.
23592364
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
23602365
testing.SetPodGroupToReady(ctx, k8sClient, lws.Name+"-0", lws)
23612366
},

0 commit comments

Comments (0)