Skip to content

Commit 8b2e727

Browse files
committed
fix: resolve deadlock when maxSurge>0 rolling update on single-replica LWS
When a LeaderWorkerSet has replicas=1 and maxSurge=1, triggering a rolling update caused the controller to immediately emit a "deleting surge replica" event and return replicas=1 (no surge ever created), leaving the update permanently stuck with the StatefulSet at partition=1, replicas=1. Root cause: in Case 2 of rollingUpdateParameters (a new rolling update is detected), the code called wantReplicas(lwsReplicas). With replicas=1 and maxSurge=1, the condition inside wantReplicas was unreadyReplicas(1) <= maxSurge(1) → true, which jumped straight into the "release surge" branch and returned replicas=1. No surge replica was ever created, so the StatefulSet partition could never advance. Fix: Case 2 now returns burstReplicas directly instead of going through wantReplicas. At the moment a new update is detected, all existing replicas are still running the old template (none are unready due to the update yet), so the correct action is to expand to lwsReplicas+maxSurge first. wantReplicas is only meaningful once stsReplicas==burstReplicas and the surge pods are being replaced. A new integration test, "rolling update with maxSurge=1 and single replica creates surge before rolling", directly exercises the regression: it verifies that the leader StatefulSet expands to replicas=2 immediately after the update is triggered, then converges back to replicas=1 once all groups are ready. Fixes: #688 Signed-off-by: veast <veast@users.noreply.github.com>
1 parent a3dc446 commit 8b2e727

File tree

2 files changed

+91
-6
lines changed

2 files changed

+91
-6
lines changed

pkg/controllers/leaderworkerset_controller.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -288,12 +288,15 @@ func (r *LeaderWorkerSetReconciler) rollingUpdateParameters(ctx context.Context,
288288
}
289289
burstReplicas := lwsReplicas + int32(maxSurge)
290290

291-
// wantReplicas calculates the final replicas if needed.
291+
// wantReplicas calculates the desired total replicas (including any surge)
292+
// during an in-progress rolling update based on the number of currently
293+
// unready (not-yet-updated) replicas. It is only called after the initial
294+
// surge expansion has already happened (i.e. stsReplicas == burstReplicas).
292295
wantReplicas := func(unreadyReplicas int32) int32 {
293296
if unreadyReplicas <= int32(maxSurge) {
294-
// When we have n unready replicas and n bursted replicas, we should
295-
// start to release the burst replica gradually for the accommodation of
296-
// the unready ones.
297+
// We have at most maxSurge unready replicas, meaning all or most of the
298+
// surge replicas have been replaced. Release one surge slot per
299+
// newly-ready replica so we converge on lwsReplicas.
297300
finalReplicas := lwsReplicas + utils.NonZeroValue(int32(unreadyReplicas)-1)
298301
r.Record.Eventf(lws, nil, corev1.EventTypeNormal, GroupsProgressing, Delete, fmt.Sprintf("deleting surge replica %s-%d", lws.Name, finalReplicas))
299302
return finalReplicas
@@ -302,10 +305,16 @@ func (r *LeaderWorkerSetReconciler) rollingUpdateParameters(ctx context.Context,
302305
}
303306

304307
// Case 2:
305-
// Indicates a new rolling update here.
308+
// Indicates a new rolling update here. At this point all existing replicas
309+
// are still running the old template (none are unready due to the update),
310+
// so we must first expand to burstReplicas before the rolling partition
311+
// can start advancing. Calling wantReplicas(lwsReplicas) here was wrong:
312+
// with replicas=1 and maxSurge=1 it satisfied the shrink condition
313+
// (unreadyReplicas(1) <= maxSurge(1)) and immediately returned lwsReplicas,
314+
// preventing the surge replica from ever being created.
306315
if leaderWorkerSetUpdated {
307316
// Processing scaling up/down first prior to rolling update.
308-
return min(lwsReplicas, stsReplicas), wantReplicas(lwsReplicas), nil
317+
return min(lwsReplicas, stsReplicas), burstReplicas, nil
309318
}
310319

311320
partition := *sts.Spec.UpdateStrategy.RollingUpdate.Partition

test/integration/controllers/leaderworkerset_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2297,6 +2297,82 @@ var _ = ginkgo.Describe("LeaderWorkerSet controller", func() {
22972297
},
22982298
},
22992299
}),
2300+
2301+
ginkgo.Entry("rolling update with maxSurge=1 and single replica creates surge before rolling", &testCase{
2302+
// Regression test for: with replicas=1 and maxSurge=1, calling
2303+
// wantReplicas(lwsReplicas) in Case 2 satisfied the shrink condition
2304+
// (unreadyReplicas(1) <= maxSurge(1)), so the controller returned
2305+
// replicas=1 and emitted a spurious "deleting surge replica" event
2306+
// without ever creating the surge pod, leaving the update stuck forever.
2307+
makeLeaderWorkerSet: func(nsName string) *wrappers.LeaderWorkerSetWrapper {
2308+
return wrappers.BuildLeaderWorkerSet(nsName).Replica(1).MaxSurge(1)
2309+
},
2310+
updates: []*update{
2311+
{
2312+
// Set lws to available condition.
2313+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2314+
testing.SetSuperPodToReady(ctx, k8sClient, lws, 1)
2315+
},
2316+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2317+
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
2318+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2319+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 1)
2320+
testing.ExpectValidWorkerStatefulSets(ctx, lws, k8sClient, true)
2321+
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 1)
2322+
},
2323+
},
2324+
{
2325+
// Trigger rolling update by changing worker template.
2326+
// The controller must expand to burstReplicas (2) with partition=1
2327+
// so the surge pod gets the new template. Before the fix, Case 2
2328+
// returned replicas=1 and the STS never grew.
2329+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2330+
testing.UpdateWorkerTemplate(ctx, k8sClient, lws)
2331+
},
2332+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2333+
// replicas must be 2 (1 original + 1 surge) to prove the surge
2334+
// was actually created. partition=1 means pod-0 (old) is held,
2335+
// pod-1 (surge, new template) is created first.
2336+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 2)
2337+
testing.ExpectLeaderWorkerSetProgressing(ctx, k8sClient, lws, "Replicas are progressing")
2338+
testing.ExpectLeaderWorkerSetUpgradeInProgress(ctx, k8sClient, lws, "Rolling Upgrade is in progress")
2339+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 1)
2340+
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 0)
2341+
},
2342+
},
2343+
{
2344+
// Create the surge leader pod and mark it ready. Once pod-1 is
2345+
// ready the controller releases the surge (replicas=1, partition=0)
2346+
// and pod-0 starts being replaced with the new template.
2347+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2348+
var leaderSts appsv1.StatefulSet
2349+
testing.GetLeaderStatefulset(ctx, lws, k8sClient, &leaderSts)
2350+
gomega.Expect(testing.CreateLeaderPods(ctx, leaderSts, k8sClient, lws, 1, 2)).To(gomega.Succeed())
2351+
testing.SetPodGroupToReady(ctx, k8sClient, lws.Name+"-1", lws)
2352+
},
2353+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2354+
// Surge released: STS back to replicas=1, partition=0.
2355+
// pod-0 is being replaced (Progressing, not yet Available).
2356+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 1)
2357+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2358+
testing.ExpectLeaderWorkerSetProgressing(ctx, k8sClient, lws, "Replicas are progressing")
2359+
testing.ExpectLeaderWorkerSetUpgradeInProgress(ctx, k8sClient, lws, "Rolling Upgrade is in progress")
2360+
},
2361+
},
2362+
{
2363+
// Mark group-0 ready on the new template; the rollout completes.
2364+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2365+
testing.SetPodGroupToReady(ctx, k8sClient, lws.Name+"-0", lws)
2366+
},
2367+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2368+
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
2369+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2370+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 1)
2371+
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 1)
2372+
},
2373+
},
2374+
},
2375+
}),
23002376
) // end of DescribeTable
23012377

23022378
ginkgo.Context("with gang scheduling enabled", ginkgo.Ordered, func() {

0 commit comments

Comments (0)