Skip to content

Commit 09a6e8d

Browse files
committed
fix: resolve deadlock when maxSurge>0 rolling update on single-replica LWS
When a LeaderWorkerSet has replicas=1 and maxSurge=1, triggering a rolling update caused the controller to immediately emit a "deleting surge replica" event and return replicas=1 (no surge ever created), leaving the update permanently stuck with the StatefulSet at partition=1, replicas=1. Root cause: in Case 2 of rollingUpdateParameters (where a new rolling update is detected), the code called wantReplicas(lwsReplicas). With replicas=1 and maxSurge=1, the condition inside wantReplicas was: unreadyReplicas(1) <= maxSurge(1) → true, which jumped straight into the "release surge" branch and returned replicas=1. No surge replica was ever created, so the StatefulSet partition could never advance. Fix: Case 2 now returns burstReplicas directly instead of going through wantReplicas. At the moment a new update is detected, all existing replicas are still running the old template (none are unready due to the update yet), so the correct action is to expand to lwsReplicas+maxSurge first. wantReplicas is only meaningful once stsReplicas == burstReplicas and the surge pods are being replaced. A new integration test, "rolling update with maxSurge=1 and single replica creates surge before rolling", directly exercises the regression: it verifies that the leader StatefulSet expands to replicas=2 immediately after the update is triggered, then converges back to replicas=1 once all groups are ready. Fixes: #688 Signed-off-by: veast <veast@users.noreply.github.com>
1 parent a3dc446 commit 09a6e8d

File tree

2 files changed

+105
-7
lines changed

2 files changed

+105
-7
lines changed

pkg/controllers/leaderworkerset_controller.go

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -288,12 +288,27 @@ func (r *LeaderWorkerSetReconciler) rollingUpdateParameters(ctx context.Context,
288288
}
289289
burstReplicas := lwsReplicas + int32(maxSurge)
290290

291-
// wantReplicas calculates the final replicas if needed.
291+
// wantReplicas calculates the desired total replicas (including any surge)
292+
// during an in-progress rolling update based on the number of currently
293+
// unready (not-yet-updated) replicas. It is only called after the initial
294+
// surge expansion has already happened (i.e. stsReplicas == burstReplicas).
295+
//
296+
// We enter the shrink path only when both conditions hold:
297+
// 1. unreadyReplicas < lwsReplicas: at least one original replica has
298+
// already been successfully updated, so the surge pod has served its
299+
// purpose and can start being released.
300+
// 2. unreadyReplicas <= maxSurge: the remaining unready replicas fit
301+
// within the surge budget, meaning we can safely reduce replicas.
302+
//
303+
// Without condition (1), a single-replica LWS (lwsReplicas=1, maxSurge=1)
304+
// would satisfy (2) immediately on the first reconcile after Case 2 expands
305+
// to burstReplicas, causing the surge to be released before any replica has
306+
// actually been updated.
292307
wantReplicas := func(unreadyReplicas int32) int32 {
293-
if unreadyReplicas <= int32(maxSurge) {
294-
// When we have n unready replicas and n bursted replicas, we should
295-
// start to release the burst replica gradually for the accommodation of
296-
// the unready ones.
308+
if unreadyReplicas < lwsReplicas && unreadyReplicas <= int32(maxSurge) {
309+
// At least one replica has been updated and the remaining unready
310+
// count fits within the surge budget. Release one surge slot per
311+
// newly-ready replica so we converge back on lwsReplicas.
297312
finalReplicas := lwsReplicas + utils.NonZeroValue(int32(unreadyReplicas)-1)
298313
r.Record.Eventf(lws, nil, corev1.EventTypeNormal, GroupsProgressing, Delete, fmt.Sprintf("deleting surge replica %s-%d", lws.Name, finalReplicas))
299314
return finalReplicas
@@ -302,10 +317,16 @@ func (r *LeaderWorkerSetReconciler) rollingUpdateParameters(ctx context.Context,
302317
}
303318

304319
// Case 2:
305-
// Indicates a new rolling update here.
320+
// Indicates a new rolling update here. At this point all existing replicas
321+
// are still running the old template (none are unready due to the update),
322+
// so we must first expand to burstReplicas before the rolling partition
323+
// can start advancing. Calling wantReplicas(lwsReplicas) here was wrong:
324+
// with replicas=1 and maxSurge=1 it satisfied the shrink condition and
325+
// immediately returned lwsReplicas, preventing the surge replica from ever
326+
// being created.
306327
if leaderWorkerSetUpdated {
307328
// Processing scaling up/down first prior to rolling update.
308-
return min(lwsReplicas, stsReplicas), wantReplicas(lwsReplicas), nil
329+
return min(lwsReplicas, stsReplicas), burstReplicas, nil
309330
}
310331

311332
partition := *sts.Spec.UpdateStrategy.RollingUpdate.Partition

test/integration/controllers/leaderworkerset_test.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2297,6 +2297,83 @@ var _ = ginkgo.Describe("LeaderWorkerSet controller", func() {
22972297
},
22982298
},
22992299
}),
2300+
2301+
ginkgo.Entry("rolling update with maxSurge=1 and single replica creates surge before rolling", &testCase{
2302+
// Regression test for: with replicas=1 and maxSurge=1, calling
2303+
// wantReplicas(lwsReplicas) in Case 2 satisfied the shrink condition
2304+
// (unreadyReplicas(1) <= maxSurge(1)), so the controller returned
2305+
// replicas=1 and emitted a spurious "deleting surge replica" event
2306+
// without ever creating the surge pod, leaving the update stuck forever.
2307+
makeLeaderWorkerSet: func(nsName string) *wrappers.LeaderWorkerSetWrapper {
2308+
return wrappers.BuildLeaderWorkerSet(nsName).Replica(1).MaxSurge(1)
2309+
},
2310+
updates: []*update{
2311+
{
2312+
// Set lws to available condition.
2313+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2314+
testing.SetSuperPodToReady(ctx, k8sClient, lws, 1)
2315+
},
2316+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2317+
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
2318+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2319+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 1)
2320+
testing.ExpectValidWorkerStatefulSets(ctx, lws, k8sClient, true)
2321+
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 1)
2322+
},
2323+
},
2324+
{
2325+
// Trigger rolling update by changing worker template.
2326+
// The controller must expand to burstReplicas (2) with partition=1
2327+
// so the surge pod gets the new template. Before the fix, Case 2
2328+
// returned replicas=1 and the STS never grew.
2329+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2330+
testing.UpdateWorkerTemplate(ctx, k8sClient, lws)
2331+
},
2332+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2333+
// replicas must be 2 (1 original + 1 surge) to prove the surge
2334+
// was actually created. partition=1 means pod-0 (old) is held,
2335+
// pod-1 (surge, new template) is created first.
2336+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 2)
2337+
testing.ExpectLeaderWorkerSetProgressing(ctx, k8sClient, lws, "Replicas are progressing")
2338+
testing.ExpectLeaderWorkerSetUpgradeInProgress(ctx, k8sClient, lws, "Rolling Upgrade is in progress")
2339+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 1)
2340+
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 0)
2341+
},
2342+
},
2343+
{
2344+
// Create the surge leader pod and mark it ready. The controller
2345+
// advances partition to 0 (pod-0 starts updating) while keeping
2346+
// replicas=2 until pod-0 is also ready on the new template.
2347+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2348+
var leaderSts appsv1.StatefulSet
2349+
testing.GetLeaderStatefulset(ctx, lws, k8sClient, &leaderSts)
2350+
gomega.Expect(testing.CreateLeaderPods(ctx, leaderSts, k8sClient, lws, 1, 2)).To(gomega.Succeed())
2351+
testing.SetPodGroupToReady(ctx, k8sClient, lws.Name+"-1", lws)
2352+
},
2353+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2354+
// pod-1 (surge) is ready; partition advances to 0 so pod-0
2355+
// can be updated. Surge (replicas=2) is kept until pod-0 is
2356+
// also updated, ensuring zero downtime.
2357+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 2)
2358+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2359+
testing.ExpectLeaderWorkerSetProgressing(ctx, k8sClient, lws, "Replicas are progressing")
2360+
testing.ExpectLeaderWorkerSetUpgradeInProgress(ctx, k8sClient, lws, "Rolling Upgrade is in progress")
2361+
},
2362+
},
2363+
{
2364+
// Mark group-0 ready on the new template; the rollout completes.
2365+
lwsUpdateFn: func(lws *leaderworkerset.LeaderWorkerSet) {
2366+
testing.SetPodGroupToReady(ctx, k8sClient, lws.Name+"-0", lws)
2367+
},
2368+
checkLWSState: func(lws *leaderworkerset.LeaderWorkerSet) {
2369+
testing.ExpectLeaderWorkerSetAvailable(ctx, k8sClient, lws, "All replicas are ready")
2370+
testing.ExpectStatefulsetPartitionEqualTo(ctx, k8sClient, lws, 0)
2371+
testing.ExpectValidLeaderStatefulSet(ctx, k8sClient, lws, 1)
2372+
testing.ExpectLeaderWorkerSetStatusReplicas(ctx, k8sClient, lws, 1, 1)
2373+
},
2374+
},
2375+
},
2376+
}),
23002377
) // end of DescribeTable
23012378

23022379
ginkgo.Context("with gang scheduling enabled", ginkgo.Ordered, func() {

0 commit comments

Comments (0)