From 10a227067195ceecbae4a5845a545ede9307646b Mon Sep 17 00:00:00 2001 From: Omer Aplatony Date: Sat, 15 Nov 2025 13:55:12 +0000 Subject: [PATCH 1/6] [WIP] In Place Only VPA Signed-off-by: Omer Aplatony --- vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md diff --git a/vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md new file mode 100644 index 000000000000..e69de29bb2d1 From 29e7843f37cdf4c6bf9f8295b1c7a7e97a8a19d7 Mon Sep 17 00:00:00 2001 From: Omer Aplatony Date: Sat, 15 Nov 2025 15:53:37 +0000 Subject: [PATCH 2/6] removed spaces Signed-off-by: Omer Aplatony --- .../enhancements/8818-in-place-only/README.md | 195 ++++++++++++++++++ .../enhancements/xxxx-in-place-only/README.md | 0 2 files changed, 195 insertions(+) create mode 100644 vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md delete mode 100644 vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md diff --git a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md new file mode 100644 index 000000000000..addafd347e92 --- /dev/null +++ b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md @@ -0,0 +1,195 @@ +# AEP-4017: Non-Disruptive In-Place Updates in VPA + + +- [Summary](#summary) +- [Motivation](#motivation) +- [Goals](#goals) +- [Non-Goals](#non-goals) +- [Proposal](#proposal) +- [Design Details](#design-details) +- [Test Plan](#test-plan) +- [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Upgrade](#upgrade) + - [Downgrade](#downgrade) +- [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [How can this feature be enabled / disabled in a live cluster?](#how-can-this-feature-be-enabled--disabled-in-a-live-cluster) 
+- [Kubernetes version compatibility](#kubernetes-version-compatibility) +- [Implementation History](#implementation-history) + + +## Summary + +[AEP-4016](https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler/enhancements/4016-in-place-updates-support) introduced the `InPlaceOrRecreate` update mode which attempts in-place updates first but falls back to pod eviction if the in-place update fails. However, for certain workloads, any disruption is unacceptable, and users would prefer to retry in-place updates indefinitely rather than evict and recreate pods. + +This proposal introduces a new update mode that only attempts in-place updates and retries on failure without ever falling back to eviction. + +## Motivation + +There are several use cases where pod disruption should be avoided at all costs: + +- Stateful workloads: Pods managing critical state where restart would cause data loss or lengthy recovery. +- Long-running computations: Jobs or services performing computations that cannot be checkpointed and would need to restart from the beginning. +- Strict SLO requirements: Services with stringent availability requirements where even brief disruptions are unacceptable. 
+In these scenarios, users would prefer: + +- To operate with current (potentially suboptimal) resource allocations until an in-place update becomes feasible +- To receive clear signals when updates cannot be applied +- To have VPA continuously retry updates as cluster conditions change + +## Goals + +- Provide a truly non-disruptive VPA update mode that never evicts pods +- Allow VPA to eventually apply updates when cluster conditions improve +- Respect the existing in-place update infrastructure from AEP-4016 + +## Non-Goals + +- Guarantee that all updates will eventually succeed (node capacity constraints may prevent this) +- Provide mechanisms to automatically increase node capacity to accommodate updates +- Change the behavior of existing update modes (Auto, Recreate, InPlaceOrRecreate) + +## Proposal + +Add a new supported value of UpdateMode: `InPlace`. +This mode will: +- Apply recommendations during pod admission (like all other modes) +- Attempt in-place updates for running pods under the same conditions as `InPlaceOrRecreate` +- Never add pods to `podsForEviction` if in-place updates fail +- Continuously retry failed in-place updates + +## Design Details + +Add `UpdateModeInPlace` to the VPA types: + +```golang +// In pkg/apis/autoscaling.k8s.io/v1/types.go +const ( + // ... existing modes ... + // UpdateModeInPlace means that VPA will only attempt to update pods in-place + // and will never evict them. If in-place update fails, VPA will retry later. 
+ UpdateModeInPlace UpdateMode = "InPlace" +) +``` + +We will enhance `inplace_restriction.go` to support the new mode: + +```golang +// Update CanInPlaceUpdate to accept update mode +func (ip *PodsInPlaceRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod, updateMode vpa_types.UpdateMode) utils.InPlaceDecision { + if !features.Enabled(features.InPlaceOrRecreate) { + return utils.InPlaceEvict + } + + cr, present := ip.podToReplicaCreatorMap[getPodID(pod)] + if present { + singleGroupStats, present := ip.creatorToSingleGroupStatsMap[cr] + if pod.Status.Phase == apiv1.PodPending { + return utils.InPlaceDeferred + } + if present { + if isInPlaceUpdating(pod) { + canEvict := CanEvictInPlacingPod(pod, singleGroupStats, ip.lastInPlaceAttemptTimeMap, ip.clock) + if canEvict { + // For InPlace mode, never suggest eviction + if updateMode == vpa_types.UpdateModeInPlace { + return utils.InPlaceDeferred + } + return utils.InPlaceEvict + } + return utils.InPlaceDeferred + } + if singleGroupStats.isPodDisruptable() { + return utils.InPlaceApproved + } + } + } + klog.V(4).InfoS("Can't in-place update pod, waiting for next loop", "pod", klog.KObj(pod)) + return utils.InPlaceDeferred +} +``` + +The retry logic is implicitly handled by the existing `CanInPlaceUpdate` decision system. Specifically: +- Deferred State: When `CanInPlaceUpdate` returns `utils.InPlaceDeferred`, the pod is skipped in the current loop and will be reconsidered in the next iteration +- Loop Frequency: The updater's main loop runs periodically (default every 1 minute), providing natural retry behavior +- Condition-Based Decisions: The `CanEvictInPlacingPod` function already tracks state via `lastInPlaceAttemptTimeMap` + +```golang +for vpa, livePods := range controlledPods { + // ... existing setup code ... 
+ + podsForInPlace := make([]*apiv1.Pod, 0) + podsForEviction := make([]*apiv1.Pod, 0) + + if updateMode == vpa_types.UpdateModeInPlace && inPlaceFeatureEnable { + // New mode: only in-place, never evict + podsForInPlace = u.getPodsUpdateOrder(filterNonInPlaceUpdatablePods(livePods, inPlaceLimiter), vpa) + inPlaceUpdatablePodsCounter.Add(vpaSize, len(podsForInPlace)) + } // rest of the code + + // ... existing counters ... + +for _, pod := range podsForInPlace { + withInPlaceUpdatable = true + decision := inPlaceLimiter.CanInPlaceUpdate(pod, updateMode) + + if decision == utils.InPlaceDeferred { + klog.V(2).InfoS("In-place update deferred, will retry in next loop", "pod", klog.KObj(pod)) + continue + } else if decision == utils.InPlaceEvict { + // Only add to eviction list if NOT in InPlace mode + if updateMode != vpa_types.UpdateModeInPlace { + podsForEviction = append(podsForEviction, pod) + } else { + klog.V(2).InfoS("In-place update would require eviction, but InPlace mode prevents it. Will retry later.", "pod", klog.KObj(pod)) + metrics_updater.RecordDeferredInPlaceUpdate(vpaSize, vpa.Name, vpa.Namespace, "EvictionPrevented") + } + continue + } + // rest of the code +} +``` + +## Test Plan + +The following test scenarios will be added to e2e tests. The InPlace mode will be tested in the following scenarios: + +- Basic In-Place Update: Pod successfully updated in-place with InPlace mode +- Failed Update - No Eviction: Update fails due to node capacity, verify no eviction occurs and pod remains running +- Failed Update - Retry Success: Update fails initially, conditions improve, verify successful retry + +## Upgrade / Downgrade Strategy + +### Upgrade + +On upgrade to VPA 1.6.0 (tentative release version), users can opt into the new `InPlace` mode by enabling the alpha Feature Gate (which defaults to disabled) by passing `--feature-gates=InPlace=true` to the updater and admission-controller components and setting their VPA UpdateMode to use `InPlace`. 
+Existing VPAs will continue to work as before. + +### Downgrade + +On downgrade of VPA from 1.6.0 (tentative release version), nothing will change. VPAs will continue to work as before, unless the user had enabled the feature gate, in which case the downgrade could break any VPA that uses `InPlace`. + +## Feature Enablement and Rollback + +### How can this feature be enabled / disabled in a live cluster? + +- Feature gate name: `InPlace` +- Components depending on the feature gate: + - admission-controller + - updater + +Disabling of feature gate `InPlace` will cause the following to happen: +- admission-controller to reject new VPA objects being created with `InPlace` configured + - A descriptive error message should be returned to the user letting them know that they are using a feature-gated feature + +Enabling of feature gate `InPlace` will cause the following to happen: +- admission-controller to accept new VPA objects being created with `InPlace` configured +- updater will attempt to perform an in-place **only** adjustment for VPAs configured with `InPlace` + +## Kubernetes version compatibility + +`InPlace` is being built assuming that it will be running on a Kubernetes version of at least 1.33 with the beta version of [KEP-1287: In-Place Update of Pod Resources](https://github.com/kubernetes/enhancements/issues/1287) enabled. +Should these conditions not be true, the VPA will not be able to scale your workload at all. 
+ +## Implementation History + +- 2025-11-15: initial version diff --git a/vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/xxxx-in-place-only/README.md deleted file mode 100644 index e69de29bb2d1..000000000000 From c6d27c071134b5166325cf9bf24184c7591a82ab Mon Sep 17 00:00:00 2001 From: Omer Aplatony Date: Sat, 15 Nov 2025 15:57:41 +0000 Subject: [PATCH 3/6] Fixed AEP number Signed-off-by: Omer Aplatony --- .../enhancements/8818-in-place-only/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md index addafd347e92..9ceff5914a3e 100644 --- a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md +++ b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md @@ -1,4 +1,4 @@ -# AEP-4017: Non-Disruptive In-Place Updates in VPA +# AEP-8818: Non-Disruptive In-Place Updates in VPA - [Summary](#summary) From 26456cb4fca1ee72134c07d3c4317a3bfa7c7b4d Mon Sep 17 00:00:00 2001 From: Omer Aplatony Date: Sun, 16 Nov 2025 08:50:35 +0000 Subject: [PATCH 4/6] Fixed function Signed-off-by: Omer Aplatony --- .../enhancements/8818-in-place-only/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md index 9ceff5914a3e..488ac0439178 100644 --- a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md +++ b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md @@ -88,12 +88,12 @@ func (ip *PodsInPlaceRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod, updateMod } if present { if isInPlaceUpdating(pod) { + // For InPlace mode, never suggest eviction + if updateMode == vpa_types.UpdateModeInPlace { + return utils.InPlaceDeferred + } canEvict := CanEvictInPlacingPod(pod, 
singleGroupStats, ip.lastInPlaceAttemptTimeMap, ip.clock) if canEvict { - // For InPlace mode, never suggest eviction - if updateMode == vpa_types.UpdateModeInPlace { - return utils.InPlaceDeferred - } return utils.InPlaceEvict } return utils.InPlaceDeferred From e76adef91981467f340aff00059c2d9fa4a7cf9e Mon Sep 17 00:00:00 2001 From: Omer Aplatony Date: Sun, 16 Nov 2025 11:55:13 +0000 Subject: [PATCH 5/6] fmt Signed-off-by: Omer Aplatony --- .../enhancements/8818-in-place-only/README.md | 95 ++++++------------- 1 file changed, 28 insertions(+), 67 deletions(-) diff --git a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md index 488ac0439178..527f8aaf373c 100644 --- a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md +++ b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md @@ -71,83 +71,44 @@ const ( ) ``` -We will enhance `inplace_restriction.go` to support the new mode: +The updater loop will handle the `InPlace` mode by never adding pods to `podsForEviction`, regardless of the decision from `CanInPlaceUpdate`: ```golang -// Update CanInPlaceUpdate to accept update mode -func (ip *PodsInPlaceRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod, updateMode vpa_types.UpdateMode) utils.InPlaceDecision { - if !features.Enabled(features.InPlaceOrRecreate) { - return utils.InPlaceEvict - } +for vpa, livePods := range controlledPods { + updateMode := vpa.Spec.UpdatePolicy.UpdateMode - cr, present := ip.podToReplicaCreatorMap[getPodID(pod)] - if present { - singleGroupStats, present := ip.creatorToSingleGroupStatsMap[cr] - if pod.Status.Phase == apiv1.PodPending { - return utils.InPlaceDeferred - } - if present { - if isInPlaceUpdating(pod) { - // For InPlace mode, never suggest eviction - if updateMode == vpa_types.UpdateModeInPlace { - return utils.InPlaceDeferred - } - canEvict := CanEvictInPlacingPod(pod, singleGroupStats, ip.lastInPlaceAttemptTimeMap, 
ip.clock) - if canEvict { - return utils.InPlaceEvict - } - return utils.InPlaceDeferred - } - if singleGroupStats.isPodDisruptable() { - return utils.InPlaceApproved + if updateMode == vpa_types.UpdateModeInPlace && inPlaceFeatureEnable { + podsForInPlace = u.getPodsUpdateOrder( + filterNonInPlaceUpdatablePods(livePods, inPlaceLimiter), vpa) + } + // ... rest of existing code + + for _, pod := range podsForInPlace { + decision := inPlaceLimiter.CanInPlaceUpdate(pod) + + if decision == utils.InPlaceDeferred { + // Kubelet will automatically retry + continue + } else if decision == utils.InPlaceEvict { + // Only evict for InPlaceOrRecreate mode + if updateMode == vpa_types.UpdateModeInPlaceOrRecreate { + podsForEviction = append(podsForEviction, pod) } + // For InPlace mode, do nothing - Kubelet handles retry + continue } + + // InPlaceApproved - proceed with update + // ... } - klog.V(4).InfoS("Can't in-place update pod, waiting for next loop", "pod", klog.KObj(pod)) - return utils.InPlaceDeferred } ``` -The retry logic is implicitly handled by the existing `CanInPlaceUpdate` decision system. Specifically: -- Deferred State: When `CanInPlaceUpdate` returns `utils.InPlaceDeferred`, the pod is skipped in the current loop and will be reconsidered in the next iteration -- Loop Frequency: The updater's main loop runs periodically (default every 1 minute), providing natural retry behavior -- Condition-Based Decisions: The `CanEvictInPlacingPod` function already tracks state via `lastInPlaceAttemptTimeMap` - -```golang -for vpa, livePods := range controlledPods { - // ... existing setup code ... 
- podsForInPlace := make([]*apiv1.Pod, 0) - podsForEviction := make([]*apiv1.Pod, 0) - - if updateMode == vpa_types.UpdateModeInPlace && inPlaceFeatureEnable { - // New mode: only in-place, never evict - podsForInPlace = u.getPodsUpdateOrder(filterNonInPlaceUpdatablePods(livePods, inPlaceLimiter), vpa) - inPlaceUpdatablePodsCounter.Add(vpaSize, len(podsForInPlace)) - } // rest of the code - - // ... existing counters ... - -for _, pod := range podsForInPlace { - withInPlaceUpdatable = true - decision := inPlaceLimiter.CanInPlaceUpdate(pod, updateMode) - - if decision == utils.InPlaceDeferred { - klog.V(2).InfoS("In-place update deferred, will retry in next loop", "pod", klog.KObj(pod)) - continue - } else if decision == utils.InPlaceEvict { - // Only add to eviction list if NOT in InPlace mode - if updateMode != vpa_types.UpdateModeInPlace { - podsForEviction = append(podsForEviction, pod) - } else { - klog.V(2).InfoS("In-place update would require eviction, but InPlace mode prevents it. 
Will retry later.", "pod", klog.KObj(pod)) - metrics_updater.RecordDeferredInPlaceUpdate(vpaSize, vpa.Name, vpa.Namespace, "EvictionPrevented") - } - continue - } - // rest of the code -} -``` +Retry is handled entirely by the Kubelet based on pod conditions: +- `PodResizePending` (reason: `Deferred`) - Kubelet will retry automatically +- `PodResizePending` (reason: `Infeasible`) - Kubelet will never retry +- `PodResizeInProgress` - Resize is being applied ## Test Plan From 19e34c0b54bb2f5815f6caf6a46e306e95edc5c9 Mon Sep 17 00:00:00 2001 From: Omer Aplatony Date: Sun, 16 Nov 2025 15:34:58 +0000 Subject: [PATCH 6/6] Use kubelet as the for retry Signed-off-by: Omer Aplatony --- .../enhancements/8818-in-place-only/README.md | 94 ++++++++++++++----- 1 file changed, 70 insertions(+), 24 deletions(-) diff --git a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md index 527f8aaf373c..56b39d694d5e 100644 --- a/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md +++ b/vertical-pod-autoscaler/enhancements/8818-in-place-only/README.md @@ -66,40 +66,86 @@ Add `UpdateModeInPlace` to the VPA types: const ( // ... existing modes ... // UpdateModeInPlace means that VPA will only attempt to update pods in-place - // and will never evict them. If in-place update fails, VPA will retry later. + // and will never evict them. If in-place update fails, VPA will rely on + // Kubelet's automatic retry mechanism. 
UpdateModeInPlace UpdateMode = "InPlace" ) ``` -The updater loop will handle the `InPlace` mode by never adding pods to `podsForEviction`, regardless of the decision from `CanInPlaceUpdate`: +Modify `CanInPlaceUpdate` to accommodate the new update mode: ```golang -for vpa, livePods := range controlledPods { - updateMode := vpa.Spec.UpdatePolicy.UpdateMode - - if updateMode == vpa_types.UpdateModeInPlace && inPlaceFeatureEnable { - podsForInPlace = u.getPodsUpdateOrder( - filterNonInPlaceUpdatablePods(livePods, inPlaceLimiter), vpa) +func (ip *PodsInPlaceRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod, updateMode vpa_types.UpdateMode) utils.InPlaceDecision { + switch updateMode { + case vpa_types.UpdateModeInPlaceOrRecreate: + if !features.Enabled(features.InPlaceOrRecreate) { + return utils.InPlaceEvict + } + case vpa_types.UpdateModeInPlace: + if !features.Enabled(features.InPlace) { + return utils.InPlaceEvict + } + default: + return utils.InPlaceEvict } - // ... rest of existing code - - for _, pod := range podsForInPlace { - decision := inPlaceLimiter.CanInPlaceUpdate(pod) - - if decision == utils.InPlaceDeferred { - // Kubelet will automatically retry - continue - } else if decision == utils.InPlaceEvict { - // Only evict for InPlaceOrRecreate mode - if updateMode == vpa_types.UpdateModeInPlaceOrRecreate { - podsForEviction = append(podsForEviction, pod) + + cr, present := ip.podToReplicaCreatorMap[getPodID(pod)] + if present { + singleGroupStats, present := ip.creatorToSingleGroupStatsMap[cr] + if pod.Status.Phase == apiv1.PodPending { + return utils.InPlaceDeferred + } + if present { + if isInPlaceUpdating(pod) { + // For InPlace mode we wait indefinitely for Kubelet + if updateMode == vpa_types.UpdateModeInPlace { + klog.V(4).InfoS("Pod is updating, waiting for completion (InPlace mode)", + "pod", klog.KObj(pod)) + return utils.InPlaceDeferred + } + + // For InPlaceOrRecreate mode, check timeout + canEvict := CanEvictInPlacingPod(pod, singleGroupStats, 
ip.lastInPlaceAttemptTimeMap, ip.clock) + if canEvict { + klog.V(2).InfoS("Pod update timed out, suggesting eviction", + "pod", klog.KObj(pod)) + return utils.InPlaceEvict + } + return utils.InPlaceDeferred + } + if singleGroupStats.isPodDisruptable() { + return utils.InPlaceApproved } - // For InPlace mode, do nothing - Kubelet handles retry - continue } + } + klog.V(4).InfoS("Can't in-place update pod, waiting for next loop", "pod", klog.KObj(pod)) + return utils.InPlaceDeferred +} +``` - // InPlaceApproved - proceed with update - // ... +The updater loop will handle the `InPlace` mode by never adding pods to `podsForEviction` as follows: + +```golang +for _, pod := range podsForInPlace { + decision := inPlaceLimiter.CanInPlaceUpdate(pod, updateMode) + + switch decision { + case utils.InPlaceDeferred: + klog.V(2).InfoS("In-place update deferred", "pod", klog.KObj(pod)) + continue + + case utils.InPlaceEvict: + // This should only happen for InPlaceOrRecreate mode + podsForEviction = append(podsForEviction, pod) + klog.V(2).InfoS("In-place update failed, falling back to eviction", + "pod", klog.KObj(pod)) + continue + + case utils.InPlaceApproved: + // Proceed with in-place update + if err := u.evictionRateLimiter.TryUpdate(pod, vpa); err != nil { + klog.V(2).InfoS("Failed to update pod", "pod", klog.KObj(pod), "error", err) + } } } ```