From 1dc509360b938fc48c04c22e95e3017febc4dea0 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 10 Oct 2025 11:42:58 -0700 Subject: [PATCH 01/41] Create draft-20251010-multipod-pdb.md --- .../draft-20251010-multipod-pdb.md | 830 ++++++++++++++++++ .../draft-20251010-multipod-pdb/kep.yaml | 51 ++ 2 files changed, 881 insertions(+) create mode 100644 keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md create mode 100644 keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md b/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md new file mode 100644 index 00000000000..81f4f25f8c3 --- /dev/null +++ b/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md @@ -0,0 +1,830 @@ + +# KEP-NNNN: Multipod PDB + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - 
[Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) within one minor version of promotion to GA +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: 
https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + + +## Motivation + + + +### Goals + + + +### Non-Goals + + + +## Proposal + + + +### User Stories (Optional) + + + +#### Story 1 + +#### Story 2 + +### Notes/Constraints/Caveats (Optional) + + + +### Risks and Mitigations + + + +## Design Details + + + +### Test Plan + + + +[ ] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + + + +##### Unit tests + + + + + +- ``: `` - `` + +##### Integration tests + + + + + +- [test name](https://github.com/kubernetes/kubernetes/blob/2334b8469e1983c525c0c6382125710093a25883/test/integration/...): [integration master](https://testgrid.k8s.io/sig-release-master-blocking#integration-master?include-filter-by-regex=MyCoolFeature), [triage search](https://storage.googleapis.com/k8s-triage/index.html?test=MyCoolFeature) + +##### e2e tests + + + +- [test name](https://github.com/kubernetes/kubernetes/blob/2334b8469e1983c525c0c6382125710093a25883/test/e2e/...): [SIG ...](https://testgrid.k8s.io/sig-...?include-filter-by-regex=MyCoolFeature), [triage search](https://storage.googleapis.com/k8s-triage/index.html?test=MyCoolFeature) + +### Graduation Criteria + + + +### Upgrade / Downgrade Strategy + + + +### Version Skew Strategy + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? 
+ +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? 
+ + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml b/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml new file mode 100644 index 00000000000..5dfddc15e73 --- /dev/null +++ b/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml @@ -0,0 +1,51 @@ +title: KEP Template +kep-number: NNNN +authors: + - "@jane.doe" +owning-sig: sig-xyz +participating-sigs: + - sig-aaa + - sig-bbb +status: provisional|implementable|implemented|deferred|rejected|withdrawn|replaced +creation-date: yyyy-mm-dd +reviewers: + - TBD + - "@alice.doe" +approvers: + - TBD + - "@oscar.doe" + +see-also: + - "/keps/sig-aaa/1234-we-heard-you-like-keps" + - "/keps/sig-bbb/2345-everyone-gets-a-kep" +replaces: + - "/keps/sig-ccc/3456-replaced-kep" + +# The target maturity stage in the current dev cycle for this KEP. +# If the purpose of this KEP is to deprecate a user-visible feature +# and a Deprecated feature gates are added, they should be deprecated|disabled|removed. +stage: alpha|beta|stable + +# The most recent milestone for which work toward delivery of this KEP has been +# done. 
This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.19" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.19" + beta: "v1.20" + stable: "v1.22" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: MyFeature + components: + - kube-apiserver + - kube-controller-manager +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - my_feature_metric From cd6a02c62d28b63787501edb3425f96469df39a7 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 13 Oct 2025 10:34:58 -0700 Subject: [PATCH 02/41] Update kep.yaml --- .../draft-20251010-multipod-pdb/kep.yaml | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml b/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml index 5dfddc15e73..d7c5a05d440 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml +++ b/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml @@ -1,41 +1,31 @@ -title: KEP Template -kep-number: NNNN +title: Multipod PDB +kep-number: TBD authors: - - "@jane.doe" -owning-sig: sig-xyz + - "@LogicalShark" +owning-sig: sig-apps participating-sigs: - - sig-aaa - - sig-bbb -status: provisional|implementable|implemented|deferred|rejected|withdrawn|replaced -creation-date: yyyy-mm-dd +status: provisional #provisional|implementable|implemented|deferred|rejected|withdrawn|replaced +creation-date: 2025-10-10 reviewers: - TBD - - "@alice.doe" approvers: - TBD - - "@oscar.doe" - -see-also: - - "/keps/sig-aaa/1234-we-heard-you-like-keps" - - "/keps/sig-bbb/2345-everyone-gets-a-kep" -replaces: - - "/keps/sig-ccc/3456-replaced-kep" # The target maturity stage in the current dev cycle for this KEP. 
# If the purpose of this KEP is to deprecate a user-visible feature # and a Deprecated feature gates are added, they should be deprecated|disabled|removed. -stage: alpha|beta|stable +stage: alpha # The most recent milestone for which work toward delivery of this KEP has been # done. This can be the current (upcoming) milestone, if it is being actively # worked on. -latest-milestone: "v1.19" +latest-milestone: "v1.36" # The milestone at which this feature was, or is targeted to be, at each stage. milestone: - alpha: "v1.19" - beta: "v1.20" - stable: "v1.22" + alpha: "v1.36" + beta: "v1.37" + stable: "v1.38" # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled From ed69c0e2ca359be719f66049957676f98c8fc186 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 13 Oct 2025 11:54:44 -0700 Subject: [PATCH 03/41] Update draft-20251010-multipod-pdb.md --- .../draft-20251010-multipod-pdb.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md b/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md index 81f4f25f8c3..4eaa2f6151d 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md @@ -173,6 +173,8 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> +Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal would allow it to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. We would add an optional field `replicaKey` to the PDB spec, so the PDB creator may provide a label to identify groups of pods which should be handled together. 
In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as all pods in a leader+workers group would share the same value for this label, and thus be identified as a single replica. + ## Motivation +The goal is to make PDBs usable for multi-pod replicas like a LWS, which has a leader and worker pods for use cases like distributed AI workloads. Currently, eviction or preemption of multiple pods may disturb pods across multiple LWS replicas, instead of the preferred outcome of evicting multiple pods from a single LWS replica. + ### Goals +- Add `replicaKey` to the PDB spec and implment the functionality of grouping pods for measuring availability +- Ensure all affected systems work as intended (kube-scheduler, cluster autoscaler, eviction API, any custom schedulers) + ### Non-Goals +- This feature will not affect involuntary disruptions like node failures or network partitions, only the voluntary eviction API + + ## Proposal -- This feature will not affect involuntary disruptions like node failures or network partitions, only the voluntary eviction API - +This feature will only affect the Eviction API. 
The following are involuntary disruptions and do not use the Eviction API: +- Manual pod deletion +- Cleanup from node deletion (pod garbage collector) +- Pod deletion from Deployments and StatefulSets +- Node failure +- Kubelet node-pressure eviction +- Taint manager deleting NoExecute tainted pods ## Proposal From c3486508715108f14bf98a897ef600140f18aacc Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Tue, 14 Oct 2025 10:42:52 -0700 Subject: [PATCH 06/41] Update draft-20251010-multipod-pdb.md --- .../draft-20251010-multipod-pdb.md | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md b/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md index 233acf3d445..69e4d25738b 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md @@ -233,10 +233,6 @@ the system. The goal here is to make this feel real for users without getting bogged down. --> -#### Story 1 - -#### Story 2 - ### Notes/Constraints/Caveats (Optional) -[ ] I/we understand the owners of the involved components may require updates to +[x] I/we understand the owners of the involved components may require updates to existing tests to make this code solid enough prior to committing the changes necessary to implement this enhancement. @@ -510,12 +506,12 @@ well as the [existing list] of feature gates. - [ ] Feature gate (also fill in values in `kep.yaml`) - Feature gate name: - Components depending on the feature gate: -- [ ] Other - - Describe the mechanism: +- [x] Other + - Describe the mechanism: the new field in the PDB spec will be optional - Will enabling / disabling the feature require downtime of the control - plane? + plane? No - Will enabling / disabling the feature require downtime or reprovisioning - of a node? + of a node? 
No ###### Does enabling the feature change any default behavior? @@ -524,6 +520,8 @@ Any change of default behavior may be surprising to users or break existing automations, so be extremely careful here. --> +It will change the behavior of the Eviction API and kube-scheduler, but should not affect any unrelated components. + ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? +Yes, update the PDB to remove the field. + ###### What happens if we reenable the feature if it was previously rolled back? +It will be enabled again, there should not be any disruptions. + ###### Are there any tests for feature enablement/disablement? +No + ### Rollout, Upgrade and Rollback Planning +No + ###### Will enabling / using this feature result in introducing new API types? +No + ###### Will enabling / using this feature result in any new calls to the cloud provider? +No + ###### Will enabling / using this feature result in increasing size or count of the existing API objects? +No ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? @@ -752,6 +763,7 @@ Think about adding additional work or introducing new steps in between [existing SLIs/SLOs]: https://git.k8s.io/community/sig-scalability/slos/slos.md#kubernetes-slisslos --> +No ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? @@ -764,6 +776,7 @@ This through this both in small and large cases, again with respect to the [supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md --> +No ###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? 
@@ -776,6 +789,7 @@ If any of the resources can be exhausted, how this is mitigated with the existin Are there any tests that were run/should be run to understand performance characteristics better and validate the declared limits? --> +No ### Troubleshooting @@ -792,6 +806,8 @@ details). For now, we leave it here. ###### How does this feature react if the API server and/or etcd is unavailable? +No different behavior + ###### What are other known failure modes? +None that are not already part of the Eviction API + ###### What steps should be taken if SLOs are not being met to determine the problem? ## Implementation History From 3ec123493ce669cbc24bfcb780547db92cefafcb Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Wed, 15 Oct 2025 10:10:08 -0700 Subject: [PATCH 07/41] Update and rename draft-20251010-multipod-pdb.md to README.md --- ...aft-20251010-multipod-pdb.md => README.md} | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) rename keps/sig-apps/draft-20251010-multipod-pdb/{draft-20251010-multipod-pdb.md => README.md} (94%) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md similarity index 94% rename from keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md rename to keps/sig-apps/draft-20251010-multipod-pdb/README.md index 69e4d25738b..d3557240d6f 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/draft-20251010-multipod-pdb.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,7 +173,7 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> -Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal would allow it to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. 
We would add an optional field `replicaKey` to the PDB spec, so the PDB creator may provide a label to identify groups of pods which should be handled together. In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as all pods in a leader+workers group would share the same value for this label, and thus be identified as a single replica. +Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal would allow it to treat multi-pod groups (e.g. LeaderWorkerSet [LWS] replicas) as if they were pod replicas. We would add an optional field `replicaKey` to the PDB spec, so the PDB creator may provide a label to identify groups of pods which should be handled together. In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as all pods in a leader+workers group would share the same value for this label, and thus be identified as a single replica. ## Motivation @@ -233,6 +233,25 @@ the system. The goal here is to make this feel real for users without getting bogged down. --> +If the user is not using LWS, their process will be unaffected. + +Using LWS, they would create a PDB like: +``` +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: lws-pdb +spec: + minAvailable: 4 + selector: + matchLabels: + leaderworkerset.sigs.k8s.io/name: mylws + replicaKey:"leaderworkerset.sigs.k8s.io/group-key" +``` + +With LWS replicas set up, all pods in the same group will have the same value under label key `leaderworkerset.sigs.k8s.io/group-key`. + + ### Notes/Constraints/Caveats (Optional) +#### Background on multi-pod replicas (LWS) + +We will take the LeaderWorkerSet (LWS) as an example of this system. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and each "worker" pod and the number of pods in the group. 
This is useful in cases where a leader process coordinates multiple worker processes, such as AI/ML workloads for distributed model training and inference. + +It works by keeping all worker pods in the same lifecycle: they are created and scheduled in parallel, and if any workers fail the group is considered failing. In this context, LWS "replicas" are not additional pods, but additional leader+workers pod groups. The user may also specify the number of worker pods within each pod group (`leaderWorkerTemplate.size`). For unique identification, each worker has an index, and each replica of the group has an index. + + ### Risks and Mitigations -The goal is to make PDBs usable for multi-pod replicas like a LWS, which has a leader and worker pods for use cases like distributed AI workloads. Currently, eviction or preemption of multiple pods may disturb pods across multiple LWS replicas, instead of the preferred outcome of evicting multiple pods from a single LWS replica. +The goal is to make PDBs usable for multi-pod replicas like a LWS, which has a leader and worker pods for use cases like distributed AI workloads. Currently, eviction or preemption of multiple pods may disturb pods across multiple LWS replicas, instead of the preferred outcome of evicting multiple pods from a single LWS replica. For workloads like a `LeaderWorkerSet`, the health of a replica depends on the simultaneous availability of all pods within its group. ### Goals @@ -195,8 +195,13 @@ List the specific goals of the KEP. What is it trying to achieve? How will we know that this has succeeded? --> -- Add `replicaKey` to the PDB spec and implment the functionality of grouping pods for measuring availability -- Ensure all affected systems work as intended (kube-scheduler, cluster autoscaler, eviction API, any custom schedulers) +The primary goal of this KEP is to extend the PodDisruptionBudget (PDB) to handle applications where a single logical replica is composed of multiple pods. 
This will allow the Eviction API to account for grouping during voluntary disruptions. +- Define availability for pod groups: allow application owners to define disruption budgets for multi-pod replicas rather than individual pods using a label. +- Enhance the PDB API: introduce optional field `replicaKey` to the `PodDisruptionBudget` spec. This field will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. +- Update eviction logic: modify the Eviction API to use the `replicaKey` for calculating availability. +- Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by the `cluster-autoscaler`, correctly adhere to the new group-based disruption budgets. +- Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior +- Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers) ### Non-Goals @@ -205,14 +210,21 @@ What is out of scope for this KEP? Listing non-goals helps to focus discussion and make progress. --> -This feature will only affect the Eviction API. The following are involuntary disruptions and do not use the Eviction API: +This change will only affect the Eviction API. The following are involuntary disruptions and do not use the Eviction API: - Manual pod deletion -- Cleanup from node deletion (pod garbage collector) -- Pod deletion from Deployments and StatefulSets -- Node failure -- Kubelet node-pressure eviction +- Pods being deleted by their owning controller (e.g., during Deployment rollout) +- Node failure +- Pod cleanup due to a node being removed from the cluster +- Evictions by the Kubelet due to node pressure (e.g. 
memory shortage) - Taint manager deleting NoExecute tainted pods +This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `LeaderWorkerSet`, etc. +- The workload controller will be responsible for the `replicaKey` label on pods it manages. We will not create any system for pod labeling or validation of groups. +- The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, we will only handle evictions. + +This change will not affect scheduling. +- It is out of scope to introduce any form of gang scheduling or pod affinity rules. We only handle eviction of already-scheduled pods. + ## Proposal +- This feature relies on the workload controller (which may be `LeaderWorkerSet` or some third-party custom controller) to correctly apply `replicaKey` labels. Bugs in the controller could cause mislabeled pods and incorrect eviction decisions, possibly violating availability requirements. +- One failing pod in a large group will make the group unavailable, so a small number of simultaneously failing pods across groups could prevent evictions and block a node drain. This is intended behavior. + ## Design Details -If the user is not using LWS, their process will be unaffected. +*Note: if the user is not using multi-pod replicas, their process will be unaffected.* -Using LWS, they would create a PDB like: -``` +#### Story 1: Distributed Workload + +An engineer runs distributed ML training jobs using a `LeaderWorkerSet`. Each replica of consists of one leader and multiple worker pods that run concurrently. If any pod in a group is evicted, the group fails and must be restarted. + +To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, the user must ensure that some number of training replicas remain available. If a disruption is required, it should evict an entire replica group, rather than pods across different replicas. 
+ +The user would create a PDB for `LeaderWorkerSet` pods with a `replicaKey`: + +```yaml apiVersion: policy/v1 kind: PodDisruptionBudget metadata: - name: lws-pdb + name: my-training-job-pdb spec: - minAvailable: 4 + minAvailable: 4 selector: matchLabels: - leaderworkerset.sigs.k8s.io/name: mylws - replicaKey:"leaderworkerset.sigs.k8s.io/group-key" + leaderworkerset.sigs.k8s.io/name: my-training-job + replicaKey: "leaderworkerset.sigs.k8s.io/group-key" ``` -With LWS replicas set up, all pods in the same group will have the same value under label key `leaderworkerset.sigs.k8s.io/group-key`. If the user runs `kubectl node drain`, it will use the Eviction API and the controller will -1. Select pods matching the PDB `selector`. -2. Group selected pods using the value of the `replicaKey` label. -3. Evict only if the resulting number of healthy groups does not violate the PDB `minAvailable`. A group is disrupted if any of its pods is evicted. +Upon node drain, the Eviction API will: +1. Select all pods matching `leaderworkerset.sigs.k8s.io/name: my-training-job`. +2. Group these pods into replicas based on their value for label key `leaderworkerset.sigs.k8s.io/group-key` +3. Determine the number of healthy replicas. +4. Evict only if the number of healthy replicas after eviction will be at least `minAvailable` (4). An entire group is considered disrupted if any of its pods are targeted for eviction. + +This way, the job can continue running with sufficient replicas even during cluster maintenance. + +#### Story 2: Cluster Maintenance + +A cluster administrator will frequently drain nodes for upgrades, security patches, etc. The cluster may have various workloads, including `LeaderWorkerSet` with specific availability requirements. + +To perform node drains safely without an application owner, this user may rely on the application owner's PDB as described in Story 1. 
The user may `kubectl drain `, and the Eviction API will automatically identify multi-pod replicas and ensure that the drain does not violate the application's `minAvailable` requirement. +This allows safe maintenance without causing outages, as the drain will pause if it cannot evict certain pods. If this happens, the user may wait for the application to become healthier, or contact the application owner to resolve. ### Notes/Constraints/Caveats (Optional) From 82eb0160d65afd67fde9f99d7cbe313b90b3f379 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Wed, 15 Oct 2025 16:51:32 -0700 Subject: [PATCH 10/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 108 +++++++++++++++++- 1 file changed, 104 insertions(+), 4 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index e1d050449e7..65e4f0e2d02 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,7 +173,7 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> -Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal would allow it to treat multi-pod groups (e.g. LeaderWorkerSet [LWS] replicas) as if they were pod replicas. We would add an optional field `replicaKey` to the PDB spec, so the PDB creator may provide a label to identify groups of pods which should be handled together. In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as all pods in a leader+workers group would share the same value for this label, and thus be identified as a single replica. +Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal is to allow them to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. 
We would add optional field `replicaKey` to the PDB spec, specifying a label which whose value would identify groups of pods that should be handled together. In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as all pods in a leader+workers group would share the same value for this label, and thus be identified as a single replica. ## Motivation @@ -188,6 +188,106 @@ demonstrate the interest in a KEP within the wider Kubernetes community. The goal is to make PDBs usable for multi-pod replicas like a LWS, which has a leader and worker pods for use cases like distributed AI workloads. Currently, eviction or preemption of multiple pods may disturb pods across multiple LWS replicas, instead of the preferred outcome of evicting multiple pods from a single LWS replica. For workloads like a `LeaderWorkerSet`, the health of a replica depends on the simultaneous availability of all pods within its group. +### Example + +```mermaid +graph TD + %% Define Styles for Setup Diagram + classDef node_box fill:#ececff,stroke:#9696ff,stroke-width:2px,color:#111 + classDef pod_box fill:#fff,stroke:#ccc,color:#111 + classDef replica_label fill:none,stroke:none,font-weight:bold + + subgraph "Physical Node Layout" + direction LR + + subgraph NodeA ["Node A"] + R0P0("Replica 0
Pod 0") + R0P1("Replica 0
Pod 1") + end + class NodeA node_box + + subgraph NodeB ["Node B"] + R0P2("Replica 0
Pod 2") + R1P0("Replica 1
Pod 0") + end + class NodeB node_box + + subgraph NodeC ["Node C"] + R1P1("Replica 1
Pod 1") + R1P2("Replica 1
Pod 2") + end + class NodeC node_box + + class R0P0,R0P1,R0P2,R1P0,R1P1,R1P2 pod_box + end + + %% Logical Groupings (defined at top level) + R0("Replica 0") + R1("Replica 1") + class R0,R1 replica_label + + R0 -.-> R0P0 + R0 -.-> R0P1 + R0 -.-> R0P2 + + R1 -.-> R1P0 + R1 -.-> R1P1 + R1 -.-> R1P2 + + %% Style all links (0 to 5) + linkStyle 0,1,2,3,4,5 stroke:#888,stroke-dasharray: 5 5,stroke-width:2px +``` + +Assume you have 2 replicas of 3 pods each. There are 3 nodes (A, B, C) which can each host 2 pods: node A hosts replica 0 pods 0 and 1, node B host replica 0 pod 2 and replica 1 pod 0, and node C hosts replica 1 pods 1 and 2 (see diagram). You would like a PDB to protect at least one replica, and so put `minAvailable: 3` in the PDB spec. Currently, a node drain on node B would see that there will be 4 pods remaining and evicts all pods from node B, failing a pod in both replicas, and if any pod fails in a replica, the replica fails. Technically the PDB was honored, but the intent was to keep a replica running. With this KEP, the user would declare `minAvailable: 1` and `replicaKey: "leaderworkerset.sigs.k8s.io/group-key"` in the PDB. The drain would identify the LWS replicas and determine that evicting the pods in node C would cause both replicas to fail and violate the PDB, and would safely stop before eviction. + +```mermaid +graph TD + %% Define Styles for Flowchart Diagram + classDef action fill:#e6f3ff,stroke:#66b3ff,stroke-width:2px,color:#111 + classDef decision fill:#fff0e6,stroke:#ff9933,stroke-width:2px,color:#111 + classDef pdb_spec fill:#ffccff,stroke:#cc00cc,stroke-width:2px,color:#111 + classDef outcome_bad fill:#fff0f0,stroke:#ffaaaa,stroke-width:2px,color:#111 + classDef outcome_good fill:#f0fff0,stroke:#aaffaa,stroke-width:2px,color:#111 + classDef process fill:#f0f0f0,stroke:#ccc,color:#111 + + StartDrain("kubectl drain
node-b initiated") + class StartDrain action + + StartDrain --> PDB_Type{Which PDB is active?} + + PDB_Type -- "Traditional PDB" --> PDB_Old(PDB Spec:
minAvailable 3 pods) + class PDB_Old pdb_spec + + PDB_Type -- "Multipod PDB (with KEP)" --> PDB_New(PDB Spec:
minAvailable 1 replica,
replicaKey: ...group-key) + class PDB_New pdb_spec + + %% --- Traditional PDB Flow --- + PDB_Old --> CalcPods(Calculate
available pods) + class CalcPods process + + CalcPods --> CheckPods{Are remaining pods
>= 3?} + class CheckPods decision + + CheckPods -- "Yes (4 >= 3)" --> DrainSuccess("Drain Proceeds:
Node B pods evicted") + class DrainSuccess action + + DrainSuccess --> AppDown("Application State:
Both replicas fail
(Technically PDB honored,
but intent violated)") + class AppDown outcome_bad + + %% --- Multipod PDB Flow --- + PDB_New --> CalcReplicas(Calculate
available replicas) + class CalcReplicas process + + CalcReplicas --> CheckReplicas{Are remaining replicas
>= 1?} + class CheckReplicas decision + + CheckReplicas -- "No (0 >= 1)" --> DrainBlocked("Drain Blocked:
Eviction prevented") + class DrainBlocked action + + DrainBlocked --> AppHealthy("Application State:
Both replicas healthy
(PDB intent
fully protected)") + class AppHealthy outcome_good +``` + ### Goals + +The primary goal of this KEP is to extend the PodDisruptionBudget (PDB) to handle applications where a single logical replica is composed of multiple pods. This will allow the Eviction API to account for grouping during voluntary disruptions. +- Define availability for pod groups: allow application owners to define disruption budgets for multi-pod replicas rather than individual pods using a label. +- Enhance the PDB API: introduce optional field `replicaKey` to the `PodDisruptionBudget` spec. This field will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. +- Update eviction logic: modify the Eviction API to use the `replicaKey` for calculating availability. +- Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by the `cluster-autoscaler`, correctly adhere to the new group-based disruption budgets. +- Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior +- Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers) + +### Non-Goals + + + +This change will only affect the Eviction API. The following are involuntary disruptions and do not use the Eviction API: +- Manual pod deletion +- Pods being deleted by their owning controller (e.g. during Deployment rollout) +- Node failure +- Pod cleanup due to a node being removed from the cluster +- Evictions by the Kubelet due to node pressure (e.g. memory shortage) +- Taint manager deleting NoExecute tainted pods + +This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `LeaderWorkerSet`, etc. +- The workload controller will be responsible for the `replicaKey` label on pods it manages. 
We will not create any system for pod labeling or validation of groups. +- The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, we will only handle evictions. + +This change will not affect scheduling. +- It is out of scope to introduce any form of gang scheduling or pod affinity rules. We only handle eviction of already-scheduled pods. + +Partial replica failure +- We will assume that the replica is a single unit and can be considered failing if any pod in it is failing. In this KEP there is not a plan for systems in which the replica may still be healthy even with some percentage of pods failing. + + +## Proposal + + + +### User Stories (Optional) + + + +*Note: if the user is not using multi-pod replicas, their process will be unaffected.* + +#### Story 1: Distributed Workload + +An engineer is running distributed ML training jobs using a `LeaderWorkerSet`. Each replica consists of one leader and multiple worker pods that run concurrently. If any pod in a group is evicted, the group fails and must be restarted. + +To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, the user must ensure that some number of training replicas remain available. If a disruption is required, it should evict an entire replica group, rather than pods across different replicas. + +The user would create a PDB for `LeaderWorkerSet` pods with a `replicaKey`: + +```yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: my-training-job-pdb +spec: + minAvailable: 4 + selector: + matchLabels: + leaderworkerset.sigs.k8s.io/name: my-training-job + replicaKey: "leaderworkerset.sigs.k8s.io/group-key" +``` + +Upon node drain, the Eviction API will: +1. Select all pods matching `leaderworkerset.sigs.k8s.io/name: my-training-job`. +2. Group these pods into replicas based on their value for label key `leaderworkerset.sigs.k8s.io/group-key` +3. Determine the number of healthy replicas. +4. 
Evict only if the number of healthy replicas after eviction will be at least `minAvailable` (4). An entire group is considered disrupted if any of its pods are targeted for eviction. + +This way, the job can continue running with sufficient replicas even during cluster maintenance. + +#### Story 2: Cluster Maintenance + +A cluster administrator will frequently drain nodes for upgrades, security patches, etc. The cluster may have various workloads, including `LeaderWorkerSet` with specific availability requirements. + +To perform node drains safely without an application owner, this user may rely on the application owner's PDB as described in Story 1. The user may `kubectl drain `, and the Eviction API will automatically identify multi-pod replicas and ensure that the drain does not violate the application's `minAvailable` requirement. + +This allows safe maintenance without causing outages, as the drain will pause if it cannot evict certain pods. If this happens, the user may wait for the application to become healthier, or contact the application owner to resolve. + +#### Setup Example ```mermaid graph TD @@ -197,7 +297,7 @@ graph TD classDef pod_box fill:#fff,stroke:#ccc,color:#1a1a1a classDef replica_label fill:none,stroke:none,font-weight:bold,color:#f0f0f0 - subgraph "Physical Node Layout" + subgraph "Physical Node Setup" direction LR subgraph NodeA ["Node A"] @@ -242,7 +342,11 @@ graph TD linkStyle 0,1,2,3,4,5 stroke:#888,stroke-dasharray: 5 5,stroke-width:2px ``` -Assume you have 2 replicas of 3 pods each. There are 3 nodes (A, B, C) which can each host 2 pods: node A hosts replica 0 pods 0 and 1, node B host replica 0 pod 2 and replica 1 pod 0, and node C hosts replica 1 pods 1 and 2 (see diagram). You would like a PDB to protect at least one replica, and so put `minAvailable: 3` in the PDB spec. 
Currently, a node drain on node B would see that there will be 4 pods remaining and evicts all pods from node B, failing a pod in both replicas, and if any pod fails in a replica, the replica fails. Technically the PDB was honored, but the intent was to keep a replica running. With this KEP, the user would declare `minAvailable: 1` and `replicaKey: "leaderworkerset.sigs.k8s.io/group-key"` in the PDB. The drain would identify the LWS replicas and determine that evicting the pods in node C would cause both replicas to fail and violate the PDB, and would safely stop before eviction. +Assume there are 2 LWS replicas of 3 pods each, and 3 nodes which each host 2 pods. As in the diagram, node A hosts replica 0 pods 0 and 1, node B host replica 0 pod 2 and replica 1 pod 0, and node C hosts replica 1 pods 1 and 2. + +To protect at least one 3-pod replica in the current system, a user could try a PDB with `minAvailable: 3`. A node drain on node B would see that there will still be 4 pods remaining afterwards, and would evict replica 0 pod 2 and replica 1 pod 0 pods from node B, failing one pod in each replica. Technically the PDB was honored, but now both replicas have a failing pod and they both fail. + +After the change, a PDB with `minAvailable: 1` and `replicaKey: "leaderworkerset.sigs.k8s.io/group-key"` would identify the LWS replicas, and determine that evicting the pods in node B would cause both replicas to fail. This violates the PDB, and the node drain would safely stop before eviction. ```mermaid graph TD @@ -292,101 +396,6 @@ graph TD class AppHealthy outcome_good ``` -### Goals - - - -The primary goal of this KEP is to extend the PodDisruptionBudget (PDB) to handle applications where a single logical replica is composed of multiple pods. This will allow the Eviction API to account for grouping during voluntary disruptions. 
-- Define availability for pod groups: allow application owners to define disruption budgets for multi-pod replicas rather than individual pods using a label. -- Enhance the PDB API: introduce optional field `replicaKey` to the `PodDisruptionBudget` spec. This field will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. -- Update eviction logic: modify the Eviction API to use the `replicaKey` for calculating availability. -- Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by the `cluster-autoscaler`, correctly adhere to the new group-based disruption budgets. -- Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior -- Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers) - -### Non-Goals - - - -This change will only affect the Eviction API. The following are involuntary disruptions and do not use the Eviction API: -- Manual pod deletion -- Pods being deleted by their owning controller (e.g. during Deployment rollout) -- Node failure -- Pod cleanup due to a node being removed from the cluster -- Evictions by the Kubelet due to node pressure (e.g. memory shortage) -- Taint manager deleting NoExecute tainted pods - -This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `LeaderWorkerSet`, etc. -- The workload controller will be responsible for the `replicaKey` label on pods it manages. We will not create any system for pod labeling or validation of groups. -- The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, we will only handle evictions. - -This change will not affect scheduling. 
-- It is out of scope to introduce any form of gang scheduling or pod affinity rules. We only handle eviction of already-scheduled pods. - -## Proposal - - - -### User Stories (Optional) - - - -*Note: if the user is not using multi-pod replicas, their process will be unaffected.* - -#### Story 1: Distributed Workload - -An engineer is running distributed ML training jobs using a `LeaderWorkerSet`. Each replica consists of one leader and multiple worker pods that run concurrently. If any pod in a group is evicted, the group fails and must be restarted. - -To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, the user must ensure that some number of training replicas remain available. If a disruption is required, it should evict an entire replica group, rather than pods across different replicas. - -The user would create a PDB for `LeaderWorkerSet` pods with a `replicaKey`: - -```yaml -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: my-training-job-pdb -spec: - minAvailable: 4 - selector: - matchLabels: - leaderworkerset.sigs.k8s.io/name: my-training-job - replicaKey: "leaderworkerset.sigs.k8s.io/group-key" -``` - -Upon node drain, the Eviction API will: -1. Select all pods matching `leaderworkerset.sigs.k8s.io/name: my-training-job`. -2. Group these pods into replicas based on their value for label key `leaderworkerset.sigs.k8s.io/group-key` -3. Determine the number of healthy replicas. -4. Evict only if the number of healthy replicas after eviction will be at least `minAvailable` (4). An entire group is considered disrupted if any of its pods are targeted for eviction. - -This way, the job can continue running with sufficient replicas even during cluster maintenance. - -#### Story 2: Cluster Maintenance - -A cluster administrator will frequently drain nodes for upgrades, security patches, etc. The cluster may have various workloads, including `LeaderWorkerSet` with specific availability requirements. 
- -To perform node drains safely without an application owner, this user may rely on the application owner's PDB as described in Story 1. The user may `kubectl drain `, and the Eviction API will automatically identify multi-pod replicas and ensure that the drain does not violate the application's `minAvailable` requirement. - -This allows safe maintenance without causing outages, as the drain will pause if it cannot evict certain pods. If this happens, the user may wait for the application to become healthier, or contact the application owner to resolve. ### Notes/Constraints/Caveats (Optional) @@ -411,7 +420,8 @@ If a PDB specifies a `replicaKey` but the `selector` matches pods that are missi A group is considered available only if all pods within that group are available (e.g. `Running` and `Ready`). If any pod within a group is unavailable for before an eviction is attempted, the entire group is considered unavailable. An eviction request for a pod in a healthy group may be denied if other groups are unhealthy, even if the pods in the unhealthy groups are not eviction targets. -#### Labeling +#### Other systems +While LWS is the primary use case and is given as the example in this KEP, this change is not exclusive to LWS and works with any other multi-pod replica systems which use labels. ### Risks and Mitigations From dcc66c27935bb00f2950002d782969a3a4d933fe Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 17 Oct 2025 13:43:03 -0700 Subject: [PATCH 13/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index b195d887342..6098f9ee881 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -412,18 +412,6 @@ We will take the LeaderWorkerSet (LWS) as an example of this system. 
The LWS API It works by keeping all worker pods in the same lifecycle: they are created and scheduled in parallel, and if any workers fail the group is considered failing. In this context, LWS "replicas" are not additional pods, but additional leader+workers pod groups. The user may also specify the number of worker pods within each pod group (`leaderWorkerTemplate.size`). For unique identification, each worker has an index, and each replica of the group has an index. -#### Pods without the `replicaKey` label - -If a PDB specifies a `replicaKey` but the `selector` matches pods that are missing this label, those pods will each be treated as normal replicas (equivalent to a group with size one). - -#### Group Health - -A group is considered available only if all pods within that group are available (e.g. `Running` and `Ready`). If any pod within a group is unavailable for before an eviction is attempted, the entire group is considered unavailable. An eviction request for a pod in a healthy group may be denied if other groups are unhealthy, even if the pods in the unhealthy groups are not eviction targets. - -#### Other systems -While LWS is the primary use case and is given as the example in this KEP, this change is not exclusive to LWS and works with any other multi-pod replica systems which use labels. - - ### Risks and Mitigations - This feature relies on the workload controller (which may be `LeaderWorkerSet` or some third-party custom controller) to correctly apply `replicaKey` labels. Bugs in the controller could cause mislabeled pods and incorrect eviction decisions, possibly violating availability requirements. -- One failing pod in a large group will make the group unavailable, so a small number of simultaneously failing pods across groups could prevent evictions and block a node drain. This is intended behavior. 
+- One failing pod in a large group will make the group unavailable, so a small number of simultaneously failing pods across groups could prevent evictions and block a node drain. This is intended behavior, but not necessarily obvious. ## Design Details @@ -450,6 +438,31 @@ required) or even code snippets. If there's any ambiguity about HOW your proposal will be implemented, this is the place to discuss them. --> +#### Pods without the `replicaKey` label + +If a PDB specifies a `replicaKey`, but the `selector` matches a pod that is missing the label, the pod will be treated as an unhealthy replica with size 1, as this should only be the result of a pod which had been incorrectly labeled or somehow in a malformed group. Even if the pod is technically healthy, we mark it as unavailable so that the Eviction API does not perceive it as an additional available replica for the PDB budget, and proceed with an otherwise unsafe node drain. An empty value (e.g. `leaderworkerset.sigs.k8s.io/group-key: ""`), as it would likely be caused by an error, will be treated as unlabeled (i.e. unhealthy). + +#### Group Health + +A group is considered available only if all pods within that group are available (e.g. `Running` and `Ready`). If any pod within a group is unavailable before an eviction is attempted, the entire group is considered unavailable. An eviction request for a pod in a healthy group may be denied if other groups are unhealthy, even if the pods in the unhealthy groups are not eviction targets. + +#### Missing pods from a group + +- If a pod is missing (not failing) from a group, we would not know that the group is unhealthy if there is no indication as to the desired replica size. +- For this, we add `replicaSizeKey`, as the key whose value is the replica size. In LWS, `leaderworkerset.sigs.k8s.io/size` is set in all pods, and any group with an incorrect number of pods can be marked unhealthy.
+ +#### Total replicas + +- We will look at all selected pods at the time of checking for PDB availability, which is sufficient for an absolute number, e.g. `minAvailable=4` or `maxUnavailable=1`. For a percentage, e.g. `minAvailable=80%`, we would need to know the total number of replicas desired. +- Currently, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. To include `LeaderWorkerSet` would require hard-coding it as another recognized kind (creating a dependency on an non-core extension), or more significant changes to allow any kind of object to be recognized. +- Alternatively, we could add field `totalReplicasKey`, which would provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. We would have to rely on the specific implementation details of LWS to extract the replicas count. +- It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. + +#### Other systems + +While LWS is the primary use case and is given as the example in this KEP, this change is not exclusive to LWS and works with any other multi-pod replica systems which use labels. + + ### Test Plan -Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal is to allow them to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. We would add optional field `replicaKey` to the PDB spec, specifying a label which whose value would identify groups of pods that should be handled together. In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as all pods in a leader+workers group would share the same value for this label, and thus be identified as a single replica. +Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. 
This proposal is to allow them to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. We would add optional fields `replicaKey` and `replicaSizeKey` to the PDB spec. `replicaKey` would specify a label whose value would identify groups of pods that should be handled together (i.e. all pods with the same value are considered one replica). In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as each group has a unique value which is given to all pods in it. `replicaSizeKey` is needed to provide the size of a pod group, as it would otherwise be impossible to know there is a missing pod (e.g. there are 3 healthy pods, but the intended group size is 4, so the group should be marked unhealthy). ## Motivation @@ -197,11 +197,10 @@ know that this has succeeded? The primary goal of this KEP is to extend the PodDisruptionBudget (PDB) to handle applications where a single logical replica is composed of multiple pods. This will allow the Eviction API to account for grouping during voluntary disruptions. - Define availability for pod groups: allow application owners to define disruption budgets for multi-pod replicas rather than individual pods using a label. -- Enhance the PDB API: introduce optional field `replicaKey` to the `PodDisruptionBudget` spec. This field will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. -- Update eviction logic: modify the Eviction API to use the `replicaKey` for calculating availability. -- Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by the `cluster-autoscaler`, correctly adhere to the new group-based disruption budgets. +- Enhance the PDB API: introduce optional fields `replicaKey` and `replicaSizeKey` to the `PodDisruptionBudget` spec.
`replicaKey` will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. `replicaSizeKey` will specify a key for the size of the group which the pod is a member of. +- Update eviction logic: modify the Eviction API to use the `replicaKey` pod replicas for calculating availability. +- Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by `cluster-autoscaler`, follow the group-based disruption budgets. Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers). - Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior -- Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers) ### Non-Goals @@ -219,7 +218,7 @@ This change will only affect the Eviction API. The following are involuntary dis - Taint manager deleting NoExecute tainted pods This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `LeaderWorkerSet`, etc. -- The workload controller will be responsible for the `replicaKey` label on pods it manages. We will not create any system for pod labeling or validation of groups. +- The workload controller will be responsible for the `replicaKey` label and all other labels and annotations on pods it manages. We will not create any system for pod labeling or validation of groups. - The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, we will only handle evictions. This change will not affect scheduling. 
@@ -270,6 +269,7 @@ spec: matchLabels: leaderworkerset.sigs.k8s.io/name: my-training-job replicaKey: "leaderworkerset.sigs.k8s.io/group-key" + replicaSizeKey: "leaderworkerset.sigs.k8s.io/size" ``` Upon node drain, the Eviction API will: @@ -346,7 +346,7 @@ Assume there are 2 LWS replicas of 3 pods each, and 3 nodes which each host 2 po To protect at least one 3-pod replica in the current system, a user could try a PDB with `minAvailable: 3`. A node drain on node B would see that there will still be 4 pods remaining afterwards, and would evict replica 0 pod 2 and replica 1 pod 0 pods from node B, failing one pod in each replica. Technically the PDB was honored, but now both replicas have a failing pod and they both fail. -After the change, a PDB with `minAvailable: 1` and `replicaKey: "leaderworkerset.sigs.k8s.io/group-key"` would identify the LWS replicas, and determine that evicting the pods in node B would cause both replicas to fail. This violates the PDB, and the node drain would safely stop before eviction. +After the change, a PDB with `minAvailable: 1` and `replicaKey` set would identify the LWS replicas, and determine that evicting the pods in node B would cause both replicas to fail. This violates the PDB, and the node drain would safely stop before eviction. ```mermaid graph TD @@ -442,6 +442,9 @@ proposal will be implemented, this is the place to discuss them. If a PDB specifies a `replicaKey`, but the `selector` matches a pod that is missing the label, the pod will be treated as an unhealthy replica with size 1, as this should only be the result of a pod which had been incorrectly labeled or somehow in a malformed group. Even if the pod is technically healthy, we mark it as unavailable so that the Eviction API does not percieve it as an additional available replica for the PDB budget, and proceed with an otherwise unsafe node drain. An empty value (e.g. 
`leaderworkerset.sigs.k8s.io/group-key: ""`), as it would likely be caused by an error, will be treated as unlabeled (i.e. unhealthy). +#### Labels vs. Annotations +LWS uses a label for the group id but an annotation for the group size. When provided with `replicaKey` and `replicaSizeKey`, we will check for both labels and annotations in case other implementations of multi-pod replicas use a different label/annotation setup. If both are set and not equal, we will default to marking the pod as unhealthy. + #### Group Health A group is considered available only if all pods within that group are available (e.g. `Running` and `Ready`). If any pod within a group is unavailable for before an eviction is attempted, the entire group is considered unavailable. An eviction request for a pod in a healthy group may be denied if other groups are unhealthy, even if the pods in the unhealthy groups are not eviction targets. From 5984c912274ff6335365db42597ed8ef5e7bd382 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 17 Oct 2025 14:02:51 -0700 Subject: [PATCH 15/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index ca6e8858b41..2fe54ba58ce 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -451,15 +451,16 @@ A group is considered available only if all pods within that group are available #### Missing pods from a group -- If a pod is missing (not failing) from a group, we would not know that the group is unhealthy if there is no indication as to the desired replica size. -- For this, we add `replicaSizeKey`, as the key whose value is the replica size. In LWS, `leaderworkerset.sigs.k8s.io/size` is set in all pods, and any group with an incorrect number of pods can be marked unhealthy. 
+- If a pod is missing (not failing) from a group, we would not know that the group is unhealthy without having the desired replica size. +- This is why `replicaSizeKey` is needed, and specifies the key whose value is the size of the replica which a pod belongs to. In LWS, `leaderworkerset.sigs.k8s.io/size` is set in all pods. +- Any group with an incorrect number of pods can be marked unhealthy, as if it had a failing pod. -#### Total replicas +#### Percentage of total replicas - We will look at all selected pods at the time of checking for PDB availability, which is sufficient for an absolute number, e.g. `minAvailable=4` or `maxUnavailable=1`. For a percentage, e.g. `minAvailable=80%`, we would need to know the total number of replicas desired. - Currently, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. To include `LeaderWorkerSet` would require hard-coding it as another recognized kind (creating a dependency on a non-core extension), or more significant changes to allow any kind of object to be recognized. -- Alternatively, we could add field `totalReplicasKey`, which would provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. We would have to rely on the specific implementation details of LWS to extract the replicas count. -- It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. +- It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, we can add `totalReplicasKey`.
#### Other systems From 2ebcf62f318dbb60da6aaf235942c7329f77cebe Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 17 Oct 2025 14:19:05 -0700 Subject: [PATCH 16/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 2fe54ba58ce..b7bbce2fa41 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,7 +173,13 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> -Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal is to allow them to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. We would add optional fields `replicaKey` and `replicaSizeKey` to the PDB spec. `replicaKey` to specify a label which whose value would identify groups of pods that should be handled together (i.e. all pods with the same value are considered one replica). In the example of LWS, this is `leaderworkerset.sigs.k8s.io/group-key`, as each group has a unique value which is given to all pods in it. `replicaSizeKey` is needed to provide the size of a pod group, as it would otherwise be impossible to know there is a missing pod (e.g. there are 3 healthy pods, but the intended group size is 4, so the group should be marked unhealthy). +Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal is to allow them to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. We would add optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the PDB spec. + +`replicaKey` will specify a label which whose value would identify groups of pods that should be handled together (i.e. 
all pods with the same value are considered one replica). In the example of a LeaderWorkerSet (LWS), this is `leaderworkerset.sigs.k8s.io/group-key`, as each group has a unique value which is given to all pods in the group. + +`replicaSizeKey` is needed to provide the size of each pod group, as it would otherwise be impossible to know there is a missing pod. For example, if we see 3 healthy pods but the intended group size is 4, the group should be marked unhealthy. + +`replicaTotalKey` is needed to provide the total number of desired replicas, so that we know whether a percentage-based PDB is met. For example, we don't know if two healthy replicas are sufficient for a PDB of `minAvailable: 50%` unless we know that the total desired replicas is <=4. ## Motivation @@ -459,8 +465,8 @@ A group is considered available only if all pods within that group are available - We will look at all selected pods at the time of checking for PDB availability, which is sufficient for an absolute number, e.g. `minAvailable=4` or `maxUnavailable=1`. For a percentage, e.g. `minAvailable=80%`, we would need to know the total number of replicas desired. - Currently, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. To include `LeaderWorkerSet` would require hard-coding it as another recognized kind (creating a dependency on an non-core extension), or more significant changes to allow any kind of object to be recognized. -- Alternatively, we could add field `totalReplicasKey`, which would provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. We would have to rely on the specific implementation details of LWS to extract the replicas count. -- It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, we can add `totalReplicasKey`. 
However, until this is added, we are not able to support percentage-based PDBs. +- We add field `replicaTotalKey` to provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. To accomodate this, we would have to rely on the specific implementation details of LWS to extract the replicas count. +- It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS will be compatible with `totalReplicasKey`. However, until this change is made, LWS is not able to support percentage-based PDBs. #### Other systems From 58fac7e80202424d095c6393dd1fa83c97fddc16 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 17 Oct 2025 14:21:52 -0700 Subject: [PATCH 17/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index b7bbce2fa41..1227c4c70c2 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -203,7 +203,7 @@ know that this has succeeded? The primary goal of this KEP is to extend the PodDisruptionBudget (PDB) to handle applications where a single logical replica is composed of multiple pods. This will allow the Eviction API to account for grouping during voluntary disruptions. - Define availability for pod groups: allow application owners to define disruption budgets for multi-pod replicas rather than individual pods using a label. -- Enhance the PDB API: introduce optional fields `replicaKey` and `replicaSizeKey` to the `PodDisruptionBudget` spec. `replicaKey` will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. 
`replicaSizeKey` will specify a key for the size of the group which the pod is a member of. +- Enhance the PDB API: introduce optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the `PodDisruptionBudget` spec. `replicaKey` will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. `replicaSizeKey` will specify a key for the size of the group which the pod is a member of. `replicaTotalKey` will be a key for the number of total desired replicas. - Update eviction logic: modify the Eviction API to use the `replicaKey` pod replicas for calculating availability. - Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by `cluster-autoscaler`, follow the group-based disruption budgets. Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers). - Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior @@ -468,6 +468,7 @@ A group is considered available only if all pods within that group are available - We add field `replicaTotalKey` to provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. To accommodate this, we would have to rely on the specific implementation details of LWS to extract the replicas count. - It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS will be compatible with `totalReplicasKey`. However, until this change is made, LWS is not able to support percentage-based PDBs.
+ #### Other systems While LWS is the primary use case and is given as the example in this KEP, this change is not exclusive to LWS and works with any other multi-pod replica systems which use labels. From 1b51b94d1dde6f210874a17a734abd6d831aeb33 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 17 Oct 2025 14:27:54 -0700 Subject: [PATCH 18/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 1227c4c70c2..419595e10e6 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -468,6 +468,15 @@ A group is considered available only if all pods within that group are available - We add field `replicaTotalKey` to provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. To accomodate this, we would have to rely on the specific implementation details of LWS to extract the replicas count. - It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS will be compatible with `totalReplicasKey`. However, until this change is made, LWS is not able to support percentage-based PDBs. +#### Required fields +- To use multi-pod replicas, `replicaKey` must be specified. 
+ +Log a warning if: +- `replicaSizeKey` is not specified (we will then assume that missing pods are not an issue) + +Error if: +- `replicaSizeKey` or `replicaTotalKey` are specified are without `replicaKey` +- `replicaTotalKey` is not specified and a percentage-based PDB is used #### Other systems From 037a60c4d74f07b9fc1097538c280b77d9f9656d Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 11:26:56 -0700 Subject: [PATCH 19/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 419595e10e6..08bb83803ce 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -457,26 +457,25 @@ A group is considered available only if all pods within that group are available #### Missing pods from a group -- If a pod is missing (not failing) from a group, we would not know that the group is unhealthy without having the desired replica size. -- This is why `replicaSizeKey` is needed, and specifies the key whose value is the size of the replica which a pod belongs to. In LWS, `leaderworkerset.sigs.k8s.io/size` is set in all pods. -- Any group with an incorrect number of pods can be marked unhealthy, as if it had a failing pod. +If a pod is missing (not failing) from a group, we would not know that the group is unhealthy without having the desired replica size. This is why `replicaSizeKey` is needed, and specifies the key whose value is the size of the replica which a pod belongs to. In LWS, `leaderworkerset.sigs.k8s.io/size` is set in all pods. Any group with an incorrect number of pods can be marked unhealthy, as if it had a failing pod. 
#### Percentage of total replicas -- We will look at all selected pods at the time of checking for PDB availability, which is sufficient for an absolute number, e.g. `minAvailable=4` or `maxUnavailable=1`. For a percentage, e.g. `minAvailable=80%`, we would need to know the total number of replicas desired. -- Currently, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. To include `LeaderWorkerSet` would require hard-coding it as another recognized kind (creating a dependency on an non-core extension), or more significant changes to allow any kind of object to be recognized. -- We add field `replicaTotalKey` to provide the number of replicas. Unfortunately, in LWS, the `leaderworkerset.sigs.k8s.io/replicas` annotation is only in the leader pod's StatefulSet. To accomodate this, we would have to rely on the specific implementation details of LWS to extract the replicas count. -- It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS will be compatible with `totalReplicasKey`. However, until this change is made, LWS is not able to support percentage-based PDBs. +If we only look at all selected pods to check for PDB availability, it is sufficient for an absolute number of availability, e.g. `minAvailable=4`. However, without knowing the total number of replicas desired, there could be pods missing (making them "unavailable" but not detected). This means we wouldn't know when a percentage (e.g. `minAvailable=80%`) or any `maxUnavailable` is violated. + +To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. `LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on an non-core extension). 
It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, currently investigating how difficult this is. + +The alternative is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS would be compatible with `totalReplicasKey`. However, until this change is made, LWS would be unable to support percentage-based PDBs or `maxUnavailable`. #### Required fields -- To use multi-pod replicas, `replicaKey` must be specified. +To use multi-pod replicas, `replicaKey` must be specified. Log a warning if: - `replicaSizeKey` is not specified (we will then assume that missing pods are not an issue) +- `replicaTotalKey` is not specified and a percentage-based PDB or `maxUnavailable` is used Error if: - `replicaSizeKey` or `replicaTotalKey` are specified are without `replicaKey` -- `replicaTotalKey` is not specified and a percentage-based PDB is used #### Other systems From 36dabc2ed8be5a430bde6d72772bddf277487ef9 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 11:27:28 -0700 Subject: [PATCH 20/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 08bb83803ce..5578c760234 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -465,7 +465,7 @@ If we only look at all selected pods to check for PDB 
availability, it is suffic To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. `LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on an non-core extension). It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, currently investigating how difficult this is. -The alternative is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS would be compatible with `totalReplicasKey`. However, until this change is made, LWS would be unable to support percentage-based PDBs or `maxUnavailable`. +The alternative is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS would be compatible with `replicaTotalKey`. However, until this change is made, LWS would be unable to support percentage-based PDBs or `maxUnavailable`. #### Required fields To use multi-pod replicas, `replicaKey` must be specified. 
From 45e461366a2afb85566cebb68e91d0428fe81d4a Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 11:38:44 -0700 Subject: [PATCH 21/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 5578c760234..15226125ec7 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -463,7 +463,7 @@ If a pod is missing (not failing) from a group, we would not know that the group If we only look at all selected pods to check for PDB availability, it is sufficient for an absolute number of availability, e.g. `minAvailable=4`. However, without knowing the total number of replicas desired, there could be pods missing (making them "unavailable" but not detected). This means we wouldn't know when a percentage (e.g. `minAvailable=80%`) or any `maxUnavailable` is violated. -To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. `LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on an non-core extension). It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, currently investigating how difficult this is. +To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. `LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on an non-core extension). It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, but this would require refactoring parts of the `DisruptionController` or making any multi-pod replica controllers also create an associated `Scale` object. 
The alternative is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS would be compatible with `replicaTotalKey`. However, until this change is made, LWS would be unable to support percentage-based PDBs or `maxUnavailable`. From e8ee56b949ba14199c91a608e14ed427404207a7 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 11:50:55 -0700 Subject: [PATCH 22/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 15226125ec7..f2c0360d3fc 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -463,9 +463,9 @@ If a pod is missing (not failing) from a group, we would not know that the group If we only look at all selected pods to check for PDB availability, it is sufficient for an absolute number of availability, e.g. `minAvailable=4`. However, without knowing the total number of replicas desired, there could be pods missing (making them "unavailable" but not detected). This means we wouldn't know when a percentage (e.g. `minAvailable=80%`) or any `maxUnavailable` is violated. -To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. 
`LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on an non-core extension). It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, but this would require refactoring parts of the `DisruptionController` or making any multi-pod replica controllers also create an associated `Scale` object. +To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. `LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on a non-core extension). It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, but this may require refactoring parts of the `DisruptionController` (or making any multi-pod replica controllers also create an associated `Scale` object). -The alternative is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. +The other solution is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. 
It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS would be compatible with `replicaTotalKey`. However, until this change is made, LWS would be unable to support percentage-based PDBs or `maxUnavailable`. #### Required fields To use multi-pod replicas, `replicaKey` must be specified. From 5a846769e987b1d902995807efbecb7d7897c58e Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 13:11:47 -0700 Subject: [PATCH 23/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index f2c0360d3fc..dfc386c403c 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -65,7 +65,7 @@ If none of those approvers are still appropriate, then changes to that list should be approved by the remaining approvers and/or the owning SIG (or SIG Architecture for cross-cutting KEPs). --> -# KEP-NNNN: Multipod PDB +# KEP-NNNN: PDB for Multi-pod Replicas -Currently a PodDisruptionBudget (PBD) uses individual pod replicas to count availability. This proposal is to allow them to treat multi-pod groups (e.g. LeaderWorkerSet replicas) as if they were pod replicas. We would add optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the PDB spec. +A PodDisruptionBudget (PBD) ensures availability of certain number pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat a group of pods (e.g. [LeaderWorkerSet](https://github.com/kubernetes-sigs/lws) replicas) as if it were an individual pod, for the purposes of measuring availability. The primary change would be to the PDB spec with new optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey`. 
These fields will label keys for which we will read the value given in each pod. -`replicaKey` will specify a label which whose value would identify groups of pods that should be handled together (i.e. all pods with the same value are considered one replica). In the example of a LeaderWorkerSet (LWS), this is `leaderworkerset.sigs.k8s.io/group-key`, as each group has a unique value which is given to all pods in the group. +`replicaKey` will identify groups of pods that should be handled together. All pods with the same value for this label key are considered a single replica, and a disruption to any pod within the replica is a disruption to the replica. For a LeaderWorkerSet (LWS), each group has a unique ID which is given to all pods in the group as the value for label `leaderworkerset.sigs.k8s.io/group-key`. -`replicaSizeKey` is needed to provide the size of each pod group, as it would otherwise be impossible to know there is a missing pod. For example, if we see 3 healthy pods but the intended group size is 4, the group should be marked unhealthy. +`replicaSizeKey` is needed to provide the size of each pod group. Without this, we cannot detect a missing pod. For example, if we see 3 healthy pods but the intended group size is 4, the group should be marked unhealthy. -`replicaTotalKey` is needed to provide the total number of desired replicas, so that we know whether a percentage-based PDB is met. For example, we don't know if two healthy replicas are sufficient for a PDB of `minAvailable: 50%` unless we know that the total desired replicas is <=4. +`replicaTotalKey` is needed to provide the total number of desired replicas. Without this, we cannot detect a missing replica. For example, if we see 2 healthy replicas but the intended replica count is 5, we should consider a PDB with `minAvailable: 50%` or `maxUnavailable: 1` to be violated. 
## Motivation From 04f6344d46fcf180c6a9210d082cd5d08e667527 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 13:15:34 -0700 Subject: [PATCH 24/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index dfc386c403c..2efabd63e43 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -201,12 +201,11 @@ List the specific goals of the KEP. What is it trying to achieve? How will we know that this has succeeded? --> -The primary goal of this KEP is to extend the PodDisruptionBudget (PDB) to handle applications where a single logical replica is composed of multiple pods. This will allow the Eviction API to account for grouping during voluntary disruptions. -- Define availability for pod groups: allow application owners to define disruption budgets for multi-pod replicas rather than individual pods using a label. -- Enhance the PDB API: introduce optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the `PodDisruptionBudget` spec. `replicaKey` will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. `replicaSizeKey` will specify a key for the size of the group which the pod is a member of. `replicaTotalKey` will be a key for the number of total desired replicas. +- Define availability for pod groups: allow application owners to define PDBs for multi-pod replicas rather than individual pods. +- Enhance the PDB API: introduce optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the `PodDisruptionBudget` spec. 
`replicaKey` will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. `replicaSizeKey` will provide the size of the group which a pod is a member of, and `replicaTotalKey` provide the number of total desired replicas. - Update eviction logic: modify the Eviction API to use the `replicaKey` pod replicas for calculating availability. -- Maintain Compatibility: ensure that standard cluster operations that respect PDBs, such as `kubectl drain` and node draining initiated by `cluster-autoscaler`, follow the group-based disruption budgets. Ensure all affected systems work as intended with pod groups (kube-scheduler, cluster autoscaler, custom schedulers). -- Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior +- Maintain compatibility: ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow the group-based disruption budgets. All affected systems work as expected with pod groups (kube-scheduler, cluster autoscaler, custom schedulers). +- Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior. ### Non-Goals From 42d70fc0dde22c02ac23a2337a92f74ef7ab06b7 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 13:22:19 -0700 Subject: [PATCH 25/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 2efabd63e43..bd421bc0a40 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,13 +173,7 @@ useful for a wide audience. 
A good summary is probably at least a paragraph in length. --> -A PodDisruptionBudget (PBD) ensures availability of certain number pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat a group of pods (e.g. [LeaderWorkerSet](https://github.com/kubernetes-sigs/lws) replicas) as if it were an individual pod, for the purposes of measuring availability. The primary change would be to the PDB spec with new optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey`. These fields will label keys for which we will read the value given in each pod. - -`replicaKey` will identify groups of pods that should be handled together. All pods with the same value for this label key are considered a single replica, and a disruption to any pod within the replica is a disruption to the replica. For a LeaderWorkerSet (LWS), each group has a unique ID which is given to all pods in the group as the value for label `leaderworkerset.sigs.k8s.io/group-key`. - -`replicaSizeKey` is needed to provide the size of each pod group. Without this, we cannot detect a missing pod. For example, if we see 3 healthy pods but the intended group size is 4, the group should be marked unhealthy. - -`replicaTotalKey` is needed to provide the total number of desired replicas. Without this, we cannot detect a missing replica. For example, if we see 2 healthy replicas but the intended replica count is 5, we should consider a PDB with `minAvailable: 50%` or `maxUnavailable: 1` to be violated. +A PodDisruptionBudget (PBD) ensures availability of certain number pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat a group of pods (e.g. [LeaderWorkerSet](https://github.com/kubernetes-sigs/lws) replicas) as if it were an individual pod, for the purposes of measuring availability. 
We will add new optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the PDB spec to pull information about the pod group replicas from pod labels. ## Motivation @@ -223,15 +217,14 @@ This change will only affect the Eviction API. The following are involuntary dis - Taint manager deleting NoExecute tainted pods This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `LeaderWorkerSet`, etc. -- The workload controller will be responsible for the `replicaKey` label and all other labels and annotations on pods it manages. We will not create any system for pod labeling or validation of groups. -- The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, we will only handle evictions. +- The workload controller will be responsible for the labels and annotations on pods it manages. This change is not responsible for pod labeling or validation of groups. +- The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, this will only handle evictions. This change will not affect scheduling. -- It is out of scope to introduce any form of gang scheduling or pod affinity rules. We only handle eviction of already-scheduled pods. +- It is out of scope to introduce any form of gang scheduling or pod affinity rules. This only handles eviction of already-scheduled pods. Partial replica failure -- We will assume that the replica is a single unit and can be considered failing if any pod in it is failing. In this KEP there is not a plan for systems in which the replica may still be healthy even with some percentage of pods failing. - +- Each replica is treated as a single unit and will be considered failing if any pod in it is failing. There is not currently a plan for types of replicas which are considred healthy with some percentage of pods failing. 
## Proposal @@ -244,6 +237,14 @@ The "Design Details" section below is for the real nitty-gritty. --> +The primary change would be to the PDB spec with new optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey`. These fields will label keys for which we will read the value given in each pod. + +`replicaKey` will identify groups of pods that should be handled together. All pods with the same value for this label key are considered a single replica, and a disruption to any pod within the replica is a disruption to the replica. For a LeaderWorkerSet (LWS), each group has a unique ID which is given to all pods in the group as the value for label `leaderworkerset.sigs.k8s.io/group-key`. + +`replicaSizeKey` is needed to provide the size of each pod group. Without this, we cannot detect a missing pod. For example, if we see 3 healthy pods but the intended group size is 4, the group should be marked unhealthy. + +`replicaTotalKey` is needed to provide the total number of desired replicas. Without this, we cannot detect a missing replica. For example, if we see 2 healthy replicas but the intended replica count is 5, we should consider a PDB with `minAvailable: 50%` or `maxUnavailable: 1` to be violated. + ### User Stories (Optional) -*Note: if the user is not using multi-pod replicas, their process will be unaffected.* +*If the user is not using multi-pod replicas, their process will be unaffected.* #### Story 1: Distributed Workload An engineer is running distributed ML training jobs using a `LeaderWorkerSet`. Each replica consists of one leader and multiple worker pods that run concurrently. If any pod in a group is evicted, the group fails and must be restarted. -To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, the user must ensure that some number of training replicas remain available. If a disruption is required, it should evict an entire replica group, rather than pods across different replicas. 
+To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, the user must ensure that some number of training replicas remain available. A disruption should consider evicting pods belonging to different replicas to be disrupting each of those replicas. The user would create a PDB for `LeaderWorkerSet` pods with a `replicaKey`: @@ -280,11 +280,11 @@ spec: Upon node drain, the Eviction API will: 1. Select all pods matching `leaderworkerset.sigs.k8s.io/name: my-training-job`. -2. Group these pods into replicas based on their value for label key `leaderworkerset.sigs.k8s.io/group-key` +2. Group these pods into replicas based on their value for label `leaderworkerset.sigs.k8s.io/group-key` 3. Determine the number of healthy replicas. -4. Evict only if the number of healthy replicas after eviction will be at least `minAvailable` (4). An entire group is considered disrupted if any of its pods are targeted for eviction. - -This way, the job can continue running with sufficient replicas even during cluster maintenance. +4. Evict only if the number of healthy replicas after eviction will be at least `minAvailable` (4), where a group is considered disrupted if any of its pods are targeted for eviction. + +This way, the job can be protected to run with sufficient replicas during cluster maintenance. #### Story 2: Cluster Maintenance From b16c8f298203745dd1018d28a0b74788ff5113df Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 13:38:57 -0700 Subject: [PATCH 27/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 911450caa50..a6743de5eb7 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -260,7 +260,7 @@ bogged down. 
An engineer is running distributed ML training jobs using a `LeaderWorkerSet`. Each replica consists of one leader and multiple worker pods that run concurrently. If any pod in a group is evicted, the group fails and must be restarted. -To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, the user must ensure that some number of training replicas remain available. A disruption should consider evicting pods belonging to different replicas to be disrupting each of those replicas. +To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, this user must ensure that some number of training replicas remain available. A disruption should consider evicting pods belonging to different replicas to be disrupting each of those replicas. The user would create a PDB for `LeaderWorkerSet` pods with a `replicaKey`: @@ -288,11 +288,11 @@ This way, the job can be protected to run with sufficient replicas during cluste #### Story 2: Cluster Maintenance -A cluster administrator will frequently drain nodes for upgrades, security patches, etc. The cluster may have various workloads, including `LeaderWorkerSet` with specific availability requirements. +A cluster administrator frequently drains nodes for upgrades, security patches, etc. The cluster may have various workloads, including `LeaderWorkerSet` with specific availability requirements. -To perform node drains safely without an application owner, this user may rely on the application owner's PDB as described in Story 1. The user may `kubectl drain `, and the Eviction API will automatically identify multi-pod replicas and ensure that the drain does not violate the application's `minAvailable` requirement. +To perform node drains safely without contacting the application owner every time, this user may rely on the application owner's PDB as described in Story 1. 
The user may `kubectl drain <node>`, and the Eviction API will automatically identify multi-pod replicas and ensure that the drain does not violate the application's `minAvailable`. -This allows safe maintenance without causing outages, as the drain will pause if it cannot evict certain pods. If this happens, the user may wait for the application to become healthier, or contact the application owner to resolve. +This allows safe maintenance without causing outages, as the drain will pause if it cannot evict certain pods. If this happens, this user may wait for the application to become healthier or reach higher capacity, or contact the application owner to resolve. #### Setup Example @@ -348,9 +348,9 @@ graph TD linkStyle 0,1,2,3,4,5 stroke:#888,stroke-dasharray: 5 5,stroke-width:2px ``` -Assume there are 2 LWS replicas of 3 pods each, and 3 nodes which each host 2 pods. As in the diagram, node A hosts replica 0 pods 0 and 1, node B host replica 0 pod 2 and replica 1 pod 0, and node C hosts replica 1 pods 1 and 2. +Assume the following setup: there are 2 LWS replicas of 3 pods each, and 3 nodes which host 2 pods each. As in the diagram, node A hosts replica 0 pods 0 and 1, node B hosts replica 0 pod 2 and replica 1 pod 0, and node C hosts replica 1 pods 1 and 2. 
Technically the PDB was honored, but now both replicas have a failing pod and they both fail. After the change, a PDB with `minAvailable: 1` and `replicaKey` set would identify the LWS replicas, and determine that evicting the pods in node B would cause both replicas to fail. This violates the PDB, and the node drain would safely stop before eviction. @@ -414,9 +414,9 @@ This might be a good place to talk about core concepts and how they relate. #### Background on multi-pod replicas (LWS) -We will take the LeaderWorkerSet (LWS) as an example of this system. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and each "worker" pod and the number of pods in the group. This is useful in cases where a leader process coordinates multiple worker processes, such as AI/ML workloads for distributed model training and inference. +In this KEP, the LeaderWorkerSet (LWS) is used as the primary example of a multi-pod replica system. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and each "worker" pod and the number of pods in the group (size). This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. -It works by keeping all worker pods in the same lifecycle: they are created and scheduled in parallel, and if any workers fail the group is considered failing. In this context, LWS "replicas" are not additional pods, but additional leader+workers pod groups. The user may also specify the number of worker pods within each pod group (`leaderWorkerTemplate.size`). For unique identification, each worker has an index, and each replica of the group has an index. 
+All worker pods are treated the same: they are created from the same template, operated on in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify "replicas," which are not additional pods (group size), but additional leader+workers pod groups. For unique identification, each worker has an index, and each replica of the group has an index. The pods have various labels providing information as seen in the [docs](https://lws.sigs.k8s.io/docs/reference/labels-annotations-and-environment-variables/). ### Risks and Mitigations From 4c602ac22ab10cd8c366a8ead8fb97e062673527 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 20 Oct 2025 16:20:09 -0700 Subject: [PATCH 28/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 210 ++++++++++-------- 1 file changed, 118 insertions(+), 92 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index a6743de5eb7..8dd5943776d 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,7 +173,7 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> -A PodDisruptionBudget (PBD) ensures availability of certain number pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat a group of pods (e.g. [LeaderWorkerSet](https://github.com/kubernetes-sigs/lws) replicas) as if it were an individual pod, for the purposes of measuring availability. We will add new optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the PDB spec to pull information about the pod group replicas from pod labels. +A PodDisruptionBudget (PDB) ensures availability of a certain number of pods during voluntary disruptions (node drains). 
This proposal would allow PDBs to treat groups of pods (defined by the new `Workload` API from gang scheduling) as if they were individual pods for the purposes of measuring availability. ## Motivation @@ -186,7 +186,7 @@ demonstrate the interest in a KEP within the wider Kubernetes community. [experience reports]: https://github.com/golang/go/wiki/ExperienceReports --> -The goal is to make PDBs usable for multi-pod replicas like a LWS, which has a leader and worker pods for use cases like distributed AI workloads. Currently, eviction or preemption of multiple pods may disturb pods across multiple LWS replicas, instead of the preferred outcome of evicting multiple pods from a single LWS replica. For workloads like a `LeaderWorkerSet`, the health of a replica depends on the simultaneous availability of all pods within its group. +The goal is to make PDBs usable for pod groups as defined by the `Workload` object, which are common for use cases like distributed ML workloads. Eviction or preemption of pods across multiple groups should be recognized as disrupting each of those groups, as opposed to evicting multiple pods from a single group (which only disrupts that one group). For these workloads, the health of the entire replica depends on the simultaneous availability of a certain number of pods within its group (as defined in the `Workload`'s `PodGroup`). ### Goals @@ -195,11 +195,10 @@ List the specific goals of the KEP. What is it trying to achieve? How will we know that this has succeeded? --> -- Define availability for pod groups: allow application owners to define PDBs for multi-pod replicas rather than individual pods. -- Enhance the PDB API: introduce optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey` to the `PodDisruptionBudget` spec. `replicaKey` will specify a pod label key, and pods sharing the same value for this key will be treated as a single, atomic unit when calculating availability. 
`replicaSizeKey` will provide the size of the group which a pod is a member of, and `replicaTotalKey` provide the number of total desired replicas. -- Update eviction logic: modify the Eviction API to use the `replicaKey` pod replicas for calculating availability. -- Maintain compatibility: ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow the group-based disruption budgets. All affected systems work as expected with pod groups (kube-scheduler, cluster autoscaler, custom schedulers). -- Preserve existing functionality: for backward compatibility, PDBs that do not specify the new `replicaKey` field should not have any new behavior. +- Define availability for pod groups: allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. +- Update eviction logic to automatically detect group-based workloads. When a pod selected by a PDB is found to be part of a `Workload` (by checking for `pod.spec.workload.name`), the eviction logic will use the `Workload` and `PodGroup` definitions as the source of truth for grouping and availability. +- Maintain compatibility: ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow the group-based disruption budgets. +- Preserve existing functionality: for backward compatibility, the behavior of PDBs selecting pods that are *not* part of a `Workload` (do not have `pod.spec.workload.name` set) should be unchanged. ### Non-Goals @@ -216,15 +215,17 @@ This change will only affect the Eviction API. The following are involuntary dis - Evictions by the Kubelet due to node pressure (e.g. memory shortage) - Taint manager deleting NoExecute tainted pods -This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `LeaderWorkerSet`, etc. 
-- The workload controller will be responsible for the labels and annotations on pods it manages. This change is not responsible for pod labeling or validation of groups. +This proposal introduces no changes to the `PodDisruptionBudget` object, only the eviction logic. + +This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `Workload`, etc. +- The workload controller will be responsible for setting the `workload.name` and `workload.podGroup` on the pods it manages. - The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, this will only handle evictions. This change will not affect scheduling. -- It is out of scope to introduce any form of gang scheduling or pod affinity rules. This only handles eviction of already-scheduled pods. +- It is out of scope to introduce any form of or change to gang scheduling. This only handles eviction of already-scheduled pods. -Partial replica failure -- Each replica is treated as a single unit and will be considered failing if any pod in it is failing. There is not currently a plan for types of replicas which are considred healthy with some percentage of pods failing. +Partial replica health: +- This KEP follows the definition of multi-pod replica health from the `Workload` API, using `minCount`. A replica is considered "available" if it meets `minCount`, and "unavailable" if it does not. We are not introducing any other definition of partial health (e.g. a percentage, or requiring all pods healthy). ## Proposal @@ -237,13 +238,19 @@ The "Design Details" section below is for the real nitty-gritty. --> -The primary change would be to the PDB spec with new optional fields `replicaKey`, `replicaSizeKey`, and `replicaTotalKey`. These fields will label keys for which we will read the value given in each pod. -`replicaKey` will identify groups of pods that should be handled together. 
All pods with the same value for this label key are considered a single replica, and a disruption to any pod within the replica is a disruption to the replica. For a LeaderWorkerSet (LWS), each group has a unique ID which is given to all pods in the group as the value for label `leaderworkerset.sigs.k8s.io/group-key`. +When the Eviction API evaluates a PDB, it will check the pods selected by the PDB's `selector`. + +- If a selected pod does not have `spec.workload.name` set, it is treated as an individual pod, and availability is calculated based on pod counts, preserving existing behavior. +- If a selected pod has `spec.workload.name` and `spec.workload.podGroup` set, the eviction manager will treat *groups* of pods as the atomic unit for availability. -`replicaSizeKey` is needed to provide the size of each pod group. Without this, we cannot detect a missing pod. For example, if we see 3 healthy pods but the intended group size is 4, the group should be marked unhealthy. +The logic will be as follows: +- Identify the `Workload` object referenced by `pod.spec.workload.name`. +- From the `workload.spec.podGroups`, find the entry matching `pod.spec.workload.podGroup`. +- This `PodGroup` object defines the group's policy. The `minCount` (from `policy.gang.minCount`) defines the minimum number of pods required for one replica of that group to be healthy. The `replicas` field defines how many such groups are expected. +- The PDB's `minAvailable` or `maxUnavailable` will be interpreted in terms of these `PodGroup` replicas, not individual pods. +- A `PodGroup` replica is considered "available" only if the number of healthy pods belonging to it meets its `minCount`. -`replicaTotalKey` is needed to provide the total number of desired replicas. Without this, we cannot detect a missing replica. For example, if we see 2 healthy replicas but the intended replica count is 5, we should consider a PDB with `minAvailable: 50%` or `maxUnavailable: 1` to be violated. 
### User Stories (Optional) @@ -253,46 +260,48 @@ Include as much detail as possible so that people can understand the "how" of the system. The goal here is to make this feel real for users without getting bogged down. --> - -*If the user is not using multi-pod replicas, their process will be unaffected.* +*If the user is not using the `Workload` API, their process will be unaffected.* #### Story 1: Distributed Workload -An engineer is running distributed ML training jobs using a `LeaderWorkerSet`. Each replica consists of one leader and multiple worker pods that run concurrently. If any pod in a group is evicted, the group fails and must be restarted. +An engineer is running distributed ML training jobs using the `Workload` object. The `Workload` defines a `PodGroup` named `worker` with `replicas: 10` and `policy.gang.minCount: 8`. This means the job requires 10 replicas, and each replica consists of 8 pods. -To protect a long-running job from voluntary disruptions, such as node drain for an upgrade, this user must ensure that some number of training replicas remain available. A disruption should consider evicting pods belonging to different replicas to be disrupting each of those replicas. +To protect this long-running job from voluntary disruptions (like node drains), the user wants to ensure at least 9 of the 10 worker groups remain available. 
-The user would create a PDB for `LeaderWorkerSet` pods with a `replicaKey`: +The user would create a standard PDB targeting the worker pods: ```yaml apiVersion: policy/v1 kind: PodDisruptionBudget metadata: - name: my-training-job-pdb + name: my-training-job-workers-pdb spec: - minAvailable: 4 + minAvailable: 9 selector: matchLabels: - leaderworkerset.sigs.k8s.io/name: my-training-job - replicaKey: "leaderworkerset.sigs.k8s.io/group-key" - replicaSizeKey: "leaderworkerset.sigs.k8s.io/size" + # Assuming pods are labeled by the workload controller + workload: my-training-job + pod-group: worker ``` Upon node drain, the Eviction API will: -1. Select all pods matching `leaderworkerset.sigs.k8s.io/name: my-training-job`. -2. Group these pods into replicas based on their value for label `leaderworkerset.sigs.k8s.io/group-key` -3. Determine the number of healthy replicas. -4. Evict only if the number of healthy replicas after eviction will be at least `minAvailable` (4), where a group is considered disrupted if any of its pods are targeted for eviction. +1. Select all pods matching the selector. +2. Detect that these pods have `spec.workload.name: my-training-job` and `spec.workload.podGroup: worker`. +3. Fetch the `Workload` object `my-training-job`. +4. Identify that the PDB applies to the 'worker' `PodGroup`, which has 10 replicas. +5. Interpret `minAvailable: 9` as "9 worker `PodGroup` replicas must remain available." +6. A group is considered disrupted if evicting a pod would cause its healthy pod count to drop below its `minCount` (which is 8). +7. The drain will proceed only if it does not cause the number of available worker groups to drop below 9. -This way, the job can be protected to run with sufficient replicas during cluster maintenance. +This way, the job is protected to run with sufficient replicas during cluster maintenance, and the PDB definition is simple and intuitive. 
#### Story 2: Cluster Maintenance -A cluster administrator frequently drains nodes for upgrades, security patches, etc. The cluster may have various workloads, including `LeaderWorkerSet` with specific availability requirements. +A cluster administrator frequently drains nodes for upgrades. The cluster has various workloads, including complex multi-pod applications defined by the `Workload` API. -To perform node drains safely without contacting the application owner every time, this user may rely on the application owner's PDB as described in Story 1. The user may `kubectl drain `, and the Eviction API will automatically identify multi-pod replicas and ensure that the drain does not violate the application's `minAvailable`. +To perform node drains safely, the administrator relies on application owners' PDBs. When the admin issues `kubectl drain `, the Eviction API automatically identifies which pods belong to a `Workload`. It interprets the PDBs for those pods in terms of `PodGroup` replicas instead of individual pods, ensuring that the drain does not violate the application's group-based availability requirements. -This allows safe maintenance without causing outages, as the drain will pause if it cannot evict certain pods. If this happens, this user may wait for the application to become healthier or higher capacity, or contact the application owner to resolve. +This allows safe maintenance without causing outages, as the drain will pause if it cannot evict pods without violating a group-based PDB. #### Setup Example @@ -307,52 +316,52 @@ graph TD direction LR subgraph NodeA ["Node A"] - R0P0("Replica 0
Pod 0") - R0P1("Replica 0
Pod 1") + G0P0("Group 0
Pod 0") + G0P1("Group 0
Pod 1") end class NodeA node_box subgraph NodeB ["Node B"] - R0P2("Replica 0
Pod 2") - R1P0("Replica 1
Pod 0") + G0P2("Group 0
Pod 2") + G1P0("Group 1
Pod 0") end class NodeB node_box subgraph NodeC ["Node C"] - R1P1("Replica 1
Pod 1") - R1P2("Replica 1
Pod 2") + G1P1("Group 1
Pod 1") + G1P2("Group 1
Pod 2") end class NodeC node_box - class R0P0,R0P1,R0P2,R1P0,R1P1,R1P2 pod_box + class G0P0,G0P1,G0P2,G1P0,G1P1,G1P2 pod_box end %% Logical Groupings (shown with links) - subgraph LogicalGrouping ["Logical Replica Groups"] + subgraph LogicalGrouping ["Logical PodGroup Replicas"] direction TB style LogicalGrouping fill:none,stroke:none - R0("Replica 0") - R1("Replica 1") - class R0,R1 replica_label + G0("Group 0") + G1("Group 1") + class G0,G1 replica_label end - R0 -.-> R0P0 - R0 -.-> R0P1 - R0 -.-> R0P2 + G0 -.-> G0P0 + G0 -.-> G0P1 + G0 -.-> G0P2 - R1 -.-> R1P0 - R1 -.-> R1P1 - R1 -.-> R1P2 + G1 -.-> G1P0 + G1 -.-> G1P1 + G1 -.-> G1P2 - %% Style all links (0 to 5) + %% Style all links linkStyle 0,1,2,3,4,5 stroke:#888,stroke-dasharray: 5 5,stroke-width:2px ``` -Assume the following setup: there are 2 LWS replicas of 3 pods each, and 3 nodes which host 2 pods each. As in the diagram, node A hosts replica 0 pods 0 and 1, node B host replica 0 pod 2 and replica 1 pod 0, and node C hosts replica 1 pods 1 and 2. +Assume the following setup: A `Workload` defines a `PodGroup` with `replicas: 2` and `minCount: 3`. This results in 2 logical groups (replicas) of 3 pods each. As in the diagram, Node A hosts Group 0 Pods 0 and 1, Node B hosts Group 0 Pod 2 and Group 1 Pod 0, and Node C hosts Group 1 Pods 1 and 2. -To protect at least one 3-pod replica in the current system, a user could try a PDB with `minAvailable: 3`. A node drain on node B would see that there will still be 4 pods remaining afterwards, and would evict replica 0 pod 2 and replica 1 pod 0 pods from node B, failing one pod in each replica. Technically the PDB was honored, but now both replicas have a failing pod and they both fail. +To protect at least one 3-pod group in the current system, a user could try a PDB with `minAvailable: 3` (pods). A node drain on Node B would see that there will still be 4 pods remaining afterwards, which satisfies `minAvailable: 3`, and proceed. 
Technically the PDB was honored, but now both groups have a failing pod (each is missing one pod and no longer meets `minCount: 3`), and the entire application fails. -After the change, a PDB with `minAvailable: 1` and `replicaKey` set would identify the LWS replicas, and determine that evicting the pods in node B would cause both replicas to fail. This violates the PDB, and the node drain would safely stop before eviction. +After this change, a PDB with `minAvailable: 1` (interpreted as 1 group) would be evaluated. The eviction logic would identify the 2 `PodGroup` replicas. It would determine that evicting the pods on Node B would cause both replicas to become unavailable, violating the PDB (`minAvailable: 1`), and the node drain would safely stop before eviction. ```mermaid graph TD @@ -367,12 +376,12 @@ graph TD StartDrain("kubectl drain
node-b initiated") class StartDrain action - StartDrain --> PDB_Type{Which PDB is active?} + StartDrain --> PDB_Type{Which PDB logic applies?} PDB_Type -- "Traditional PDB" --> PDB_Old(PDB Spec:
minAvailable 3 pods) class PDB_Old pdb_spec - PDB_Type -- "Multipod PDB (with KEP)" --> PDB_New(PDB Spec:
minAvailable 1 replica,
replicaKey: ...group-key) + PDB_Type -- "Workload-Aware PDB (with KEP)" --> PDB_New(PDB Spec:
minAvailable 1 group) class PDB_New pdb_spec %% --- Traditional PDB Flow --- @@ -385,24 +394,26 @@ graph TD CheckPods -- "Yes (4 >= 3)" --> DrainSuccess("Drain Proceeds:
Node B pods evicted") class DrainSuccess action - DrainSuccess --> AppDown("Application State:
Both replicas fail
(Technically PDB honored,
but intent violated)") + DrainSuccess --> AppDown("Application State:
Both groups fail
(Technically PDB honored,
but intent violated)") class AppDown outcome_bad %% --- Multipod PDB Flow --- - PDB_New --> CalcReplicas(Calculate
available replicas) + PDB_New --> DetectGroups(Detect pods belong
to a Workload) + class DetectGroups process + + DetectGroups --> CalcReplicas(Calculate
available groups) class CalcReplicas process - CalcReplicas --> CheckReplicas{Are remaining replicas
>= 1?} + CalcReplicas --> CheckReplicas{Are remaining groups
>= 1?} class CheckReplicas decision CheckReplicas -- "No (0 >= 1)" --> DrainBlocked("Drain Blocked:
Eviction prevented") class DrainBlocked action - DrainBlocked --> AppHealthy("Application State:
Both replicas healthy
(PDB intent
fully protected)") + DrainBlocked --> AppHealthy("Application State:
Both groups healthy
(PDB intent
fully protected)") class AppHealthy outcome_good ``` - ### Notes/Constraints/Caveats (Optional) -- This feature relies on the workload controller (which may be `LeaderWorkerSet` or some third-party custom controller) to correctly apply `replicaKey` labels. Bugs in the controller could cause mislabeled pods and incorrect eviction decisions, possibly violating availability requirements. -- One failing pod in a large group will make the group unavailable, so a small number of simultaneously failing pods across groups could prevent evictions and block a node drain. This is intended behavior, but not necessarily obvious. +- This feature relies on the pod's `spec.workload.name` and `spec.workload.podGroup` fields being correctly set by its managing controller. If these fields are missing, point to a non-existent `Workload` object, or are set on some pods of a replica but not others, the eviction logic will fall back to per-pod counting, which may violate the application's true availability requirements. +- One failing pod in a large group will make that group "unavailable" if it drops below its `minCount`. A small number of failing pods spread across many groups could prevent all evictions and block a node drain. This is intended behavior (as the application is unhealthy), but may be surprising to operators. +- A PDB `selector` that matches pods from multiple different `PodGroup`s (or a mix of grouped and individual pods) may have complex or unintended behavior. Users should be advised to create separate PDBs for each distinct `PodGroup` they wish to protect. ## Design Details @@ -444,43 +468,45 @@ required) or even code snippets. If there's any ambiguity about HOW your proposal will be implemented, this is the place to discuss them. 
--> -#### Pods without the `replicaKey` label - -If a PDB specifies a `replicaKey`, but the `selector` matches a pod that is missing the label, the pod will be treated as an unhealthy replica with size 1, as this should only be the result of a pod which had been incorrectly labeled or somehow in a malformed group. Even if the pod is technically healthy, we mark it as unavailable so that the Eviction API does not percieve it as an additional available replica for the PDB budget, and proceed with an otherwise unsafe node drain. An empty value (e.g. `leaderworkerset.sigs.k8s.io/group-key: ""`), as it would likely be caused by an error, will be treated as unlabeled (i.e. unhealthy). - -#### Labels vs. Annotations -LWS uses a label for the group id but an annotation for the group size. When provided with `replicaKey` and `replicaSizeKey`, we will check for both labels and annotations in case other implementations of multi-pod replicas use a different label/annotation setup. If both are set and not equal, we will default to marking the pod as unhealthy. +#### Eviction Logic Flow + +The core logic change is in the PDB eviction controller. +1. Get all pods matching the PDB's `selector`. +2. Partition this pod list into two sets: + - `individualPods`: Pods where `spec.workloadReference` is unset or `spec.workloadReference.Name` is empty. + - `groupedPods`: Pods where `spec.workloadReference.Name` is set. +3. If `groupedPods` is empty, proceed with the existing per-pod availability calculation on `individualPods`. The PDB's `minAvailable`/`maxUnavailable` are interpreted as pod counts. +4. If `groupedPods` is not empty: + - The controller will log a warning if `individualPods` is *also* not empty, as mixing pod types in one PDB is discouraged. The calculation will proceed based *only* on the `groupedPods`. + - The PDB's `minAvailable`/`maxUnavailable` are now interpreted as `PodGroup` replica counts. 
+ - Group the `groupedPods` by their `(spec.workloadReference.Name, spec.workloadReference.PodGroup)` tuple. + - For each unique `(workload, podGroup)` tuple found: + - Fetch the `Workload` object. + - Find the matching `PodGroup` in its `spec.podGroups`. + - Read `totalReplicas = podGroup.replicas`. + - Read `minSizePerReplica = podGroup.policy.gang.minCount`. + - Identify all pods belonging to this `PodGroup` (across all its replicas). + - Group *these* pods by their `spec.workloadReference.PodGroupReplicaIndex`. + - Count the number of "available" replicas: A replica is "available" if its count of healthy, non-evicting pods is `>= minSizePerReplica`. + - The "total" for the PDB calculation is the sum of `totalReplicas` for all `PodGroup`s matched by the selector. + - The "available" count is the sum of "available" replicas across all matched `PodGroup`s. +5. Compare this "available" group count against the PDB's `minAvailable` or `maxUnavailable` to decide if an eviction is allowed. #### Group Health -A group is considered available only if all pods within that group are available (e.g. `Running` and `Ready`). If any pod within a group is unavailable for before an eviction is attempted, the entire group is considered unavailable. An eviction request for a pod in a healthy group may be denied if other groups are unhealthy, even if the pods in the unhealthy groups are not eviction targets. - -#### Missing pods from a group - -If a pod is missing (not failing) from a group, we would not know that the group is unhealthy without having the desired replica size. This is why `replicaSizeKey` is needed, and specifies the key whose value is the size of the replica which a pod belongs to. In LWS, `leaderworkerset.sigs.k8s.io/size` is set in all pods. Any group with an incorrect number of pods can be marked unhealthy, as if it had a failing pod. 
- -#### Percentage of total replicas - -If we only look at all selected pods to check for PDB availability, it is sufficient for an absolute number of availability, e.g. `minAvailable=4`. However, without knowing the total number of replicas desired, there could be pods missing (making them "unavailable" but not detected). This means we wouldn't know when a percentage (e.g. `minAvailable=80%`) or any `maxUnavailable` is violated. - -To look at total replicas desired, a PDB can see the `spec.replicas` field in `Deployment`, `StatefulSet`, or `ReplicaSet`. `LeaderWorkerSet` has this field, but we don't want to hard-code it as another recognized kind (creating a dependency on an non-core extension). It may be possible to allow any arbitrary object with a `spec.replicas` field to be recognized, but this may require refactoring parts of the `DisruptionController` (or making any multi-pod replica controllers also create an associated `Scale` object). - -The other solution is to add field `replicaTotalKey` where each pod would have a label/annotation providing the expected number of replicas. In LWS the `leaderworkerset.sigs.k8s.io/replicas` annotation is unfortunately only in the leader pod's `StatefulSet`, and adding a process to extract the replicas count from this would rely on the specific implementation details of LWS. It would be preferred to get LWS to include the replicas annotation on all pods, allowing it to be read like the other information. Once this is implemented, LWS would be compatible with `replicaTotalKey`. However, until this change is made, LWS would be unable to support percentage-based PDBs or `maxUnavailable`. +A `PodGroup` replica is considered **available** if its number of healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. -#### Required fields -To use multi-pod replicas, `replicaKey` must be specified. 
+This logic inherently handles missing pods: the availability calculation is based *only* on the count of *currently healthy* pods. For example, if a replica expects 10 pods (`minCount` is 8) but only 9 pods exist (1 is missing), the replica is still considered **available** as long as 8 or 9 of those existing pods are healthy. If 3 pods are missing and only 7 healthy pods exist, the replica is **unavailable** (since 7 < 8). -Log a warning if: -- `replicaSizeKey` is not specified (we will then assume that missing pods are not an issue) -- `replicaTotalKey` is not specified and a percentage-based PDB or `maxUnavailable` is used +If any pod in an available group is targeted for eviction, the availability of that group replica must be re-evaluated (i.e., "healthy pods - 1") to see if it would drop below `minCount`. -Error if: -- `replicaSizeKey` or `replicaTotalKey` are specified are without `replicaKey` +#### Handling `minAvailable` Percentages -#### Other systems +The `Workload` object provides the total number of `replicas` for a `PodGroup`. This allows percentage-based `minAvailable` (e.g., "80%") and `maxUnavailable` to work correctly, as the total expected number of atomic units (the `PodGroup` replicas) is known. -While LWS is the primary use case and is given as the example in this KEP, this change is not exclusive to LWS and works with any other multi-pod replica systems which use labels. +#### Pods without `workloadReference` +If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If the PDB matches *only* individual pods, the standard per-pod logic applies. If it matches a mix, only the grouped-pod logic will apply (and the individual pods will be ignored for the PDB calculation, with a warning). 
### Test Plan From b01a1b0e5e04f51fa5e575abbcd5df3f49ee0b39 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Tue, 21 Oct 2025 11:02:53 -0700 Subject: [PATCH 29/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 70 +++++++++++++------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 8dd5943776d..0472ad3deac 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -470,31 +470,57 @@ proposal will be implemented, this is the place to discuss them. #### Eviction Logic Flow -The core logic change is in the PDB eviction controller. -1. Get all pods matching the PDB's `selector`. -2. Partition this pod list into two sets: - - `individualPods`: Pods where `spec.workloadReference` is unset or `spec.workloadReference.Name` is empty. - - `groupedPods`: Pods where `spec.workloadReference.Name` is set. -3. If `groupedPods` is empty, proceed with the existing per-pod availability calculation on `individualPods`. The PDB's `minAvailable`/`maxUnavailable` are interpreted as pod counts. -4. If `groupedPods` is not empty: - - The controller will log a warning if `individualPods` is *also* not empty, as mixing pod types in one PDB is discouraged. The calculation will proceed based *only* on the `groupedPods`. - - The PDB's `minAvailable`/`maxUnavailable` are now interpreted as `PodGroup` replica counts. - - Group the `groupedPods` by their `(spec.workloadReference.Name, spec.workloadReference.PodGroup)` tuple. - - For each unique `(workload, podGroup)` tuple found: - - Fetch the `Workload` object. - - Find the matching `PodGroup` in its `spec.podGroups`. - - Read `totalReplicas = podGroup.replicas`. - - Read `minSizePerReplica = podGroup.policy.gang.minCount`. - - Identify all pods belonging to this `PodGroup` (across all its replicas). 
- - Group *these* pods by their `spec.workloadReference.PodGroupReplicaIndex`. - - Count the number of "available" replicas: A replica is "available" if its count of healthy, non-evicting pods is `>= minSizePerReplica`. - - The "total" for the PDB calculation is the sum of `totalReplicas` for all `PodGroup`s matched by the selector. - - The "available" count is the sum of "available" replicas across all matched `PodGroup`s. -5. Compare this "available" group count against the PDB's `minAvailable` or `maxUnavailable` to decide if an eviction is allowed. +1. Get all pods matching the PDB's `selector`. +2. If no pods have `spec.workloadReference.Name` set, follow with the existing per-pod availability behavior. +3. The controller will log a warning if there are also non-workload pods, as mixing pod types in one PDB is discouraged. These individual pods will be ignored. +4. Group the pods by `spec.workloadReference.Name` and `spec.workloadReference.PodGroup`. +5. Fetch the relevant `PodGroup` information from `Workload` objects' `spec.podGroups`: `PodGroup.replicas` (total replicas) and `PodGroup.policy.gang.minCount` (pods in each replica). +6. Count the number of available replicas: a replica is available if its count of existing, healthy, non-evicting pods `>= minCount`. +7. Count the total desired replicas, the sum of `replicas` for all `PodGroup`s. +8. Compare this available group count and total against the PDB's `minAvailable` or `maxUnavailable` to decide if an eviction is allowed. + +```mermaid +graph TD + subgraph "Eviction Logic Flow" + direction TB + + Start(Eviction API Triggered
for a PDB) --> GetPods[Get all pods matching
PDB selector] + + GetPods --> CheckWorkload{Do any pods have
spec.workloadReference set?} + + %% Branch 1: Legacy Path (No Workload Pods) + CheckWorkload -- "No" --> LegacyLogic[Use existing
per-pod logic] + LegacyLogic --> CalcPods[Calculate availability
based on individual
pod
counts] + CalcPods --> DecisionLegacy{Pods meet
PDB spec?} + DecisionLegacy -- "Yes" --> Allow[✅ Allow Eviction] + DecisionLegacy -- "No" --> Deny[❌ Deny Eviction] + + %% Branch 2: New Path (Workload-Aware Pods) + CheckWorkload -- "Yes" --> WarnMixed(Log warning if
mixed pod types found.
Individual pods
will be ignored.) + WarnMixed --> GroupPods[Group pods by
Workload and PodGroup] + GroupPods --> FetchGroupInfo[Fetch PodGroup info
from Workloads:
- Total replicas per group
- minCount per group] + FetchGroupInfo --> CountAvailable[Count 'available' replicas:
Existing, healthy,
non-evicting pods
must meet minCount] + CountAvailable --> SumTotalReplicas[Sum total desired
replicas from all
matched groups] + SumTotalReplicas --> DecisionNew{Compare available/total
group counts
against PDB spec} + DecisionNew -- "Yes" --> Allow + DecisionNew -- "No" --> Deny + end + + %% Styling (with dark text color for readability) + classDef decision fill:#fff0e6,stroke:#ff9933,stroke-width:2px,color:#111 + classDef process fill:#e6f3ff,stroke:#66b3ff,stroke-width:2px,color:#111 + classDef startEnd fill:#f0fff0,stroke:#aaffaa,stroke-width:2px,color:#111 + classDef error fill:#fff0f0,stroke:#ffaaaa,stroke-width:2px,color:#111 + + class Start,Allow,Deny startEnd + class Deny error + class GetPods,LegacyLogic,CalcPods,WarnMixed,GroupPods,FetchGroupInfo,CountAvailable,SumTotalReplicas process + class CheckWorkload,DecisionLegacy,DecisionNew decision +``` #### Group Health -A `PodGroup` replica is considered **available** if its number of healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. +A `PodGroup` replica is considered available if its number of healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. This logic inherently handles missing pods: the availability calculation is based *only* on the count of *currently healthy* pods. For example, if a replica expects 10 pods (`minCount` is 8) but only 9 pods exist (1 is missing), the replica is still considered **available** as long as 8 or 9 of those existing pods are healthy. If 3 pods are missing and only 7 healthy pods exist, the replica is **unavailable** (since 7 < 8). 
From b3323b657fa96118ca8cc80bef60b3ed624360ac Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Tue, 21 Oct 2025 14:59:16 -0700 Subject: [PATCH 30/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 275 ++++++++++++------ 1 file changed, 192 insertions(+), 83 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 0472ad3deac..b7d0d1cc315 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -65,7 +65,7 @@ If none of those approvers are still appropriate, then changes to that list should be approved by the remaining approvers and/or the owning SIG (or SIG Architecture for cross-cutting KEPs). --> -# KEP-NNNN: PDB for Multi-pod Replicas +# KEP-NNNN: PDB for Workload Replicas <!-- -A PodDisruptionBudget (PBD) ensures availability of a certain number of pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat groups of pods (defined by the new `Workload` API from gang scheduling) as if they were individual pods for the purposes of measuring availability. +The Eviction API uses PodDisruptionBudgets (PDBs) to ensure availability of a certain number or percentage of pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat groups of pods (defined by the new `Workload` API in the [gang scheduling KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling)) as if they were individual pods for the purposes of measuring availability. We will introduce a new boolean field `usePodGroups` in the PDB which explicitly enables this group-aware eviction logic. + +*Note: as of this writing, the `Workload` API is still in progress; for this KEP we assume it is fully implemented* ## Motivation <!-- This section is for explicitly listing the motivation, goals, and non-goals of this KEP. Describe why the change is important and the benefits to users. The motivation section can optionally provide links to [experience reports] to demonstrate the interest in a KEP within the wider Kubernetes community.
[experience reports]: https://github.com/golang/go/wiki/ExperienceReports --> -The goal is to make PDBs usable for pod groups as defined by the `Workload` object, which are common for use cases like distributed ML workloads. Eviction or preemption of pods across multiple groups should be recognized as disrupting each of those groups, as opposed to evicting multiple pods from a single group (which only disrupts that one group). For these workloads, the health of the entire replica depends on the simultaneous availability of a certain number of pods within its group (as defined in the `Workload`'s `PodGroup`). +The goal is to make PDBs usable for pod groups as defined by the `Workload` object, which are common for use cases of distributed workloads such as ML training. Eviction or preemption of pods across multiple groups should be recognized as disrupting each of those groups, as opposed to evicting multiple pods from a single group (which only disrupts that one group). For these workloads, the health of the entire replica depends on the simultaneous availability of a certain number of pods within its group (as defined in the `Workload`'s `PodGroup`). ### Goals @@ -195,10 +197,11 @@ List the specific goals of the KEP. What is it trying to achieve? How will we know that this has succeeded? --> -- Define availability for pod groups: allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. -- Update eviction logic to automatically detect group-based workloads. When a pod selected by a PDB is found to be part of a `Workload` (by checking for `pod.spec.workload.name`), the eviction logic will use the `Workload` and `PodGroup` definitions as the source of truth for grouping and availability. -- Maintain compatibility: ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow the group-based disruption budgets. 
-- Preserve existing functionality: for backward compatibility, the behavior of PDBs selecting pods that are *not* part of a `Workload` (do not have `pod.spec.workload.name` set) should be unchanged. +- **Introduce an opt-in for group-based PDBs:** Add a new boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. +- **Define availability for pod groups:** Allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. +- **Update eviction logic:** When `usePodGroups: true` is set on a PDB, the eviction logic will use the `Workload` and `PodGroup` definitions (linked by `pod.spec.workload.name`) for grouping and calculating availability. +- **Maintain compatibility:** Ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow the group-based disruption budgets when opted in. +- **Preserve existing functionality:** For backward compatibility, the behavior of PDBs that do not set `usePodGroups: true` will be unchanged. ### Non-Goals <!-- What is out of scope for this KEP? Listing non-goals helps to focus discussion and make progress. --> - + This change will only affect the Eviction API. The following are involuntary dis - Manual pod deletion - Pods being deleted by their owning controller (e.g. during Deployment rollout) @@ -215,7 +218,7 @@ This change will only affect the Eviction API. The following are involuntary dis - Evictions by the Kubelet due to node pressure (e.g. memory shortage) - Taint manager deleting NoExecute tainted pods -This proposal introduces no changes to the `PodDisruptionBudget` object, only the eviction logic. +The only change to the `PodDisruptionBudget` object will be the optional field in the spec to enable the changes.
This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `Workload`, etc. - The workload controller will be responsible for setting the `workload.name` and `workload.podGroup` on the pods it manages. @@ -227,6 +230,9 @@ This change will not affect scheduling. Partial replica health: - This KEP follows the definition of multi-pod replica health from the `Workload` API, using `minCount`. A replica is considered "available" if it meets `minCount`, and "unavailable" if it does not. We are not introducing any other definition of partial health (e.g. a percentage, or requiring all pods healthy). +Mixed workload types: +- If a PDB is measuring multi-pod replicas, individual pods without an assigned workload will be ignored. + ## Proposal +We propose adding a new, optional boolean field `usepodGroups` to the `PodDisruptionBudget.spec`. +- If this field is `false` (default) or unset, the Eviction API evaluates the PDB based on individual pod counts, preserving all existing behavior. -When the Eviction API evaluates a PDB, it will check the pods selected by the PDB's `selector`. - -- If a selected pod does not have `spec.workload.name` set, it is treated as an individual pod, and availability is calculated based on pod counts, preserving existing behavior. -- If a selected pod has `spec.workload.name` and `spec.workload.podGroup` set, the eviction manager will treat *groups* of pods as the atomic unit for availability. +When `usepodGroups: true`: +- If a selected pod does not have `spec.workload.name` set, log a warning and treat it as its own group. +- If a selected pod has `spec.workload.name` and `spec.workload.podGroup` set, the eviction manager will treat groups of pods as the atomic unit for availability. -The logic will be as follows: +When using pod groups: - Identify the `Workload` object referenced by `pod.spec.workload.name`. - From the `workload.spec.podGroups`, find the entry matching `pod.spec.workload.podGroup`. 
- This `PodGroup` object defines the group's policy. The `minCount` (from `policy.gang.minCount`) defines the minimum number of pods required for one replica of that group to be healthy. The `replicas` field defines how many such groups are expected. - The PDB's `minAvailable` or `maxUnavailable` will be interpreted in terms of these `PodGroup` replicas, not individual pods. - A `PodGroup` replica is considered "available" only if the number of healthy pods belonging to it meets its `minCount`. - ### User Stories (Optional) PDB_Type{Which PDB logic applies?} - PDB_Type -- "Traditional PDB" --> PDB_Old(PDB Spec:
minAvailable 3 pods) + PDB_Type -- "Traditional PDB" --> PDB_Old(PDB Spec:
minAvailable 3 pods
usePodGrouping: false) class PDB_Old pdb_spec - PDB_Type -- "Workload-Aware PDB (with KEP)" --> PDB_New(PDB Spec:
minAvailable 1 group) + PDB_Type -- "Workload-Aware PDB (with KEP)" --> PDB_New(PDB Spec:
minAvailable 1 group
usePodGrouping: true) class PDB_New pdb_spec %% --- Traditional PDB Flow --- @@ -424,14 +433,15 @@ This might be a good place to talk about core concepts and how they relate. --> #### Background on multi-pod replicas (LWS) - In this KEP, the LeaderWorkerSet (LWS) is used as the primary example of a multi-pod replica system. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and each "worker" pod and the number of pods in the group (size). This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. All worker pods are treated the same: they are created from the same template, operated on in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify "replicas," which are not additional pods (group size), but additional leader+workers pod groups. For unique identification, each worker has an index, and each replica of the group has an index. The pods have various labels providing information as seen in the [docs](https://lws.sigs.k8s.io/docs/reference/labels-annotations-and-environment-variables/). #### Background on the `Workload` API -In this KEP, the `Workload` object from the gang scheduling API is the source of truth for pod grouping. The `Workload` API allows users to define complex, multi-component applications. +This KEP assumes that a pod controller (like the one managing `Workload` objects) will create pods and set `pod.spec.workload.name` and `pod.spec.workload.podGroup` on each pod it creates, linking it back to the `Workload` definition. The eviction logic uses this link to read the group's requirements. + +In this KEP, the `Workload` object from the gang scheduling API is the source of truth for pod grouping. A `Workload` object contains a list of `PodGroup`s. 
Each `PodGroup` defines: * `name`: A unique identifier for the group within the `Workload`. @@ -439,7 +449,9 @@ A `Workload` object contains a list of `PodGroup`s. Each `PodGroup` defines: * `policy`: The scheduling policy, such as `Gang`. * `policy.gang.minCount`: The minimum number of pods required for one replica of that group. -This KEP assumes that a pod controller (like the one managing `Workload` objects) will create pods and set `pod.spec.workload.name` and `pod.spec.workload.podGroup` on each pod it creates, linking it back to the `Workload` definition. The eviction logic uses this link to read the group's requirements. +In the context of LWS, a LWS replica is equivalent to a PodGroup replica, and its `size` is `minCount`. + +This KEP assumes that a controller (like the one managing `Workload` objects) will create pods and set `pod.spec.workload.name` and `pod.spec.workload.podGroup` on each pod it creates, linking it back to the `Workload` definition. The eviction logic uses this link to read the group's requirements. ### Risks and Mitigations @@ -455,7 +467,7 @@ How will UX be reviewed, and by whom? Consider including folks who also work outside the SIG or subproject. --> -- This feature relies on the pod's `spec.workload.name` and `spec.workload.podGroup` fields being correctly set by its managing controller. If these fields are missing, point to a non-existent `Workload` object, or are set on some pods of a replica but not others, the eviction logic will fall back to per-pod counting, which may violate the application's true availability requirements. +- This feature relies on the pod's `spec.workload.name` and `spec.workload.podGroup` fields being correctly set by its managing controller. 
If a user sets `usePodGrouping: true` but the pods are not correctly linked to a `Workload` object, the eviction logic will fall back to per-pod counting, which may violate the application's true availability requirements (though it will prevent a drain from being blocked by misconfiguration). - One failing pod in a large group will make that group "unavailable" if it drops below its `minCount`. A small number of failing pods spread across many groups could prevent all evictions and block a node drain. This is intended behavior (as the application is unhealthy), but may be surprising to operators. - A PDB `selector` that matches pods from multiple different `PodGroup`s (or a mix of grouped and individual pods) may have complex or unintended behavior. Users should be advised to create separate PDBs for each distinct `PodGroup` they wish to protect. @@ -468,35 +480,82 @@ required) or even code snippets. If there's any ambiguity about HOW your proposal will be implemented, this is the place to discuss them. --> +### API Definition + +We will add a new field to `PodDisruptionBudgetSpec` in `pkg/apis/policy/v1/types.go`. + +```go +// PodDisruptionBudgetSpec defines the desired state of PodDisruptionBudget +type PodDisruptionBudgetSpec struct { + // An eviction is allowed if at least "minAvailable" pods selected by + // "selector" will still be available after the eviction, i.e. even in the + // absence of the evicted pod. So, "minAvailable" is a safety threshold, + // an absolute number or a percentage. + // +optional + MinAvailable *intstr.IntOrString `json:"minAvailable,omitempty" protobuf:"bytes,1,opt,name=minAvailable"` + + // Label query over pods whose evictions are managed by the disruption + // budget. + // +optional + Selector *metav1.LabelSelector `json:"selector,omitempty" protobuf:"bytes,2,opt,name=selector"` + + // An eviction is allowed if at most "maxUnavailable" pods selected by + // "selector" are unavailable after the eviction, i.e. 
even in the + // presence of the evicted pod. So, "maxUnavailable" is a safety threshold, + // an absolute number or a percentage. + // +optional + MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty" protobuf:"bytes,3,opt,name=maxUnavailable"` + + // usePodGrouping indicates that availability should be calculated based on + // pod groups defined by the Workload API (pod.spec.workloadReference). + // If set to true, the eviction logic will interpret minAvailable/maxUnavailable + // as a count of PodGroup replicas, not individual pods. + // If a pod matched by the selector does not have a workloadReference, + // it will be treated as an individual pod for availability calculations, + // and a warning will be logged. + // Defaults to false. + // +optional + UsePodGrouping bool `json:"usePodGrouping,omitempty" protobuf:"varint,4,opt,name=usePodGrouping"` +} +``` + #### Eviction Logic Flow -1. Get all pods matching the PDB's `selector`. -2. If no pods have `spec.workloadReference.Name` set, follow with the existing per-pod availability behavior. -3. The controller will log a warning if there are also non-workload pods, as mixing pod types in one PDB is discouraged. These individual pods will be ignored. -4. Group the pods by `spec.workloadReference.Name` and `spec.workloadReference.PodGroup`. -5. Fetch the relevant `PodGroup` information from `Workload` objects' `spec.podGroups`: `PodGroup.replicas` (total replicas) and `PodGroup.policy.gang.minCount` (pods in each replica). -6. Count the number of available replicas: a replica is available if its count of existing, healthy, non-evicting pods `>= minCount`. -7. Count the total desired replicas, the sum of `replicas` for all `PodGroup`s. -8. Compare this available group count and total against the PDB's `minAvailable` or `maxUnavailable` to decide if an eviction is allowed. +If `pdb.spec.usePodGrouping: false` or unset, follow the existing per-pod availability behavior. +If `true`: +1. 
Get all pods matching the PDB's `selector`. +2. Check if all pods have `spec.workloadReference.Name` set. +3. If no pods have `spec.workloadReference.Name`, log a warning (misconfiguration) and fall back to existing per-pod availability. If some pods have `spec.workloadReference.Name` unset, log a warning, as mixing pod types in one PDB is discouraged. Individual pods will be counted as their own group. +4. Find the `Workload` object for each `spec.workloadReference.Name` +5. Find the `PodGroup` in the `Workload` for each `spec.workloadReference.PodGroup` +6. Get `PodGroup.replicas` (total replicas) and `PodGroup.policy.gang.minCount` (pods in each replica). +7. Count the number of available replicas: a replica is available if its count of existing, healthy, non-evicting pods `>= minCount`. +8. Count the total desired replicas, the sum of `replicas` for all `PodGroup`s. +9. Compare this available group count and total against the PDB's `minAvailable` or `maxUnavailable` to decide if an eviction is allowed. ```mermaid graph TD subgraph "Eviction Logic Flow" direction TB - Start(Eviction API Triggered
for a PDB) --> GetPods[Get all pods matching
PDB selector] + Start(Eviction API Triggered
for a PDB) --> CheckFlag{pdb.spec.usePodGrouping
== true?} + class CheckFlag decision - GetPods --> CheckWorkload{Do any pods have
spec.workloadReference set?} - - %% Branch 1: Legacy Path (No Workload Pods) - CheckWorkload -- "No" --> LegacyLogic[Use existing
per-pod logic] + %% Branch 1: Legacy Path (Flag False) + CheckFlag -- "No (default)" --> LegacyLogic[Use existing
per-pod logic] LegacyLogic --> CalcPods[Calculate availability
based on individual
pod
counts] CalcPods --> DecisionLegacy{Pods meet
PDB spec?} DecisionLegacy -- "Yes" --> Allow[✅ Allow Eviction] DecisionLegacy -- "No" --> Deny[❌ Deny Eviction] - %% Branch 2: New Path (Workload-Aware Pods) - CheckWorkload -- "Yes" --> WarnMixed(Log warning if
mixed pod types found.
Individual pods
will be ignored.) + %% Branch 2: New Path (Flag True) + CheckFlag -- "Yes" --> GetPods[Get all pods matching
PDB selector] + GetPods --> CheckWorkload{Do any pods have
spec.workloadReference set?} + + CheckWorkload -- "No" --> WarnMismatched(Log Warning:
'usePodGrouping' is true
but no pods have
workloadReference) + WarnMismatched --> LegacyLogic + + CheckWorkload -- "Yes" --> WarnMixed(Log warning if
mixed pod types found.
Individual pods
will be counted as 1) WarnMixed --> GroupPods[Group pods by
Workload and PodGroup] GroupPods --> FetchGroupInfo[Fetch PodGroup info
from Workloads:
- Total replicas per group
- minCount per group] FetchGroupInfo --> CountAvailable[Count 'available' replicas:
Existing, healthy,
non-evicting pods
must meet minCount] @@ -514,25 +573,19 @@ graph TD class Start,Allow,Deny startEnd class Deny error - class GetPods,LegacyLogic,CalcPods,WarnMixed,GroupPods,FetchGroupInfo,CountAvailable,SumTotalReplicas process - class CheckWorkload,DecisionLegacy,DecisionNew decision + class GetPods,LegacyLogic,CalcPods,WarnMixed,WarnMismatched,GroupPods,FetchGroupInfo,CountAvailable,SumTotalReplicas process + class CheckWorkload,DecisionLegacy,DecisionNew,CheckFlag decision ``` #### Group Health +A `PodGroup` replica is considered available if its number of existing, healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. -A `PodGroup` replica is considered available if its number of healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. - -This logic inherently handles missing pods: the availability calculation is based *only* on the count of *currently healthy* pods. For example, if a replica expects 10 pods (`minCount` is 8) but only 9 pods exist (1 is missing), the replica is still considered **available** as long as 8 or 9 of those existing pods are healthy. If 3 pods are missing and only 7 healthy pods exist, the replica is **unavailable** (since 7 < 8). - -If any pod in an available group is targeted for eviction, the availability of that group replica must be re-evaluated (i.e., "healthy pods - 1") to see if it would drop below `minCount`. +For example, if a replica expects 10 pods with `minCount: 8` but only has 9 healthy pods (1 is missing or unhealthy), the replica is still considered **available**. If 3 pods are missing or unhealthy and only 7 healthy pods exist, the replica is **unavailable**. If any pod in an available group is targeted for eviction, it would be unhealthy post-eviction and is also counted as unavailable for the PDB calculation. -#### Handling `minAvailable` Percentages -The `Workload` object provides the total number of `replicas` for a `PodGroup`. 
This allows percentage-based `minAvailable` (e.g., "80%") and `maxUnavailable` to work correctly, as the total expected number of atomic units (the `PodGroup` replicas) is known. +### Pods without `workloadReference` -#### Pods without `workloadReference` - -If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If the PDB matches *only* individual pods, the standard per-pod logic applies. If it matches a mix, only the grouped-pod logic will apply (and the individual pods will be ignored for the PDB calculation, with a warning). +If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If `usePodGrouping: true` is set, this will be logged as a warning. If the PDB matches *only* individual pods, this will be equivalent to the standard per-pod logic. ### Test Plan @@ -579,8 +632,8 @@ This can inform certain test coverage improvements that we want to do before extending the production code to implement this enhancement. --> -- ``: `` - `` - +- `k8s.io/kubernetes/pkg/controller/disruption`: `` - `` (tests for new eviction logic). + ##### Integration tests - [test name](https://github.com/kubernetes/kubernetes/blob/2334b8469e1983c525c0c6382125710093a25883/test/integration/...): [integration master](https://testgrid.k8s.io/sig-release-master-blocking#integration-master?include-filter-by-regex=MyCoolFeature), [triage search](https://storage.googleapis.com/k8s-triage/index.html?test=MyCoolFeature) +--> + +- An integration test will be added to `test/integration/disruption` to simulate the eviction process. +- **Test 1:** PDB with `usePodGrouping: false` (default) and `Workload`-managed pods. Verify eviction uses per-pod counting. +- **Test 2:** PDB with `usePodGrouping: true` and `Workload`-managed pods. 
Verify eviction uses per-group counting and blocks when `minAvailable` groups would be violated. +- **Test 3:** PDB with `usePodGrouping: true` but with non-`Workload` pods. Verify eviction falls back to per-pod counting and logs a warning. ##### e2e tests @@ -622,9 +680,18 @@ This can be done with: We expect no non-infra related flakes in the last month as a GA graduation criteria. If e2e tests are not necessary or useful, explain why. ---> - [test name](https://github.com/kubernetes/kubernetes/blob/2334b8469e1983c525c0c6382125710093a25883/test/e2e/...): [SIG ...](https://testgrid.k8s.io/sig-...?include-filter-by-regex=MyCoolFeature), [triage search](https://storage.googleapis.com/k8s-triage/index.html?test=MyCoolFeature) +--> + +An e2e test will be added. +1. Create a `Workload` with 2 `PodGroup` replicas, each with `minCount: 3`. +2. Create a PDB with `minAvailable: 1` and `usePodGrouping: true` selecting these pods. +3. Manually schedule pods such that one node drain would disrupt both groups (as in the example given earlier). +4. Attempt to drain the node. +5. Verify the drain is blocked by the PDB. +6. Update PDB to `minAvailable: 0`. +7. Verify the drain proceeds. ### Graduation Criteria @@ -715,6 +782,16 @@ enhancement: cluster required to make on upgrade, in order to make use of the enhancement? --> +Upgrade: +- No changes are required. The new field `usePodGrouping` defaults to `false`, so all existing PDBs will continue to function with per-pod logic. +- To use the feature, users must edit their PDBs to set `usePodGrouping: true`. + +Downgrade: +- If a PDB was created with `usePodGrouping: true`, this field will be dropped when the API server is downgraded (as it's an unknown field). +- The PDB will revert to per-pod logic. This is a behavior change that could violate the application's intended availability (as shown in the user story). +- Operators should remove `usePodGrouping` on all PDBs before a downgrade. 
+ + ### Version Skew Strategy +This feature is entirely contained within the disruption controller in `kube-controller-manager` and the API server. By defaulting to false, a conflict generally reverts to the existing behavior. +- **New API server, old KCM:** The API server will accept the `usePodGrouping` field, but the old KCM will not know about it and will ignore it, always using per-pod logic. This matches the downgrade scenario. +- **Old API server, new KCM:** The new KCM will attempt to read the `usePodGrouping` field, but it won't exist on PDB objects. The KCM will default to `false` and use per-pod logic. + +The feature will only be active when both the API server and `kube-controller-manager` are at the new version and the user has set the field to `true`. + ## Production Readiness Review Questionnaire -It will change the behavior of the Eviction API and kube-scheduler, but should not affect any unrelated components. +No. The default behavior (field unset or `false`) uses existing per-pod availability. The new behavior is opt-in per-PDB. ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? @@ -804,11 +885,11 @@ feature. NOTE: Also set `disable-supported` to `true` or `false` in `kep.yaml`. --> -Yes, update the PDB to remove the field. +Yes, update the PDB to remove the field or set to `false`. ###### What happens if we reenable the feature if it was previously rolled back? -It will be enabled again, there should not be any disruptions. +The group-based logic will be re-enabled on the next eviction which uses the PDB. ###### Are there any tests for feature enablement/disablement? @@ -825,7 +906,7 @@ You can take a look at one potential example of such test in: https://github.com/kubernetes/kubernetes/pull/97058/files#diff-7826f7adbc1996a05ab52e3f5f02429e94b68ce6bce0dc534d1be636154fded3R246-R282 --> -No +Testing will cover both states of the boolean field. 
### Rollout, Upgrade and Rollback Planning @@ -845,6 +926,8 @@ rollout. Similarly, consider large clusters and how enablement/disablement will rollout across nodes. --> +If an operator downgrades the control plane, PDBs with `usePodGrouping: true` will have that field dropped by the older API server. The PDB will silently revert to per-pod logic, which could lead to an application outage during a node drain if the operator was relying on group-based protection. + ###### What specific metrics should inform a rollback? +- An unusually low eviction count (`evictions_total`) might indicate the new logic is too restrictive, or a large number of PDBs are blocking drains. +- An increase in metrics related to unhealthy workloads could indicate the group-based logic is not sufficiently protecting pod groups. + ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? +TODO + ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? +No + ### Monitoring Requirements +`kubectl get pdb -A -o jsonpath='{..spec.usePodGrouping}'` will show PDBs which have the field set. + +If needed, add metric `disruption_controller_pdbs_using_pod_grouping` for the number of PDBs with `usePodGrouping: true`. + ###### How can someone using this feature know that it is working for their instance? +- PDB reconciliation latency should not increase significantly. +- Eviction API latency should not increase significantly. The new logic involves additional API calls to get the `Workload` objects, which should be negligible. + ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? 
-- [ ] Metrics - - Metric name: - - [Optional] Aggregation method: - - Components exposing the metric: -- [ ] Other (treat as last resort) - - Details: +- [x] Metrics + - Metric name: `apiserver_request_duration_seconds` (for eviction requests) + - Components exposing the metric: `kube-apiserver` ###### Are there any missing metrics that would be useful to have to improve observability of this feature? @@ -939,6 +1033,9 @@ Describe the metrics themselves and the reasons why they weren't added (e.g., co implementation difficulties, etc.). --> +Metrics related to the disruption controller, e.g. a `disruption_controller_reconciliations_total` labeled with the replica mode (individual or pod groups). +For catching issues, `disruption_controller_pdb_grouping_misconfig_total` for when `usePodGrouping: true` but no `workloadReference` is found on pods, triggering a fallback. + ### Dependencies +- `Workload` API (CRD) + - Usage description: The disruption controller must be able to GET `Workload` objects by name from a pod's `workloadReference`. + - Impact of its outage on the feature: If the API server is down, evictions won't happen anyway. If the `Workload` CRD is somehow unavailable or the object is missing, the controller will fail to find the group definition. In this case for safety we would deny eviction, as availability cannot be guaranteed. + - Impact of its degraded performance: High latency on GET requests for `Workload` objects would increase the latency of eviction requests. + ### Scalability -No +`GET` on `workload.k8s.io/v1alpha1.Workload` objects from `kube-controller-manager` (disruption controller) during an eviction request and controller reconciliation. This should be low-volume, as evictions are not typically frequent. The controller may use a cache to reduce API calls, for example an informer could prevent some new API calls, but add a `WATCH` from the controller on `Workload`s. 
###### Will enabling / using this feature result in introducing new API types? @@ -1020,7 +1122,11 @@ Describe them, providing: - Estimated increase in size: (e.g., new annotation of size 32B) - Estimated amount of new objects: (e.g., new Object X for every existing Pod) --> -No + +Yes. +- API type(s): `policy/v1.PodDisruptionBudget` +- Estimated increase in size: One boolean field `usePodGrouping`. +- Estimated amount of new objects: 0. ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? @@ -1032,7 +1138,8 @@ Think about adding additional work or introducing new steps in between [existing SLIs/SLOs]: https://git.k8s.io/community/sig-scalability/slos/slos.md#kubernetes-slisslos --> -No + +Not significantly. The eviction check may now potentially perform an additional API call for `Workload` objects and perform the group-based counting logic. ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? @@ -1045,7 +1152,8 @@ This through this both in small and large cases, again with respect to the [supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md --> -No + +If an informer/cache for `Workload` objects is added to the `kube-controller-manager`, this will increase its RAM usage by a small amount for each `Workload` object in the cluster. ###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? @@ -1058,6 +1166,7 @@ If any of the resources can be exhausted, how this is mitigated with the existin Are there any tests that were run/should be run to understand performance characteristics better and validate the declared limits? --> + No ### Troubleshooting @@ -1075,7 +1184,7 @@ details). For now, we leave it here. ###### How does this feature react if the API server and/or etcd is unavailable? 
-No different behavior +No different behavior. Eviction requests will fail regardless if the API server is down. ###### What are other known failure modes? From 67022a307855f8c5d05254401decf6d802e72203 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Wed, 22 Oct 2025 11:19:13 -0700 Subject: [PATCH 31/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 194 ++++++++---------- 1 file changed, 89 insertions(+), 105 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index b7d0d1cc315..d4b60cc1b12 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,9 +173,9 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> -The Eviction API uses PodDisruptionBudgets (PBDs) to ensure availability of a certain number or percentage of pods during voluntary disruptions (node drains). This proposal would allow PDBs to treat groups of pods (defined by the new `Workload` API in the [gang scheduling KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling)) as if they were individual pods for the purposes of measuring availability. We will introduce a new boolean field `usePodGroups` in the PDB which explicitly enables this group-aware eviction logic. +The Eviction API uses PodDisruptionBudgets (PDBs) to ensure availability of a certain number or percentage of pods during voluntary disruptions. This proposal would allow eviction to treat groups of pods as if they were individual replicas for the purposes of measuring availability. We determine pod groups using the new `Workload` API in the [gang scheduling KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling), where pods declare their owning workload and pod group and `Workload` objects contain information about replicas and group size. 
To enable this functionality, optional boolean field `usePodGroups` will be added to the PDB spec. -*Note: as of this writing, the `Workload` API is still in progress, for this KEP we assume it is fully implemented* +*Note: as of this draft, the `Workload` API is still in progress, for this KEP we assume it is fully implemented* ## Motivation @@ -188,7 +188,7 @@ demonstrate the interest in a KEP within the wider Kubernetes community. [experience reports]: https://github.com/golang/go/wiki/ExperienceReports --> -The goal is to make PDBs usable for pod groups as defined by the `Workload` object, which are common for use cases of distributed workloads such as ML training. Eviction or preemption of pods across multiple groups should be recognized as disrupting each of those groups, as opposed to evicting multiple pods from a single group (which only disrupts that one group). For these workloads, the health of the entire replica depends on the simultaneous availability of a certain number of pods within its group (as defined in the `Workload`'s `PodGroup`). +The goal is to make PDBs more useful for pod groups. For example, a multi-pod [LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) replica (intended for distributed workloads like ML training and inference) will fail if any of its pods fails. Eviction or preemption of a small number of pods across multiple replicas would disrupt each replica, as opposed to evicting multiple pods from a single replica (only disrupting that one replica). We want the Eviction API to use a different definition of availability for these cases, based on the health of pod groups rather than individual pods. ### Goals @@ -197,11 +197,11 @@ List the specific goals of the KEP. What is it trying to achieve? How will we know that this has succeeded? --> -- **Introduce an opt-in for group-based PDBs:** Add a new boolean field `usepodGroups` to the `PodDisruptionBudget.spec`. 
+- **Introduce an opt-in for group-based PDBs:** Add a new boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. - **Define availability for pod groups:** Allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. -- **Update eviction logic:** When `usepodGroups: true` is set on a PDB, the eviction logic will use the `Workload` and `PodGroup` definitions (linked by `pod.spec.workload.name`) for grouping and calculating availability. -- **Maintain compatibility:** Ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow the group-based disruption budgets when opted in. -- **Preserve existing functionality:** For backward compatibility, the behavior of PDBs that do not set `usepodGroups: true` will be unchanged. +- **Update eviction logic:** When `usePodGroups: true` is set on a PDB, the eviction logic will use the `Workload` and `PodGroup` definitions (linked by `pod.spec.workload.name`) for grouping and calculate availability of groups. +- **Maintain compatibility:** Ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow group-based disruption budgets when enabled. +- **Preserve existing functionality:** For backward compatibility, the behavior of PDBs without `usePodGroups: true` will be unchanged. ### Non-Goals @@ -218,20 +218,20 @@ This change will only affect the Eviction API. The following are involuntary dis - Evictions by the Kubelet due to node pressure (e.g. memory shortage) - Taint manager deleting NoExecute tainted pods -The only change to the `PodDisruptionBudget` object will be the optional field in the spec to enable the changes. +The only change to object definitions will be the optional field in the `PodDisruptionBudget` spec to enable the changes. 
This change will not affect the behavior of workload controllers for `Deployment`, `StatefulSet`, `Workload`, etc. - The workload controller will be responsible for setting the `workload.name` and `workload.podGroup` on the pods it manages. - The lifecycle and recovery of a disrupted replica is the responsibility of the workload controller, this will only handle evictions. This change will not affect scheduling. -- It is out of scope to introduce any form of or change to gang scheduling. This only handles eviction of already-scheduled pods. +- There will be no additions or changes to gang scheduling. This only handles eviction of already-scheduled pods. Partial replica health: -- This KEP follows the definition of multi-pod replica health from the `Workload` API, using `minCount`. A replica is considered "available" if it meets `minCount`, and "unavailable" if it does not. We are not introducing any other definition of partial health (e.g. a percentage, or requiring all pods healthy). +- This KEP follows the definition of multi-pod replica health from the `Workload` API, using `minCount`. A replica is considered "available" if it meets `minCount`, and "unavailable" if it does not. We are not introducing any other definition of partial health (e.g. percentage). Mixed workload types: -- If a PDB is measuring multi-pod replicas, individual pods without an assigned workload will be ignored. +- If a PDB has multi-pod replicas enabled, individual pods without an assigned workload will be treated as single-pod groups. ## Proposal @@ -244,19 +244,7 @@ The "Design Details" section below is for the real nitty-gritty. --> -We propose adding a new, optional boolean field `usepodGroups` to the `PodDisruptionBudget.spec`. -- If this field is `false` (default) or unset, the Eviction API evaluates the PDB based on individual pod counts, preserving all existing behavior. 
- -When `usepodGroups: true`: -- If a selected pod does not have `spec.workload.name` set, log a warning and treat it as its own group. -- If a selected pod has `spec.workload.name` and `spec.workload.podGroup` set, the eviction manager will treat groups of pods as the atomic unit for availability. - -When using pod groups: -- Identify the `Workload` object referenced by `pod.spec.workload.name`. -- From the `workload.spec.podGroups`, find the entry matching `pod.spec.workload.podGroup`. -- This `PodGroup` object defines the group's policy. The `minCount` (from `policy.gang.minCount`) defines the minimum number of pods required for one replica of that group to be healthy. The `replicas` field defines how many such groups are expected. -- The PDB's `minAvailable` or `maxUnavailable` will be interpreted in terms of these `PodGroup` replicas, not individual pods. -- A `PodGroup` replica is considered "available" only if the number of healthy pods belonging to it meets its `minCount`. +We propose adding a new, optional boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. If this field is `false` (default) or unset, the Eviction API evaluates the PDB based on individual pod counts, preserving all existing behavior. If `true`, the Eviction API will find the `Workload` object and its `PodGroup` as specified by the Pod spec. This `PodGroup` will define the minimum number of pods required for one replica of that group to be healthy, and how many replicas are expected. This will be used to measure availability of pod groups, and the PDB's `minAvailable` or `maxUnavailable` will be interpreted in terms of these `PodGroup` replicas, rather than individual pods. ### User Stories (Optional) @@ -270,13 +258,11 @@ bogged down. #### Story 1: Distributed Workload -The user would create a PDB targeting the worker pods, setting the new flag: - -An engineer is running distributed ML training jobs using the `Workload` object. 
The `Workload` defines a `PodGroup` named `worker` with `replicas: 10` and `policy.gang.minCount: 8`. This means the job requires 10 replicas, and each replica consists of 8 pods.
+An ML engineer is running distributed training jobs using the `Workload` API. The `Workload` defines a `PodGroup` named `worker` with `replicas: 10` and `policy.gang.minCount: 8`. This means the job has 10 replicas, each consisting of at least 8 pods.
 
 To protect this long-running job from voluntary disruptions (like node drains), the user wants to ensure at least 9 of the 10 worker groups remain available.
 
-The user would create a standard PDB targeting the worker pods:
+This user would create a PDB targeting the worker pods:
 
 ```yaml
 apiVersion: policy/v1
@@ -285,92 +271,65 @@ metadata:
   name: my-training-job-workers-pdb
 spec:
   minAvailable: 9
-  usePodGroups: true # <-- New field to enable group logic
+  usePodGroups: true # <-- New field to enable
   selector:
     matchLabels:
       # Assuming pods are labeled by the workload controller
       workload: my-training-job
       pod-group: worker
 ```
+
 Upon node drain, the Eviction API will:
-1. See the PDB `my-training-job-workers-pdb` has `spec.usePodGrouping: true`.
+1. See the PDB `my-training-job-workers-pdb` with `spec.usePodGrouping: true`.
 2. Select all pods matching the selector.
 3. Detect that these pods have `spec.workload.name: my-training-job` and `spec.workload.podGroup: worker`.
 4. Fetch the `Workload` object `my-training-job`.
-5. Identify that the PDB applies to the 'worker' `PodGroup`, which has 10 replicas.
-6. Interpret `minAvailable: 9` as "9 worker `PodGroup` replicas must remain available."
-7. A group is considered disrupted if evicting a pod would cause its healthy pod count to drop below its `minCount` (which is 8).
+5. Find `worker` `PodGroup` in the `Workload`, which has 10 `replicas` and 8 `minCount`.
+6. 
Interpreting `minAvailable: 9` as pod groups, a group is considered disrupted if evicting a pod would cause its healthy pod count to drop below 8. 8. The drain will proceed only if it does not cause the number of available worker groups to drop below 9. This way, the job is protected to run with sufficient replicas during cluster maintenance. #### Story 2: Cluster Maintenance -A cluster administrator frequently drains nodes for upgrades. The cluster has various workloads, including complex multi-pod applications defined by the `Workload` API. +A cluster administrator frequently drains nodes for upgrades. The cluster has various workloads, including multi-pod applications defined by the `Workload` API. -To perform node drains safely, the administrator relies on application owners' PDBs. When the admin issues `kubectl drain `, the Eviction API automatically identifies which pods belong to a `Workload`. It interprets the PDBs for those pods in terms of `PodGroup` replicas instead of individual pods, ensuring that the drain does not violate the application's group-based availability requirements. +To perform node drains safely, the administrator relies on application owners' PDBs. When the admin issues `kubectl drain `, the Eviction API uses the process above and interprets the PDBs in terms of `PodGroup` replicas, ensuring that the drain does not violate the application's group-based availability requirements. -This allows safe maintenance without causing outages, as the drain will pause if it cannot evict pods without violating a group-based PDB. +This allows safe maintenance without causing outages, as the drain will pause if it cannot evict pods without violating a group-based PDB. It will wait for better replica health, more availability, lower requirements, or the admin may contact the application owner to resolve. 
-#### Setup Example +#### Simplified Setup Example ```mermaid graph TD - %% Define Styles for Setup Diagram + %% Define Styles classDef node_box fill:#ececff,stroke:#9696ff,stroke-width:2px,color:#1a1a1a + classDef replica_box fill:#f9f9f9,stroke:#aaa,stroke-width:1px,color:#1a1a1a classDef pod_box fill:#fff,stroke:#ccc,color:#1a1a1a - classDef replica_label fill:none,stroke:none,font-weight:bold,color:#f0f0f0 - - subgraph "Physical Node Setup" - direction LR - - subgraph NodeA ["Node A"] - G0P0("Group 0
Pod 0") - G0P1("Group 0
Pod 1") - end - class NodeA node_box - subgraph NodeB ["Node B"] - G0P2("Group 0
Pod 2") - G1P0("Group 1
Pod 0") + subgraph NodeToDrain ["Node (Being Drained)"] + direction LR %% Arrange replicas side-by-side + + subgraph Replica0 ["Replica 0"] + P0A("Pod 0A") + P0B("Pod 0B") end - class NodeB node_box - - subgraph NodeC ["Node C"] - G1P1("Group 1
Pod 1") - G1P2("Group 1
Pod 2") + class Replica0 replica_box + + subgraph Replica1 ["Replica 1"] + P1A("Pod 1A") + P1B("Pod 1B") end - class NodeC node_box + class Replica1 replica_box - class G0P0,G0P1,G0P2,G1P0,G1P1,G1P2 pod_box end - %% Logical Groupings (shown with links) - subgraph LogicalGrouping ["Logical PodGroup Replicas"] - direction TB - style LogicalGrouping fill:none,stroke:none - G0("Group 0") - G1("Group 1") - class G0,G1 replica_label - end - - G0 -.-> G0P0 - G0 -.-> G0P1 - G0 -.-> G0P2 - - G1 -.-> G1P0 - G1 -.-> G1P1 - G1 -.-> G1P2 - - %% Style all links - linkStyle 0,1,2,3,4,5 stroke:#888,stroke-dasharray: 5 5,stroke-width:2px + class NodeToDrain node_box + class P0A,P0B,P1A,P1B pod_box ``` -Assume the following setup: A `Workload` defines a `PodGroup` with `replicas: 2` and `minCount: 3`. This results in 2 logical groups (replicas) of 3 pods each. As in the diagram, Node A hosts Group 0 Pods 0 and 1, Node B hosts Group 0 Pod 2 and Group 1 Pod 0, and Node C hosts Group 1 Pods 1 and 2. - -To protect at least one 3-pod group in the current system, a user could try a PDB with `minAvailable: 3` (pods). A node drain on Node B would see that there will still be 4 pods remaining afterwards, which satisfies `minAvailable: 3`, and proceed. Technically the PDB was honored, but now both groups have a failing pod (each is missing one pod and no longer meets `minCount: 3`), and the entire application fails. +In this simplified setup, the node being drained contains two replicas, each with two pods (there may be more nodes and replicas which we can ignore). The PDB wants at most one replica unavailable. Currently, the user might try `minUnavailable: 2` (one two-pod replica unavailable). The node drain would start, and could evict a pod from replica 0 and a pod from replica 1 before pausing (as there are only 2 pods left). This would disrupt both replicas. 
With the new changes, a PDB with `usePodGroups: true` and `maxUnavailable: 1` (one replica unavailable) would pause before evicting a pod from the second replica, protecting one of the replicas as intended. 

-After this change, a PDB with `minAvailable: 1` (interpreted as 1 group) would be evaluated. The eviction logic would identify the 2 `PodGroup` replicas. It would determine that evicting the pods on Node B would cause both replicas to become unavailable, violating the PDB (`minAvailable: 1`), and the node drain would safely stop before eviction.


```mermaid
graph TD
@@ -382,44 +341,64 @@ graph TD
     classDef outcome_good fill:#f0fff0,stroke:#aaffaa,stroke-width:2px,color:#111
     classDef process fill:#f0f0f0,stroke:#ccc,color:#111
 
-    StartDrain("kubectl drain
node-b initiated") + %% --- Start --- + StartDrain("kubectl drain node") class StartDrain action - StartDrain --> PDB_Type{Which PDB logic applies?} + StartDrain --> PDB_Type{"PDB"} + class PDB_Type decision - PDB_Type -- "Traditional PDB" --> PDB_Old(PDB Spec:
minAvailable 3 pods
usePodGrouping: false) + %% --- Path 1: Traditional PDB --- + PDB_Type -- "Traditional PDB" --> PDB_Old(PDB Spec:
maxUnavailable: 2 pods
usePodGroups: false) class PDB_Old pdb_spec + + PDB_Old --> TryEvictP0A("Try to evict Pod 0A
(from Replica 0)") + class TryEvictP0A action - PDB_Type -- "Workload-Aware PDB (with KEP)" --> PDB_New(PDB Spec:
minAvailable 1 group
usePodGrouping: true) - class PDB_New pdb_spec + TryEvictP0A --> CheckPods1{"Unavailable pods (1) <= 2?"} + class CheckPods1 decision - %% --- Traditional PDB Flow --- - PDB_Old --> CalcPods(Calculate
available pods) - class CalcPods process + CheckPods1 -- "Yes (1 <= 2)" --> EvictP0A("Eviction Allowed") + class EvictP0A process - CalcPods --> CheckPods{Are remaining pods
>= 3?} - class CheckPods decision + EvictP0A --> TryEvictP1A("Try to evict Pod 1A
(from Replica 1)") + class TryEvictP1A action - CheckPods -- "Yes (4 >= 3)" --> DrainSuccess("Drain Proceeds:
Node B pods evicted") - class DrainSuccess action + TryEvictP1A --> CheckPods2{"Unavailable pods (2) <= 2?"} + class CheckPods2 decision - DrainSuccess --> AppDown("Application State:
Both groups fail
(Technically PDB honored,
but intent violated)") + CheckPods2 -- "Yes (2 <= 2)" --> EvictP1A("Eviction Allowed") + class EvictP1A process + + EvictP1A --> DrainStops("Drain Pauses
(PDB limit reached)") + class DrainStops action + + DrainStops --> AppDown("Application State:
Both replicas are broken
(One pod lost from each)") class AppDown outcome_bad - %% --- Multipod PDB Flow --- - PDB_New --> DetectGroups(Detect pods belong
to a Workload) - class DetectGroups process + %% --- Path 2: Group-Aware PDB (KEP) --- + PDB_Type -- "Group-Aware PDB (KEP)" --> PDB_New(PDB Spec:
maxUnavailable: 1 group
usePodGroups: true) + class PDB_New pdb_spec - DetectGroups --> CalcReplicas(Calculate
available groups) - class CalcReplicas process + PDB_New --> TryEvictP0A_New("Try to evict Pod 0A
(from Replica 0)") + class TryEvictP0A_New action - CalcReplicas --> CheckReplicas{Are remaining groups
>= 1?} - class CheckReplicas decision + TryEvictP0A_New --> CheckGroups1{"Eviction breaks Replica 0.
Unavailable groups (1) <= 1?"} + class CheckGroups1 decision - CheckReplicas -- "No (0 >= 1)" --> DrainBlocked("Drain Blocked:
Eviction prevented") - class DrainBlocked action + CheckGroups1 -- "Yes (1 <= 1)" --> EvictR0("Eviction Allowed") + class EvictR0 process - DrainBlocked --> AppHealthy("Application State:
Both groups healthy
(PDB intent
fully protected)") + EvictR0 --> TryEvictP1A_New("Try to evict Pod 1A
(from Replica 1)") + class TryEvictP1A_New action + + TryEvictP1A_New --> CheckGroups2{"Eviction breaks Replica 1.
Total unavailable groups (2) <= 1?"} + class CheckGroups2 decision + + CheckGroups2 -- "No (2 > 1)" --> EvictP1A_Denied("Eviction Denied
Drain Pauses") + class EvictP1A_Denied action + + EvictP1A_Denied --> AppHealthy("Application State:
Replica 1 is protected
(Only Replica 0 is disrupted)") class AppHealthy outcome_good ``` @@ -585,7 +564,7 @@ For example, if a replica expects 10 pods with `minCount: 8` but only has 9 heal ### Pods without `workloadReference` -If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If `usePodGrouping: true` is set, this will be logged as a warning. If the PDB matches *only* individual pods, this will be equivalent to the standard per-pod logic. +If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If `usePodGrouping: true` is set, this will be logged as a warning. If the PDB matches *only* individual pods, this will be equivalent to the standard per-pod logic. If a selected pod has `spec.workload.name` but no `spec.workload.podGroup`, this is a misconfiguration and it will be treated as unhealthy. ### Test Plan @@ -1232,6 +1211,11 @@ not need to be as detailed as the proposal, but should include enough information to express the idea and why it was not acceptable. --> +Initially there was a plan to integrate directly with multi-pod replica systems (LWS). This would add optional field `replicaKey` to the PDB spec, so the user may provide a label which would identify pods in the same group. For LWS, all pods in a leader+workers group will share the same value for label key `leaderworkerset.sigs.k8s.io/group-key`. This would also require keys to fetch the expected replica count (otherwise we could not detect a missing replica for `maxUnavailable` or a percentage `minAvailable`) and replica size (otherwise we could not detect a missing pod making a replica unhealthy). With the `Workload` API approved and implementaiton in progress, it is better to have both PDBs and LWS integrate with this new core component. 
+ +In the case given in the simplified example above, there may be a way to change the eviction logic to such that the order of pod eviction preserves replicas when possible (e.g. prioritize evicting pods from the replica with the most pods in the node). However, it is simpler to understand and easier ensure intended behavior by just extending the existing PDB budget pattern. It is also unclear if this would work fully when gang scheduling is not used or the number of pods is greater than `minCount`. + + ## Infrastructure Needed (Optional) #### Background on multi-pod replicas (LWS) -In this KEP, the LeaderWorkerSet (LWS) is used as the primary example of a multi-pod replica system. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and each "worker" pod and the number of pods in the group (size). This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. +A LeaderWorkerSet (LWS) the primary example of a multi-pod replica. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and for the "worker" pods. This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. -All worker pods are treated the same: they are created from the same template, operated on in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify "replicas," which are not additional pods (group size), but additional leader+workers pod groups. For unique identification, each worker has an index, and each replica of the group has an index. 
The pods have various labels providing information as seen in the [docs](https://lws.sigs.k8s.io/docs/reference/labels-annotations-and-environment-variables/). +All worker pods are treated the same: they are created from the same template, scheduled in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify `replicas` for the number of leader+workers groups and `size` for the number of pods per group. #### Background on the `Workload` API @@ -514,46 +514,60 @@ If `true`: ```mermaid graph TD - subgraph "Eviction Logic Flow" + %% Define Styles + classDef decision fill:#fff0e6,stroke:#ff9933,stroke-width:2px,color:#111 + classDef process fill:#e6f3ff,stroke:#66b3ff,stroke-width:2px,color:#111 + classDef startEnd fill:#f0fff0,stroke:#aaffaa,stroke-width:2px,color:#111 + classDef error fill:#fff0f0,stroke:#ffaaaa,stroke-width:2px,color:#111 + classDef warning fill:#fff9e6,stroke:#ffd666,stroke-width:2px,color:#111 + + subgraph "Group-Aware Eviction Logic Flow" direction TB - Start(Eviction API Triggered
for a PDB) --> CheckFlag{pdb.spec.usePodGrouping
== true?} - class CheckFlag decision + Start(Eviction API Triggered
for a PDB) --> CheckFlag{"usePodGrouping: true?"} - %% Branch 1: Legacy Path (Flag False) - CheckFlag -- "No (default)" --> LegacyLogic[Use existing
per-pod logic] - LegacyLogic --> CalcPods[Calculate availability
based on individual
pod
counts] - CalcPods --> DecisionLegacy{Pods meet
PDB spec?} + %% Branch 1: Legacy Path (Flag False/Unset) + CheckFlag -- "No (default)" --> LegacyLogic[Use existing
per-pod availability logic] + LegacyLogic --> DecisionLegacy{"Pods meet
PDB spec?"} DecisionLegacy -- "Yes" --> Allow[✅ Allow Eviction] DecisionLegacy -- "No" --> Deny[❌ Deny Eviction] %% Branch 2: New Path (Flag True) - CheckFlag -- "Yes" --> GetPods[Get all pods matching
PDB selector] - GetPods --> CheckWorkload{Do any pods have
spec.workloadReference set?} + CheckFlag -- "Yes" --> GetPods[1. Get all pods matching
PDB selector] + + GetPods --> CheckWorkloadRefs{"2. Pods with
workloadReference?"} + + %% Path 2a: No workloadReference (Misconfiguration) + CheckWorkloadRefs -- "None" --> WarnMisconfig[3. Log Warning:
'usePodGrouping: true'
but no pods have
workloadReference] + WarnMisconfig --> LegacyLogic[Fall back to
per-pod logic] + + %% Path 2b: Mixed pod types + CheckWorkloadRefs -- "Some (Mixed)" --> WarnMixed[3. Log Warning:
Mixed pod types found.
Treat individual pods
as their own group.] + WarnMixed --> FindWorkloads + + %% Path 2c: All pods have workloadReference + CheckWorkloadRefs -- "All" --> FindWorkloads[4. Find Workload object
for each pod] + + %% Continue Group Logic Flow + FindWorkloads --> FindPodGroups[5. Find PodGroup in
Workload for each pod] + FindPodGroups --> GetGroupInfo[6. Get PodGroup
replicas & minCount] + + %% --- CORRECTED LINE --- + GetGroupInfo --> CountAvailable["7. Count available replicas
(healthy pods >= minCount)"] + + CountAvailable --> SumTotal[8. Sum total desired replicas
from all PodGroups] + SumTotal --> DecisionNew{"9. Compare available/total
group counts
against PDB spec"} - CheckWorkload -- "No" --> WarnMismatched(Log Warning:
'usePodGrouping' is true
but no pods have
workloadReference) - WarnMismatched --> LegacyLogic - - CheckWorkload -- "Yes" --> WarnMixed(Log warning if
mixed pod types found.
Individual pods
will be counted as 1) - WarnMixed --> GroupPods[Group pods by
Workload and PodGroup] - GroupPods --> FetchGroupInfo[Fetch PodGroup info
from Workloads:
- Total replicas per group
- minCount per group] - FetchGroupInfo --> CountAvailable[Count 'available' replicas:
Existing, healthy,
non-evicting pods
must meet minCount] - CountAvailable --> SumTotalReplicas[Sum total desired
replicas from all
matched groups] - SumTotalReplicas --> DecisionNew{Compare available/total
group counts
against PDB spec} DecisionNew -- "Yes" --> Allow DecisionNew -- "No" --> Deny end - %% Styling (with dark text color for readability) - classDef decision fill:#fff0e6,stroke:#ff9933,stroke-width:2px,color:#111 - classDef process fill:#e6f3ff,stroke:#66b3ff,stroke-width:2px,color:#111 - classDef startEnd fill:#f0fff0,stroke:#aaffaa,stroke-width:2px,color:#111 - classDef error fill:#fff0f0,stroke:#ffaaaa,stroke-width:2px,color:#111 - + %% Styling class Start,Allow,Deny startEnd class Deny error - class GetPods,LegacyLogic,CalcPods,WarnMixed,WarnMismatched,GroupPods,FetchGroupInfo,CountAvailable,SumTotalReplicas process - class CheckWorkload,DecisionLegacy,DecisionNew,CheckFlag decision + class WarnMisconfig,WarnMixed warning + class GetPods,LegacyLogic,DecisionLegacy,FindWorkloads,FindPodGroups,GetGroupInfo,CountAvailable,SumTotal process + class CheckWorkloadRefs,DecisionNew,CheckFlag decision ``` #### Group Health @@ -562,7 +576,7 @@ A `PodGroup` replica is considered available if its number of existing, healthy, For example, if a replica expects 10 pods with `minCount: 8` but only has 9 healthy pods (1 is missing or unhealthy), the replica is still considered **available**. If 3 pods are missing or unhealthy and only 7 healthy pods exist, the replica is **unavailable**. If any pod in an available group is targeted for eviction, it would be unhealthy post-eviction and is also counted as unavailable for the PDB calculation. -### Pods without `workloadReference` +### Pods missing fields If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If `usePodGrouping: true` is set, this will be logged as a warning. If the PDB matches *only* individual pods, this will be equivalent to the standard per-pod logic. If a selected pod has `spec.workload.name` but no `spec.workload.podGroup`, this is a misconfiguration and it will be treated as unhealthy. 

@@ -1211,7 +1225,7 @@ not
 need to be as detailed as the proposal, but should include enough
 information to express the idea and why it was not acceptable.
 -->
-Initially there was a plan to integrate directly with multi-pod replica systems (LWS). This would add optional field `replicaKey` to the PDB spec, so the user may provide a label which would identify pods in the same group. For LWS, all pods in a leader+workers group will share the same value for label key `leaderworkerset.sigs.k8s.io/group-key`. This would also require keys to fetch the expected replica count (otherwise we could not detect a missing replica for `maxUnavailable` or a percentage `minAvailable`) and replica size (otherwise we could not detect a missing pod making a replica unhealthy). With the `Workload` API approved and implementaiton in progress, it is better to have both PDBs and LWS integrate with this new core component.
+Initially there was a plan to integrate directly with multi-pod replica systems (LWS). This would add optional field `replicaKey` to the PDB spec, so the user may provide a label which would identify pods in the same group. For LWS, all pods in a leader+workers group will share the same value for label key `leaderworkerset.sigs.k8s.io/group-key`. This would also require keys to fetch the expected replica count (otherwise we could not detect a missing replica for `maxUnavailable` or a percentage `minAvailable`) and replica size (otherwise we could not detect a missing pod making a replica unhealthy). This would also require some changes to make the LWS [labels/annotations](https://lws.sigs.k8s.io/docs/reference/labels-annotations-and-environment-variables/) more easily available. With the `Workload` API approved and implementation in progress, it is better to have both PDBs and LWS integrate with this new core component. 
In the case given in the simplified example above, there may be a way to change the eviction logic to such that the order of pod eviction preserves replicas when possible (e.g. prioritize evicting pods from the replica with the most pods in the node). However, it is simpler to understand and easier ensure intended behavior by just extending the existing PDB budget pattern. It is also unclear if this would work fully when gang scheduling is not used or the number of pods is greater than `minCount`. From 21c9b529b90ae0f30002ce260e6ab02b1baab907 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Wed, 22 Oct 2025 11:40:08 -0700 Subject: [PATCH 33/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 22bf8b00c2e..10ca6dc7c1a 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -280,7 +280,7 @@ spec: ``` Upon node drain, the Eviction API will: -1. See the PDB `my-training-job-workers-pdb` with `spec.usePodGrouping: true`. +1. See the PDB `my-training-job-workers-pdb` with `spec.usePodGroups: true`. 2. Select all pods matching the selector. 3. Detect that these pods have `spec.workload.name: my-training-job` and `spec.workload.podGroup: worker`. 4. Fetch the `Workload` object `my-training-job`. @@ -428,9 +428,7 @@ A `Workload` object contains a list of `PodGroup`s. Each `PodGroup` defines: * `policy`: The scheduling policy, such as `Gang`. * `policy.gang.minCount`: The minimum number of pods required for one replica of that group. -In the context of LWS, a LWS replica is equivalent to a PodGroup replica, and its `size` is `minCount`. 

-
-This KEP assumes that a controller (like the one managing `Workload` objects) will create pods and set `pod.spec.workload.name` and `pod.spec.workload.podGroup` on each pod it creates, linking it back to the `Workload` definition. The eviction logic uses this link to read the group's requirements.
+A LWS replica would correspond to a PodGroup replica, with its `size` being `minCount`.
 
 ### Risks and Mitigations
 
@@ -446,7 +444,7 @@ How will UX be reviewed, and by whom?
 
 Consider including folks who also work outside the SIG or subproject.
 -->
 
-- This feature relies on the pod's `spec.workload.name` and `spec.workload.podGroup` fields being correctly set by its managing controller. If a user sets `usePodGrouping: true` but the pods are not correctly linked to a `Workload` object, the eviction logic will fall back to per-pod counting, which may violate the application's true availability requirements (though it will prevent a drain from being blocked by misconfiguration).
+- This feature relies on the pod's `spec.workload.name` and `spec.workload.podGroup` fields being correctly set by its managing controller. If a user sets `usePodGroups: true` but the pods are not correctly linked to a `Workload` object, the eviction logic will fall back to per-pod counting, which may violate the application's true availability requirements (though it will prevent a drain from being blocked by misconfiguration).
 - One failing pod in a large group will make that group "unavailable" if it drops below its `minCount`. A small number of failing pods spread across many groups could prevent all evictions and block a node drain. This is intended behavior (as the application is unhealthy), but may be surprising to operators.
 - A PDB `selector` that matches pods from multiple different `PodGroup`s (or a mix of grouped and individual pods) may have complex or unintended behavior. Users should be advised to create separate PDBs for each distinct `PodGroup` they wish to protect. 

@@ -485,7 +483,7 @@ type PodDisruptionBudgetSpec struct {
 	// +optional
 	MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty" protobuf:"bytes,3,opt,name=maxUnavailable"`
 
-	// usePodGrouping indicates that availability should be calculated based on
+	// UsePodGroups indicates that availability should be calculated based on
 	// pod groups defined by the Workload API (pod.spec.workloadReference).
 	// If set to true, the eviction logic will interpret minAvailable/maxUnavailable
 	// as a count of PodGroup replicas, not individual pods.
@@ -494,13 +492,13 @@ type PodDisruptionBudgetSpec struct {
 	// and a warning will be logged.
 	// Defaults to false.
 	// +optional
-	UsePodGrouping bool `json:"usePodGrouping,omitempty" protobuf:"varint,4,opt,name=usePodGrouping"`
+	UsePodGroups bool `json:"usePodGroups,omitempty" protobuf:"varint,4,opt,name=usePodGroups"`
 }
 ```
 
 #### Eviction Logic Flow
 
-If `pdb.spec.usePodGrouping: false` or unset, follow the existing per-pod availability behavior.
+If `pdb.spec.usePodGroups: false` or unset, follow the existing per-pod availability behavior.
 If `true`:
 1. Get all pods matching the PDB's `selector`.
 2. Check if all pods have `spec.workloadReference.Name` set.
@@ -524,7 +522,7 @@ graph TD
 
   subgraph "Group-Aware Eviction Logic Flow"
     direction TB
-    Start(Eviction API Triggered
for a PDB) --> CheckFlag{"usePodGrouping: true?"} + Start(Eviction API Triggered
for a PDB) --> CheckFlag{"usePodGroups: true?"} %% Branch 1: Legacy Path (Flag False/Unset) CheckFlag -- "No (default)" --> LegacyLogic[Use existing
per-pod availability logic] @@ -538,7 +536,7 @@ graph TD GetPods --> CheckWorkloadRefs{"2. Pods with
workloadReference?"} %% Path 2a: No workloadReference (Misconfiguration) - CheckWorkloadRefs -- "None" --> WarnMisconfig[3. Log Warning:
'usePodGrouping: true'
but no pods have
workloadReference] + CheckWorkloadRefs -- "None" --> WarnMisconfig[3. Log Warning:
'usePodGroups: true'
but no pods have
workloadReference] WarnMisconfig --> LegacyLogic[Fall back to
per-pod logic] %% Path 2b: Mixed pod types @@ -578,7 +576,7 @@ For example, if a replica expects 10 pods with `minCount: 8` but only has 9 heal ### Pods missing fields -If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If `usePodGrouping: true` is set, this will be logged as a warning. If the PDB matches *only* individual pods, this will be equivalent to the standard per-pod logic. If a selected pod has `spec.workload.name` but no `spec.workload.podGroup`, this is a misconfiguration and it will be treated as unhealthy. +If a PDB's `selector` matches a pod that is missing the `spec.workloadReference` field (or its `Name` is empty), it will be treated as an individual pod. If `usePodGroups: true` is set, this will be logged as a warning. If the PDB matches *only* individual pods, this will be equivalent to the standard per-pod logic. If a selected pod has `spec.workload.name` but no `spec.workload.podGroup`, this is a misconfiguration and it will be treated as unhealthy. ### Test Plan @@ -654,9 +652,9 @@ This can be done with: --> - An integration test will be added to `test/integration/disruption` to simulate the eviction process. -- **Test 1:** PDB with `usePodGrouping: false` (default) and `Workload`-managed pods. Verify eviction uses per-pod counting. -- **Test 2:** PDB with `usePodGrouping: true` and `Workload`-managed pods. Verify eviction uses per-group counting and blocks when `minAvailable` groups would be violated. -- **Test 3:** PDB with `usePodGrouping: true` but with non-`Workload` pods. Verify eviction falls back to per-pod counting and logs a warning. +- **Test 1:** PDB with `usePodGroups: false` (default) and `Workload`-managed pods. Verify eviction uses per-pod counting. +- **Test 2:** PDB with `usePodGroups: true` and `Workload`-managed pods. Verify eviction uses per-group counting and blocks when `minAvailable` groups would be violated. 
+- **Test 3:** PDB with `usePodGroups: true` but with non-`Workload` pods. Verify eviction falls back to per-pod counting and logs a warning. ##### e2e tests @@ -679,7 +677,7 @@ If e2e tests are not necessary or useful, explain why. An e2e test will be added. 1. Create a `Workload` with 2 `PodGroup` replicas, each with `minCount: 3`. -2. Create a PDB with `minAvailable: 1` and `usePodGrouping: true` selecting these pods. +2. Create a PDB with `minAvailable: 1` and `usePodGroups: true` selecting these pods. 3. Manually schedule pods such that one node drain would disrupt both groups (as in the example given earlier). 4. Attempt to drain the node. 5. Verify the drain is blocked by the PDB. @@ -776,13 +774,13 @@ enhancement: --> Upgrade: -- No changes are required. The new field `usePodGrouping` defaults to `false`, so all existing PDBs will continue to function with per-pod logic. -- To use the feature, users must edit their PDBs to set `usePodGrouping: true`. +- No changes are required. The new field `usePodGroups` defaults to `false`, so all existing PDBs will continue to function with per-pod logic. +- To use the feature, users must edit their PDBs to set `usePodGroups: true`. Downgrade: -- If a PDB was created with `usePodGrouping: true`, this field will be dropped when the API server is downgraded (as it's an unknown field). +- If a PDB was created with `usePodGroups: true`, this field will be dropped when the API server is downgraded (as it's an unknown field). - The PDB will revert to per-pod logic. This is a behavior change that could violate the application's intended availability (as shown in the user story). -- Operators should remove `usePodGrouping` on all PDBs before a downgrade. +- Operators should remove `usePodGroups` on all PDBs before a downgrade. ### Version Skew Strategy @@ -801,8 +799,8 @@ enhancement: --> This feature is entirely contained within the disruption controller in `kube-controller-manager` and the API server. 
By defaulting to false, a conflict generally reverts to the existing behavior. -- **New API server, old KCM:** The API server will accept the `usePodGrouping` field, but the old KCM will not know about it and will ignore it, always using per-pod logic. This matches the downgrade scenario. -- **Old API server, new KCM:** The new KCM will attempt to read the `usePodGrouping` field, but it won't exist on PDB objects. The KCM will default to `false` and use per-pod logic. +- **New API server, old KCM:** The API server will accept the `usePodGroups` field, but the old KCM will not know about it and will ignore it, always using per-pod logic. This matches the downgrade scenario. +- **Old API server, new KCM:** The new KCM will attempt to read the `usePodGroups` field, but it won't exist on PDB objects. The KCM will default to `false` and use per-pod logic. The feature will only be active when both the API server and `kube-controller-manager` are at the new version and the user has set the field to `true`. @@ -852,7 +850,7 @@ well as the [existing list] of feature gates. - Feature gate name: - Components depending on the feature gate: - [x] Other - - Describe the mechanism: The feature is enabled on a per-PDB basis with `spec.usePodGrouping: true`. It is disabled by default. + - Describe the mechanism: The feature is enabled on a per-PDB basis with `spec.usePodGroups: true`. It is disabled by default. - Will enabling / disabling the feature require downtime of the control plane? No - Will enabling / disabling the feature require downtime or reprovisioning of a node? No @@ -919,7 +917,7 @@ rollout. Similarly, consider large clusters and how enablement/disablement will rollout across nodes. --> -If an operator downgrades the control plane, PDBs with `usePodGrouping: true` will have that field dropped by the older API server. 
The PDB will silently revert to per-pod logic, which could lead to an application outage during a node drain if the operator was relying on group-based protection. +If an operator downgrades the control plane, PDBs with `usePodGroups: true` will have that field dropped by the older API server. The PDB will silently revert to per-pod logic, which could lead to an application outage during a node drain if the operator was relying on group-based protection. ###### What specific metrics should inform a rollback? @@ -966,9 +964,9 @@ checking if there are objects with field X set) may be a last resort. Avoid logs or events for this purpose. --> -`kubectl get pdb -A -o jsonpath='{..spec.usePodGrouping}'` will show PDBs which have the field set. +`kubectl get pdb -A -o jsonpath='{..spec.usePodGroups}'` will show PDBs which have the field set. -If needed, add metric `disruption_controller_pdbs_using_pod_grouping` for the number of PDBs with `usePodGrouping: true`. +If needed, add metric `disruption_controller_pdbs_using_pod_grouping` for the number of PDBs with `usePodGroups: true`. ###### How can someone using this feature know that it is working for their instance? @@ -1027,7 +1025,7 @@ implementation difficulties, etc.). --> Metrics related to the disruption controller, e.g. a `disruption_controller_reconciliations_total` labeled with the replica mode (individual or pod groups). -For catching issues, `disruption_controller_pdb_grouping_misconfig_total` for when `usePodGrouping: true` but no `workloadReference` is found on pods, triggering a fallback. +For catching issues, `disruption_controller_pdb_grouping_misconfig_total` for when `usePodGroups: true` but no `workloadReference` is found on pods, triggering a fallback. ### Dependencies @@ -1118,7 +1116,7 @@ Describe them, providing: Yes. - API type(s): `policy/v1.PodDisruptionBudget` -- Estimated increase in size: One boolean field `usePodGrouping`. +- Estimated increase in size: One boolean field `usePodGroups`. 
- Estimated amount of new objects: 0. ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? From 82349fc3314eecbfc38bb03272a4f18f4a3dde02 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Wed, 22 Oct 2025 12:47:12 -0700 Subject: [PATCH 34/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 10ca6dc7c1a..52706bd4f21 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -197,7 +197,7 @@ List the specific goals of the KEP. What is it trying to achieve? How will we know that this has succeeded? --> -- **Introduce an opt-in for group-based PDBs:** Add a new boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. +- **Introduce a field to enable group-based PDBs:** Add a new boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. - **Define availability for pod groups:** Allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. - **Update eviction logic:** When `usePodGroups: true` is set on a PDB, the eviction logic will use the `Workload` and `PodGroup` definitions (linked by `pod.spec.workload.name`) for grouping and calculate availability of groups. - **Maintain compatibility:** Ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow group-based disruption budgets when enabled. 
From e9c6a784b08259334e10ba15a05df9af1fa7a0d5 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 24 Oct 2025 10:00:38 -0700 Subject: [PATCH 35/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 52706bd4f21..955b4a37357 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -188,7 +188,7 @@ demonstrate the interest in a KEP within the wider Kubernetes community. [experience reports]: https://github.com/golang/go/wiki/ExperienceReports --> -The goal is to make PDBs more useful for pod groups. For example, a multi-pod [LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) replica (intended for distributed workloads like ML training and inference) will fail if any of its pods fails. Eviction or preemption of a small number of pods across multiple replicas would disrupt each replica, as opposed to evicting multiple pods from a single replica (only disrupting that one replica). We want the Eviction API to use a different definition of avalability for these cases, based on the health of pod groups rather than individual pods. +The goal of this KEP is to make PDBs more useful for pod groups, particularly with the `Workload API`. Eviction or preemption of a small number of pods across multiple multi-pod replicas could disrupt each replica, as opposed to evicting multiple pods from a single replica (only disrupting that one replica). We want to enable the Eviction API to define avaiability based on the health of pod groups, rather than individual pods. ### Goals @@ -199,7 +199,8 @@ know that this has succeeded? - **Introduce a field to enable group-based PDBs:** Add a new boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. 
- **Define availability for pod groups:** Allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. -- **Update eviction logic:** When `usePodGroups: true` is set on a PDB, the eviction logic will use the `Workload` and `PodGroup` definitions (linked by `pod.spec.workload.name`) for grouping and calculate availability of groups. +- **Update eviction logic:** When `usePodGroups: true` is set in a PDB spec, the eviction logic will interpret the disruption budget given in the PDB (`minAvailable` or `maxUnavailable`) as a count of pod group replicas, rather than individual pod replicas. +- **Integrate with Workload API:** Use the pod spec's `workload.name` and `workload.podGroup` to retrieve `Workload` objects and their `PodGroup` groupings. - **Maintain compatibility:** Ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow group-based disruption budgets when enabled. - **Preserve existing functionality:** For backward compatibility, the behavior of PDBs without `usePodGroups: true` will be unchanged. @@ -411,11 +412,6 @@ Go in to as much detail as necessary here. This might be a good place to talk about core concepts and how they relate. --> -#### Background on multi-pod replicas (LWS) -A LeaderWorkerSet (LWS) the primary example of a multi-pod replica. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and for the "worker" pods. This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. - -All worker pods are treated the same: they are created from the same template, scheduled in parallel, and if any workers fail the group is considered failing. 
A LeaderWorkerSet object will specify `replicas` for the number of leader+workers groups and `size` for the number of pods per group. - #### Background on the `Workload` API This KEP assumes that a pod controller (like the one managing `Workload` objects) will create pods and set `pod.spec.workload.name` and `pod.spec.workload.podGroup` on each pod it creates, linking it back to the `Workload` definition. The eviction logic uses this link to read the group's requirements. @@ -428,7 +424,10 @@ A `Workload` object contains a list of `PodGroup`s. Each `PodGroup` defines: * `policy`: The scheduling policy, such as `Gang`. * `policy.gang.minCount`: The minimum number of pods required for one replica of that group. -A LWS replica is would correspond to a PodGroup replica, with its `size` being `minCount`. +#### Background on multi-pod replicas (LeaderWorkerSet) + +[LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) (LWS) is the primary example of a multi-pod replica. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and for the "worker" pods. This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. All worker pods are treated the same: they are created from the same template, scheduled in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify `replicas` for the number of leader+workers groups and `size` for the number of pods per group. A LWS replica is would correspond to a PodGroup replica, with its `size` being `minCount`. + ### Risks and Mitigations @@ -571,7 +570,7 @@ graph TD #### Group Health A `PodGroup` replica is considered available if its number of existing, healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. 
-For example, if a replica expects 10 pods with `minCount: 8` but only has 9 healthy pods (1 is missing or unhealthy), the replica is still considered **available**. If 3 pods are missing or unhealthy and only 7 healthy pods exist, the replica is **unavailable**. If any pod in an available group is targeted for eviction, it would be unhealthy post-eviction and is also counted as unavailable for the PDB calculation. +For example, if a replica expects 10 pods with `minCount: 8` but only has 9 healthy pods (1 is missing or unhealthy), the replica is still considered healthy. If 3 pods are missing or unhealthy and only 7 healthy pods exist, the replica is unhealthy. If any pod in an available group is targeted for eviction, it would be unhealthy post-eviction and is also counted as unavailable for the PDB calculation. ### Pods missing fields @@ -937,7 +936,7 @@ Longer term, we may want to require automated upgrade/rollback tests, but we are missing a bunch of machinery and tooling and can't do that now. --> -TODO + ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? @@ -1227,6 +1226,7 @@ Initially there was a plan to integrate directly with multi-pod replica systems In the case given in the simplified example above, there may be a way to change the eviction logic to such that the order of pod eviction preserves replicas when possible (e.g. prioritize evicting pods from the replica with the most pods in the node). However, it is simpler to understand and easier ensure intended behavior by just extending the existing PDB budget pattern. It is also unclear if this would work fully when gang scheduling is not used or the number of pods is greater than `minCount`. +Rather than using a field in the PDB spec, it would be possible to detect if any selected pods have the Workload API enabled by checking their spec for `workload.name`. However, we want this new behavior to be something explicitly enabled. 
Silently changing the behavior of existing PDB fields (`minAvaiable`/`maxUnavailable`), based on context from other objects, could cause confusion and possibly unintended disruptions. ## Infrastructure Needed (Optional) From b1d9559be0b18ffe15d821f065d4a2b796c7f62f Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Fri, 24 Oct 2025 12:08:36 -0700 Subject: [PATCH 36/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 955b4a37357..3480a8cdbd0 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -65,7 +65,7 @@ If none of those approvers are still appropriate, then changes to that list should be approved by the remaining approvers and/or the owning SIG (or SIG Architecture for cross-cutting KEPs). --> -# KEP-NNNN: PDB for Workload Replicas +# KEP-NNNN: PDB for Multi-Pod Replicas -The Eviction API uses PodDisruptionBudgets (PBDs) to ensure availability of a certain number or percentage of pods during voluntary disruptions. This proposal would allow eviction to treat groups of pods as if they were individual replicas for the purposes of measuring availability. We determine pod groups using the new `Workload` API in the [gang scheduling KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling), where pods declare their owning workload and pod group and `Workload` objects contain information about replicas and group size. To enable this functionality, optional boolean field `usePodGroups` will be added to the PDB spec. +Voluntary disruptions (node drains) can disrupt an application, as pods get evicted from the node. Users may create a PodDisruptionBudget (PBD) object to specify that a certain number (or percentage) of pods must remain available. 
If a pod eviction would violate the availability threshold given by the PDB, the disruption controller will block the eviction, protecting the availability of the application. Some applications will use `PodGroups` as defined in the new [Workload API](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling), which each act as a single replica composed of multiple pods, requiring additional eviction logic to protect from disruptions. For example, in a [LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) designed for distributed ML training or inference, one pod from a group being evicted would fail the entire group. -*Note: as of this draft, the `Workload` API is still in progress, for this KEP we assume it is fully implemented* +This KEP will allow the Eviction API to treat each pod group as if it were a single replica when calculating availability for a PDB. To enable this functionality, the PDB spec will have optional boolean `usePodGroups` , and if enabled, the PDB will enforce a number of *pod group replicas* that must remain available, rather than a number of *individual pod replicas* as it is now. + +**Note: as of this draft, the Workload API is still in progress, for this KEP we assume it is fully implemented** ## Motivation @@ -188,7 +190,7 @@ demonstrate the interest in a KEP within the wider Kubernetes community. [experience reports]: https://github.com/golang/go/wiki/ExperienceReports --> -The goal of this KEP is to make PDBs more useful for pod groups, particularly with the `Workload API`. Eviction or preemption of a small number of pods across multiple multi-pod replicas could disrupt each replica, as opposed to evicting multiple pods from a single replica (only disrupting that one replica). We want to enable the Eviction API to define avaiability based on the health of pod groups, rather than individual pods. +The goal of this KEP is to make PDBs more useful for pod groups. 
We want to prevent eviction or preemption of a small number of pods across multiple multi-pod replicas, as this could disrupt each replica (as opposed to evicting multiple pods from a single replica, only disrupting that one replica). This will be achieved by counting pod groups, rather than individual pods, when calculating avaiability for eviction. ### Goals From c63c37b9b6fc567707ac91e5c5ef861ae9591f61 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Mon, 27 Oct 2025 10:27:57 -0700 Subject: [PATCH 37/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 3480a8cdbd0..7aa33596bd3 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -428,8 +428,9 @@ A `Workload` object contains a list of `PodGroup`s. Each `PodGroup` defines: #### Background on multi-pod replicas (LeaderWorkerSet) -[LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) (LWS) is the primary example of a multi-pod replica. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and for the "worker" pods. This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. All worker pods are treated the same: they are created from the same template, scheduled in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify `replicas` for the number of leader+workers groups and `size` for the number of pods per group. A LWS replica is would correspond to a PodGroup replica, with its `size` being `minCount`. 
+[LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) (LWS) is the primary implementation of a multi-pod replica. The LWS API allows users to manage a group of pods together as if they were a single pod, by specifying a template for a "leader" pod and for the "worker" pods. This is useful in cases where a leader process coordinates multiple worker processes, particularly in AI/ML distributed workloads for model training and inference. All worker pods are treated the same: they are created from the same template, scheduled in parallel, and if any workers fail the group is considered failing. A LeaderWorkerSet object will specify `replicas` for the number of leader+workers groups and `size` for the number of pods per group. +LWS is planned to be integrated with the Workload API ([KEP](https://docs.google.com/document/d/1QlcIBtR2KyOKYRUTGubhhxuy7NfjHs1fXMJlvdUCyhM/edit?tab=t.0#heading=h.dxr6zknxhiui)). Each LWS replica would correspond to a PodGroup replica, with its `size` being `minCount`. ### Risks and Mitigations From fa6e7d673e0343e8c0c116dddf953fc96c21bd4f Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Tue, 28 Oct 2025 11:26:13 -0700 Subject: [PATCH 38/41] Update README.md --- .../draft-20251010-multipod-pdb/README.md | 74 ++++++++----------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 7aa33596bd3..71718c948ee 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -173,9 +173,11 @@ useful for a wide audience. A good summary is probably at least a paragraph in length. --> -Voluntary disruptions (node drains) can disrupt an application, as pods get evicted from the node. Users may create a PodDisruptionBudget (PBD) object to specify that a certain number (or percentage) of pods must remain available. 
If a pod eviction would violate the availability threshold given by the PDB, the disruption controller will block the eviction, protecting the availability of the application. Some applications will use `PodGroups` as defined in the new [Workload API](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling), which each act as a single replica composed of multiple pods, requiring additional eviction logic to protect from disruptions. For example, in a [LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) designed for distributed ML training or inference, one pod from a group being evicted would fail the entire group. +Voluntary disruptions (node drains) will evict pods from a node, potentially causing issues in an application that relies on having one or more replicas running. To specify that a certain number (or percentage) of pods must remain available, users may create a `PodDisruptionBudget` (PDB) object and declare a `minAvailable` or `maxUnavailable` in its spec. Then, if a pod eviction would violate the availability threshold given by the PDB, the disruption controller will block the eviction and protect the availability of the application. -This KEP will allow the Eviction API to treat each pod group as if it were a single replica when calculating availability for a PDB. To enable this functionality, the PDB spec will have optional boolean `usePodGroups` , and if enabled, the PDB will enforce a number of *pod group replicas* that must remain available, rather than a number of *individual pod replicas* as it is now. +However, some applications will use `PodGroups` as defined in the new [Workload API](https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/4671-gang-scheduling), in which a group of pods acts as a single "superpod" entity (i.e., each replica is composed of multiple pods). These require more complex eviction logic to protect from disruptions. 
For example, in a [LeaderWorkerSet](https://lws.sigs.k8s.io/docs/overview/) running a distributed ML training job, one pod in a group being evicted would cause the entire group to fail. + +This KEP will allow the Eviction API to treat each pod group as if it were a single replica when calculating availability for a PDB. To enable this new behavior, the PDB spec will have optional boolean `usePodGroups`, and if `true`, the PDB will enforce a number of *pod group replicas* that must remain available, rather than a number of *individual pod replicas* as it is now. **Note: as of this draft, the Workload API is still in progress, for this KEP we assume it is fully implemented** @@ -190,7 +192,7 @@ demonstrate the interest in a KEP within the wider Kubernetes community. [experience reports]: https://github.com/golang/go/wiki/ExperienceReports --> -The goal of this KEP is to make PDBs more useful for pod groups. We want to prevent eviction or preemption of a small number of pods across multiple multi-pod replicas, as this could disrupt each replica (as opposed to evicting multiple pods from a single replica, only disrupting that one replica). This will be achieved by counting pod groups, rather than individual pods, when calculating avaiability for eviction. +The goal of this KEP is to improve the experience of using PDBs and the Eviction API for applications with multi-pod replicas. Most importantly, eviction of a small number of pods spread across multiple multi-pod replicas could disrupt each replica. This will be prevented by new functionality for calculating availability for eviction based on disrupted pod groups, rather than individual pods. ### Goals @@ -201,7 +203,7 @@ know that this has succeeded? --> - **Introduce a field to enable group-based PDBs:** Add a new boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. 
- **Define availability for pod groups:** Allow application owners to define PDBs for multi-pod replicas (as defined by the `Workload` API) rather than individual pods. -- **Update eviction logic:** When `usePodGroups: true` is set in a PDB spec, the eviction logic will interpret the disruption budget given in the PDB (`minAvailable` or `maxUnavailable`) as a count of pod group replicas, rather than individual pod replicas. +- **Update eviction logic:** When enabled, the eviction logic will interpret the disruption budget given in the PDB (`minAvailable` or `maxUnavailable`) as a count of pod group replicas, rather than individual pod replicas. - **Integrate with Workload API:** Use the pod spec's `workload.name` and `workload.podGroup` to retrieve `Workload` objects and their `PodGroup` groupings. - **Maintain compatibility:** Ensure that common cluster operations that respect PDBs, such as `kubectl drain` and node drains initiated by `cluster-autoscaler`, follow group-based disruption budgets when enabled. - **Preserve existing functionality:** For backward compatibility, the behavior of PDBs without `usePodGroups: true` will be unchanged. @@ -212,29 +214,18 @@ know that this has succeeded? What is out of scope for this KEP? Listing non-goals helps to focus discussion and make progress. --> - -This change will only affect the Eviction API. The following are involuntary disruptions and do not use the Eviction API: -- Manual pod deletion -- Pods being deleted by their owning controller (e.g. during Deployment rollout) -- Node failure -- Pod cleanup due to a node being removed from the cluster -- Evictions by the Kubelet due to node pressure (e.g. memory shortage) -- Taint manager deleting NoExecute tainted pods -The only change to object definitions will be the optional field in the `PodDisruptionBudget` spec to enable the changes. +This change will only affect the Eviction API. 
Involuntary disruptions that do not use the Eviction API include: manual pod deletion, pods being deleted by their owner (e.g. during Deployment rollout), node failure, pod cleanup due to a node being removed from the cluster, evictions by the Kubelet due to node pressure (e.g. memory shortage), taint manager deleting NoExecute tainted pods The only change to k8s resource definitions will be the optional field in the `PodDisruptionBudget` spec to enable the changes. This change will not affect the behavior of controllers for `Deployment`, `StatefulSet`, `Workload`, or `LeaderWorkerSet`, which will be responsible for setting the `workload.name` and `workload.podGroup` on their managed pods. The lifecycle and recovery of a disrupted replica is the responsibility of their owning controller. There will be no additions or changes to scheduling, including gang scheduling. This only handles eviction of already-scheduled pods. 
+This KEP follows the definition of multi-pod replica health using `minCount` from the `Workload` API. A replica is considered healthy if it meets `minCount` of healthy pods, and unhealthy otherwise. We are not introducing any other definition of partial health (e.g. a percentage). + +If a PDB has multi-pod replicas enabled, individual pods without an assigned workload will be treated as single-pod groups. We will log a warning as mixing types is not recommended, but the user is responsible for correct setup of replicas. ## Proposal @@ -247,7 +238,7 @@ The "Design Details" section below is for the real nitty-gritty. --> -We propose adding a new, optional boolean field `usePodGroups` to the `PodDisruptionBudget.spec`. If this field is `false` (default) or unset, the Eviction API evaluates the PDB based on individual pod counts, preserving all existing behavior. If `true`, the Eviction API will find the `Workload` object and its `PodGroup` as specified by the Pod spec. This `PodGroup` will define the minimum number of pods required for one replica of that group to be healthy, and how many replicas are expected. This will be used to measure availability of pod groups, and the PDB's `minAvailable` or `maxUnavailable` will be interpreted in terms of these `PodGroup` replicas, rather than individual pods. +We will add a new optional boolean `usePodGroups` to the `PodDisruptionBudget.spec`. If this field is unset or `false` (default), the Eviction API will evaluate the PDB based on individual pod counts, preserving all existing behavior. If `true`, the Eviction API will find the `Workload` object and its `PodGroup` as specified by the Pod spec. This `PodGroup` defines the minimum number of pods required for a replica of that group to be healthy, and how many replicas are expected. Using this information, the PDB's `minAvailable` or `maxUnavailable` will be interpreted in terms of these `PodGroup` replicas, rather than individual pods. 
### User Stories (Optional) @@ -261,9 +252,9 @@ bogged down. #### Story 1: Distributed Workload -An ML engineer is running distributed training jobs using `Workload` API. The `Workload` defines a `PodGroup` named `worker` with `replicas: 10` and `policy.gang.minCount: 8`. This means the job has 10 replicas, each consisting of at least 8 pods. +An ML engineer is running distributed training jobs using Workload API. The `Workload` defines a `PodGroup` named `worker` with `replicas: 10` and `policy.gang.minCount: 8`. This means the job has 10 replicas, each consisting of at least 8 pods. -To protect this long-running job from voluntary disruptions (like node drains), the user wants to ensure at least 9 of the 10 worker groups remain available. +To protect this long-running job from voluntary disruptions, the user wants to ensure at least 9 of the 10 worker groups remain available. This user would create a PDB targeting the worker pods: @@ -277,7 +268,7 @@ spec: usePodGroups: true # <-- New field to enable selector: matchLabels: - # Assuming pods are labeled by the workload controller + # Assuming pods are labeled workload: my-training-job pod-group: worker ``` @@ -289,7 +280,7 @@ Upon node drain, the Eviction API will: 4. Fetch the `Workload` object `my-training-job`. 5. Find `worker` `PodGroup` in the `Workload`, which has 10 `replicas` and 8 `minSize`. 6. Interpreting `minAvailable: 9` as pod groups, a group is considered disrupted if evicting a pod would cause its healthy pod count to drop below 8. -8. The drain will proceed only if it does not cause the number of available worker groups to drop below 9. +7. The drain will proceed only if it does not cause the number of available worker groups to drop below 9. This way, the job is protected to run with sufficient replicas during cluster maintenance. @@ -297,9 +288,9 @@ This way, the job is protected to run with sufficient replicas during cluster ma A cluster administrator frequently drains nodes for upgrades. 
The cluster has various workloads, including multi-pod applications defined by the `Workload` API. -To perform node drains safely, the administrator relies on application owners' PDBs. When the admin issues `kubectl drain `, the Eviction API uses the process above and interprets the PDBs in terms of `PodGroup` replicas, ensuring that the drain does not violate the application's group-based availability requirements. +The admin would like to upgrade a node which is running the job from Story 1. To perform node drains safely, they rely on application owners' PDBs. When they issue `kubectl drain `, the Eviction API sees the PDB and uses the process above, interpreting the disruption in terms of `PodGroup` replicas and ensuring that the drain does not violate the application's group-based availability requirements. -This allows safe maintenance without causing outages, as the drain will pause if it cannot evict pods without violating a group-based PDB. It will wait for better replica health, more availability, lower requirements, or the admin may contact the application owner to resolve. +This allows safe maintenance without causing outages, as the drain will pause if it cannot evict pods without violating a group-based PDB. It will wait for better replica health, more availability, lower PDB requirements, or the admin may contact the application owner to resolve the block. #### Simplified Setup Example @@ -331,7 +322,9 @@ graph TD class P0A,P0B,P1A,P1B pod_box ``` -In this simplified setup, the node being drained contains two replicas, each with two pods (there may be more nodes and replicas which we can ignore). The PDB wants at most one replica unavailable. Currently, the user might try `minUnavailable: 2` (one two-pod replica unavailable). The node drain would start, and could evict a pod from replica 0 and a pod from replica 1 before pausing (as there are only 2 pods left). This would disrupt both replicas. 
With the new changes, a PDB with `usePodGroups: true` and `minUnavailable: 1` (one replica unavailable) would pause before evicting a pod from the second replica, protecting one of the replicas as intended. +In this setup, the node being drained contains two replicas, each with two pods (there may be more nodes and replicas which we can ignore). The PDB wants at most one replica unavailable. Currently, the user might try `maxUnavailable: 2` (one two-pod replica unavailable). The node drain would start, and could evict a pod from replica 0 and a pod from replica 1 before pausing (as there are only 2 pods left). This would disrupt both replicas. With the new changes, a PDB with `usePodGroups: true` and `maxUnavailable: 1` (one replica unavailable) would pause before evicting a pod from the second replica, protecting one of the replicas as intended. + +In a real cluster, there may be additional nodes or replicas, pods from other jobs sharing those nodes, etc. ```mermaid @@ -446,8 +439,8 @@ How will UX be reviewed, and by whom? Consider including folks who also work outside the SIG or subproject. --> -- This feature relies on the pod's `spec.workload.name` and `spec.workload.podGroup` fields being correctly set by its managing controller. 
If a user sets `usePodGroups: true` but the pods are not correctly linked to a `Workload` object, the eviction logic will fall back to per-pod counting to prevent a drain from being blocked by misconfiguration, which may violate the application's intended availability requirements. +- One failing pod in a large group will make that group unhealthy if it drops below its `minCount`. In this way, a small number of failing pods spread across many replicas could prevent all evictions and block node drains. This is intended behavior (as the application is unhealthy), but may be surprising to operators. - A PDB `selector` that matches pods from multiple different `PodGroup`s (or a mix of grouped and individual pods) may have complex or unintended behavior. Users should be advised to create separate PDBs for each distinct `PodGroup` they wish to protect. ## Design Details @@ -467,22 +460,15 @@ We will add a new field to `PodDisruptionBudgetSpec` in `pkg/apis/policy/v1/type // PodDisruptionBudgetSpec defines the desired state of PodDisruptionBudget type PodDisruptionBudgetSpec struct { // An eviction is allowed if at least "minAvailable" pods selected by - // "selector" will still be available after the eviction, i.e. even in the - // absence of the evicted pod. So, "minAvailable" is a safety threshold, - // an absolute number or a percentage. - // +optional + // ... MinAvailable *intstr.IntOrString `json:"minAvailable,omitempty" protobuf:"bytes,1,opt,name=minAvailable"` // Label query over pods whose evictions are managed by the disruption - // budget. - // +optional + // ... Selector *metav1.LabelSelector `json:"selector,omitempty" protobuf:"bytes,2,opt,name=selector"` // An eviction is allowed if at most "maxUnavailable" pods selected by - // "selector" are unavailable after the eviction, i.e. even in the - // presence of the evicted pod. So, "maxUnavailable" is a safety threshold, - // an absolute number or a percentage. - // +optional + // ... 
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty" protobuf:"bytes,3,opt,name=maxUnavailable"` // usePodGroups indicates that availability should be calculated based on @@ -571,9 +557,9 @@ graph TD ``` #### Group Health -A `PodGroup` replica is considered available if its number of existing, healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. +A `PodGroup` replica is considered healthy if its number of existing, healthy, non-evicting pods is greater than or equal to its `policy.gang.minCount`. -For example, if a replica expects 10 pods with `minCount: 8` but only has 9 healthy pods (1 is missing or unhealthy), the replica is still considered healthy. If 3 pods are missing or unhealthy and only 7 healthy pods exist, the replica is unhealthy. If any pod in an available group is targeted for eviction, it would be unhealthy post-eviction and is also counted as unavailable for the PDB calculation. +For example, if a replica is intended to have 10 pods and has `minCount: 8` but only has 9 healthy pods (1 is missing or unhealthy), the replica is still considered healthy. If 3 pods were missing or unhealthy so only 7 healthy pods were found, the replica would be unhealthy. If any pod in a healthy group is targeted for eviction, it would be unhealthy post-eviction and is also counted as unhealthy for the PDB calculation. 
### Pods missing fields From 40a77d13bfbc24e728c578c3dc526e4eef9dbb21 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Tue, 28 Oct 2025 11:32:38 -0700 Subject: [PATCH 39/41] Update kep.yaml --- .../draft-20251010-multipod-pdb/kep.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml b/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml index 5be11e05c7e..ab64e3e70c3 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml +++ b/keps/sig-apps/draft-20251010-multipod-pdb/kep.yaml @@ -7,7 +7,7 @@ participating-sigs: - sig-scheduling - sig-node status: provisional #provisional|implementable|implemented|deferred|rejected|withdrawn|replaced -creation-date: 2025-10-10 +creation-date: 2025-10-28 reviewers: - TBD approvers: @@ -31,13 +31,13 @@ milestone: # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled -feature-gates: - - name: MyFeature - components: - - kube-apiserver - - kube-controller-manager -disable-supported: true +# feature-gates: +# - name: MyFeature +# components: +# - kube-apiserver +# - kube-controller-manager +# disable-supported: true # The following PRR answers are required at beta release -metrics: - - my_feature_metric +# metrics: +# - my_feature_metric From b999922dd71e0daffdc6729b7888fc761a013140 Mon Sep 17 00:00:00 2001 From: Marcus Alder Date: Tue, 4 Nov 2025 10:57:11 -0800 Subject: [PATCH 40/41] Update README.md --- keps/sig-apps/draft-20251010-multipod-pdb/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-apps/draft-20251010-multipod-pdb/README.md b/keps/sig-apps/draft-20251010-multipod-pdb/README.md index 71718c948ee..71b7bb68be3 100644 --- a/keps/sig-apps/draft-20251010-multipod-pdb/README.md +++ b/keps/sig-apps/draft-20251010-multipod-pdb/README.md @@ -65,7 +65,7 @@ If none of those approvers are still appropriate, then changes to that 
list should be approved by the remaining approvers and/or the owning SIG (or SIG Architecture for cross-cutting KEPs). --> -# KEP-NNNN: PDB for Multi-Pod Replicas +# KEP-5682: PDB for Multi-Pod Replicas