Skip to content

Commit 9f80e7a

Browse files
committed
Allow deletion of pending pods when using PDBS
Currently, if you have a PDB set, it is possible for a pod stuck in pending state to be prevented from deletion even though there are in fact enough healthy replicas. This commit allows pods in Pending state to be removed. This commit also adds associated unit tests. related-bug: kubernetes#80389
1 parent b8be11e commit 9f80e7a

File tree

3 files changed

+341
-18
lines changed

3 files changed

+341
-18
lines changed

pkg/registry/core/pod/storage/BUILD

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,14 @@ go_test(
2222
"//staging/src/k8s.io/api/policy/v1beta1:go_default_library",
2323
"//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library",
2424
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
25+
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/internalversion:go_default_library",
2526
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
2627
"//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
2728
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
2829
"//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
30+
"//staging/src/k8s.io/apimachinery/pkg/runtime/schema:go_default_library",
2931
"//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
32+
"//staging/src/k8s.io/apimachinery/pkg/watch:go_default_library",
3033
"//staging/src/k8s.io/apiserver/pkg/endpoints/request:go_default_library",
3134
"//staging/src/k8s.io/apiserver/pkg/registry/generic:go_default_library",
3235
"//staging/src/k8s.io/apiserver/pkg/registry/generic/registry:go_default_library",

pkg/registry/core/pod/storage/eviction.go

Lines changed: 73 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ import (
2929
"k8s.io/apimachinery/pkg/runtime"
3030
"k8s.io/apimachinery/pkg/runtime/schema"
3131
"k8s.io/apimachinery/pkg/util/wait"
32-
genericregistry "k8s.io/apiserver/pkg/registry/generic/registry"
3332
"k8s.io/apiserver/pkg/registry/rest"
3433
"k8s.io/apiserver/pkg/util/dryrun"
3534
policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
@@ -46,24 +45,25 @@ const (
4645
// This situation should self-correct because the PDB controller removes
4746
// entries from the map automatically after the PDB DeletionTimeout regardless.
4847
MaxDisruptedPodSize = 2000
48+
retrySteps = 20
4949
)
5050

5151
// EvictionsRetry is the retry for a conflict where multiple clients
5252
// are making changes to the same resource.
5353
var EvictionsRetry = wait.Backoff{
54-
Steps: 20,
54+
Steps: retrySteps,
5555
Duration: 500 * time.Millisecond,
5656
Factor: 1.0,
5757
Jitter: 0.1,
5858
}
5959

60-
func newEvictionStorage(store *genericregistry.Store, podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter) *EvictionREST {
60+
func newEvictionStorage(store rest.StandardStorage, podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter) *EvictionREST {
6161
return &EvictionREST{store: store, podDisruptionBudgetClient: podDisruptionBudgetClient}
6262
}
6363

6464
// EvictionREST implements the REST endpoint for evicting pods from nodes
6565
type EvictionREST struct {
66-
store *genericregistry.Store
66+
store rest.StandardStorage
6767
podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter
6868
}
6969

@@ -130,13 +130,55 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
130130

131131
// Evicting a terminal pod should result in direct deletion of pod as it already caused disruption by the time we are evicting.
132132
// There is no need to check for pdb.
133-
if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed {
134-
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
135-
if err != nil {
136-
return nil, err
133+
if canIgnorePDB(pod) {
134+
deleteRefresh := false
135+
continueToPDBs := false
136+
// Preserve current deletionOptions if we need to fall through to checking PDBs later.
137+
preservedDeletionOptions := deletionOptions.DeepCopy()
138+
err := retry.RetryOnConflict(EvictionsRetry, func() error {
139+
if deleteRefresh {
140+
// If we hit a conflict error, get the latest pod
141+
obj, err = r.store.Get(ctx, eviction.Name, &metav1.GetOptions{})
142+
if err != nil {
143+
return err
144+
}
145+
pod = obj.(*api.Pod)
146+
if !canIgnorePDB(pod) {
147+
// Pod is no longer in a state where we can skip checking
148+
// PDBs, continue to PDB checks.
149+
continueToPDBs = true
150+
// restore original deletion options because we may have
151+
// modified them.
152+
deletionOptions = preservedDeletionOptions
153+
return nil
154+
}
155+
}
156+
if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(preservedDeletionOptions) {
157+
// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
158+
// racing with another PDB-impacting process elsewhere.
159+
if deletionOptions.Preconditions == nil {
160+
deletionOptions.Preconditions = &metav1.Preconditions{}
161+
}
162+
deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
163+
} else {
164+
// restore original deletion options because we may have
165+
// modified them.
166+
deletionOptions = preservedDeletionOptions
167+
}
168+
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
169+
if err != nil {
170+
deleteRefresh = true
171+
return err
172+
}
173+
return nil
174+
})
175+
if !continueToPDBs {
176+
if err != nil {
177+
return nil, err
178+
}
179+
return &metav1.Status{
180+
Status: metav1.StatusSuccess}, nil
137181
}
138-
return &metav1.Status{
139-
Status: metav1.StatusSuccess}, nil
140182
}
141183
var rtStatus *metav1.Status
142184
var pdbName string
@@ -203,6 +245,27 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
203245
return &metav1.Status{Status: metav1.StatusSuccess}, nil
204246
}
205247

248+
// canIgnorePDB returns true for pod conditions that allow the pod to be deleted
249+
// without checking PDBs.
250+
func canIgnorePDB(pod *api.Pod) bool {
251+
if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed || pod.Status.Phase == api.PodPending {
252+
return true
253+
}
254+
return false
255+
}
256+
257+
func shouldEnforceResourceVersion(pod *api.Pod) bool {
258+
// Only pods that may be included as health in PDBs in the future need to be checked.
259+
if pod.Status.Phase == api.PodPending {
260+
return true
261+
}
262+
return false
263+
}
264+
265+
func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
266+
return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
267+
}
268+
206269
// checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
207270
func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
208271
if pdb.Status.ObservedGeneration < pdb.Generation {

0 commit comments

Comments
 (0)