
Commit 717be0c

Allow deletion of unhealthy pods if enough healthy
Currently, if you have a PDB with 0 disruptions available and you attempt to evict an unhealthy pod, the eviction request will always fail. This is because the eviction API does not take into account that the pod being removed is the unhealthy one. This commit allows an unhealthy pod to be evicted as long as there are enough healthy pods to satisfy the PDB's requirements. To protect against race conditions, a ResourceVersion constraint is enforced; this ensures that the target pod cannot become healthy in the meantime, which could otherwise allow too many pods to be disrupted at once. This commit also eliminates superfluous calls to DeepCopy on the deleteOptions struct.
1 parent b8ecff2 commit 717be0c
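For orientation, here is a minimal client-side sketch (not part of this commit) of how such an eviction might be issued with client-go; the evictPod helper, the clientset wiring, and the namespace and pod name are illustrative assumptions.

package example

import (
	"context"

	policyv1beta1 "k8s.io/api/policy/v1beta1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// evictPod asks the eviction subresource to remove the named pod. With this
// commit, an unready pod guarded by a PDB whose DisruptionsAllowed is 0 can
// still be evicted, provided CurrentHealthy >= DesiredHealthy.
func evictPod(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
	return cs.CoreV1().Pods(namespace).Evict(ctx, &policyv1beta1.Eviction{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
	})
}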

File tree

3 files changed: 165 additions & 18 deletions


pkg/registry/core/pod/storage/BUILD

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ go_test(
     ],
     embed = [":go_default_library"],
     deps = [
+        "//pkg/api/pod:go_default_library",
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/policy:go_default_library",
         "//pkg/registry/registrytest:go_default_library",

pkg/registry/core/pod/storage/eviction.go

Lines changed: 55 additions & 16 deletions
@@ -33,6 +33,7 @@ import (
 	"k8s.io/apiserver/pkg/util/dryrun"
 	policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
 	"k8s.io/client-go/util/retry"
+	podutil "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 	}
 
 	// the PDB can be ignored, so delete the pod
-	deletionOptions := originalDeleteOptions.DeepCopy()
+	deleteOptions := originalDeleteOptions
+
 	// We should check if resourceVersion is already set by the requestor
 	// as it might be older than the pod we just fetched and should be
 	// honored.
 	if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
-		// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
+		// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
 		// racing with another PDB-impacting process elsewhere.
-		if deletionOptions.Preconditions == nil {
-			deletionOptions.Preconditions = &metav1.Preconditions{}
-		}
-		deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
 	}
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
 		return err
 	}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
 	var rtStatus *metav1.Status
 	var pdbName string
+	updateDeletionOptions := false
+
 	err = func() error {
 		pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
 		if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
 		pdb := &pdbs[0]
 		pdbName = pdb.Name
+
+		// If the pod is not ready, it doesn't count towards healthy and we should not decrement
+		if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
+			updateDeletionOptions = true
+			return nil
+		}
+
 		refresh := false
 		err = retry.RetryOnConflict(EvictionsRetry, func() error {
 			if refresh {
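To make the new check concrete, here is a standalone restatement (an illustration under assumed names, not the committed code) of when the PDB decrement can be skipped:

package example

// canSkipDecrement restates the eligibility test above: an unready pod may
// be evicted without decrementing the budget as long as the healthy pods
// alone already satisfy it.
func canSkipDecrement(podReady bool, currentHealthy, desiredHealthy int32) bool {
	return !podReady && currentHealthy >= desiredHealthy && desiredHealthy > 0
}

For example, with a budget requiring 2 healthy pods out of 3 desired: canSkipDecrement(false, 2, 2) is true (the unready target may go), while canSkipDecrement(true, 2, 2) is false (a healthy target falls through to the normal checkAndDecrement path, which fails at 0 DisruptionsAllowed).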
@@ -232,18 +241,43 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 		return rtStatus, nil
 	}
 
-	// At this point there was either no PDB or we succeeded in decrementing
+	// At this point there was either no PDB or we succeeded in decrementing or
+	// the pod was unready and we have enough healthy replicas
+
+	deleteOptions := originalDeleteOptions
+
+	// Set deleteOptions.Preconditions.ResourceVersion to ensure
+	// the pod hasn't been considered ready since we calculated
+	if updateDeletionOptions {
+		// Take a copy so we can compare to client-provided Options later.
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
+	}
 
 	// Try the delete
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
+		if errors.IsConflict(err) && updateDeletionOptions &&
+			(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
+			// If we encounter a resource conflict error, we updated the deletion options to include preconditions,
+			// and the original deletion options did not specify ResourceVersion, we send back
+			// TooManyRequests so clients will retry.
+			return nil, createTooManyRequestsError(pdbName)
+		}
 		return nil, err
 	}
 
 	// Success!
 	return &metav1.Status{Status: metav1.StatusSuccess}, nil
 }
 
+func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
+	if deleteOptions.Preconditions == nil {
+		deleteOptions.Preconditions = &metav1.Preconditions{}
+	}
+	deleteOptions.Preconditions.ResourceVersion = resourceVersion
+}
+
 // canIgnorePDB returns true for pod conditions that allow the pod to be deleted
 // without checking PDBs.
 func canIgnorePDB(pod *api.Pod) bool {
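As a sketch of what this helper produces (assumed function name, not the committed code): the delete options pin the pod's ResourceVersion, so if the pod changes after the fetch, for example by becoming Ready, the delete fails with a Conflict instead of risking extra disruption.

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// optionsPinningResourceVersion mirrors the effect of
// setPreconditionsResourceVersion on a fresh DeleteOptions: any update to
// the pod between the fetch and the delete invalidates the precondition,
// and the delete comes back as a Conflict.
func optionsPinningResourceVersion(resourceVersion string) *metav1.DeleteOptions {
	return &metav1.DeleteOptions{
		Preconditions: &metav1.Preconditions{ResourceVersion: &resourceVersion},
	}
}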
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
 	return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
 }
 
+func createTooManyRequestsError(name string) error {
+	// TODO(mml): Add a Retry-After header. Once there are time-based
+	// budgets, we can sometimes compute a sensible suggested value. But
+	// even without that, we can give a suggestion (10 minutes?) that
+	// prevents well-behaved clients from hammering us.
+	err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
+	err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
+	return err
+}
+
 // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
 func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
 	if pdb.Status.ObservedGeneration < pdb.Generation {
-		// TODO(mml): Add a Retry-After header. Once there are time-based
-		// budgets, we can sometimes compute a sensible suggested value. But
-		// even without that, we can give a suggestion (10 minutes?) that
-		// prevents well-behaved clients from hammering us.
-		err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
-		err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
-		return err
+
+		return createTooManyRequestsError(pdb.Name)
 	}
 	if pdb.Status.DisruptionsAllowed < 0 {
 		return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
pkg/registry/core/pod/storage/eviction_test.go

Lines changed: 109 additions & 2 deletions
@@ -31,6 +31,7 @@ import (
 	genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
 	"k8s.io/apiserver/pkg/registry/rest"
 	"k8s.io/client-go/kubernetes/fake"
+	podapi "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
 		podName             string
 		expectedDeleteCount int
 		podTerminating      bool
+		prc                 *api.PodCondition
 	}{
 		{
 			name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@
 			expectedDeleteCount: 1,
 			podTerminating:      true,
 		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t7",
+			expectedDeleteCount: 0,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionTrue,
+			},
+		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         false,
+			podName:             "t8",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should return the 429 TooManyRequests retry error.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t9",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should pass through the non-retriable error from the delete.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t10",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
 	}
 
 	for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
 			pod.ObjectMeta.DeletionTimestamp = &currentTime
 		}
 
+		// Setup pod condition
+		if tc.prc != nil {
+			if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
+				t.Fatalf("Unable to update pod ready condition")
+			}
+		}
+
 		client := fake.NewSimpleClientset(tc.pdbs...)
 		evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
 
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
 		// Always return error for this pod
 		return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
 	}
-	if ms.pod.Name == "t6" {
-		// This pod has a deletionTimestamp and should not raise conflict on delete
+	if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
+		// t6: This pod has a deletionTimestamp and should not raise conflict on delete
+		// t8: This pod should not have a resource conflict.
 		return nil, true, nil
 	}
+	if ms.pod.Name == "t10" {
+		return nil, false, apierrors.NewBadRequest("test designed to error")
+	}
 	if count == 1 {
 		// This is a hack to ensure that some test pods don't change phase
 		// but do change resource version
