Skip to content

Commit 7509c4e

Browse files
authored
Merge pull request kubernetes#94381 from mgugino-upstream-stage/eviction-disrupted-pods
Allow deletion of unhealthy pods if enough healthy
2 parents 9a6e35a + 717be0c commit 7509c4e

File tree

3 files changed

+165
-18
lines changed

3 files changed

+165
-18
lines changed

pkg/registry/core/pod/storage/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ go_test(
1414
],
1515
embed = [":go_default_library"],
1616
deps = [
17+
"//pkg/api/pod:go_default_library",
1718
"//pkg/apis/core:go_default_library",
1819
"//pkg/apis/policy:go_default_library",
1920
"//pkg/registry/registrytest:go_default_library",

pkg/registry/core/pod/storage/eviction.go

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"k8s.io/apiserver/pkg/util/dryrun"
3434
policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
3535
"k8s.io/client-go/util/retry"
36+
podutil "k8s.io/kubernetes/pkg/api/pod"
3637
api "k8s.io/kubernetes/pkg/apis/core"
3738
"k8s.io/kubernetes/pkg/apis/policy"
3839
)
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
145146
}
146147

147148
// the PDB can be ignored, so delete the pod
148-
deletionOptions := originalDeleteOptions.DeepCopy()
149+
deleteOptions := originalDeleteOptions
150+
149151
// We should check if resourceVersion is already set by the requestor
150152
// as it might be older than the pod we just fetched and should be
151153
// honored.
152154
if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
153-
// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
155+
// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
154156
// racing with another PDB-impacting process elsewhere.
155-
if deletionOptions.Preconditions == nil {
156-
deletionOptions.Preconditions = &metav1.Preconditions{}
157-
}
158-
deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
157+
deleteOptions = deleteOptions.DeepCopy()
158+
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
159159
}
160-
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
160+
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
161161
if err != nil {
162162
return err
163163
}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
181181

182182
var rtStatus *metav1.Status
183183
var pdbName string
184+
updateDeletionOptions := false
185+
184186
err = func() error {
185187
pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
186188
if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
201203

202204
pdb := &pdbs[0]
203205
pdbName = pdb.Name
206+
207+
// If the pod is not ready, it doesn't count towards healthy and we should not decrement
208+
if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
209+
updateDeletionOptions = true
210+
return nil
211+
}
212+
204213
refresh := false
205214
err = retry.RetryOnConflict(EvictionsRetry, func() error {
206215
if refresh {
@@ -232,18 +241,43 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
232241
return rtStatus, nil
233242
}
234243

235-
// At this point there was either no PDB or we succeeded in decrementing
244+
// At this point there was either no PDB or we succeeded in decrementing or
245+
// the pod was unready and we have enough healthy replicas
246+
247+
deleteOptions := originalDeleteOptions
248+
249+
// Set deleteOptions.Preconditions.ResourceVersion to ensure
250+
// the pod hasn't been considered ready since we calculated the PDB status above
251+
if updateDeletionOptions {
252+
// Take a copy so we can compare to client-provided Options later.
253+
deleteOptions = deleteOptions.DeepCopy()
254+
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
255+
}
236256

237257
// Try the delete
238-
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
258+
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
239259
if err != nil {
260+
if errors.IsConflict(err) && updateDeletionOptions &&
261+
(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
262+
// If we encounter a resource conflict error, we updated the deletion options to include them,
263+
// and the original deletion options did not specify ResourceVersion, we send back
264+
// TooManyRequests so clients will retry.
265+
return nil, createTooManyRequestsError(pdbName)
266+
}
240267
return nil, err
241268
}
242269

243270
// Success!
244271
return &metav1.Status{Status: metav1.StatusSuccess}, nil
245272
}
246273

274+
func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
275+
if deleteOptions.Preconditions == nil {
276+
deleteOptions.Preconditions = &metav1.Preconditions{}
277+
}
278+
deleteOptions.Preconditions.ResourceVersion = resourceVersion
279+
}
280+
247281
// canIgnorePDB returns true for pod conditions that allow the pod to be deleted
248282
// without checking PDBs.
249283
func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
268302
return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
269303
}
270304

305+
func createTooManyRequestsError(name string) error {
306+
// TODO(mml): Add a Retry-After header. Once there are time-based
307+
// budgets, we can sometimes compute a sensible suggested value. But
308+
// even without that, we can give a suggestion (10 minutes?) that
309+
// prevents well-behaved clients from hammering us.
310+
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
311+
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
312+
return err
313+
}
314+
271315
// checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
272316
func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
273317
if pdb.Status.ObservedGeneration < pdb.Generation {
274-
// TODO(mml): Add a Retry-After header. Once there are time-based
275-
// budgets, we can sometimes compute a sensible suggested value. But
276-
// even without that, we can give a suggestion (10 minutes?) that
277-
// prevents well-behaved clients from hammering us.
278-
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
279-
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
280-
return err
318+
319+
return createTooManyRequestsError(pdb.Name)
281320
}
282321
if pdb.Status.DisruptionsAllowed < 0 {
283322
return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))

pkg/registry/core/pod/storage/eviction_test.go

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
3232
"k8s.io/apiserver/pkg/registry/rest"
3333
"k8s.io/client-go/kubernetes/fake"
34+
podapi "k8s.io/kubernetes/pkg/api/pod"
3435
api "k8s.io/kubernetes/pkg/apis/core"
3536
"k8s.io/kubernetes/pkg/apis/policy"
3637
)
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
219220
podName string
220221
expectedDeleteCount int
221222
podTerminating bool
223+
prc *api.PodCondition
222224
}{
223225
{
224226
name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
301303
expectedDeleteCount: 1,
302304
podTerminating: true,
303305
},
306+
{
307+
name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
308+
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
309+
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
310+
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
311+
Status: policyv1beta1.PodDisruptionBudgetStatus{
312+
// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
313+
DisruptionsAllowed: 0,
314+
CurrentHealthy: 2,
315+
DesiredHealthy: 2,
316+
},
317+
}},
318+
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
319+
expectError: true,
320+
podName: "t7",
321+
expectedDeleteCount: 0,
322+
podTerminating: false,
323+
podPhase: api.PodRunning,
324+
prc: &api.PodCondition{
325+
Type: api.PodReady,
326+
Status: api.ConditionTrue,
327+
},
328+
},
329+
{
330+
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
331+
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
332+
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
333+
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
334+
Status: policyv1beta1.PodDisruptionBudgetStatus{
335+
// This simulates 3 pods desired, our pod unhealthy
336+
DisruptionsAllowed: 0,
337+
CurrentHealthy: 2,
338+
DesiredHealthy: 2,
339+
},
340+
}},
341+
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
342+
expectError: false,
343+
podName: "t8",
344+
expectedDeleteCount: 1,
345+
podTerminating: false,
346+
podPhase: api.PodRunning,
347+
prc: &api.PodCondition{
348+
Type: api.PodReady,
349+
Status: api.ConditionFalse,
350+
},
351+
},
352+
{
353+
// This case should return the 429 (TooManyRequests) retry error.
354+
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
355+
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
356+
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
357+
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
358+
Status: policyv1beta1.PodDisruptionBudgetStatus{
359+
// This simulates 3 pods desired, our pod unhealthy
360+
DisruptionsAllowed: 0,
361+
CurrentHealthy: 2,
362+
DesiredHealthy: 2,
363+
},
364+
}},
365+
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
366+
expectError: true,
367+
podName: "t9",
368+
expectedDeleteCount: 1,
369+
podTerminating: false,
370+
podPhase: api.PodRunning,
371+
prc: &api.PodCondition{
372+
Type: api.PodReady,
373+
Status: api.ConditionFalse,
374+
},
375+
},
376+
{
377+
// This case should return the underlying delete error (a 400 BadRequest), not the 429 retry error.
378+
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
379+
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
380+
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
381+
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
382+
Status: policyv1beta1.PodDisruptionBudgetStatus{
383+
// This simulates 3 pods desired, our pod unhealthy
384+
DisruptionsAllowed: 0,
385+
CurrentHealthy: 2,
386+
DesiredHealthy: 2,
387+
},
388+
}},
389+
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
390+
expectError: true,
391+
podName: "t10",
392+
expectedDeleteCount: 1,
393+
podTerminating: false,
394+
podPhase: api.PodRunning,
395+
prc: &api.PodCondition{
396+
Type: api.PodReady,
397+
Status: api.ConditionFalse,
398+
},
399+
},
304400
}
305401

306402
for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
323419
pod.ObjectMeta.DeletionTimestamp = &currentTime
324420
}
325421

422+
// Setup pod condition
423+
if tc.prc != nil {
424+
if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
425+
t.Fatalf("Unable to update pod ready condition")
426+
}
427+
}
428+
326429
client := fake.NewSimpleClientset(tc.pdbs...)
327430
evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
328431

@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
416519
// Always return error for this pod
417520
return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
418521
}
419-
if ms.pod.Name == "t6" {
420-
// This pod has a deletionTimestamp and should not raise conflict on delete
522+
if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
523+
// t6: This pod has a deletionTimestamp and should not raise conflict on delete
524+
// t8: This pod should not have a resource conflict.
421525
return nil, true, nil
422526
}
527+
if ms.pod.Name == "t10" {
528+
return nil, false, apierrors.NewBadRequest("test designed to error")
529+
}
423530
if count == 1 {
424531
// This is a hack to ensure that some test pods don't change phase
425532
// but do change resource version

0 commit comments

Comments
 (0)