
Commit 717be0c

Allow deletion of unhealthy pods if enough healthy
Currently, if you have a PDB with 0 disruptions available and you attempt to evict an unhealthy pod, the eviction request will always fail. This is because the eviction API does not take into account that the pod being removed is the unhealthy one. This commit allows an unhealthy pod to be evicted as long as there are enough healthy pods to satisfy the PDB's requirements. To protect against race conditions, a ResourceVersion constraint is enforced; this ensures that the target pod cannot become healthy in the meantime, which could otherwise allow too many pods to be disrupted at once. This commit also eliminates superfluous calls to DeepCopy on the deleteOptions struct.
1 parent b8ecff2 commit 717be0c
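For orientation, here is a minimal client-side sketch (not part of this commit) of how such an eviction might be issued with client-go; the evictPod helper, the clientset wiring, and the namespace and pod name are illustrative assumptions.

package example

import (
	"context"

	policyv1beta1 "k8s.io/api/policy/v1beta1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// evictPod asks the eviction subresource to remove the named pod. With this
// commit, an unready pod guarded by a PDB whose DisruptionsAllowed is 0 can
// still be evicted, provided CurrentHealthy >= DesiredHealthy.
func evictPod(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
	return cs.CoreV1().Pods(namespace).Evict(ctx, &policyv1beta1.Eviction{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
	})
}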

File tree

3 files changed: 165 additions & 18 deletions


pkg/registry/core/pod/storage/BUILD

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ go_test(
     ],
     embed = [":go_default_library"],
     deps = [
+        "//pkg/api/pod:go_default_library",
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/policy:go_default_library",
         "//pkg/registry/registrytest:go_default_library",

pkg/registry/core/pod/storage/eviction.go

Lines changed: 55 additions & 16 deletions
@@ -33,6 +33,7 @@ import (
 	"k8s.io/apiserver/pkg/util/dryrun"
 	policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
 	"k8s.io/client-go/util/retry"
+	podutil "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 	}
 
 	// the PDB can be ignored, so delete the pod
-	deletionOptions := originalDeleteOptions.DeepCopy()
+	deleteOptions := originalDeleteOptions
+
 	// We should check if resourceVersion is already set by the requestor
 	// as it might be older than the pod we just fetched and should be
 	// honored.
 	if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
-		// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
+		// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
 		// racing with another PDB-impacting process elsewhere.
-		if deletionOptions.Preconditions == nil {
-			deletionOptions.Preconditions = &metav1.Preconditions{}
-		}
-		deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
 	}
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
 		return err
 	}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
 	var rtStatus *metav1.Status
 	var pdbName string
+	updateDeletionOptions := false
+
 	err = func() error {
 		pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
 		if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
 		pdb := &pdbs[0]
 		pdbName = pdb.Name
+
+		// If the pod is not ready, it doesn't count towards healthy and we should not decrement
+		if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
+			updateDeletionOptions = true
+			return nil
+		}
+
 		refresh := false
 		err = retry.RetryOnConflict(EvictionsRetry, func() error {
 			if refresh {
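To make the new check concrete, here is a standalone restatement (an illustration under assumed names, not the committed code) of when the PDB decrement can be skipped:

package example

// canSkipDecrement restates the eligibility test above: an unready pod may
// be evicted without decrementing the budget as long as the healthy pods
// alone already satisfy it.
func canSkipDecrement(podReady bool, currentHealthy, desiredHealthy int32) bool {
	return !podReady && currentHealthy >= desiredHealthy && desiredHealthy > 0
}

For example, with a budget requiring 2 healthy pods out of 3 desired: canSkipDecrement(false, 2, 2) is true (the unready target may go), while canSkipDecrement(true, 2, 2) is false (a healthy target falls through to the normal checkAndDecrement path, which fails at 0 DisruptionsAllowed).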
@@ -232,18 +241,43 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 		return rtStatus, nil
 	}
 
-	// At this point there was either no PDB or we succeeded in decrementing
+	// At this point there was either no PDB or we succeeded in decrementing or
+	// the pod was unready and we have enough healthy replicas
+
+	deleteOptions := originalDeleteOptions
+
+	// Set deleteOptions.Preconditions.ResourceVersion to ensure
+	// the pod hasn't been considered ready since we calculated
+	if updateDeletionOptions {
+		// Take a copy so we can compare to client-provided Options later.
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
+	}
 
 	// Try the delete
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
+		if errors.IsConflict(err) && updateDeletionOptions &&
+			(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
+			// If we encounter a resource conflict error, we updated the deletion options to include preconditions,
+			// and the original deletion options did not specify ResourceVersion, we send back
+			// TooManyRequests so clients will retry.
+			return nil, createTooManyRequestsError(pdbName)
+		}
 		return nil, err
 	}
 
 	// Success!
 	return &metav1.Status{Status: metav1.StatusSuccess}, nil
 }
 
+func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
+	if deleteOptions.Preconditions == nil {
+		deleteOptions.Preconditions = &metav1.Preconditions{}
+	}
+	deleteOptions.Preconditions.ResourceVersion = resourceVersion
+}
+
 // canIgnorePDB returns true for pod conditions that allow the pod to be deleted
 // without checking PDBs.
 func canIgnorePDB(pod *api.Pod) bool {
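As a sketch of what this helper produces (assumed function name, not the committed code): the delete options pin the pod's ResourceVersion, so if the pod changes after the fetch, for example by becoming Ready, the delete fails with a Conflict instead of risking extra disruption.

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// optionsPinningResourceVersion mirrors the effect of
// setPreconditionsResourceVersion on a fresh DeleteOptions: any update to
// the pod between the fetch and the delete invalidates the precondition,
// and the delete comes back as a Conflict.
func optionsPinningResourceVersion(resourceVersion string) *metav1.DeleteOptions {
	return &metav1.DeleteOptions{
		Preconditions: &metav1.Preconditions{ResourceVersion: &resourceVersion},
	}
}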
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
 	return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
 }
 
+func createTooManyRequestsError(name string) error {
+	// TODO(mml): Add a Retry-After header. Once there are time-based
+	// budgets, we can sometimes compute a sensible suggested value. But
+	// even without that, we can give a suggestion (10 minutes?) that
+	// prevents well-behaved clients from hammering us.
+	err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
+	err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
+	return err
+}
+
 // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
 func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
 	if pdb.Status.ObservedGeneration < pdb.Generation {
-		// TODO(mml): Add a Retry-After header. Once there are time-based
-		// budgets, we can sometimes compute a sensible suggested value. But
-		// even without that, we can give a suggestion (10 minutes?) that
-		// prevents well-behaved clients from hammering us.
-		err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
-		err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
-		return err
+
+		return createTooManyRequestsError(pdb.Name)
 	}
 	if pdb.Status.DisruptionsAllowed < 0 {
 		return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
pkg/registry/core/pod/storage/eviction_test.go

Lines changed: 109 additions & 2 deletions
@@ -31,6 +31,7 @@ import (
 	genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
 	"k8s.io/apiserver/pkg/registry/rest"
 	"k8s.io/client-go/kubernetes/fake"
+	podapi "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
 		podName             string
 		expectedDeleteCount int
 		podTerminating      bool
+		prc                 *api.PodCondition
 	}{
 		{
 			name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@
 			expectedDeleteCount: 1,
 			podTerminating:      true,
 		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t7",
+			expectedDeleteCount: 0,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionTrue,
+			},
+		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         false,
+			podName:             "t8",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should return the 429 TooManyRequests retry error.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t9",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should pass through the non-retriable error from the delete.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t10",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
 	}
 
 	for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
 			pod.ObjectMeta.DeletionTimestamp = &currentTime
 		}
 
+		// Setup pod condition
+		if tc.prc != nil {
+			if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
+				t.Fatalf("Unable to update pod ready condition")
+			}
+		}
+
 		client := fake.NewSimpleClientset(tc.pdbs...)
 		evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
 
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
 		// Always return error for this pod
 		return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
 	}
-	if ms.pod.Name == "t6" {
-		// This pod has a deletionTimestamp and should not raise conflict on delete
+	if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
+		// t6: This pod has a deletionTimestamp and should not raise conflict on delete
+		// t8: This pod should not have a resource conflict.
 		return nil, true, nil
 	}
+	if ms.pod.Name == "t10" {
+		return nil, false, apierrors.NewBadRequest("test designed to error")
+	}
 	if count == 1 {
 		// This is a hack to ensure that some test pods don't change phase
 		// but do change resource version
