Commit f90590b (2 parents: 15295c7 + 696af98)

Merge pull request #7914 from abdelrahman882/bsp

Add time based drainability rule for non-pdb-assigned system pods

7 files changed: +114, -9 lines


cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
@@ -276,6 +276,8 @@ type AutoscalingOptions struct {
 	// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
 	// to allow their pods deletion in scale down
 	MinReplicaCount int
+	// BspDisruptionTimeout is the timeout after which CA will evict non-pdb-assigned blocking system pods
+	BspDisruptionTimeout time.Duration
 	// NodeDeleteDelayAfterTaint is the duration to wait before deleting a node after tainting it
 	NodeDeleteDelayAfterTaint time.Duration
 	// NodeGroupSetRatio is a collection of ratios used by CA used to make scaling decisions.

cluster-autoscaler/config/flags/flags.go

Lines changed: 3 additions & 1 deletion
@@ -193,10 +193,11 @@ var (
 	recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
 	maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
 	maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
-	skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
+	skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will wait for --blocking-system-pod-distruption-timeout before deleting nodes with pods from kube-system (except for DaemonSet or mirror pods)")
 	skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
 	skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
 	minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
+	bspDisruptionTimeout = flag.Duration("blocking-system-pod-distruption-timeout", time.Hour, "The timeout after which CA will evict non-pdb-assigned blocking system pods, applicable only when --skip-nodes-with-system-pods is set to true")
 	nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
 	scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
 	maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
@@ -370,6 +371,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		SkipNodesWithSystemPods:           *skipNodesWithSystemPods,
 		SkipNodesWithLocalStorage:         *skipNodesWithLocalStorage,
 		MinReplicaCount:                   *minReplicaCount,
+		BspDisruptionTimeout:              *bspDisruptionTimeout,
 		NodeDeleteDelayAfterTaint:         *nodeDeleteDelayAfterTaint,
 		ScaleDownSimulationTimeout:        *scaleDownSimulationTimeout,
 		SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods,
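
Note (not part of the commit): a minimal standalone sketch of how a duration flag like the one above is registered and parsed. The flag name (including its spelling) and the 1h default are copied from the diff; everything else is illustrative.

package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Registers the duration flag the same way flags.go does above.
	bspDisruptionTimeout := flag.Duration("blocking-system-pod-distruption-timeout", time.Hour,
		"The timeout after which CA will evict non-pdb-assigned blocking system pods")
	flag.Parse()

	// e.g. go run . --blocking-system-pod-distruption-timeout=30m
	fmt.Println("effective timeout:", *bspDisruptionTimeout)
}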

cluster-autoscaler/simulator/drain_test.go

Lines changed: 44 additions & 1 deletion
@@ -43,7 +43,11 @@ import (

 func TestGetPodsToMove(t *testing.T) {
 	var (
-		testTime = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		testTime                                = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		bspDisruptionTimeout                    = time.Hour
+		creationTimeBeforeBspDisturptionTimeout = testTime.Add(-bspDisruptionTimeout).Add(-time.Minute)
+		creationTimeAfterBspDisturptionTimeout  = testTime.Add(-bspDisruptionTimeout).Add(time.Minute)
+
 		replicas = int32(5)

 		unreplicatedPod = &apiv1.Pod{
@@ -68,6 +72,22 @@ func TestGetPodsToMove(t *testing.T) {
 				OwnerReferences: GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
 			},
 		}
+		drainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeBeforeBspDisturptionTimeout},
+			},
+		}
+		nonDrainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeAfterBspDisturptionTimeout},
+			},
+		}
 		localStoragePod = &apiv1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: "localStoragePod",
@@ -541,6 +561,28 @@ func TestGetPodsToMove(t *testing.T) {
 				Reason: drain.UnmovableKubeSystemPod,
 			},
 		},
+		{
+			desc:    "Kube-system no pdb system pods blocking",
+			pods:    []*apiv1.Pod{nonDrainableBlockingSystemPod},
+			wantErr: true,
+			wantBlocking: &drain.BlockingPod{
+				Pod:    nonDrainableBlockingSystemPod,
+				Reason: drain.UnmovableKubeSystemPod,
+			}},
+		{
+			desc:     "Kube-system no pdb system pods allowing",
+			pods:     []*apiv1.Pod{drainableBlockingSystemPod},
+			wantPods: []*apiv1.Pod{drainableBlockingSystemPod},
+		},
+		{
+			desc:    "Kube-system no pdb system pods blocking",
+			pods:    []*apiv1.Pod{drainableBlockingSystemPod, nonDrainableBlockingSystemPod},
+			wantErr: true,
+			wantBlocking: &drain.BlockingPod{
+				Pod:    nonDrainableBlockingSystemPod,
+				Reason: drain.UnmovableKubeSystemPod,
+			},
+		},
 		{
 			desc: "Local storage",
 			pods: []*apiv1.Pod{localStoragePod},
@@ -771,6 +813,7 @@ func TestGetPodsToMove(t *testing.T) {
 				SkipNodesWithSystemPods:           true,
 				SkipNodesWithLocalStorage:         true,
 				SkipNodesWithCustomControllerPods: true,
+				BspDisruptionTimeout:              bspDisruptionTimeout,
 			}
 			rules := append(tc.rules, rules.Default(deleteOptions)...)
 			tracker := pdb.NewBasicRemainingPdbTracker()
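
Note on the fixture names above: a pod created before testTime - bspDisruptionTimeout has already existed longer than the timeout and is therefore drainable, while one created after that cutoff is still blocking. A small standalone sketch of the same arithmetic (illustrative only, stdlib types):

package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
	timeout := time.Hour

	// Created 61 minutes ago: older than the timeout, so eviction is allowed.
	createdBeforeCutoff := now.Add(-timeout).Add(-time.Minute)
	// Created 59 minutes ago: younger than the timeout, so it still blocks.
	createdAfterCutoff := now.Add(-timeout).Add(time.Minute)

	fmt.Println(now.After(createdBeforeCutoff.Add(timeout))) // true  -> drainable
	fmt.Println(now.After(createdAfterCutoff.Add(timeout)))  // false -> blocking
}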

cluster-autoscaler/simulator/drainability/rules/rules.go

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ func Default(deleteOptions options.NodeDeleteOptions) Rules {

 		// Blocking checks
 		{rule: replicated.New(deleteOptions.SkipNodesWithCustomControllerPods)},
-		{rule: system.New(), skip: !deleteOptions.SkipNodesWithSystemPods},
+		{rule: system.New(deleteOptions.BspDisruptionTimeout), skip: !deleteOptions.SkipNodesWithSystemPods},
 		{rule: notsafetoevict.New()},
 		{rule: localstorage.New(), skip: !deleteOptions.SkipNodesWithLocalStorage},
 		{rule: pdbrule.New()},
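
A sketch (not part of the commit) of how a caller ends up with the re-wired system rule; it mirrors the test setup in drain_test.go above rather than documenting a new API, and the helper name defaultRulesWithTimeout is made up for illustration.

package example

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/simulator/drainability/rules"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/options"
)

// defaultRulesWithTimeout builds the default rule set so that system.New
// receives BspDisruptionTimeout from NodeDeleteOptions.
func defaultRulesWithTimeout() rules.Rules {
	deleteOptions := options.NodeDeleteOptions{
		SkipNodesWithSystemPods:           true,
		SkipNodesWithLocalStorage:         true,
		SkipNodesWithCustomControllerPods: true,
		BspDisruptionTimeout:              time.Hour,
	}
	return rules.Default(deleteOptions)
}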

cluster-autoscaler/simulator/drainability/rules/system/rule.go

Lines changed: 22 additions & 4 deletions
@@ -18,19 +18,25 @@ package system

 import (
 	"fmt"
+	"time"

 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/drainability"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
 )

+// KubeSystemNamespace is the namespase includes system pods
+const KubeSystemNamespace = "kube-system"
+
 // Rule is a drainability rule on how to handle system pods.
-type Rule struct{}
+type Rule struct {
+	BspDisruptionTimeout time.Duration
+}

 // New creates a new Rule.
-func New() *Rule {
-	return &Rule{}
+func New(bspDisruptionTimeout time.Duration) *Rule {
+	return &Rule{BspDisruptionTimeout: bspDisruptionTimeout}
 }

 // Name returns the name of the rule.
@@ -40,8 +46,20 @@ func (r *Rule) Name() string {

 // Drainable decides what to do with system pods on node drain.
 func (r *Rule) Drainable(drainCtx *drainability.DrainContext, pod *apiv1.Pod, _ *framework.NodeInfo) drainability.Status {
-	if pod.Namespace == "kube-system" && len(drainCtx.RemainingPdbTracker.MatchingPdbs(pod)) == 0 {
+	if isBlockingSystemPod(drainCtx, pod) {
+		if r.isBspPassedDisruptionTimeout(pod, drainCtx.Timestamp) {
+			return drainability.NewDrainableStatus()
+		}
 		return drainability.NewBlockedStatus(drain.UnmovableKubeSystemPod, fmt.Errorf("non-daemonset, non-mirrored, non-pdb-assigned kube-system pod present: %s", pod.Name))
 	}
 	return drainability.NewUndefinedStatus()
 }
+
+func isBlockingSystemPod(drainCtx *drainability.DrainContext, pod *apiv1.Pod) bool {
+	return pod.Namespace == KubeSystemNamespace && len(drainCtx.RemainingPdbTracker.MatchingPdbs(pod)) == 0
+}
+
+func (r *Rule) isBspPassedDisruptionTimeout(pod *apiv1.Pod, drainTime time.Time) bool {
+	return !pod.ObjectMeta.CreationTimestamp.IsZero() &&
+		drainTime.After(pod.ObjectMeta.CreationTimestamp.Add(r.BspDisruptionTimeout))
+}

cluster-autoscaler/simulator/drainability/rules/system/rule_test.go

Lines changed: 36 additions & 2 deletions
@@ -34,7 +34,11 @@ import (

 func TestDrainable(t *testing.T) {
 	var (
-		testTime = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		testTime                                = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		bspDisruptionTimeout                    = time.Minute
+		creationTimeBeforeBspDisturptionTimeout = testTime.Add(-bspDisruptionTimeout).Add(-time.Second)
+		creationTimeAfterBspDisturptionTimeout  = testTime.Add(-bspDisruptionTimeout).Add(time.Second)
+
 		replicas = int32(5)

 		rc = apiv1.ReplicationController{
@@ -84,6 +88,24 @@ func TestDrainable(t *testing.T) {
 			},
 		}

+		drainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   test.GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeBeforeBspDisturptionTimeout},
+			},
+		}
+
+		nonDrainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   test.GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeAfterBspDisturptionTimeout},
+			},
+		}
+
 		emptyPDB = &policyv1.PodDisruptionBudget{}

 		kubeSystemPDB = &policyv1.PodDisruptionBudget{
@@ -164,6 +186,18 @@ func TestDrainable(t *testing.T) {
 			wantReason: drain.UnmovableKubeSystemPod,
 			wantError:  true,
 		},
+		"block non-pdb system pod existing for less than BspDisruptionTimeout": {
+			pod:        nonDrainableBlockingSystemPod,
+			rcs:        []*apiv1.ReplicationController{&kubeSystemRc},
+			pdbs:       []*policyv1.PodDisruptionBudget{emptyPDB},
+			wantReason: drain.UnmovableKubeSystemPod,
+			wantError:  true,
+		},
+		"allow non-pdb system pod existing for more than BspDisruptionTimeout": {
+			pod:  drainableBlockingSystemPod,
+			rcs:  []*apiv1.ReplicationController{&kubeSystemRc},
+			pdbs: []*policyv1.PodDisruptionBudget{kubeSystemPDB},
+		},
 	} {
 		t.Run(desc, func(t *testing.T) {
 			tracker := pdb.NewBasicRemainingPdbTracker()
@@ -173,7 +207,7 @@ func TestDrainable(t *testing.T) {
 				RemainingPdbTracker: tracker,
 				Timestamp:           testTime,
 			}
-			status := New().Drainable(drainCtx, test.pod, nil)
+			status := New(bspDisruptionTimeout).Drainable(drainCtx, test.pod, nil)
 			assert.Equal(t, test.wantReason, status.BlockingReason)
 			assert.Equal(t, test.wantError, status.Error != nil)
 		})

cluster-autoscaler/simulator/options/nodedelete.go

Lines changed: 6 additions & 0 deletions
@@ -17,6 +17,8 @@ limitations under the License.
 package options

 import (
+	"time"
+
 	"k8s.io/autoscaler/cluster-autoscaler/config"
 )

@@ -35,6 +37,9 @@
 	// set or replication controller should have to allow pod deletion during
 	// scale down.
 	MinReplicaCount int
+	// BspDisruptionTimeout is the timeout after which CA will evict
+	// non-pdb-assigned blocking system pods
+	BspDisruptionTimeout time.Duration
 }

 // NewNodeDeleteOptions returns new node delete options extracted from autoscaling options.
@@ -44,5 +49,6 @@ func NewNodeDeleteOptions(opts config.AutoscalingOptions) NodeDeleteOptions {
 		SkipNodesWithLocalStorage:         opts.SkipNodesWithLocalStorage,
 		SkipNodesWithCustomControllerPods: opts.SkipNodesWithCustomControllerPods,
 		MinReplicaCount:                   opts.MinReplicaCount,
+		BspDisruptionTimeout:              opts.BspDisruptionTimeout,
 	}
 }
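
A sketch (not part of the commit) of the propagation path: the timeout set on AutoscalingOptions is copied into NodeDeleteOptions by NewNodeDeleteOptions, which is the struct the drainability rules above are built from. The helper name nodeDeleteOptionsFrom is hypothetical.

package example

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/config"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/options"
)

// nodeDeleteOptionsFrom converts autoscaling options into node delete options,
// carrying the new BspDisruptionTimeout along with the skip flag that gates it.
func nodeDeleteOptionsFrom(timeout time.Duration) options.NodeDeleteOptions {
	autoscalingOpts := config.AutoscalingOptions{
		SkipNodesWithSystemPods: true,
		BspDisruptionTimeout:    timeout,
	}
	return options.NewNodeDeleteOptions(autoscalingOpts)
}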
