Commit f90590b (2 parents: 15295c7 + 696af98)

Merge pull request #7914 from abdelrahman882/bsp

Add time based drainability rule for non-pdb-assigned system pods

7 files changed: +114, -9 lines


cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
@@ -276,6 +276,8 @@ type AutoscalingOptions struct {
 	// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
 	// to allow their pods deletion in scale down
 	MinReplicaCount int
+	// BspDisruptionTimeout is the timeout after which CA will evict non-pdb-assigned blocking system pods
+	BspDisruptionTimeout time.Duration
 	// NodeDeleteDelayAfterTaint is the duration to wait before deleting a node after tainting it
 	NodeDeleteDelayAfterTaint time.Duration
 	// NodeGroupSetRatio is a collection of ratios used by CA used to make scaling decisions.

cluster-autoscaler/config/flags/flags.go

Lines changed: 3 additions & 1 deletion
@@ -193,10 +193,11 @@ var (
 	recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
 	maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
 	maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
-	skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
+	skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will wait for --blocking-system-pod-distruption-timeout before deleting nodes with pods from kube-system (except for DaemonSet or mirror pods)")
 	skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
 	skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
 	minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
+	bspDisruptionTimeout = flag.Duration("blocking-system-pod-distruption-timeout", time.Hour, "The timeout after which CA will evict non-pdb-assigned blocking system pods, applicable only when --skip-nodes-with-system-pods is set to true")
 	nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
 	scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
 	maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
@@ -370,6 +371,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		SkipNodesWithSystemPods:           *skipNodesWithSystemPods,
 		SkipNodesWithLocalStorage:         *skipNodesWithLocalStorage,
 		MinReplicaCount:                   *minReplicaCount,
+		BspDisruptionTimeout:              *bspDisruptionTimeout,
 		NodeDeleteDelayAfterTaint:         *nodeDeleteDelayAfterTaint,
 		ScaleDownSimulationTimeout:        *scaleDownSimulationTimeout,
 		SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods,
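
Note (not part of the commit): a minimal standalone sketch of how a duration flag like the one above is registered and parsed. The flag name (including its spelling) and the 1h default are copied from the diff; everything else is illustrative.

package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Registers the duration flag the same way flags.go does above.
	bspDisruptionTimeout := flag.Duration("blocking-system-pod-distruption-timeout", time.Hour,
		"The timeout after which CA will evict non-pdb-assigned blocking system pods")
	flag.Parse()

	// e.g. go run . --blocking-system-pod-distruption-timeout=30m
	fmt.Println("effective timeout:", *bspDisruptionTimeout)
}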

cluster-autoscaler/simulator/drain_test.go

Lines changed: 44 additions & 1 deletion
@@ -43,7 +43,11 @@ import (

 func TestGetPodsToMove(t *testing.T) {
 	var (
-		testTime = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		testTime                                = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		bspDisruptionTimeout                    = time.Hour
+		creationTimeBeforeBspDisturptionTimeout = testTime.Add(-bspDisruptionTimeout).Add(-time.Minute)
+		creationTimeAfterBspDisturptionTimeout  = testTime.Add(-bspDisruptionTimeout).Add(time.Minute)
+
 		replicas = int32(5)

 		unreplicatedPod = &apiv1.Pod{
@@ -68,6 +72,22 @@ func TestGetPodsToMove(t *testing.T) {
 				OwnerReferences: GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
 			},
 		}
+		drainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeBeforeBspDisturptionTimeout},
+			},
+		}
+		nonDrainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeAfterBspDisturptionTimeout},
+			},
+		}
 		localStoragePod = &apiv1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: "localStoragePod",
@@ -541,6 +561,28 @@ func TestGetPodsToMove(t *testing.T) {
 				Reason: drain.UnmovableKubeSystemPod,
 			},
 		},
+		{
+			desc:    "Kube-system no pdb system pods blocking",
+			pods:    []*apiv1.Pod{nonDrainableBlockingSystemPod},
+			wantErr: true,
+			wantBlocking: &drain.BlockingPod{
+				Pod:    nonDrainableBlockingSystemPod,
+				Reason: drain.UnmovableKubeSystemPod,
+			}},
+		{
+			desc:     "Kube-system no pdb system pods allowing",
+			pods:     []*apiv1.Pod{drainableBlockingSystemPod},
+			wantPods: []*apiv1.Pod{drainableBlockingSystemPod},
+		},
+		{
+			desc:    "Kube-system no pdb system pods blocking",
+			pods:    []*apiv1.Pod{drainableBlockingSystemPod, nonDrainableBlockingSystemPod},
+			wantErr: true,
+			wantBlocking: &drain.BlockingPod{
+				Pod:    nonDrainableBlockingSystemPod,
+				Reason: drain.UnmovableKubeSystemPod,
+			},
+		},
 		{
 			desc: "Local storage",
 			pods: []*apiv1.Pod{localStoragePod},
@@ -771,6 +813,7 @@ func TestGetPodsToMove(t *testing.T) {
 				SkipNodesWithSystemPods:           true,
 				SkipNodesWithLocalStorage:         true,
 				SkipNodesWithCustomControllerPods: true,
+				BspDisruptionTimeout:              bspDisruptionTimeout,
 			}
 			rules := append(tc.rules, rules.Default(deleteOptions)...)
 			tracker := pdb.NewBasicRemainingPdbTracker()
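
Note on the fixture names above: a pod created before testTime - bspDisruptionTimeout has already existed longer than the timeout and is therefore drainable, while one created after that cutoff is still blocking. A small standalone sketch of the same arithmetic (illustrative only, stdlib types):

package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
	timeout := time.Hour

	// Created 61 minutes ago: older than the timeout, so eviction is allowed.
	createdBeforeCutoff := now.Add(-timeout).Add(-time.Minute)
	// Created 59 minutes ago: younger than the timeout, so it still blocks.
	createdAfterCutoff := now.Add(-timeout).Add(time.Minute)

	fmt.Println(now.After(createdBeforeCutoff.Add(timeout))) // true  -> drainable
	fmt.Println(now.After(createdAfterCutoff.Add(timeout)))  // false -> blocking
}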

cluster-autoscaler/simulator/drainability/rules/rules.go

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ func Default(deleteOptions options.NodeDeleteOptions) Rules {

 		// Blocking checks
 		{rule: replicated.New(deleteOptions.SkipNodesWithCustomControllerPods)},
-		{rule: system.New(), skip: !deleteOptions.SkipNodesWithSystemPods},
+		{rule: system.New(deleteOptions.BspDisruptionTimeout), skip: !deleteOptions.SkipNodesWithSystemPods},
 		{rule: notsafetoevict.New()},
 		{rule: localstorage.New(), skip: !deleteOptions.SkipNodesWithLocalStorage},
 		{rule: pdbrule.New()},
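
A sketch (not part of the commit) of how a caller ends up with the re-wired system rule; it mirrors the test setup in drain_test.go above rather than documenting a new API, and the helper name defaultRulesWithTimeout is made up for illustration.

package example

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/simulator/drainability/rules"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/options"
)

// defaultRulesWithTimeout builds the default rule set so that system.New
// receives BspDisruptionTimeout from NodeDeleteOptions.
func defaultRulesWithTimeout() rules.Rules {
	deleteOptions := options.NodeDeleteOptions{
		SkipNodesWithSystemPods:           true,
		SkipNodesWithLocalStorage:         true,
		SkipNodesWithCustomControllerPods: true,
		BspDisruptionTimeout:              time.Hour,
	}
	return rules.Default(deleteOptions)
}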

cluster-autoscaler/simulator/drainability/rules/system/rule.go

Lines changed: 22 additions & 4 deletions
@@ -18,19 +18,25 @@ package system

 import (
 	"fmt"
+	"time"

 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/drainability"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/drain"
 )

+// KubeSystemNamespace is the namespase includes system pods
+const KubeSystemNamespace = "kube-system"
+
 // Rule is a drainability rule on how to handle system pods.
-type Rule struct{}
+type Rule struct {
+	BspDisruptionTimeout time.Duration
+}

 // New creates a new Rule.
-func New() *Rule {
-	return &Rule{}
+func New(bspDisruptionTimeout time.Duration) *Rule {
+	return &Rule{BspDisruptionTimeout: bspDisruptionTimeout}
 }

 // Name returns the name of the rule.
@@ -40,8 +46,20 @@ func (r *Rule) Name() string {

 // Drainable decides what to do with system pods on node drain.
 func (r *Rule) Drainable(drainCtx *drainability.DrainContext, pod *apiv1.Pod, _ *framework.NodeInfo) drainability.Status {
-	if pod.Namespace == "kube-system" && len(drainCtx.RemainingPdbTracker.MatchingPdbs(pod)) == 0 {
+	if isBlockingSystemPod(drainCtx, pod) {
+		if r.isBspPassedDisruptionTimeout(pod, drainCtx.Timestamp) {
+			return drainability.NewDrainableStatus()
+		}
 		return drainability.NewBlockedStatus(drain.UnmovableKubeSystemPod, fmt.Errorf("non-daemonset, non-mirrored, non-pdb-assigned kube-system pod present: %s", pod.Name))
 	}
 	return drainability.NewUndefinedStatus()
 }
+
+func isBlockingSystemPod(drainCtx *drainability.DrainContext, pod *apiv1.Pod) bool {
+	return pod.Namespace == KubeSystemNamespace && len(drainCtx.RemainingPdbTracker.MatchingPdbs(pod)) == 0
+}
+
+func (r *Rule) isBspPassedDisruptionTimeout(pod *apiv1.Pod, drainTime time.Time) bool {
+	return !pod.ObjectMeta.CreationTimestamp.IsZero() &&
+		drainTime.After(pod.ObjectMeta.CreationTimestamp.Add(r.BspDisruptionTimeout))
+}

cluster-autoscaler/simulator/drainability/rules/system/rule_test.go

Lines changed: 36 additions & 2 deletions
@@ -34,7 +34,11 @@ import (

 func TestDrainable(t *testing.T) {
 	var (
-		testTime = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		testTime                                = time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
+		bspDisruptionTimeout                    = time.Minute
+		creationTimeBeforeBspDisturptionTimeout = testTime.Add(-bspDisruptionTimeout).Add(-time.Second)
+		creationTimeAfterBspDisturptionTimeout  = testTime.Add(-bspDisruptionTimeout).Add(time.Second)
+
 		replicas = int32(5)

 		rc = apiv1.ReplicationController{
@@ -84,6 +88,24 @@ func TestDrainable(t *testing.T) {
 			},
 		}

+		drainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   test.GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeBeforeBspDisturptionTimeout},
+			},
+		}
+
+		nonDrainableBlockingSystemPod = &apiv1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:              "systemPod",
+				Namespace:         "kube-system",
+				OwnerReferences:   test.GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", ""),
+				CreationTimestamp: metav1.Time{Time: creationTimeAfterBspDisturptionTimeout},
+			},
+		}
+
 		emptyPDB = &policyv1.PodDisruptionBudget{}

 		kubeSystemPDB = &policyv1.PodDisruptionBudget{
@@ -164,6 +186,18 @@ func TestDrainable(t *testing.T) {
 			wantReason: drain.UnmovableKubeSystemPod,
 			wantError:  true,
 		},
+		"block non-pdb system pod existing for less than BspDisruptionTimeout": {
+			pod:        nonDrainableBlockingSystemPod,
+			rcs:        []*apiv1.ReplicationController{&kubeSystemRc},
+			pdbs:       []*policyv1.PodDisruptionBudget{emptyPDB},
+			wantReason: drain.UnmovableKubeSystemPod,
+			wantError:  true,
+		},
+		"allow non-pdb system pod existing for more than BspDisruptionTimeout": {
+			pod:  drainableBlockingSystemPod,
+			rcs:  []*apiv1.ReplicationController{&kubeSystemRc},
+			pdbs: []*policyv1.PodDisruptionBudget{kubeSystemPDB},
+		},
 	} {
 		t.Run(desc, func(t *testing.T) {
 			tracker := pdb.NewBasicRemainingPdbTracker()
@@ -173,7 +207,7 @@ func TestDrainable(t *testing.T) {
 				RemainingPdbTracker: tracker,
 				Timestamp:           testTime,
 			}
-			status := New().Drainable(drainCtx, test.pod, nil)
+			status := New(bspDisruptionTimeout).Drainable(drainCtx, test.pod, nil)
 			assert.Equal(t, test.wantReason, status.BlockingReason)
 			assert.Equal(t, test.wantError, status.Error != nil)
 		})

cluster-autoscaler/simulator/options/nodedelete.go

Lines changed: 6 additions & 0 deletions
@@ -17,6 +17,8 @@ limitations under the License.
 package options

 import (
+	"time"
+
 	"k8s.io/autoscaler/cluster-autoscaler/config"
 )

@@ -35,6 +37,9 @@
 	// set or replication controller should have to allow pod deletion during
 	// scale down.
 	MinReplicaCount int
+	// BspDisruptionTimeout is the timeout after which CA will evict
+	// non-pdb-assigned blocking system pods
+	BspDisruptionTimeout time.Duration
 }

 // NewNodeDeleteOptions returns new node delete options extracted from autoscaling options.
@@ -44,5 +49,6 @@ func NewNodeDeleteOptions(opts config.AutoscalingOptions) NodeDeleteOptions {
 		SkipNodesWithLocalStorage:         opts.SkipNodesWithLocalStorage,
 		SkipNodesWithCustomControllerPods: opts.SkipNodesWithCustomControllerPods,
 		MinReplicaCount:                   opts.MinReplicaCount,
+		BspDisruptionTimeout:              opts.BspDisruptionTimeout,
 	}
 }
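
A sketch (not part of the commit) of the propagation path: the timeout set on AutoscalingOptions is copied into NodeDeleteOptions by NewNodeDeleteOptions, which is the struct the drainability rules above are built from. The helper name nodeDeleteOptionsFrom is hypothetical.

package example

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/config"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/options"
)

// nodeDeleteOptionsFrom converts autoscaling options into node delete options,
// carrying the new BspDisruptionTimeout along with the skip flag that gates it.
func nodeDeleteOptionsFrom(timeout time.Duration) options.NodeDeleteOptions {
	autoscalingOpts := config.AutoscalingOptions{
		SkipNodesWithSystemPods: true,
		BspDisruptionTimeout:    timeout,
	}
	return options.NewNodeDeleteOptions(autoscalingOpts)
}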
