
Commit ab28abb

Merge pull request #8279 from jackfrancis/ca-soft-taints-during-scaledown-1.30
[cluster-autoscaler-release-1.30] Fix cool down status condition to trigger scale down
2 parents 0d75ca5 + e8d5d0d commit ab28abb

3 files changed: +197 -71 lines changed

cluster-autoscaler/core/scaledown/status/status.go

Lines changed: 2 additions & 0 deletions
@@ -100,6 +100,8 @@ const (
     ScaleDownInCooldown
     // ScaleDownInProgress - the scale down wasn't attempted, because a previous scale-down was still in progress.
     ScaleDownInProgress
+    // ScaleDownNoCandidates - the scale down was skipped because of no scale down candidates.
+    ScaleDownNoCandidates
 )
 
 // NodeDeleteResultType denotes the type of the result of node deletion. It provides deeper
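Note: the new ScaleDownNoCandidates value separates "nothing to scale down" from a genuine cooldown; the static_autoscaler.go change below stops folding an empty candidate list into the cooldown check. This commit does not show where the new value is assigned, so the following is only a minimal sketch, using an illustrative local copy of the enum rather than the real scaledownstatus package, of how a status consumer might branch on it:

package main

import "fmt"

// ScaleDownResult is an illustrative stand-in for the status enum in
// cluster-autoscaler/core/scaledown/status; only the values touched by
// this diff are reproduced here.
type ScaleDownResult int

const (
    ScaleDownInCooldown ScaleDownResult = iota
    ScaleDownInProgress
    // ScaleDownNoCandidates - the scale down was skipped because of no scale down candidates.
    ScaleDownNoCandidates
)

// describe shows how a consumer can now tell "no candidates" apart from
// "scale down temporarily forbidden".
func describe(r ScaleDownResult) string {
    switch r {
    case ScaleDownInCooldown:
        return "scale down is in cooldown"
    case ScaleDownNoCandidates:
        return "scale down skipped: no candidates"
    default:
        return "scale down in progress or attempted"
    }
}

func main() {
    fmt.Println(describe(ScaleDownNoCandidates)) // prints: scale down skipped: no candidates
}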

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 20 additions & 17 deletions
@@ -626,7 +626,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 
     metrics.UpdateDurationFromStart(metrics.FindUnneeded, unneededStart)
 
-    scaleDownInCooldown := a.isScaleDownInCooldown(currentTime, scaleDownCandidates)
+    scaleDownInCooldown := a.isScaleDownInCooldown(currentTime)
     klog.V(4).Infof("Scale down status: lastScaleUpTime=%s lastScaleDownDeleteTime=%v "+
         "lastScaleDownFailTime=%s scaleDownForbidden=%v scaleDownInCooldown=%v",
         a.lastScaleUpTime, a.lastScaleDownDeleteTime, a.lastScaleDownFailTime,
@@ -647,6 +647,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 
     if scaleDownInCooldown {
         scaleDownStatus.Result = scaledownstatus.ScaleDownInCooldown
+        a.updateSoftDeletionTaints(allNodes)
     } else {
         klog.V(4).Infof("Starting scale down")
 
@@ -666,20 +667,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
         a.clusterStateRegistry.Recalculate()
     }
 
-    if (scaleDownStatus.Result == scaledownstatus.ScaleDownNoNodeDeleted ||
-        scaleDownStatus.Result == scaledownstatus.ScaleDownNoUnneeded) &&
-        a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount != 0 {
-        taintableNodes := a.scaleDownPlanner.UnneededNodes()
-
-        // Make sure we are only cleaning taints from selected node groups.
-        selectedNodes := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...)
-
-        // This is a sanity check to make sure `taintableNodes` only includes
-        // nodes from selected nodes.
-        taintableNodes = intersectNodes(selectedNodes, taintableNodes)
-        untaintableNodes := subtractNodes(selectedNodes, taintableNodes)
-        actuation.UpdateSoftDeletionTaints(a.AutoscalingContext, taintableNodes, untaintableNodes)
-    }
+    a.updateSoftDeletionTaints(allNodes)
 
     if typedErr != nil {
         klog.Errorf("Failed to scale down: %v", typedErr)
@@ -700,8 +688,23 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
     return nil
 }
 
-func (a *StaticAutoscaler) isScaleDownInCooldown(currentTime time.Time, scaleDownCandidates []*apiv1.Node) bool {
-    scaleDownInCooldown := a.processorCallbacks.disableScaleDownForLoop || len(scaleDownCandidates) == 0
+func (a *StaticAutoscaler) updateSoftDeletionTaints(allNodes []*apiv1.Node) {
+    if a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount != 0 {
+        taintableNodes := a.scaleDownPlanner.UnneededNodes()
+
+        // Make sure we are only cleaning taints from selected node groups.
+        selectedNodes := filterNodesFromSelectedGroups(a.CloudProvider, allNodes...)
+
+        // This is a sanity check to make sure `taintableNodes` only includes
+        // nodes from selected nodes.
+        taintableNodes = intersectNodes(selectedNodes, taintableNodes)
+        untaintableNodes := subtractNodes(selectedNodes, taintableNodes)
+        actuation.UpdateSoftDeletionTaints(a.AutoscalingContext, taintableNodes, untaintableNodes)
+    }
+}
+
+func (a *StaticAutoscaler) isScaleDownInCooldown(currentTime time.Time) bool {
+    scaleDownInCooldown := a.processorCallbacks.disableScaleDownForLoop
 
     if a.ScaleDownDelayTypeLocal {
         return scaleDownInCooldown
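Net effect of this change: deletion-candidate ("soft") taints are now reconciled via updateSoftDeletionTaints both when the loop is in cooldown and after every scale-down attempt, instead of only after an attempt that ended in ScaleDownNoNodeDeleted or ScaleDownNoUnneeded, and an empty candidate list no longer counts as cooldown in isScaleDownInCooldown. The sketch below is a simplified model of that control flow with stand-in types, not the real StaticAutoscaler API:

package main

import "fmt"

// autoscaler is a stand-in for StaticAutoscaler; only the fields needed
// to show the soft-taint cleanup decision are modeled.
type autoscaler struct {
    disableScaleDownForLoop bool     // cooldown signal from processor callbacks
    maxBulkSoftTaintCount   int      // 0 disables bulk soft tainting
    unneededNodes           []string // nodes the planner still considers unneeded
}

// updateSoftDeletionTaints mirrors the shape of the new helper: a no-op
// when bulk soft tainting is disabled, otherwise it reconciles
// deletion-candidate taints on the unneeded nodes.
func (a *autoscaler) updateSoftDeletionTaints() {
    if a.maxBulkSoftTaintCount == 0 {
        return
    }
    fmt.Printf("reconciling soft taints for %d unneeded nodes\n", len(a.unneededNodes))
}

// runOnce models the fixed loop: cleanup happens in the cooldown branch
// as well as after a scale-down attempt.
func (a *autoscaler) runOnce() {
    if a.disableScaleDownForLoop { // isScaleDownInCooldown, without the old "no candidates" clause
        fmt.Println("scale down in cooldown")
        a.updateSoftDeletionTaints()
        return
    }
    fmt.Println("attempting scale down")
    a.updateSoftDeletionTaints()
}

func main() {
    a := &autoscaler{disableScaleDownForLoop: true, maxBulkSoftTaintCount: 20, unneededNodes: []string{"n1", "n2"}}
    a.runOnce()
}

Before this fix, a loop that entered cooldown skipped the cleanup block entirely, so deletion-candidate taints could linger on nodes until a scale-down attempt ran again.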

cluster-autoscaler/core/static_autoscaler_test.go

Lines changed: 175 additions & 54 deletions
@@ -38,6 +38,7 @@ import (
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/actuation"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/legacy"
+    "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/planner"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
     "k8s.io/autoscaler/cluster-autoscaler/core/scaleup/orchestrator"
     . "k8s.io/autoscaler/cluster-autoscaler/core/test"
@@ -1391,76 +1392,37 @@ func TestStaticAutoscalerRunOnceWithUnselectedNodeGroups(t *testing.T) {
     provider.AddNode("ng1", n1)
     assert.NotNil(t, provider)
 
-    tests := map[string]struct {
+    tests := []struct {
+        name           string
         node           *apiv1.Node
         pods           []*apiv1.Pod
         expectedTaints []apiv1.Taint
     }{
-        "Node from selected node groups can get their deletion candidate taints removed": {
+        {
+            name:           "Node from selected node groups can get their deletion candidate taints removed",
             node:           n1,
             pods:           []*apiv1.Pod{p1},
             expectedTaints: []apiv1.Taint{},
         },
-        "Node from non-selected node groups should keep their deletion candidate taints": {
+        {
+            name:           "Node from non-selected node groups should keep their deletion candidate taints",
            node:           n2,
            pods:           nil,
            expectedTaints: n2.Spec.Taints,
        },
     }
 
-    for name, test := range tests {
-        // prevent issues with scoping, we should be able to get rid of that with Go 1.22
-        test := test
-        t.Run(name, func(t *testing.T) {
+    for _, test := range tests {
+        t.Run(test.name, func(t *testing.T) {
             t.Parallel()
-            // Create fake listers for the generated nodes, nothing returned by the rest (but the ones used in the tested path have to be defined).
-            readyNodeLister := kubernetes.NewTestNodeLister([]*apiv1.Node{test.node})
-            allNodeLister := kubernetes.NewTestNodeLister([]*apiv1.Node{test.node})
-            allPodListerMock := kubernetes.NewTestPodLister(test.pods)
-            daemonSetLister, err := kubernetes.NewTestDaemonSetLister(nil)
-            assert.NoError(t, err)
-            listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock,
-                kubernetes.NewTestPodDisruptionBudgetLister(nil), daemonSetLister,
-                nil, nil, nil, nil)
-
-            // Create context with minimal autoscalingOptions that guarantee we reach the tested logic.
-            autoscalingOptions := config.AutoscalingOptions{
-                ScaleDownEnabled:      true,
-                MaxBulkSoftTaintCount: 10,
-                MaxBulkSoftTaintTime:  3 * time.Second,
-            }
-            processorCallbacks := newStaticAutoscalerProcessorCallbacks()
-            clientset := fake.NewSimpleClientset(test.node)
-            context, err := NewScaleTestAutoscalingContext(autoscalingOptions, clientset, listerRegistry, provider, processorCallbacks, nil)
-            assert.NoError(t, err)
-
-            // Create CSR with unhealthy cluster protection effectively disabled, to guarantee we reach the tested logic.
-            clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
-                OkTotalUnreadyCount: 1,
-            }
-            clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(autoscalingOptions.NodeGroupDefaults))
-
-            // Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test.
-            sdActuator := actuation.NewActuator(&context, clusterState, deletiontracker.NewNodeDeletionTracker(0*time.Second), options.NodeDeleteOptions{}, nil, NewTestProcessors(&context).NodeGroupConfigProcessor)
-            context.ScaleDownActuator = sdActuator
-
-            // Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState.
-            sdPlanner := &candidateTrackingFakePlanner{}
-
-            autoscaler := &StaticAutoscaler{
-                AutoscalingContext:   &context,
-                clusterStateRegistry: clusterState,
-                scaleDownPlanner:     sdPlanner,
-                scaleDownActuator:    sdActuator,
-                processors:           NewTestProcessors(&context),
-                loopStartNotifier:    loopstart.NewObserversList(nil),
-                processorCallbacks:   processorCallbacks,
-            }
-
-            err = autoscaler.RunOnce(time.Now().Add(5 * time.Hour))
-            assert.NoError(t, err)
-            newNode, err := clientset.CoreV1().Nodes().Get(stdcontext.TODO(), test.node.Name, metav1.GetOptions{})
+            allNodes := []*apiv1.Node{test.node}
+            fakeClient := buildFakeClient(t, allNodes...)
+            autoscaler := buildStaticAutoscaler(t, provider, allNodes, allNodes, fakeClient)
+            runningTime := time.Now()
+            err := autoscaler.RunOnce(runningTime)
             assert.NoError(t, err)
+            newNode, clientErr := fakeClient.CoreV1().Nodes().Get(stdcontext.TODO(), test.node.Name, metav1.GetOptions{})
+            assert.NoError(t, clientErr)
             assert.Equal(t, test.expectedTaints, newNode.Spec.Taints)
         })
     }
@@ -2623,3 +2585,162 @@ func newEstimatorBuilder() estimator.EstimatorBuilder {
 
     return estimatorBuilder
 }
+
+func TestCleaningSoftTaintsInScaleDown(t *testing.T) {
+
+    provider := testprovider.NewTestCloudProvider(nil, nil)
+
+    minSizeNgName := "ng-min-size"
+    nodesToHaveNoTaints := createNodeGroupWithSoftTaintedNodes(provider, minSizeNgName, 2, 10, 2)
+
+    notSizeNgName := "ng"
+    nodesToHaveTaints := createNodeGroupWithSoftTaintedNodes(provider, notSizeNgName, 3, 10, 4)
+
+    tests := []struct {
+        name                          string
+        testNodes                     []*apiv1.Node
+        scaleDownInCoolDown           bool
+        expectedNodesWithSoftTaints   []*apiv1.Node
+        expectedNodesWithNoSoftTaints []*apiv1.Node
+    }{
+        {
+            name:                          "Soft tainted nodes are cleaned when scale down skipped",
+            testNodes:                     nodesToHaveNoTaints,
+            scaleDownInCoolDown:           false,
+            expectedNodesWithSoftTaints:   []*apiv1.Node{},
+            expectedNodesWithNoSoftTaints: nodesToHaveNoTaints,
+        },
+        {
+            name:                          "Soft tainted nodes are cleaned when scale down in cooldown",
+            testNodes:                     nodesToHaveNoTaints,
+            scaleDownInCoolDown:           true,
+            expectedNodesWithSoftTaints:   []*apiv1.Node{},
+            expectedNodesWithNoSoftTaints: nodesToHaveNoTaints,
+        },
+        {
+            name:                          "Soft tainted nodes are not cleaned when scale down requested",
+            testNodes:                     nodesToHaveTaints,
+            scaleDownInCoolDown:           false,
+            expectedNodesWithSoftTaints:   nodesToHaveTaints,
+            expectedNodesWithNoSoftTaints: []*apiv1.Node{},
+        },
+        {
+            name:                          "Soft tainted nodes are cleaned only from min sized node group when scale down requested",
+            testNodes:                     append(nodesToHaveNoTaints, nodesToHaveTaints...),
+            scaleDownInCoolDown:           false,
+            expectedNodesWithSoftTaints:   nodesToHaveTaints,
+            expectedNodesWithNoSoftTaints: nodesToHaveNoTaints,
+        },
+    }
+    for _, test := range tests {
+        t.Run(test.name, func(t *testing.T) {
+            t.Parallel()
+            fakeClient := buildFakeClient(t, test.testNodes...)
+
+            autoscaler := buildStaticAutoscaler(t, provider, test.testNodes, test.testNodes, fakeClient)
+            autoscaler.processorCallbacks.disableScaleDownForLoop = test.scaleDownInCoolDown
+            assert.Equal(t, autoscaler.isScaleDownInCooldown(time.Now()), test.scaleDownInCoolDown)
+
+            err := autoscaler.RunOnce(time.Now())
+
+            assert.NoError(t, err)
+
+            assertNodesSoftTaintsStatus(t, fakeClient, test.expectedNodesWithSoftTaints, true)
+            assertNodesSoftTaintsStatus(t, fakeClient, test.expectedNodesWithNoSoftTaints, false)
+        })
+    }
+}
+
+func buildStaticAutoscaler(t *testing.T, provider cloudprovider.CloudProvider, allNodes []*apiv1.Node, readyNodes []*apiv1.Node, fakeClient *fake.Clientset) *StaticAutoscaler {
+    autoscalingOptions := config.AutoscalingOptions{
+        NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
+            ScaleDownUnneededTime:         time.Minute,
+            ScaleDownUnreadyTime:          time.Minute,
+            ScaleDownUtilizationThreshold: 0.5,
+            MaxNodeProvisionTime:          10 * time.Second,
+        },
+        MaxScaleDownParallelism:    10,
+        MaxDrainParallelism:        1,
+        ScaleDownEnabled:           true,
+        MaxBulkSoftTaintCount:      20,
+        MaxBulkSoftTaintTime:       5 * time.Second,
+        NodeDeleteDelayAfterTaint:  5 * time.Minute,
+        ScaleDownSimulationTimeout: 10 * time.Second,
+    }
+
+    allNodeLister := kubernetes.NewTestNodeLister(allNodes)
+    readyNodeLister := kubernetes.NewTestNodeLister(readyNodes)
+
+    daemonSetLister, err := kubernetes.NewTestDaemonSetLister(nil)
+    assert.NoError(t, err)
+    listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister,
+        kubernetes.NewTestPodLister(nil),
+        kubernetes.NewTestPodDisruptionBudgetLister(nil), daemonSetLister, nil, nil, nil, nil)
+
+    processorCallbacks := newStaticAutoscalerProcessorCallbacks()
+
+    ctx, err := NewScaleTestAutoscalingContext(autoscalingOptions, fakeClient, listerRegistry, provider, processorCallbacks, nil)
+    assert.NoError(t, err)
+
+    processors := NewTestProcessors(&ctx)
+    cp := scaledowncandidates.NewCombinedScaleDownCandidatesProcessor()
+    cp.Register(scaledowncandidates.NewScaleDownCandidatesSortingProcessor([]scaledowncandidates.CandidatesComparer{}))
+    processors.ScaleDownNodeProcessor = cp
+
+    csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{OkTotalUnreadyCount: 1}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}))
+    actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor)
+    ctx.ScaleDownActuator = actuator
+
+    deleteOptions := options.NewNodeDeleteOptions(ctx.AutoscalingOptions)
+    drainabilityRules := rules.Default(deleteOptions)
+
+    sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules)
+
+    autoscaler := &StaticAutoscaler{
+        AutoscalingContext:   &ctx,
+        clusterStateRegistry: csr,
+        scaleDownActuator:    actuator,
+        scaleDownPlanner:     sdPlanner,
+        processors:           processors,
+        loopStartNotifier:    loopstart.NewObserversList(nil),
+        processorCallbacks:   processorCallbacks,
+    }
+    return autoscaler
+}
+
+func buildFakeClient(t *testing.T, nodes ...*apiv1.Node) *fake.Clientset {
+    fakeClient := fake.NewSimpleClientset()
+    for _, node := range nodes {
+        _, err := fakeClient.CoreV1().Nodes().Create(stdcontext.TODO(), node, metav1.CreateOptions{})
+        assert.NoError(t, err)
+    }
+    return fakeClient
+}
+
+func createNodeGroupWithSoftTaintedNodes(provider *testprovider.TestCloudProvider, name string, minSize int, maxSize int, size int) []*apiv1.Node {
+    nodesCreationTime := time.Time{}
+    var ngNodes []*apiv1.Node
+    ng := provider.BuildNodeGroup(name, minSize, maxSize, size, false, "", nil)
+    provider.InsertNodeGroup(ng)
+    for i := range size {
+        node := BuildTestNode(fmt.Sprintf("%s-node-%d", name, i), 2000, 1000)
+        node.CreationTimestamp = metav1.NewTime(nodesCreationTime)
+        node.Spec.Taints = []apiv1.Taint{{
+            Key:    taints.DeletionCandidateTaint,
+            Value:  "1",
+            Effect: apiv1.TaintEffectNoSchedule,
+        }}
+        SetNodeReadyState(node, true, nodesCreationTime)
+        ngNodes = append(ngNodes, node)
+        provider.AddNode(ng.Id(), node)
+    }
+    return ngNodes
+}
+
+func assertNodesSoftTaintsStatus(t *testing.T, fakeClient *fake.Clientset, nodes []*apiv1.Node, tainted bool) {
+    for _, node := range nodes {
+        newNode, clientErr := fakeClient.CoreV1().Nodes().Get(stdcontext.TODO(), node.Name, metav1.GetOptions{})
+        assert.NoError(t, clientErr)
+        assert.Equal(t, tainted, taints.HasDeletionCandidateTaint(newNode))
+    }
+}
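The new buildFakeClient and assertNodesSoftTaintsStatus helpers lean on the client-go fake clientset: nodes are seeded into an in-memory API, the autoscaler mutates them, and the test reads them back to check taints. Below is a standalone sketch of that round-trip pattern, assuming a placeholder taint key rather than the real taints.DeletionCandidateTaint constant and HasDeletionCandidateTaint helper:

package example

import (
    "context"
    "testing"

    apiv1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes/fake"
)

// TestTaintRoundTrip seeds a tainted node into a fake clientset and
// fetches it back, the same pattern the new helpers use to assert on
// deletion-candidate taints.
func TestTaintRoundTrip(t *testing.T) {
    node := &apiv1.Node{
        ObjectMeta: metav1.ObjectMeta{Name: "ng-node-0"},
        Spec: apiv1.NodeSpec{
            Taints: []apiv1.Taint{{
                Key:    "example.com/deletion-candidate", // placeholder key, not the real constant
                Value:  "1",
                Effect: apiv1.TaintEffectNoSchedule,
            }},
        },
    }
    client := fake.NewSimpleClientset(node)

    got, err := client.CoreV1().Nodes().Get(context.TODO(), node.Name, metav1.GetOptions{})
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }
    if len(got.Spec.Taints) != 1 {
        t.Fatalf("expected 1 taint, got %d", len(got.Spec.Taints))
    }
}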
