From 32c7b57c90d6fe8e3612a9765641802ded86b2e2 Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Thu, 28 Aug 2025 13:01:51 +0000 Subject: [PATCH 01/19] Node removal latency metrics added --- .../config/autoscaling_options.go | 2 + cluster-autoscaler/config/flags/flags.go | 2 + .../core/scaledown/actuation/actuator.go | 8 +- .../core/scaledown/actuation/actuator_test.go | 3 + .../latencytracker/latencytracker_test.go | 134 ++++++++++++++++++ .../latencytracker/nodelatencytracker.go | 66 +++++++++ .../core/scaledown/planner/planner.go | 18 ++- .../core/scaledown/planner/planner_test.go | 7 +- .../core/scaledown/unneeded/nodes.go | 49 ++++++- cluster-autoscaler/core/static_autoscaler.go | 9 +- .../core/static_autoscaler_test.go | 41 +++--- cluster-autoscaler/metrics/metrics.go | 14 ++ 12 files changed, 324 insertions(+), 29 deletions(-) create mode 100644 cluster-autoscaler/core/scaledown/latencytracker/latencytracker_test.go create mode 100644 cluster-autoscaler/core/scaledown/latencytracker/nodelatencytracker.go diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index a144580989cd..6699aa1fdf17 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -349,6 +349,8 @@ type AutoscalingOptions struct { CapacitybufferControllerEnabled bool // CapacitybufferPodInjectionEnabled tells if CA should injects fake pods for capacity buffers that are ready for provisioning CapacitybufferPodInjectionEnabled bool + // NodeLatencyTrackingEnabled is used to enable/disable node latency tracking. + NodeLatencyTrackingEnabled bool } // KubeClientOptions specify options for kube client diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go index 0f7209ebbb1b..e373a940c840 100644 --- a/cluster-autoscaler/config/flags/flags.go +++ b/cluster-autoscaler/config/flags/flags.go @@ -230,6 +230,7 @@ var ( nodeDeletionCandidateTTL = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. 
This sets the TTL of Cluster-Autoscaler's state if the Cluster-Autoscaler deployment becomes inactive")
 	capacitybufferControllerEnabled   = flag.Bool("capacity-buffer-controller-enabled", false, "Whether to enable the default controller for capacity buffers or not")
 	capacitybufferPodInjectionEnabled = flag.Bool("capacity-buffer-pod-injection-enabled", false, "Whether to enable pod list processor that processes ready capacity buffers and injects fake pods accordingly")
+	nodeLatencyTrackingEnabled        = flag.Bool("enable-node-latency-tracking", false, "Whether to enable tracking of node removal latency.")
 
 	// Deprecated flags
 	ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group (Deprecated, use startup-taints instead)")
@@ -414,6 +415,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		NodeDeletionCandidateTTL:          *nodeDeletionCandidateTTL,
 		CapacitybufferControllerEnabled:   *capacitybufferControllerEnabled,
 		CapacitybufferPodInjectionEnabled: *capacitybufferPodInjectionEnabled,
+		NodeLatencyTrackingEnabled:        *nodeLatencyTrackingEnabled,
 	}
 }
 
diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go
index 55ef2e5a8fa6..462db0823d0c 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -27,6 +27,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/budgets"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker"
+	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
 	"k8s.io/autoscaler/cluster-autoscaler/core/utils"
@@ -58,6 +59,7 @@ const (
 type Actuator struct {
 	ctx                   *context.AutoscalingContext
 	nodeDeletionTracker   *deletiontracker.NodeDeletionTracker
+	nodeLatencyTracker    *latencytracker.NodeLatencyTracker
 	nodeDeletionScheduler *GroupDeletionScheduler
 	deleteOptions         options.NodeDeleteOptions
 	drainabilityRules     rules.Rules
@@ -78,7 +80,7 @@ type actuatorNodeGroupConfigGetter interface {
 }
 
 // NewActuator returns a new instance of Actuator.
-func NewActuator(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupchange.NodeGroupChangeObserver, ndt *deletiontracker.NodeDeletionTracker, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, configGetter actuatorNodeGroupConfigGetter) *Actuator { +func NewActuator(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupchange.NodeGroupChangeObserver, ndt *deletiontracker.NodeDeletionTracker, nlt *latencytracker.NodeLatencyTracker, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, configGetter actuatorNodeGroupConfigGetter) *Actuator { ndb := NewNodeDeletionBatcher(ctx, scaleStateNotifier, ndt, ctx.NodeDeletionBatcherInterval) legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) var evictor Evictor @@ -90,6 +92,7 @@ func NewActuator(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupch return &Actuator{ ctx: ctx, nodeDeletionTracker: ndt, + nodeLatencyTracker: nlt, nodeDeletionScheduler: NewGroupDeletionScheduler(ctx, ndt, ndb, evictor), budgetProcessor: budgets.NewScaleDownBudgetProcessor(ctx), deleteOptions: deleteOptions, @@ -324,6 +327,9 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider } for _, node := range nodes { + if a.nodeLatencyTracker != nil { + a.nodeLatencyTracker.ObserveDeletion(node.Name, time.Now()) + } nodeInfo, err := clusterSnapshot.GetNodeInfo(node.Name) if err != nil { nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "nodeInfos.Get for %q returned error: %v", node.Name, err)} diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go index c2b6788f6247..b0e48898a516 100644 --- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go @@ -39,6 +39,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/budgets" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" . "k8s.io/autoscaler/cluster-autoscaler/core/test" "k8s.io/autoscaler/cluster-autoscaler/observers/nodegroupchange" @@ -1279,6 +1280,7 @@ func runStartDeletionTest(t *testing.T, tc startDeletionTestCase, force bool) { nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), configGetter: nodegroupconfig.NewDefaultNodeGroupConfigProcessor(ctx.NodeGroupDefaults), + nodeLatencyTracker: latencytracker.NewNodeLatencyTracker(), } var gotResult status.ScaleDownResult @@ -1557,6 +1559,7 @@ func TestStartDeletionInBatchBasic(t *testing.T) { ctx: &ctx, nodeDeletionTracker: ndt, nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), + nodeLatencyTracker: latencytracker.NewNodeLatencyTracker(), } for _, nodes := range deleteNodes { diff --git a/cluster-autoscaler/core/scaledown/latencytracker/latencytracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/latencytracker_test.go new file mode 100644 index 000000000000..d010038b814c --- /dev/null +++ b/cluster-autoscaler/core/scaledown/latencytracker/latencytracker_test.go @@ -0,0 +1,134 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package latencytracker + +import ( + "sync" + "testing" + "time" +) + +func TestUpdateStateWithUnneededList_AddsNewNodes(t *testing.T) { + tracker := NewNodeLatencyTracker() + now := time.Now() + node := NodeInfo{Name: "node1", UnneededSince: now, Threshold: 5 * time.Minute} + + tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) + + tracker.Lock() + defer tracker.Unlock() + if _, ok := tracker.nodes["node1"]; !ok { + t.Errorf("expected node1 to be tracked, but was not") + } +} + +func TestUpdateStateWithUnneededList_DoesNotDuplicate(t *testing.T) { + tracker := NewNodeLatencyTracker() + now := time.Now() + node := NodeInfo{Name: "node1", UnneededSince: now, Threshold: 5 * time.Minute} + + tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) + tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now.Add(time.Minute)) + + tracker.Lock() + defer tracker.Unlock() + if len(tracker.nodes) != 1 { + t.Errorf("expected 1 tracked node, got %d", len(tracker.nodes)) + } +} + +func TestObserveDeletion_RemovesNode(t *testing.T) { + tracker := NewNodeLatencyTracker() + now := time.Now() + node := NodeInfo{ + Name: "node1", + UnneededSince: now.Add(-10 * time.Minute), + Threshold: 5 * time.Minute, + } + tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) + + tracker.ObserveDeletion("node1", now) + + tracker.Lock() + defer tracker.Unlock() + if _, ok := tracker.nodes["node1"]; ok { + t.Errorf("expected node1 removed after ObserveDeletion") + } +} + +func TestObserveDeletion_NoOpIfNodeNotTracked(t *testing.T) { + tracker := NewNodeLatencyTracker() + now := time.Now() + + tracker.ObserveDeletion("node1", now) + + tracker.Lock() + defer tracker.Unlock() + if len(tracker.nodes) != 0 { + t.Errorf("expected no nodes tracked, got %d", len(tracker.nodes)) + } +} + +func TestConcurrentUpdatesAndDeletions(t *testing.T) { + tracker := NewNodeLatencyTracker() + now := time.Now() + + node := NodeInfo{ + Name: "node1", + UnneededSince: now, + Threshold: 2 * time.Minute, + } + + var wg sync.WaitGroup + stop := make(chan struct{}) + + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-stop: + return + default: + tracker.UpdateStateWithUnneededList([]NodeInfo{node}, time.Now()) + } + } + }() + + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-stop: + return + default: + tracker.ObserveDeletion("node1", time.Now()) + } + } + }() + + time.Sleep(50 * time.Millisecond) + close(stop) + wg.Wait() + + tracker.Lock() + defer tracker.Unlock() + if len(tracker.nodes) > 1 { + t.Errorf("expected at most 1 tracked node, got %d", len(tracker.nodes)) + } +} diff --git a/cluster-autoscaler/core/scaledown/latencytracker/nodelatencytracker.go b/cluster-autoscaler/core/scaledown/latencytracker/nodelatencytracker.go new file mode 100644 index 000000000000..0735c146e6a0 --- /dev/null +++ b/cluster-autoscaler/core/scaledown/latencytracker/nodelatencytracker.go @@ -0,0 +1,66 @@ +package latencytracker + +import ( + "sync" + "time" + + 
"k8s.io/autoscaler/cluster-autoscaler/metrics" + + "k8s.io/klog/v2" +) + +type NodeInfo struct { + Name string + UnneededSince time.Time + Threshold time.Duration +} + +type NodeLatencyTracker struct { + sync.Mutex + nodes map[string]NodeInfo +} + +// NewNodeLatencyTracker creates a new tracker. +func NewNodeLatencyTracker() *NodeLatencyTracker { + return &NodeLatencyTracker{ + nodes: make(map[string]NodeInfo), + } +} + +func (t *NodeLatencyTracker) UpdateStateWithUnneededList(list []NodeInfo, timestamp time.Time) { + t.Lock() + defer t.Unlock() + + currentSet := make(map[string]struct{}, len(list)) + for _, info := range list { + currentSet[info.Name] = struct{}{} + _, exists := t.nodes[info.Name] + if !exists { + t.nodes[info.Name] = NodeInfo{ + Name: info.Name, + UnneededSince: info.UnneededSince, + Threshold: info.Threshold, + } + klog.V(2).Infof("Started tracking unneeded node %s at %v with ScaleDownUnneededTime=%v", + info.Name, info.UnneededSince, info.Threshold) + } + } +} + +// ObserveDeletion is called by the actuator just before node deletion. +func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) { + t.Lock() + defer t.Unlock() + + if info, exists := t.nodes[nodeName]; exists { + duration := timestamp.Sub(info.UnneededSince) + + klog.V(2).Infof( + "Observing deletion for node %s, unneeded for %s (threshold was %s).", + nodeName, duration, info.Threshold, + ) + + metrics.UpdateScaleDownNodeDeletionDuration("true", duration-info.Threshold) + delete(t.nodes, nodeName) + } +} diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go index 2e2263fe84e0..ecc930548e6c 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner.go +++ b/cluster-autoscaler/core/scaledown/planner/planner.go @@ -26,6 +26,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unneeded" @@ -76,10 +77,11 @@ type Planner struct { cc controllerReplicasCalculator scaleDownSetProcessor nodes.ScaleDownSetProcessor scaleDownContext *nodes.ScaleDownContext + nodeLatencyTracker *latencytracker.NodeLatencyTracker } // New creates a new Planner object. 
-func New(context *context.AutoscalingContext, processors *processors.AutoscalingProcessors, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules) *Planner { +func New(context *context.AutoscalingContext, processors *processors.AutoscalingProcessors, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, nlt *latencytracker.NodeLatencyTracker) *Planner { resourceLimitsFinder := resource.NewLimitsFinder(processors.CustomResourcesProcessor) minUpdateInterval := context.AutoscalingOptions.NodeGroupDefaults.ScaleDownUnneededTime if minUpdateInterval == 0*time.Nanosecond { @@ -104,6 +106,7 @@ func New(context *context.AutoscalingContext, processors *processors.Autoscaling scaleDownSetProcessor: processors.ScaleDownSetProcessor, scaleDownContext: nodes.NewDefaultScaleDownContext(), minUpdateInterval: minUpdateInterval, + nodeLatencyTracker: nlt, } } @@ -307,6 +310,19 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand } } p.unneededNodes.Update(removableList, p.latestUpdate) + if p.nodeLatencyTracker != nil { + var unneededList []latencytracker.NodeInfo + for _, n := range p.unneededNodes.AsList() { + if threshold, ok := p.unneededNodes.GetUnneededTimeForNode(p.context, n.Name); ok { + unneededList = append(unneededList, latencytracker.NodeInfo{ + Name: n.Name, + UnneededSince: p.latestUpdate, + Threshold: threshold, + }) + } + } + p.nodeLatencyTracker.UpdateStateWithUnneededList(unneededList, p.latestUpdate) + } if unremovableCount > 0 { klog.V(1).Infof("%v nodes found to be unremovable in simulation, will re-check them at %v", unremovableCount, unremovableTimeout) } diff --git a/cluster-autoscaler/core/scaledown/planner/planner_test.go b/cluster-autoscaler/core/scaledown/planner/planner_test.go index 051a5f591645..3c8a6ae7d41e 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner_test.go +++ b/cluster-autoscaler/core/scaledown/planner/planner_test.go @@ -32,6 +32,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unremovable" @@ -503,7 +504,7 @@ func TestUpdateClusterState(t *testing.T) { assert.NoError(t, err) clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, tc.nodes, tc.pods) deleteOptions := options.NodeDeleteOptions{} - p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil) + p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil, latencytracker.NewNodeLatencyTracker()) p.eligibilityChecker = &fakeEligibilityChecker{eligible: asMap(tc.eligible)} if tc.isSimulationTimeout { context.AutoscalingOptions.ScaleDownSimulationTimeout = 1 * time.Second @@ -699,7 +700,7 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) { assert.NoError(t, err) clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, nil) deleteOptions := options.NodeDeleteOptions{} - p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil) + p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil, latencytracker.NewNodeLatencyTracker()) p.eligibilityChecker = &fakeEligibilityChecker{eligible: asMap(nodeNames(nodes))} 
 			p.minUpdateInterval = tc.updateInterval
 			p.unneededNodes.Update(previouslyUnneeded, time.Now())
@@ -1023,7 +1024,7 @@ func TestNodesToDelete(t *testing.T) {
 			assert.NoError(t, err)
 			clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, allNodes, nil)
 			deleteOptions := options.NodeDeleteOptions{}
-			p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil)
+			p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil, latencytracker.NewNodeLatencyTracker())
 			p.latestUpdate = time.Now()
 			p.scaleDownContext.ActuationStatus = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
 			p.unneededNodes.Update(allRemovables, time.Now().Add(-1*time.Hour))
diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes.go b/cluster-autoscaler/core/scaledown/unneeded/nodes.go
index ba1ad8e4d7cc..9ebd99fe3065 100644
--- a/cluster-autoscaler/core/scaledown/unneeded/nodes.go
+++ b/cluster-autoscaler/core/scaledown/unneeded/nodes.go
@@ -39,10 +39,11 @@ import (
 
 // Nodes tracks the state of cluster nodes that are not needed.
 type Nodes struct {
-	sdtg         scaleDownTimeGetter
-	limitsFinder *resource.LimitsFinder
-	cachedList   []*apiv1.Node
-	byName       map[string]*node
+	sdtg              scaleDownTimeGetter
+	limitsFinder      *resource.LimitsFinder
+	cachedList        []*apiv1.Node
+	byName            map[string]*node
+	unneededTimeCache map[string]time.Duration
 }
 
 type node struct {
@@ -60,8 +61,9 @@ type scaleDownTimeGetter interface {
 // NewNodes returns a new initialized Nodes object.
 func NewNodes(sdtg scaleDownTimeGetter, limitsFinder *resource.LimitsFinder) *Nodes {
 	return &Nodes{
-		sdtg:         sdtg,
-		limitsFinder: limitsFinder,
+		sdtg:              sdtg,
+		limitsFinder:      limitsFinder,
+		unneededTimeCache: make(map[string]time.Duration),
 	}
 }
 
@@ -209,6 +211,41 @@ func (n *Nodes) RemovableAt(context *context.AutoscalingContext, scaleDownContex
 	return
 }
 
+// GetUnneededTimeForNode returns the unneeded timeout for a given node if tracked.
+// Returns (duration, true) if found, otherwise (0, false).
+func (n *Nodes) GetUnneededTimeForNode(ctx *context.AutoscalingContext, nodeName string) (time.Duration, bool) {
+	v, found := n.byName[nodeName]
+	if !found {
+		klog.V(4).Infof("Skipping - node %s not found in unneeded list", nodeName)
+		return 0, false
+	}
+
+	node := v.ntbr.Node
+	nodeGroup, err := ctx.CloudProvider.NodeGroupForNode(node)
+	if err != nil {
+		klog.Errorf("Error while getting node group for %s: %v", nodeName, err)
+		return 0, false
+	}
+	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+		klog.V(4).Infof("Skipping %s - no node group", nodeName)
+		return 0, false
+	}
+
+	ngID := nodeGroup.Id()
+	if cached, ok := n.unneededTimeCache[ngID]; ok {
+		return cached, true
+	}
+
+	unneededTime, err := n.sdtg.GetScaleDownUnneededTime(nodeGroup)
+	if err != nil {
+		klog.Errorf("Error getting ScaleDownUnneededTime for node %s: %v", nodeName, err)
+		return 0, false
+	}
+
+	n.unneededTimeCache[ngID] = unneededTime
+	return unneededTime, true
+}
+
 func (n *Nodes) unremovableReason(context *context.AutoscalingContext, scaleDownContext nodes.ScaleDownContext, v *node, ts time.Time, nodeGroupSize map[string]int) simulator.UnremovableReason {
 	node := v.ntbr.Node
 	// Check if node is marked with no scale down annotation.
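
Taken together, this first patch gives the tracker a three-step lifecycle: the planner reports the current unneeded set (each node paired with its per-node-group timeout, looked up through GetUnneededTimeForNode), and the actuator calls ObserveDeletion just before deleting a node, at which point the time spent unneeded beyond the configured threshold is exported as a metric. A minimal sketch of that flow against the API as it stands in this patch (later patches in the series change the signatures; the node name and the 5-minute threshold are illustrative values only):

	package main

	import (
		"time"

		"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker"
	)

	func main() {
		tracker := latencytracker.NewNodeLatencyTracker()
		markedAt := time.Now()

		// Planner loop: report the node as unneeded together with its
		// per-node-group ScaleDownUnneededTime.
		tracker.UpdateStateWithUnneededList([]latencytracker.NodeInfo{
			{Name: "node-1", UnneededSince: markedAt, Threshold: 5 * time.Minute},
		}, markedAt)

		// Actuator, just before deleting the node 7 minutes later: observes
		// node_deletion_duration_seconds{deleted="true"} with 120s
		// (7m unneeded minus the 5m threshold).
		tracker.ObserveDeletion("node-1", markedAt.Add(7*time.Minute))
	}
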
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 77e4e13bfea3..6586f821c2d9 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -30,6 +30,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/core/scaledown" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/actuation" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/planner" scaledownstatus "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" @@ -170,11 +171,15 @@ func NewStaticAutoscaler( // TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext // during the struct creation rather than here. - scaleDownPlanner := planner.New(autoscalingContext, processors, deleteOptions, drainabilityRules) + var nldt *latencytracker.NodeLatencyTracker + if autoscalingContext.AutoscalingOptions.NodeLatencyTrackingEnabled { + nldt = latencytracker.NewNodeLatencyTracker() + } + scaleDownPlanner := planner.New(autoscalingContext, processors, deleteOptions, drainabilityRules, nldt) processorCallbacks.scaleDownPlanner = scaleDownPlanner ndt := deletiontracker.NewNodeDeletionTracker(0 * time.Second) - scaleDownActuator := actuation.NewActuator(autoscalingContext, processors.ScaleStateNotifier, ndt, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor) + scaleDownActuator := actuation.NewActuator(autoscalingContext, processors.ScaleStateNotifier, ndt, nldt, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor) autoscalingContext.ScaleDownActuator = scaleDownActuator if scaleUpOrchestrator == nil { diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 02425eb2a996..95826e837962 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -48,6 +48,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/core/scaledown" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/actuation" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/planner" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" "k8s.io/autoscaler/cluster-autoscaler/core/scaleup/orchestrator" @@ -165,7 +166,7 @@ func (m *onNodeGroupDeleteMock) Delete(id string) error { func setUpScaleDownActuator(ctx *context.AutoscalingContext, autoscalingOptions config.AutoscalingOptions) { deleteOptions := options.NewNodeDeleteOptions(autoscalingOptions) - ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), deleteOptions, rules.Default(deleteOptions), processorstest.NewTestProcessors(ctx).NodeGroupConfigProcessor) + ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), deleteOptions, rules.Default(deleteOptions), processorstest.NewTestProcessors(ctx).NodeGroupConfigProcessor) } type nodeGroup struct { @@ -211,6 +212,7 @@ type commonMocks struct { podDisruptionBudgetLister *podDisruptionBudgetListerMock daemonSetLister *daemonSetListerMock nodeDeletionTracker *deletiontracker.NodeDeletionTracker + 
nodeLatencyTracker *latencytracker.NodeLatencyTracker resourceClaimLister *fakeAllObjectsLister[*resourceapi.ResourceClaim] resourceSliceLister *fakeAllObjectsLister[*resourceapi.ResourceSlice] @@ -321,8 +323,12 @@ func setupAutoscaler(config *autoscalerSetupConfig) (*StaticAutoscaler, error) { if nodeDeletionTracker == nil { nodeDeletionTracker = deletiontracker.NewNodeDeletionTracker(0 * time.Second) } - ctx.ScaleDownActuator = actuation.NewActuator(&ctx, clusterState, nodeDeletionTracker, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor) - sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules) + nodeLatencyTracker := config.mocks.nodeLatencyTracker + if nodeLatencyTracker == nil { + nodeLatencyTracker = latencytracker.NewNodeLatencyTracker() + } + ctx.ScaleDownActuator = actuation.NewActuator(&ctx, clusterState, nodeDeletionTracker, nodeLatencyTracker, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor) + sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nodeLatencyTracker) processorCallbacks.scaleDownPlanner = sdPlanner @@ -410,7 +416,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { } processors := processorstest.NewTestProcessors(&context) clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) @@ -678,7 +684,7 @@ func TestStaticAutoscalerRunOnceWithScaleDownDelayPerNG(t *testing.T) { clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) processors.ScaleStateNotifier.Register(clusterState) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) @@ -822,7 +828,7 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) { } clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) @@ -975,7 +981,7 @@ func TestStaticAutoscalerRunOnceWithALongUnregisteredNode(t *testing.T) { // broken node failed to register in time clusterState.UpdateNodes(nodes, nil, later) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := 
newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) @@ -1130,7 +1136,7 @@ func TestStaticAutoscalerRunOncePodsWithPriorities(t *testing.T) { processors := processorstest.NewTestProcessors(&context) clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) @@ -1261,7 +1267,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnBinPackingEstimator(t *testing.T) processors := processorstest.NewTestProcessors(&context) clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) autoscaler := &StaticAutoscaler{ AutoscalingContext: &context, @@ -1359,7 +1365,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t * processors := processorstest.NewTestProcessors(&context) clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) autoscaler := &StaticAutoscaler{ AutoscalingContext: &context, @@ -2467,7 +2473,7 @@ func TestStaticAutoscalerUpcomingScaleDownCandidates(t *testing.T) { csr := clusterstate.NewClusterStateRegistry(provider, csrConfig, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), processors.AsyncNodeGroupStateChecker) // Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test. - actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), options.NodeDeleteOptions{}, nil, processorstest.NewTestProcessors(&ctx).NodeGroupConfigProcessor) + actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), options.NodeDeleteOptions{}, nil, processorstest.NewTestProcessors(&ctx).NodeGroupConfigProcessor) ctx.ScaleDownActuator = actuator // Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState. 
@@ -3128,7 +3134,7 @@ func waitForDeleteToFinish(t *testing.T, deleteFinished <-chan bool) { } } -func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_processors.AutoscalingProcessors, cs *clusterstate.ClusterStateRegistry, nodeDeletionTracker *deletiontracker.NodeDeletionTracker) (scaledown.Planner, scaledown.Actuator) { +func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_processors.AutoscalingProcessors, cs *clusterstate.ClusterStateRegistry, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, nodeLatencyTracker *latencytracker.NodeLatencyTracker) (scaledown.Planner, scaledown.Actuator) { ctx.MaxScaleDownParallelism = 10 ctx.MaxDrainParallelism = 1 ctx.NodeDeletionBatcherInterval = 0 * time.Second @@ -3143,8 +3149,11 @@ func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_proce if nodeDeletionTracker == nil { nodeDeletionTracker = deletiontracker.NewNodeDeletionTracker(0 * time.Second) } - planner := planner.New(ctx, p, deleteOptions, nil) - actuator := actuation.NewActuator(ctx, cs, nodeDeletionTracker, deleteOptions, nil, p.NodeGroupConfigProcessor) + if nodeLatencyTracker == nil { + nodeLatencyTracker = latencytracker.NewNodeLatencyTracker() + } + planner := planner.New(ctx, p, deleteOptions, nil, nodeLatencyTracker) + actuator := actuation.NewActuator(ctx, cs, nodeDeletionTracker, nodeLatencyTracker, deleteOptions, nil, p.NodeGroupConfigProcessor) return planner, actuator } @@ -3260,13 +3269,13 @@ func buildStaticAutoscaler(t *testing.T, provider cloudprovider.CloudProvider, a processors.ScaleDownNodeProcessor = cp csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{OkTotalUnreadyCount: 1}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), processors.AsyncNodeGroupStateChecker) - actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor) + actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor) ctx.ScaleDownActuator = actuator deleteOptions := options.NewNodeDeleteOptions(ctx.AutoscalingOptions) drainabilityRules := rules.Default(deleteOptions) - sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules) + sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, latencytracker.NewNodeLatencyTracker()) autoscaler := &StaticAutoscaler{ AutoscalingContext: &ctx, diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index ebc5541c5edb..fd9168c4e15d 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -427,6 +427,15 @@ var ( Buckets: k8smetrics.ExponentialBuckets(1, 2, 6), // 1, 2, 4, ..., 32 }, []string{"instance_type", "cpu_count", "namespace_count"}, ) + + scaleDownNodeDeletionDuration = k8smetrics.NewHistogramVec( + &k8smetrics.HistogramOpts{ + Namespace: caNamespace, + Name: "node_deletion_duration_seconds", + Help: "Latency from planning (node marked) to final outcome (deleted, aborted, rescued).", + Buckets: k8smetrics.ExponentialBuckets(10, 2, 12), + }, []string{"deleted"}, + ) ) // RegisterAll registers all metrics. 
@@ -463,6 +472,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(nodeTaintsCount)
 	legacyregistry.MustRegister(inconsistentInstancesMigsCount)
 	legacyregistry.MustRegister(binpackingHeterogeneity)
+	legacyregistry.MustRegister(scaleDownNodeDeletionDuration)
 
 	if emitPerNodeGroupMetrics {
 		legacyregistry.MustRegister(nodesGroupMinNodes)
@@ -750,3 +760,8 @@ func UpdateInconsistentInstancesMigsCount(migCount int) {
 func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount string, pegCount int) {
 	binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
 }
+
+// UpdateScaleDownNodeDeletionDuration records the time between a node becoming unneeded and its final outcome.
+func UpdateScaleDownNodeDeletionDuration(deleted string, duration time.Duration) {
+	scaleDownNodeDeletionDuration.WithLabelValues(deleted).Observe(duration.Seconds())
+}

From ba29eef6be5ff64b0471a03a5c050f565788d19f Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Tue, 23 Sep 2025 21:52:29 +0000
Subject: [PATCH 02/19] Update node_deletion_duration_seconds metric bucket
 distribution

---
 cluster-autoscaler/metrics/metrics.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index fd9168c4e15d..2563eb8f4561 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -433,7 +433,7 @@ var (
 			Namespace: caNamespace,
 			Name:      "node_deletion_duration_seconds",
 			Help:      "Latency from planning (node marked) to final outcome (deleted, aborted, rescued).",
-			Buckets:   k8smetrics.ExponentialBuckets(10, 2, 12),
+			Buckets:   k8smetrics.ExponentialBuckets(1, 2, 12), // 1, 2, 4, 8, ..., 2048
 		}, []string{"deleted"},
 	)
 )

From 7512a0dc55485bfa6102da0b11d9f243094e2921 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Tue, 23 Sep 2025 21:58:04 +0000
Subject: [PATCH 03/19] Rename files according to convention

---
 .../{nodelatencytracker.go => node_latency_tracker.go}       | 0
 .../{latencytracker_test.go => node_latency_tracker_test.go} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename cluster-autoscaler/core/scaledown/latencytracker/{nodelatencytracker.go => node_latency_tracker.go} (100%)
 rename cluster-autoscaler/core/scaledown/latencytracker/{latencytracker_test.go => node_latency_tracker_test.go} (100%)

diff --git a/cluster-autoscaler/core/scaledown/latencytracker/nodelatencytracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go
similarity index 100%
rename from cluster-autoscaler/core/scaledown/latencytracker/nodelatencytracker.go
rename to cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go
diff --git a/cluster-autoscaler/core/scaledown/latencytracker/latencytracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go
similarity index 100%
rename from cluster-autoscaler/core/scaledown/latencytracker/latencytracker_test.go
rename to cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go

From 50589282225514cb36d5a2d72d13dd4e1e09bce3 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Tue, 23 Sep 2025 22:07:13 +0000
Subject: [PATCH 04/19] Remove lock from metrics tracking

---
 .../scaledown/latencytracker/node_latency_tracker.go   |  8 --------
 .../latencytracker/node_latency_tracker_test.go        | 10 ----------
 2 files changed, 18 deletions(-)

diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go
index 
0735c146e6a0..25befdc8be54 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -1,7 +1,6 @@ package latencytracker import ( - "sync" "time" "k8s.io/autoscaler/cluster-autoscaler/metrics" @@ -16,7 +15,6 @@ type NodeInfo struct { } type NodeLatencyTracker struct { - sync.Mutex nodes map[string]NodeInfo } @@ -28,9 +26,6 @@ func NewNodeLatencyTracker() *NodeLatencyTracker { } func (t *NodeLatencyTracker) UpdateStateWithUnneededList(list []NodeInfo, timestamp time.Time) { - t.Lock() - defer t.Unlock() - currentSet := make(map[string]struct{}, len(list)) for _, info := range list { currentSet[info.Name] = struct{}{} @@ -49,9 +44,6 @@ func (t *NodeLatencyTracker) UpdateStateWithUnneededList(list []NodeInfo, timest // ObserveDeletion is called by the actuator just before node deletion. func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) { - t.Lock() - defer t.Unlock() - if info, exists := t.nodes[nodeName]; exists { duration := timestamp.Sub(info.UnneededSince) diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go index d010038b814c..e579688fb121 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go @@ -29,8 +29,6 @@ func TestUpdateStateWithUnneededList_AddsNewNodes(t *testing.T) { tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) - tracker.Lock() - defer tracker.Unlock() if _, ok := tracker.nodes["node1"]; !ok { t.Errorf("expected node1 to be tracked, but was not") } @@ -44,8 +42,6 @@ func TestUpdateStateWithUnneededList_DoesNotDuplicate(t *testing.T) { tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now.Add(time.Minute)) - tracker.Lock() - defer tracker.Unlock() if len(tracker.nodes) != 1 { t.Errorf("expected 1 tracked node, got %d", len(tracker.nodes)) } @@ -63,8 +59,6 @@ func TestObserveDeletion_RemovesNode(t *testing.T) { tracker.ObserveDeletion("node1", now) - tracker.Lock() - defer tracker.Unlock() if _, ok := tracker.nodes["node1"]; ok { t.Errorf("expected node1 removed after ObserveDeletion") } @@ -76,8 +70,6 @@ func TestObserveDeletion_NoOpIfNodeNotTracked(t *testing.T) { tracker.ObserveDeletion("node1", now) - tracker.Lock() - defer tracker.Unlock() if len(tracker.nodes) != 0 { t.Errorf("expected no nodes tracked, got %d", len(tracker.nodes)) } @@ -126,8 +118,6 @@ func TestConcurrentUpdatesAndDeletions(t *testing.T) { close(stop) wg.Wait() - tracker.Lock() - defer tracker.Unlock() if len(tracker.nodes) > 1 { t.Errorf("expected at most 1 tracked node, got %d", len(tracker.nodes)) } From 697d57e7c365a77ff69831e5e106af69f6a10ded Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Tue, 23 Sep 2025 22:21:47 +0000 Subject: [PATCH 05/19] Create and use LatencyTracker interface for better testability --- cluster-autoscaler/core/scaledown/actuation/actuator.go | 4 ++-- .../core/scaledown/latencytracker/node_latency_tracker.go | 5 ++++- cluster-autoscaler/core/scaledown/planner/planner.go | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go index 462db0823d0c..d7684d06cc1d 100644 --- 
a/cluster-autoscaler/core/scaledown/actuation/actuator.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator.go @@ -59,7 +59,7 @@ const ( type Actuator struct { ctx *context.AutoscalingContext nodeDeletionTracker *deletiontracker.NodeDeletionTracker - nodeLatencyTracker *latencytracker.NodeLatencyTracker + nodeLatencyTracker latencytracker.LatencyTracker nodeDeletionScheduler *GroupDeletionScheduler deleteOptions options.NodeDeleteOptions drainabilityRules rules.Rules @@ -80,7 +80,7 @@ type actuatorNodeGroupConfigGetter interface { } // NewActuator returns a new instance of Actuator. -func NewActuator(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupchange.NodeGroupChangeObserver, ndt *deletiontracker.NodeDeletionTracker, nlt *latencytracker.NodeLatencyTracker, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, configGetter actuatorNodeGroupConfigGetter) *Actuator { +func NewActuator(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupchange.NodeGroupChangeObserver, ndt *deletiontracker.NodeDeletionTracker, nlt latencytracker.LatencyTracker, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, configGetter actuatorNodeGroupConfigGetter) *Actuator { ndb := NewNodeDeletionBatcher(ctx, scaleStateNotifier, ndt, ctx.NodeDeletionBatcherInterval) legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) var evictor Evictor diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go index 25befdc8be54..ec886e2594a7 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -4,10 +4,13 @@ import ( "time" "k8s.io/autoscaler/cluster-autoscaler/metrics" - "k8s.io/klog/v2" ) +type LatencyTracker interface { + ObserveDeletion(nodeName string, timestamp time.Time) + UpdateStateWithUnneededList(list []NodeInfo, timestamp time.Time) +} type NodeInfo struct { Name string UnneededSince time.Time diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go index ecc930548e6c..3be435fc24a9 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner.go +++ b/cluster-autoscaler/core/scaledown/planner/planner.go @@ -77,11 +77,11 @@ type Planner struct { cc controllerReplicasCalculator scaleDownSetProcessor nodes.ScaleDownSetProcessor scaleDownContext *nodes.ScaleDownContext - nodeLatencyTracker *latencytracker.NodeLatencyTracker + nodeLatencyTracker latencytracker.LatencyTracker } // New creates a new Planner object. 
-func New(context *context.AutoscalingContext, processors *processors.AutoscalingProcessors, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, nlt *latencytracker.NodeLatencyTracker) *Planner { +func New(context *context.AutoscalingContext, processors *processors.AutoscalingProcessors, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, nlt latencytracker.LatencyTracker) *Planner { resourceLimitsFinder := resource.NewLimitsFinder(processors.CustomResourcesProcessor) minUpdateInterval := context.AutoscalingOptions.NodeGroupDefaults.ScaleDownUnneededTime if minUpdateInterval == 0*time.Nanosecond { From 71be2296efa21f03bd0fb6cad353c758b4ce13c8 Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Tue, 23 Sep 2025 22:34:23 +0000 Subject: [PATCH 06/19] Change UpdateStateWithUnneededList logic to also process nodes that are currently under deletion --- .../latencytracker/node_latency_tracker.go | 41 ++++++--- .../node_latency_tracker_test.go | 91 ------------------- .../core/scaledown/planner/planner.go | 16 +--- 3 files changed, 31 insertions(+), 117 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go index ec886e2594a7..c5fb7e9384f4 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -3,16 +3,16 @@ package latencytracker import ( "time" + apiv1 "k8s.io/api/core/v1" "k8s.io/autoscaler/cluster-autoscaler/metrics" "k8s.io/klog/v2" ) type LatencyTracker interface { ObserveDeletion(nodeName string, timestamp time.Time) - UpdateStateWithUnneededList(list []NodeInfo, timestamp time.Time) + UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) } type NodeInfo struct { - Name string UnneededSince time.Time Threshold time.Duration } @@ -28,19 +28,34 @@ func NewNodeLatencyTracker() *NodeLatencyTracker { } } -func (t *NodeLatencyTracker) UpdateStateWithUnneededList(list []NodeInfo, timestamp time.Time) { +// UpdateStateWithUnneededList records unneeded nodes and handles missing ones. 
+func (t *NodeLatencyTracker) UpdateStateWithUnneededList( + list []*apiv1.Node, + currentlyInDeletion map[string]bool, + timestamp time.Time, +) { currentSet := make(map[string]struct{}, len(list)) - for _, info := range list { - currentSet[info.Name] = struct{}{} - _, exists := t.nodes[info.Name] - if !exists { - t.nodes[info.Name] = NodeInfo{ - Name: info.Name, - UnneededSince: info.UnneededSince, - Threshold: info.Threshold, + for _, node := range list { + currentSet[node.Name] = struct{}{} + + if _, exists := t.nodes[node.Name]; !exists { + t.nodes[node.Name] = NodeInfo{ + UnneededSince: timestamp, + Threshold: 0, + } + klog.V(2).Infof("Started tracking unneeded node %s at %v", node.Name, timestamp) + } + } + + for name, info := range t.nodes { + if _, stillUnneeded := currentSet[name]; !stillUnneeded { + if _, inDeletion := currentlyInDeletion[name]; !inDeletion { + duration := timestamp.Sub(info.UnneededSince) + metrics.UpdateScaleDownNodeDeletionDuration("false", duration-info.Threshold) + delete(t.nodes, name) + klog.V(2).Infof("Node %q reported as deleted/missing (unneeded for %s, threshold %s)", + name, duration, info.Threshold) } - klog.V(2).Infof("Started tracking unneeded node %s at %v with ScaleDownUnneededTime=%v", - info.Name, info.UnneededSince, info.Threshold) } } } diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go index e579688fb121..d7fe2f63c45c 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go @@ -17,53 +17,10 @@ limitations under the License. package latencytracker import ( - "sync" "testing" "time" ) -func TestUpdateStateWithUnneededList_AddsNewNodes(t *testing.T) { - tracker := NewNodeLatencyTracker() - now := time.Now() - node := NodeInfo{Name: "node1", UnneededSince: now, Threshold: 5 * time.Minute} - - tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) - - if _, ok := tracker.nodes["node1"]; !ok { - t.Errorf("expected node1 to be tracked, but was not") - } -} - -func TestUpdateStateWithUnneededList_DoesNotDuplicate(t *testing.T) { - tracker := NewNodeLatencyTracker() - now := time.Now() - node := NodeInfo{Name: "node1", UnneededSince: now, Threshold: 5 * time.Minute} - - tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) - tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now.Add(time.Minute)) - - if len(tracker.nodes) != 1 { - t.Errorf("expected 1 tracked node, got %d", len(tracker.nodes)) - } -} - -func TestObserveDeletion_RemovesNode(t *testing.T) { - tracker := NewNodeLatencyTracker() - now := time.Now() - node := NodeInfo{ - Name: "node1", - UnneededSince: now.Add(-10 * time.Minute), - Threshold: 5 * time.Minute, - } - tracker.UpdateStateWithUnneededList([]NodeInfo{node}, now) - - tracker.ObserveDeletion("node1", now) - - if _, ok := tracker.nodes["node1"]; ok { - t.Errorf("expected node1 removed after ObserveDeletion") - } -} - func TestObserveDeletion_NoOpIfNodeNotTracked(t *testing.T) { tracker := NewNodeLatencyTracker() now := time.Now() @@ -74,51 +31,3 @@ func TestObserveDeletion_NoOpIfNodeNotTracked(t *testing.T) { t.Errorf("expected no nodes tracked, got %d", len(tracker.nodes)) } } - -func TestConcurrentUpdatesAndDeletions(t *testing.T) { - tracker := NewNodeLatencyTracker() - now := time.Now() - - node := NodeInfo{ - Name: "node1", - UnneededSince: now, - Threshold: 2 * time.Minute, - } 
- - var wg sync.WaitGroup - stop := make(chan struct{}) - - wg.Add(1) - go func() { - defer wg.Done() - for { - select { - case <-stop: - return - default: - tracker.UpdateStateWithUnneededList([]NodeInfo{node}, time.Now()) - } - } - }() - - wg.Add(1) - go func() { - defer wg.Done() - for { - select { - case <-stop: - return - default: - tracker.ObserveDeletion("node1", time.Now()) - } - } - }() - - time.Sleep(50 * time.Millisecond) - close(stop) - wg.Wait() - - if len(tracker.nodes) > 1 { - t.Errorf("expected at most 1 tracked node, got %d", len(tracker.nodes)) - } -} diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go index 3be435fc24a9..868255f6fda5 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner.go +++ b/cluster-autoscaler/core/scaledown/planner/planner.go @@ -131,6 +131,9 @@ func (p *Planner) UpdateClusterState(podDestinations, scaleDownCandidates []*api podDestinations = filterOutOngoingDeletions(podDestinations, deletions) scaleDownCandidates = filterOutOngoingDeletions(scaleDownCandidates, deletions) p.categorizeNodes(asMap(nodeNames(podDestinations)), scaleDownCandidates) + if p.nodeLatencyTracker != nil { + p.nodeLatencyTracker.UpdateStateWithUnneededList(p.unneededNodes.AsList(), deletions, p.latestUpdate) + } p.rs.DropOldHints() p.actuationInjector.DropOldHints() return nil @@ -310,19 +313,6 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand } } p.unneededNodes.Update(removableList, p.latestUpdate) - if p.nodeLatencyTracker != nil { - var unneededList []latencytracker.NodeInfo - for _, n := range p.unneededNodes.AsList() { - if threshold, ok := p.unneededNodes.GetUnneededTimeForNode(p.context, n.Name); ok { - unneededList = append(unneededList, latencytracker.NodeInfo{ - Name: n.Name, - UnneededSince: p.latestUpdate, - Threshold: threshold, - }) - } - } - p.nodeLatencyTracker.UpdateStateWithUnneededList(unneededList, p.latestUpdate) - } if unremovableCount > 0 { klog.V(1).Infof("%v nodes found to be unremovable in simulation, will re-check them at %v", unremovableCount, unremovableTimeout) } From bb064bdf7a8f063009c68304493f080339a4a382 Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Tue, 23 Sep 2025 22:35:46 +0000 Subject: [PATCH 07/19] cover planner node deletion latency tracking with test --- .../scaledown/latencytracker/node_latency_tracker.go | 9 +++++++++ .../core/scaledown/planner/planner_test.go | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go index c5fb7e9384f4..e7f4aff3f046 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -74,3 +74,12 @@ func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Tim delete(t.nodes, nodeName) } } + +// GetTrackedNodes returns the names of all nodes currently tracked as unneeded. 
+func (t *NodeLatencyTracker) GetTrackedNodes() []string { + names := make([]string, 0, len(t.nodes)) + for name := range t.nodes { + names = append(names, name) + } + return names +} diff --git a/cluster-autoscaler/core/scaledown/planner/planner_test.go b/cluster-autoscaler/core/scaledown/planner/planner_test.go index 3c8a6ae7d41e..3beaa61c3df7 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner_test.go +++ b/cluster-autoscaler/core/scaledown/planner/planner_test.go @@ -522,6 +522,13 @@ func TestUpdateClusterState(t *testing.T) { assert.Equal(t, wantUnneeded[n.Name], p.unneededNodes.Contains(n.Name), []string{n.Name, "unneeded"}) assert.Equal(t, wantUnremovable[n.Name], p.unremovableNodes.Contains(n.Name), []string{n.Name, "unremovable"}) } + tracked := p.nodeLatencyTracker.GetTrackedNodes() + for _, name := range tc.wantUnneeded { + assert.Contains(t, tracked, name, "expected node in latency tracker") + } + for _, name := range tc.wantUnremovable { + assert.NotContains(t, tracked, name, "expected node not in latency tracker") + } }) } } From 905cb29653a086a2c50890d3e71bc1c3b3c8d708 Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Tue, 23 Sep 2025 22:41:40 +0000 Subject: [PATCH 08/19] Add UpdateThreshold method to ndlt and use it during RemovableAt --- .../latencytracker/node_latency_tracker.go | 12 +++++++ .../core/scaledown/planner/planner.go | 2 +- .../core/scaledown/unneeded/nodes.go | 27 ++++++++++------ .../core/scaledown/unneeded/nodes_test.go | 31 +++++++++++++++++-- 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go index e7f4aff3f046..d46b28bbce07 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -11,6 +11,7 @@ import ( type LatencyTracker interface { ObserveDeletion(nodeName string, timestamp time.Time) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) + UpdateThreshold(nodeName string, threshold time.Duration) } type NodeInfo struct { UnneededSince time.Time @@ -75,6 +76,17 @@ func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Tim } } +// UpdateThreshold updates the scale-down threshold for a tracked node. +func (t *NodeLatencyTracker) UpdateThreshold(nodeName string, threshold time.Duration) { + if info, exists := t.nodes[nodeName]; exists { + info.Threshold = threshold + t.nodes[nodeName] = info + klog.V(2).Infof("Updated threshold for node %q to %s", nodeName, threshold) + } else { + klog.Warningf("Attempted to update threshold for unknown node %q", nodeName) + } +} + // GetTrackedNodes returns the names of all nodes currently tracked as unneeded. 
 func (t *NodeLatencyTracker) GetTrackedNodes() []string {
 	names := make([]string, 0, len(t.nodes))
 	for name := range t.nodes {
 		names = append(names, name)
 	}
 	return names
 }
diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 868255f6fda5..eb47cf5380ec 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -96,7 +96,7 @@ func New(context *context.AutoscalingContext, processors *processors.Autoscaling
 	return &Planner{
 		context:            context,
 		unremovableNodes:   unremovable.NewNodes(),
-		unneededNodes:      unneededNodes,
+		unneededNodes:      unneeded.NewNodes(processors.NodeGroupConfigProcessor, resourceLimitsFinder, nlt),
 		rs:                 simulator.NewRemovalSimulator(context.ListerRegistry, context.ClusterSnapshot, deleteOptions, drainabilityRules, true),
 		actuationInjector:  scheduling.NewHintingSimulator(),
 		eligibilityChecker: eligibility.NewChecker(processors.NodeGroupConfigProcessor),
diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes.go b/cluster-autoscaler/core/scaledown/unneeded/nodes.go
index 9ebd99fe3065..b0959c873b15 100644
--- a/cluster-autoscaler/core/scaledown/unneeded/nodes.go
+++ b/cluster-autoscaler/core/scaledown/unneeded/nodes.go
@@ -25,6 +25,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/context"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
+	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	"k8s.io/autoscaler/cluster-autoscaler/processors/nodes"
@@ -39,11 +40,12 @@ import (
 
 // Nodes tracks the state of cluster nodes that are not needed.
 type Nodes struct {
-	sdtg              scaleDownTimeGetter
-	limitsFinder      *resource.LimitsFinder
-	cachedList        []*apiv1.Node
-	byName            map[string]*node
-	unneededTimeCache map[string]time.Duration
+	sdtg               scaleDownTimeGetter
+	limitsFinder       *resource.LimitsFinder
+	nodeLatencyTracker latencytracker.LatencyTracker
+	cachedList         []*apiv1.Node
+	byName             map[string]*node
+	unneededTimeCache  map[string]time.Duration
 }
 
 type node struct {
@@ -59,11 +61,12 @@ type scaleDownTimeGetter interface {
 }
 
 // NewNodes returns a new initialized Nodes object.
-func NewNodes(sdtg scaleDownTimeGetter, limitsFinder *resource.LimitsFinder) *Nodes {
+func NewNodes(sdtg scaleDownTimeGetter, limitsFinder *resource.LimitsFinder, nlt latencytracker.LatencyTracker) *Nodes {
 	return &Nodes{
-		sdtg:              sdtg,
-		limitsFinder:      limitsFinder,
-		unneededTimeCache: make(map[string]time.Duration),
+		sdtg:               sdtg,
+		limitsFinder:       limitsFinder,
+		unneededTimeCache:  make(map[string]time.Duration),
+		nodeLatencyTracker: nlt,
 	}
 }
 
@@ -268,6 +271,9 @@
 	if ready {
 		// Check how long a ready node was underutilized.
 		unneededTime, err := n.sdtg.GetScaleDownUnneededTime(nodeGroup)
 		if err != nil {
 			klog.Errorf("Error trying to get ScaleDownUnneededTime for node %s (in group: %s)", node.Name, nodeGroup.Id())
 			return simulator.UnexpectedError
 		}
+		if n.nodeLatencyTracker != nil {
+			n.nodeLatencyTracker.UpdateThreshold(node.Name, unneededTime)
+		}
@@ -278,6 +284,9 @@
 	} else {
 		// Unready nodes may be deleted after a different time than underutilized nodes.
 		unreadyTime, err := n.sdtg.GetScaleDownUnreadyTime(nodeGroup)
+		if err == nil && n.nodeLatencyTracker != nil {
+			n.nodeLatencyTracker.UpdateThreshold(node.Name, unreadyTime)
+		}
 		if err != nil {
 			klog.Errorf("Error trying to get ScaleDownUnreadyTime for node %s (in group: %s)", node.Name, nodeGroup.Id())
 			return simulator.UnexpectedError
diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
index 7a650854ed2f..44bff7fbd768 100644
--- a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
+++ b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
@@ -101,7 +101,7 @@ func TestUpdate(t *testing.T) {
 		tc := tc
 		t.Run(tc.desc, func(t *testing.T) {
 			t.Parallel()
-			nodes := NewNodes(nil, nil)
+			nodes := NewNodes(nil, nil, nil)
 			nodes.Update(tc.initialNodes, initialTimestamp)
 			nodes.Update(tc.finalNodes, finalTimestamp)
 			wantNodes := len(tc.wantTimestamps)
@@ -203,7 +203,8 @@ func TestRemovableAt(t *testing.T) {
 			ctx, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{ScaleDownSimulationTimeout: 5 * time.Minute}, &fake.Clientset{}, registry, provider, nil, nil)
 			assert.NoError(t, err)
 
-			n := NewNodes(&fakeScaleDownTimeGetter{}, &resource.LimitsFinder{})
+			fakeTracker := NewFakeLatencyTracker()
+			n := NewNodes(&fakeScaleDownTimeGetter{}, &resource.LimitsFinder{}, fakeTracker)
 			n.Update(removableNodes, time.Now())
 			gotEmptyToRemove, gotDrainToRemove, _ := n.RemovableAt(&ctx, nodeprocessors.ScaleDownContext{
 				ActuationStatus: as,
@@ -213,6 +214,16 @@ func TestRemovableAt(t *testing.T) {
 			if len(gotDrainToRemove) != tc.numDrainToRemove || len(gotEmptyToRemove) != tc.numEmptyToRemove {
 				t.Errorf("%s: getNodesToRemove() return %d, %d, want %d, %d", tc.name, len(gotEmptyToRemove), len(gotDrainToRemove), tc.numEmptyToRemove, tc.numDrainToRemove)
 			}
+			expectedThreshold := 0 * time.Second // matches fakeScaleDownTimeGetter
+			for _, node := range removableNodes {
+				nodeName := node.Node.Name
+				got, ok := fakeTracker.Observed[nodeName]
+				if !ok {
+					t.Errorf("NodeLatencyTracker not called for node %s", nodeName)
+				} else if got != expectedThreshold {
+					t.Errorf("NodeLatencyTracker called with %v for node %s, want %v", got, nodeName, expectedThreshold)
+				}
+			}
 		})
 	}
 }
@@ -339,3 +350,19 @@ func (f *fakeScaleDownTimeGetter) GetScaleDownUnneededTime(cloudprovider.NodeGro
 func (f *fakeScaleDownTimeGetter) GetScaleDownUnreadyTime(cloudprovider.NodeGroup) (time.Duration, error) {
 	return 0 * time.Second, nil
 }
+
+type fakeLatencyTracker struct {
+	Observed map[string]time.Duration
+}
+
+func NewFakeLatencyTracker() *fakeLatencyTracker {
+	return &fakeLatencyTracker{Observed: make(map[string]time.Duration)}
+}
+
+func (f *fakeLatencyTracker) UpdateThreshold(nodeName string, threshold time.Duration) {
+	f.Observed[nodeName] = threshold
+}
+func (f *fakeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) {
+}
+func (f *fakeLatencyTracker) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) {
+}

From 3bd35bf3f63771321afa4eb40dfc1956693a52c6 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Tue, 23 Sep 2025 22:42:30 +0000
Subject: [PATCH 09/19] Remove GetUnneededTimeForNode

---
 .../core/scaledown/unneeded/nodes.go | 36 ------------------
 1 file changed, 36 deletions(-)

diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes.go b/cluster-autoscaler/core/scaledown/unneeded/nodes.go
index b0959c873b15..83d2db057a74 100644
--- a/cluster-autoscaler/core/scaledown/unneeded/nodes.go
+++ b/cluster-autoscaler/core/scaledown/unneeded/nodes.go
@@ -65,7 +65,6 @@ func NewNodes(sdtg scaleDownTimeGetter, limitsFinder *resource.LimitsFinder, nlt
 	return &Nodes{
 		sdtg:               sdtg,
 		limitsFinder:       limitsFinder,
-		unneededTimeCache:  make(map[string]time.Duration),
 		nodeLatencyTracker: nlt,
 	}
 }
@@ -214,41 +213,6 @@ func (n *Nodes) RemovableAt(context *context.AutoscalingContext, scaleDownContex
 	return
 }
 
-// GetUnneededTimeForNode returns the unneeded timeout for a given node if tracked.
-// Returns (duration, true) if found, otherwise (0, false).
-func (n *Nodes) GetUnneededTimeForNode(ctx *context.AutoscalingContext, nodeName string) (time.Duration, bool) {
-	v, found := n.byName[nodeName]
-	if !found {
-		klog.V(4).Infof("Skipping - node %s not found in unneded list", nodeName)
-		return 0, false
-	}
-
-	node := v.ntbr.Node
-	nodeGroup, err := ctx.CloudProvider.NodeGroupForNode(node)
-	if err != nil {
-		klog.Errorf("Error while getting node group for %s: %v", nodeName, err)
-		return 0, false
-	}
-	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
-		klog.V(4).Infof("Skipping %s - no node group", nodeName)
-		return 0, false
-	}
-
-	ngID := nodeGroup.Id()
-	if cached, ok := n.unneededTimeCache[ngID]; ok {
-		return cached, true
-	}
-
-	unneededTime, err := n.sdtg.GetScaleDownUnneededTime(nodeGroup)
-	if err != nil {
-		klog.Errorf("Error getting ScaleDownUnneededTime for node %s: %v", nodeName, err)
-		return 0, false
-	}
-
-	n.unneededTimeCache[ngID] = unneededTime
-	return unneededTime, true
-}
-
 func (n *Nodes) unremovableReason(context *context.AutoscalingContext, scaleDownContext nodes.ScaleDownContext, v *node, ts time.Time, nodeGroupSize map[string]int) simulator.UnremovableReason {
 	node := v.ntbr.Node
 	// Check if node is marked with no scale down annotation.
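Patches 08 and 09 settle the tracker's contract at three calls: UpdateStateWithUnneededList starts or refreshes tracking for nodes the planner still considers unneeded, UpdateThreshold records the per-node-group scale-down timeout resolved during RemovableAt, and ObserveDeletion closes the measurement once the actuator schedules the node for deletion. The body of ObserveDeletion is not part of the hunks above, so the snippet below is only a sketch of one plausible shape for it, using minimal stand-ins for the patch's types; the assumption that only time spent unneeded beyond the threshold counts as latency, the printed output, and the example values are illustrative, not code from this series.

    package main

    import (
    	"fmt"
    	"time"
    )

    // Minimal stand-ins for the patch's NodeInfo and NodeLatencyTracker,
    // for illustration only.
    type NodeInfo struct {
    	UnneededSince time.Time
    	Threshold     time.Duration
    }

    type NodeLatencyTracker struct {
    	nodes map[string]NodeInfo
    }

    // ObserveDeletion sketch: close the measurement for a tracked node,
    // counting only the time spent unneeded beyond the per-node threshold.
    // The real patch logs via klog and records a metric instead of printing.
    func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) {
    	info, exists := t.nodes[nodeName]
    	if !exists {
    		return
    	}
    	latency := timestamp.Sub(info.UnneededSince) - info.Threshold
    	if latency < 0 {
    		latency = 0
    	}
    	fmt.Printf("node %s deleted, latency beyond threshold: %s\n", nodeName, latency)
    	delete(t.nodes, nodeName)
    }

    func main() {
    	t := &NodeLatencyTracker{nodes: map[string]NodeInfo{
    		"node-1": {UnneededSince: time.Now().Add(-15 * time.Minute), Threshold: 10 * time.Minute},
    	}}
    	t.ObserveDeletion("node-1", time.Now()) // prints roughly 5m0s
    }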
From 97eb2f55d6c7deb6891de7899d5debc080f176b1 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Tue, 23 Sep 2025 22:43:48 +0000
Subject: [PATCH 10/19] Move ObserveDeletion to the correct place and add a
 test

---
 .../core/scaledown/actuation/actuator.go      |    9 +-
 .../core/scaledown/actuation/actuator_test.go | 3264 +++++++++--------
 2 files changed, 1659 insertions(+), 1614 deletions(-)

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go
index d7684d06cc1d..198c02230281 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -327,9 +327,6 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
 	}
 
 	for _, node := range nodes {
-		if a.nodeLatencyTracker != nil {
-			a.nodeLatencyTracker.ObserveDeletion(node.Name, time.Now())
-		}
 		nodeInfo, err := clusterSnapshot.GetNodeInfo(node.Name)
 		if err != nil {
 			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "nodeInfos.Get for %q returned error: %v", node.Name, err)}
@@ -352,10 +349,16 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
 
 		if force {
 			go a.nodeDeletionScheduler.scheduleForceDeletion(nodeInfo, nodeGroup, batchSize, drain)
+			if a.nodeLatencyTracker != nil {
+				a.nodeLatencyTracker.ObserveDeletion(node.Name, time.Now())
+			}
 			continue
 		}
 		go a.nodeDeletionScheduler.ScheduleDeletion(nodeInfo, nodeGroup, batchSize, drain)
+		if a.nodeLatencyTracker != nil {
+			a.nodeLatencyTracker.ObserveDeletion(node.Name, time.Now())
+		}
 	}
 }

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
index b0e48898a516..13336b6726a1 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -39,7 +39,6 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/config"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/budgets"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/deletiontracker"
-	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
 	. "k8s.io/autoscaler/cluster-autoscaler/core/test"
 	"k8s.io/autoscaler/cluster-autoscaler/observers/nodegroupchange"
@@ -55,1675 +54,1718 @@ import (
 )
 
 type nodeGroupViewInfo struct {
-	nodeGroupName string
-	from          int
-	to            int
+	nodeGroupName string
+	from          int
+	to            int
 }
 
 type scaleDownNodeInfo struct {
-	name        string
-	nodeGroup   string
-	evictedPods []*apiv1.Pod
-	utilInfo    utilization.Info
+	name        string
+	nodeGroup   string
+	evictedPods []*apiv1.Pod
+	utilInfo    utilization.Info
 }
 
 type scaleDownStatusInfo struct {
-	result          status.ScaleDownResult
-	scaledDownNodes []scaleDownNodeInfo
+	result          status.ScaleDownResult
+	scaledDownNodes []scaleDownNodeInfo
 }
 
 type startDeletionTestCase struct {
-	defaultOnly bool // Set to true to only run default deletion logic tests.
-	forcedOnly  bool // Set to true to only run forced deletion logic tests.
-	nodeGroups            map[string]*testprovider.TestNodeGroup
-	emptyNodes            []nodeGroupViewInfo
-	drainNodes            []nodeGroupViewInfo
-	pods                  map[string][]*apiv1.Pod
-	failedPodDrain        map[string]bool
-	failedNodeDeletion    map[string]bool
-	failedNodeTaint       map[string]bool
-	wantStatus            scaleDownStatusInfo
-	wantErr               error
-	wantDeletedPods       []string
-	wantDeletedNodes      []string
-	wantTaintUpdates      map[string][][]apiv1.Taint
-	wantNodeDeleteResults map[string]status.NodeDeleteResult
+	defaultOnly           bool // Set to true to only run default deletion logic tests.
+	forcedOnly            bool // Set to true to only run forced deletion logic tests.
+	nodeGroups            map[string]*testprovider.TestNodeGroup
+	emptyNodes            []nodeGroupViewInfo
+	drainNodes            []nodeGroupViewInfo
+	pods                  map[string][]*apiv1.Pod
+	failedPodDrain        map[string]bool
+	failedNodeDeletion    map[string]bool
+	failedNodeTaint       map[string]bool
+	wantStatus            scaleDownStatusInfo
+	wantErr               error
+	wantDeletedPods       []string
+	wantDeletedNodes      []string
+	wantTaintUpdates      map[string][][]apiv1.Taint
+	wantNodeDeleteResults map[string]status.NodeDeleteResult
 }
 
+// fakeLatencyTracker implements the same interface as NodeLatencyTracker.
+type fakeLatencyTracker struct {
+	ObservedNodes []string
+}
+
+// ObserveDeletion simply records the node name.
+func (f *fakeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) {
+	f.ObservedNodes = append(f.ObservedNodes, nodeName)
+}
+func (f *fakeLatencyTracker) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) {
+}
+func (f *fakeLatencyTracker) UpdateThreshold(nodeName string, threshold time.Duration) {}
+
 func getStartDeletionTestCases(ignoreDaemonSetsUtilization bool, force bool, suffix string) map[string]startDeletionTestCase {
-	toBeDeletedTaint := apiv1.Taint{Key: taints.ToBeDeletedTaint, Effect: apiv1.TaintEffectNoSchedule}
-
-	dsUtilInfo := generateUtilInfo(2./8., 2./8.)
-
-	if ignoreDaemonSetsUtilization {
-		dsUtilInfo = generateUtilInfo(0./8., 0./8.)
- } - - testCases := map[string]startDeletionTestCase{ - "nothing to delete": { - emptyNodes: nil, - drainNodes: nil, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNoNodeDeleted, - }, - }, - "empty node deletion": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"test", 0, 2}, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - utilInfo: generateUtilInfo(0, 0), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "empty atomic node deletion": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"atomic-2", 0, 2}, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "atomic-2-node-0", - nodeGroup: "atomic-2", - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-2-node-1", - nodeGroup: "atomic-2", - utilInfo: generateUtilInfo(0, 0), - }, - }, - }, - wantDeletedNodes: []string{ - "atomic-2-node-0", - "atomic-2-node-1", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-2-node-0": { - {toBeDeletedTaint}, - }, - "atomic-2-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "atomic-2-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-2-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "deletion with drain": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{ - {"test", 0, 2}, - }, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(2, "test-node-0"), - "test-node-1": removablePods(2, "test-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-0"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-1"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantDeletedPods: []string{"test-node-0-pod-0", "test-node-0-pod-1", "test-node-1-pod-0", "test-node-1-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "empty and drain deletion work correctly together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: 
[]nodeGroupViewInfo{ - {"test", 0, 2}, - }, - drainNodes: []nodeGroupViewInfo{ - {"test", 2, 4}, - }, - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": removablePods(2, "test-node-3"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-3"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{ - "test-node-0", - "test-node-1", - "test-node-2", "test-node-3"}, - wantDeletedPods: []string{"test-node-2-pod-0", "test-node-2-pod-1", "test-node-3-pod-0", "test-node-3-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "two atomic groups can be scaled down together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "atomic-2-mixed": sizedNodeGroup("atomic-2-mixed", 2, true, ignoreDaemonSetsUtilization), - "atomic-2-drain": sizedNodeGroup("atomic-2-drain", 2, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"atomic-2-mixed", 1, 2}, - }, - drainNodes: []nodeGroupViewInfo{ - {"atomic-2-mixed", 0, 1}, - {"atomic-2-drain", 0, 2}, - }, - pods: map[string][]*apiv1.Pod{ - "atomic-2-mixed-node-0": removablePods(2, "atomic-2-mixed-node-0"), - "atomic-2-drain-node-0": removablePods(1, "atomic-2-drain-node-0"), - "atomic-2-drain-node-1": removablePods(2, "atomic-2-drain-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "atomic-2-mixed-node-1", - nodeGroup: "atomic-2-mixed", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-2-mixed-node-0", - nodeGroup: "atomic-2-mixed", - evictedPods: removablePods(2, "atomic-2-mixed-node-0"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "atomic-2-drain-node-0", - nodeGroup: "atomic-2-drain", - evictedPods: removablePods(1, "atomic-2-drain-node-0"), - utilInfo: generateUtilInfo(1./8., 1./8.), - }, - { - name: "atomic-2-drain-node-1", - nodeGroup: "atomic-2-drain", - evictedPods: removablePods(2, "atomic-2-drain-node-1"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"atomic-2-mixed-node-0", "atomic-2-mixed-node-1", "atomic-2-drain-node-0", "atomic-2-drain-node-1"}, - wantDeletedPods: []string{"atomic-2-mixed-node-0-pod-0", "atomic-2-mixed-node-0-pod-1", "atomic-2-drain-node-0-pod-0", "atomic-2-drain-node-1-pod-0", "atomic-2-drain-node-1-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-2-mixed-node-0": { - {toBeDeletedTaint}, - }, - "atomic-2-mixed-node-1": { - {toBeDeletedTaint}, - }, - 
"atomic-2-drain-node-0": { - {toBeDeletedTaint}, - }, - "atomic-2-drain-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "atomic-2-mixed-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-2-mixed-node-1": {ResultType: status.NodeDeleteOk}, - "atomic-2-drain-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-2-drain-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "atomic empty and drain deletion work correctly together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"atomic-4", 0, 2}, - }, - drainNodes: []nodeGroupViewInfo{ - {"atomic-4", 2, 4}, - }, - pods: map[string][]*apiv1.Pod{ - "atomic-4-node-2": removablePods(2, "atomic-4-node-2"), - "atomic-4-node-3": removablePods(2, "atomic-4-node-3"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "atomic-4-node-0", - nodeGroup: "atomic-4", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-4-node-1", - nodeGroup: "atomic-4", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-4-node-2", - nodeGroup: "atomic-4", - evictedPods: removablePods(2, "atomic-4-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "atomic-4-node-3", - nodeGroup: "atomic-4", - evictedPods: removablePods(2, "atomic-4-node-3"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"atomic-4-node-0", "atomic-4-node-1", "atomic-4-node-2", "atomic-4-node-3"}, - wantDeletedPods: []string{"atomic-4-node-2-pod-0", "atomic-4-node-2-pod-1", "atomic-4-node-3-pod-0", "atomic-4-node-3-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-4-node-0": { - {toBeDeletedTaint}, - }, - "atomic-4-node-1": { - {toBeDeletedTaint}, - }, - "atomic-4-node-2": { - {toBeDeletedTaint}, - }, - "atomic-4-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "atomic-4-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-4-node-1": {ResultType: status.NodeDeleteOk}, - "atomic-4-node-2": {ResultType: status.NodeDeleteOk}, - "atomic-4-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "failure to taint empty node stops deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 4}}, - drainNodes: []nodeGroupViewInfo{{"test", 4, 5}}, - pods: map[string][]*apiv1.Pod{ - "test-node-4": removablePods(2, "test-node-4"), - }, - failedNodeTaint: map[string]bool{"test-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - {}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - "test-node-3": { - {toBeDeletedTaint}, - {}, - }, - }, - wantErr: cmpopts.AnyError, - }, - "failure to taint empty atomic node stops deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"atomic-4", 0, 4}}, - drainNodes: []nodeGroupViewInfo{{"test", 
4, 5}}, - pods: map[string][]*apiv1.Pod{ - "test-node-4": removablePods(2, "test-node-4"), - }, - failedNodeTaint: map[string]bool{"atomic-4-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-4-node-0": { - {toBeDeletedTaint}, - {}, - }, - "atomic-4-node-1": { - {toBeDeletedTaint}, - {}, - }, - "atomic-4-node-3": { - {toBeDeletedTaint}, - {}, - }, - }, - wantErr: cmpopts.AnyError, - }, - "failure to taint drain node stops further deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, //generateNodeGroupViewList(testNg, 0, 2), - drainNodes: []nodeGroupViewInfo{{"test", 2, 6}}, //generateNodeGroupViewList(testNg, 2, 6), - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": removablePods(2, "test-node-3"), - "test-node-4": removablePods(2, "test-node-4"), - "test-node-5": removablePods(2, "test-node-5"), - }, - failedNodeTaint: map[string]bool{"test-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - wantErr: cmpopts.AnyError, - }, - "failure to taint drain atomic node stops further deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - "atomic-6": sizedNodeGroup("atomic-6", 6, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - drainNodes: []nodeGroupViewInfo{{"atomic-6", 0, 6}}, - pods: map[string][]*apiv1.Pod{ - "atomic-6-node-0": removablePods(2, "atomic-6-node-0"), - "atomic-6-node-1": removablePods(2, "atomic-6-node-1"), - "atomic-6-node-2": removablePods(2, "atomic-6-node-2"), - "atomic-6-node-3": removablePods(2, "atomic-6-node-3"), - "atomic-6-node-4": removablePods(2, "atomic-6-node-4"), - "atomic-6-node-5": removablePods(2, "atomic-6-node-5"), - }, - failedNodeTaint: map[string]bool{"atomic-6-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - wantErr: cmpopts.AnyError, - }, - "nodes 
that failed drain are correctly reported in results": { - defaultOnly: true, - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(3, "test-node-0"), - "test-node-1": removablePods(3, "test-node-1"), - "test-node-2": removablePods(3, "test-node-2"), - "test-node-3": removablePods(3, "test-node-3"), - }, - failedPodDrain: map[string]bool{ - "test-node-0-pod-0": true, - "test-node-0-pod-1": true, - "test-node-2-pod-1": true, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-0"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-1"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-2"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-3"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-1", "test-node-3"}, - wantDeletedPods: []string{ - "test-node-0-pod-2", - "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", - "test-node-2-pod-0", "test-node-2-pod-2", - "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - {}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - {}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": { - ResultType: status.NodeDeleteErrorFailedToEvictPods, - Err: cmpopts.AnyError, - PodEvictionResults: map[string]status.PodEvictionResult{ - "test-node-0-pod-0": {Pod: removablePod("test-node-0-pod-0", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, - "test-node-0-pod-1": {Pod: removablePod("test-node-0-pod-1", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, - "test-node-0-pod-2": {Pod: removablePod("test-node-0-pod-2", "test-node-0")}, - }, - }, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": { - ResultType: status.NodeDeleteErrorFailedToEvictPods, - Err: cmpopts.AnyError, - PodEvictionResults: map[string]status.PodEvictionResult{ - "test-node-2-pod-0": {Pod: removablePod("test-node-2-pod-0", "test-node-2")}, - "test-node-2-pod-1": {Pod: removablePod("test-node-2-pod-1", "test-node-2"), Err: cmpopts.AnyError, TimedOut: true}, - "test-node-2-pod-2": {Pod: removablePod("test-node-2-pod-2", "test-node-2")}, - }, - }, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "nodes that failed drain are forcefully deleted": { - forcedOnly: true, - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(3, "test-node-0"), - "test-node-1": removablePods(3, "test-node-1"), - "test-node-2": removablePods(3, "test-node-2"), - "test-node-3": removablePods(3, "test-node-3"), - }, - failedPodDrain: map[string]bool{ - "test-node-0-pod-0": true, - 
"test-node-0-pod-1": true, - "test-node-2-pod-1": true, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-0"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-1"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-2"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-3"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, - wantDeletedPods: []string{ - "test-node-0-pod-2", - "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", - "test-node-2-pod-0", "test-node-2-pod-2", - "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "nodes that failed deletion are correctly reported in results": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": removablePods(2, "test-node-3"), - }, - failedNodeDeletion: map[string]bool{ - "test-node-1": true, - "test-node-3": true, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-3"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-2"}, - wantDeletedPods: []string{ - "test-node-2-pod-0", "test-node-2-pod-1", - "test-node-3-pod-0", "test-node-3-pod-1", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - {}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: 
cmpopts.AnyError}, - }, - }, - "DS pods are evicted from empty nodes, but don't block deletion on error": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": generateDsPods(2, "test-node-0"), - "test-node-1": generateDsPods(2, "test-node-1"), - }, - failedPodDrain: map[string]bool{"test-node-1-ds-pod-0": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantDeletedPods: []string{"test-node-0-ds-pod-0", "test-node-0-ds-pod-1", "test-node-1-ds-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "DS pods and deletion with drain": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": generateDsPods(2, "test-node-0"), - "test-node-1": generateDsPods(2, "test-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - // this is nil because DaemonSetEvictionForOccupiedNodes is - // not enabled for drained nodes in this test suite - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - { - name: "test-node-1", - nodeGroup: "test", - // this is nil because DaemonSetEvictionForOccupiedNodes is - // not enabled for drained nodes in this test suite - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - // same as evicted pods - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "DS pods and empty and drain deletion work correctly together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": generateDsPods(2, "test-node-3"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-2"), - utilInfo: 
generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, - // same as evicted pods - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "nodes with pods are not deleted if the node is passed as empty": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(2, "test-node-0"), - "test-node-1": removablePods(2, "test-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: nil, - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - {}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - "test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - }, - }, - "atomic nodes with pods are not deleted if the node is passed as empty": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}, {"atomic-2", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-1": removablePods(2, "test-node-1"), - "atomic-2-node-1": removablePods(2, "atomic-2-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "atomic-2-node-0", - nodeGroup: "atomic-2", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-2-node-1", - nodeGroup: "atomic-2", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0"}, - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - "atomic-2-node-0": { - {toBeDeletedTaint}, - {}, - }, - "atomic-2-node-1": { - {toBeDeletedTaint}, - {}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - 
"test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - "atomic-2-node-0": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, - "atomic-2-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - }, - }, - } - - filteredTestCases := map[string]startDeletionTestCase{} - for k, v := range testCases { - if force && v.defaultOnly { - continue - } - if !force && v.forcedOnly { - continue - } - filteredTestCases[k+" "+suffix] = v - } - - return filteredTestCases + toBeDeletedTaint := apiv1.Taint{Key: taints.ToBeDeletedTaint, Effect: apiv1.TaintEffectNoSchedule} + + dsUtilInfo := generateUtilInfo(2./8., 2./8.) + + if ignoreDaemonSetsUtilization { + dsUtilInfo = generateUtilInfo(0./8., 0./8.) + } + + testCases := map[string]startDeletionTestCase{ + "nothing to delete": { + emptyNodes: nil, + drainNodes: nil, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNoNodeDeleted, + }, + }, + "empty node deletion": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"test", 0, 2}, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "empty atomic node deletion": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"atomic-2", 0, 2}, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "atomic-2-node-0", + nodeGroup: "atomic-2", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-2-node-1", + nodeGroup: "atomic-2", + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{ + "atomic-2-node-0", + "atomic-2-node-1", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-2-node-0": { + {toBeDeletedTaint}, + }, + "atomic-2-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "atomic-2-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-2-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "deletion with drain": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{ + {"test", 0, 2}, + }, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(2, "test-node-0"), + "test-node-1": removablePods(2, "test-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-0"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: removablePods(2, 
"test-node-1"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantDeletedPods: []string{"test-node-0-pod-0", "test-node-0-pod-1", "test-node-1-pod-0", "test-node-1-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "empty and drain deletion work correctly together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"test", 0, 2}, + }, + drainNodes: []nodeGroupViewInfo{ + {"test", 2, 4}, + }, + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": removablePods(2, "test-node-3"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-3"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{ + "test-node-0", + "test-node-1", + "test-node-2", "test-node-3"}, + wantDeletedPods: []string{"test-node-2-pod-0", "test-node-2-pod-1", "test-node-3-pod-0", "test-node-3-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "two atomic groups can be scaled down together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "atomic-2-mixed": sizedNodeGroup("atomic-2-mixed", 2, true, ignoreDaemonSetsUtilization), + "atomic-2-drain": sizedNodeGroup("atomic-2-drain", 2, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"atomic-2-mixed", 1, 2}, + }, + drainNodes: []nodeGroupViewInfo{ + {"atomic-2-mixed", 0, 1}, + {"atomic-2-drain", 0, 2}, + }, + pods: map[string][]*apiv1.Pod{ + "atomic-2-mixed-node-0": removablePods(2, "atomic-2-mixed-node-0"), + "atomic-2-drain-node-0": removablePods(1, "atomic-2-drain-node-0"), + "atomic-2-drain-node-1": removablePods(2, "atomic-2-drain-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "atomic-2-mixed-node-1", + nodeGroup: "atomic-2-mixed", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-2-mixed-node-0", + nodeGroup: "atomic-2-mixed", + evictedPods: removablePods(2, "atomic-2-mixed-node-0"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "atomic-2-drain-node-0", + nodeGroup: "atomic-2-drain", + evictedPods: 
removablePods(1, "atomic-2-drain-node-0"), + utilInfo: generateUtilInfo(1./8., 1./8.), + }, + { + name: "atomic-2-drain-node-1", + nodeGroup: "atomic-2-drain", + evictedPods: removablePods(2, "atomic-2-drain-node-1"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"atomic-2-mixed-node-0", "atomic-2-mixed-node-1", "atomic-2-drain-node-0", "atomic-2-drain-node-1"}, + wantDeletedPods: []string{"atomic-2-mixed-node-0-pod-0", "atomic-2-mixed-node-0-pod-1", "atomic-2-drain-node-0-pod-0", "atomic-2-drain-node-1-pod-0", "atomic-2-drain-node-1-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-2-mixed-node-0": { + {toBeDeletedTaint}, + }, + "atomic-2-mixed-node-1": { + {toBeDeletedTaint}, + }, + "atomic-2-drain-node-0": { + {toBeDeletedTaint}, + }, + "atomic-2-drain-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "atomic-2-mixed-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-2-mixed-node-1": {ResultType: status.NodeDeleteOk}, + "atomic-2-drain-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-2-drain-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "atomic empty and drain deletion work correctly together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"atomic-4", 0, 2}, + }, + drainNodes: []nodeGroupViewInfo{ + {"atomic-4", 2, 4}, + }, + pods: map[string][]*apiv1.Pod{ + "atomic-4-node-2": removablePods(2, "atomic-4-node-2"), + "atomic-4-node-3": removablePods(2, "atomic-4-node-3"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "atomic-4-node-0", + nodeGroup: "atomic-4", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-4-node-1", + nodeGroup: "atomic-4", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-4-node-2", + nodeGroup: "atomic-4", + evictedPods: removablePods(2, "atomic-4-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "atomic-4-node-3", + nodeGroup: "atomic-4", + evictedPods: removablePods(2, "atomic-4-node-3"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"atomic-4-node-0", "atomic-4-node-1", "atomic-4-node-2", "atomic-4-node-3"}, + wantDeletedPods: []string{"atomic-4-node-2-pod-0", "atomic-4-node-2-pod-1", "atomic-4-node-3-pod-0", "atomic-4-node-3-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-4-node-0": { + {toBeDeletedTaint}, + }, + "atomic-4-node-1": { + {toBeDeletedTaint}, + }, + "atomic-4-node-2": { + {toBeDeletedTaint}, + }, + "atomic-4-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "atomic-4-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-4-node-1": {ResultType: status.NodeDeleteOk}, + "atomic-4-node-2": {ResultType: status.NodeDeleteOk}, + "atomic-4-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "failure to taint empty node stops deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 4}}, + drainNodes: []nodeGroupViewInfo{{"test", 4, 5}}, + pods: map[string][]*apiv1.Pod{ + "test-node-4": removablePods(2, "test-node-4"), + }, + failedNodeTaint: 
map[string]bool{"test-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + {}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + "test-node-3": { + {toBeDeletedTaint}, + {}, + }, + }, + wantErr: cmpopts.AnyError, + }, + "failure to taint empty atomic node stops deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"atomic-4", 0, 4}}, + drainNodes: []nodeGroupViewInfo{{"test", 4, 5}}, + pods: map[string][]*apiv1.Pod{ + "test-node-4": removablePods(2, "test-node-4"), + }, + failedNodeTaint: map[string]bool{"atomic-4-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-4-node-0": { + {toBeDeletedTaint}, + {}, + }, + "atomic-4-node-1": { + {toBeDeletedTaint}, + {}, + }, + "atomic-4-node-3": { + {toBeDeletedTaint}, + {}, + }, + }, + wantErr: cmpopts.AnyError, + }, + "failure to taint drain node stops further deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, //generateNodeGroupViewList(testNg, 0, 2), + drainNodes: []nodeGroupViewInfo{{"test", 2, 6}}, //generateNodeGroupViewList(testNg, 2, 6), + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": removablePods(2, "test-node-3"), + "test-node-4": removablePods(2, "test-node-4"), + "test-node-5": removablePods(2, "test-node-5"), + }, + failedNodeTaint: map[string]bool{"test-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + wantErr: cmpopts.AnyError, + }, + "failure to taint drain atomic node stops further deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + "atomic-6": sizedNodeGroup("atomic-6", 6, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + drainNodes: []nodeGroupViewInfo{{"atomic-6", 0, 6}}, + pods: map[string][]*apiv1.Pod{ + "atomic-6-node-0": removablePods(2, "atomic-6-node-0"), + "atomic-6-node-1": removablePods(2, "atomic-6-node-1"), + "atomic-6-node-2": removablePods(2, "atomic-6-node-2"), + "atomic-6-node-3": removablePods(2, "atomic-6-node-3"), + "atomic-6-node-4": removablePods(2, "atomic-6-node-4"), + "atomic-6-node-5": removablePods(2, "atomic-6-node-5"), + }, + failedNodeTaint: map[string]bool{"atomic-6-node-2": 
true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + wantErr: cmpopts.AnyError, + }, + "nodes that failed drain are correctly reported in results": { + defaultOnly: true, + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(3, "test-node-0"), + "test-node-1": removablePods(3, "test-node-1"), + "test-node-2": removablePods(3, "test-node-2"), + "test-node-3": removablePods(3, "test-node-3"), + }, + failedPodDrain: map[string]bool{ + "test-node-0-pod-0": true, + "test-node-0-pod-1": true, + "test-node-2-pod-1": true, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-0"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-1"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-2"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-3"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-1", "test-node-3"}, + wantDeletedPods: []string{ + "test-node-0-pod-2", + "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", + "test-node-2-pod-0", "test-node-2-pod-2", + "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + {}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + {}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": { + ResultType: status.NodeDeleteErrorFailedToEvictPods, + Err: cmpopts.AnyError, + PodEvictionResults: map[string]status.PodEvictionResult{ + "test-node-0-pod-0": {Pod: removablePod("test-node-0-pod-0", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, + "test-node-0-pod-1": {Pod: removablePod("test-node-0-pod-1", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, + "test-node-0-pod-2": {Pod: removablePod("test-node-0-pod-2", "test-node-0")}, + }, + }, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": { + ResultType: status.NodeDeleteErrorFailedToEvictPods, + Err: cmpopts.AnyError, + PodEvictionResults: map[string]status.PodEvictionResult{ + "test-node-2-pod-0": {Pod: removablePod("test-node-2-pod-0", "test-node-2")}, + "test-node-2-pod-1": {Pod: removablePod("test-node-2-pod-1", "test-node-2"), 
Err: cmpopts.AnyError, TimedOut: true}, + "test-node-2-pod-2": {Pod: removablePod("test-node-2-pod-2", "test-node-2")}, + }, + }, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "nodes that failed drain are forcefully deleted": { + forcedOnly: true, + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(3, "test-node-0"), + "test-node-1": removablePods(3, "test-node-1"), + "test-node-2": removablePods(3, "test-node-2"), + "test-node-3": removablePods(3, "test-node-3"), + }, + failedPodDrain: map[string]bool{ + "test-node-0-pod-0": true, + "test-node-0-pod-1": true, + "test-node-2-pod-1": true, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-0"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-1"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-2"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-3"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, + wantDeletedPods: []string{ + "test-node-0-pod-2", + "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", + "test-node-2-pod-0", "test-node-2-pod-2", + "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "nodes that failed deletion are correctly reported in results": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": removablePods(2, "test-node-3"), + }, + failedNodeDeletion: map[string]bool{ + "test-node-1": true, + "test-node-3": true, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-3"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: 
[]string{"test-node-0", "test-node-2"}, + wantDeletedPods: []string{ + "test-node-2-pod-0", "test-node-2-pod-1", + "test-node-3-pod-0", "test-node-3-pod-1", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + {}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, + }, + }, + "DS pods are evicted from empty nodes, but don't block deletion on error": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": generateDsPods(2, "test-node-0"), + "test-node-1": generateDsPods(2, "test-node-1"), + }, + failedPodDrain: map[string]bool{"test-node-1-ds-pod-0": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantDeletedPods: []string{"test-node-0-ds-pod-0", "test-node-0-ds-pod-1", "test-node-1-ds-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "DS pods and deletion with drain": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": generateDsPods(2, "test-node-0"), + "test-node-1": generateDsPods(2, "test-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + // this is nil because DaemonSetEvictionForOccupiedNodes is + // not enabled for drained nodes in this test suite + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + { + name: "test-node-1", + nodeGroup: "test", + // this is nil because DaemonSetEvictionForOccupiedNodes is + // not enabled for drained nodes in this test suite + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + // same as evicted pods + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "DS pods and empty and drain deletion work correctly together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, 
ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": generateDsPods(2, "test-node-3"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, + // same as evicted pods + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "nodes with pods are not deleted if the node is passed as empty": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(2, "test-node-0"), + "test-node-1": removablePods(2, "test-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: nil, + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + {}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + "test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + }, + }, + "atomic nodes with pods are not deleted if the node is passed as empty": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}, {"atomic-2", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-1": removablePods(2, "test-node-1"), + "atomic-2-node-1": removablePods(2, "atomic-2-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: 
generateUtilInfo(2./8., 2./8.), + }, + { + name: "atomic-2-node-0", + nodeGroup: "atomic-2", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-2-node-1", + nodeGroup: "atomic-2", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0"}, + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + "atomic-2-node-0": { + {toBeDeletedTaint}, + {}, + }, + "atomic-2-node-1": { + {toBeDeletedTaint}, + {}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + "atomic-2-node-0": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, + "atomic-2-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + }, + }, + } + + filteredTestCases := map[string]startDeletionTestCase{} + for k, v := range testCases { + if force && v.defaultOnly { + continue + } + if !force && v.forcedOnly { + continue + } + filteredTestCases[k+" "+suffix] = v + } + + return filteredTestCases } func runStartDeletionTest(t *testing.T, tc startDeletionTestCase, force bool) { - // Insert all nodes into a map to support live node updates and GETs. - emptyNodeGroupViews, drainNodeGroupViews := []*budgets.NodeGroupView{}, []*budgets.NodeGroupView{} - allEmptyNodes, allDrainNodes := []*apiv1.Node{}, []*apiv1.Node{} - nodesByName := make(map[string]*apiv1.Node) - nodesLock := sync.Mutex{} - for _, ngvInfo := range tc.emptyNodes { - ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) - emptyNodeGroupViews = append(emptyNodeGroupViews, ngv...) - } - for _, bucket := range emptyNodeGroupViews { - allEmptyNodes = append(allEmptyNodes, bucket.Nodes...) - for _, node := range bucket.Nodes { - nodesByName[node.Name] = node - } - } - - for _, ngvInfo := range tc.drainNodes { - ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) - drainNodeGroupViews = append(drainNodeGroupViews, ngv...) - } - for _, bucket := range drainNodeGroupViews { - allDrainNodes = append(allDrainNodes, bucket.Nodes...) - for _, node := range bucket.Nodes { - nodesByName[node.Name] = node - } - } - - // Set up a fake k8s client to hook and verify certain actions. - fakeClient := &fake.Clientset{} - type nodeTaints struct { - nodeName string - taints []apiv1.Taint - } - taintUpdates := make(chan nodeTaints, 20) - deletedNodes := make(chan string, 10) - deletedPods := make(chan string, 10) - - ds := generateDaemonSet() - - // We're faking the whole k8s client, and some of the code needs to get live nodes and pods, so GET on nodes and pods has to be set up. 
- fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { - nodesLock.Lock() - defer nodesLock.Unlock() - getAction := action.(core.GetAction) - node, found := nodesByName[getAction.GetName()] - if !found { - return true, nil, fmt.Errorf("node %q not found", getAction.GetName()) - } - return true, node, nil - }) - fakeClient.Fake.AddReactor("get", "pods", - func(action core.Action) (bool, runtime.Object, error) { - return true, nil, errors.NewNotFound(apiv1.Resource("pod"), "whatever") - }) - // Hook node update to gather all taint updates, and to fail the update for certain nodes to simulate errors. - fakeClient.Fake.AddReactor("update", "nodes", - func(action core.Action) (bool, runtime.Object, error) { - nodesLock.Lock() - defer nodesLock.Unlock() - update := action.(core.UpdateAction) - obj := update.GetObject().(*apiv1.Node) - if tc.failedNodeTaint[obj.Name] { - return true, nil, fmt.Errorf("SIMULATED ERROR: won't taint") - } - nt := nodeTaints{ - nodeName: obj.Name, - } - for _, taint := range obj.Spec.Taints { - nt.taints = append(nt.taints, taint) - } - taintUpdates <- nt - nodesByName[obj.Name] = obj.DeepCopy() - return true, obj, nil - }) - // Hook eviction creation to gather which pods were evicted, and to fail the eviction for certain pods to simulate errors. - fakeClient.Fake.AddReactor("create", "pods", - func(action core.Action) (bool, runtime.Object, error) { - createAction := action.(core.CreateAction) - if createAction == nil { - return false, nil, nil - } - eviction := createAction.GetObject().(*policyv1beta1.Eviction) - if eviction == nil { - return false, nil, nil - } - if tc.failedPodDrain[eviction.Name] { - return true, nil, fmt.Errorf("SIMULATED ERROR: won't evict") - } - deletedPods <- eviction.Name - return true, nil, nil - }) - - // Hook node deletion at the level of cloud provider, to gather which nodes were deleted, and to fail the deletion for - // certain nodes to simulate errors. - provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroup string, node string) error { - if tc.failedNodeDeletion[node] { - return fmt.Errorf("SIMULATED ERROR: won't remove node") - } - deletedNodes <- node - return nil - }).Build() - for _, bucket := range emptyNodeGroupViews { - bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) - provider.InsertNodeGroup(bucket.Group) - for _, node := range bucket.Nodes { - provider.AddNode(bucket.Group.Id(), node) - } - } - for _, bucket := range drainNodeGroupViews { - bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) - provider.InsertNodeGroup(bucket.Group) - for _, node := range bucket.Nodes { - provider.AddNode(bucket.Group.Id(), node) - } - } - - // Set up other needed structures and options. - opts := config.AutoscalingOptions{ - MaxScaleDownParallelism: 10, - MaxDrainParallelism: 5, - MaxPodEvictionTime: 0, - DaemonSetEvictionForEmptyNodes: true, - } - - allPods := []*apiv1.Pod{} - - for _, pods := range tc.pods { - allPods = append(allPods, pods...) 
- } - - podLister := kube_util.NewTestPodLister(allPods) - pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{}) - dsLister, err := kube_util.NewTestDaemonSetLister([]*appsv1.DaemonSet{ds}) - if err != nil { - t.Fatalf("Couldn't create daemonset lister") - } - - registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, dsLister, nil, nil, nil, nil) - ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil) - if err != nil { - t.Fatalf("Couldn't set up autoscaling context: %v", err) - } - csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker()) - for _, bucket := range emptyNodeGroupViews { - for _, node := range bucket.Nodes { - err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, tc.pods[node.Name]...)) - if err != nil { - t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) - } - } - } - for _, bucket := range drainNodeGroupViews { - for _, node := range bucket.Nodes { - pods, found := tc.pods[node.Name] - if !found { - t.Fatalf("Drain node %q doesn't have pods defined in the test case.", node.Name) - } - err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, pods...)) - if err != nil { - t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) - } - } - } - - wantScaleDownNodes := []*status.ScaleDownNode{} - for _, scaleDownNodeInfo := range tc.wantStatus.scaledDownNodes { - statusScaledDownNode := &status.ScaleDownNode{ - Node: generateNode(scaleDownNodeInfo.name), - NodeGroup: tc.nodeGroups[scaleDownNodeInfo.nodeGroup], - EvictedPods: scaleDownNodeInfo.evictedPods, - UtilInfo: scaleDownNodeInfo.utilInfo, - } - wantScaleDownNodes = append(wantScaleDownNodes, statusScaledDownNode) - } - - scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList() - scaleStateNotifier.Register(csr) - - // Create Actuator, run StartDeletion, and verify the error. - ndt := deletiontracker.NewNodeDeletionTracker(0) - ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, 0*time.Second) - legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, fullDsEviction: force} - actuator := Actuator{ - ctx: &ctx, nodeDeletionTracker: ndt, - nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), - budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), - configGetter: nodegroupconfig.NewDefaultNodeGroupConfigProcessor(ctx.NodeGroupDefaults), - nodeLatencyTracker: latencytracker.NewNodeLatencyTracker(), - } - - var gotResult status.ScaleDownResult - var gotScaleDownNodes []*status.ScaleDownNode - var gotErr error - if force { - gotResult, gotScaleDownNodes, gotErr = actuator.StartForceDeletion(allEmptyNodes, allDrainNodes) - } else { - gotResult, gotScaleDownNodes, gotErr = actuator.StartDeletion(allEmptyNodes, allDrainNodes) - } - - if diff := cmp.Diff(tc.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { - t.Errorf("StartDeletion error diff (-want +got):\n%s", diff) - } - - // Verify ScaleDownResult looks as expected. 
- if diff := cmp.Diff(tc.wantStatus.result, gotResult); diff != "" { - t.Errorf("StartDeletion result diff (-want +got):\n%s", diff) - } - - // Verify ScaleDownNodes looks as expected. - ignoreSdNodeOrder := cmpopts.SortSlices(func(a, b *status.ScaleDownNode) bool { return a.Node.Name < b.Node.Name }) - cmpNg := cmp.Comparer(func(a, b *testprovider.TestNodeGroup) bool { return a.Id() == b.Id() }) - statusCmpOpts := cmp.Options{ignoreSdNodeOrder, cmpNg, cmpopts.EquateEmpty()} - if diff := cmp.Diff(wantScaleDownNodes, gotScaleDownNodes, statusCmpOpts); diff != "" { - t.Errorf("StartDeletion scaled down nodes diff (-want +got):\n%s", diff) - } - - // Verify that all expected nodes were deleted using the cloud provider hook. - var gotDeletedNodes []string + // Insert all nodes into a map to support live node updates and GETs. + emptyNodeGroupViews, drainNodeGroupViews := []*budgets.NodeGroupView{}, []*budgets.NodeGroupView{} + allEmptyNodes, allDrainNodes := []*apiv1.Node{}, []*apiv1.Node{} + nodesByName := make(map[string]*apiv1.Node) + nodesLock := sync.Mutex{} + for _, ngvInfo := range tc.emptyNodes { + ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) + emptyNodeGroupViews = append(emptyNodeGroupViews, ngv...) + } + for _, bucket := range emptyNodeGroupViews { + allEmptyNodes = append(allEmptyNodes, bucket.Nodes...) + for _, node := range bucket.Nodes { + nodesByName[node.Name] = node + } + } + + for _, ngvInfo := range tc.drainNodes { + ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) + drainNodeGroupViews = append(drainNodeGroupViews, ngv...) + } + for _, bucket := range drainNodeGroupViews { + allDrainNodes = append(allDrainNodes, bucket.Nodes...) + for _, node := range bucket.Nodes { + nodesByName[node.Name] = node + } + } + + // Set up a fake k8s client to hook and verify certain actions. + fakeClient := &fake.Clientset{} + type nodeTaints struct { + nodeName string + taints []apiv1.Taint + } + taintUpdates := make(chan nodeTaints, 20) + deletedNodes := make(chan string, 10) + deletedPods := make(chan string, 10) + + ds := generateDaemonSet() + + // We're faking the whole k8s client, and some of the code needs to get live nodes and pods, so GET on nodes and pods has to be set up. + fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { + nodesLock.Lock() + defer nodesLock.Unlock() + getAction := action.(core.GetAction) + node, found := nodesByName[getAction.GetName()] + if !found { + return true, nil, fmt.Errorf("node %q not found", getAction.GetName()) + } + return true, node, nil + }) + fakeClient.Fake.AddReactor("get", "pods", + func(action core.Action) (bool, runtime.Object, error) { + return true, nil, errors.NewNotFound(apiv1.Resource("pod"), "whatever") + }) + // Hook node update to gather all taint updates, and to fail the update for certain nodes to simulate errors. 
+ fakeClient.Fake.AddReactor("update", "nodes", + func(action core.Action) (bool, runtime.Object, error) { + nodesLock.Lock() + defer nodesLock.Unlock() + update := action.(core.UpdateAction) + obj := update.GetObject().(*apiv1.Node) + if tc.failedNodeTaint[obj.Name] { + return true, nil, fmt.Errorf("SIMULATED ERROR: won't taint") + } + nt := nodeTaints{ + nodeName: obj.Name, + } + for _, taint := range obj.Spec.Taints { + nt.taints = append(nt.taints, taint) + } + taintUpdates <- nt + nodesByName[obj.Name] = obj.DeepCopy() + return true, obj, nil + }) + // Hook eviction creation to gather which pods were evicted, and to fail the eviction for certain pods to simulate errors. + fakeClient.Fake.AddReactor("create", "pods", + func(action core.Action) (bool, runtime.Object, error) { + createAction := action.(core.CreateAction) + if createAction == nil { + return false, nil, nil + } + eviction := createAction.GetObject().(*policyv1beta1.Eviction) + if eviction == nil { + return false, nil, nil + } + if tc.failedPodDrain[eviction.Name] { + return true, nil, fmt.Errorf("SIMULATED ERROR: won't evict") + } + deletedPods <- eviction.Name + return true, nil, nil + }) + + // Hook node deletion at the level of cloud provider, to gather which nodes were deleted, and to fail the deletion for + // certain nodes to simulate errors. + provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroup string, node string) error { + if tc.failedNodeDeletion[node] { + return fmt.Errorf("SIMULATED ERROR: won't remove node") + } + deletedNodes <- node + return nil + }).Build() + for _, bucket := range emptyNodeGroupViews { + bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) + provider.InsertNodeGroup(bucket.Group) + for _, node := range bucket.Nodes { + provider.AddNode(bucket.Group.Id(), node) + } + } + for _, bucket := range drainNodeGroupViews { + bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) + provider.InsertNodeGroup(bucket.Group) + for _, node := range bucket.Nodes { + provider.AddNode(bucket.Group.Id(), node) + } + } + + // Set up other needed structures and options. + opts := config.AutoscalingOptions{ + MaxScaleDownParallelism: 10, + MaxDrainParallelism: 5, + MaxPodEvictionTime: 0, + DaemonSetEvictionForEmptyNodes: true, + } + + allPods := []*apiv1.Pod{} + + for _, pods := range tc.pods { + allPods = append(allPods, pods...) 
+ } + + podLister := kube_util.NewTestPodLister(allPods) + pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{}) + dsLister, err := kube_util.NewTestDaemonSetLister([]*appsv1.DaemonSet{ds}) + if err != nil { + t.Fatalf("Couldn't create daemonset lister") + } + + registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, dsLister, nil, nil, nil, nil) + ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil) + if err != nil { + t.Fatalf("Couldn't set up autoscaling context: %v", err) + } + csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker()) + for _, bucket := range emptyNodeGroupViews { + for _, node := range bucket.Nodes { + err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, tc.pods[node.Name]...)) + if err != nil { + t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) + } + } + } + for _, bucket := range drainNodeGroupViews { + for _, node := range bucket.Nodes { + pods, found := tc.pods[node.Name] + if !found { + t.Fatalf("Drain node %q doesn't have pods defined in the test case.", node.Name) + } + err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, pods...)) + if err != nil { + t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) + } + } + } + + wantScaleDownNodes := []*status.ScaleDownNode{} + for _, scaleDownNodeInfo := range tc.wantStatus.scaledDownNodes { + statusScaledDownNode := &status.ScaleDownNode{ + Node: generateNode(scaleDownNodeInfo.name), + NodeGroup: tc.nodeGroups[scaleDownNodeInfo.nodeGroup], + EvictedPods: scaleDownNodeInfo.evictedPods, + UtilInfo: scaleDownNodeInfo.utilInfo, + } + wantScaleDownNodes = append(wantScaleDownNodes, statusScaledDownNode) + } + + scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList() + scaleStateNotifier.Register(csr) + + // Create Actuator, run StartDeletion, and verify the error. + ndt := deletiontracker.NewNodeDeletionTracker(0) + ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, 0*time.Second) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, fullDsEviction: force} + fakeNodeLatencyTracker := &fakeLatencyTracker{} + actuator := Actuator{ + ctx: &ctx, nodeDeletionTracker: ndt, + nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), + budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), + configGetter: nodegroupconfig.NewDefaultNodeGroupConfigProcessor(ctx.NodeGroupDefaults), + nodeLatencyTracker: fakeNodeLatencyTracker, + } + + var gotResult status.ScaleDownResult + var gotScaleDownNodes []*status.ScaleDownNode + var gotErr error + if force { + gotResult, gotScaleDownNodes, gotErr = actuator.StartForceDeletion(allEmptyNodes, allDrainNodes) + } else { + gotResult, gotScaleDownNodes, gotErr = actuator.StartDeletion(allEmptyNodes, allDrainNodes) + } + + if diff := cmp.Diff(tc.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { + t.Errorf("StartDeletion error diff (-want +got):\n%s", diff) + } + + // Verify ScaleDownResult looks as expected. 
+ if diff := cmp.Diff(tc.wantStatus.result, gotResult); diff != "" { + t.Errorf("StartDeletion result diff (-want +got):\n%s", diff) + } + + // Verify ScaleDownNodes looks as expected. + ignoreSdNodeOrder := cmpopts.SortSlices(func(a, b *status.ScaleDownNode) bool { return a.Node.Name < b.Node.Name }) + cmpNg := cmp.Comparer(func(a, b *testprovider.TestNodeGroup) bool { return a.Id() == b.Id() }) + statusCmpOpts := cmp.Options{ignoreSdNodeOrder, cmpNg, cmpopts.EquateEmpty()} + if diff := cmp.Diff(wantScaleDownNodes, gotScaleDownNodes, statusCmpOpts); diff != "" { + t.Errorf("StartDeletion scaled down nodes diff (-want +got):\n%s", diff) + } + + // Verify that all expected nodes were deleted using the cloud provider hook. + var gotDeletedNodes []string nodesLoop: - for i := 0; i < len(tc.wantDeletedNodes); i++ { - select { - case deletedNode := <-deletedNodes: - gotDeletedNodes = append(gotDeletedNodes, deletedNode) - case <-time.After(3 * time.Second): - t.Errorf("Timeout while waiting for deleted nodes.") - break nodesLoop - } - } - ignoreStrOrder := cmpopts.SortSlices(func(a, b string) bool { return a < b }) - if diff := cmp.Diff(tc.wantDeletedNodes, gotDeletedNodes, ignoreStrOrder); diff != "" { - t.Errorf("deletedNodes diff (-want +got):\n%s", diff) - } - - // Verify that all expected pods were deleted using the fake k8s client hook. - var gotDeletedPods []string + for i := 0; i < len(tc.wantDeletedNodes); i++ { + select { + case deletedNode := <-deletedNodes: + gotDeletedNodes = append(gotDeletedNodes, deletedNode) + case <-time.After(3 * time.Second): + t.Errorf("Timeout while waiting for deleted nodes.") + break nodesLoop + } + } + ignoreStrOrder := cmpopts.SortSlices(func(a, b string) bool { return a < b }) + if diff := cmp.Diff(tc.wantDeletedNodes, gotDeletedNodes, ignoreStrOrder); diff != "" { + t.Errorf("deletedNodes diff (-want +got):\n%s", diff) + } + + // Verify that all expected pods were deleted using the fake k8s client hook. + var gotDeletedPods []string podsLoop: - for i := 0; i < len(tc.wantDeletedPods); i++ { - select { - case deletedPod := <-deletedPods: - gotDeletedPods = append(gotDeletedPods, deletedPod) - case <-time.After(3 * time.Second): - t.Errorf("Timeout while waiting for deleted pods.") - break podsLoop - } - } - if diff := cmp.Diff(tc.wantDeletedPods, gotDeletedPods, ignoreStrOrder); diff != "" { - t.Errorf("deletedPods diff (-want +got):\n%s", diff) - } - - // Verify that all expected taint updates happened using the fake k8s client hook. - allUpdatesCount := 0 - for _, updates := range tc.wantTaintUpdates { - allUpdatesCount += len(updates) - } - gotTaintUpdates := make(map[string][][]apiv1.Taint) + for i := 0; i < len(tc.wantDeletedPods); i++ { + select { + case deletedPod := <-deletedPods: + gotDeletedPods = append(gotDeletedPods, deletedPod) + case <-time.After(3 * time.Second): + t.Errorf("Timeout while waiting for deleted pods.") + break podsLoop + } + } + if diff := cmp.Diff(tc.wantDeletedPods, gotDeletedPods, ignoreStrOrder); diff != "" { + t.Errorf("deletedPods diff (-want +got):\n%s", diff) + } + + // Verify that all expected taint updates happened using the fake k8s client hook. 
+ allUpdatesCount := 0 + for _, updates := range tc.wantTaintUpdates { + allUpdatesCount += len(updates) + } + gotTaintUpdates := make(map[string][][]apiv1.Taint) taintsLoop: - for i := 0; i < allUpdatesCount; i++ { - select { - case taintUpdate := <-taintUpdates: - gotTaintUpdates[taintUpdate.nodeName] = append(gotTaintUpdates[taintUpdate.nodeName], taintUpdate.taints) - case <-time.After(3 * time.Second): - t.Errorf("Timeout while waiting for taint updates.") - break taintsLoop - } - } - startupTaintValue := cmpopts.IgnoreFields(apiv1.Taint{}, "Value") - if diff := cmp.Diff(tc.wantTaintUpdates, gotTaintUpdates, startupTaintValue, cmpopts.EquateEmpty()); diff != "" { - t.Errorf("taintUpdates diff (-want +got):\n%s", diff) - } - - // Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after the deletion - // in cloud provider we sync to above and so this will usually not wait at all. However, it can still happen - // that there is a delay between cloud provider deletion and reporting, in which case the results are not there yet - // and we need to wait for them before asserting. - err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond) - if err != nil { - t.Errorf("Timeout while waiting for node deletion results") - } - - // Gather node deletion results for deletions started in the previous call, and verify that they look as expected. - nodeDeleteResults, _ := actuator.DeletionResults() - if diff := cmp.Diff(tc.wantNodeDeleteResults, nodeDeleteResults, cmpopts.EquateEmpty(), cmpopts.EquateErrors()); diff != "" { - t.Errorf("NodeDeleteResults diff (-want +got):\n%s", diff) - } + for i := 0; i < allUpdatesCount; i++ { + select { + case taintUpdate := <-taintUpdates: + gotTaintUpdates[taintUpdate.nodeName] = append(gotTaintUpdates[taintUpdate.nodeName], taintUpdate.taints) + case <-time.After(3 * time.Second): + t.Errorf("Timeout while waiting for taint updates.") + break taintsLoop + } + } + startupTaintValue := cmpopts.IgnoreFields(apiv1.Taint{}, "Value") + if diff := cmp.Diff(tc.wantTaintUpdates, gotTaintUpdates, startupTaintValue, cmpopts.EquateEmpty()); diff != "" { + t.Errorf("taintUpdates diff (-want +got):\n%s", diff) + } + + // Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after the deletion + // in the cloud provider, which we sync to above, and so this will usually not wait at all. However, it can still happen + // that there is a delay between cloud provider deletion and reporting, in which case the results are not there yet + // and we need to wait for them before asserting. + err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond) + if err != nil { + t.Errorf("Timeout while waiting for node deletion results") + } + + // Gather node deletion results for deletions started in the previous call, and verify that they look as expected. 
+ nodeDeleteResults, _ := actuator.DeletionResults() + if diff := cmp.Diff(tc.wantNodeDeleteResults, nodeDeleteResults, cmpopts.EquateEmpty(), cmpopts.EquateErrors()); diff != "" { + t.Errorf("NodeDeleteResults diff (-want +got):\n%s", diff) + } + // Verify ObserveDeletion was called for all nodes that were actually deleted + for _, expectedNode := range tc.wantDeletedNodes { + found := false + for _, observed := range fakeNodeLatencyTracker.ObservedNodes { + if observed == expectedNode { + found = true + break + } + } + if !found { + t.Errorf("Expected ObserveDeletion to be called for node %s, but it wasn't", expectedNode) + } + } } func TestStartDeletion(t *testing.T) { - testSets := []map[string]startDeletionTestCase{ - // IgnoreDaemonSetsUtilization is false - getStartDeletionTestCases(false, false, "testNg1"), - // IgnoreDaemonSetsUtilization is true - getStartDeletionTestCases(true, false, "testNg2"), - } - - for _, testSet := range testSets { - for tn, tc := range testSet { - t.Run(tn, func(t *testing.T) { - runStartDeletionTest(t, tc, false) - }) - } - } + testSets := []map[string]startDeletionTestCase{ + // IgnoreDaemonSetsUtilization is false + getStartDeletionTestCases(false, false, "testNg1"), + // IgnoreDaemonSetsUtilization is true + getStartDeletionTestCases(true, false, "testNg2"), + } + + for _, testSet := range testSets { + for tn, tc := range testSet { + t.Run(tn, func(t *testing.T) { + runStartDeletionTest(t, tc, false) + }) + } + } } func TestStartForceDeletion(t *testing.T) { - testSets := []map[string]startDeletionTestCase{ - // IgnoreDaemonSetsUtilization is false - getStartDeletionTestCases(false, true, "testNg1"), - // IgnoreDaemonSetsUtilization is true - getStartDeletionTestCases(true, true, "testNg2"), - } - - for _, testSet := range testSets { - for tn, tc := range testSet { - t.Run(tn, func(t *testing.T) { - runStartDeletionTest(t, tc, true) - }) - } - } + testSets := []map[string]startDeletionTestCase{ + // IgnoreDaemonSetsUtilization is false + getStartDeletionTestCases(false, true, "testNg1"), + // IgnoreDaemonSetsUtilization is true + getStartDeletionTestCases(true, true, "testNg2"), + } + + for _, testSet := range testSets { + for tn, tc := range testSet { + t.Run(tn, func(t *testing.T) { + runStartDeletionTest(t, tc, true) + }) + } + } } func TestStartDeletionInBatchBasic(t *testing.T) { - deleteInterval := 1 * time.Second - - for _, test := range []struct { - name string - deleteCalls int - numNodesToDelete map[string][]int //per node group and per call - failedRequests map[string]bool //per node group - wantSuccessfulDeletion map[string]int //per node group - }{ - { - name: "Succesfull deletion for all node group", - deleteCalls: 1, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - wantSuccessfulDeletion: map[string]int{ - "test-ng-1": 4, - "test-ng-2": 5, - "test-ng-3": 1, - }, - }, - { - name: "Node deletion failed for one group", - deleteCalls: 1, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - failedRequests: map[string]bool{ - "test-ng-1": true, - }, - wantSuccessfulDeletion: map[string]int{ - "test-ng-1": 0, - "test-ng-2": 5, - "test-ng-3": 1, - }, - }, - { - name: "Node deletion failed for one group two times", - deleteCalls: 2, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4, 3}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - failedRequests: map[string]bool{ - "test-ng-1": true, - }, - wantSuccessfulDeletion: map[string]int{ 
- "test-ng-1": 0, - "test-ng-2": 5, - "test-ng-3": 1, - }, - }, - { - name: "Node deletion failed for all groups", - deleteCalls: 2, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4, 3}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - failedRequests: map[string]bool{ - "test-ng-1": true, - "test-ng-2": true, - "test-ng-3": true, - }, - wantSuccessfulDeletion: map[string]int{ - "test-ng-1": 0, - "test-ng-2": 0, - "test-ng-3": 0, - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - test := test - gotFailedRequest := func(nodeGroupId string) bool { - val, _ := test.failedRequests[nodeGroupId] - return val - } - deletedResult := make(chan string) - fakeClient := &fake.Clientset{} - provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroupId string, node string) error { - if gotFailedRequest(nodeGroupId) { - return fmt.Errorf("SIMULATED ERROR: won't remove node") - } - deletedResult <- nodeGroupId - return nil - }).Build() - // 2d array represent the waves of pushing nodes to delete. - deleteNodes := [][]*apiv1.Node{} - - for i := 0; i < test.deleteCalls; i++ { - deleteNodes = append(deleteNodes, []*apiv1.Node{}) - } - testNg1 := testprovider.NewTestNodeGroup("test-ng-1", 0, 100, 3, true, false, "n1-standard-2", nil, nil) - testNg2 := testprovider.NewTestNodeGroup("test-ng-2", 0, 100, 3, true, false, "n1-standard-2", nil, nil) - testNg3 := testprovider.NewTestNodeGroup("test-ng-3", 0, 100, 3, true, false, "n1-standard-2", nil, nil) - testNg := map[string]*testprovider.TestNodeGroup{ - "test-ng-1": testNg1, - "test-ng-2": testNg2, - "test-ng-3": testNg3, - } - - for ngName, numNodes := range test.numNodesToDelete { - ng := testNg[ngName] - provider.InsertNodeGroup(ng) - ng.SetCloudProvider(provider) - for i, num := range numNodes { - singleBucketList := generateNodeGroupViewList(ng, 0, num) - bucket := singleBucketList[0] - deleteNodes[i] = append(deleteNodes[i], bucket.Nodes...) 
- for _, node := range bucket.Nodes { - provider.AddNode(bucket.Group.Id(), node) - } - } - } - opts := config.AutoscalingOptions{ - MaxScaleDownParallelism: 10, - MaxDrainParallelism: 5, - MaxPodEvictionTime: 0, - DaemonSetEvictionForEmptyNodes: true, - } - - podLister := kube_util.NewTestPodLister([]*apiv1.Pod{}) - pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{}) - registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, nil, nil, nil, nil, nil) - ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil) - if err != nil { - t.Fatalf("Couldn't set up autoscaling context: %v", err) - } - csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker()) - scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList() - scaleStateNotifier.Register(csr) - ndt := deletiontracker.NewNodeDeletionTracker(0) - ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, deleteInterval) - legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig} - actuator := Actuator{ - ctx: &ctx, nodeDeletionTracker: ndt, - nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), - budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), - nodeLatencyTracker: latencytracker.NewNodeLatencyTracker(), - } - - for _, nodes := range deleteNodes { - actuator.StartDeletion(nodes, []*apiv1.Node{}) - time.Sleep(deleteInterval) - } - wantDeletedNodes := 0 - for _, num := range test.wantSuccessfulDeletion { - wantDeletedNodes += num - } - gotDeletedNodes := map[string]int{ - "test-ng-1": 0, - "test-ng-2": 0, - "test-ng-3": 0, - } - for i := 0; i < wantDeletedNodes; i++ { - select { - case ngId := <-deletedResult: - gotDeletedNodes[ngId]++ - case <-time.After(1 * time.Second): - t.Errorf("Timeout while waiting for deleted nodes.") - break - } - } - if diff := cmp.Diff(test.wantSuccessfulDeletion, gotDeletedNodes); diff != "" { - t.Errorf("Successful deleteions per node group diff (-want +got):\n%s", diff) - } - }) - } + deleteInterval := 1 * time.Second + + for _, test := range []struct { + name string + deleteCalls int + numNodesToDelete map[string][]int //per node group and per call + failedRequests map[string]bool //per node group + wantSuccessfulDeletion map[string]int //per node group + }{ + { + name: "Successful deletion for all node groups", + deleteCalls: 1, + numNodesToDelete: map[string][]int{ + "test-ng-1": {4}, + "test-ng-2": {5}, + "test-ng-3": {1}, + }, + wantSuccessfulDeletion: map[string]int{ + "test-ng-1": 4, + "test-ng-2": 5, + "test-ng-3": 1, + }, + }, + { + name: "Node deletion failed for one group", + deleteCalls: 1, + numNodesToDelete: map[string][]int{ + "test-ng-1": {4}, + "test-ng-2": {5}, + "test-ng-3": {1}, + }, + failedRequests: map[string]bool{ + "test-ng-1": true, + }, + wantSuccessfulDeletion: map[string]int{ + "test-ng-1": 0, + "test-ng-2": 5, + "test-ng-3": 1, + }, + }, + { + name: "Node deletion failed for one group two times", + deleteCalls: 2, + numNodesToDelete: map[string][]int{ + "test-ng-1": {4, 3}, + "test-ng-2": {5}, + "test-ng-3": {1}, + }, + failedRequests: 
map[string]bool{ + "test-ng-1": true, + }, + wantSuccessfulDeletion: map[string]int{ + "test-ng-1": 0, + "test-ng-2": 5, + "test-ng-3": 1, + }, + }, + { + name: "Node deletion failed for all groups", + deleteCalls: 2, + numNodesToDelete: map[string][]int{ + "test-ng-1": {4, 3}, + "test-ng-2": {5}, + "test-ng-3": {1}, + }, + failedRequests: map[string]bool{ + "test-ng-1": true, + "test-ng-2": true, + "test-ng-3": true, + }, + wantSuccessfulDeletion: map[string]int{ + "test-ng-1": 0, + "test-ng-2": 0, + "test-ng-3": 0, + }, + }, + } { + t.Run(test.name, func(t *testing.T) { + test := test + gotFailedRequest := func(nodeGroupId string) bool { + val, _ := test.failedRequests[nodeGroupId] + return val + } + deletedResult := make(chan string) + fakeClient := &fake.Clientset{} + provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroupId string, node string) error { + if gotFailedRequest(nodeGroupId) { + return fmt.Errorf("SIMULATED ERROR: won't remove node") + } + deletedResult <- nodeGroupId + return nil + }).Build() + // 2D array representing the waves of nodes pushed for deletion. + deleteNodes := [][]*apiv1.Node{} + + for i := 0; i < test.deleteCalls; i++ { + deleteNodes = append(deleteNodes, []*apiv1.Node{}) + } + testNg1 := testprovider.NewTestNodeGroup("test-ng-1", 0, 100, 3, true, false, "n1-standard-2", nil, nil) + testNg2 := testprovider.NewTestNodeGroup("test-ng-2", 0, 100, 3, true, false, "n1-standard-2", nil, nil) + testNg3 := testprovider.NewTestNodeGroup("test-ng-3", 0, 100, 3, true, false, "n1-standard-2", nil, nil) + testNg := map[string]*testprovider.TestNodeGroup{ + "test-ng-1": testNg1, + "test-ng-2": testNg2, + "test-ng-3": testNg3, + } + + for ngName, numNodes := range test.numNodesToDelete { + ng := testNg[ngName] + provider.InsertNodeGroup(ng) + ng.SetCloudProvider(provider) + for i, num := range numNodes { + singleBucketList := generateNodeGroupViewList(ng, 0, num) + bucket := singleBucketList[0] + deleteNodes[i] = append(deleteNodes[i], bucket.Nodes...) 
+ for _, node := range bucket.Nodes { + provider.AddNode(bucket.Group.Id(), node) + } + } + } + opts := config.AutoscalingOptions{ + MaxScaleDownParallelism: 10, + MaxDrainParallelism: 5, + MaxPodEvictionTime: 0, + DaemonSetEvictionForEmptyNodes: true, + } + + podLister := kube_util.NewTestPodLister([]*apiv1.Pod{}) + pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{}) + registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, nil, nil, nil, nil, nil) + ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil) + if err != nil { + t.Fatalf("Couldn't set up autoscaling context: %v", err) + } + csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker()) + scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList() + scaleStateNotifier.Register(csr) + ndt := deletiontracker.NewNodeDeletionTracker(0) + ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, deleteInterval) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig} + fakeNodeLatencyTracker := &fakeLatencyTracker{} + actuator := Actuator{ + ctx: &ctx, nodeDeletionTracker: ndt, + nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), + budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), + nodeLatencyTracker: fakeNodeLatencyTracker, + } + + for _, nodes := range deleteNodes { + actuator.StartDeletion(nodes, []*apiv1.Node{}) + time.Sleep(deleteInterval) + } + wantDeletedNodes := 0 + for _, num := range test.wantSuccessfulDeletion { + wantDeletedNodes += num + } + gotDeletedNodes := map[string]int{ + "test-ng-1": 0, + "test-ng-2": 0, + "test-ng-3": 0, + } + for i := 0; i < wantDeletedNodes; i++ { + select { + case ngId := <-deletedResult: + gotDeletedNodes[ngId]++ + case <-time.After(1 * time.Second): + t.Errorf("Timeout while waiting for deleted nodes.") + break + } + } + if diff := cmp.Diff(test.wantSuccessfulDeletion, gotDeletedNodes); diff != "" { + t.Errorf("Successful deletions per node group diff (-want +got):\n%s", diff) + } + for _, nodes := range deleteNodes { + for _, node := range nodes { + found := false + for _, observedNode := range fakeNodeLatencyTracker.ObservedNodes { + if observedNode == node.Name { + found = true + break + } + } + if !found { + t.Errorf("Expected ObserveDeletion to be called for node %s", node.Name) + } + } + } + }) + } } func sizedNodeGroup(id string, size int, atomic, ignoreDaemonSetUtil bool) *testprovider.TestNodeGroup { - ng := testprovider.NewTestNodeGroup(id, 1000, 0, size, true, false, "n1-standard-2", nil, nil) - ng.SetOptions(&config.NodeGroupAutoscalingOptions{ - ZeroOrMaxNodeScaling: atomic, - IgnoreDaemonSetsUtilization: ignoreDaemonSetUtil, - }) - return ng + ng := testprovider.NewTestNodeGroup(id, 1000, 0, size, true, false, "n1-standard-2", nil, nil) + ng.SetOptions(&config.NodeGroupAutoscalingOptions{ + ZeroOrMaxNodeScaling: atomic, + IgnoreDaemonSetsUtilization: ignoreDaemonSetUtil, + }) + return ng } func generateNodes(from, to int, prefix string) []*apiv1.Node { - var result []*apiv1.Node - for i := from; i < to; i++ { - name := 
fmt.Sprintf("node-%d", i) - if prefix != "" { - name = prefix + "-" + name - } - result = append(result, generateNode(name)) - } - return result + var result []*apiv1.Node + for i := from; i < to; i++ { + name := fmt.Sprintf("node-%d", i) + if prefix != "" { + name = prefix + "-" + name + } + result = append(result, generateNode(name)) + } + return result } func generateNodeGroupViewList(ng cloudprovider.NodeGroup, from, to int) []*budgets.NodeGroupView { - return []*budgets.NodeGroupView{ - { - Group: ng, - Nodes: generateNodes(from, to, ng.Id()), - }, - } + return []*budgets.NodeGroupView{ + { + Group: ng, + Nodes: generateNodes(from, to, ng.Id()), + }, + } } func generateNode(name string) *apiv1.Node { - return &apiv1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: name}, - Status: apiv1.NodeStatus{ - Allocatable: apiv1.ResourceList{ - apiv1.ResourceCPU: resource.MustParse("8"), - apiv1.ResourceMemory: resource.MustParse("8G"), - }, - }, - } + return &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: apiv1.NodeStatus{ + Allocatable: apiv1.ResourceList{ + apiv1.ResourceCPU: resource.MustParse("8"), + apiv1.ResourceMemory: resource.MustParse("8G"), + }, + }, + } } func removablePods(count int, prefix string) []*apiv1.Pod { - var result []*apiv1.Pod - for i := 0; i < count; i++ { - name := fmt.Sprintf("pod-%d", i) - if prefix != "" { - name = prefix + "-" + name - } - result = append(result, removablePod(name, prefix)) - } - return result + var result []*apiv1.Pod + for i := 0; i < count; i++ { + name := fmt.Sprintf("pod-%d", i) + if prefix != "" { + name = prefix + "-" + name + } + result = append(result, removablePod(name, prefix)) + } + return result } func removablePod(name string, node string) *apiv1.Pod { - return &apiv1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "default", - Annotations: map[string]string{ - "cluster-autoscaler.kubernetes.io/safe-to-evict": "true", - }, - }, - Spec: apiv1.PodSpec{ - NodeName: node, - Containers: []apiv1.Container{ - { - Name: "test-container", - Resources: apiv1.ResourceRequirements{ - Requests: map[apiv1.ResourceName]resource.Quantity{ - apiv1.ResourceCPU: resource.MustParse("1"), - apiv1.ResourceMemory: resource.MustParse("1G"), - }, - }, - }, - }, - }, - } + return &apiv1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + Annotations: map[string]string{ + "cluster-autoscaler.kubernetes.io/safe-to-evict": "true", + }, + }, + Spec: apiv1.PodSpec{ + NodeName: node, + Containers: []apiv1.Container{ + { + Name: "test-container", + Resources: apiv1.ResourceRequirements{ + Requests: map[apiv1.ResourceName]resource.Quantity{ + apiv1.ResourceCPU: resource.MustParse("1"), + apiv1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + }, + }, + } } func generateDsPods(count int, node string) []*apiv1.Pod { - var result []*apiv1.Pod - for i := 0; i < count; i++ { - name := fmt.Sprintf("ds-pod-%d", i) - result = append(result, generateDsPod(name, node)) - } - return result + var result []*apiv1.Pod + for i := 0; i < count; i++ { + name := fmt.Sprintf("ds-pod-%d", i) + result = append(result, generateDsPod(name, node)) + } + return result } func generateDsPod(name string, node string) *apiv1.Pod { - pod := removablePod(fmt.Sprintf("%s-%s", node, name), node) - pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "some-uid") - return pod + pod := removablePod(fmt.Sprintf("%s-%s", node, name), node) + pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", 
"apps/v1", "some-uid") + return pod } func generateDaemonSet() *appsv1.DaemonSet { - return &appsv1.DaemonSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "ds", - Namespace: "default", - SelfLink: "/apiv1s/apps/v1/namespaces/default/daemonsets/ds", - }, - } + return &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ds", + Namespace: "default", + SelfLink: "/apiv1s/apps/v1/namespaces/default/daemonsets/ds", + }, + } } func generateUtilInfo(cpuUtil, memUtil float64) utilization.Info { - var higherUtilName apiv1.ResourceName - var higherUtilVal float64 - if cpuUtil > memUtil { - higherUtilName = apiv1.ResourceCPU - higherUtilVal = cpuUtil - } else { - higherUtilName = apiv1.ResourceMemory - higherUtilVal = memUtil - } - return utilization.Info{ - CpuUtil: cpuUtil, - MemUtil: memUtil, - ResourceName: higherUtilName, - Utilization: higherUtilVal, - } + var higherUtilName apiv1.ResourceName + var higherUtilVal float64 + if cpuUtil > memUtil { + higherUtilName = apiv1.ResourceCPU + higherUtilVal = cpuUtil + } else { + higherUtilName = apiv1.ResourceMemory + higherUtilVal = memUtil + } + return utilization.Info{ + CpuUtil: cpuUtil, + MemUtil: memUtil, + ResourceName: higherUtilName, + Utilization: higherUtilVal, + } } func waitForDeletionResultsCount(ndt *deletiontracker.NodeDeletionTracker, resultsCount int, timeout, retryTime time.Duration) error { - // This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and - // the function should return quickly after the first if check. - // An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows - // synchronizing calls to EndDeletion in the test code. - for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) { - if results, _ := ndt.DeletionResults(); len(results) == resultsCount { - return nil - } - } - return fmt.Errorf("timed out while waiting for node deletion results") + // This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and + // the function should return quickly after the first if check. + // An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows + // synchronizing calls to EndDeletion in the test code. 
+ for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) { + if results, _ := ndt.DeletionResults(); len(results) == resultsCount { + return nil + } + } + return fmt.Errorf("timed out while waiting for node deletion results") } + From 75249a648072c103687cd88e90543a8a001400b3 Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Tue, 23 Sep 2025 22:46:34 +0000 Subject: [PATCH 11/19] Add node latency tracker tests --- .../node_latency_tracker_test.go | 123 +++++++++++++++++- 1 file changed, 117 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go index d7fe2f63c45c..d6cae23c60ec 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go @@ -19,15 +19,126 @@ package latencytracker import ( "testing" "time" + + apiv1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func TestObserveDeletion_NoOpIfNodeNotTracked(t *testing.T) { - tracker := NewNodeLatencyTracker() - now := time.Now() +func TestNodeLatencyTracker(t *testing.T) { + baseTime := time.Now() + + tests := []struct { + name string + setupNodes map[string]NodeInfo + unneededList []string + currentlyInDeletion map[string]bool + updateThresholds map[string]time.Duration + observeDeletion []string + expectedTrackedNodes []string + expectedDeletionTimes map[string]time.Duration + }{ + { + name: "add new unneeded nodes", + setupNodes: map[string]NodeInfo{}, + unneededList: []string{"node1", "node2"}, + currentlyInDeletion: map[string]bool{}, + updateThresholds: map[string]time.Duration{}, + observeDeletion: []string{}, + expectedTrackedNodes: []string{"node1", "node2"}, + }, + { + name: "observe deletion with threshold", + setupNodes: map[string]NodeInfo{ + "node1": {UnneededSince: baseTime, Threshold: 2 * time.Second}, + }, + unneededList: []string{}, + currentlyInDeletion: map[string]bool{}, + updateThresholds: map[string]time.Duration{}, + observeDeletion: []string{"node1"}, + expectedTrackedNodes: []string{}, + expectedDeletionTimes: map[string]time.Duration{ + "node1": 3 * time.Second, // simulate observation 5s after UnneededSince, threshold 2s + }, + }, + { + name: "remove unneeded node not in deletion", + setupNodes: map[string]NodeInfo{ + "node1": {UnneededSince: baseTime, Threshold: 1 * time.Second}, + "node2": {UnneededSince: baseTime, Threshold: 0}, + }, + unneededList: []string{"node2"}, // node1 is removed from unneeded + currentlyInDeletion: map[string]bool{}, + updateThresholds: map[string]time.Duration{}, + observeDeletion: []string{}, + expectedTrackedNodes: []string{"node2"}, + expectedDeletionTimes: map[string]time.Duration{ + "node1": 5*time.Second - 1*time.Second, // assume current timestamp baseTime+5s + }, + }, + { + name: "update threshold", + setupNodes: map[string]NodeInfo{ + "node1": {UnneededSince: baseTime, Threshold: 1 * time.Second}, + }, + unneededList: []string{"node1"}, + currentlyInDeletion: map[string]bool{}, + updateThresholds: map[string]time.Duration{ + "node1": 4 * time.Second, + }, + observeDeletion: []string{}, + expectedTrackedNodes: []string{"node1"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tracker := NewNodeLatencyTracker() + for name, info := range tt.setupNodes { + tracker.nodes[name] = info + } + + for node, threshold := range 
tt.updateThresholds { + tracker.UpdateThreshold(node, threshold) + } + unneededNodes := make([]*apiv1.Node, len(tt.unneededList)) + for i, name := range tt.unneededList { + unneededNodes[i] = &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: name}} + } + // simulate current timestamp as baseTime + 5s + currentTime := baseTime.Add(5 * time.Second) + tracker.UpdateStateWithUnneededList(unneededNodes, tt.currentlyInDeletion, currentTime) + + // Observe deletions + for _, node := range tt.observeDeletion { + tracker.ObserveDeletion(node, currentTime) + } - tracker.ObserveDeletion("node1", now) + // Check tracked nodes + gotTracked := tracker.GetTrackedNodes() + expectedMap := make(map[string]struct{}) + for _, n := range tt.expectedTrackedNodes { + expectedMap[n] = struct{}{} + } + for _, n := range gotTracked { + if _, ok := expectedMap[n]; !ok { + t.Errorf("unexpected tracked node %q", n) + } + delete(expectedMap, n) + } + for n := range expectedMap { + t.Errorf("expected node %q to be tracked, but was not", n) + } - if len(tracker.nodes) != 0 { - t.Errorf("expected no nodes tracked, got %d", len(tracker.nodes)) + for node, expectedDuration := range tt.expectedDeletionTimes { + info, ok := tt.setupNodes[node] + if !ok { + continue + } + duration := currentTime.Sub(info.UnneededSince) - info.Threshold + if duration != expectedDuration { + t.Errorf("node %q expected deletion duration %v, got %v", node, expectedDuration, duration) + } + } + }) } } From 4902e0fa4d99fe8e041ab4affe996cb3daf62abc Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Tue, 23 Sep 2025 22:50:44 +0000 Subject: [PATCH 12/19] Expose GetTrackedNodes in interface for testing --- .../core/scaledown/actuation/actuator_test.go | 3285 +++++++++-------- .../latencytracker/node_latency_tracker.go | 1 + .../core/scaledown/unneeded/nodes_test.go | 1 + 3 files changed, 1645 insertions(+), 1642 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go index 13336b6726a1..3ddd128f48a4 100644 --- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go @@ -54,1718 +54,1719 @@ import ( ) type nodeGroupViewInfo struct { - nodeGroupName string - from int - to int + nodeGroupName string + from int + to int } type scaleDownNodeInfo struct { - name string - nodeGroup string - evictedPods []*apiv1.Pod - utilInfo utilization.Info + name string + nodeGroup string + evictedPods []*apiv1.Pod + utilInfo utilization.Info } type scaleDownStatusInfo struct { - result status.ScaleDownResult - scaledDownNodes []scaleDownNodeInfo + result status.ScaleDownResult + scaledDownNodes []scaleDownNodeInfo } type startDeletionTestCase struct { - defaultOnly bool // Set to true to only run default deletion logic tests. - forcedOnly bool // Set to true to only run forced deletion logic tests. - nodeGroups map[string]*testprovider.TestNodeGroup - emptyNodes []nodeGroupViewInfo - drainNodes []nodeGroupViewInfo - pods map[string][]*apiv1.Pod - failedPodDrain map[string]bool - failedNodeDeletion map[string]bool - failedNodeTaint map[string]bool - wantStatus scaleDownStatusInfo - wantErr error - wantDeletedPods []string - wantDeletedNodes []string - wantTaintUpdates map[string][][]apiv1.Taint - wantNodeDeleteResults map[string]status.NodeDeleteResult + defaultOnly bool // Set to true to only run default deletion logic tests. + forcedOnly bool // Set to true to only run forced deletion logic tests. 

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
index 13336b6726a1..3ddd128f48a4 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -54,1718 +54,1719 @@ import (
 )
 
 type nodeGroupViewInfo struct {
-	nodeGroupName string
-	from int
-	to int
+	nodeGroupName string
+	from          int
+	to            int
 }
 
 type scaleDownNodeInfo struct {
-	name string
-	nodeGroup string
-	evictedPods []*apiv1.Pod
-	utilInfo utilization.Info
+	name        string
+	nodeGroup   string
+	evictedPods []*apiv1.Pod
+	utilInfo    utilization.Info
 }
 
 type scaleDownStatusInfo struct {
-	result status.ScaleDownResult
-	scaledDownNodes []scaleDownNodeInfo
+	result          status.ScaleDownResult
+	scaledDownNodes []scaleDownNodeInfo
 }
 
 type startDeletionTestCase struct {
-	defaultOnly bool // Set to true to only run default deletion logic tests.
-	forcedOnly bool // Set to true to only run forced deletion logic tests.
-	nodeGroups map[string]*testprovider.TestNodeGroup
-	emptyNodes []nodeGroupViewInfo
-	drainNodes []nodeGroupViewInfo
-	pods map[string][]*apiv1.Pod
-	failedPodDrain map[string]bool
-	failedNodeDeletion map[string]bool
-	failedNodeTaint map[string]bool
-	wantStatus scaleDownStatusInfo
-	wantErr error
-	wantDeletedPods []string
-	wantDeletedNodes []string
-	wantTaintUpdates map[string][][]apiv1.Taint
-	wantNodeDeleteResults map[string]status.NodeDeleteResult
+	defaultOnly bool // Set to true to only run default deletion logic tests.
+	forcedOnly  bool // Set to true to only run forced deletion logic tests.
+	nodeGroups            map[string]*testprovider.TestNodeGroup
+	emptyNodes            []nodeGroupViewInfo
+	drainNodes            []nodeGroupViewInfo
+	pods                  map[string][]*apiv1.Pod
+	failedPodDrain        map[string]bool
+	failedNodeDeletion    map[string]bool
+	failedNodeTaint       map[string]bool
+	wantStatus            scaleDownStatusInfo
+	wantErr               error
+	wantDeletedPods       []string
+	wantDeletedNodes      []string
+	wantTaintUpdates      map[string][][]apiv1.Taint
+	wantNodeDeleteResults map[string]status.NodeDeleteResult
 }
 
 // FakeLatencyTracker implements the same interface as NodeLatencyTracker
 type fakeLatencyTracker struct {
-	ObservedNodes []string
+	ObservedNodes []string
 }
 
 // ObserveDeletion simply records the node name
 func (f *fakeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) {
-	f.ObservedNodes = append(f.ObservedNodes, nodeName)
+	f.ObservedNodes = append(f.ObservedNodes, nodeName)
 }
 
 func (f *fakeLatencyTracker) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) {
 }
 
 func (f *fakeLatencyTracker) UpdateThreshold(nodeName string, threshold time.Duration) {}
+func (f *fakeLatencyTracker) GetTrackedNodes() []string { return nil }
+
 func getStartDeletionTestCases(ignoreDaemonSetsUtilization bool, force bool, suffix string) map[string]startDeletionTestCase {
-	toBeDeletedTaint := apiv1.Taint{Key: taints.ToBeDeletedTaint, Effect: apiv1.TaintEffectNoSchedule}
-
-	dsUtilInfo := generateUtilInfo(2./8., 2./8.)
-
-	if ignoreDaemonSetsUtilization {
-		dsUtilInfo = generateUtilInfo(0./8., 0./8.)
-	}
-
-	testCases := map[string]startDeletionTestCase{
-		"nothing to delete": {
-			emptyNodes: nil,
-			drainNodes: nil,
-			wantStatus: scaleDownStatusInfo{
-				result: status.ScaleDownNoNodeDeleted,
-			},
-		},
-		"empty node deletion": {
-			nodeGroups: map[string]*testprovider.TestNodeGroup{
-				"test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization),
-			},
-			emptyNodes: []nodeGroupViewInfo{
-				{"test", 0, 2},
-			},
-			wantStatus: scaleDownStatusInfo{
-				result: status.ScaleDownNodeDeleteStarted,
-				scaledDownNodes: []scaleDownNodeInfo{
-					{
-						name: "test-node-0",
-						nodeGroup: "test",
-						utilInfo: generateUtilInfo(0, 0),
-					},
-					{
-						name: "test-node-1",
-						nodeGroup: "test",
-						utilInfo: generateUtilInfo(0, 0),
-					},
-				},
-			},
-			wantDeletedNodes: []string{"test-node-0", "test-node-1"},
-			wantTaintUpdates: map[string][][]apiv1.Taint{
-				"test-node-0": {
-					{toBeDeletedTaint},
-				},
-				"test-node-1": {
-					{toBeDeletedTaint},
-				},
-			},
-			wantNodeDeleteResults: map[string]status.NodeDeleteResult{
-				"test-node-0": {ResultType: status.NodeDeleteOk},
-				"test-node-1": {ResultType: status.NodeDeleteOk},
-			},
-		},
-		"empty atomic node deletion": {
-			nodeGroups: map[string]*testprovider.TestNodeGroup{
-				"atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization),
-			},
-			emptyNodes: []nodeGroupViewInfo{
-				{"atomic-2", 0, 2},
-			},
-			wantStatus: scaleDownStatusInfo{
-				result: status.ScaleDownNodeDeleteStarted,
-				scaledDownNodes: []scaleDownNodeInfo{
-					{
-						name: "atomic-2-node-0",
-						nodeGroup: "atomic-2",
-						utilInfo: generateUtilInfo(0, 0),
-					},
-					{
-						name: "atomic-2-node-1",
-						nodeGroup: "atomic-2",
-						utilInfo: generateUtilInfo(0, 0),
-					},
-				},
-			},
-			wantDeletedNodes: []string{
-				"atomic-2-node-0",
-				"atomic-2-node-1",
-			},
-			wantTaintUpdates: map[string][][]apiv1.Taint{
-				"atomic-2-node-0": {
-					{toBeDeletedTaint},
-				},
-				"atomic-2-node-1": {
-					{toBeDeletedTaint},
-				},
-			},
-			wantNodeDeleteResults: map[string]status.NodeDeleteResult{
-				"atomic-2-node-0": {ResultType: status.NodeDeleteOk},
-
"atomic-2-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "deletion with drain": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{ - {"test", 0, 2}, - }, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(2, "test-node-0"), - "test-node-1": removablePods(2, "test-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-0"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-1"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantDeletedPods: []string{"test-node-0-pod-0", "test-node-0-pod-1", "test-node-1-pod-0", "test-node-1-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "empty and drain deletion work correctly together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"test", 0, 2}, - }, - drainNodes: []nodeGroupViewInfo{ - {"test", 2, 4}, - }, - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": removablePods(2, "test-node-3"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-3"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{ - "test-node-0", - "test-node-1", - "test-node-2", "test-node-3"}, - wantDeletedPods: []string{"test-node-2-pod-0", "test-node-2-pod-1", "test-node-3-pod-0", "test-node-3-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "two atomic groups can be scaled down together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "atomic-2-mixed": sizedNodeGroup("atomic-2-mixed", 2, true, ignoreDaemonSetsUtilization), - "atomic-2-drain": sizedNodeGroup("atomic-2-drain", 2, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"atomic-2-mixed", 1, 2}, - }, - drainNodes: []nodeGroupViewInfo{ - {"atomic-2-mixed", 0, 1}, - {"atomic-2-drain", 0, 2}, - 
}, - pods: map[string][]*apiv1.Pod{ - "atomic-2-mixed-node-0": removablePods(2, "atomic-2-mixed-node-0"), - "atomic-2-drain-node-0": removablePods(1, "atomic-2-drain-node-0"), - "atomic-2-drain-node-1": removablePods(2, "atomic-2-drain-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "atomic-2-mixed-node-1", - nodeGroup: "atomic-2-mixed", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-2-mixed-node-0", - nodeGroup: "atomic-2-mixed", - evictedPods: removablePods(2, "atomic-2-mixed-node-0"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "atomic-2-drain-node-0", - nodeGroup: "atomic-2-drain", - evictedPods: removablePods(1, "atomic-2-drain-node-0"), - utilInfo: generateUtilInfo(1./8., 1./8.), - }, - { - name: "atomic-2-drain-node-1", - nodeGroup: "atomic-2-drain", - evictedPods: removablePods(2, "atomic-2-drain-node-1"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"atomic-2-mixed-node-0", "atomic-2-mixed-node-1", "atomic-2-drain-node-0", "atomic-2-drain-node-1"}, - wantDeletedPods: []string{"atomic-2-mixed-node-0-pod-0", "atomic-2-mixed-node-0-pod-1", "atomic-2-drain-node-0-pod-0", "atomic-2-drain-node-1-pod-0", "atomic-2-drain-node-1-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-2-mixed-node-0": { - {toBeDeletedTaint}, - }, - "atomic-2-mixed-node-1": { - {toBeDeletedTaint}, - }, - "atomic-2-drain-node-0": { - {toBeDeletedTaint}, - }, - "atomic-2-drain-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "atomic-2-mixed-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-2-mixed-node-1": {ResultType: status.NodeDeleteOk}, - "atomic-2-drain-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-2-drain-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "atomic empty and drain deletion work correctly together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{ - {"atomic-4", 0, 2}, - }, - drainNodes: []nodeGroupViewInfo{ - {"atomic-4", 2, 4}, - }, - pods: map[string][]*apiv1.Pod{ - "atomic-4-node-2": removablePods(2, "atomic-4-node-2"), - "atomic-4-node-3": removablePods(2, "atomic-4-node-3"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "atomic-4-node-0", - nodeGroup: "atomic-4", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-4-node-1", - nodeGroup: "atomic-4", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-4-node-2", - nodeGroup: "atomic-4", - evictedPods: removablePods(2, "atomic-4-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "atomic-4-node-3", - nodeGroup: "atomic-4", - evictedPods: removablePods(2, "atomic-4-node-3"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"atomic-4-node-0", "atomic-4-node-1", "atomic-4-node-2", "atomic-4-node-3"}, - wantDeletedPods: []string{"atomic-4-node-2-pod-0", "atomic-4-node-2-pod-1", "atomic-4-node-3-pod-0", "atomic-4-node-3-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-4-node-0": { - {toBeDeletedTaint}, - }, - "atomic-4-node-1": { - {toBeDeletedTaint}, - }, - "atomic-4-node-2": { - {toBeDeletedTaint}, - }, - 
"atomic-4-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "atomic-4-node-0": {ResultType: status.NodeDeleteOk}, - "atomic-4-node-1": {ResultType: status.NodeDeleteOk}, - "atomic-4-node-2": {ResultType: status.NodeDeleteOk}, - "atomic-4-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "failure to taint empty node stops deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 4}}, - drainNodes: []nodeGroupViewInfo{{"test", 4, 5}}, - pods: map[string][]*apiv1.Pod{ - "test-node-4": removablePods(2, "test-node-4"), - }, - failedNodeTaint: map[string]bool{"test-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - {}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - "test-node-3": { - {toBeDeletedTaint}, - {}, - }, - }, - wantErr: cmpopts.AnyError, - }, - "failure to taint empty atomic node stops deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"atomic-4", 0, 4}}, - drainNodes: []nodeGroupViewInfo{{"test", 4, 5}}, - pods: map[string][]*apiv1.Pod{ - "test-node-4": removablePods(2, "test-node-4"), - }, - failedNodeTaint: map[string]bool{"atomic-4-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "atomic-4-node-0": { - {toBeDeletedTaint}, - {}, - }, - "atomic-4-node-1": { - {toBeDeletedTaint}, - {}, - }, - "atomic-4-node-3": { - {toBeDeletedTaint}, - {}, - }, - }, - wantErr: cmpopts.AnyError, - }, - "failure to taint drain node stops further deletion and cleans already applied taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, //generateNodeGroupViewList(testNg, 0, 2), - drainNodes: []nodeGroupViewInfo{{"test", 2, 6}}, //generateNodeGroupViewList(testNg, 2, 6), - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": removablePods(2, "test-node-3"), - "test-node-4": removablePods(2, "test-node-4"), - "test-node-5": removablePods(2, "test-node-5"), - }, - failedNodeTaint: map[string]bool{"test-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - wantErr: cmpopts.AnyError, - }, - "failure to taint drain atomic node stops further deletion and cleans already applied 
taints": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - "atomic-6": sizedNodeGroup("atomic-6", 6, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - drainNodes: []nodeGroupViewInfo{{"atomic-6", 0, 6}}, - pods: map[string][]*apiv1.Pod{ - "atomic-6-node-0": removablePods(2, "atomic-6-node-0"), - "atomic-6-node-1": removablePods(2, "atomic-6-node-1"), - "atomic-6-node-2": removablePods(2, "atomic-6-node-2"), - "atomic-6-node-3": removablePods(2, "atomic-6-node-3"), - "atomic-6-node-4": removablePods(2, "atomic-6-node-4"), - "atomic-6-node-5": removablePods(2, "atomic-6-node-5"), - }, - failedNodeTaint: map[string]bool{"atomic-6-node-2": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownError, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - wantErr: cmpopts.AnyError, - }, - "nodes that failed drain are correctly reported in results": { - defaultOnly: true, - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(3, "test-node-0"), - "test-node-1": removablePods(3, "test-node-1"), - "test-node-2": removablePods(3, "test-node-2"), - "test-node-3": removablePods(3, "test-node-3"), - }, - failedPodDrain: map[string]bool{ - "test-node-0-pod-0": true, - "test-node-0-pod-1": true, - "test-node-2-pod-1": true, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-0"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-1"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-2"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-3"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-1", "test-node-3"}, - wantDeletedPods: []string{ - "test-node-0-pod-2", - "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", - "test-node-2-pod-0", "test-node-2-pod-2", - "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - {}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - {}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": { - ResultType: status.NodeDeleteErrorFailedToEvictPods, - Err: 
cmpopts.AnyError, - PodEvictionResults: map[string]status.PodEvictionResult{ - "test-node-0-pod-0": {Pod: removablePod("test-node-0-pod-0", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, - "test-node-0-pod-1": {Pod: removablePod("test-node-0-pod-1", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, - "test-node-0-pod-2": {Pod: removablePod("test-node-0-pod-2", "test-node-0")}, - }, - }, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": { - ResultType: status.NodeDeleteErrorFailedToEvictPods, - Err: cmpopts.AnyError, - PodEvictionResults: map[string]status.PodEvictionResult{ - "test-node-2-pod-0": {Pod: removablePod("test-node-2-pod-0", "test-node-2")}, - "test-node-2-pod-1": {Pod: removablePod("test-node-2-pod-1", "test-node-2"), Err: cmpopts.AnyError, TimedOut: true}, - "test-node-2-pod-2": {Pod: removablePod("test-node-2-pod-2", "test-node-2")}, - }, - }, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "nodes that failed drain are forcefully deleted": { - forcedOnly: true, - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(3, "test-node-0"), - "test-node-1": removablePods(3, "test-node-1"), - "test-node-2": removablePods(3, "test-node-2"), - "test-node-3": removablePods(3, "test-node-3"), - }, - failedPodDrain: map[string]bool{ - "test-node-0-pod-0": true, - "test-node-0-pod-1": true, - "test-node-2-pod-1": true, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-0"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-1"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-2"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(3, "test-node-3"), - utilInfo: generateUtilInfo(3./8., 3./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, - wantDeletedPods: []string{ - "test-node-0-pod-2", - "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", - "test-node-2-pod-0", "test-node-2-pod-2", - "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "nodes that failed deletion are correctly reported in results": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": removablePods(2, 
"test-node-3"), - }, - failedNodeDeletion: map[string]bool{ - "test-node-1": true, - "test-node-3": true, - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-3"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-2"}, - wantDeletedPods: []string{ - "test-node-2-pod-0", "test-node-2-pod-1", - "test-node-3-pod-0", "test-node-3-pod-1", - }, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - {}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, - }, - }, - "DS pods are evicted from empty nodes, but don't block deletion on error": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": generateDsPods(2, "test-node-0"), - "test-node-1": generateDsPods(2, "test-node-1"), - }, - failedPodDrain: map[string]bool{"test-node-1-ds-pod-0": true}, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - wantDeletedPods: []string{"test-node-0-ds-pod-0", "test-node-0-ds-pod-1", "test-node-1-ds-pod-1"}, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "DS pods and deletion with drain": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - drainNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": generateDsPods(2, "test-node-0"), - "test-node-1": generateDsPods(2, "test-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - // this is nil because DaemonSetEvictionForOccupiedNodes is - // not enabled for drained nodes in this test suite - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - { - name: "test-node-1", - nodeGroup: "test", 
- // this is nil because DaemonSetEvictionForOccupiedNodes is - // not enabled for drained nodes in this test suite - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1"}, - // same as evicted pods - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - }, - }, - "DS pods and empty and drain deletion work correctly together": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, - pods: map[string][]*apiv1.Pod{ - "test-node-2": removablePods(2, "test-node-2"), - "test-node-3": generateDsPods(2, "test-node-3"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-2", - nodeGroup: "test", - evictedPods: removablePods(2, "test-node-2"), - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-3", - nodeGroup: "test", - evictedPods: nil, - utilInfo: dsUtilInfo, - }, - }, - }, - wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, - // same as evicted pods - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - }, - "test-node-2": { - {toBeDeletedTaint}, - }, - "test-node-3": { - {toBeDeletedTaint}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteOk}, - "test-node-2": {ResultType: status.NodeDeleteOk}, - "test-node-3": {ResultType: status.NodeDeleteOk}, - }, - }, - "nodes with pods are not deleted if the node is passed as empty": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-0": removablePods(2, "test-node-0"), - "test-node-1": removablePods(2, "test-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: nil, - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - {}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - "test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - }, - }, - "atomic nodes with pods are not deleted if the node is 
passed as empty": { - nodeGroups: map[string]*testprovider.TestNodeGroup{ - "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), - "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), - }, - emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}, {"atomic-2", 0, 2}}, - pods: map[string][]*apiv1.Pod{ - "test-node-1": removablePods(2, "test-node-1"), - "atomic-2-node-1": removablePods(2, "atomic-2-node-1"), - }, - wantStatus: scaleDownStatusInfo{ - result: status.ScaleDownNodeDeleteStarted, - scaledDownNodes: []scaleDownNodeInfo{ - { - name: "test-node-0", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "test-node-1", - nodeGroup: "test", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - { - name: "atomic-2-node-0", - nodeGroup: "atomic-2", - evictedPods: nil, - utilInfo: generateUtilInfo(0, 0), - }, - { - name: "atomic-2-node-1", - nodeGroup: "atomic-2", - evictedPods: nil, - utilInfo: generateUtilInfo(2./8., 2./8.), - }, - }, - }, - wantDeletedNodes: []string{"test-node-0"}, - wantDeletedPods: nil, - wantTaintUpdates: map[string][][]apiv1.Taint{ - "test-node-0": { - {toBeDeletedTaint}, - }, - "test-node-1": { - {toBeDeletedTaint}, - {}, - }, - "atomic-2-node-0": { - {toBeDeletedTaint}, - {}, - }, - "atomic-2-node-1": { - {toBeDeletedTaint}, - {}, - }, - }, - wantNodeDeleteResults: map[string]status.NodeDeleteResult{ - "test-node-0": {ResultType: status.NodeDeleteOk}, - "test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - "atomic-2-node-0": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, - "atomic-2-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, - }, - }, - } - - filteredTestCases := map[string]startDeletionTestCase{} - for k, v := range testCases { - if force && v.defaultOnly { - continue - } - if !force && v.forcedOnly { - continue - } - filteredTestCases[k+" "+suffix] = v - } - - return filteredTestCases + toBeDeletedTaint := apiv1.Taint{Key: taints.ToBeDeletedTaint, Effect: apiv1.TaintEffectNoSchedule} + + dsUtilInfo := generateUtilInfo(2./8., 2./8.) + + if ignoreDaemonSetsUtilization { + dsUtilInfo = generateUtilInfo(0./8., 0./8.) 
+ } + + testCases := map[string]startDeletionTestCase{ + "nothing to delete": { + emptyNodes: nil, + drainNodes: nil, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNoNodeDeleted, + }, + }, + "empty node deletion": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"test", 0, 2}, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "empty atomic node deletion": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"atomic-2", 0, 2}, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "atomic-2-node-0", + nodeGroup: "atomic-2", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-2-node-1", + nodeGroup: "atomic-2", + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{ + "atomic-2-node-0", + "atomic-2-node-1", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-2-node-0": { + {toBeDeletedTaint}, + }, + "atomic-2-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "atomic-2-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-2-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "deletion with drain": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{ + {"test", 0, 2}, + }, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(2, "test-node-0"), + "test-node-1": removablePods(2, "test-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-0"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-1"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantDeletedPods: []string{"test-node-0-pod-0", "test-node-0-pod-1", "test-node-1-pod-0", "test-node-1-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "empty and drain deletion work correctly together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: 
[]nodeGroupViewInfo{ + {"test", 0, 2}, + }, + drainNodes: []nodeGroupViewInfo{ + {"test", 2, 4}, + }, + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": removablePods(2, "test-node-3"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-3"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{ + "test-node-0", + "test-node-1", + "test-node-2", "test-node-3"}, + wantDeletedPods: []string{"test-node-2-pod-0", "test-node-2-pod-1", "test-node-3-pod-0", "test-node-3-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "two atomic groups can be scaled down together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "atomic-2-mixed": sizedNodeGroup("atomic-2-mixed", 2, true, ignoreDaemonSetsUtilization), + "atomic-2-drain": sizedNodeGroup("atomic-2-drain", 2, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"atomic-2-mixed", 1, 2}, + }, + drainNodes: []nodeGroupViewInfo{ + {"atomic-2-mixed", 0, 1}, + {"atomic-2-drain", 0, 2}, + }, + pods: map[string][]*apiv1.Pod{ + "atomic-2-mixed-node-0": removablePods(2, "atomic-2-mixed-node-0"), + "atomic-2-drain-node-0": removablePods(1, "atomic-2-drain-node-0"), + "atomic-2-drain-node-1": removablePods(2, "atomic-2-drain-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "atomic-2-mixed-node-1", + nodeGroup: "atomic-2-mixed", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-2-mixed-node-0", + nodeGroup: "atomic-2-mixed", + evictedPods: removablePods(2, "atomic-2-mixed-node-0"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "atomic-2-drain-node-0", + nodeGroup: "atomic-2-drain", + evictedPods: removablePods(1, "atomic-2-drain-node-0"), + utilInfo: generateUtilInfo(1./8., 1./8.), + }, + { + name: "atomic-2-drain-node-1", + nodeGroup: "atomic-2-drain", + evictedPods: removablePods(2, "atomic-2-drain-node-1"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"atomic-2-mixed-node-0", "atomic-2-mixed-node-1", "atomic-2-drain-node-0", "atomic-2-drain-node-1"}, + wantDeletedPods: []string{"atomic-2-mixed-node-0-pod-0", "atomic-2-mixed-node-0-pod-1", "atomic-2-drain-node-0-pod-0", "atomic-2-drain-node-1-pod-0", "atomic-2-drain-node-1-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-2-mixed-node-0": { + {toBeDeletedTaint}, + }, + "atomic-2-mixed-node-1": { + {toBeDeletedTaint}, + }, + 
"atomic-2-drain-node-0": { + {toBeDeletedTaint}, + }, + "atomic-2-drain-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "atomic-2-mixed-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-2-mixed-node-1": {ResultType: status.NodeDeleteOk}, + "atomic-2-drain-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-2-drain-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "atomic empty and drain deletion work correctly together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{ + {"atomic-4", 0, 2}, + }, + drainNodes: []nodeGroupViewInfo{ + {"atomic-4", 2, 4}, + }, + pods: map[string][]*apiv1.Pod{ + "atomic-4-node-2": removablePods(2, "atomic-4-node-2"), + "atomic-4-node-3": removablePods(2, "atomic-4-node-3"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "atomic-4-node-0", + nodeGroup: "atomic-4", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-4-node-1", + nodeGroup: "atomic-4", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-4-node-2", + nodeGroup: "atomic-4", + evictedPods: removablePods(2, "atomic-4-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "atomic-4-node-3", + nodeGroup: "atomic-4", + evictedPods: removablePods(2, "atomic-4-node-3"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"atomic-4-node-0", "atomic-4-node-1", "atomic-4-node-2", "atomic-4-node-3"}, + wantDeletedPods: []string{"atomic-4-node-2-pod-0", "atomic-4-node-2-pod-1", "atomic-4-node-3-pod-0", "atomic-4-node-3-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-4-node-0": { + {toBeDeletedTaint}, + }, + "atomic-4-node-1": { + {toBeDeletedTaint}, + }, + "atomic-4-node-2": { + {toBeDeletedTaint}, + }, + "atomic-4-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "atomic-4-node-0": {ResultType: status.NodeDeleteOk}, + "atomic-4-node-1": {ResultType: status.NodeDeleteOk}, + "atomic-4-node-2": {ResultType: status.NodeDeleteOk}, + "atomic-4-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "failure to taint empty node stops deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 4}}, + drainNodes: []nodeGroupViewInfo{{"test", 4, 5}}, + pods: map[string][]*apiv1.Pod{ + "test-node-4": removablePods(2, "test-node-4"), + }, + failedNodeTaint: map[string]bool{"test-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + {}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + "test-node-3": { + {toBeDeletedTaint}, + {}, + }, + }, + wantErr: cmpopts.AnyError, + }, + "failure to taint empty atomic node stops deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + "atomic-4": sizedNodeGroup("atomic-4", 4, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"atomic-4", 0, 4}}, + drainNodes: []nodeGroupViewInfo{{"test", 
4, 5}}, + pods: map[string][]*apiv1.Pod{ + "test-node-4": removablePods(2, "test-node-4"), + }, + failedNodeTaint: map[string]bool{"atomic-4-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "atomic-4-node-0": { + {toBeDeletedTaint}, + {}, + }, + "atomic-4-node-1": { + {toBeDeletedTaint}, + {}, + }, + "atomic-4-node-3": { + {toBeDeletedTaint}, + {}, + }, + }, + wantErr: cmpopts.AnyError, + }, + "failure to taint drain node stops further deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, //generateNodeGroupViewList(testNg, 0, 2), + drainNodes: []nodeGroupViewInfo{{"test", 2, 6}}, //generateNodeGroupViewList(testNg, 2, 6), + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": removablePods(2, "test-node-3"), + "test-node-4": removablePods(2, "test-node-4"), + "test-node-5": removablePods(2, "test-node-5"), + }, + failedNodeTaint: map[string]bool{"test-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + wantErr: cmpopts.AnyError, + }, + "failure to taint drain atomic node stops further deletion and cleans already applied taints": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + "atomic-6": sizedNodeGroup("atomic-6", 6, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + drainNodes: []nodeGroupViewInfo{{"atomic-6", 0, 6}}, + pods: map[string][]*apiv1.Pod{ + "atomic-6-node-0": removablePods(2, "atomic-6-node-0"), + "atomic-6-node-1": removablePods(2, "atomic-6-node-1"), + "atomic-6-node-2": removablePods(2, "atomic-6-node-2"), + "atomic-6-node-3": removablePods(2, "atomic-6-node-3"), + "atomic-6-node-4": removablePods(2, "atomic-6-node-4"), + "atomic-6-node-5": removablePods(2, "atomic-6-node-5"), + }, + failedNodeTaint: map[string]bool{"atomic-6-node-2": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownError, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + wantErr: cmpopts.AnyError, + }, + "nodes 
that failed drain are correctly reported in results": { + defaultOnly: true, + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(3, "test-node-0"), + "test-node-1": removablePods(3, "test-node-1"), + "test-node-2": removablePods(3, "test-node-2"), + "test-node-3": removablePods(3, "test-node-3"), + }, + failedPodDrain: map[string]bool{ + "test-node-0-pod-0": true, + "test-node-0-pod-1": true, + "test-node-2-pod-1": true, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-0"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-1"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-2"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-3"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-1", "test-node-3"}, + wantDeletedPods: []string{ + "test-node-0-pod-2", + "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", + "test-node-2-pod-0", "test-node-2-pod-2", + "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + {}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + {}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": { + ResultType: status.NodeDeleteErrorFailedToEvictPods, + Err: cmpopts.AnyError, + PodEvictionResults: map[string]status.PodEvictionResult{ + "test-node-0-pod-0": {Pod: removablePod("test-node-0-pod-0", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, + "test-node-0-pod-1": {Pod: removablePod("test-node-0-pod-1", "test-node-0"), Err: cmpopts.AnyError, TimedOut: true}, + "test-node-0-pod-2": {Pod: removablePod("test-node-0-pod-2", "test-node-0")}, + }, + }, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": { + ResultType: status.NodeDeleteErrorFailedToEvictPods, + Err: cmpopts.AnyError, + PodEvictionResults: map[string]status.PodEvictionResult{ + "test-node-2-pod-0": {Pod: removablePod("test-node-2-pod-0", "test-node-2")}, + "test-node-2-pod-1": {Pod: removablePod("test-node-2-pod-1", "test-node-2"), Err: cmpopts.AnyError, TimedOut: true}, + "test-node-2-pod-2": {Pod: removablePod("test-node-2-pod-2", "test-node-2")}, + }, + }, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "nodes that failed drain are forcefully deleted": { + forcedOnly: true, + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{{"test", 0, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(3, "test-node-0"), + "test-node-1": removablePods(3, "test-node-1"), + "test-node-2": removablePods(3, "test-node-2"), + "test-node-3": removablePods(3, "test-node-3"), + }, + failedPodDrain: map[string]bool{ + "test-node-0-pod-0": true, + 
"test-node-0-pod-1": true, + "test-node-2-pod-1": true, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-0"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-1"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-2"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(3, "test-node-3"), + utilInfo: generateUtilInfo(3./8., 3./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, + wantDeletedPods: []string{ + "test-node-0-pod-2", + "test-node-1-pod-0", "test-node-1-pod-1", "test-node-1-pod-2", + "test-node-2-pod-0", "test-node-2-pod-2", + "test-node-3-pod-0", "test-node-3-pod-1", "test-node-3-pod-2", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "nodes that failed deletion are correctly reported in results": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": removablePods(2, "test-node-3"), + }, + failedNodeDeletion: map[string]bool{ + "test-node-1": true, + "test-node-3": true, + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-2"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-3"), + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-2"}, + wantDeletedPods: []string{ + "test-node-2-pod-0", "test-node-2-pod-1", + "test-node-3-pod-0", "test-node-3-pod-1", + }, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + {}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: 
cmpopts.AnyError}, + }, + }, + "DS pods are evicted from empty nodes, but don't block deletion on error": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": generateDsPods(2, "test-node-0"), + "test-node-1": generateDsPods(2, "test-node-1"), + }, + failedPodDrain: map[string]bool{"test-node-1-ds-pod-0": true}, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + wantDeletedPods: []string{"test-node-0-ds-pod-0", "test-node-0-ds-pod-1", "test-node-1-ds-pod-1"}, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "DS pods and deletion with drain": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + drainNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": generateDsPods(2, "test-node-0"), + "test-node-1": generateDsPods(2, "test-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + // this is nil because DaemonSetEvictionForOccupiedNodes is + // not enabled for drained nodes in this test suite + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + { + name: "test-node-1", + nodeGroup: "test", + // this is nil because DaemonSetEvictionForOccupiedNodes is + // not enabled for drained nodes in this test suite + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1"}, + // same as evicted pods + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + }, + }, + "DS pods and empty and drain deletion work correctly together": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + drainNodes: []nodeGroupViewInfo{{"test", 2, 4}}, + pods: map[string][]*apiv1.Pod{ + "test-node-2": removablePods(2, "test-node-2"), + "test-node-3": generateDsPods(2, "test-node-3"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-2", + nodeGroup: "test", + evictedPods: removablePods(2, "test-node-2"), + utilInfo: 
generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-3", + nodeGroup: "test", + evictedPods: nil, + utilInfo: dsUtilInfo, + }, + }, + }, + wantDeletedNodes: []string{"test-node-0", "test-node-1", "test-node-2", "test-node-3"}, + // same as evicted pods + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + }, + "test-node-2": { + {toBeDeletedTaint}, + }, + "test-node-3": { + {toBeDeletedTaint}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + "test-node-1": {ResultType: status.NodeDeleteOk}, + "test-node-2": {ResultType: status.NodeDeleteOk}, + "test-node-3": {ResultType: status.NodeDeleteOk}, + }, + }, + "nodes with pods are not deleted if the node is passed as empty": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-0": removablePods(2, "test-node-0"), + "test-node-1": removablePods(2, "test-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: nil, + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + {}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + "test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + }, + }, + "atomic nodes with pods are not deleted if the node is passed as empty": { + nodeGroups: map[string]*testprovider.TestNodeGroup{ + "test": sizedNodeGroup("test", 3, false, ignoreDaemonSetsUtilization), + "atomic-2": sizedNodeGroup("atomic-2", 2, true, ignoreDaemonSetsUtilization), + }, + emptyNodes: []nodeGroupViewInfo{{"test", 0, 2}, {"atomic-2", 0, 2}}, + pods: map[string][]*apiv1.Pod{ + "test-node-1": removablePods(2, "test-node-1"), + "atomic-2-node-1": removablePods(2, "atomic-2-node-1"), + }, + wantStatus: scaleDownStatusInfo{ + result: status.ScaleDownNodeDeleteStarted, + scaledDownNodes: []scaleDownNodeInfo{ + { + name: "test-node-0", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "test-node-1", + nodeGroup: "test", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + { + name: "atomic-2-node-0", + nodeGroup: "atomic-2", + evictedPods: nil, + utilInfo: generateUtilInfo(0, 0), + }, + { + name: "atomic-2-node-1", + nodeGroup: "atomic-2", + evictedPods: nil, + utilInfo: generateUtilInfo(2./8., 2./8.), + }, + }, + }, + wantDeletedNodes: []string{"test-node-0"}, + wantDeletedPods: nil, + wantTaintUpdates: map[string][][]apiv1.Taint{ + "test-node-0": { + {toBeDeletedTaint}, + }, + "test-node-1": { + {toBeDeletedTaint}, + {}, + }, + "atomic-2-node-0": { + {toBeDeletedTaint}, + {}, + }, + "atomic-2-node-1": { + {toBeDeletedTaint}, + {}, + }, + }, + wantNodeDeleteResults: map[string]status.NodeDeleteResult{ + "test-node-0": {ResultType: status.NodeDeleteOk}, + 
"test-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + "atomic-2-node-0": {ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}, + "atomic-2-node-1": {ResultType: status.NodeDeleteErrorInternal, Err: cmpopts.AnyError}, + }, + }, + } + + filteredTestCases := map[string]startDeletionTestCase{} + for k, v := range testCases { + if force && v.defaultOnly { + continue + } + if !force && v.forcedOnly { + continue + } + filteredTestCases[k+" "+suffix] = v + } + + return filteredTestCases } func runStartDeletionTest(t *testing.T, tc startDeletionTestCase, force bool) { - // Insert all nodes into a map to support live node updates and GETs. - emptyNodeGroupViews, drainNodeGroupViews := []*budgets.NodeGroupView{}, []*budgets.NodeGroupView{} - allEmptyNodes, allDrainNodes := []*apiv1.Node{}, []*apiv1.Node{} - nodesByName := make(map[string]*apiv1.Node) - nodesLock := sync.Mutex{} - for _, ngvInfo := range tc.emptyNodes { - ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) - emptyNodeGroupViews = append(emptyNodeGroupViews, ngv...) - } - for _, bucket := range emptyNodeGroupViews { - allEmptyNodes = append(allEmptyNodes, bucket.Nodes...) - for _, node := range bucket.Nodes { - nodesByName[node.Name] = node - } - } - - for _, ngvInfo := range tc.drainNodes { - ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) - drainNodeGroupViews = append(drainNodeGroupViews, ngv...) - } - for _, bucket := range drainNodeGroupViews { - allDrainNodes = append(allDrainNodes, bucket.Nodes...) - for _, node := range bucket.Nodes { - nodesByName[node.Name] = node - } - } - - // Set up a fake k8s client to hook and verify certain actions. - fakeClient := &fake.Clientset{} - type nodeTaints struct { - nodeName string - taints []apiv1.Taint - } - taintUpdates := make(chan nodeTaints, 20) - deletedNodes := make(chan string, 10) - deletedPods := make(chan string, 10) - - ds := generateDaemonSet() - - // We're faking the whole k8s client, and some of the code needs to get live nodes and pods, so GET on nodes and pods has to be set up. - fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { - nodesLock.Lock() - defer nodesLock.Unlock() - getAction := action.(core.GetAction) - node, found := nodesByName[getAction.GetName()] - if !found { - return true, nil, fmt.Errorf("node %q not found", getAction.GetName()) - } - return true, node, nil - }) - fakeClient.Fake.AddReactor("get", "pods", - func(action core.Action) (bool, runtime.Object, error) { - return true, nil, errors.NewNotFound(apiv1.Resource("pod"), "whatever") - }) - // Hook node update to gather all taint updates, and to fail the update for certain nodes to simulate errors. - fakeClient.Fake.AddReactor("update", "nodes", - func(action core.Action) (bool, runtime.Object, error) { - nodesLock.Lock() - defer nodesLock.Unlock() - update := action.(core.UpdateAction) - obj := update.GetObject().(*apiv1.Node) - if tc.failedNodeTaint[obj.Name] { - return true, nil, fmt.Errorf("SIMULATED ERROR: won't taint") - } - nt := nodeTaints{ - nodeName: obj.Name, - } - for _, taint := range obj.Spec.Taints { - nt.taints = append(nt.taints, taint) - } - taintUpdates <- nt - nodesByName[obj.Name] = obj.DeepCopy() - return true, obj, nil - }) - // Hook eviction creation to gather which pods were evicted, and to fail the eviction for certain pods to simulate errors. 
- fakeClient.Fake.AddReactor("create", "pods", - func(action core.Action) (bool, runtime.Object, error) { - createAction := action.(core.CreateAction) - if createAction == nil { - return false, nil, nil - } - eviction := createAction.GetObject().(*policyv1beta1.Eviction) - if eviction == nil { - return false, nil, nil - } - if tc.failedPodDrain[eviction.Name] { - return true, nil, fmt.Errorf("SIMULATED ERROR: won't evict") - } - deletedPods <- eviction.Name - return true, nil, nil - }) - - // Hook node deletion at the level of cloud provider, to gather which nodes were deleted, and to fail the deletion for - // certain nodes to simulate errors. - provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroup string, node string) error { - if tc.failedNodeDeletion[node] { - return fmt.Errorf("SIMULATED ERROR: won't remove node") - } - deletedNodes <- node - return nil - }).Build() - for _, bucket := range emptyNodeGroupViews { - bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) - provider.InsertNodeGroup(bucket.Group) - for _, node := range bucket.Nodes { - provider.AddNode(bucket.Group.Id(), node) - } - } - for _, bucket := range drainNodeGroupViews { - bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) - provider.InsertNodeGroup(bucket.Group) - for _, node := range bucket.Nodes { - provider.AddNode(bucket.Group.Id(), node) - } - } - - // Set up other needed structures and options. - opts := config.AutoscalingOptions{ - MaxScaleDownParallelism: 10, - MaxDrainParallelism: 5, - MaxPodEvictionTime: 0, - DaemonSetEvictionForEmptyNodes: true, - } - - allPods := []*apiv1.Pod{} - - for _, pods := range tc.pods { - allPods = append(allPods, pods...) - } - - podLister := kube_util.NewTestPodLister(allPods) - pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{}) - dsLister, err := kube_util.NewTestDaemonSetLister([]*appsv1.DaemonSet{ds}) - if err != nil { - t.Fatalf("Couldn't create daemonset lister") - } - - registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, dsLister, nil, nil, nil, nil) - ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil) - if err != nil { - t.Fatalf("Couldn't set up autoscaling context: %v", err) - } - csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker()) - for _, bucket := range emptyNodeGroupViews { - for _, node := range bucket.Nodes { - err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, tc.pods[node.Name]...)) - if err != nil { - t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) - } - } - } - for _, bucket := range drainNodeGroupViews { - for _, node := range bucket.Nodes { - pods, found := tc.pods[node.Name] - if !found { - t.Fatalf("Drain node %q doesn't have pods defined in the test case.", node.Name) - } - err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, pods...)) - if err != nil { - t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) - } - } - } - - wantScaleDownNodes := []*status.ScaleDownNode{} - for _, scaleDownNodeInfo := range tc.wantStatus.scaledDownNodes { - statusScaledDownNode := &status.ScaleDownNode{ - Node: generateNode(scaleDownNodeInfo.name), - NodeGroup: 
tc.nodeGroups[scaleDownNodeInfo.nodeGroup], - EvictedPods: scaleDownNodeInfo.evictedPods, - UtilInfo: scaleDownNodeInfo.utilInfo, - } - wantScaleDownNodes = append(wantScaleDownNodes, statusScaledDownNode) - } - - scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList() - scaleStateNotifier.Register(csr) - - // Create Actuator, run StartDeletion, and verify the error. - ndt := deletiontracker.NewNodeDeletionTracker(0) - ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, 0*time.Second) - legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, fullDsEviction: force} - fakeNodeLatencyTracker := &fakeLatencyTracker{} - actuator := Actuator{ - ctx: &ctx, nodeDeletionTracker: ndt, - nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), - budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), - configGetter: nodegroupconfig.NewDefaultNodeGroupConfigProcessor(ctx.NodeGroupDefaults), - nodeLatencyTracker: fakeNodeLatencyTracker, - } - - var gotResult status.ScaleDownResult - var gotScaleDownNodes []*status.ScaleDownNode - var gotErr error - if force { - gotResult, gotScaleDownNodes, gotErr = actuator.StartForceDeletion(allEmptyNodes, allDrainNodes) - } else { - gotResult, gotScaleDownNodes, gotErr = actuator.StartDeletion(allEmptyNodes, allDrainNodes) - } - - if diff := cmp.Diff(tc.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { - t.Errorf("StartDeletion error diff (-want +got):\n%s", diff) - } - - // Verify ScaleDownResult looks as expected. - if diff := cmp.Diff(tc.wantStatus.result, gotResult); diff != "" { - t.Errorf("StartDeletion result diff (-want +got):\n%s", diff) - } - - // Verify ScaleDownNodes looks as expected. - ignoreSdNodeOrder := cmpopts.SortSlices(func(a, b *status.ScaleDownNode) bool { return a.Node.Name < b.Node.Name }) - cmpNg := cmp.Comparer(func(a, b *testprovider.TestNodeGroup) bool { return a.Id() == b.Id() }) - statusCmpOpts := cmp.Options{ignoreSdNodeOrder, cmpNg, cmpopts.EquateEmpty()} - if diff := cmp.Diff(wantScaleDownNodes, gotScaleDownNodes, statusCmpOpts); diff != "" { - t.Errorf("StartDeletion scaled down nodes diff (-want +got):\n%s", diff) - } - - // Verify that all expected nodes were deleted using the cloud provider hook. - var gotDeletedNodes []string + // Insert all nodes into a map to support live node updates and GETs. + emptyNodeGroupViews, drainNodeGroupViews := []*budgets.NodeGroupView{}, []*budgets.NodeGroupView{} + allEmptyNodes, allDrainNodes := []*apiv1.Node{}, []*apiv1.Node{} + nodesByName := make(map[string]*apiv1.Node) + nodesLock := sync.Mutex{} + for _, ngvInfo := range tc.emptyNodes { + ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) + emptyNodeGroupViews = append(emptyNodeGroupViews, ngv...) + } + for _, bucket := range emptyNodeGroupViews { + allEmptyNodes = append(allEmptyNodes, bucket.Nodes...) + for _, node := range bucket.Nodes { + nodesByName[node.Name] = node + } + } + + for _, ngvInfo := range tc.drainNodes { + ngv := generateNodeGroupViewList(tc.nodeGroups[ngvInfo.nodeGroupName], ngvInfo.from, ngvInfo.to) + drainNodeGroupViews = append(drainNodeGroupViews, ngv...) + } + for _, bucket := range drainNodeGroupViews { + allDrainNodes = append(allDrainNodes, bucket.Nodes...) 
+ for _, node := range bucket.Nodes { + nodesByName[node.Name] = node + } + } + + // Set up a fake k8s client to hook and verify certain actions. + fakeClient := &fake.Clientset{} + type nodeTaints struct { + nodeName string + taints []apiv1.Taint + } + taintUpdates := make(chan nodeTaints, 20) + deletedNodes := make(chan string, 10) + deletedPods := make(chan string, 10) + + ds := generateDaemonSet() + + // We're faking the whole k8s client, and some of the code needs to get live nodes and pods, so GET on nodes and pods has to be set up. + fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { + nodesLock.Lock() + defer nodesLock.Unlock() + getAction := action.(core.GetAction) + node, found := nodesByName[getAction.GetName()] + if !found { + return true, nil, fmt.Errorf("node %q not found", getAction.GetName()) + } + return true, node, nil + }) + fakeClient.Fake.AddReactor("get", "pods", + func(action core.Action) (bool, runtime.Object, error) { + return true, nil, errors.NewNotFound(apiv1.Resource("pod"), "whatever") + }) + // Hook node update to gather all taint updates, and to fail the update for certain nodes to simulate errors. + fakeClient.Fake.AddReactor("update", "nodes", + func(action core.Action) (bool, runtime.Object, error) { + nodesLock.Lock() + defer nodesLock.Unlock() + update := action.(core.UpdateAction) + obj := update.GetObject().(*apiv1.Node) + if tc.failedNodeTaint[obj.Name] { + return true, nil, fmt.Errorf("SIMULATED ERROR: won't taint") + } + nt := nodeTaints{ + nodeName: obj.Name, + } + for _, taint := range obj.Spec.Taints { + nt.taints = append(nt.taints, taint) + } + taintUpdates <- nt + nodesByName[obj.Name] = obj.DeepCopy() + return true, obj, nil + }) + // Hook eviction creation to gather which pods were evicted, and to fail the eviction for certain pods to simulate errors. + fakeClient.Fake.AddReactor("create", "pods", + func(action core.Action) (bool, runtime.Object, error) { + createAction := action.(core.CreateAction) + if createAction == nil { + return false, nil, nil + } + eviction := createAction.GetObject().(*policyv1beta1.Eviction) + if eviction == nil { + return false, nil, nil + } + if tc.failedPodDrain[eviction.Name] { + return true, nil, fmt.Errorf("SIMULATED ERROR: won't evict") + } + deletedPods <- eviction.Name + return true, nil, nil + }) + + // Hook node deletion at the level of cloud provider, to gather which nodes were deleted, and to fail the deletion for + // certain nodes to simulate errors. + provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroup string, node string) error { + if tc.failedNodeDeletion[node] { + return fmt.Errorf("SIMULATED ERROR: won't remove node") + } + deletedNodes <- node + return nil + }).Build() + for _, bucket := range emptyNodeGroupViews { + bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) + provider.InsertNodeGroup(bucket.Group) + for _, node := range bucket.Nodes { + provider.AddNode(bucket.Group.Id(), node) + } + } + for _, bucket := range drainNodeGroupViews { + bucket.Group.(*testprovider.TestNodeGroup).SetCloudProvider(provider) + provider.InsertNodeGroup(bucket.Group) + for _, node := range bucket.Nodes { + provider.AddNode(bucket.Group.Id(), node) + } + } + + // Set up other needed structures and options. 
+ opts := config.AutoscalingOptions{ + MaxScaleDownParallelism: 10, + MaxDrainParallelism: 5, + MaxPodEvictionTime: 0, + DaemonSetEvictionForEmptyNodes: true, + } + + allPods := []*apiv1.Pod{} + + for _, pods := range tc.pods { + allPods = append(allPods, pods...) + } + + podLister := kube_util.NewTestPodLister(allPods) + pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{}) + dsLister, err := kube_util.NewTestDaemonSetLister([]*appsv1.DaemonSet{ds}) + if err != nil { + t.Fatalf("Couldn't create daemonset lister") + } + + registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, dsLister, nil, nil, nil, nil) + ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil) + if err != nil { + t.Fatalf("Couldn't set up autoscaling context: %v", err) + } + csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker()) + for _, bucket := range emptyNodeGroupViews { + for _, node := range bucket.Nodes { + err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, tc.pods[node.Name]...)) + if err != nil { + t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) + } + } + } + for _, bucket := range drainNodeGroupViews { + for _, node := range bucket.Nodes { + pods, found := tc.pods[node.Name] + if !found { + t.Fatalf("Drain node %q doesn't have pods defined in the test case.", node.Name) + } + err := ctx.ClusterSnapshot.AddNodeInfo(framework.NewTestNodeInfo(node, pods...)) + if err != nil { + t.Fatalf("Couldn't add node %q to snapshot: %v", node.Name, err) + } + } + } + + wantScaleDownNodes := []*status.ScaleDownNode{} + for _, scaleDownNodeInfo := range tc.wantStatus.scaledDownNodes { + statusScaledDownNode := &status.ScaleDownNode{ + Node: generateNode(scaleDownNodeInfo.name), + NodeGroup: tc.nodeGroups[scaleDownNodeInfo.nodeGroup], + EvictedPods: scaleDownNodeInfo.evictedPods, + UtilInfo: scaleDownNodeInfo.utilInfo, + } + wantScaleDownNodes = append(wantScaleDownNodes, statusScaledDownNode) + } + + scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList() + scaleStateNotifier.Register(csr) + + // Create Actuator, run StartDeletion, and verify the error. 
+ ndt := deletiontracker.NewNodeDeletionTracker(0) + ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, 0*time.Second) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, fullDsEviction: force} + fakeNodeLatencyTracker := &fakeLatencyTracker{} + actuator := Actuator{ + ctx: &ctx, nodeDeletionTracker: ndt, + nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), + budgetProcessor: budgets.NewScaleDownBudgetProcessor(&ctx), + configGetter: nodegroupconfig.NewDefaultNodeGroupConfigProcessor(ctx.NodeGroupDefaults), + nodeLatencyTracker: fakeNodeLatencyTracker, + } + + var gotResult status.ScaleDownResult + var gotScaleDownNodes []*status.ScaleDownNode + var gotErr error + if force { + gotResult, gotScaleDownNodes, gotErr = actuator.StartForceDeletion(allEmptyNodes, allDrainNodes) + } else { + gotResult, gotScaleDownNodes, gotErr = actuator.StartDeletion(allEmptyNodes, allDrainNodes) + } + + if diff := cmp.Diff(tc.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { + t.Errorf("StartDeletion error diff (-want +got):\n%s", diff) + } + + // Verify ScaleDownResult looks as expected. + if diff := cmp.Diff(tc.wantStatus.result, gotResult); diff != "" { + t.Errorf("StartDeletion result diff (-want +got):\n%s", diff) + } + + // Verify ScaleDownNodes looks as expected. + ignoreSdNodeOrder := cmpopts.SortSlices(func(a, b *status.ScaleDownNode) bool { return a.Node.Name < b.Node.Name }) + cmpNg := cmp.Comparer(func(a, b *testprovider.TestNodeGroup) bool { return a.Id() == b.Id() }) + statusCmpOpts := cmp.Options{ignoreSdNodeOrder, cmpNg, cmpopts.EquateEmpty()} + if diff := cmp.Diff(wantScaleDownNodes, gotScaleDownNodes, statusCmpOpts); diff != "" { + t.Errorf("StartDeletion scaled down nodes diff (-want +got):\n%s", diff) + } + + // Verify that all expected nodes were deleted using the cloud provider hook. + var gotDeletedNodes []string nodesLoop: - for i := 0; i < len(tc.wantDeletedNodes); i++ { - select { - case deletedNode := <-deletedNodes: - gotDeletedNodes = append(gotDeletedNodes, deletedNode) - case <-time.After(3 * time.Second): - t.Errorf("Timeout while waiting for deleted nodes.") - break nodesLoop - } - } - ignoreStrOrder := cmpopts.SortSlices(func(a, b string) bool { return a < b }) - if diff := cmp.Diff(tc.wantDeletedNodes, gotDeletedNodes, ignoreStrOrder); diff != "" { - t.Errorf("deletedNodes diff (-want +got):\n%s", diff) - } - - // Verify that all expected pods were deleted using the fake k8s client hook. - var gotDeletedPods []string + for i := 0; i < len(tc.wantDeletedNodes); i++ { + select { + case deletedNode := <-deletedNodes: + gotDeletedNodes = append(gotDeletedNodes, deletedNode) + case <-time.After(3 * time.Second): + t.Errorf("Timeout while waiting for deleted nodes.") + break nodesLoop + } + } + ignoreStrOrder := cmpopts.SortSlices(func(a, b string) bool { return a < b }) + if diff := cmp.Diff(tc.wantDeletedNodes, gotDeletedNodes, ignoreStrOrder); diff != "" { + t.Errorf("deletedNodes diff (-want +got):\n%s", diff) + } + + // Verify that all expected pods were deleted using the fake k8s client hook. 
+ var gotDeletedPods []string podsLoop: - for i := 0; i < len(tc.wantDeletedPods); i++ { - select { - case deletedPod := <-deletedPods: - gotDeletedPods = append(gotDeletedPods, deletedPod) - case <-time.After(3 * time.Second): - t.Errorf("Timeout while waiting for deleted pods.") - break podsLoop - } - } - if diff := cmp.Diff(tc.wantDeletedPods, gotDeletedPods, ignoreStrOrder); diff != "" { - t.Errorf("deletedPods diff (-want +got):\n%s", diff) - } - - // Verify that all expected taint updates happened using the fake k8s client hook. - allUpdatesCount := 0 - for _, updates := range tc.wantTaintUpdates { - allUpdatesCount += len(updates) - } - gotTaintUpdates := make(map[string][][]apiv1.Taint) + for i := 0; i < len(tc.wantDeletedPods); i++ { + select { + case deletedPod := <-deletedPods: + gotDeletedPods = append(gotDeletedPods, deletedPod) + case <-time.After(3 * time.Second): + t.Errorf("Timeout while waiting for deleted pods.") + break podsLoop + } + } + if diff := cmp.Diff(tc.wantDeletedPods, gotDeletedPods, ignoreStrOrder); diff != "" { + t.Errorf("deletedPods diff (-want +got):\n%s", diff) + } + + // Verify that all expected taint updates happened using the fake k8s client hook. + allUpdatesCount := 0 + for _, updates := range tc.wantTaintUpdates { + allUpdatesCount += len(updates) + } + gotTaintUpdates := make(map[string][][]apiv1.Taint) taintsLoop: - for i := 0; i < allUpdatesCount; i++ { - select { - case taintUpdate := <-taintUpdates: - gotTaintUpdates[taintUpdate.nodeName] = append(gotTaintUpdates[taintUpdate.nodeName], taintUpdate.taints) - case <-time.After(3 * time.Second): - t.Errorf("Timeout while waiting for taint updates.") - break taintsLoop - } - } - startupTaintValue := cmpopts.IgnoreFields(apiv1.Taint{}, "Value") - if diff := cmp.Diff(tc.wantTaintUpdates, gotTaintUpdates, startupTaintValue, cmpopts.EquateEmpty()); diff != "" { - t.Errorf("taintUpdates diff (-want +got):\n%s", diff) - } - - // Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after the deletion - // in cloud provider we sync to above and so this will usually not wait at all. However, it can still happen - // that there is a delay between cloud provider deletion and reporting, in which case the results are not there yet - // and we need to wait for them before asserting. - err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond) - if err != nil { - t.Errorf("Timeout while waiting for node deletion results") - } - - // Gather node deletion results for deletions started in the previous call, and verify that they look as expected. 
- nodeDeleteResults, _ := actuator.DeletionResults() - if diff := cmp.Diff(tc.wantNodeDeleteResults, nodeDeleteResults, cmpopts.EquateEmpty(), cmpopts.EquateErrors()); diff != "" { - t.Errorf("NodeDeleteResults diff (-want +got):\n%s", diff) - } - // Verify ObserveDeletion was called for all nodes that were actually deleted - for _, expectedNode := range tc.wantDeletedNodes { - found := false - for _, observed := range fakeNodeLatencyTracker.ObservedNodes { - if observed == expectedNode { - found = true - break - } - } - if !found { - t.Errorf("Expected ObserveDeletion to be called for node %s, but it wasn't", expectedNode) - } - } + for i := 0; i < allUpdatesCount; i++ { + select { + case taintUpdate := <-taintUpdates: + gotTaintUpdates[taintUpdate.nodeName] = append(gotTaintUpdates[taintUpdate.nodeName], taintUpdate.taints) + case <-time.After(3 * time.Second): + t.Errorf("Timeout while waiting for taint updates.") + break taintsLoop + } + } + startupTaintValue := cmpopts.IgnoreFields(apiv1.Taint{}, "Value") + if diff := cmp.Diff(tc.wantTaintUpdates, gotTaintUpdates, startupTaintValue, cmpopts.EquateEmpty()); diff != "" { + t.Errorf("taintUpdates diff (-want +got):\n%s", diff) + } + + // Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after the deletion + // in cloud provider we sync to above and so this will usually not wait at all. However, it can still happen + // that there is a delay between cloud provider deletion and reporting, in which case the results are not there yet + // and we need to wait for them before asserting. + err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond) + if err != nil { + t.Errorf("Timeout while waiting for node deletion results") + } + + // Gather node deletion results for deletions started in the previous call, and verify that they look as expected. 
+ nodeDeleteResults, _ := actuator.DeletionResults() + if diff := cmp.Diff(tc.wantNodeDeleteResults, nodeDeleteResults, cmpopts.EquateEmpty(), cmpopts.EquateErrors()); diff != "" { + t.Errorf("NodeDeleteResults diff (-want +got):\n%s", diff) + } + // Verify ObserveDeletion was called for all nodes that were actually deleted + for _, expectedNode := range tc.wantDeletedNodes { + found := false + for _, observed := range fakeNodeLatencyTracker.ObservedNodes { + if observed == expectedNode { + found = true + break + } + } + if !found { + t.Errorf("Expected ObserveDeletion to be called for node %s, but it wasn't", expectedNode) + } + } } func TestStartDeletion(t *testing.T) { - testSets := []map[string]startDeletionTestCase{ - // IgnoreDaemonSetsUtilization is false - getStartDeletionTestCases(false, false, "testNg1"), - // IgnoreDaemonSetsUtilization is true - getStartDeletionTestCases(true, false, "testNg2"), - } - - for _, testSet := range testSets { - for tn, tc := range testSet { - t.Run(tn, func(t *testing.T) { - runStartDeletionTest(t, tc, false) - }) - } - } + testSets := []map[string]startDeletionTestCase{ + // IgnoreDaemonSetsUtilization is false + getStartDeletionTestCases(false, false, "testNg1"), + // IgnoreDaemonSetsUtilization is true + getStartDeletionTestCases(true, false, "testNg2"), + } + + for _, testSet := range testSets { + for tn, tc := range testSet { + t.Run(tn, func(t *testing.T) { + runStartDeletionTest(t, tc, false) + }) + } + } } func TestStartForceDeletion(t *testing.T) { - testSets := []map[string]startDeletionTestCase{ - // IgnoreDaemonSetsUtilization is false - getStartDeletionTestCases(false, true, "testNg1"), - // IgnoreDaemonSetsUtilization is true - getStartDeletionTestCases(true, true, "testNg2"), - } - - for _, testSet := range testSets { - for tn, tc := range testSet { - t.Run(tn, func(t *testing.T) { - runStartDeletionTest(t, tc, true) - }) - } - } + testSets := []map[string]startDeletionTestCase{ + // IgnoreDaemonSetsUtilization is false + getStartDeletionTestCases(false, true, "testNg1"), + // IgnoreDaemonSetsUtilization is true + getStartDeletionTestCases(true, true, "testNg2"), + } + + for _, testSet := range testSets { + for tn, tc := range testSet { + t.Run(tn, func(t *testing.T) { + runStartDeletionTest(t, tc, true) + }) + } + } } func TestStartDeletionInBatchBasic(t *testing.T) { - deleteInterval := 1 * time.Second - - for _, test := range []struct { - name string - deleteCalls int - numNodesToDelete map[string][]int //per node group and per call - failedRequests map[string]bool //per node group - wantSuccessfulDeletion map[string]int //per node group - }{ - { - name: "Succesfull deletion for all node group", - deleteCalls: 1, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - wantSuccessfulDeletion: map[string]int{ - "test-ng-1": 4, - "test-ng-2": 5, - "test-ng-3": 1, - }, - }, - { - name: "Node deletion failed for one group", - deleteCalls: 1, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - failedRequests: map[string]bool{ - "test-ng-1": true, - }, - wantSuccessfulDeletion: map[string]int{ - "test-ng-1": 0, - "test-ng-2": 5, - "test-ng-3": 1, - }, - }, - { - name: "Node deletion failed for one group two times", - deleteCalls: 2, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4, 3}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - failedRequests: map[string]bool{ - "test-ng-1": true, - }, - wantSuccessfulDeletion: map[string]int{ 
- "test-ng-1": 0, - "test-ng-2": 5, - "test-ng-3": 1, - }, - }, - { - name: "Node deletion failed for all groups", - deleteCalls: 2, - numNodesToDelete: map[string][]int{ - "test-ng-1": {4, 3}, - "test-ng-2": {5}, - "test-ng-3": {1}, - }, - failedRequests: map[string]bool{ - "test-ng-1": true, - "test-ng-2": true, - "test-ng-3": true, - }, - wantSuccessfulDeletion: map[string]int{ - "test-ng-1": 0, - "test-ng-2": 0, - "test-ng-3": 0, - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - test := test - gotFailedRequest := func(nodeGroupId string) bool { - val, _ := test.failedRequests[nodeGroupId] - return val - } - deletedResult := make(chan string) - fakeClient := &fake.Clientset{} - provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroupId string, node string) error { - if gotFailedRequest(nodeGroupId) { - return fmt.Errorf("SIMULATED ERROR: won't remove node") - } - deletedResult <- nodeGroupId - return nil - }).Build() - // 2d array represent the waves of pushing nodes to delete. - deleteNodes := [][]*apiv1.Node{} - - for i := 0; i < test.deleteCalls; i++ { - deleteNodes = append(deleteNodes, []*apiv1.Node{}) - } - testNg1 := testprovider.NewTestNodeGroup("test-ng-1", 0, 100, 3, true, false, "n1-standard-2", nil, nil) - testNg2 := testprovider.NewTestNodeGroup("test-ng-2", 0, 100, 3, true, false, "n1-standard-2", nil, nil) - testNg3 := testprovider.NewTestNodeGroup("test-ng-3", 0, 100, 3, true, false, "n1-standard-2", nil, nil) - testNg := map[string]*testprovider.TestNodeGroup{ - "test-ng-1": testNg1, - "test-ng-2": testNg2, - "test-ng-3": testNg3, - } - - for ngName, numNodes := range test.numNodesToDelete { - ng := testNg[ngName] - provider.InsertNodeGroup(ng) - ng.SetCloudProvider(provider) - for i, num := range numNodes { - singleBucketList := generateNodeGroupViewList(ng, 0, num) - bucket := singleBucketList[0] - deleteNodes[i] = append(deleteNodes[i], bucket.Nodes...) 
-					for _, node := range bucket.Nodes {
-						provider.AddNode(bucket.Group.Id(), node)
-					}
-				}
-			}
-			opts := config.AutoscalingOptions{
-				MaxScaleDownParallelism:        10,
-				MaxDrainParallelism:            5,
-				MaxPodEvictionTime:             0,
-				DaemonSetEvictionForEmptyNodes: true,
-			}
-
-			podLister := kube_util.NewTestPodLister([]*apiv1.Pod{})
-			pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{})
-			registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, nil, nil, nil, nil, nil)
-			ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil)
-			if err != nil {
-				t.Fatalf("Couldn't set up autoscaling context: %v", err)
-			}
-			csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
-			scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList()
-			scaleStateNotifier.Register(csr)
-			ndt := deletiontracker.NewNodeDeletionTracker(0)
-			ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, deleteInterval)
-			legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec)
-			evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig}
-			fakeNodeLatencyTracker := &fakeLatencyTracker{}
-			actuator := Actuator{
-				ctx: &ctx, nodeDeletionTracker: ndt,
-				nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor),
-				budgetProcessor:       budgets.NewScaleDownBudgetProcessor(&ctx),
-				nodeLatencyTracker:    fakeNodeLatencyTracker,
-			}
-
-			for _, nodes := range deleteNodes {
-				actuator.StartDeletion(nodes, []*apiv1.Node{})
-				time.Sleep(deleteInterval)
-			}
-			wantDeletedNodes := 0
-			for _, num := range test.wantSuccessfulDeletion {
-				wantDeletedNodes += num
-			}
-			gotDeletedNodes := map[string]int{
-				"test-ng-1": 0,
-				"test-ng-2": 0,
-				"test-ng-3": 0,
-			}
-			for i := 0; i < wantDeletedNodes; i++ {
-				select {
-				case ngId := <-deletedResult:
-					gotDeletedNodes[ngId]++
-				case <-time.After(1 * time.Second):
-					t.Errorf("Timeout while waiting for deleted nodes.")
-					break
-				}
-			}
-			if diff := cmp.Diff(test.wantSuccessfulDeletion, gotDeletedNodes); diff != "" {
-				t.Errorf("Successful deleteions per node group diff (-want +got):\n%s", diff)
-			}
-			for _, nodes := range deleteNodes {
-				for _, node := range nodes {
-					found := false
-					for _, observedNode := range fakeNodeLatencyTracker.ObservedNodes {
-						if observedNode == node.Name {
-							found = true
-							break
-						}
-					}
-					if !found {
-						t.Errorf("Expected ObserveDeletion to be called for node %s", node.Name)
-					}
-				}
-			}
-		})
-	}
+	deleteInterval := 1 * time.Second
+
+	for _, test := range []struct {
+		name                   string
+		deleteCalls            int
+		numNodesToDelete       map[string][]int // per node group and per call
+		failedRequests         map[string]bool  // per node group
+		wantSuccessfulDeletion map[string]int   // per node group
+	}{
+		{
+			name:        "Successful deletion for all node groups",
+			deleteCalls: 1,
+			numNodesToDelete: map[string][]int{
+				"test-ng-1": {4},
+				"test-ng-2": {5},
+				"test-ng-3": {1},
+			},
+			wantSuccessfulDeletion: map[string]int{
+				"test-ng-1": 4,
+				"test-ng-2": 5,
+				"test-ng-3": 1,
+			},
+		},
+		{
+			name:        "Node deletion failed for one group",
+			deleteCalls: 1,
+			numNodesToDelete: map[string][]int{
+				"test-ng-1": {4},
+				"test-ng-2": {5},
+				"test-ng-3": {1},
+			},
+			failedRequests: map[string]bool{
+				"test-ng-1": true,
+			},
+			wantSuccessfulDeletion: map[string]int{
+				"test-ng-1": 0,
+				"test-ng-2": 5,
+				"test-ng-3": 1,
+			},
+		},
+		{
+			name:        "Node deletion failed for one group two times",
+			deleteCalls: 2,
+			numNodesToDelete: map[string][]int{
+				"test-ng-1": {4, 3},
+				"test-ng-2": {5},
+				"test-ng-3": {1},
+			},
+			failedRequests: map[string]bool{
+				"test-ng-1": true,
+			},
+			wantSuccessfulDeletion: map[string]int{
+				"test-ng-1": 0,
+				"test-ng-2": 5,
+				"test-ng-3": 1,
+			},
+		},
+		{
+			name:        "Node deletion failed for all groups",
+			deleteCalls: 2,
+			numNodesToDelete: map[string][]int{
+				"test-ng-1": {4, 3},
+				"test-ng-2": {5},
+				"test-ng-3": {1},
+			},
+			failedRequests: map[string]bool{
+				"test-ng-1": true,
+				"test-ng-2": true,
+				"test-ng-3": true,
+			},
+			wantSuccessfulDeletion: map[string]int{
+				"test-ng-1": 0,
+				"test-ng-2": 0,
+				"test-ng-3": 0,
+			},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			test := test
+			gotFailedRequest := func(nodeGroupId string) bool {
+				val, _ := test.failedRequests[nodeGroupId]
+				return val
+			}
+			deletedResult := make(chan string)
+			fakeClient := &fake.Clientset{}
+			provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleDown(func(nodeGroupId string, node string) error {
+				if gotFailedRequest(nodeGroupId) {
+					return fmt.Errorf("SIMULATED ERROR: won't remove node")
+				}
+				deletedResult <- nodeGroupId
+				return nil
+			}).Build()
+			// 2D array representing the waves of nodes pushed for deletion.
+			deleteNodes := [][]*apiv1.Node{}
+
+			for i := 0; i < test.deleteCalls; i++ {
+				deleteNodes = append(deleteNodes, []*apiv1.Node{})
+			}
+			testNg1 := testprovider.NewTestNodeGroup("test-ng-1", 0, 100, 3, true, false, "n1-standard-2", nil, nil)
+			testNg2 := testprovider.NewTestNodeGroup("test-ng-2", 0, 100, 3, true, false, "n1-standard-2", nil, nil)
+			testNg3 := testprovider.NewTestNodeGroup("test-ng-3", 0, 100, 3, true, false, "n1-standard-2", nil, nil)
+			testNg := map[string]*testprovider.TestNodeGroup{
+				"test-ng-1": testNg1,
+				"test-ng-2": testNg2,
+				"test-ng-3": testNg3,
+			}
+
+			for ngName, numNodes := range test.numNodesToDelete {
+				ng := testNg[ngName]
+				provider.InsertNodeGroup(ng)
+				ng.SetCloudProvider(provider)
+				for i, num := range numNodes {
+					singleBucketList := generateNodeGroupViewList(ng, 0, num)
+					bucket := singleBucketList[0]
+					deleteNodes[i] = append(deleteNodes[i], bucket.Nodes...)
+					for _, node := range bucket.Nodes {
+						provider.AddNode(bucket.Group.Id(), node)
+					}
+				}
+			}
+			opts := config.AutoscalingOptions{
+				MaxScaleDownParallelism:        10,
+				MaxDrainParallelism:            5,
+				MaxPodEvictionTime:             0,
+				DaemonSetEvictionForEmptyNodes: true,
+			}
+
+			podLister := kube_util.NewTestPodLister([]*apiv1.Pod{})
+			pdbLister := kube_util.NewTestPodDisruptionBudgetLister([]*policyv1.PodDisruptionBudget{})
+			registry := kube_util.NewListerRegistry(nil, nil, podLister, pdbLister, nil, nil, nil, nil, nil)
+			ctx, err := NewScaleTestAutoscalingContext(opts, fakeClient, registry, provider, nil, nil)
+			if err != nil {
+				t.Fatalf("Couldn't set up autoscaling context: %v", err)
+			}
+			csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
+			scaleStateNotifier := nodegroupchange.NewNodeGroupChangeObserversList()
+			scaleStateNotifier.Register(csr)
+			ndt := deletiontracker.NewNodeDeletionTracker(0)
+			ndb := NewNodeDeletionBatcher(&ctx, scaleStateNotifier, ndt, deleteInterval)
+			legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec)
+			evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig}
+			fakeNodeLatencyTracker := &fakeLatencyTracker{}
+			actuator := Actuator{
+				ctx: &ctx, nodeDeletionTracker: ndt,
+				nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor),
+				budgetProcessor:       budgets.NewScaleDownBudgetProcessor(&ctx),
+				nodeLatencyTracker:    fakeNodeLatencyTracker,
+			}
+
+			for _, nodes := range deleteNodes {
+				actuator.StartDeletion(nodes, []*apiv1.Node{})
+				time.Sleep(deleteInterval)
+			}
+			wantDeletedNodes := 0
+			for _, num := range test.wantSuccessfulDeletion {
+				wantDeletedNodes += num
+			}
+			gotDeletedNodes := map[string]int{
+				"test-ng-1": 0,
+				"test-ng-2": 0,
+				"test-ng-3": 0,
+			}
+			for i := 0; i < wantDeletedNodes; i++ {
+				select {
+				case ngId := <-deletedResult:
+					gotDeletedNodes[ngId]++
+				case <-time.After(1 * time.Second):
+					t.Errorf("Timeout while waiting for deleted nodes.")
+					break
+				}
+			}
+			if diff := cmp.Diff(test.wantSuccessfulDeletion, gotDeletedNodes); diff != "" {
+				t.Errorf("Successful deletions per node group diff (-want +got):\n%s", diff)
+			}
+			for _, nodes := range deleteNodes {
+				for _, node := range nodes {
+					found := false
+					for _, observedNode := range fakeNodeLatencyTracker.ObservedNodes {
+						if observedNode == node.Name {
+							found = true
+							break
+						}
+					}
+					if !found {
+						t.Errorf("Expected ObserveDeletion to be called for node %s", node.Name)
+					}
+				}
+			}
+		})
+	}
 }
 
 func sizedNodeGroup(id string, size int, atomic, ignoreDaemonSetUtil bool) *testprovider.TestNodeGroup {
-	ng := testprovider.NewTestNodeGroup(id, 1000, 0, size, true, false, "n1-standard-2", nil, nil)
-	ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-		ZeroOrMaxNodeScaling:        atomic,
-		IgnoreDaemonSetsUtilization: ignoreDaemonSetUtil,
-	})
-	return ng
+	ng := testprovider.NewTestNodeGroup(id, 1000, 0, size, true, false, "n1-standard-2", nil, nil)
+	ng.SetOptions(&config.NodeGroupAutoscalingOptions{
+		ZeroOrMaxNodeScaling:        atomic,
+		IgnoreDaemonSetsUtilization: ignoreDaemonSetUtil,
+	})
+	return ng
 }
 
 func generateNodes(from, to int, prefix string) []*apiv1.Node {
-	var result []*apiv1.Node
-	for i := from; i < to; i++ {
-		name := 
fmt.Sprintf("node-%d", i) - if prefix != "" { - name = prefix + "-" + name - } - result = append(result, generateNode(name)) - } - return result + var result []*apiv1.Node + for i := from; i < to; i++ { + name := fmt.Sprintf("node-%d", i) + if prefix != "" { + name = prefix + "-" + name + } + result = append(result, generateNode(name)) + } + return result } func generateNodeGroupViewList(ng cloudprovider.NodeGroup, from, to int) []*budgets.NodeGroupView { - return []*budgets.NodeGroupView{ - { - Group: ng, - Nodes: generateNodes(from, to, ng.Id()), - }, - } + return []*budgets.NodeGroupView{ + { + Group: ng, + Nodes: generateNodes(from, to, ng.Id()), + }, + } } func generateNode(name string) *apiv1.Node { - return &apiv1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: name}, - Status: apiv1.NodeStatus{ - Allocatable: apiv1.ResourceList{ - apiv1.ResourceCPU: resource.MustParse("8"), - apiv1.ResourceMemory: resource.MustParse("8G"), - }, - }, - } + return &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: apiv1.NodeStatus{ + Allocatable: apiv1.ResourceList{ + apiv1.ResourceCPU: resource.MustParse("8"), + apiv1.ResourceMemory: resource.MustParse("8G"), + }, + }, + } } func removablePods(count int, prefix string) []*apiv1.Pod { - var result []*apiv1.Pod - for i := 0; i < count; i++ { - name := fmt.Sprintf("pod-%d", i) - if prefix != "" { - name = prefix + "-" + name - } - result = append(result, removablePod(name, prefix)) - } - return result + var result []*apiv1.Pod + for i := 0; i < count; i++ { + name := fmt.Sprintf("pod-%d", i) + if prefix != "" { + name = prefix + "-" + name + } + result = append(result, removablePod(name, prefix)) + } + return result } func removablePod(name string, node string) *apiv1.Pod { - return &apiv1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "default", - Annotations: map[string]string{ - "cluster-autoscaler.kubernetes.io/safe-to-evict": "true", - }, - }, - Spec: apiv1.PodSpec{ - NodeName: node, - Containers: []apiv1.Container{ - { - Name: "test-container", - Resources: apiv1.ResourceRequirements{ - Requests: map[apiv1.ResourceName]resource.Quantity{ - apiv1.ResourceCPU: resource.MustParse("1"), - apiv1.ResourceMemory: resource.MustParse("1G"), - }, - }, - }, - }, - }, - } + return &apiv1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + Annotations: map[string]string{ + "cluster-autoscaler.kubernetes.io/safe-to-evict": "true", + }, + }, + Spec: apiv1.PodSpec{ + NodeName: node, + Containers: []apiv1.Container{ + { + Name: "test-container", + Resources: apiv1.ResourceRequirements{ + Requests: map[apiv1.ResourceName]resource.Quantity{ + apiv1.ResourceCPU: resource.MustParse("1"), + apiv1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + }, + }, + } } func generateDsPods(count int, node string) []*apiv1.Pod { - var result []*apiv1.Pod - for i := 0; i < count; i++ { - name := fmt.Sprintf("ds-pod-%d", i) - result = append(result, generateDsPod(name, node)) - } - return result + var result []*apiv1.Pod + for i := 0; i < count; i++ { + name := fmt.Sprintf("ds-pod-%d", i) + result = append(result, generateDsPod(name, node)) + } + return result } func generateDsPod(name string, node string) *apiv1.Pod { - pod := removablePod(fmt.Sprintf("%s-%s", node, name), node) - pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "some-uid") - return pod + pod := removablePod(fmt.Sprintf("%s-%s", node, name), node) + pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", 
"apps/v1", "some-uid") + return pod } func generateDaemonSet() *appsv1.DaemonSet { - return &appsv1.DaemonSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "ds", - Namespace: "default", - SelfLink: "/apiv1s/apps/v1/namespaces/default/daemonsets/ds", - }, - } + return &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ds", + Namespace: "default", + SelfLink: "/apiv1s/apps/v1/namespaces/default/daemonsets/ds", + }, + } } func generateUtilInfo(cpuUtil, memUtil float64) utilization.Info { - var higherUtilName apiv1.ResourceName - var higherUtilVal float64 - if cpuUtil > memUtil { - higherUtilName = apiv1.ResourceCPU - higherUtilVal = cpuUtil - } else { - higherUtilName = apiv1.ResourceMemory - higherUtilVal = memUtil - } - return utilization.Info{ - CpuUtil: cpuUtil, - MemUtil: memUtil, - ResourceName: higherUtilName, - Utilization: higherUtilVal, - } + var higherUtilName apiv1.ResourceName + var higherUtilVal float64 + if cpuUtil > memUtil { + higherUtilName = apiv1.ResourceCPU + higherUtilVal = cpuUtil + } else { + higherUtilName = apiv1.ResourceMemory + higherUtilVal = memUtil + } + return utilization.Info{ + CpuUtil: cpuUtil, + MemUtil: memUtil, + ResourceName: higherUtilName, + Utilization: higherUtilVal, + } } func waitForDeletionResultsCount(ndt *deletiontracker.NodeDeletionTracker, resultsCount int, timeout, retryTime time.Duration) error { - // This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and - // the function should return quickly after the first if check. - // An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows - // synchronizing calls to EndDeletion in the test code. - for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) { - if results, _ := ndt.DeletionResults(); len(results) == resultsCount { - return nil - } - } - return fmt.Errorf("timed out while waiting for node deletion results") + // This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and + // the function should return quickly after the first if check. + // An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows + // synchronizing calls to EndDeletion in the test code. 
+ for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) { + if results, _ := ndt.DeletionResults(); len(results) == resultsCount { + return nil + } + } + return fmt.Errorf("timed out while waiting for node deletion results") } - diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go index d46b28bbce07..0fcf4f04a6fa 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -12,6 +12,7 @@ type LatencyTracker interface { ObserveDeletion(nodeName string, timestamp time.Time) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) UpdateThreshold(nodeName string, threshold time.Duration) + GetTrackedNodes() []string } type NodeInfo struct { UnneededSince time.Time diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go index 44bff7fbd768..2ea3f501a76b 100644 --- a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go +++ b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go @@ -366,3 +366,4 @@ func (f *fakeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Tim } func (f *fakeLatencyTracker) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) { } +func (f *fakeLatencyTracker) GetTrackedNodes() []string { return nil } From 09bb88d521fa4a0d4be817cc9bbf249d2e96190e Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Wed, 24 Sep 2025 07:23:44 +0000 Subject: [PATCH 13/19] fix merge errors --- cluster-autoscaler/core/scaledown/planner/planner.go | 2 +- cluster-autoscaler/core/scaledown/planner/planner_test.go | 2 +- cluster-autoscaler/core/scaledown/unneeded/nodes_test.go | 3 ++- cluster-autoscaler/core/static_autoscaler_test.go | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go index eb47cf5380ec..0d29302571ee 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner.go +++ b/cluster-autoscaler/core/scaledown/planner/planner.go @@ -88,7 +88,7 @@ func New(context *context.AutoscalingContext, processors *processors.Autoscaling minUpdateInterval = 1 * time.Nanosecond } - unneededNodes := unneeded.NewNodes(processors.NodeGroupConfigProcessor, resourceLimitsFinder) + unneededNodes := unneeded.NewNodes(processors.NodeGroupConfigProcessor, resourceLimitsFinder, nlt) if context.AutoscalingOptions.NodeDeletionCandidateTTL != 0 { unneededNodes.LoadFromExistingTaints(context.ListerRegistry, time.Now(), context.AutoscalingOptions.NodeDeletionCandidateTTL) } diff --git a/cluster-autoscaler/core/scaledown/planner/planner_test.go b/cluster-autoscaler/core/scaledown/planner/planner_test.go index 3beaa61c3df7..50432930aa0c 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner_test.go +++ b/cluster-autoscaler/core/scaledown/planner/planner_test.go @@ -841,7 +841,7 @@ func TestNewPlannerWithExistingDeletionCandidateNodes(t *testing.T) { assert.NoError(t, err) deleteOptions := options.NodeDeleteOptions{} - p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil) + p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil, nil) p.unneededNodes.AsList() }) diff --git 
a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go index 2ea3f501a76b..ea551a6db10e 100644 --- a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go +++ b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go @@ -284,8 +284,9 @@ func TestNodeLoadFromExistingTaints(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { t.Parallel() + currentTime = time.Now() - nodes := NewNodes(nil, nil) + nodes := NewNodes(nil, nil, nil) allNodeLister := kubernetes.NewTestNodeLister(nil) allNodeLister.SetNodes(tc.allNodes) diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 95826e837962..001b18e07352 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1713,7 +1713,7 @@ func TestStaticAutoscalerRunOnceWithExistingDeletionCandidateNodes(t *testing.T) } processors := processorstest.NewTestProcessors(&context) clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) - sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) From 445bc9fe657475c32fee34c2536d9fb46ec581d3 Mon Sep 17 00:00:00 2001 From: Tetiana Yeremenko Date: Wed, 24 Sep 2025 13:20:30 +0000 Subject: [PATCH 14/19] fix linting issues --- .../latencytracker/node_latency_tracker.go | 30 ++++++++++++++++--- .../node_latency_tracker_test.go | 16 +++++----- cluster-autoscaler/metrics/metrics.go | 2 ++ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go index 0fcf4f04a6fa..075ce9020f70 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go @@ -1,3 +1,19 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package latencytracker import ( @@ -8,25 +24,31 @@ import ( "k8s.io/klog/v2" ) +// LatencyTracker defines the interface for tracking node removal latency. +// Implementations record when nodes become unneeded, observe deletion events, +// and expose thresholds for measuring node removal duration. 
type LatencyTracker interface { ObserveDeletion(nodeName string, timestamp time.Time) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) UpdateThreshold(nodeName string, threshold time.Duration) GetTrackedNodes() []string } -type NodeInfo struct { +type nodeInfo struct { UnneededSince time.Time Threshold time.Duration } +// NodeLatencyTracker is a concrete implementation of LatencyTracker. +// It keeps track of nodes that are marked as unneeded, when they became unneeded, +// and thresholds to adjust node removal latency metrics. type NodeLatencyTracker struct { - nodes map[string]NodeInfo + nodes map[string]nodeInfo } // NewNodeLatencyTracker creates a new tracker. func NewNodeLatencyTracker() *NodeLatencyTracker { return &NodeLatencyTracker{ - nodes: make(map[string]NodeInfo), + nodes: make(map[string]nodeInfo), } } @@ -41,7 +63,7 @@ func (t *NodeLatencyTracker) UpdateStateWithUnneededList( currentSet[node.Name] = struct{}{} if _, exists := t.nodes[node.Name]; !exists { - t.nodes[node.Name] = NodeInfo{ + t.nodes[node.Name] = nodeInfo{ UnneededSince: timestamp, Threshold: 0, } diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go index d6cae23c60ec..db2b68030cb8 100644 --- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go +++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go @@ -1,5 +1,5 @@ /* -Copyright 2024 The Kubernetes Authors. +Copyright 2019 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -7,8 +7,8 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an "AS IS" BASIS, +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
@@ -29,7 +29,7 @@ func TestNodeLatencyTracker(t *testing.T) {
 
 	tests := []struct {
 		name                string
-		setupNodes          map[string]NodeInfo
+		setupNodes          map[string]nodeInfo
 		unneededList        []string
 		currentlyInDeletion map[string]bool
 		updateThresholds    map[string]time.Duration
@@ -39,7 +39,7 @@ func TestNodeLatencyTracker(t *testing.T) {
 	}{
 		{
 			name:                "add new unneeded nodes",
-			setupNodes:          map[string]NodeInfo{},
+			setupNodes:          map[string]nodeInfo{},
 			unneededList:        []string{"node1", "node2"},
 			currentlyInDeletion: map[string]bool{},
 			updateThresholds:    map[string]time.Duration{},
@@ -48,7 +48,7 @@
 		},
 		{
 			name: "observe deletion with threshold",
-			setupNodes: map[string]NodeInfo{
+			setupNodes: map[string]nodeInfo{
 				"node1": {UnneededSince: baseTime, Threshold: 2 * time.Second},
 			},
 			unneededList: []string{},
@@ -62,7 +62,7 @@
 		},
 		{
 			name: "remove unneeded node not in deletion",
-			setupNodes: map[string]NodeInfo{
+			setupNodes: map[string]nodeInfo{
 				"node1": {UnneededSince: baseTime, Threshold: 1 * time.Second},
 				"node2": {UnneededSince: baseTime, Threshold: 0},
 			},
@@ -77,7 +77,7 @@
 		},
 		{
 			name: "update threshold",
-			setupNodes: map[string]NodeInfo{
+			setupNodes: map[string]nodeInfo{
 				"node1": {UnneededSince: baseTime, Threshold: 1 * time.Second},
 			},
 			unneededList: []string{"node1"},
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 2563eb8f4561..a9bda6612117 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -761,6 +761,8 @@ func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount strin
 	binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
 }
 
+// UpdateScaleDownNodeDeletionDuration records how long it took for a node to be deleted
+// (or to become needed again) after being marked as unneeded.
 func UpdateScaleDownNodeDeletionDuration(deleted string, duration time.Duration) {
 	scaleDownNodeDeletionDuration.WithLabelValues(deleted).Observe(duration.Seconds())
 }

From 045e7395f12c4f647e582619b6b158a423646cf3 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Wed, 24 Sep 2025 14:47:43 +0000
Subject: [PATCH 15/19] rename flag from nodeLatencyTrackingEnabled to
 nodeRemovalLatencyTrackingEnabled

---
 cluster-autoscaler/config/autoscaling_options.go | 8 ++------
 cluster-autoscaler/config/flags/flags.go         | 8 ++------
 cluster-autoscaler/core/static_autoscaler.go     | 2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 6699aa1fdf17..a7681c29fc9c 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -345,12 +345,8 @@ type AutoscalingOptions struct {
 	// NodeDeletionCandidateTTL is the maximum time a node can be marked as removable without being deleted.
 	// This is used to prevent nodes from being stuck in the removable state if the CA deployment becomes inactive.
 	NodeDeletionCandidateTTL time.Duration
-	//CapacitybufferControllerEnabled tells if CA should run default capacity buffer as sub-process or not
-	CapacitybufferControllerEnabled bool
-	// CapacitybufferPodInjectionEnabled tells if CA should injects fake pods for capacity buffers that are ready for provisioning
-	CapacitybufferPodInjectionEnabled bool
-	// NodeLatencyTrackingEnabled is used to enable/disable node latency tracking.
-	NodeLatencyTrackingEnabled bool
+	// NodeRemovalLatencyTrackingEnabled is used to enable/disable node removal latency tracking.
+	NodeRemovalLatencyTrackingEnabled bool
 }
 
 // KubeClientOptions specify options for kube client
diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go
index e373a940c840..64ef47e9037c 100644
--- a/cluster-autoscaler/config/flags/flags.go
+++ b/cluster-autoscaler/config/flags/flags.go
@@ -228,9 +228,7 @@ var (
 	clusterSnapshotParallelism = flag.Int("cluster-snapshot-parallelism", 16, "Maximum parallelism of cluster snapshot creation.")
 	checkCapacityProcessorInstance = flag.String("check-capacity-processor-instance", "", "Name of the processor instance. Only ProvisioningRequests that define this name in their parameters with the key \"processorInstance\" will be processed by this CA instance. It only refers to check capacity ProvisioningRequests, but if not empty, best-effort atomic ProvisioningRequests processing is disabled in this instance. Not recommended: Until CA 1.35, ProvisioningRequests with this name as prefix in their class will be also processed.")
 	nodeDeletionCandidateTTL = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. This sets the TTL of Cluster-Autoscaler's state if the Cluste-Autoscaler deployment becomes inactive")
-	capacitybufferControllerEnabled = flag.Bool("capacity-buffer-controller-enabled", false, "Whether to enable the default controller for capacity buffers or not")
-	capacitybufferPodInjectionEnabled = flag.Bool("capacity-buffer-pod-injection-enabled", false, "Whether to enable pod list processor that processes ready capacity buffers and injects fake pods accordingly")
-	nodeLatencyTrackingEnabled = flag.Bool("enable-node-latency-tracking", false, "Whether logic for monitoring of node latency is enabled.")
+	nodeRemovalLatencyTrackingEnabled = flag.Bool("enable-node-removal-latency-tracking", false, "Whether to track latency from when a node is marked unneeded until it is removed or needed again.")
 
 	// Deprecated flags
 	ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group (Deprecated, use startup-taints instead)")
@@ -413,9 +411,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		ProactiveScaleupEnabled: *proactiveScaleupEnabled,
 		PodInjectionLimit: *podInjectionLimit,
 		NodeDeletionCandidateTTL: *nodeDeletionCandidateTTL,
-		CapacitybufferControllerEnabled: *capacitybufferControllerEnabled,
-		CapacitybufferPodInjectionEnabled: *capacitybufferPodInjectionEnabled,
-		NodeLatencyTrackingEnabled: *nodeLatencyTrackingEnabled,
+		NodeRemovalLatencyTrackingEnabled: *nodeRemovalLatencyTrackingEnabled,
 	}
 }
 
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 6586f821c2d9..64499e278413 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -172,7 +172,7 @@ func NewStaticAutoscaler(
 	// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
 	// during the struct creation rather than here.
 	var nldt *latencytracker.NodeLatencyTracker
-	if autoscalingContext.AutoscalingOptions.NodeLatencyTrackingEnabled {
+	if autoscalingContext.AutoscalingOptions.NodeRemovalLatencyTrackingEnabled {
 		nldt = latencytracker.NewNodeLatencyTracker()
 	}
 	scaleDownPlanner := planner.New(autoscalingContext, processors, deleteOptions, drainabilityRules, nldt)

From 90180f0180a2d0adc939aa634b13c4f963db6342 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Wed, 24 Sep 2025 15:25:40 +0000
Subject: [PATCH 16/19] fix failing test

---
 .../core/scaledown/actuation/actuator_test.go | 14 ++++++++++++++
 .../core/scaledown/planner/planner.go         |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
index 3ddd128f48a4..35be2939ef7c 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -18,6 +18,7 @@ package actuation
 
 import (
 	"fmt"
+	"strings"
 	"sync"
 	"testing"
 	"time"
@@ -1618,6 +1619,19 @@ func TestStartDeletionInBatchBasic(t *testing.T) {
 	}
 	for _, nodes := range deleteNodes {
 		for _, node := range nodes {
+			// Extract node group from node name
+			parts := strings.Split(node.Name, "-")
+			if len(parts) < 3 {
+				continue
+			}
+			ngName := strings.Join(parts[:2], "-")
+
+			// Skip check if no successful deletions expected for this group
+			if test.wantSuccessfulDeletion[ngName] == 0 {
+				continue
+			}
+
+			// Verify ObserveDeletion was called
 			found := false
 			for _, observedNode := range fakeNodeLatencyTracker.ObservedNodes {
 				if observedNode == node.Name {
diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 0d29302571ee..f7259cb17c74 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -96,7 +96,7 @@ func New(context *context.AutoscalingContext, processors *processors.Autoscaling
 	return &Planner{
 		context:            context,
 		unremovableNodes:   unremovable.NewNodes(),
-		unneededNodes:      unneeded.NewNodes(processors.NodeGroupConfigProcessor, resourceLimitsFinder, nlt),
+		unneededNodes:      unneededNodes,
 		rs:                 simulator.NewRemovalSimulator(context.ListerRegistry, context.ClusterSnapshot, deleteOptions, drainabilityRules, true),
 		actuationInjector:  scheduling.NewHintingSimulator(),
 		eligibilityChecker: eligibility.NewChecker(processors.NodeGroupConfigProcessor),

From 5702fde89248c705e3538e156e6065cc53abbb6e Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Sun, 28 Sep 2025 10:18:07 +0000
Subject: [PATCH 17/19] fix rebase issues

---
 cluster-autoscaler/config/autoscaling_options.go | 4 ++++
 cluster-autoscaler/config/flags/flags.go         | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index a7681c29fc9c..5a2c26b9cd7c 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -345,6 +345,10 @@ type AutoscalingOptions struct {
 	// NodeDeletionCandidateTTL is the maximum time a node can be marked as removable without being deleted.
 	// This is used to prevent nodes from being stuck in the removable state if the CA deployment becomes inactive.
 	NodeDeletionCandidateTTL time.Duration
+	// CapacitybufferControllerEnabled tells if CA should run default capacity buffer as sub-process or not
+	CapacitybufferControllerEnabled bool
+	// CapacitybufferPodInjectionEnabled tells if CA should inject fake pods for capacity buffers that are ready for provisioning
+	CapacitybufferPodInjectionEnabled bool
 	// NodeRemovalLatencyTrackingEnabled is used to enable/disable node removal latency tracking.
 	NodeRemovalLatencyTrackingEnabled bool
 }
diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go
index 64ef47e9037c..d60ba2c33885 100644
--- a/cluster-autoscaler/config/flags/flags.go
+++ b/cluster-autoscaler/config/flags/flags.go
@@ -228,6 +228,8 @@ var (
 	clusterSnapshotParallelism = flag.Int("cluster-snapshot-parallelism", 16, "Maximum parallelism of cluster snapshot creation.")
 	checkCapacityProcessorInstance = flag.String("check-capacity-processor-instance", "", "Name of the processor instance. Only ProvisioningRequests that define this name in their parameters with the key \"processorInstance\" will be processed by this CA instance. It only refers to check capacity ProvisioningRequests, but if not empty, best-effort atomic ProvisioningRequests processing is disabled in this instance. Not recommended: Until CA 1.35, ProvisioningRequests with this name as prefix in their class will be also processed.")
 	nodeDeletionCandidateTTL = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. This sets the TTL of Cluster-Autoscaler's state if the Cluste-Autoscaler deployment becomes inactive")
+	capacitybufferControllerEnabled = flag.Bool("capacity-buffer-controller-enabled", false, "Whether to enable the default controller for capacity buffers or not")
+	capacitybufferPodInjectionEnabled = flag.Bool("capacity-buffer-pod-injection-enabled", false, "Whether to enable pod list processor that processes ready capacity buffers and injects fake pods accordingly")
 	nodeRemovalLatencyTrackingEnabled = flag.Bool("enable-node-removal-latency-tracking", false, "Whether to track latency from when a node is marked unneeded until it is removed or needed again.")
 
 	// Deprecated flags
@@ -411,6 +413,8 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		ProactiveScaleupEnabled: *proactiveScaleupEnabled,
 		PodInjectionLimit: *podInjectionLimit,
 		NodeDeletionCandidateTTL: *nodeDeletionCandidateTTL,
+		CapacitybufferControllerEnabled: *capacitybufferControllerEnabled,
+		CapacitybufferPodInjectionEnabled: *capacitybufferPodInjectionEnabled,
 		NodeRemovalLatencyTrackingEnabled: *nodeRemovalLatencyTrackingEnabled,
 	}
 }

From 6966d04191c1258fc69dcdfb884848f2c165ef4a Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Mon, 29 Sep 2025 10:03:09 +0000
Subject: [PATCH 18/19] change nldt to interface

---
 cluster-autoscaler/core/static_autoscaler.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 64499e278413..83c4a2a74129 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -171,7 +171,7 @@ func NewStaticAutoscaler(
 	// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
 	// during the struct creation rather than here.
-	var nldt *latencytracker.NodeLatencyTracker
+	var nldt latencytracker.LatencyTracker
 	if autoscalingContext.AutoscalingOptions.NodeRemovalLatencyTrackingEnabled {
 		nldt = latencytracker.NewNodeLatencyTracker()
 	}

From 191c494cb6fd1ec9039fe2569bb529dccb0a16d8 Mon Sep 17 00:00:00 2001
From: Tetiana Yeremenko
Date: Thu, 2 Oct 2025 08:57:31 +0000
Subject: [PATCH 19/19] Code review comments addressed

---
 .../latencytracker/node_latency_tracker.go    | 30 ++++----
 .../node_latency_tracker_test.go              | 72 +++++++++----------
 .../core/static_autoscaler_test.go            | 65 ++++++++++++-----
 cluster-autoscaler/metrics/metrics.go         | 15 ++--
 4 files changed, 106 insertions(+), 76 deletions(-)

diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go
index 075ce9020f70..93a9c1514836 100644
--- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go
+++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go
@@ -34,8 +34,8 @@ type LatencyTracker interface {
 	GetTrackedNodes() []string
 }
 type nodeInfo struct {
-	UnneededSince time.Time
-	Threshold     time.Duration
+	unneededSince time.Time
+	threshold     time.Duration
 }
 
 // NodeLatencyTracker is a concrete implementation of LatencyTracker.
@@ -64,21 +64,21 @@ func (t *NodeLatencyTracker) UpdateStateWithUnneededList(
 
 		if _, exists := t.nodes[node.Name]; !exists {
 			t.nodes[node.Name] = nodeInfo{
-				UnneededSince: timestamp,
-				Threshold:     0,
+				unneededSince: timestamp,
+				threshold:     0,
 			}
-			klog.V(2).Infof("Started tracking unneeded node %s at %v", node.Name, timestamp)
+			klog.V(4).Infof("Started tracking unneeded node %s at %v", node.Name, timestamp)
 		}
 	}
 
 	for name, info := range t.nodes {
 		if _, stillUnneeded := currentSet[name]; !stillUnneeded {
 			if _, inDeletion := currentlyInDeletion[name]; !inDeletion {
-				duration := timestamp.Sub(info.UnneededSince)
-				metrics.UpdateScaleDownNodeDeletionDuration("false", duration-info.Threshold)
+				duration := timestamp.Sub(info.unneededSince)
+				metrics.UpdateScaleDownNodeRemovalLatency(false, duration-info.threshold)
 				delete(t.nodes, name)
-				klog.V(2).Infof("Node %q reported as deleted/missing (unneeded for %s, threshold %s)",
-					name, duration, info.Threshold)
+				klog.V(4).Infof("Node %q reported as deleted/missing (unneeded for %s, threshold %s)",
+					name, duration, info.threshold)
 			}
 		}
 	}
@@ -87,14 +87,14 @@ func (t *NodeLatencyTracker) UpdateStateWithUnneededList(
 // ObserveDeletion is called by the actuator just before node deletion.
 func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) {
 	if info, exists := t.nodes[nodeName]; exists {
-		duration := timestamp.Sub(info.UnneededSince)
+		duration := timestamp.Sub(info.unneededSince)
 
-		klog.V(2).Infof(
+		klog.V(4).Infof(
 			"Observing deletion for node %s, unneeded for %s (threshold was %s).",
-			nodeName, duration, info.Threshold,
+			nodeName, duration, info.threshold,
 		)
-		metrics.UpdateScaleDownNodeDeletionDuration("true", duration-info.Threshold)
+		metrics.UpdateScaleDownNodeRemovalLatency(true, duration-info.threshold)
 		delete(t.nodes, nodeName)
 	}
 }
@@ -102,9 +102,9 @@ func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Tim
 // UpdateThreshold updates the scale-down threshold for a tracked node.
 func (t *NodeLatencyTracker) UpdateThreshold(nodeName string, threshold time.Duration) {
 	if info, exists := t.nodes[nodeName]; exists {
-		info.Threshold = threshold
+		info.threshold = threshold
 		t.nodes[nodeName] = info
-		klog.V(2).Infof("Updated threshold for node %q to %s", nodeName, threshold)
+		klog.V(4).Infof("Updated threshold for node %q to %s", nodeName, threshold)
 	} else {
 		klog.Warningf("Attempted to update threshold for unknown node %q", nodeName)
 	}
diff --git a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go
index db2b68030cb8..45226a65ce89 100644
--- a/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go
+++ b/cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go
@@ -28,65 +28,65 @@ func TestNodeLatencyTracker(t *testing.T) {
 	baseTime := time.Now()
 
 	tests := []struct {
-		name                  string
-		setupNodes            map[string]nodeInfo
-		unneededList          []string
-		currentlyInDeletion   map[string]bool
-		updateThresholds      map[string]time.Duration
-		observeDeletion       []string
-		expectedTrackedNodes  []string
-		expectedDeletionTimes map[string]time.Duration
+		name                string
+		setupNodes          map[string]nodeInfo
+		unneededList        []string
+		currentlyInDeletion map[string]bool
+		updateThresholds    map[string]time.Duration
+		observeDeletion     []string
+		wantTrackedNodes    []string
+		wantDeletionTimes   map[string]time.Duration
 	}{
 		{
-			name:                 "add new unneeded nodes",
-			setupNodes:           map[string]nodeInfo{},
-			unneededList:         []string{"node1", "node2"},
-			currentlyInDeletion:  map[string]bool{},
-			updateThresholds:     map[string]time.Duration{},
-			observeDeletion:      []string{},
-			expectedTrackedNodes: []string{"node1", "node2"},
+			name:                "add new unneeded nodes",
+			setupNodes:          map[string]nodeInfo{},
+			unneededList:        []string{"node1", "node2"},
+			currentlyInDeletion: map[string]bool{},
+			updateThresholds:    map[string]time.Duration{},
+			observeDeletion:     []string{},
+			wantTrackedNodes:    []string{"node1", "node2"},
		},
 		{
 			name: "observe deletion with threshold",
 			setupNodes: map[string]nodeInfo{
-				"node1": {UnneededSince: baseTime, Threshold: 2 * time.Second},
+				"node1": {unneededSince: baseTime, threshold: 2 * time.Second},
 			},
-			unneededList:         []string{},
-			currentlyInDeletion:  map[string]bool{},
-			updateThresholds:     map[string]time.Duration{},
-			observeDeletion:      []string{"node1"},
-			expectedTrackedNodes: []string{},
-			expectedDeletionTimes: map[string]time.Duration{
+			unneededList:        []string{},
+			currentlyInDeletion: map[string]bool{},
+			updateThresholds:    map[string]time.Duration{},
+			observeDeletion:     []string{"node1"},
+			wantTrackedNodes:    []string{},
+			wantDeletionTimes: map[string]time.Duration{
 				"node1": 3 * time.Second, // simulate observation 5s after unneededSince, threshold 2s
 			},
 		},
 		{
 			name: "remove unneeded node not in deletion",
 			setupNodes: map[string]nodeInfo{
-				"node1": {UnneededSince: baseTime, Threshold: 1 * time.Second},
-				"node2": {UnneededSince: baseTime, Threshold: 0},
+				"node1": {unneededSince: baseTime, threshold: 1 * time.Second},
+				"node2": {unneededSince: baseTime, threshold: 0},
 			},
-			unneededList:         []string{"node2"}, // node1 is removed from unneeded
-			currentlyInDeletion:  map[string]bool{},
+			unneededList:        []string{"node2"}, // node1 is removed from unneeded
+			currentlyInDeletion: map[string]bool{},
+			updateThresholds:    map[string]time.Duration{},
+			observeDeletion:     []string{},
+			wantTrackedNodes:    []string{"node2"},
+			wantDeletionTimes: map[string]time.Duration{
 				"node1": 5*time.Second - 1*time.Second, // assume current timestamp baseTime+5s
 			},
 		},
 		{
 			name: "update threshold",
 			setupNodes: map[string]nodeInfo{
-				"node1": {UnneededSince: baseTime, Threshold: 1 * time.Second},
+				"node1": {unneededSince: baseTime, threshold: 1 * time.Second},
 			},
 			unneededList:        []string{"node1"},
 			currentlyInDeletion: map[string]bool{},
 			updateThresholds: map[string]time.Duration{
 				"node1": 4 * time.Second,
 			},
-			observeDeletion:      []string{},
-			expectedTrackedNodes: []string{"node1"},
+			observeDeletion:  []string{},
+			wantTrackedNodes: []string{"node1"},
 		},
 	}
 
@@ -116,7 +116,7 @@ func TestNodeLatencyTracker(t *testing.T) {
 			// Check tracked nodes
 			gotTracked := tracker.GetTrackedNodes()
 			expectedMap := make(map[string]struct{})
-			for _, n := range tt.expectedTrackedNodes {
+			for _, n := range tt.wantTrackedNodes {
 				expectedMap[n] = struct{}{}
 			}
 			for _, n := range gotTracked {
@@ -129,12 +129,12 @@ func TestNodeLatencyTracker(t *testing.T) {
 				t.Errorf("expected node %q to be tracked, but was not", n)
 			}
 
-			for node, expectedDuration := range tt.expectedDeletionTimes {
+			for node, expectedDuration := range tt.wantDeletionTimes {
 				info, ok := tt.setupNodes[node]
 				if !ok {
 					continue
 				}
-				duration := currentTime.Sub(info.UnneededSince) - info.Threshold
+				duration := currentTime.Sub(info.unneededSince) - info.threshold
 				if duration != expectedDuration {
 					t.Errorf("node %q expected deletion duration %v, got %v", node, expectedDuration, duration)
 				}
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index 001b18e07352..6b1095aab87b 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -166,7 +166,7 @@ func (m *onNodeGroupDeleteMock) Delete(id string) error {
 
 func setUpScaleDownActuator(ctx *context.AutoscalingContext, autoscalingOptions config.AutoscalingOptions) {
 	deleteOptions := options.NewNodeDeleteOptions(autoscalingOptions)
-	ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), deleteOptions, rules.Default(deleteOptions), processorstest.NewTestProcessors(ctx).NodeGroupConfigProcessor)
+	ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), nil, deleteOptions, rules.Default(deleteOptions), processorstest.NewTestProcessors(ctx).NodeGroupConfigProcessor)
 }
 
 type nodeGroup struct {
@@ -212,7 +212,6 @@ type commonMocks struct {
 	podDisruptionBudgetLister *podDisruptionBudgetListerMock
 	daemonSetLister           *daemonSetListerMock
 	nodeDeletionTracker       *deletiontracker.NodeDeletionTracker
-	nodeLatencyTracker        *latencytracker.NodeLatencyTracker
 
 	resourceClaimLister *fakeAllObjectsLister[*resourceapi.ResourceClaim]
 	resourceSliceLister *fakeAllObjectsLister[*resourceapi.ResourceSlice]
@@ -323,12 +322,8 @@ func setupAutoscaler(config *autoscalerSetupConfig) (*StaticAutoscaler, error) {
 	if nodeDeletionTracker == nil {
 		nodeDeletionTracker = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
 	}
-	nodeLatencyTracker := config.mocks.nodeLatencyTracker
-	if nodeLatencyTracker == nil {
-		nodeLatencyTracker = latencytracker.NewNodeLatencyTracker()
-	}
-	ctx.ScaleDownActuator = actuation.NewActuator(&ctx, clusterState, nodeDeletionTracker, nodeLatencyTracker, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor)
-	sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nodeLatencyTracker)
+	ctx.ScaleDownActuator = actuation.NewActuator(&ctx, clusterState, nodeDeletionTracker, nil, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor)
+	sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nil)
 
 	processorCallbacks.scaleDownPlanner = sdPlanner
 
@@ -384,6 +379,21 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
 	ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup)
 	assert.NotNil(t, ng1)
 	assert.NotNil(t, provider)
+	// NodeLatencyTracker mock
+	nltMock := &latencytrackerMock{}
+	nltMock.On("ObserveDeletion",
+		"n2",
+		mock.MatchedBy(func(t time.Time) bool { return !t.IsZero() }),
+	).Return()
+	nltMock.On("UpdateStateWithUnneededList",
+		mock.MatchedBy(func(nodes []*apiv1.Node) bool { return true }),
+		mock.MatchedBy(func(m map[string]bool) bool { return true }),
+		mock.Anything,
+	).Return()
+	nltMock.On("UpdateThreshold",
+		"n2",
+		time.Minute,
+	).Return()
 
 	// Create context with mocked lister registry.
 	options := config.AutoscalingOptions{
@@ -416,7 +426,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
 	}
 	processors := processorstest.NewTestProcessors(&context)
 	clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker)
-	sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil)
+	sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nltMock)
 	suOrchestrator := orchestrator.New()
 	suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
 
@@ -2473,7 +2483,7 @@ func TestStaticAutoscalerUpcomingScaleDownCandidates(t *testing.T) {
 	csr := clusterstate.NewClusterStateRegistry(provider, csrConfig, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), processors.AsyncNodeGroupStateChecker)
 
 	// Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test.
-	actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), options.NodeDeleteOptions{}, nil, processorstest.NewTestProcessors(&ctx).NodeGroupConfigProcessor)
+	actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), nil, options.NodeDeleteOptions{}, nil, processorstest.NewTestProcessors(&ctx).NodeGroupConfigProcessor)
 	ctx.ScaleDownActuator = actuator
 
 	// Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState.
@@ -3134,7 +3144,7 @@ func waitForDeleteToFinish(t *testing.T, deleteFinished <-chan bool) {
 	}
 }
 
-func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_processors.AutoscalingProcessors, cs *clusterstate.ClusterStateRegistry, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, nodeLatencyTracker *latencytracker.NodeLatencyTracker) (scaledown.Planner, scaledown.Actuator) {
+func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_processors.AutoscalingProcessors, cs *clusterstate.ClusterStateRegistry, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, nodeDeletionLatencyTracker latencytracker.LatencyTracker) (scaledown.Planner, scaledown.Actuator) {
 	ctx.MaxScaleDownParallelism = 10
 	ctx.MaxDrainParallelism = 1
 	ctx.NodeDeletionBatcherInterval = 0 * time.Second
@@ -3149,11 +3159,8 @@ func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_proce
 	if nodeDeletionTracker == nil {
 		nodeDeletionTracker = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
 	}
-	if nodeLatencyTracker == nil {
-		nodeLatencyTracker = latencytracker.NewNodeLatencyTracker()
-	}
-	planner := planner.New(ctx, p, deleteOptions, nil, nodeLatencyTracker)
-	actuator := actuation.NewActuator(ctx, cs, nodeDeletionTracker, nodeLatencyTracker, deleteOptions, nil, p.NodeGroupConfigProcessor)
+	planner := planner.New(ctx, p, deleteOptions, nil, nodeDeletionLatencyTracker)
+	actuator := actuation.NewActuator(ctx, cs, nodeDeletionTracker, nodeDeletionLatencyTracker, deleteOptions, nil, p.NodeGroupConfigProcessor)
 	return planner, actuator
 }
 
@@ -3269,13 +3276,13 @@ func buildStaticAutoscaler(t *testing.T, provider cloudprovider.CloudProvider, a
 	processors.ScaleDownNodeProcessor = cp
 
 	csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{OkTotalUnreadyCount: 1}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), processors.AsyncNodeGroupStateChecker)
-	actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor)
+	actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), nil, options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor)
 	ctx.ScaleDownActuator = actuator
 
 	deleteOptions := options.NewNodeDeleteOptions(ctx.AutoscalingOptions)
 	drainabilityRules := rules.Default(deleteOptions)
 
-	sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, latencytracker.NewNodeLatencyTracker())
+	sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nil)
 
 	autoscaler := &StaticAutoscaler{
 		AutoscalingContext: &ctx,
@@ -3325,3 +3332,25 @@ func assertNodesSoftTaintsStatus(t *testing.T, fakeClient *fake.Clientset, nodes
 		assert.Equal(t, tainted, taints.HasDeletionCandidateTaint(newNode))
 	}
 }
+
+// latencytrackerMock implements LatencyTracker for mocking
+type latencytrackerMock struct {
+	mock.Mock
+}
+
+func (m *latencytrackerMock) ObserveDeletion(nodeName string, timestamp time.Time) {
+	m.Called(nodeName, timestamp)
+}
+
+func (m *latencytrackerMock) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) {
+	m.Called(list, currentlyInDeletion, timestamp)
+}
+
+func (m *latencytrackerMock) UpdateThreshold(nodeName string, threshold time.Duration) {
+	m.Called(nodeName, threshold)
+}
+
+func (m *latencytrackerMock) GetTrackedNodes() []string {
+	args := m.Called()
+	return args.Get(0).([]string)
+}
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index a9bda6612117..e5598c71d011 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -18,6 +18,7 @@ package metrics
 
 import (
 	"fmt"
+	"strconv"
 	"time"
 
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
@@ -428,12 +429,12 @@ var (
 		},
 		[]string{"instance_type", "cpu_count", "namespace_count"},
 	)
-	scaleDownNodeDeletionDuration = k8smetrics.NewHistogramVec(
+	scaleDownNodeRemovalLatency = k8smetrics.NewHistogramVec(
 		&k8smetrics.HistogramOpts{
 			Namespace: caNamespace,
-			Name:      "node_deletion_duration_seconds",
+			Name:      "node_removal_latency_seconds",
 			Help:      "Latency from planning (node marked) to final outcome (deleted, aborted, rescued).",
-			Buckets:   k8smetrics.ExponentialBuckets(1, 2, 12), // 1, 2, 4, 8, ..., 2048
+			Buckets:   k8smetrics.ExponentialBuckets(1, 2, 18), // 1, 2, 4, 8, ..., 131072 (approx. 1.5 days)
 		}, []string{"deleted"},
 	)
 )
@@ -472,7 +473,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(nodeTaintsCount)
 	legacyregistry.MustRegister(inconsistentInstancesMigsCount)
 	legacyregistry.MustRegister(binpackingHeterogeneity)
-	legacyregistry.MustRegister(scaleDownNodeDeletionDuration)
+	legacyregistry.MustRegister(scaleDownNodeRemovalLatency)
 
 	if emitPerNodeGroupMetrics {
 		legacyregistry.MustRegister(nodesGroupMinNodes)
@@ -761,8 +762,8 @@ func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount strin
 	binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
 }
 
-// UpdateScaleDownNodeDeletionDuration records how long a node remained unneeded
+// UpdateScaleDownNodeRemovalLatency records how long a node remained unneeded
 // before being deleted or needed again.
-func UpdateScaleDownNodeDeletionDuration(deleted string, duration time.Duration) {
-	scaleDownNodeDeletionDuration.WithLabelValues(deleted).Observe(duration.Seconds())
+func UpdateScaleDownNodeRemovalLatency(deleted bool, duration time.Duration) {
+	scaleDownNodeRemovalLatency.WithLabelValues(strconv.FormatBool(deleted)).Observe(duration.Seconds())
 }
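
The series wires the tracker in two places: the planner reports the current unneeded set and per-node scale-down thresholds, while the actuator reports deletions just before they happen. The sketch below illustrates that intended call sequence end to end. It is illustrative only, not part of the patch: the node name "node1", the timestamps, the threshold value, and the single main() standing in for the planner and actuator loops are all invented for the example.

package main

import (
	"fmt"
	"time"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/latencytracker"
)

func main() {
	tracker := latencytracker.NewNodeLatencyTracker()
	start := time.Now()

	// Planner side: on each loop iteration, report the full unneeded set
	// together with the nodes currently being deleted. A node that later
	// drops out of the set without being deleted is recorded with
	// deleted="false" instead.
	node := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1"}}
	tracker.UpdateStateWithUnneededList([]*apiv1.Node{node}, map[string]bool{}, start)

	// The planner also records the node's scale-down-unneeded threshold;
	// the tracker subtracts it from the measured duration before observing.
	tracker.UpdateThreshold("node1", 10*time.Minute)

	// Actuator side: called just before node deletion. With the values above
	// this observes 12m - 10m = 2m in the node_removal_latency_seconds
	// histogram with the label deleted="true".
	tracker.ObserveDeletion("node1", start.Add(12*time.Minute))

	fmt.Println(tracker.GetTrackedNodes()) // [] — the node is no longer tracked
}

With this flow, the exported metric measures only the excess time beyond the configured unneeded threshold, which is why UpdateThreshold must be called before ObserveDeletion for the observation to be meaningful.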