Skip to content

Commit 1e3cede

Browse files
committed
Do not remove healthy nodes from partially failing zero-or-max-scaling node pool scale-ups
1 parent 2945e95 commit 1e3cede

File tree

3 files changed

+249
-62
lines changed

3 files changed

+249
-62
lines changed

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ type NodeGroupAutoscalingOptions struct {
5252
MaxNodeProvisionTime time.Duration
5353
// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
5454
ZeroOrMaxNodeScaling bool
55+
// AllowNonAtomicScaleUpToMax indicates that partially failing scale-ups of ZeroOrMaxNodeScaling node groups should not be cancelled
56+
AllowNonAtomicScaleUpToMax bool
5557
// IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down
5658
IgnoreDaemonSetsUtilization bool
5759
}

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,10 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
801801
continue
802802
}
803803

804+
if len(nodesToDelete) == 0 {
805+
continue
806+
}
807+
804808
if a.ForceDeleteLongUnregisteredNodes {
805809
err = nodeGroup.ForceDeleteNodes(nodesToDelete)
806810
if err == cloudprovider.ErrNotImplemented {
@@ -880,12 +884,14 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
880884
if nodeGroup == nil {
881885
err = fmt.Errorf("node group %s not found", nodeGroupId)
882886
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil {
883-
err = nodeGroup.DeleteNodes(nodesToDelete)
887+
if len(nodesToDelete) > 0 {
888+
err = nodeGroup.DeleteNodes(nodesToDelete)
889+
}
884890
}
885891

886892
if err != nil {
887893
klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
888-
} else {
894+
} else if len(nodesToDelete) > 0 {
889895
deletedAny = true
890896
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
891897
}
@@ -898,21 +904,29 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
898904
}
899905

900906
// overrideNodesToDeleteForZeroOrMax returns a list of nodes to delete, taking into account that
901-
// node deletion for a "ZeroOrMaxNodeScaling" node group is atomic and should delete all nodes.
907+
// node deletion for a "ZeroOrMaxNodeScaling" should either keep or remove all the nodes.
902908
// For a non-"ZeroOrMaxNodeScaling" node group it returns the unchanged list of nodes to delete.
903909
func overrideNodesToDeleteForZeroOrMax(defaults config.NodeGroupAutoscalingOptions, nodeGroup cloudprovider.NodeGroup, nodesToDelete []*apiv1.Node) ([]*apiv1.Node, error) {
904910
opts, err := nodeGroup.GetOptions(defaults)
905911
if err != nil && err != cloudprovider.ErrNotImplemented {
906912
return []*apiv1.Node{}, fmt.Errorf("Failed to get node group options for %s: %s", nodeGroup.Id(), err)
907913
}
908914
// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
909-
// should stick to the all-or-nothing principle. Deleting all nodes.
915+
// node deletion for a "ZeroOrMaxNodeScaling" node group is atomic and should delete all nodes or none.
910916
if opts != nil && opts.ZeroOrMaxNodeScaling {
911917
instances, err := nodeGroup.Nodes()
912918
if err != nil {
913919
return []*apiv1.Node{}, fmt.Errorf("Failed to fill in nodes to delete from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroup.Id(), err)
914920
}
915-
return instancesToFakeNodes(instances), nil
921+
922+
// Remove all nodes in case when either:
923+
// 1. All nodes are failing
924+
// 2. AllowNonAtomicScaleUpToMax is false which means we want to atomically remove partially failed node groups
925+
if len(instances) == len(nodesToDelete) || !opts.AllowNonAtomicScaleUpToMax {
926+
// Remove all nodes
927+
return instancesToFakeNodes(instances), nil
928+
}
929+
return []*apiv1.Node{}, nil
916930
}
917931
// No override needed.
918932
return nodesToDelete, nil

0 commit comments

Comments
 (0)