From 07ca05c96d6fda0b4c781929f1e54cd587e39427 Mon Sep 17 00:00:00 2001 From: elmiko Date: Fri, 5 Sep 2025 16:13:56 -0400 Subject: [PATCH 1/2] add IsNodeCandidateForScaleDown interface This function allows cloud providers to specify when a node is not a good candidate for scaling down. This will occur before the autoscaler has begun to cordon, drain, and taint any node for scale down. Also adds a unit test for the prefiltering node processor. --- .../alicloud/alicloud_cloud_provider.go | 5 ++ .../cloudprovider/aws/aws_cloud_provider.go | 5 ++ .../azure/azure_cloud_provider.go | 5 ++ .../baiducloud/baiducloud_cloud_provider.go | 5 ++ .../bizflycloud/bizflycloud_cloud_provider.go | 5 ++ .../brightbox/brightbox_cloud_provider.go | 5 ++ .../cherryservers/cherry_cloud_provider.go | 5 ++ .../cloudprovider/civo/civo_cloud_provider.go | 5 ++ .../cloudprovider/cloud_provider.go | 8 ++ .../cloudstack/cloudstack_cloud_provider.go | 5 ++ .../clusterapi/clusterapi_provider.go | 5 ++ .../coreweave/coreweave_provider.go | 5 ++ .../digitalocean_cloud_provider.go | 5 ++ .../equinixmetal/cloud_provider.go | 5 ++ .../exoscale/exoscale_cloud_provider.go | 5 ++ .../externalgrpc_cloud_provider.go | 5 ++ .../cloudprovider/gce/gce_cloud_provider.go | 5 ++ .../hetzner/hetzner_cloud_provider.go | 5 ++ .../huaweicloud/huaweicloud_cloud_provider.go | 5 ++ .../ionoscloud/ionoscloud_cloud_provider.go | 5 ++ .../kamatera/kamatera_cloud_provider.go | 5 ++ .../cloudprovider/kubemark/kubemark_linux.go | 5 ++ .../cloudprovider/kwok/kwok_provider.go | 5 ++ .../linode/linode_cloud_provider.go | 5 ++ .../magnum/magnum_cloud_provider.go | 5 ++ .../cloudprovider/mocks/CloudProvider.go | 5 ++ .../oci/instancepools/oci_cloud_provider.go | 5 ++ .../oci/nodepools/oci_cloud_provider.go | 5 ++ .../ovhcloud/ovh_cloud_provider.go | 5 ++ .../cloudprovider/rancher/rancher_provider.go | 5 ++ .../scaleway/scaleway_cloud_provider.go | 5 ++ .../tencentcloud_cloud_provider.go | 5 ++ .../cloudprovider/test/test_cloud_provider.go | 44 +++++++--- .../cloudprovider/utho/utho_cloud_provider.go | 5 ++ .../volcengine/volcengine_cloud_provider.go | 5 ++ .../vultr/vultr_cloud_provider.go | 5 ++ .../nodes/pre_filtering_processor.go | 10 +++ .../nodes/pre_filtering_processor_test.go | 83 +++++++++++++++---- 38 files changed, 289 insertions(+), 26 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/alicloud/alicloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/alicloud/alicloud_cloud_provider.go index c8513c7bf964..04f727dfd29c 100644 --- a/cluster-autoscaler/cloudprovider/alicloud/alicloud_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/alicloud/alicloud_cloud_provider.go @@ -175,6 +175,11 @@ func (ali *aliCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (ali *aliCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // AliRef contains a reference to ECS instance or . type AliRef struct { ID string diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go index 3117b1c73654..bff016a107ae 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go @@ -193,6 +193,11 @@ func (aws *awsCloudProvider) Refresh() error { return aws.awsManager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (aws *awsCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // AwsRef contains a reference to some entity in AWS world. type AwsRef struct { Name string diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go index f99f2dae615d..3338d8b720d3 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go @@ -174,6 +174,11 @@ func (azure *AzureCloudProvider) Refresh() error { return azure.azureManager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (azure *AzureCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // azureRef contains a reference to some entity in Azure world. type azureRef struct { Name string diff --git a/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go index 1698eb23cbde..0260c21bf84e 100644 --- a/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/baiducloud/baiducloud_cloud_provider.go @@ -228,6 +228,11 @@ func (baiducloud *baiducloudCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (baiducloud *baiducloudCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BaiducloudRef contains a reference to some entity in baiducloud world. type BaiducloudRef struct { Name string diff --git a/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_cloud_provider.go index 4b9f5db16e9f..ec5f9decdad3 100644 --- a/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/bizflycloud/bizflycloud_cloud_provider.go @@ -172,6 +172,11 @@ func (d *bizflycloudCloudProvider) Refresh() error { return d.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (d *bizflycloudCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildBizflyCloud builds the Bizflycloud cloud provider. func BuildBizflyCloud( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/brightbox/brightbox_cloud_provider.go b/cluster-autoscaler/cloudprovider/brightbox/brightbox_cloud_provider.go index f2806ece76db..6de29dec64f3 100644 --- a/cluster-autoscaler/cloudprovider/brightbox/brightbox_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/brightbox/brightbox_cloud_provider.go @@ -219,6 +219,11 @@ func (b *brightboxCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (b *brightboxCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildBrightbox builds the Brightbox provider func BuildBrightbox( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/cherryservers/cherry_cloud_provider.go b/cluster-autoscaler/cloudprovider/cherryservers/cherry_cloud_provider.go index 0407f1971e8e..87b0793d315d 100644 --- a/cluster-autoscaler/cloudprovider/cherryservers/cherry_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/cherryservers/cherry_cloud_provider.go @@ -170,6 +170,11 @@ func (ccp *cherryCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (ccp *cherryCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildCherry is called by the autoscaler to build a Cherry Servers cloud provider. // // The cherryManager is created here, and the node groups are created diff --git a/cluster-autoscaler/cloudprovider/civo/civo_cloud_provider.go b/cluster-autoscaler/cloudprovider/civo/civo_cloud_provider.go index c5b69be57d1c..8c317ead4894 100644 --- a/cluster-autoscaler/cloudprovider/civo/civo_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/civo/civo_cloud_provider.go @@ -167,6 +167,11 @@ func (d *civoCloudProvider) Refresh() error { return d.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (d *civoCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildCivo builds the Civo cloud provider. func BuildCivo( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/cloud_provider.go b/cluster-autoscaler/cloudprovider/cloud_provider.go index f19fa8817aea..ed48844d63e2 100644 --- a/cluster-autoscaler/cloudprovider/cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/cloud_provider.go @@ -154,6 +154,14 @@ type CloudProvider interface { // Refresh is called before every main loop and can be used to dynamically update cloud provider state. // In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh(). Refresh() error + + // IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. This function + // will be called during prefiltering of nodes for scaledown to allow cloud providers the opportunity + // to reject a node for scale down. This may be used in cases where nodes are undergoing upgrades or other + // cloud-specific behavior where the cluster autoscaler should not begin cordoning, draining, and tainting + // the node. + // Returns true if the node can be safely scaled down or false otherwise. + IsNodeCandidateForScaleDown(*apiv1.Node) (bool, error) } // ErrNotImplemented is returned if a method is not implemented. diff --git a/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_cloud_provider.go b/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_cloud_provider.go index cbc2996155d0..47bb994a7934 100644 --- a/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/cloudstack/cloudstack_cloud_provider.go @@ -115,6 +115,11 @@ func (provider *cloudStackCloudProvider) Pricing() (cloudprovider.PricingModel, return nil, cloudprovider.ErrNotImplemented } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (provider *cloudStackCloudProvider) IsNodeCandidateForScaleDown(node *v1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically // created on the cloud provider side. The node group is not returned by NodeGroups() until it is created. func (provider *cloudStackCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, taints []v1.Taint, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) { diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go index 94a322e5962f..35d57c5c2936 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go @@ -140,6 +140,11 @@ func (p *provider) GetNodeGpuConfig(node *corev1.Node) *cloudprovider.GpuConfig return gpu.GetNodeGPUFromCloudProvider(p, node) } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (p *provider) IsNodeCandidateForScaleDown(node *corev1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + func newProvider( name string, rl *cloudprovider.ResourceLimiter, diff --git a/cluster-autoscaler/cloudprovider/coreweave/coreweave_provider.go b/cluster-autoscaler/cloudprovider/coreweave/coreweave_provider.go index 84dadf018873..a24fccd816c6 100644 --- a/cluster-autoscaler/cloudprovider/coreweave/coreweave_provider.go +++ b/cluster-autoscaler/cloudprovider/coreweave/coreweave_provider.go @@ -181,6 +181,11 @@ func (c *CoreWeaveCloudProvider) Refresh() error { return c.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (c *CoreWeaveCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildCoreWeave builds the CoreWeave cloud provider with the given options and returns it. func BuildCoreWeave(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider { klog.V(4).Infof("Building CoreWeave cloud provider with options: %+v", opts) diff --git a/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_cloud_provider.go b/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_cloud_provider.go index b9e4941dffe6..bedcf42e2ff4 100644 --- a/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/digitalocean/digitalocean_cloud_provider.go @@ -169,6 +169,11 @@ func (d *digitaloceanCloudProvider) Refresh() error { return d.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (d *digitaloceanCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildDigitalOcean builds the DigitalOcean cloud provider. func BuildDigitalOcean( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/equinixmetal/cloud_provider.go b/cluster-autoscaler/cloudprovider/equinixmetal/cloud_provider.go index fd1ec89be920..9d2b9e89db0d 100644 --- a/cluster-autoscaler/cloudprovider/equinixmetal/cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/equinixmetal/cloud_provider.go @@ -176,6 +176,11 @@ func (pcp *equinixMetalCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (pcp *equinixMetalCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildCloudProvider is called by the autoscaler to build an Equinix Metal cloud provider. // // The equinixMetalManager is created here, and the node groups are created diff --git a/cluster-autoscaler/cloudprovider/exoscale/exoscale_cloud_provider.go b/cluster-autoscaler/cloudprovider/exoscale/exoscale_cloud_provider.go index 067340e19ad9..22d886baa450 100644 --- a/cluster-autoscaler/cloudprovider/exoscale/exoscale_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/exoscale/exoscale_cloud_provider.go @@ -224,6 +224,11 @@ func (e *exoscaleCloudProvider) Refresh() error { return e.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (e *exoscaleCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildExoscale builds the Exoscale cloud provider. func BuildExoscale(_ config.AutoscalingOptions, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider { manager, err := newManager(discoveryOpts) diff --git a/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_cloud_provider.go b/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_cloud_provider.go index 2779f229e3d3..acc30587d7fd 100644 --- a/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/externalgrpc/externalgrpc_cloud_provider.go @@ -316,6 +316,11 @@ func (e *externalGrpcCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (e *externalGrpcCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildExternalGrpc builds the externalgrpc cloud provider. func BuildExternalGrpc( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go index 73d5773e7472..bac37987c2ee 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go @@ -148,6 +148,11 @@ func (gce *GceCloudProvider) Refresh() error { return gce.gceManager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (gce *GceCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // GceRef contains s reference to some entity in GCE world. type GceRef struct { Project string diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index 07e73cfd3c6e..5ec235fee2fc 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -182,6 +182,11 @@ func (d *HetznerCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (d *HetznerCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildHetzner builds the Hetzner cloud provider. func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider { manager, err := newManager() diff --git a/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_cloud_provider.go index d58dae40763f..9dfcd1e5e03c 100644 --- a/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_cloud_provider.go @@ -179,6 +179,11 @@ func (hcp *huaweicloudCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (hcp *huaweicloudCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + func (hcp *huaweicloudCloudProvider) buildAsgs(specs []string) error { asgs, err := hcp.cloudServiceManager.ListScalingGroups() if err != nil { diff --git a/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go index 7669cba28b09..ea0d53af2350 100644 --- a/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/ionoscloud/ionoscloud_cloud_provider.go @@ -308,6 +308,11 @@ func (ic *IonosCloudCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (ic *IonosCloudCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildIonosCloud builds the IonosCloud cloud provider. func BuildIonosCloud( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/kamatera/kamatera_cloud_provider.go b/cluster-autoscaler/cloudprovider/kamatera/kamatera_cloud_provider.go index ed8af5cc0fee..d072cfe84906 100644 --- a/cluster-autoscaler/cloudprovider/kamatera/kamatera_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/kamatera/kamatera_cloud_provider.go @@ -129,6 +129,11 @@ func (k *kamateraCloudProvider) Refresh() error { return k.manager.refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (k *kamateraCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildKamatera builds the Kamatera cloud provider. func BuildKamatera( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go b/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go index a38148a1e616..f95f8c06f00d 100644 --- a/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go +++ b/cluster-autoscaler/cloudprovider/kubemark/kubemark_linux.go @@ -180,6 +180,11 @@ func (kubemark *KubemarkCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (kubemark *KubemarkCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // NodeGroup implements NodeGroup interface. type NodeGroup struct { Name string diff --git a/cluster-autoscaler/cloudprovider/kwok/kwok_provider.go b/cluster-autoscaler/cloudprovider/kwok/kwok_provider.go index a1691171bf0e..0972d6235f8c 100644 --- a/cluster-autoscaler/cloudprovider/kwok/kwok_provider.go +++ b/cluster-autoscaler/cloudprovider/kwok/kwok_provider.go @@ -170,6 +170,11 @@ func (kwok *KwokCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (kwok *KwokCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildKwok builds kwok cloud provider. func BuildKwok(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, diff --git a/cluster-autoscaler/cloudprovider/linode/linode_cloud_provider.go b/cluster-autoscaler/cloudprovider/linode/linode_cloud_provider.go index a5b44563ded7..d6f968e005cd 100644 --- a/cluster-autoscaler/cloudprovider/linode/linode_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/linode/linode_cloud_provider.go @@ -151,6 +151,11 @@ func (l *linodeCloudProvider) Refresh() error { return l.manager.refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (l *linodeCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + func newLinodeCloudProvider(config io.Reader, rl *cloudprovider.ResourceLimiter) (cloudprovider.CloudProvider, error) { m, err := newManager(config) if err != nil { diff --git a/cluster-autoscaler/cloudprovider/magnum/magnum_cloud_provider.go b/cluster-autoscaler/cloudprovider/magnum/magnum_cloud_provider.go index c6161e362157..2ea9c769b9e0 100644 --- a/cluster-autoscaler/cloudprovider/magnum/magnum_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/magnum/magnum_cloud_provider.go @@ -203,6 +203,11 @@ func (mcp *magnumCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (mcp *magnumCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // refreshNodeGroups gets the list of node groups which meet the requirements for autoscaling, // creates magnumNodeGroups for any that do not exist in the cloud provider, // and drops any node groups which are present in the cloud provider but not in the diff --git a/cluster-autoscaler/cloudprovider/mocks/CloudProvider.go b/cluster-autoscaler/cloudprovider/mocks/CloudProvider.go index 23ea1ae21d1a..49173c32f207 100644 --- a/cluster-autoscaler/cloudprovider/mocks/CloudProvider.go +++ b/cluster-autoscaler/cloudprovider/mocks/CloudProvider.go @@ -276,3 +276,8 @@ func (_m *CloudProvider) Refresh() error { return r0 } + +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (_m *CloudProvider) IsNodeCandidateForScaleDown(node *v1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} diff --git a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go index 425f4350331c..241c7013274d 100644 --- a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go @@ -147,6 +147,11 @@ func (ocp *OciCloudProvider) Refresh() error { return ocp.poolManager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (ocp *OciCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildOCI constructs the OciCloudProvider object that implements the could provider interface (InstancePoolManager). func BuildOCI(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider { ocidType, err := ocicommon.GetAllPoolTypes(opts.NodeGroups) diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_cloud_provider.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_cloud_provider.go index ae3f70549151..a64fbf854c7a 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_cloud_provider.go @@ -149,3 +149,8 @@ func (ocp *OciCloudProvider) Cleanup() error { func (ocp *OciCloudProvider) Refresh() error { return ocp.manager.Refresh() } + +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (ocp *OciCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} diff --git a/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider.go b/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider.go index c731a8b8e3d4..cae8aa52153a 100644 --- a/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider.go @@ -318,3 +318,8 @@ func (provider *OVHCloudProvider) Refresh() error { return nil } + +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (provider *OVHCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} diff --git a/cluster-autoscaler/cloudprovider/rancher/rancher_provider.go b/cluster-autoscaler/cloudprovider/rancher/rancher_provider.go index a7b94d902b3e..85f927125037 100644 --- a/cluster-autoscaler/cloudprovider/rancher/rancher_provider.go +++ b/cluster-autoscaler/cloudprovider/rancher/rancher_provider.go @@ -219,6 +219,11 @@ func (provider *RancherCloudProvider) Cleanup() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (provider *RancherCloudProvider) IsNodeCandidateForScaleDown(node *corev1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + func (provider *RancherCloudProvider) scalableNodeGroups() ([]*nodeGroup, error) { var result []*nodeGroup diff --git a/cluster-autoscaler/cloudprovider/scaleway/scaleway_cloud_provider.go b/cluster-autoscaler/cloudprovider/scaleway/scaleway_cloud_provider.go index a214d16823ff..08104ee37a86 100644 --- a/cluster-autoscaler/cloudprovider/scaleway/scaleway_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/scaleway/scaleway_cloud_provider.go @@ -277,3 +277,8 @@ func (scw *scalewayCloudProvider) Refresh() error { return nil } + +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (scw *scalewayCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} diff --git a/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_cloud_provider.go b/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_cloud_provider.go index a33c140fb315..b93b5ec8654c 100644 --- a/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_cloud_provider.go @@ -173,6 +173,11 @@ func (tencentcloud *tencentCloudProvider) Refresh() error { return tencentcloud.tencentcloudManager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (tencentcloud *tencentCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildTencentcloud returns tencentcloud provider func BuildTencentcloud(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider { var config io.ReadCloser diff --git a/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go b/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go index 2c3bcf6a901f..94039d07ec25 100644 --- a/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go @@ -45,20 +45,25 @@ type OnNodeGroupDeleteFunc func(string) error // HasInstance is a function called to determine if a node has been removed from the cloud provider. type HasInstance func(string) (bool, error) +// IsNodeCandidateForScaleDown is a function called to determine if a cloud provider considers +// a node a good candidate for scaling down. +type IsNodeCandidateForScaleDown func(*apiv1.Node) (bool, error) + // TestCloudProvider is a dummy cloud provider to be used in tests. type TestCloudProvider struct { sync.Mutex - nodes map[string]string - groups map[string]cloudprovider.NodeGroup - onScaleUp func(string, int) error - onScaleDown func(string, string) error - onNodeGroupCreate func(string) error - onNodeGroupDelete func(string) error - hasInstance func(string) (bool, error) - machineTypes []string - machineTemplates map[string]*framework.NodeInfo - priceModel cloudprovider.PricingModel - resourceLimiter *cloudprovider.ResourceLimiter + nodes map[string]string + groups map[string]cloudprovider.NodeGroup + onScaleUp func(string, int) error + onScaleDown func(string, string) error + onNodeGroupCreate func(string) error + onNodeGroupDelete func(string) error + hasInstance func(string) (bool, error) + isNodeCandidateForScaleDown func(*apiv1.Node) (bool, error) + machineTypes []string + machineTemplates map[string]*framework.NodeInfo + priceModel cloudprovider.PricingModel + resourceLimiter *cloudprovider.ResourceLimiter } // TestCloudProviderBuilder is used to create CloudProvider @@ -127,6 +132,14 @@ func (b *TestCloudProviderBuilder) WithHasInstance(hasInstance HasInstance) *Tes return b } +// WithIsNodeCandidateForScaleDown adds an IsNodeCandidateForScaleDown handler to provider. +func (b *TestCloudProviderBuilder) WithIsNodeCandidateForScaleDown(isNodeCandidateForScaleDown IsNodeCandidateForScaleDown) *TestCloudProviderBuilder { + b.builders = append(b.builders, func(p *TestCloudProvider) { + p.isNodeCandidateForScaleDown = isNodeCandidateForScaleDown + }) + return b +} + // Build returns a built test cloud provider func (b *TestCloudProviderBuilder) Build() *TestCloudProvider { p := &TestCloudProvider{ @@ -352,6 +365,15 @@ func (tcp *TestCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (tcp *TestCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + if tcp.isNodeCandidateForScaleDown == nil { + return true, cloudprovider.ErrNotImplemented + } + + return tcp.isNodeCandidateForScaleDown(node) +} + // TestNodeGroup is a node group used by TestCloudProvider. type TestNodeGroup struct { sync.Mutex diff --git a/cluster-autoscaler/cloudprovider/utho/utho_cloud_provider.go b/cluster-autoscaler/cloudprovider/utho/utho_cloud_provider.go index 9d511e127c70..397b2c6d8675 100644 --- a/cluster-autoscaler/cloudprovider/utho/utho_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/utho/utho_cloud_provider.go @@ -162,6 +162,11 @@ func (u *uthoCloudProvider) Refresh() error { return u.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (u *uthoCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // BuildUtho builds the Utho cloud provider. func BuildUtho( opts config.AutoscalingOptions, diff --git a/cluster-autoscaler/cloudprovider/volcengine/volcengine_cloud_provider.go b/cluster-autoscaler/cloudprovider/volcengine/volcengine_cloud_provider.go index 53ad1ec1139d..cb17bb0f55c5 100644 --- a/cluster-autoscaler/cloudprovider/volcengine/volcengine_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/volcengine/volcengine_cloud_provider.go @@ -120,6 +120,11 @@ func (v *volcengineCloudProvider) Refresh() error { return nil } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (v *volcengineCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // GetNodeGpuConfig returns the label, type and resource name for the GPU added to node. If node doesn't have // any GPUs, it returns nil. func (v *volcengineCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider.GpuConfig { diff --git a/cluster-autoscaler/cloudprovider/vultr/vultr_cloud_provider.go b/cluster-autoscaler/cloudprovider/vultr/vultr_cloud_provider.go index 5234a1019786..822afe70da24 100644 --- a/cluster-autoscaler/cloudprovider/vultr/vultr_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/vultr/vultr_cloud_provider.go @@ -140,6 +140,11 @@ func (v *vultrCloudProvider) Refresh() error { return v.manager.Refresh() } +// IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. +func (v *vultrCloudProvider) IsNodeCandidateForScaleDown(node *apiv1.Node) (bool, error) { + return true, cloudprovider.ErrNotImplemented +} + // toProviderID returns a provider ID from the given node ID. func toProviderID(nodeID string) string { return fmt.Sprintf("%s%s", vultrProviderIDPrefix, nodeID) diff --git a/cluster-autoscaler/processors/nodes/pre_filtering_processor.go b/cluster-autoscaler/processors/nodes/pre_filtering_processor.go index 7a3091660135..0cf745dabaa1 100644 --- a/cluster-autoscaler/processors/nodes/pre_filtering_processor.go +++ b/cluster-autoscaler/processors/nodes/pre_filtering_processor.go @@ -22,6 +22,7 @@ import ( apiv1 "k8s.io/api/core/v1" klog "k8s.io/klog/v2" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/utils" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" @@ -48,6 +49,15 @@ func (n *PreFilteringScaleDownNodeProcessor) GetScaleDownCandidates(ctx *context nodeGroupSize := utils.GetNodeGroupSizeMap(ctx.CloudProvider) for _, node := range nodes { + if candidate, err := ctx.CloudProvider.IsNodeCandidateForScaleDown(node); err != nil { + if err != cloudprovider.ErrNotImplemented { + klog.Warningf("Error while checking if node is a candidate for deletion %s: %v", node.Name, err) + continue + } + } else if !candidate { + klog.V(5).Infof("Node %s is not a candidate for deletion (cloud provider determined)", node.Name) + continue + } nodeGroup, err := ctx.CloudProvider.NodeGroupForNode(node) if err != nil { klog.Warningf("Error while checking node group for %s: %v", node.Name, err) diff --git a/cluster-autoscaler/processors/nodes/pre_filtering_processor_test.go b/cluster-autoscaler/processors/nodes/pre_filtering_processor_test.go index e8165620bf5a..9e120997e398 100644 --- a/cluster-autoscaler/processors/nodes/pre_filtering_processor_test.go +++ b/cluster-autoscaler/processors/nodes/pre_filtering_processor_test.go @@ -17,6 +17,7 @@ limitations under the License. package nodes import ( + "strings" "testing" "github.com/stretchr/testify/assert" @@ -44,23 +45,75 @@ func TestPreFilteringScaleDownNodeProcessor_GetScaleDownCandidateNodes(t *testin ng1_1 := BuildTestNode("ng1-1", 1000, 1000) ng1_2 := BuildTestNode("ng1-2", 1000, 1000) ng2_1 := BuildTestNode("ng2-1", 1000, 1000) + ng2_2 := BuildTestNode("ng2-2", 1000, 1000) noNg := BuildTestNode("no-ng", 1000, 1000) - provider := testprovider.NewTestCloudProviderBuilder().Build() - provider.AddNodeGroup("ng1", 1, 10, 2) - provider.AddNodeGroup("ng2", 1, 10, 1) - provider.AddNode("ng1", ng1_1) - provider.AddNode("ng1", ng1_2) - provider.AddNode("ng2", ng2_1) - - ctx := &context.AutoscalingContext{ - CloudProvider: provider, + + testCases := map[string]struct { + buildProvider func() *testprovider.TestCloudProvider + configureProvider func(p *testprovider.TestCloudProvider) + expectedNodes []*apiv1.Node + inputNodes []*apiv1.Node + }{ + // Expectation: only node groups not at minimum size should be candidates. + "1 scale down candidate, 1 node group at minimum size, 1 node with no node group, 1 node group above minimum size.": { + configureProvider: func(p *testprovider.TestCloudProvider) { + p.AddNodeGroup("ng1", 1, 10, 2) + p.AddNodeGroup("ng2", 1, 10, 1) + p.AddNode("ng1", ng1_1) + p.AddNode("ng1", ng1_2) + p.AddNode("ng2", ng2_1) + }, + expectedNodes: []*apiv1.Node{ng1_1, ng1_2}, + inputNodes: []*apiv1.Node{ng1_1, ng1_2, ng2_1, noNg}, + }, + // Expectation: only node groups that contain nodes the cloud provider considers candidates for deletion should be candidates. + "1 scale down candidate, 1 node group with nodes that are not candidates for deletion, 1 node group above minimum size.": { + buildProvider: func() *testprovider.TestCloudProvider { + provider := testprovider. + NewTestCloudProviderBuilder(). + WithIsNodeCandidateForScaleDown(func(n *apiv1.Node) (bool, error) { + if strings.HasPrefix(n.Name, "ng2") { + return false, nil + } + return true, nil + }). + Build() + return provider + }, + configureProvider: func(p *testprovider.TestCloudProvider) { + p.AddNodeGroup("ng1", 1, 10, 2) + p.AddNodeGroup("ng2", 1, 10, 2) + p.AddNode("ng1", ng1_1) + p.AddNode("ng1", ng1_2) + p.AddNode("ng2", ng2_1) + p.AddNode("ng2", ng2_2) + }, + expectedNodes: []*apiv1.Node{ng1_1, ng1_2}, + inputNodes: []*apiv1.Node{ng1_1, ng1_2, ng2_1, ng2_2}, + }, } - expectedNodes := []*apiv1.Node{ng1_1, ng1_2} - defaultProcessor := NewPreFilteringScaleDownNodeProcessor() - inputNodes := []*apiv1.Node{ng1_1, ng1_2, ng2_1, noNg} - result, err := defaultProcessor.GetScaleDownCandidates(ctx, inputNodes) + for description, testCase := range testCases { + t.Run(description, func(t *testing.T) { + var provider *testprovider.TestCloudProvider + if testCase.buildProvider == nil { + provider = testprovider.NewTestCloudProviderBuilder().Build() + } else { + provider = testCase.buildProvider() + } + assert.NotNil(t, provider) - assert.NoError(t, err) - assert.Equal(t, result, expectedNodes) + testCase.configureProvider(provider) + + ctx := &context.AutoscalingContext{ + CloudProvider: provider, + } + + defaultProcessor := NewPreFilteringScaleDownNodeProcessor() + result, err := defaultProcessor.GetScaleDownCandidates(ctx, testCase.inputNodes) + + assert.NoError(t, err) + assert.Equal(t, result, testCase.expectedNodes) + }) + } } From a81913f3b9fc0e4427e2617ad48c453944683edc Mon Sep 17 00:00:00 2001 From: elmiko Date: Mon, 8 Sep 2025 15:53:23 -0400 Subject: [PATCH 2/2] add clusterapi IsNodeCandidateForScaleDown impl The initial implementation of this function for clusterapi will return that a node is not a good candidate for scale down when it belongs to a MachineDeployment that is currently rolling out an upgrade. --- .../clusterapi/clusterapi_controller.go | 21 +++++++ .../clusterapi/clusterapi_nodegroup.go | 62 +++++++++++++++++++ .../clusterapi/clusterapi_provider.go | 20 +++++- .../clusterapi/clusterapi_utils.go | 20 +++--- 4 files changed, 111 insertions(+), 12 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go index b3b7175dfa95..a3e03e9703c3 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go @@ -849,6 +849,27 @@ func (c *machineController) listMachinesForScalableResource(r *unstructured.Unst } } +func (c *machineController) listMachineSetsForMachineDeployment(r *unstructured.Unstructured) ([]*unstructured.Unstructured, error) { + selector := labels.SelectorFromSet(map[string]string{ + machineDeploymentNameLabel: r.GetName(), + }) + objs, err := c.machineSetInformer.Lister().ByNamespace(r.GetNamespace()).List(selector) + if err != nil { + return nil, fmt.Errorf("unable to list MachineSets for MachineDeployment %s: %w", r.GetName(), err) + } + + results := make([]*unstructured.Unstructured, 0, len(objs)) + for _, x := range objs { + u, ok := x.(*unstructured.Unstructured) + if !ok { + return nil, fmt.Errorf("expected unstructured resource from lister, not %T", x) + } + results = append(results, u.DeepCopy()) + } + + return results, nil +} + func (c *machineController) listScalableResources() ([]*unstructured.Unstructured, error) { scalableResources, err := c.listResources(c.machineSetInformer.Lister()) if err != nil { diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go index d68a48b89c3f..4b6bb1b2905a 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go @@ -458,6 +458,68 @@ func (ng *nodegroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*c return &defaults, nil } +func (ng *nodegroup) IsMachineDeploymentAndRollingOut() (bool, error) { + if ng.scalableResource.Kind() != machineDeploymentKind { + // Not a MachineDeployment. + return false, nil + } + + machineSets, err := ng.machineController.listMachineSetsForMachineDeployment(ng.scalableResource.unstructured) + if err != nil { + return false, err + } + + if len(machineSets) == 0 { + // No MachineSets => MD is not rolling out. + return false, nil + } + + // Find the latest revision, the MachineSet with the latest revision is the MachineSet that + // matches the MachineDeployment spec. + var latestMSRevisionInt int64 + for _, ms := range machineSets { + msRevision, ok := ms.GetAnnotations()[machineDeploymentRevisionAnnotation] + if !ok { + continue + } + + msRevisionInt, err := strconv.ParseInt(msRevision, 10, 64) + if err != nil { + return false, errors.Wrapf(err, "failed to parse current revision on MachineSet %s", klog.KObj(ms)) + } + latestMSRevisionInt = max(latestMSRevisionInt, msRevisionInt) + } + maxMSRevision := strconv.FormatInt(latestMSRevisionInt, 10) + + for _, ms := range machineSets { + if ms.GetAnnotations()[machineDeploymentRevisionAnnotation] == maxMSRevision { + // Ignore the MachineSet with the latest revision + continue + } + + // Check if any of the old MachineSets still have replicas + replicas, found, err := unstructured.NestedInt64(ms.UnstructuredContent(), "spec", "replicas") + if err != nil { + return false, errors.Wrapf(err, "failed to find spec replicas on MachineSet %s", klog.KObj(ms)) + } + if found && replicas > 0 { + // Found old MachineSets that still has replicas => MD is still rolling out. + return true, nil + } + replicas, found, err = unstructured.NestedInt64(ms.UnstructuredContent(), "status", "replicas") + if err != nil { + return false, errors.Wrapf(err, "failed to find status replicas on MachineSet %s", klog.KObj(ms)) + } + if found && replicas > 0 { + // Found old MachineSets that still has replicas => MD is still rolling out. + return true, nil + } + } + + // Didn't find any old MachineSets that still have replicas => MD is not rolling out. + return false, nil +} + func newNodeGroupFromScalableResource(controller *machineController, unstructuredScalableResource *unstructured.Unstructured) (*nodegroup, error) { // Ensure that the resulting node group would be allowed based on the autodiscovery specs if defined if !controller.allowedByAutoDiscoverySpecs(unstructuredScalableResource) { diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go index 35d57c5c2936..b1777045131f 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go @@ -21,6 +21,7 @@ import ( "path" "reflect" + "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/client-go/discovery" @@ -34,7 +35,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/config" - "k8s.io/autoscaler/cluster-autoscaler/utils/errors" + caserrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" ) @@ -92,7 +93,7 @@ func (p *provider) HasInstance(node *corev1.Node) (bool, error) { return false, fmt.Errorf("machine not found for node %s: %v", node.Name, err) } -func (*provider) Pricing() (cloudprovider.PricingModel, errors.AutoscalerError) { +func (*provider) Pricing() (cloudprovider.PricingModel, caserrors.AutoscalerError) { return nil, cloudprovider.ErrNotImplemented } @@ -142,7 +143,20 @@ func (p *provider) GetNodeGpuConfig(node *corev1.Node) *cloudprovider.GpuConfig // IsNodeCandidateForScaleDown returns whether the node is a good candidate for scaling down. func (p *provider) IsNodeCandidateForScaleDown(node *corev1.Node) (bool, error) { - return true, cloudprovider.ErrNotImplemented + ng, err := p.controller.nodeGroupForNode(node) + if err != nil { + return false, errors.Wrapf(err, "failed to determine node group for node %s", klog.KObj(node)) + } + if ng == nil { + klog.V(5).Infof("node %s is not part of a node group", klog.KObj(node)) + return false, nil + } + rollingout, err := ng.IsMachineDeploymentAndRollingOut() + if err != nil { + return false, errors.Wrapf(err, "failed to determine rolling out status for MachineDeployment %s", ng.scalableResource.ID()) + } + // A node is a good candidate for scale down if it is not currently part of a MachineDeployment that is rolling out. + return !rollingout, nil } func newProvider( diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go index f28721f49b74..03b86adce90d 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go @@ -32,15 +32,17 @@ import ( ) const ( - cpuKey = "capacity.cluster-autoscaler.kubernetes.io/cpu" - memoryKey = "capacity.cluster-autoscaler.kubernetes.io/memory" - diskCapacityKey = "capacity.cluster-autoscaler.kubernetes.io/ephemeral-disk" - gpuTypeKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-type" - gpuCountKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-count" - maxPodsKey = "capacity.cluster-autoscaler.kubernetes.io/maxPods" - taintsKey = "capacity.cluster-autoscaler.kubernetes.io/taints" - labelsKey = "capacity.cluster-autoscaler.kubernetes.io/labels" - draDriverKey = "capacity.cluster-autoscaler.kubernetes.io/dra-driver" + cpuKey = "capacity.cluster-autoscaler.kubernetes.io/cpu" + memoryKey = "capacity.cluster-autoscaler.kubernetes.io/memory" + diskCapacityKey = "capacity.cluster-autoscaler.kubernetes.io/ephemeral-disk" + gpuTypeKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-type" + gpuCountKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-count" + maxPodsKey = "capacity.cluster-autoscaler.kubernetes.io/maxPods" + taintsKey = "capacity.cluster-autoscaler.kubernetes.io/taints" + labelsKey = "capacity.cluster-autoscaler.kubernetes.io/labels" + draDriverKey = "capacity.cluster-autoscaler.kubernetes.io/dra-driver" + machineDeploymentRevisionAnnotation = "machinedeployment.clusters.x-k8s.io/revision" + machineDeploymentNameLabel = "cluster.x-k8s.io/deployment-name" // UnknownArch is used if the Architecture is Unknown UnknownArch SystemArchitecture = "" // Amd64 is used if the Architecture is x86_64