Skip to content

Commit cae87c7

Browse files
committed
add clusterapi IsNodeCandidateForDeletion impl
The initial implementation of this function for clusterapi will return that a node is not a good candidate for deletion when it belongs to a MachineDeployment that is currently rolling out an upgrade.
1 parent 6cada97 commit cae87c7

File tree

4 files changed

+110
-12
lines changed

4 files changed

+110
-12
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,27 @@ func (c *machineController) listMachinesForScalableResource(r *unstructured.Unst
849849
}
850850
}
851851

852+
func (c *machineController) listMachineSetsForMachineDeployment(r *unstructured.Unstructured) ([]*unstructured.Unstructured, error) {
853+
selector := labels.SelectorFromSet(map[string]string{
854+
"cluster.x-k8s.io/deployment-name": r.GetName(),
855+
})
856+
objs, err := c.machineSetInformer.Lister().ByNamespace(r.GetNamespace()).List(selector)
857+
if err != nil {
858+
return nil, err
859+
}
860+
861+
results := make([]*unstructured.Unstructured, 0, len(objs))
862+
for _, x := range objs {
863+
u, ok := x.(*unstructured.Unstructured)
864+
if !ok {
865+
return nil, fmt.Errorf("expected unstructured resource from lister, not %T", x)
866+
}
867+
results = append(results, u.DeepCopy())
868+
}
869+
870+
return results, nil
871+
}
872+
852873
func (c *machineController) listScalableResources() ([]*unstructured.Unstructured, error) {
853874
scalableResources, err := c.listResources(c.machineSetInformer.Lister())
854875
if err != nil {

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,68 @@ func (ng *nodegroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*c
458458
return &defaults, nil
459459
}
460460

461+
func (ng *nodegroup) IsMachineDeploymentAndRollingOut() (bool, error) {
462+
if ng.scalableResource.Kind() != machineDeploymentKind {
463+
// Not a MachineDeployment.
464+
return false, nil
465+
}
466+
467+
machineSets, err := ng.machineController.listMachineSetsForMachineDeployment(ng.scalableResource.unstructured)
468+
if err != nil {
469+
return false, err
470+
}
471+
472+
if len(machineSets) == 0 {
473+
// No MachineSets => MD is not rolling out.
474+
return false, nil
475+
}
476+
477+
// Find the latest revision, the MachineSet with the latest revision is the MachineSet that
478+
// matches the MachineDeployment spec.
479+
var latestMSRevisionInt int64
480+
for _, ms := range machineSets {
481+
msRevision, ok := ms.GetAnnotations()[machineDeploymentRevisionAnnotation]
482+
if !ok {
483+
continue
484+
}
485+
486+
msRevisionInt, err := strconv.ParseInt(msRevision, 10, 64)
487+
if err != nil {
488+
return false, errors.Wrapf(err, "failed to parse current revision on MachineSet %s", klog.KObj(ms))
489+
}
490+
latestMSRevisionInt = max(latestMSRevisionInt, msRevisionInt)
491+
}
492+
maxMSRevision := strconv.FormatInt(latestMSRevisionInt, 10)
493+
494+
for _, ms := range machineSets {
495+
if ms.GetAnnotations()[machineDeploymentRevisionAnnotation] == maxMSRevision {
496+
// Ignore the MachineSet with the latest revision
497+
continue
498+
}
499+
500+
// Check if any of the old MachineSets still have replicas
501+
replicas, found, err := unstructured.NestedInt64(ms.UnstructuredContent(), "spec", "replicas")
502+
if err != nil {
503+
return false, errors.Wrapf(err, "failed to find spec replicas on MachineSet %s", klog.KObj(ms))
504+
}
505+
if found && replicas > 0 {
506+
// Found old MachineSets that still has replicas => MD is still rolling out.
507+
return true, nil
508+
}
509+
replicas, found, err = unstructured.NestedInt64(ms.UnstructuredContent(), "status", "replicas")
510+
if err != nil {
511+
return false, errors.Wrapf(err, "failed to find status replicas on MachineSet %s", klog.KObj(ms))
512+
}
513+
if found && replicas > 0 {
514+
// Found old MachineSets that still has replicas => MD is still rolling out.
515+
return true, nil
516+
}
517+
}
518+
519+
// Didn't find any old MachineSets that still have replicas => MD is not rolling out.
520+
return false, nil
521+
}
522+
461523
func newNodeGroupFromScalableResource(controller *machineController, unstructuredScalableResource *unstructured.Unstructured) (*nodegroup, error) {
462524
// Ensure that the resulting node group would be allowed based on the autodiscovery specs if defined
463525
if !controller.allowedByAutoDiscoverySpecs(unstructuredScalableResource) {

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_provider.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"path"
2222
"reflect"
2323

24+
"github.com/pkg/errors"
2425
corev1 "k8s.io/api/core/v1"
2526
"k8s.io/apimachinery/pkg/api/resource"
2627
"k8s.io/client-go/discovery"
@@ -34,7 +35,7 @@ import (
3435

3536
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
3637
"k8s.io/autoscaler/cluster-autoscaler/config"
37-
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
38+
caserrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
3839
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
3940
)
4041

@@ -92,7 +93,7 @@ func (p *provider) HasInstance(node *corev1.Node) (bool, error) {
9293
return false, fmt.Errorf("machine not found for node %s: %v", node.Name, err)
9394
}
9495

95-
func (*provider) Pricing() (cloudprovider.PricingModel, errors.AutoscalerError) {
96+
func (*provider) Pricing() (cloudprovider.PricingModel, caserrors.AutoscalerError) {
9697
return nil, cloudprovider.ErrNotImplemented
9798
}
9899

@@ -142,7 +143,20 @@ func (p *provider) GetNodeGpuConfig(node *corev1.Node) *cloudprovider.GpuConfig
142143

143144
// IsNodeCandidateForDeletion returns whether the node is a good candidate for deletion.
144145
func (p *provider) IsNodeCandidateForDeletion(node *corev1.Node) (bool, error) {
145-
return true, cloudprovider.ErrNotImplemented
146+
ng, err := p.controller.nodeGroupForNode(node)
147+
if err != nil {
148+
return false, errors.Wrapf(err, "failed to determine node group for node %s", klog.KObj(node))
149+
}
150+
if ng == nil {
151+
klog.V(5).Infof("node %s is not part of a node group", klog.KObj(node))
152+
return false, nil
153+
}
154+
rollingout, err := ng.IsMachineDeploymentAndRollingOut()
155+
if err != nil {
156+
return false, errors.Wrapf(err, "failed to determine rolling out status for MachineDeployment %s", ng.scalableResource.ID())
157+
}
158+
// A node is a good candidate for deletion if it is not currently part of a MachineDeployment that is rolling out.
159+
return !rollingout, nil
146160
}
147161

148162
func newProvider(

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,16 @@ import (
3232
)
3333

3434
const (
35-
cpuKey = "capacity.cluster-autoscaler.kubernetes.io/cpu"
36-
memoryKey = "capacity.cluster-autoscaler.kubernetes.io/memory"
37-
diskCapacityKey = "capacity.cluster-autoscaler.kubernetes.io/ephemeral-disk"
38-
gpuTypeKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-type"
39-
gpuCountKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-count"
40-
maxPodsKey = "capacity.cluster-autoscaler.kubernetes.io/maxPods"
41-
taintsKey = "capacity.cluster-autoscaler.kubernetes.io/taints"
42-
labelsKey = "capacity.cluster-autoscaler.kubernetes.io/labels"
43-
draDriverKey = "capacity.cluster-autoscaler.kubernetes.io/dra-driver"
35+
cpuKey = "capacity.cluster-autoscaler.kubernetes.io/cpu"
36+
memoryKey = "capacity.cluster-autoscaler.kubernetes.io/memory"
37+
diskCapacityKey = "capacity.cluster-autoscaler.kubernetes.io/ephemeral-disk"
38+
gpuTypeKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-type"
39+
gpuCountKey = "capacity.cluster-autoscaler.kubernetes.io/gpu-count"
40+
maxPodsKey = "capacity.cluster-autoscaler.kubernetes.io/maxPods"
41+
taintsKey = "capacity.cluster-autoscaler.kubernetes.io/taints"
42+
labelsKey = "capacity.cluster-autoscaler.kubernetes.io/labels"
43+
draDriverKey = "capacity.cluster-autoscaler.kubernetes.io/dra-driver"
44+
machineDeploymentRevisionAnnotation = "machinedeployment.clusters.x-k8s.io/revision"
4445
// UnknownArch is used if the Architecture is Unknown
4546
UnknownArch SystemArchitecture = ""
4647
// Amd64 is used if the Architecture is x86_64

0 commit comments

Comments
 (0)