diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml index f2e4991888..595ba9b3f1 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml @@ -969,6 +969,17 @@ spec: - name type: object type: array + nodeRepairConfig: + description: NodeRepairConfig specifies the node auto repair configuration + for the managed node group. + properties: + enabled: + default: false + description: |- + Enabled specifies whether node auto repair is enabled for the node group. + When enabled, EKS will automatically repair unhealthy nodes by replacing them. + type: boolean + type: object providerIDList: description: |- ProviderIDList are the provider IDs of instances in the diff --git a/exp/api/v1beta1/zz_generated.conversion.go b/exp/api/v1beta1/zz_generated.conversion.go index b974ea9dc0..f1eb07ad61 100644 --- a/exp/api/v1beta1/zz_generated.conversion.go +++ b/exp/api/v1beta1/zz_generated.conversion.go @@ -759,6 +759,7 @@ func autoConvert_v1beta2_AWSManagedMachinePoolSpec_To_v1beta1_AWSManagedMachineP out.AWSLaunchTemplate = nil } // WARNING: in.AWSLifecycleHooks requires manual conversion: does not exist in peer-type + // WARNING: in.NodeRepairConfig requires manual conversion: does not exist in peer-type return nil } diff --git a/exp/api/v1beta2/awsmanagedmachinepool_types.go b/exp/api/v1beta2/awsmanagedmachinepool_types.go index 0aeb7be0dc..28bff362f6 100644 --- a/exp/api/v1beta2/awsmanagedmachinepool_types.go +++ b/exp/api/v1beta2/awsmanagedmachinepool_types.go @@ -214,6 +214,10 @@ type AWSManagedMachinePoolSpec struct { // AWSLifecycleHooks specifies lifecycle hooks for the managed node group. 
// +optional AWSLifecycleHooks []AWSLifecycleHook `json:"lifecycleHooks,omitempty"` + + // NodeRepairConfig specifies the node auto repair configuration for the managed node group. + // +optional + NodeRepairConfig *NodeRepairConfig `json:"nodeRepairConfig,omitempty"` } // ManagedMachinePoolScaling specifies scaling options. @@ -297,6 +301,15 @@ type AWSManagedMachinePoolStatus struct { Conditions clusterv1.Conditions `json:"conditions,omitempty"` } +// NodeRepairConfig defines the node auto repair configuration for managed node groups. +type NodeRepairConfig struct { + // Enabled specifies whether node auto repair is enabled for the node group. + // When enabled, EKS will automatically repair unhealthy nodes by replacing them. + // +optional + // +kubebuilder:default=false + Enabled *bool `json:"enabled,omitempty"` +} + // +kubebuilder:object:root=true // +kubebuilder:resource:path=awsmanagedmachinepools,scope=Namespaced,categories=cluster-api,shortName=awsmmp // +kubebuilder:storageversion diff --git a/exp/api/v1beta2/zz_generated.deepcopy.go b/exp/api/v1beta2/zz_generated.deepcopy.go index 6885eb4c64..9133e0ba12 100644 --- a/exp/api/v1beta2/zz_generated.deepcopy.go +++ b/exp/api/v1beta2/zz_generated.deepcopy.go @@ -567,6 +567,11 @@ func (in *AWSManagedMachinePoolSpec) DeepCopyInto(out *AWSManagedMachinePoolSpec (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.NodeRepairConfig != nil { + in, out := &in.NodeRepairConfig, &out.NodeRepairConfig + *out = new(NodeRepairConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSManagedMachinePoolSpec. @@ -891,6 +896,26 @@ func (in *MixedInstancesPolicy) DeepCopy() *MixedInstancesPolicy { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *NodeRepairConfig) DeepCopyInto(out *NodeRepairConfig) {
+	*out = *in
+	if in.Enabled != nil {
+		in, out := &in.Enabled, &out.Enabled
+		*out = new(bool)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeRepairConfig.
+func (in *NodeRepairConfig) DeepCopy() *NodeRepairConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(NodeRepairConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Overrides) DeepCopyInto(out *Overrides) {
 	*out = *in
diff --git a/pkg/cloud/converters/eks.go b/pkg/cloud/converters/eks.go
index fbb35c67c3..d1835b476f 100644
--- a/pkg/cloud/converters/eks.go
+++ b/pkg/cloud/converters/eks.go
@@ -217,6 +217,21 @@ func NodegroupUpdateconfigFromSDK(ngUpdateConfig *ekstypes.NodegroupUpdateConfig
 	return converted
 }
 
+// NodegroupRepairConfigToSDK converts a CAPA NodeRepairConfig to the AWS SDK NodeRepairConfig.
+//
+// The returned config is never nil and always carries an explicit Enabled
+// value: a nil repairConfig, or one whose Enabled field is unset, both map to
+// Enabled=false so that omitting the setting never turns node auto repair on.
+func NodegroupRepairConfigToSDK(repairConfig *expinfrav1.NodeRepairConfig) *ekstypes.NodeRepairConfig {
+	// aws.ToBool maps a nil pointer to false; wrapping the result in a fresh
+	// pointer also keeps the SDK input from aliasing the spec's field.
+	enabled := repairConfig != nil && aws.ToBool(repairConfig.Enabled)
+
+	return &ekstypes.NodeRepairConfig{
+		Enabled: aws.Bool(enabled),
+	}
+}
+
 // AMITypeToSDK converts a CAPA ManagedMachineAMIType to AWS SDK AMIType.
 func AMITypeToSDK(amiType expinfrav1.ManagedMachineAMIType) ekstypes.AMITypes {
 	switch amiType {
diff --git a/pkg/cloud/converters/eks_test.go b/pkg/cloud/converters/eks_test.go
new file mode 100644
index 0000000000..5a3c5dac73
--- /dev/null
+++ b/pkg/cloud/converters/eks_test.go
@@ -0,0 +1,72 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package converters
+
+import (
+	"testing"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"
+
+	expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2"
+)
+
+func TestNodegroupRepairConfigToSDK(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    *expinfrav1.NodeRepairConfig
+		expected *ekstypes.NodeRepairConfig
+	}{
+		{
+			name:     "nil input returns default disabled",
+			input:    nil,
+			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
+		},
+		{
+			name:     "nil enabled returns default disabled",
+			input:    &expinfrav1.NodeRepairConfig{},
+			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
+		},
+		{
+			name:  "enabled repair config",
+			input: &expinfrav1.NodeRepairConfig{
+				Enabled: aws.Bool(true),
+			},
+			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(true)},
+		},
+		{
+			name:  "disabled repair config",
+			input: &expinfrav1.NodeRepairConfig{
+				Enabled: aws.Bool(false),
+			},
+			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := NodegroupRepairConfigToSDK(tt.input)
+			// The converter promises a non-nil config with an explicit Enabled value.
+			if result == nil || result.Enabled == nil {
+				t.Fatalf("NodegroupRepairConfigToSDK() = %+v, want explicit Enabled=%v", result, aws.ToBool(tt.expected.Enabled))
+			}
+			if got, want := *result.Enabled, aws.ToBool(tt.expected.Enabled); got != want {
+				t.Errorf("NodegroupRepairConfigToSDK().Enabled = %v, want %v", got, want)
+			}
+		})
+	}
+}
diff --git a/pkg/cloud/services/eks/nodegroup.go b/pkg/cloud/services/eks/nodegroup.go
index eb1430ffe6..42a8cf66ea 100644
--- a/pkg/cloud/services/eks/nodegroup.go
+++ b/pkg/cloud/services/eks/nodegroup.go
@@ -119,6 +119,11 @@ 
func (s *NodegroupService) updateConfig() (*ekstypes.NodegroupUpdateConfig, erro
 	return converters.NodegroupUpdateconfigToSDK(updateConfig)
 }
 
+func (s *NodegroupService) nodeRepairConfig() *ekstypes.NodeRepairConfig {
+	repairConfig := s.scope.ManagedMachinePool.Spec.NodeRepairConfig
+	return converters.NodegroupRepairConfigToSDK(repairConfig)
+}
+
 func (s *NodegroupService) roleArn(ctx context.Context) (*string, error) {
 	var role *iamtypes.Role
 	if s.scope.RoleName() != "" {
@@ -249,6 +254,9 @@ func (s *NodegroupService) createNodegroup(ctx context.Context) (*ekstypes.Nodeg
 			Version: s.scope.ManagedMachinePool.Status.LaunchTemplateVersion,
 		}
 	}
+	if managedPool.NodeRepairConfig != nil {
+		input.NodeRepairConfig = s.nodeRepairConfig()
+	}
 
 	out, err := s.EKSClient.CreateNodegroup(ctx, input)
 	if err != nil {
@@ -480,6 +488,20 @@ func (s *NodegroupService) reconcileNodegroupConfig(ctx context.Context, ng *eks
 		input.UpdateConfig = updatedConfig
 		needsUpdate = true
 	}
+
+	// Compare only the effective Enabled flag. A nil ng.NodeRepairConfig (or a
+	// nil Enabled) is treated as disabled, matching the disabled default that
+	// nodeRepairConfig() yields for an unset spec; a structural comparison would
+	// see nil and {Enabled:false} as different and request an update every time.
+	specRepairConfig := s.nodeRepairConfig()
+	currentRepairEnabled := ng.NodeRepairConfig != nil && ng.NodeRepairConfig.Enabled != nil && *ng.NodeRepairConfig.Enabled
+	desiredRepairEnabled := specRepairConfig != nil && specRepairConfig.Enabled != nil && *specRepairConfig.Enabled
+	if currentRepairEnabled != desiredRepairEnabled {
+		s.Debug("Nodegroup repair configuration differs from spec, updating the nodegroup repair config", "nodegroup", ng.NodegroupName)
+		input.NodeRepairConfig = specRepairConfig
+		needsUpdate = true
+	}
+
 	if !needsUpdate {
 		s.Debug("node group config update not needed", "cluster", eksClusterName, "name", *ng.NodegroupName)
 		return nil