Skip to content

✨ Add node auto repair configuration for EKS managed node groups #5604

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -969,6 +969,17 @@ spec:
- name
type: object
type: array
nodeRepairConfig:
description: NodeRepairConfig specifies the node auto repair configuration
for the managed node group.
properties:
enabled:
default: false
description: |-
Enabled specifies whether node auto repair is enabled for the node group.
When enabled, EKS will automatically repair unhealthy nodes by replacing them.
type: boolean
type: object
providerIDList:
description: |-
ProviderIDList are the provider IDs of instances in the
Expand Down
1 change: 1 addition & 0 deletions exp/api/v1beta1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions exp/api/v1beta2/awsmanagedmachinepool_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ type AWSManagedMachinePoolSpec struct {
// AWSLifecycleHooks specifies lifecycle hooks for the managed node group.
// +optional
AWSLifecycleHooks []AWSLifecycleHook `json:"lifecycleHooks,omitempty"`

// NodeRepairConfig specifies the node auto repair configuration for the managed node group.
// +optional
NodeRepairConfig *NodeRepairConfig `json:"nodeRepairConfig,omitempty"`
}

// ManagedMachinePoolScaling specifies scaling options.
Expand Down Expand Up @@ -297,6 +301,15 @@ type AWSManagedMachinePoolStatus struct {
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
}

// NodeRepairConfig defines the node auto repair configuration for managed node groups.
// It is referenced from the managed machine pool spec through an optional pointer
// field, so the whole configuration block may be absent.
type NodeRepairConfig struct {
	// Enabled specifies whether node auto repair is enabled for the node group.
	// When enabled, EKS will automatically repair unhealthy nodes by replacing them.
	// +optional
	// +kubebuilder:default=false
	Enabled *bool `json:"enabled,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:resource:path=awsmanagedmachinepools,scope=Namespaced,categories=cluster-api,shortName=awsmmp
// +kubebuilder:storageversion
Expand Down
25 changes: 25 additions & 0 deletions exp/api/v1beta2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions pkg/cloud/converters/eks.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,20 @@ func NodegroupUpdateconfigFromSDK(ngUpdateConfig *ekstypes.NodegroupUpdateConfig
return converted
}

// NodegroupRepairConfigToSDK converts a CAPA NodeRepairConfig into the AWS SDK
// NodeRepairConfig type.
//
// The returned value is never nil and its Enabled field is always populated:
// a nil repairConfig, or a repairConfig whose Enabled field is nil, is reported
// as disabled so that leaving the setting unspecified never changes behavior.
// The Enabled value is copied, so the result does not alias the pointer held by
// the caller's spec.
func NodegroupRepairConfigToSDK(repairConfig *expinfrav1.NodeRepairConfig) *ekstypes.NodeRepairConfig {
	// Treat both a missing config and a missing Enabled flag as "disabled".
	enabled := false
	if repairConfig != nil && repairConfig.Enabled != nil {
		enabled = *repairConfig.Enabled
	}

	return &ekstypes.NodeRepairConfig{
		Enabled: aws.Bool(enabled),
	}
}

// AMITypeToSDK converts a CAPA ManagedMachineAMIType to AWS SDK AMIType.
func AMITypeToSDK(amiType expinfrav1.ManagedMachineAMIType) ekstypes.AMITypes {
switch amiType {
Expand Down
70 changes: 70 additions & 0 deletions pkg/cloud/converters/eks_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package converters

import (
"testing"

"github.com/aws/aws-sdk-go-v2/aws"
ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"

expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2"
)

// TestNodegroupRepairConfigToSDK checks the conversion of a CAPA NodeRepairConfig
// to the AWS SDK type for a nil input, an explicitly enabled config, and an
// explicitly disabled config.
func TestNodegroupRepairConfigToSDK(t *testing.T) {
	tests := []struct {
		name     string
		input    *expinfrav1.NodeRepairConfig
		expected *ekstypes.NodeRepairConfig
	}{
		{
			name:     "nil input returns default disabled",
			input:    nil,
			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
		},
		{
			name: "enabled repair config",
			input: &expinfrav1.NodeRepairConfig{
				Enabled: aws.Bool(true),
			},
			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(true)},
		},
		{
			name: "disabled repair config",
			input: &expinfrav1.NodeRepairConfig{
				Enabled: aws.Bool(false),
			},
			expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := NodegroupRepairConfigToSDK(tt.input)
			if result == nil && tt.expected == nil {
				return
			}
			if result == nil || tt.expected == nil {
				t.Fatalf("NodegroupRepairConfigToSDK() = %v, want %v", result, tt.expected)
			}

			// Compare the Enabled pointers nil-safely: dereferencing a nil
			// pointer here would panic and abort the whole test binary
			// instead of reporting a normal test failure.
			got, want := result.Enabled, tt.expected.Enabled
			switch {
			case got == nil && want == nil:
				return
			case got == nil || want == nil:
				t.Fatalf("NodegroupRepairConfigToSDK().Enabled = %v, want %v", got, want)
			case *got != *want:
				t.Errorf("NodegroupRepairConfigToSDK().Enabled = %v, want %v", *got, *want)
			}
		})
	}
}
16 changes: 16 additions & 0 deletions pkg/cloud/services/eks/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ func (s *NodegroupService) updateConfig() (*ekstypes.NodegroupUpdateConfig, erro
return converters.NodegroupUpdateconfigToSDK(updateConfig)
}

// nodeRepairConfig returns the AWS SDK node repair configuration derived from
// the NodeRepairConfig in the managed machine pool spec. The conversion itself
// is delegated to converters.NodegroupRepairConfigToSDK.
func (s *NodegroupService) nodeRepairConfig() *ekstypes.NodeRepairConfig {
	return converters.NodegroupRepairConfigToSDK(s.scope.ManagedMachinePool.Spec.NodeRepairConfig)
}

func (s *NodegroupService) roleArn(ctx context.Context) (*string, error) {
var role *iamtypes.Role
if s.scope.RoleName() != "" {
Expand Down Expand Up @@ -249,6 +254,9 @@ func (s *NodegroupService) createNodegroup(ctx context.Context) (*ekstypes.Nodeg
Version: s.scope.ManagedMachinePool.Status.LaunchTemplateVersion,
}
}
if managedPool.NodeRepairConfig != nil {
input.NodeRepairConfig = s.nodeRepairConfig()
}

out, err := s.EKSClient.CreateNodegroup(ctx, input)
if err != nil {
Expand Down Expand Up @@ -480,6 +488,14 @@ func (s *NodegroupService) reconcileNodegroupConfig(ctx context.Context, ng *eks
input.UpdateConfig = updatedConfig
needsUpdate = true
}

specRepairConfig := s.nodeRepairConfig()
if !cmp.Equal(ng.NodeRepairConfig, specRepairConfig) {
s.Debug("Nodegroup repair configuration differs from spec, updating the nodegroup repair config", "nodegroup", ng.NodegroupName)
input.NodeRepairConfig = specRepairConfig
needsUpdate = true
}

if !needsUpdate {
s.Debug("node group config update not needed", "cluster", eksClusterName, "name", *ng.NodegroupName)
return nil
Expand Down