Skip to content

Commit c29aa17

Browse files
committed
feat: add node auto repair configuration for EKS managed node groups
1 parent 483f3a9 commit c29aa17

File tree

7 files changed

+152
-0
lines changed

7 files changed

+152
-0
lines changed

config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,17 @@ spec:
969969
- name
970970
type: object
971971
type: array
972+
nodeRepairConfig:
973+
description: NodeRepairConfig specifies the node auto repair configuration
974+
for the managed node group.
975+
properties:
976+
enabled:
977+
default: false
978+
description: |-
979+
Enabled specifies whether node auto repair is enabled for the node group.
980+
When enabled, EKS will automatically repair unhealthy nodes by replacing them.
981+
type: boolean
982+
type: object
972983
providerIDList:
973984
description: |-
974985
ProviderIDList are the provider IDs of instances in the

exp/api/v1beta1/zz_generated.conversion.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exp/api/v1beta2/awsmanagedmachinepool_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,10 @@ type AWSManagedMachinePoolSpec struct {
214214
// AWSLifecycleHooks specifies lifecycle hooks for the managed node group.
215215
// +optional
216216
AWSLifecycleHooks []AWSLifecycleHook `json:"lifecycleHooks,omitempty"`
217+
218+
// NodeRepairConfig specifies the node auto repair configuration for the managed node group.
219+
// +optional
220+
NodeRepairConfig *NodeRepairConfig `json:"nodeRepairConfig,omitempty"`
217221
}
218222

219223
// ManagedMachinePoolScaling specifies scaling options.
@@ -297,6 +301,15 @@ type AWSManagedMachinePoolStatus struct {
297301
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
298302
}
299303

304+
// NodeRepairConfig defines the node auto repair configuration for managed node groups.
// It currently carries a single setting, Enabled, which is a pointer serialized
// with omitempty and defaulted to false by the kubebuilder marker below.
305+
type NodeRepairConfig struct {
306+
// Enabled specifies whether node auto repair is enabled for the node group.
307+
// When enabled, EKS will automatically repair unhealthy nodes by replacing them.
308+
// +optional
309+
// +kubebuilder:default=false
310+
Enabled *bool `json:"enabled,omitempty"`
311+
}
312+
300313
// +kubebuilder:object:root=true
301314
// +kubebuilder:resource:path=awsmanagedmachinepools,scope=Namespaced,categories=cluster-api,shortName=awsmmp
302315
// +kubebuilder:storageversion

exp/api/v1beta2/zz_generated.deepcopy.go

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cloud/converters/eks.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,20 @@ func NodegroupUpdateconfigFromSDK(ngUpdateConfig *ekstypes.NodegroupUpdateConfig
217217
return converted
218218
}
219219

220+
// NodegroupRepairConfigToSDK is used to convert a CAPA NodeRepairConfig to AWS SDK NodeRepairConfig.
221+
func NodegroupRepairConfigToSDK(repairConfig *expinfrav1.NodeRepairConfig) *ekstypes.NodeRepairConfig {
222+
if repairConfig == nil {
223+
// Default to disabled if not specified to avoid behavior changes
224+
return &ekstypes.NodeRepairConfig{
225+
Enabled: aws.Bool(false),
226+
}
227+
}
228+
229+
return &ekstypes.NodeRepairConfig{
230+
Enabled: repairConfig.Enabled,
231+
}
232+
}
233+
220234
// AMITypeToSDK converts a CAPA ManagedMachineAMIType to AWS SDK AMIType.
221235
func AMITypeToSDK(amiType expinfrav1.ManagedMachineAMIType) ekstypes.AMITypes {
222236
switch amiType {

pkg/cloud/converters/eks_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package converters
18+
19+
import (
20+
"testing"
21+
22+
"github.com/aws/aws-sdk-go-v2/aws"
23+
ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"
24+
25+
expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2"
26+
)
27+
28+
func TestNodegroupRepairConfigToSDK(t *testing.T) {
29+
tests := []struct {
30+
name string
31+
input *expinfrav1.NodeRepairConfig
32+
expected *ekstypes.NodeRepairConfig
33+
}{
34+
{
35+
name: "nil input returns default disabled",
36+
input: nil,
37+
expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
38+
},
39+
{
40+
name: "enabled repair config",
41+
input: &expinfrav1.NodeRepairConfig{
42+
Enabled: aws.Bool(true),
43+
},
44+
expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(true)},
45+
},
46+
{
47+
name: "disabled repair config",
48+
input: &expinfrav1.NodeRepairConfig{
49+
Enabled: aws.Bool(false),
50+
},
51+
expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
52+
},
53+
}
54+
55+
for _, tt := range tests {
56+
t.Run(tt.name, func(t *testing.T) {
57+
result := NodegroupRepairConfigToSDK(tt.input)
58+
if result == nil && tt.expected == nil {
59+
return
60+
}
61+
if result == nil || tt.expected == nil {
62+
t.Errorf("NodegroupRepairConfigToSDK() = %v, want %v", result, tt.expected)
63+
return
64+
}
65+
if *result.Enabled != *tt.expected.Enabled {
66+
t.Errorf("NodegroupRepairConfigToSDK().Enabled = %v, want %v", *result.Enabled, *tt.expected.Enabled)
67+
}
68+
})
69+
}
70+
}

pkg/cloud/services/eks/nodegroup.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ func (s *NodegroupService) updateConfig() (*ekstypes.NodegroupUpdateConfig, erro
119119
return converters.NodegroupUpdateconfigToSDK(updateConfig)
120120
}
121121

122+
func (s *NodegroupService) nodeRepairConfig() *ekstypes.NodeRepairConfig {
123+
repairConfig := s.scope.ManagedMachinePool.Spec.NodeRepairConfig
124+
return converters.NodegroupRepairConfigToSDK(repairConfig)
125+
}
126+
122127
func (s *NodegroupService) roleArn(ctx context.Context) (*string, error) {
123128
var role *iamtypes.Role
124129
if s.scope.RoleName() != "" {
@@ -249,6 +254,9 @@ func (s *NodegroupService) createNodegroup(ctx context.Context) (*ekstypes.Nodeg
249254
Version: s.scope.ManagedMachinePool.Status.LaunchTemplateVersion,
250255
}
251256
}
257+
if managedPool.NodeRepairConfig != nil {
258+
input.NodeRepairConfig = s.nodeRepairConfig()
259+
}
252260

253261
out, err := s.EKSClient.CreateNodegroup(ctx, input)
254262
if err != nil {
@@ -480,6 +488,16 @@ func (s *NodegroupService) reconcileNodegroupConfig(ctx context.Context, ng *eks
480488
input.UpdateConfig = updatedConfig
481489
needsUpdate = true
482490
}
491+
492+
// Check if node repair configuration needs to be updated
493+
currentRepairConfig := ng.NodeRepairConfig
494+
specRepairConfig := s.nodeRepairConfig()
495+
if !cmp.Equal(currentRepairConfig, specRepairConfig) {
496+
s.Debug("Nodegroup repair configuration differs from spec, updating the nodegroup repair config", "nodegroup", ng.NodegroupName)
497+
input.NodeRepairConfig = specRepairConfig
498+
needsUpdate = true
499+
}
500+
483501
if !needsUpdate {
484502
s.Debug("node group config update not needed", "cluster", eksClusterName, "name", *ng.NodegroupName)
485503
return nil

0 commit comments

Comments
 (0)