Skip to content

Commit 7aa84f7

Browse files
committed
feat: add node auto repair configuration for EKS managed node groups
1 parent 483f3a9 commit 7aa84f7

File tree

7 files changed

+201
-0
lines changed

7 files changed

+201
-0
lines changed

config/crd/bases/infrastructure.cluster.x-k8s.io_awsmanagedmachinepools.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,17 @@ spec:
969969
- name
970970
type: object
971971
type: array
972+
nodeRepairConfig:
973+
description: NodeRepairConfig specifies the node auto repair configuration
974+
for the managed node group.
975+
properties:
976+
enabled:
977+
default: false
978+
description: |-
979+
Enabled specifies whether node auto repair is enabled for the node group.
980+
When enabled, EKS will automatically repair unhealthy nodes by replacing them.
981+
type: boolean
982+
type: object
972983
providerIDList:
973984
description: |-
974985
ProviderIDList are the provider IDs of instances in the

exp/api/v1beta1/zz_generated.conversion.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exp/api/v1beta2/awsmanagedmachinepool_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,10 @@ type AWSManagedMachinePoolSpec struct {
214214
// AWSLifecycleHooks specifies lifecycle hooks for the managed node group.
215215
// +optional
216216
AWSLifecycleHooks []AWSLifecycleHook `json:"lifecycleHooks,omitempty"`
217+
218+
// NodeRepairConfig specifies the node auto repair configuration for the managed node group.
219+
// +optional
220+
NodeRepairConfig *NodeRepairConfig `json:"nodeRepairConfig,omitempty"`
217221
}
218222

219223
// ManagedMachinePoolScaling specifies scaling options.
@@ -297,6 +301,15 @@ type AWSManagedMachinePoolStatus struct {
297301
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
298302
}
299303

304+
// NodeRepairConfig defines the node auto repair configuration for managed node groups.
//
// Enabled is a pointer, so an unset (nil) value is distinct from an explicit false.
// NOTE(review): a nil *NodeRepairConfig and &NodeRepairConfig{Enabled: false} are
// therefore not equal under a structural comparison; confirm that callers which
// diff the spec against observed state treat both as "disabled".
305+
type NodeRepairConfig struct {
306+
// Enabled specifies whether node auto repair is enabled for the node group.
307+
// When enabled, EKS will automatically repair unhealthy nodes by replacing them.
308+
// +optional
309+
// +kubebuilder:default=false
310+
Enabled *bool `json:"enabled,omitempty"`
311+
}
312+
300313
// +kubebuilder:object:root=true
301314
// +kubebuilder:resource:path=awsmanagedmachinepools,scope=Namespaced,categories=cluster-api,shortName=awsmmp
302315
// +kubebuilder:storageversion

exp/api/v1beta2/zz_generated.deepcopy.go

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cloud/converters/eks.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,31 @@ func NodegroupUpdateconfigFromSDK(ngUpdateConfig *ekstypes.NodegroupUpdateConfig
217217
return converted
218218
}
219219

220+
// NodegroupRepairConfigToSDK is used to convert a CAPA NodeRepairConfig to AWS SDK NodeRepairConfig.
221+
func NodegroupRepairConfigToSDK(repairConfig *expinfrav1.NodeRepairConfig) *ekstypes.NodeRepairConfig {
222+
if repairConfig == nil {
223+
// Default to disabled if not specified to avoid behavior changes
224+
return &ekstypes.NodeRepairConfig{
225+
Enabled: aws.Bool(false),
226+
}
227+
}
228+
229+
return &ekstypes.NodeRepairConfig{
230+
Enabled: repairConfig.Enabled,
231+
}
232+
}
233+
234+
// NodegroupRepairConfigFromSDK is used to convert a AWS SDK NodeRepairConfig to a CAPA NodeRepairConfig.
235+
func NodegroupRepairConfigFromSDK(ngRepairConfig *ekstypes.NodeRepairConfig) *expinfrav1.NodeRepairConfig {
236+
if ngRepairConfig == nil {
237+
return nil
238+
}
239+
240+
return &expinfrav1.NodeRepairConfig{
241+
Enabled: ngRepairConfig.Enabled,
242+
}
243+
}
244+
220245
// AMITypeToSDK converts a CAPA ManagedMachineAMIType to AWS SDK AMIType.
221246
func AMITypeToSDK(amiType expinfrav1.ManagedMachineAMIType) ekstypes.AMITypes {
222247
switch amiType {

pkg/cloud/converters/eks_test.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package converters
18+
19+
import (
20+
"testing"
21+
22+
"github.com/aws/aws-sdk-go-v2/aws"
23+
ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"
24+
"github.com/google/go-cmp/cmp"
25+
26+
expinfrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2"
27+
)
28+
29+
func TestNodegroupRepairConfigToSDK(t *testing.T) {
30+
tests := []struct {
31+
name string
32+
input *expinfrav1.NodeRepairConfig
33+
expected *ekstypes.NodeRepairConfig
34+
}{
35+
{
36+
name: "nil input returns default disabled",
37+
input: nil,
38+
expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
39+
},
40+
{
41+
name: "enabled repair config",
42+
input: &expinfrav1.NodeRepairConfig{
43+
Enabled: aws.Bool(true),
44+
},
45+
expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(true)},
46+
},
47+
{
48+
name: "disabled repair config",
49+
input: &expinfrav1.NodeRepairConfig{
50+
Enabled: aws.Bool(false),
51+
},
52+
expected: &ekstypes.NodeRepairConfig{Enabled: aws.Bool(false)},
53+
},
54+
}
55+
56+
for _, tt := range tests {
57+
t.Run(tt.name, func(t *testing.T) {
58+
result := NodegroupRepairConfigToSDK(tt.input)
59+
if result == nil && tt.expected == nil {
60+
return
61+
}
62+
if result == nil || tt.expected == nil {
63+
t.Errorf("NodegroupRepairConfigToSDK() = %v, want %v", result, tt.expected)
64+
return
65+
}
66+
if *result.Enabled != *tt.expected.Enabled {
67+
t.Errorf("NodegroupRepairConfigToSDK().Enabled = %v, want %v", *result.Enabled, *tt.expected.Enabled)
68+
}
69+
})
70+
}
71+
}
72+
73+
func TestNodegroupRepairConfigFromSDK(t *testing.T) {
74+
tests := []struct {
75+
name string
76+
input *ekstypes.NodeRepairConfig
77+
expected *expinfrav1.NodeRepairConfig
78+
}{
79+
{
80+
name: "nil input returns nil",
81+
input: nil,
82+
expected: nil,
83+
},
84+
{
85+
name: "enabled repair config",
86+
input: &ekstypes.NodeRepairConfig{
87+
Enabled: aws.Bool(true),
88+
},
89+
expected: &expinfrav1.NodeRepairConfig{Enabled: aws.Bool(true)},
90+
},
91+
{
92+
name: "disabled repair config",
93+
input: &ekstypes.NodeRepairConfig{
94+
Enabled: aws.Bool(false),
95+
},
96+
expected: &expinfrav1.NodeRepairConfig{Enabled: aws.Bool(false)},
97+
},
98+
}
99+
100+
for _, tt := range tests {
101+
t.Run(tt.name, func(t *testing.T) {
102+
result := NodegroupRepairConfigFromSDK(tt.input)
103+
if !cmp.Equal(result, tt.expected) {
104+
t.Errorf("NodegroupRepairConfigFromSDK() = %v, want %v", result, tt.expected)
105+
}
106+
})
107+
}
108+
}

pkg/cloud/services/eks/nodegroup.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ func (s *NodegroupService) updateConfig() (*ekstypes.NodegroupUpdateConfig, erro
119119
return converters.NodegroupUpdateconfigToSDK(updateConfig)
120120
}
121121

122+
func (s *NodegroupService) nodeRepairConfig() *ekstypes.NodeRepairConfig {
123+
repairConfig := s.scope.ManagedMachinePool.Spec.NodeRepairConfig
124+
return converters.NodegroupRepairConfigToSDK(repairConfig)
125+
}
126+
122127
func (s *NodegroupService) roleArn(ctx context.Context) (*string, error) {
123128
var role *iamtypes.Role
124129
if s.scope.RoleName() != "" {
@@ -249,6 +254,9 @@ func (s *NodegroupService) createNodegroup(ctx context.Context) (*ekstypes.Nodeg
249254
Version: s.scope.ManagedMachinePool.Status.LaunchTemplateVersion,
250255
}
251256
}
257+
if managedPool.NodeRepairConfig != nil {
258+
input.NodeRepairConfig = s.nodeRepairConfig()
259+
}
252260

253261
out, err := s.EKSClient.CreateNodegroup(ctx, input)
254262
if err != nil {
@@ -480,6 +488,16 @@ func (s *NodegroupService) reconcileNodegroupConfig(ctx context.Context, ng *eks
480488
input.UpdateConfig = updatedConfig
481489
needsUpdate = true
482490
}
491+
492+
// Check if node repair configuration needs to be updated
493+
currentRepairConfig := converters.NodegroupRepairConfigFromSDK(ng.NodeRepairConfig)
494+
specRepairConfig := s.scope.ManagedMachinePool.Spec.NodeRepairConfig
495+
if !cmp.Equal(currentRepairConfig, specRepairConfig) {
496+
s.Debug("Nodegroup repair configuration differs from spec, updating the nodegroup repair config", "nodegroup", ng.NodegroupName)
497+
input.NodeRepairConfig = s.nodeRepairConfig()
498+
needsUpdate = true
499+
}
500+
483501
if !needsUpdate {
484502
s.Debug("node group config update not needed", "cluster", eksClusterName, "name", *ng.NodegroupName)
485503
return nil

0 commit comments

Comments
 (0)