diff --git a/examples/44-node-repair.yaml b/examples/44-node-repair.yaml index ddbe861a6f..7611dca24f 100644 --- a/examples/44-node-repair.yaml +++ b/examples/44-node-repair.yaml @@ -1,4 +1,5 @@ -# An example ClusterConfig that uses a managed node group with auto repair. +# An example ClusterConfig that demonstrates node repair configuration +# for EKS managed nodegroups with various configuration options. apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig @@ -8,6 +9,77 @@ metadata: region: us-west-2 managedNodeGroups: -- name: ng-1 - nodeRepairConfig: - enabled: true + # Example 1: Basic node repair + - name: basic-repair-ng + instanceType: m5.large + desiredCapacity: 3 + nodeRepairConfig: + enabled: true + + # Example 2: Node repair with percentage-based thresholds + - name: percentage-repair-ng + instanceType: m5.large + desiredCapacity: 3 + minSize: 1 + maxSize: 5 + nodeRepairConfig: + enabled: true + # Stop repair actions when 20% of nodes are unhealthy + maxUnhealthyNodeThresholdPercentage: 20 + # Repair at most 15% of unhealthy nodes in parallel + maxParallelNodesRepairedPercentage: 15 + + # Example 3: Node repair with count-based thresholds + - name: count-repair-ng + instanceType: m5.xlarge + desiredCapacity: 10 + minSize: 5 + maxSize: 20 + nodeRepairConfig: + enabled: true + # Stop repair actions when 3 nodes are unhealthy + maxUnhealthyNodeThresholdCount: 3 + # Repair at most 2 unhealthy nodes in parallel + maxParallelNodesRepairedCount: 2 + + # Example 4: GPU workload with custom repair overrides + - name: gpu-repair-ng + instanceType: g4dn.xlarge + desiredCapacity: 4 + minSize: 2 + maxSize: 8 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 25 + maxParallelNodesRepairedCount: 1 + # Custom repair behavior for specific failure scenarios + nodeRepairConfigOverrides: + # Handle GPU-related failures with immediate termination + - nodeMonitoringCondition: "AcceleratedInstanceNotReady" + nodeUnhealthyReason: "NvidiaXID13Error" + minRepairWaitTimeMins: 5 + repairAction: "Terminate" + # Handle network issues with restart first + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 15 + repairAction: "Restart" + + # Example 5: Conservative repair for critical workloads + - name: critical-repair-ng + instanceType: c5.2xlarge + desiredCapacity: 6 + minSize: 3 + maxSize: 12 + nodeRepairConfig: + enabled: true + # Conservative settings - stop repair when only 10% of nodes are unhealthy + maxUnhealthyNodeThresholdPercentage: 10 + # Repair only 1 node at a time + maxParallelNodesRepairedCount: 1 + nodeRepairConfigOverrides: + # Wait longer before taking action on critical workloads + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 45 + repairAction: "Restart" diff --git a/integration/tests/enhanced_node_repair/enhanced_node_repair_test.go b/integration/tests/enhanced_node_repair/enhanced_node_repair_test.go new file mode 100644 index 0000000000..64ffa4fee4 --- /dev/null +++ b/integration/tests/enhanced_node_repair/enhanced_node_repair_test.go @@ -0,0 +1,150 @@ +//go:build integration +// +build integration + +package enhanced_node_repair + +import ( + "fmt" + "os" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + . 
"github.com/weaveworks/eksctl/integration/runner" + "github.com/weaveworks/eksctl/integration/tests" + "github.com/weaveworks/eksctl/pkg/testutils" +) + +var params *tests.Params + +func init() { + // Call testing.Init() prior to tests.NewParams(), as otherwise -test.* will not be recognised. See also: https://golang.org/doc/go1.13#testing + testing.Init() + params = tests.NewParamsWithGivenClusterName("enhanced-node-repair", "test-enhanced-node-repair") +} + +func TestEnhancedNodeRepair(t *testing.T) { + testutils.RegisterAndRun(t) +} + +var _ = Describe("(Integration) Enhanced Node Repair Configuration", func() { + + Context("CloudFormation template generation", func() { + It("should generate correct CloudFormation template with CLI flags", func() { + By("testing CLI flags generate correct CloudFormation") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--name", "test-cli-template", + "--region", params.Region, + "--managed", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=25", + "--node-repair-max-parallel-count=2", + "--dry-run", + ) + Expect(cmd).To(RunSuccessfully()) + }) + + It("should generate correct CloudFormation template with YAML config", func() { + By("creating temporary config file") + configFile := fmt.Sprintf("/tmp/test-enhanced-node-repair-%d.yaml", time.Now().Unix()) + yamlConfig := fmt.Sprintf(` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: test-yaml-template + region: %s + +managedNodeGroups: +- name: enhanced-ng + instanceType: t3.medium + desiredCapacity: 2 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 20 + maxParallelNodesRepairedPercentage: 15 + nodeRepairConfigOverrides: + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 15 + repairAction: "Restart" +`, params.Region) + + err := os.WriteFile(configFile, []byte(yamlConfig), 0644) + Expect(err).NotTo(HaveOccurred()) + defer os.Remove(configFile) + + By("testing YAML config generates correct CloudFormation") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--config-file", configFile, + "--dry-run", + ) + Expect(cmd).To(RunSuccessfully()) + }) + + It("should validate backward compatibility with existing config", func() { + By("testing existing node repair config still works") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--name", "test-backward-compat", + "--region", params.Region, + "--managed", + "--enable-node-repair", + "--dry-run", + ) + Expect(cmd).To(RunSuccessfully()) + }) + }) + + Context("error handling", func() { + It("should handle invalid CLI flag combinations gracefully", func() { + By("testing with unmanaged nodegroup (should fail)") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--name", "test-error-handling", + "--region", params.Region, + "--managed=false", + "--enable-node-repair", + "--dry-run", + ) + Expect(cmd).NotTo(RunSuccessfully()) + Expect(cmd).To(RunWithError(ContainSubstring("only valid with managed nodegroups"))) + }) + + It("should handle invalid YAML configuration gracefully", func() { + By("creating config file with invalid node repair config") + configFile := fmt.Sprintf("/tmp/test-invalid-config-%d.yaml", time.Now().Unix()) + invalidConfig := fmt.Sprintf(` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: test-invalid + region: %s + +nodeGroups: +- name: unmanaged-ng + instanceType: t3.medium + nodeRepairConfig: + enabled: true +`, params.Region) + + err := os.WriteFile(configFile, 
[]byte(invalidConfig), 0644) + Expect(err).NotTo(HaveOccurred()) + defer os.Remove(configFile) + + By("testing invalid config is rejected") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--config-file", configFile, + "--dry-run", + ) + // This should fail because nodeRepairConfig is not supported for unmanaged nodegroups + Expect(cmd).NotTo(RunSuccessfully()) + }) + }) +}) \ No newline at end of file diff --git a/integration/tests/enhanced_node_repair_test.go b/integration/tests/enhanced_node_repair_test.go new file mode 100644 index 0000000000..c2297ce950 --- /dev/null +++ b/integration/tests/enhanced_node_repair_test.go @@ -0,0 +1,474 @@ +//go:build integration +// +build integration + +package integration_test + +import ( + "bytes" + "os" + "os/exec" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestEnhancedNodeRepairCLIFlags tests that the enhanced node repair CLI flags are properly parsed +func TestEnhancedNodeRepairCLIFlags(t *testing.T) { + tests := []struct { + name string + args []string + expected []string // Expected strings in the dry-run output + }{ + { + name: "basic node repair", + args: []string{ + "create", "cluster", + "--name", "test-basic-repair", + "--enable-node-repair", + "--dry-run", + }, + expected: []string{ + "NodeRepairConfig", + "Enabled\": true", + }, + }, + { + name: "node repair with percentage thresholds", + args: []string{ + "create", "cluster", + "--name", "test-percentage-repair", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=25", + "--node-repair-max-parallel-percentage=20", + "--dry-run", + }, + expected: []string{ + "NodeRepairConfig", + "Enabled\": true", + "MaxUnhealthyNodeThresholdPercentage\": 25", + "MaxParallelNodesRepairedPercentage\": 20", + }, + }, + { + name: "node repair with count thresholds", + args: []string{ + "create", "cluster", + "--name", "test-count-repair", + "--enable-node-repair", + "--node-repair-max-unhealthy-count=5", + "--node-repair-max-parallel-count=2", + "--dry-run", + }, + expected: []string{ + "NodeRepairConfig", + "Enabled\": true", + "MaxUnhealthyNodeThresholdCount\": 5", + "MaxParallelNodesRepairedCount\": 2", + }, + }, + { + name: "nodegroup with node repair flags", + args: []string{ + "create", "nodegroup", + "--cluster", "existing-cluster", + "--name", "test-ng-repair", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=30", + "--node-repair-max-parallel-count=1", + "--dry-run", + }, + expected: []string{ + "NodeRepairConfig", + "Enabled\": true", + "MaxUnhealthyNodeThresholdPercentage\": 30", + "MaxParallelNodesRepairedCount\": 1", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Run eksctl with the test arguments + cmd := exec.Command("./eksctl", tt.args...) 
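+			// Capture stdout and stderr separately: with --dry-run the command may
+			// still exit non-zero (no AWS credentials or cluster available), so the
+			// assertions below search the combined output of both streams for the
+			// rendered configuration.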
+ var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + // For dry-run, we expect it to fail due to missing AWS credentials or cluster + // but we should still get the configuration output + t.Logf("Command failed as expected (dry-run): %v", err) + t.Logf("Stderr: %s", stderr.String()) + } + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check that all expected strings are present in the output + for _, expected := range tt.expected { + assert.Contains(t, output, expected, "Expected string not found in output: %s", expected) + } + }) + } +} + +// TestEnhancedNodeRepairConfigFile tests that enhanced node repair configuration files are properly parsed +func TestEnhancedNodeRepairConfigFile(t *testing.T) { + tests := []struct { + name string + configFile string + expected []string + }{ + { + name: "basic config file", + configFile: "examples/44-node-repair.yaml", + expected: []string{ + "NodeRepairConfig", + "Enabled\": true", + }, + }, + { + name: "enhanced config file", + configFile: "examples/44-enhanced-node-repair.yaml", + expected: []string{ + "NodeRepairConfig", + "maxUnhealthyNodeThresholdPercentage", + "maxParallelNodesRepairedPercentage", + "nodeRepairConfigOverrides", + "AcceleratedInstanceNotReady", + "NvidiaXID13Error", + "NetworkNotReady", + "InterfaceNotUp", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Run eksctl with the config file + cmd := exec.Command("./eksctl", "create", "cluster", "--config-file", tt.configFile, "--dry-run") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + // For dry-run, we expect it to fail due to missing AWS credentials + // but we should still get the configuration output + t.Logf("Command failed as expected (dry-run): %v", err) + } + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check that all expected strings are present in the output + for _, expected := range tt.expected { + assert.Contains(t, output, expected, "Expected string not found in output: %s", expected) + } + }) + } +} + +// TestEnhancedNodeRepairCLIHelp tests that the CLI help includes the new flags +func TestEnhancedNodeRepairCLIHelp(t *testing.T) { + cmd := exec.Command("../../eksctl", "create", "cluster", "--help") + var stdout bytes.Buffer + cmd.Stdout = &stdout + + err := cmd.Run() + require.NoError(t, err, "Help command should not fail") + + output := stdout.String() + + // Check that all new flags are documented in help + expectedFlags := []string{ + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage", + "--node-repair-max-unhealthy-count", + "--node-repair-max-parallel-percentage", + "--node-repair-max-parallel-count", + } + + for _, flag := range expectedFlags { + assert.Contains(t, output, flag, "Flag not found in help output: %s", flag) + } + + // Check that flags have proper descriptions + assert.Contains(t, output, "managed nodegroups only", "Flags should indicate they're for managed nodegroups only") +} + +// TestEnhancedNodeRepairBackwardCompatibility tests that existing configurations still work +func TestEnhancedNodeRepairBackwardCompatibility(t *testing.T) { + // Test that the original example still works + cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", "../../examples/44-node-repair.yaml", "--dry-run") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + 
cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + // Expected to fail due to missing AWS credentials, but should parse config + t.Logf("Command failed as expected (dry-run): %v", err) + } + + output := stdout.String() + stderr.String() + + // Should not contain any parsing errors + assert.NotContains(t, strings.ToLower(output), "unknown field", "Should not have unknown field errors") + assert.NotContains(t, strings.ToLower(output), "invalid", "Should not have invalid field errors") + + // Should contain the basic node repair config + assert.Contains(t, output, "nodeRepairConfig", "Should contain nodeRepairConfig") +} + +// TestEnhancedNodeRepairSchemaValidation tests that the schema includes new fields +func TestEnhancedNodeRepairSchemaValidation(t *testing.T) { + cmd := exec.Command("../../eksctl", "utils", "schema") + var stdout bytes.Buffer + cmd.Stdout = &stdout + + err := cmd.Run() + require.NoError(t, err, "Schema command should not fail") + + output := stdout.String() + + // Check that the schema includes all new fields + expectedFields := []string{ + "NodeGroupNodeRepairConfig", + "maxUnhealthyNodeThresholdPercentage", + "maxUnhealthyNodeThresholdCount", + "maxParallelNodesRepairedPercentage", + "maxParallelNodesRepairedCount", + "nodeRepairConfigOverrides", + } + + for _, field := range expectedFields { + assert.Contains(t, output, field, "Schema should include field: %s", field) + } +} + + + +// TestEnhancedNodeRepairErrorHandling tests error handling for invalid configurations +func TestEnhancedNodeRepairErrorHandling(t *testing.T) { + tests := []struct { + name string + args []string + expectedError string + }{ + { + name: "conflicting percentage and count thresholds", + args: []string{ + "create", "cluster", + "--name", "test-conflict", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=25", + "--node-repair-max-unhealthy-count=5", + "--dry-run", + }, + expectedError: "cannot specify both percentage and count", + }, + { + name: "conflicting parallel percentage and count", + args: []string{ + "create", "cluster", + "--name", "test-parallel-conflict", + "--enable-node-repair", + "--node-repair-max-parallel-percentage=20", + "--node-repair-max-parallel-count=2", + "--dry-run", + }, + expectedError: "cannot specify both percentage and count", + }, + { + name: "invalid percentage value too high", + args: []string{ + "create", "cluster", + "--name", "test-invalid-percentage", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=150", + "--dry-run", + }, + expectedError: "percentage must be between 1 and 100", + }, + { + name: "invalid percentage value zero", + args: []string{ + "create", "cluster", + "--name", "test-zero-percentage", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=0", + "--dry-run", + }, + expectedError: "percentage must be between 1 and 100", + }, + { + name: "invalid count value zero", + args: []string{ + "create", "cluster", + "--name", "test-zero-count", + "--enable-node-repair", + "--node-repair-max-unhealthy-count=0", + "--dry-run", + }, + expectedError: "count must be greater than 0", + }, + { + name: "node repair flags without enable flag", + args: []string{ + "create", "cluster", + "--name", "test-no-enable", + "--node-repair-max-unhealthy-percentage=25", + "--dry-run", + }, + expectedError: "node repair flags require --enable-node-repair", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Run eksctl with the test arguments + cmd := exec.Command("../../eksctl", 
tt.args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + require.Error(t, err, "Command should fail with invalid configuration") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check that the expected error message is present + assert.Contains(t, strings.ToLower(output), strings.ToLower(tt.expectedError), + "Expected error message not found: %s", tt.expectedError) + }) + } +} + +// TestEnhancedNodeRepairConfigFileErrorHandling tests error handling for invalid config files +func TestEnhancedNodeRepairConfigFileErrorHandling(t *testing.T) { + // Create a temporary invalid config file + invalidConfig := ` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: test-invalid-config + region: us-west-2 +managedNodeGroups: +- name: ng-1 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 25 + maxUnhealthyNodeThresholdCount: 5 # This conflicts with percentage + maxParallelNodesRepairedPercentage: 150 # Invalid percentage > 100 +` + + tmpFile, err := os.CreateTemp("", "invalid-config-*.yaml") + require.NoError(t, err) + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString(invalidConfig) + require.NoError(t, err) + tmpFile.Close() + + // Run eksctl with the invalid config file + cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", tmpFile.Name(), "--dry-run") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err = cmd.Run() + require.Error(t, err, "Command should fail with invalid configuration") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check for validation errors + expectedErrors := []string{ + "cannot specify both percentage and count", + "percentage must be between 1 and 100", + } + + for _, expectedError := range expectedErrors { + assert.Contains(t, strings.ToLower(output), strings.ToLower(expectedError), + "Expected error message not found: %s", expectedError) + } +} + +// TestEnhancedNodeRepairUnmanagedNodegroupError tests that node repair flags are rejected for unmanaged nodegroups +func TestEnhancedNodeRepairUnmanagedNodegroupError(t *testing.T) { + // Create a config with unmanaged nodegroup and node repair config + invalidConfig := ` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: test-unmanaged-repair + region: us-west-2 +nodeGroups: # Unmanaged nodegroup +- name: ng-1 + nodeRepairConfig: # This should be invalid for unmanaged nodegroups + enabled: true +` + + tmpFile, err := os.CreateTemp("", "unmanaged-repair-*.yaml") + require.NoError(t, err) + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString(invalidConfig) + require.NoError(t, err) + tmpFile.Close() + + // Run eksctl with the invalid config file + cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", tmpFile.Name(), "--dry-run") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err = cmd.Run() + require.Error(t, err, "Command should fail with node repair config on unmanaged nodegroup") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check that the error mentions managed nodegroups only + assert.Contains(t, strings.ToLower(output), "managed nodegroups only", + "Should indicate that node repair is for managed nodegroups only") +} + +// TestEnhancedNodeRepairValidationRecovery tests that validation errors don't leave resources in inconsistent states 
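+// Because the command runs with --dry-run, the check below is that flag
+// validation fails early: the expected validation error must be reported, and
+// no CloudFormation template or stack-creation output should be produced.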
+func TestEnhancedNodeRepairValidationRecovery(t *testing.T) { + // This test ensures that when validation fails, no partial resources are created + // Since we're using --dry-run, we're mainly testing that the validation happens early + // and doesn't proceed to resource creation + + cmd := exec.Command("../../eksctl", + "create", "cluster", + "--name", "test-validation-recovery", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=200", // Invalid percentage + "--dry-run", + ) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + require.Error(t, err, "Command should fail early with validation error") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Ensure that validation happens before any resource creation attempts + assert.Contains(t, strings.ToLower(output), "percentage must be between 1 and 100", + "Should show validation error") + + // Should not contain CloudFormation template generation messages + assert.NotContains(t, strings.ToLower(output), "cloudformation template", + "Should not proceed to CloudFormation template generation") + assert.NotContains(t, strings.ToLower(output), "creating stack", + "Should not proceed to stack creation") +} \ No newline at end of file diff --git a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json index 433939a36e..5a993fee8c 100755 --- a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json +++ b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json @@ -2301,10 +2301,43 @@ "type": "boolean", "description": "Enables the auto repair feature for the nodegroup", "x-intellij-html-description": "Enables the auto repair feature for the nodegroup" + }, + "maxParallelNodesRepairedCount": { + "type": "integer", + "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.", + "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time." + }, + "maxParallelNodesRepairedPercentage": { + "type": "integer", + "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.", + "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time." + }, + "maxUnhealthyNodeThresholdCount": { + "type": "integer", + "description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.", + "x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time." + }, + "maxUnhealthyNodeThresholdPercentage": { + "type": "integer", + "description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. 
When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.", + "x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time." + }, + "nodeRepairConfigOverrides": { + "items": { + "$ref": "#/definitions/NodeRepairConfigOverride" + }, + "type": "array", + "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.", + "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values." } }, "preferredOrder": [ - "enabled" + "enabled", + "maxUnhealthyNodeThresholdPercentage", + "maxUnhealthyNodeThresholdCount", + "maxParallelNodesRepairedPercentage", + "maxParallelNodesRepairedCount", + "nodeRepairConfigOverrides" ], "additionalProperties": false, "description": "contains the auto repair configuration for the nodegroup", @@ -2430,6 +2463,39 @@ "description": "contains the configuration for updating NodeGroups.", "x-intellij-html-description": "contains the configuration for updating NodeGroups." }, + "NodeRepairConfigOverride": { + "properties": { + "minRepairWaitTimeMins": { + "type": "integer", + "description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason", + "x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason" + }, + "nodeMonitoringCondition": { + "type": "string", + "description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to", + "x-intellij-html-description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to" + }, + "nodeUnhealthyReason": { + "type": "string", + "description": "specifies a reason reported by the node monitoring agent that this override would apply to", + "x-intellij-html-description": "specifies a reason reported by the node monitoring agent that this override would apply to" + }, + "repairAction": { + "type": "string", + "description": "specifies the repair action to take for nodes when all of the specified conditions are met", + "x-intellij-html-description": "specifies the repair action to take for nodes when all of the specified conditions are met" + } + }, + "preferredOrder": [ + "nodeMonitoringCondition", + "nodeUnhealthyReason", + "minRepairWaitTimeMins", + "repairAction" + ], + "additionalProperties": false, + "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.", + "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values." 
+ }, "OIDCIdentityProvider": { "required": [ "name", diff --git a/pkg/apis/eksctl.io/v1alpha5/types.go b/pkg/apis/eksctl.io/v1alpha5/types.go index cda4a9f7b9..167f9a152e 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types.go +++ b/pkg/apis/eksctl.io/v1alpha5/types.go @@ -1578,6 +1578,48 @@ type ( // Enables the auto repair feature for the nodegroup // +optional Enabled *bool `json:"enabled,omitempty"` + + // MaxUnhealthyNodeThresholdPercentage specifies a percentage threshold of unhealthy nodes, above which node auto + // repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time. + // +optional + MaxUnhealthyNodeThresholdPercentage *int `json:"maxUnhealthyNodeThresholdPercentage,omitempty"` + + // MaxUnhealthyNodeThresholdCount specifies a count threshold of unhealthy nodes, above which node auto + // repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time. + // +optional + MaxUnhealthyNodeThresholdCount *int `json:"maxUnhealthyNodeThresholdCount,omitempty"` + + // MaxParallelNodesRepairedPercentage specifies the maximum number of nodes that can be repaired concurrently or in parallel, + // expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time. + // +optional + MaxParallelNodesRepairedPercentage *int `json:"maxParallelNodesRepairedPercentage,omitempty"` + + // MaxParallelNodesRepairedCount specifies the maximum number of nodes that can be repaired concurrently or in parallel, + // expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time. + // +optional + MaxParallelNodesRepairedCount *int `json:"maxParallelNodesRepairedCount,omitempty"` + + // NodeRepairConfigOverrides specifies granular overrides for specific repair actions. These overrides control the + // repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values. + // +optional + NodeRepairConfigOverrides []NodeRepairConfigOverride `json:"nodeRepairConfigOverrides,omitempty"` + } + + // NodeRepairConfigOverride specifies granular overrides for specific repair actions. These overrides control the + // repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values. 
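+	// A minimal override, using the values from examples/44-node-repair.yaml:
+	//
+	//   nodeRepairConfigOverrides:
+	//   - nodeMonitoringCondition: "NetworkNotReady"
+	//     nodeUnhealthyReason: "InterfaceNotUp"
+	//     minRepairWaitTimeMins: 15
+	//     repairAction: "Restart"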
+ NodeRepairConfigOverride struct { + // NodeMonitoringCondition specifies an unhealthy condition reported by the node monitoring agent that this override would apply to + NodeMonitoringCondition string `json:"nodeMonitoringCondition"` + + // NodeUnhealthyReason specifies a reason reported by the node monitoring agent that this override would apply to + NodeUnhealthyReason string `json:"nodeUnhealthyReason"` + + // MinRepairWaitTimeMins specifies the minimum time in minutes to wait before attempting to repair a node + // with this specific NodeMonitoringCondition and NodeUnhealthyReason + MinRepairWaitTimeMins int `json:"minRepairWaitTimeMins"` + + // RepairAction specifies the repair action to take for nodes when all of the specified conditions are met + RepairAction string `json:"repairAction"` } ) diff --git a/pkg/apis/eksctl.io/v1alpha5/types_test.go b/pkg/apis/eksctl.io/v1alpha5/types_test.go index eaf58f0bf7..bd7cd0073c 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types_test.go +++ b/pkg/apis/eksctl.io/v1alpha5/types_test.go @@ -1,6 +1,8 @@ package v1alpha5 import ( + "encoding/json" + "github.com/aws/aws-sdk-go-v2/aws" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -82,4 +84,173 @@ var _ = Describe("Types", func() { }) }) + Describe("NodeGroupNodeRepairConfig", func() { + var ( + nodeRepairConfig *NodeGroupNodeRepairConfig + ) + + BeforeEach(func() { + nodeRepairConfig = &NodeGroupNodeRepairConfig{} + }) + + Describe("JSON marshaling and unmarshaling", func() { + When("all fields are set", func() { + It("should marshal and unmarshal correctly", func() { + nodeRepairConfig.Enabled = aws.Bool(true) + nodeRepairConfig.MaxUnhealthyNodeThresholdPercentage = aws.Int(20) + nodeRepairConfig.MaxUnhealthyNodeThresholdCount = aws.Int(5) + nodeRepairConfig.MaxParallelNodesRepairedPercentage = aws.Int(15) + nodeRepairConfig.MaxParallelNodesRepairedCount = aws.Int(2) + nodeRepairConfig.NodeRepairConfigOverrides = []NodeRepairConfigOverride{ + { + NodeMonitoringCondition: "AcceleratedInstanceNotReady", + NodeUnhealthyReason: "NvidiaXID13Error", + MinRepairWaitTimeMins: 10, + RepairAction: "Terminate", + }, + { + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 20, + RepairAction: "Restart", + }, + } + + // Test JSON marshaling + jsonData, err := json.Marshal(nodeRepairConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"enabled":true`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxUnhealthyNodeThresholdPercentage":20`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxUnhealthyNodeThresholdCount":5`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxParallelNodesRepairedPercentage":15`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxParallelNodesRepairedCount":2`)) + Expect(string(jsonData)).To(ContainSubstring(`"nodeRepairConfigOverrides"`)) + Expect(string(jsonData)).To(ContainSubstring(`"AcceleratedInstanceNotReady"`)) + Expect(string(jsonData)).To(ContainSubstring(`"NvidiaXID13Error"`)) + + // Test JSON unmarshaling + var unmarshaled NodeGroupNodeRepairConfig + err = json.Unmarshal(jsonData, &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(*unmarshaled.Enabled).To(BeTrue()) + Expect(*unmarshaled.MaxUnhealthyNodeThresholdPercentage).To(Equal(20)) + Expect(*unmarshaled.MaxUnhealthyNodeThresholdCount).To(Equal(5)) + Expect(*unmarshaled.MaxParallelNodesRepairedPercentage).To(Equal(15)) + Expect(*unmarshaled.MaxParallelNodesRepairedCount).To(Equal(2)) + 
Expect(len(unmarshaled.NodeRepairConfigOverrides)).To(Equal(2)) + Expect(unmarshaled.NodeRepairConfigOverrides[0].NodeMonitoringCondition).To(Equal("AcceleratedInstanceNotReady")) + Expect(unmarshaled.NodeRepairConfigOverrides[0].NodeUnhealthyReason).To(Equal("NvidiaXID13Error")) + Expect(unmarshaled.NodeRepairConfigOverrides[0].MinRepairWaitTimeMins).To(Equal(10)) + Expect(unmarshaled.NodeRepairConfigOverrides[0].RepairAction).To(Equal("Terminate")) + }) + }) + + When("only enabled field is set", func() { + It("should marshal and unmarshal correctly with minimal config", func() { + nodeRepairConfig.Enabled = aws.Bool(true) + + // Test JSON marshaling + jsonData, err := json.Marshal(nodeRepairConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"enabled":true`)) + Expect(string(jsonData)).NotTo(ContainSubstring(`"maxUnhealthyNodeThresholdPercentage"`)) + + // Test JSON unmarshaling + var unmarshaled NodeGroupNodeRepairConfig + err = json.Unmarshal(jsonData, &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(*unmarshaled.Enabled).To(BeTrue()) + Expect(unmarshaled.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + Expect(unmarshaled.MaxUnhealthyNodeThresholdCount).To(BeNil()) + Expect(unmarshaled.MaxParallelNodesRepairedPercentage).To(BeNil()) + Expect(unmarshaled.MaxParallelNodesRepairedCount).To(BeNil()) + Expect(len(unmarshaled.NodeRepairConfigOverrides)).To(Equal(0)) + }) + }) + + When("enabled is false", func() { + It("should handle disabled state correctly", func() { + nodeRepairConfig.Enabled = aws.Bool(false) + + jsonData, err := json.Marshal(nodeRepairConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"enabled":false`)) + + var unmarshaled NodeGroupNodeRepairConfig + err = json.Unmarshal(jsonData, &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(*unmarshaled.Enabled).To(BeFalse()) + }) + }) + }) + + Describe("NodeRepairConfigOverride", func() { + var override NodeRepairConfigOverride + + BeforeEach(func() { + override = NodeRepairConfigOverride{ + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 15, + RepairAction: "Restart", + } + }) + + It("should have all required fields", func() { + Expect(override.NodeMonitoringCondition).To(Equal("NetworkNotReady")) + Expect(override.NodeUnhealthyReason).To(Equal("InterfaceNotUp")) + Expect(override.MinRepairWaitTimeMins).To(Equal(15)) + Expect(override.RepairAction).To(Equal("Restart")) + }) + + It("should marshal to JSON correctly", func() { + jsonData, err := json.Marshal(override) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"nodeMonitoringCondition":"NetworkNotReady"`)) + Expect(string(jsonData)).To(ContainSubstring(`"nodeUnhealthyReason":"InterfaceNotUp"`)) + Expect(string(jsonData)).To(ContainSubstring(`"minRepairWaitTimeMins":15`)) + Expect(string(jsonData)).To(ContainSubstring(`"repairAction":"Restart"`)) + }) + + It("should unmarshal from JSON correctly", func() { + jsonStr := `{ + "nodeMonitoringCondition": "AcceleratedInstanceNotReady", + "nodeUnhealthyReason": "NvidiaXID13Error", + "minRepairWaitTimeMins": 25, + "repairAction": "Terminate" + }` + + var unmarshaled NodeRepairConfigOverride + err := json.Unmarshal([]byte(jsonStr), &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(unmarshaled.NodeMonitoringCondition).To(Equal("AcceleratedInstanceNotReady")) + Expect(unmarshaled.NodeUnhealthyReason).To(Equal("NvidiaXID13Error")) + 
Expect(unmarshaled.MinRepairWaitTimeMins).To(Equal(25)) + Expect(unmarshaled.RepairAction).To(Equal("Terminate")) + }) + }) + + Describe("Pointer field handling", func() { + It("should distinguish between nil and zero values", func() { + // Test nil values + config1 := &NodeGroupNodeRepairConfig{} + Expect(config1.Enabled).To(BeNil()) + Expect(config1.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + + // Test zero values + config2 := &NodeGroupNodeRepairConfig{ + Enabled: aws.Bool(false), + MaxUnhealthyNodeThresholdPercentage: aws.Int(0), + MaxUnhealthyNodeThresholdCount: aws.Int(0), + MaxParallelNodesRepairedPercentage: aws.Int(0), + MaxParallelNodesRepairedCount: aws.Int(0), + } + Expect(config2.Enabled).NotTo(BeNil()) + Expect(*config2.Enabled).To(BeFalse()) + Expect(config2.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil()) + Expect(*config2.MaxUnhealthyNodeThresholdPercentage).To(Equal(0)) + }) + }) + }) + }) diff --git a/pkg/cfn/builder/managed_nodegroup.go b/pkg/cfn/builder/managed_nodegroup.go index a469c433fd..233ffe3790 100644 --- a/pkg/cfn/builder/managed_nodegroup.go +++ b/pkg/cfn/builder/managed_nodegroup.go @@ -127,9 +127,41 @@ func (m *ManagedNodeGroupResourceSet) AddAllResources(ctx context.Context) error if m.nodeGroup.NodeRepairConfig != nil { nodeRepairConfig := &gfneks.Nodegroup_NodeRepairConfig{} + if m.nodeGroup.NodeRepairConfig.Enabled != nil { nodeRepairConfig.Enabled = gfnt.NewBoolean(*m.nodeGroup.NodeRepairConfig.Enabled) } + + if m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage != nil { + nodeRepairConfig.MaxUnhealthyNodeThresholdPercentage = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage) + } + + if m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount != nil { + nodeRepairConfig.MaxUnhealthyNodeThresholdCount = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount) + } + + if m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage != nil { + nodeRepairConfig.MaxParallelNodesRepairedPercentage = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage) + } + + if m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount != nil { + nodeRepairConfig.MaxParallelNodesRepairedCount = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount) + } + + if len(m.nodeGroup.NodeRepairConfig.NodeRepairConfigOverrides) > 0 { + var overrides []gfneks.Nodegroup_NodeRepairConfigOverride + for _, override := range m.nodeGroup.NodeRepairConfig.NodeRepairConfigOverrides { + cfnOverride := gfneks.Nodegroup_NodeRepairConfigOverride{ + NodeMonitoringCondition: gfnt.NewString(override.NodeMonitoringCondition), + NodeUnhealthyReason: gfnt.NewString(override.NodeUnhealthyReason), + MinRepairWaitTimeMins: gfnt.NewInteger(override.MinRepairWaitTimeMins), + RepairAction: gfnt.NewString(override.RepairAction), + } + overrides = append(overrides, cfnOverride) + } + nodeRepairConfig.NodeRepairConfigOverrides = overrides + } + managedResource.NodeRepairConfig = nodeRepairConfig } diff --git a/pkg/cfn/builder/managed_nodegroup_test.go b/pkg/cfn/builder/managed_nodegroup_test.go index 6f9271fc2f..cad176f599 100644 --- a/pkg/cfn/builder/managed_nodegroup_test.go +++ b/pkg/cfn/builder/managed_nodegroup_test.go @@ -6,6 +6,7 @@ import ( "fmt" "testing" + "github.com/aws/aws-sdk-go-v2/aws" "github.com/weaveworks/eksctl/pkg/goformation" gfneks "github.com/weaveworks/eksctl/pkg/goformation/cloudformation/eks" gfnt 
"github.com/weaveworks/eksctl/pkg/goformation/cloudformation/types" @@ -261,3 +262,232 @@ func subs(ss []string) []*gfnt.Value { } return subs } + +func TestManagedNodeGroupNodeRepairConfig(t *testing.T) { + nodeRepairTests := []struct { + description string + nodeRepairConfig *api.NodeGroupNodeRepairConfig + expectedConfig *gfneks.Nodegroup_NodeRepairConfig + }{ + { + description: "nil node repair config", + nodeRepairConfig: nil, + expectedConfig: nil, + }, + { + description: "enabled only", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + }, + }, + { + description: "disabled only", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Disabled(), + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(false), + }, + }, + { + description: "all threshold and parallel parameters", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + MaxUnhealthyNodeThresholdPercentage: aws.Int(20), + MaxUnhealthyNodeThresholdCount: aws.Int(5), + MaxParallelNodesRepairedPercentage: aws.Int(15), + MaxParallelNodesRepairedCount: aws.Int(2), + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + MaxUnhealthyNodeThresholdPercentage: gfnt.NewInteger(20), + MaxUnhealthyNodeThresholdCount: gfnt.NewInteger(5), + MaxParallelNodesRepairedPercentage: gfnt.NewInteger(15), + MaxParallelNodesRepairedCount: gfnt.NewInteger(2), + }, + }, + { + description: "with node repair config overrides", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + NodeRepairConfigOverrides: []api.NodeRepairConfigOverride{ + { + NodeMonitoringCondition: "AcceleratedInstanceNotReady", + NodeUnhealthyReason: "NvidiaXID13Error", + MinRepairWaitTimeMins: 10, + RepairAction: "Terminate", + }, + { + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 20, + RepairAction: "Restart", + }, + }, + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + NodeRepairConfigOverrides: []gfneks.Nodegroup_NodeRepairConfigOverride{ + { + NodeMonitoringCondition: gfnt.NewString("AcceleratedInstanceNotReady"), + NodeUnhealthyReason: gfnt.NewString("NvidiaXID13Error"), + MinRepairWaitTimeMins: gfnt.NewInteger(10), + RepairAction: gfnt.NewString("Terminate"), + }, + { + NodeMonitoringCondition: gfnt.NewString("NetworkNotReady"), + NodeUnhealthyReason: gfnt.NewString("InterfaceNotUp"), + MinRepairWaitTimeMins: gfnt.NewInteger(20), + RepairAction: gfnt.NewString("Restart"), + }, + }, + }, + }, + { + description: "comprehensive configuration", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + MaxUnhealthyNodeThresholdPercentage: aws.Int(25), + MaxParallelNodesRepairedCount: aws.Int(3), + NodeRepairConfigOverrides: []api.NodeRepairConfigOverride{ + { + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 15, + RepairAction: "Restart", + }, + }, + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + MaxUnhealthyNodeThresholdPercentage: gfnt.NewInteger(25), + MaxParallelNodesRepairedCount: gfnt.NewInteger(3), + NodeRepairConfigOverrides: []gfneks.Nodegroup_NodeRepairConfigOverride{ + { + NodeMonitoringCondition: gfnt.NewString("NetworkNotReady"), + NodeUnhealthyReason: 
gfnt.NewString("InterfaceNotUp"), + MinRepairWaitTimeMins: gfnt.NewInteger(15), + RepairAction: gfnt.NewString("Restart"), + }, + }, + }, + }, + } + + for _, tt := range nodeRepairTests { + t.Run(tt.description, func(t *testing.T) { + clusterConfig := api.NewClusterConfig() + clusterConfig.Metadata.Name = "test-cluster" + clusterConfig.Metadata.Region = "us-west-2" + + ng := &api.ManagedNodeGroup{ + NodeGroupBase: &api.NodeGroupBase{ + Name: "test-ng", + InstanceType: "m5.large", + }, + NodeRepairConfig: tt.nodeRepairConfig, + } + + clusterConfig.Status = &api.ClusterStatus{} + err := api.SetManagedNodeGroupDefaults(ng, clusterConfig.Metadata, false) + require.NoError(t, err) + + p := mockprovider.NewMockProvider() + fakeVPCImporter := new(vpcfakes.FakeImporter) + bootstrapper, err := nodebootstrap.NewManagedBootstrapper(clusterConfig, ng) + require.NoError(t, err) + + // Mock subnets and AZ instance support like other tests + mockSubnetsAndAZInstanceSupport(clusterConfig, p, + []string{"us-west-2a"}, + []string{}, // local zones + []ec2types.InstanceType{api.DefaultNodeType}) + + stack := builder.NewManagedNodeGroup(p.EC2(), clusterConfig, ng, nil, bootstrapper, false, fakeVPCImporter) + err = stack.AddAllResources(context.Background()) + require.NoError(t, err) + + bytes, err := stack.RenderJSON() + require.NoError(t, err) + + template, err := goformation.ParseJSON(bytes) + require.NoError(t, err) + + // Get the managed nodegroup resource + ngResource, ok := template.Resources[builder.ManagedNodeGroupResourceName] + require.True(t, ok, "ManagedNodeGroup resource should exist") + managedNodeGroup, ok := ngResource.(*gfneks.Nodegroup) + require.True(t, ok, "Resource should be a Nodegroup") + + // Test the node repair config + if tt.expectedConfig == nil { + require.Nil(t, managedNodeGroup.NodeRepairConfig, "NodeRepairConfig should be nil") + } else { + require.NotNil(t, managedNodeGroup.NodeRepairConfig, "NodeRepairConfig should not be nil") + + // Test enabled field + if tt.expectedConfig.Enabled != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.Enabled) + require.Equal(t, tt.expectedConfig.Enabled.Raw(), managedNodeGroup.NodeRepairConfig.Enabled.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.Enabled) + } + + // Test threshold percentage + if tt.expectedConfig.MaxUnhealthyNodeThresholdPercentage != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage) + require.Equal(t, tt.expectedConfig.MaxUnhealthyNodeThresholdPercentage.Raw(), + managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage) + } + + // Test threshold count + if tt.expectedConfig.MaxUnhealthyNodeThresholdCount != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount) + require.Equal(t, tt.expectedConfig.MaxUnhealthyNodeThresholdCount.Raw(), + managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount) + } + + // Test parallel percentage + if tt.expectedConfig.MaxParallelNodesRepairedPercentage != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage) + require.Equal(t, tt.expectedConfig.MaxParallelNodesRepairedPercentage.Raw(), + managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage.Raw()) + } else { + require.Nil(t, 
managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage) + } + + // Test parallel count + if tt.expectedConfig.MaxParallelNodesRepairedCount != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount) + require.Equal(t, tt.expectedConfig.MaxParallelNodesRepairedCount.Raw(), + managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount) + } + + // Test overrides + require.Equal(t, len(tt.expectedConfig.NodeRepairConfigOverrides), + len(managedNodeGroup.NodeRepairConfig.NodeRepairConfigOverrides)) + + for i, expectedOverride := range tt.expectedConfig.NodeRepairConfigOverrides { + actualOverride := managedNodeGroup.NodeRepairConfig.NodeRepairConfigOverrides[i] + require.Equal(t, expectedOverride.NodeMonitoringCondition.Raw(), + actualOverride.NodeMonitoringCondition.Raw()) + require.Equal(t, expectedOverride.NodeUnhealthyReason.Raw(), + actualOverride.NodeUnhealthyReason.Raw()) + require.Equal(t, expectedOverride.MinRepairWaitTimeMins.Raw(), + actualOverride.MinRepairWaitTimeMins.Raw()) + require.Equal(t, expectedOverride.RepairAction.Raw(), + actualOverride.RepairAction.Raw()) + } + } + }) + } +} diff --git a/pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json b/pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json new file mode 100644 index 0000000000..4cf955f12b --- /dev/null +++ b/pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json @@ -0,0 +1,192 @@ +{ + "LaunchTemplate": { + "Type": "AWS::EC2::LaunchTemplate", + "Properties": { + "LaunchTemplateData": { + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "Iops": 3000, + "Throughput": 125, + "VolumeSize": 80, + "VolumeType": "gp3" + } + } + ], + "MetadataOptions": { + "HttpPutResponseHopLimit": 2, + "HttpTokens": "required" + }, + "SecurityGroupIds": [ + { + "Fn::ImportValue": "eksctl-lt::ClusterSecurityGroupId" + } + ], + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [ + { + "Key": "Name", + "Value": "lt-enhanced-node-repair-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "enhanced-node-repair" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + }, + { + "ResourceType": "volume", + "Tags": [ + { + "Key": "Name", + "Value": "lt-enhanced-node-repair-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "enhanced-node-repair" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + }, + { + "ResourceType": "network-interface", + "Tags": [ + { + "Key": "Name", + "Value": "lt-enhanced-node-repair-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "enhanced-node-repair" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + } + ] + }, + "LaunchTemplateName": { + "Fn::Sub": "${AWS::StackName}" + } + } + }, + "ManagedNodeGroup": { + "Type": "AWS::EKS::Nodegroup", + "Properties": { + "AmiType": "AL2023_x86_64_STANDARD", + "ClusterName": "lt", + "Labels": { + "alpha.eksctl.io/cluster-name": "lt", + "alpha.eksctl.io/nodegroup-name": "enhanced-node-repair" + }, + "InstanceTypes": ["m5.xlarge"], + "NodeRole": { + "Fn::GetAtt": [ + "NodeInstanceRole", + "Arn" + ] + }, + "NodegroupName": "enhanced-node-repair", + "ScalingConfig": { + "DesiredSize": 3, + "MaxSize": 5, + "MinSize": 1 + }, + "Subnets": [ + "subnet-public-us-west-2a" + ], + "Tags": { + "alpha.eksctl.io/nodegroup-name": 
"enhanced-node-repair", + "alpha.eksctl.io/nodegroup-type": "managed" + }, + "LaunchTemplate": { + "Id": { + "Ref": "LaunchTemplate" + } + }, + "NodeRepairConfig": { + "Enabled": true, + "MaxUnhealthyNodeThresholdPercentage": 20, + "MaxParallelNodesRepairedPercentage": 15, + "NodeRepairConfigOverrides": [ + { + "NodeMonitoringCondition": "AcceleratedInstanceNotReady", + "NodeUnhealthyReason": "NvidiaXID13Error", + "MinRepairWaitTimeMins": 10, + "RepairAction": "Terminate" + }, + { + "NodeMonitoringCondition": "NetworkNotReady", + "NodeUnhealthyReason": "InterfaceNotUp", + "MinRepairWaitTimeMins": 20, + "RepairAction": "Restart" + } + ] + } + } + }, + "NodeInstanceRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": [ + "sts:AssumeRole" + ], + "Effect": "Allow", + "Principal": { + "Service": [ + { + "Fn::FindInMap": [ + "ServicePrincipalPartitionMap", + { + "Ref": "AWS::Partition" + }, + "EC2" + ] + } + ] + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEKS_CNI_Policy" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + ], + "Path": "/", + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Sub": "${AWS::StackName}/NodeInstanceRole" + } + } + ] + } + } +} \ No newline at end of file diff --git a/pkg/ctl/cmdutils/configfile.go b/pkg/ctl/cmdutils/configfile.go index 7394d4ed15..0a4a787e67 100644 --- a/pkg/ctl/cmdutils/configfile.go +++ b/pkg/ctl/cmdutils/configfile.go @@ -611,10 +611,29 @@ func makeManagedNodegroup(nodeGroup *api.NodeGroup, options CreateManagedNGOptio Spot: options.Spot, InstanceTypes: options.InstanceTypes, } - if options.NodeRepairEnabled { + if options.NodeRepairEnabled || options.NodeRepairMaxUnhealthyPercentage != nil || + options.NodeRepairMaxUnhealthyCount != nil || options.NodeRepairMaxParallelPercentage != nil || + options.NodeRepairMaxParallelCount != nil { + mng.NodeRepairConfig = &api.NodeGroupNodeRepairConfig{ Enabled: &options.NodeRepairEnabled, } + + if options.NodeRepairMaxUnhealthyPercentage != nil && *options.NodeRepairMaxUnhealthyPercentage > 0 { + mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage = options.NodeRepairMaxUnhealthyPercentage + } + + if options.NodeRepairMaxUnhealthyCount != nil && *options.NodeRepairMaxUnhealthyCount > 0 { + mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount = options.NodeRepairMaxUnhealthyCount + } + + if options.NodeRepairMaxParallelPercentage != nil && *options.NodeRepairMaxParallelPercentage > 0 { + mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage = options.NodeRepairMaxParallelPercentage + } + + if options.NodeRepairMaxParallelCount != nil && *options.NodeRepairMaxParallelCount > 0 { + mng.NodeRepairConfig.MaxParallelNodesRepairedCount = options.NodeRepairMaxParallelCount + } } return mng } @@ -627,7 +646,7 @@ func validateManagedNGFlags(cmd *cobra.Command, managed bool) error { if managed { return nil } - flagsValidOnlyWithMNG := []string{"spot", "enable-node-repair", "instance-types"} + flagsValidOnlyWithMNG := []string{"spot", "enable-node-repair", "instance-types", "node-repair-max-unhealthy-percentage", "node-repair-max-unhealthy-count", "node-repair-max-parallel-percentage", "node-repair-max-parallel-count"} if flagName, found 
:= findChangedFlag(cmd, flagsValidOnlyWithMNG); found { return fmt.Errorf("--%s is only valid with managed nodegroups (--managed)", flagName) } diff --git a/pkg/ctl/cmdutils/configfile_test.go b/pkg/ctl/cmdutils/configfile_test.go index 063c032d8a..21f07a9532 100644 --- a/pkg/ctl/cmdutils/configfile_test.go +++ b/pkg/ctl/cmdutils/configfile_test.go @@ -648,6 +648,164 @@ var _ = Describe("cmdutils configfile", func() { }) }) }) + + Context("makeManagedNodegroup with node repair config", func() { + var ( + ng *api.NodeGroup + options CreateManagedNGOptions + ) + + BeforeEach(func() { + ng = &api.NodeGroup{ + NodeGroupBase: &api.NodeGroupBase{ + Name: "test-ng", + }, + } + options = CreateManagedNGOptions{} + }) + + It("should create managed nodegroup without node repair config when not enabled", func() { + options.NodeRepairEnabled = false + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).To(BeNil()) + }) + + It("should create managed nodegroup with basic node repair config when enabled", func() { + options.NodeRepairEnabled = true + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(BeNil()) + }) + + It("should create managed nodegroup with threshold percentage", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyPercentage = aws.Int(25) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(Equal(25)) + }) + + It("should create managed nodegroup with threshold count", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyCount = aws.Int(5) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(Equal(5)) + }) + + It("should create managed nodegroup with parallel percentage", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxParallelPercentage = aws.Int(20) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(Equal(20)) + }) + + It("should create managed nodegroup with parallel count", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxParallelCount = aws.Int(3) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + 
+			Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(Equal(3))
+		})
+
+		It("should create managed nodegroup with all parameters", func() {
+			options.NodeRepairEnabled = true
+			options.NodeRepairMaxUnhealthyPercentage = aws.Int(30)
+			options.NodeRepairMaxUnhealthyCount = aws.Int(10)
+			options.NodeRepairMaxParallelPercentage = aws.Int(25)
+			options.NodeRepairMaxParallelCount = aws.Int(4)
+
+			mng := makeManagedNodegroup(ng, options)
+
+			Expect(mng.NodeRepairConfig).NotTo(BeNil())
+			Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue())
+			Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(Equal(30))
+			Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(Equal(10))
+			Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(Equal(25))
+			Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(Equal(4))
+		})
+
+		It("should create node repair config when enabled is false but other parameters are set", func() {
+			options.NodeRepairEnabled = false
+			options.NodeRepairMaxUnhealthyPercentage = aws.Int(15)
+
+			mng := makeManagedNodegroup(ng, options)
+
+			Expect(mng.NodeRepairConfig).NotTo(BeNil())
+			Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.Enabled).To(BeFalse())
+			Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(Equal(15))
+		})
+
+		It("should ignore zero values for optional parameters", func() {
+			options.NodeRepairEnabled = true
+			options.NodeRepairMaxUnhealthyPercentage = aws.Int(0)
+			options.NodeRepairMaxParallelCount = aws.Int(0)
+
+			mng := makeManagedNodegroup(ng, options)
+
+			Expect(mng.NodeRepairConfig).NotTo(BeNil())
+			Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue())
+			// Zero values should be ignored
+			Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(BeNil())
+			Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(BeNil())
+		})
+
+		It("should handle nil pointers for optional parameters", func() {
+			options.NodeRepairEnabled = true
+			options.NodeRepairMaxUnhealthyPercentage = nil
+			options.NodeRepairMaxUnhealthyCount = nil
+			options.NodeRepairMaxParallelPercentage = nil
+			options.NodeRepairMaxParallelCount = nil
+
+			mng := makeManagedNodegroup(ng, options)
+
+			Expect(mng.NodeRepairConfig).NotTo(BeNil())
+			Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil())
+			Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue())
+			Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(BeNil())
+			Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(BeNil())
+			Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(BeNil())
+			Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(BeNil())
+		})
+	})
 })

 func assertValidClusterEndpoint(endpoints *api.ClusterEndpoints, privateAccess, publicAccess bool) {
diff --git a/pkg/ctl/cmdutils/create_cluster.go b/pkg/ctl/cmdutils/create_cluster.go
index 31201c0d23..67fdd03294 100644
--- a/pkg/ctl/cmdutils/create_cluster.go
+++ b/pkg/ctl/cmdutils/create_cluster.go
@@ -43,6 +43,12 @@ type CreateManagedNGOptions struct {
 	Spot              bool
 	NodeRepairEnabled bool
 	InstanceTypes     []string
+
+	// New node repair configuration options
+	NodeRepairMaxUnhealthyPercentage *int
+	NodeRepairMaxUnhealthyCount      *int
+	NodeRepairMaxParallelPercentage  *int
+	NodeRepairMaxParallelCount       *int
 }

 // CreateNGOptions holds options for creating a nodegroup
diff --git a/pkg/ctl/cmdutils/nodegroup_flags.go b/pkg/ctl/cmdutils/nodegroup_flags.go
index 8221ef4558..7662b7bb55 100644
--- a/pkg/ctl/cmdutils/nodegroup_flags.go
+++ b/pkg/ctl/cmdutils/nodegroup_flags.go
@@ -57,6 +57,13 @@ func AddCommonCreateNodeGroupFlags(fs *pflag.FlagSet, cmd *Cmd, ng *api.NodeGrou
 	fs.BoolVarP(&mngOptions.Managed, "managed", "", true, "Create EKS-managed nodegroup")
 	fs.BoolVar(&mngOptions.Spot, "spot", false, "Create a spot nodegroup (managed nodegroups only)")
 	fs.BoolVar(&mngOptions.NodeRepairEnabled, "enable-node-repair", false, "Enable automatic node repair (managed nodegroups only)")
+
+	// Node repair configuration flags
+	mngOptions.NodeRepairMaxUnhealthyPercentage = fs.Int("node-repair-max-unhealthy-percentage", 0, "Percentage threshold of unhealthy nodes, above which node auto repair actions will stop (managed nodegroups only)")
+	mngOptions.NodeRepairMaxUnhealthyCount = fs.Int("node-repair-max-unhealthy-count", 0, "Count threshold of unhealthy nodes, above which node auto repair actions will stop (managed nodegroups only)")
+	mngOptions.NodeRepairMaxParallelPercentage = fs.Int("node-repair-max-parallel-percentage", 0, "Maximum percentage of unhealthy nodes that can be repaired in parallel (managed nodegroups only)")
+	mngOptions.NodeRepairMaxParallelCount = fs.Int("node-repair-max-parallel-count", 0, "Maximum count of unhealthy nodes that can be repaired in parallel (managed nodegroups only)")
+
 	fs.StringSliceVar(&mngOptions.InstanceTypes, "instance-types", nil, "Comma-separated list of instance types (e.g., --instance-types=c3.large,c4.large,c5.large")
 }

diff --git a/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go b/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go
index e64996f12f..12d4f89194 100644
--- a/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go
+++ b/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go
@@ -15,6 +15,31 @@ type Nodegroup_NodeRepairConfig struct {
 	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-enabled
 	Enabled *types.Value `json:"Enabled,omitempty"`

+	// MaxUnhealthyNodeThresholdPercentage AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxunhealthynodethresholdpercentage
+	MaxUnhealthyNodeThresholdPercentage *types.Value `json:"MaxUnhealthyNodeThresholdPercentage,omitempty"`
+
+	// MaxUnhealthyNodeThresholdCount AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxunhealthynodethresholdcount
+	MaxUnhealthyNodeThresholdCount *types.Value `json:"MaxUnhealthyNodeThresholdCount,omitempty"`
+
+	// MaxParallelNodesRepairedPercentage AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxparallelnodesrepairedpercentage
+	MaxParallelNodesRepairedPercentage *types.Value `json:"MaxParallelNodesRepairedPercentage,omitempty"`
+
+	// MaxParallelNodesRepairedCount AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxparallelnodesrepairedcount
+	MaxParallelNodesRepairedCount *types.Value `json:"MaxParallelNodesRepairedCount,omitempty"`
+
+	// NodeRepairConfigOverrides AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-noderepairconfigurations
+	NodeRepairConfigOverrides []Nodegroup_NodeRepairConfigOverride `json:"NodeRepairConfigOverrides,omitempty"`
+
 	// AWSCloudFormationDeletionPolicy represents a CloudFormation DeletionPolicy
 	AWSCloudFormationDeletionPolicy policies.DeletionPolicy `json:"-"`

@@ -35,3 +60,48 @@ type Nodegroup_NodeRepairConfig struct {
 func (r *Nodegroup_NodeRepairConfig) AWSCloudFormationType() string {
 	return "AWS::EKS::Nodegroup.NodeRepairConfig"
 }
+
+// Nodegroup_NodeRepairConfigOverride AWS CloudFormation Resource (AWS::EKS::Nodegroup.NodeRepairConfigOverride)
+// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html
+type Nodegroup_NodeRepairConfigOverride struct {
+
+	// NodeMonitoringCondition AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-nodemonitoringcondition
+	NodeMonitoringCondition *types.Value `json:"NodeMonitoringCondition,omitempty"`
+
+	// NodeUnhealthyReason AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-nodeunhealthyreason
+	NodeUnhealthyReason *types.Value `json:"NodeUnhealthyReason,omitempty"`
+
+	// MinRepairWaitTimeMins AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-minrepairwaittimemins
+	MinRepairWaitTimeMins *types.Value `json:"MinRepairWaitTimeMins,omitempty"`
+
+	// RepairAction AWS CloudFormation Property
+	// Required: false
+	// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-repairaction
+	RepairAction *types.Value `json:"RepairAction,omitempty"`
+
+	// AWSCloudFormationDeletionPolicy represents a CloudFormation DeletionPolicy
+	AWSCloudFormationDeletionPolicy policies.DeletionPolicy `json:"-"`
+
+	// AWSCloudFormationUpdateReplacePolicy represents a CloudFormation UpdateReplacePolicy
+	AWSCloudFormationUpdateReplacePolicy policies.UpdateReplacePolicy `json:"-"`
+
+	// AWSCloudFormationDependsOn stores the logical ID of the resources to be created before this resource
+	AWSCloudFormationDependsOn []string `json:"-"`
+
+	// AWSCloudFormationMetadata stores structured data associated with this resource
+	AWSCloudFormationMetadata map[string]interface{} `json:"-"`
+
+	// AWSCloudFormationCondition stores the logical ID of the condition that must be satisfied for this resource to be created
+	AWSCloudFormationCondition string `json:"-"`
+}
+
+// AWSCloudFormationType returns the AWS CloudFormation resource type
+func (r *Nodegroup_NodeRepairConfigOverride) AWSCloudFormationType() string {
+	return "AWS::EKS::Nodegroup.NodeRepairConfigOverride"
+}
diff --git a/userdocs/src/usage/nodegroup-node-repair-config.md b/userdocs/src/usage/nodegroup-node-repair-config.md
index f42ff4b434..da2d592dbf 100644
--- a/userdocs/src/usage/nodegroup-node-repair-config.md
+++ b/userdocs/src/usage/nodegroup-node-repair-config.md
@@ -1,46 +1,248 @@
-# Support for Node Repair Config in EKS Managed Nodegroups
+# Node Repair Configuration for EKS Managed Nodegroups

-EKS Managed Nodegroups now supports Node Repair, where the health of managed nodes are monitored,
-and unhealthy worker nodes are replaced or rebooted in response.
+EKS Managed Nodegroups support Node Repair: the health of managed nodes is monitored,
+and unhealthy worker nodes are replaced or rebooted in response. eksctl now provides comprehensive
+configuration options for fine-grained control over node repair behavior.

-## Creating a cluster a managed nodegroup with node repair enabled
+## Basic Node Repair Configuration

-To create a cluster with a managed nodegroup using node repair, pass the `--enable-node-repair` flag:
+### Using CLI flags
+
+To create a cluster with a managed nodegroup using basic node repair:

 ```shell
 $ eksctl create cluster --enable-node-repair
 ```

-To create a managed nodegroup using node repair on an existing cluster:
+To create a managed nodegroup with node repair on an existing cluster:

 ```shell
 $ eksctl create nodegroup --cluster=<cluster-name> --enable-node-repair
 ```

-To create a cluster with a managed nodegroup using node repair via a config file:
+### Using configuration files

 ```yaml
-# node-repair-nodegroup-cluster.yaml
----
+# basic-node-repair.yaml
 apiVersion: eksctl.io/v1alpha5
 kind: ClusterConfig

 metadata:
-  name: cluster-44
+  name: basic-node-repair-cluster
   region: us-west-2

 managedNodeGroups:
 - name: ng-1
   nodeRepairConfig:
     enabled: true
+```
+
+```shell
+$ eksctl create cluster -f basic-node-repair.yaml
+```
+
+## Enhanced Node Repair Configuration
+
+### Threshold Configuration
+
+You can configure when node repair actions stop using either percentage-based or count-based thresholds. **Note: You cannot use both percentage and count thresholds at the same time.**
+
+#### CLI flags for thresholds
+
+```shell
+# Percentage-based threshold - repair stops when 20% of nodes are unhealthy
+$ eksctl create cluster --enable-node-repair \
+  --node-repair-max-unhealthy-percentage=20
+
+# Count-based threshold - repair stops when 5 nodes are unhealthy
+$ eksctl create cluster --enable-node-repair \
+  --node-repair-max-unhealthy-count=5
 ```

+#### Configuration file for thresholds
+
+```yaml
+managedNodeGroups:
+- name: threshold-ng
+  nodeRepairConfig:
+    enabled: true
+    # Stop repair actions when 20% of nodes are unhealthy
+    maxUnhealthyNodeThresholdPercentage: 20
+    # Alternative: stop repair actions when 3 nodes are unhealthy
+    # maxUnhealthyNodeThresholdCount: 3
+    # Note: Cannot use both percentage and count thresholds simultaneously
+```
+
+### Parallel Repair Limits
+
+Control the maximum number of nodes that can be repaired in parallel. This gives you finer-grained control over the pace of node replacements. **Note: You cannot use both percentage and count limits at the same time.**
+
+#### CLI flags for parallel limits
+
 ```shell
-$ eksctl create cluster -f node-repair-nodegroup-cluster.yaml
+# Percentage-based parallel limit - repair at most 15% of unhealthy nodes in parallel
+$ eksctl create cluster --enable-node-repair \
+  --node-repair-max-parallel-percentage=15
+
+# Count-based parallel limit - repair at most 2 unhealthy nodes in parallel
+$ eksctl create cluster --enable-node-repair \
+  --node-repair-max-parallel-count=2
 ```

-## Further information
+#### Configuration file for parallel limits
+
+```yaml
+managedNodeGroups:
+- name: parallel-ng
+  nodeRepairConfig:
+    enabled: true
+    # Repair at most 15% of unhealthy nodes in parallel
+    maxParallelNodesRepairedPercentage: 15
+    # Alternative: repair at most 2 unhealthy nodes in parallel
+    # maxParallelNodesRepairedCount: 2
+    # Note: Cannot use both percentage and count limits simultaneously
+```
+
+### Custom Repair Overrides
+
+Specify granular overrides for specific repair actions. These overrides control the repair action and the minimum wait time before a node becomes eligible for repair. **If you use overrides, you must specify all of the values for each override.**
+
+```yaml
+managedNodeGroups:
+- name: custom-repair-ng
+  instanceType: g4dn.xlarge # GPU instances
+  nodeRepairConfig:
+    enabled: true
+    maxUnhealthyNodeThresholdPercentage: 25
+    maxParallelNodesRepairedCount: 1
+    nodeRepairConfigOverrides:
+    # Handle GPU-related failures with immediate termination
+    - nodeMonitoringCondition: "AcceleratedInstanceNotReady"
+      nodeUnhealthyReason: "NvidiaXID13Error"
+      minRepairWaitTimeMins: 10
+      repairAction: "Terminate"
+    # Handle network issues with restart after waiting
+    - nodeMonitoringCondition: "NetworkNotReady"
+      nodeUnhealthyReason: "InterfaceNotUp"
+      minRepairWaitTimeMins: 20
+      repairAction: "Restart"
+```
+
+## Complete Configuration Examples
+
+For a comprehensive example with all configuration options, see [examples/44-node-repair.yaml](https://github.com/eksctl-io/eksctl/blob/main/examples/44-node-repair.yaml).
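+
+Any of the example configs below can be sanity-checked before anything is created by rendering it with `--dry-run` (the file name here is illustrative):
+
+```shell
+# Renders the fully-resolved ClusterConfig without creating AWS resources
+$ eksctl create cluster -f node-repair-config.yaml --dry-run
+```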
+
+### Example 1: Basic repair with percentage thresholds
+
+```yaml
+apiVersion: eksctl.io/v1alpha5
+kind: ClusterConfig
+
+metadata:
+  name: basic-repair-cluster
+  region: us-west-2
+
+managedNodeGroups:
+- name: basic-ng
+  instanceType: m5.large
+  desiredCapacity: 3
+  nodeRepairConfig:
+    enabled: true
+    maxUnhealthyNodeThresholdPercentage: 20
+    maxParallelNodesRepairedPercentage: 15
+```
+
+### Example 2: Conservative repair for critical workloads
+
+```yaml
+apiVersion: eksctl.io/v1alpha5
+kind: ClusterConfig
+
+metadata:
+  name: critical-workload-cluster
+  region: us-west-2
+
+managedNodeGroups:
+- name: critical-ng
+  instanceType: c5.2xlarge
+  desiredCapacity: 6
+  nodeRepairConfig:
+    enabled: true
+    # Very conservative settings
+    maxUnhealthyNodeThresholdPercentage: 10
+    maxParallelNodesRepairedCount: 1
+    nodeRepairConfigOverrides:
+    # Wait longer before taking action on critical workloads
+    - nodeMonitoringCondition: "NetworkNotReady"
+      nodeUnhealthyReason: "InterfaceNotUp"
+      minRepairWaitTimeMins: 45
+      repairAction: "Restart"
+```
+
+### Example 3: GPU workload with specialized repair
+
+```yaml
+apiVersion: eksctl.io/v1alpha5
+kind: ClusterConfig
+
+metadata:
+  name: gpu-workload-cluster
+  region: us-west-2
+
+managedNodeGroups:
+- name: gpu-ng
+  instanceType: g4dn.xlarge
+  desiredCapacity: 4
+  nodeRepairConfig:
+    enabled: true
+    maxUnhealthyNodeThresholdPercentage: 25
+    maxParallelNodesRepairedCount: 1
+    nodeRepairConfigOverrides:
+    # GPU failures require immediate termination
+    - nodeMonitoringCondition: "AcceleratedInstanceNotReady"
+      nodeUnhealthyReason: "NvidiaXID13Error"
+      minRepairWaitTimeMins: 5
+      repairAction: "Terminate"
+```
+
+## CLI Reference
+
+### Node Repair Flags
+
+| Flag | Description | Example |
+|------|-------------|---------|
+| `--enable-node-repair` | Enable automatic node repair | `--enable-node-repair` |
+| `--node-repair-max-unhealthy-percentage` | Percentage of unhealthy nodes above which repair actions stop | `--node-repair-max-unhealthy-percentage=20` |
+| `--node-repair-max-unhealthy-count` | Count of unhealthy nodes above which repair actions stop | `--node-repair-max-unhealthy-count=5` |
+| `--node-repair-max-parallel-percentage` | Maximum percentage of unhealthy nodes repaired in parallel | `--node-repair-max-parallel-percentage=15` |
+| `--node-repair-max-parallel-count` | Maximum count of unhealthy nodes repaired in parallel | `--node-repair-max-parallel-count=2` |
+
+**Note:** Node repair config overrides are only supported through YAML configuration files due to their complexity.
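+
+The threshold and parallel-limit flags can be combined in a single invocation. A sketch with illustrative values:
+
+```shell
+# Count-based threshold combined with a count-based parallel limit
+$ eksctl create nodegroup \
+  --cluster=<cluster-name> \
+  --enable-node-repair \
+  --node-repair-max-unhealthy-count=5 \
+  --node-repair-max-parallel-count=2
+```
+
+All of these flags are rejected for unmanaged nodegroups. For example (output abridged; exact formatting may differ):
+
+```shell
+$ eksctl create cluster --managed=false --enable-node-repair --dry-run
+Error: --enable-node-repair is only valid with managed nodegroups (--managed)
+```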
+
+## Configuration Reference
+
+### nodeRepairConfig
+
+| Field | Type | Description | Constraints | Example |
+|-------|------|-------------|-------------|---------|
+| `enabled` | boolean | Enable/disable node repair | - | `true` |
+| `maxUnhealthyNodeThresholdPercentage` | integer | Percentage threshold of unhealthy nodes, above which node auto repair actions will stop | Cannot be used with `maxUnhealthyNodeThresholdCount` | `20` |
+| `maxUnhealthyNodeThresholdCount` | integer | Count threshold of unhealthy nodes, above which node auto repair actions will stop | Cannot be used with `maxUnhealthyNodeThresholdPercentage` | `5` |
+| `maxParallelNodesRepairedPercentage` | integer | Maximum percentage of unhealthy nodes that can be repaired in parallel | Cannot be used with `maxParallelNodesRepairedCount` | `15` |
+| `maxParallelNodesRepairedCount` | integer | Maximum count of unhealthy nodes that can be repaired in parallel | Cannot be used with `maxParallelNodesRepairedPercentage` | `2` |
+| `nodeRepairConfigOverrides` | array | Granular overrides for specific repair actions, controlling the repair action and minimum wait time | All values must be specified for each override | See examples above |
+
+### nodeRepairConfigOverrides
+
+| Field | Type | Description | Valid Values |
+|-------|------|-------------|--------------|
+| `nodeMonitoringCondition` | string | Unhealthy condition reported by the node monitoring agent that this override applies to | `"AcceleratedInstanceNotReady"`, `"NetworkNotReady"` |
+| `nodeUnhealthyReason` | string | Reason reported by the node monitoring agent that this override applies to | `"NvidiaXID13Error"`, `"InterfaceNotUp"` |
+| `minRepairWaitTimeMins` | integer | Minimum time in minutes to wait before attempting to repair a node with the specified condition and reason | Any positive integer |
+| `repairAction` | string | Repair action to take for nodes when all of the specified conditions are met | `"Terminate"`, `"Restart"`, `"NoAction"` |
+
+## Further Information
+
 - [EKS Managed Nodegroup Node Health][eks-user-guide]
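+
+To verify what was actually applied, the stored repair settings can be inspected with the AWS CLI; the `nodeRepairConfig` field below follows the EKS API shape, and the name placeholders are illustrative:
+
+```shell
+# Print the node repair configuration EKS stored for the nodegroup
+$ aws eks describe-nodegroup \
+  --cluster-name <cluster-name> \
+  --nodegroup-name <nodegroup-name> \
+  --query 'nodegroup.nodeRepairConfig'
+```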