80 changes: 76 additions & 4 deletions examples/44-node-repair.yaml
@@ -1,4 +1,5 @@
-# An example ClusterConfig that uses a managed node group with auto repair.
# An example ClusterConfig that demonstrates node repair configuration
# for EKS managed nodegroups with various configuration options.

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
@@ -8,6 +9,77 @@ metadata:
region: us-west-2

managedNodeGroups:
-- name: ng-1
-  nodeRepairConfig:
-    enabled: true
# Example 1: Basic node repair
- name: basic-repair-ng
instanceType: m5.large
desiredCapacity: 3
nodeRepairConfig:
enabled: true

# Example 2: Node repair with percentage-based thresholds
- name: percentage-repair-ng
instanceType: m5.large
desiredCapacity: 3
minSize: 1
maxSize: 5
nodeRepairConfig:
enabled: true
# Stop repair actions when 20% of nodes are unhealthy
maxUnhealthyNodeThresholdPercentage: 20
# Repair at most 15% of unhealthy nodes in parallel
maxParallelNodesRepairedPercentage: 15

# Example 3: Node repair with count-based thresholds
- name: count-repair-ng
instanceType: m5.xlarge
desiredCapacity: 10
minSize: 5
maxSize: 20
nodeRepairConfig:
enabled: true
# Stop repair actions when 3 nodes are unhealthy
maxUnhealthyNodeThresholdCount: 3
# Repair at most 2 unhealthy nodes in parallel
maxParallelNodesRepairedCount: 2

# Example 4: GPU workload with custom repair overrides
- name: gpu-repair-ng
instanceType: g4dn.xlarge
desiredCapacity: 4
minSize: 2
maxSize: 8
nodeRepairConfig:
enabled: true
maxUnhealthyNodeThresholdPercentage: 25
maxParallelNodesRepairedCount: 1
# Custom repair behavior for specific failure scenarios
nodeRepairConfigOverrides:
# Handle GPU-related failures with immediate termination
- nodeMonitoringCondition: "AcceleratedInstanceNotReady"
nodeUnhealthyReason: "NvidiaXID13Error"
minRepairWaitTimeMins: 5
repairAction: "Terminate"
# Handle network issues with restart first
- nodeMonitoringCondition: "NetworkNotReady"
nodeUnhealthyReason: "InterfaceNotUp"
minRepairWaitTimeMins: 15
repairAction: "Restart"

# Example 5: Conservative repair for critical workloads
- name: critical-repair-ng
instanceType: c5.2xlarge
desiredCapacity: 6
minSize: 3
maxSize: 12
nodeRepairConfig:
enabled: true
# Conservative settings - stop repair when only 10% of nodes are unhealthy
maxUnhealthyNodeThresholdPercentage: 10
# Repair only 1 node at a time
maxParallelNodesRepairedCount: 1
nodeRepairConfigOverrides:
# Wait longer before taking action on critical workloads
- nodeMonitoringCondition: "NetworkNotReady"
nodeUnhealthyReason: "InterfaceNotUp"
minRepairWaitTimeMins: 45
repairAction: "Restart"
150 changes: 150 additions & 0 deletions integration/tests/enhanced_node_repair/enhanced_node_repair_test.go
@@ -0,0 +1,150 @@
//go:build integration
// +build integration

package enhanced_node_repair

import (
"fmt"
"os"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

. "github.com/weaveworks/eksctl/integration/runner"
"github.com/weaveworks/eksctl/integration/tests"
"github.com/weaveworks/eksctl/pkg/testutils"
)

var params *tests.Params

func init() {
// Call testing.Init() prior to tests.NewParams(), as otherwise -test.* will not be recognised. See also: https://golang.org/doc/go1.13#testing
testing.Init()
params = tests.NewParamsWithGivenClusterName("enhanced-node-repair", "test-enhanced-node-repair")
}

func TestEnhancedNodeRepair(t *testing.T) {
testutils.RegisterAndRun(t)
}

var _ = Describe("(Integration) Enhanced Node Repair Configuration", func() {

Context("CloudFormation template generation", func() {
It("should generate correct CloudFormation template with CLI flags", func() {
By("testing CLI flags generate correct CloudFormation")
cmd := params.EksctlCreateCmd.WithArgs(
"cluster",
"--name", "test-cli-template",
"--region", params.Region,
"--managed",
"--enable-node-repair",
"--node-repair-max-unhealthy-percentage=25",
"--node-repair-max-parallel-count=2",
"--dry-run",
)
Expect(cmd).To(RunSuccessfully())
})

It("should generate correct CloudFormation template with YAML config", func() {
By("creating temporary config file")
configFile := fmt.Sprintf("/tmp/test-enhanced-node-repair-%d.yaml", time.Now().Unix())
yamlConfig := fmt.Sprintf(`
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
name: test-yaml-template
region: %s

managedNodeGroups:
- name: enhanced-ng
instanceType: t3.medium
desiredCapacity: 2
nodeRepairConfig:
enabled: true
maxUnhealthyNodeThresholdPercentage: 20
maxParallelNodesRepairedPercentage: 15
nodeRepairConfigOverrides:
- nodeMonitoringCondition: "NetworkNotReady"
nodeUnhealthyReason: "InterfaceNotUp"
minRepairWaitTimeMins: 15
repairAction: "Restart"
`, params.Region)

err := os.WriteFile(configFile, []byte(yamlConfig), 0644)
Expect(err).NotTo(HaveOccurred())
defer os.Remove(configFile)

By("testing YAML config generates correct CloudFormation")
cmd := params.EksctlCreateCmd.WithArgs(
"cluster",
"--config-file", configFile,
"--dry-run",
)
Expect(cmd).To(RunSuccessfully())
})

It("should validate backward compatibility with existing config", func() {
By("testing existing node repair config still works")
cmd := params.EksctlCreateCmd.WithArgs(
"cluster",
"--name", "test-backward-compat",
"--region", params.Region,
"--managed",
"--enable-node-repair",
"--dry-run",
)
Expect(cmd).To(RunSuccessfully())
})
})

Context("error handling", func() {
It("should handle invalid CLI flag combinations gracefully", func() {
By("testing with unmanaged nodegroup (should fail)")
cmd := params.EksctlCreateCmd.WithArgs(
"cluster",
"--name", "test-error-handling",
"--region", params.Region,
"--managed=false",
"--enable-node-repair",
"--dry-run",
)
Expect(cmd).NotTo(RunSuccessfully())
Expect(cmd).To(RunWithError(ContainSubstring("only valid with managed nodegroups")))
})

It("should handle invalid YAML configuration gracefully", func() {
By("creating config file with invalid node repair config")
configFile := fmt.Sprintf("/tmp/test-invalid-config-%d.yaml", time.Now().Unix())
invalidConfig := fmt.Sprintf(`
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
name: test-invalid
region: %s

nodeGroups:
- name: unmanaged-ng
instanceType: t3.medium
nodeRepairConfig:
enabled: true
`, params.Region)

err := os.WriteFile(configFile, []byte(invalidConfig), 0644)
Expect(err).NotTo(HaveOccurred())
defer os.Remove(configFile)

By("testing invalid config is rejected")
cmd := params.EksctlCreateCmd.WithArgs(
"cluster",
"--config-file", configFile,
"--dry-run",
)
// This should fail because nodeRepairConfig is not supported for unmanaged nodegroups
Expect(cmd).NotTo(RunSuccessfully())
})
})
})