Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions examples/44-enhanced-node-repair.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# An example ClusterConfig that demonstrates the enhanced node repair configuration
# for EKS managed nodegroups with comprehensive parameters and custom overrides.

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

# Cluster identity: name and region must be nested under `metadata`.
metadata:
  name: enhanced-node-repair-cluster
  region: us-west-2

managedNodeGroups:
# Example 1: Basic node repair with percentage-based thresholds
- name: basic-repair-ng
  instanceType: m5.large
  desiredCapacity: 3
  minSize: 1
  maxSize: 5
  nodeRepairConfig:
    enabled: true
    # Stop repair actions when 20% of nodes are unhealthy
    maxUnhealthyNodeThresholdPercentage: 20
    # Repair at most 15% of unhealthy nodes in parallel
    maxParallelNodesRepairedPercentage: 15

# Example 2: Node repair with count-based thresholds
- name: count-based-repair-ng
  instanceType: m5.xlarge
  desiredCapacity: 10
  minSize: 5
  maxSize: 20
  nodeRepairConfig:
    enabled: true
    # Stop repair actions when 3 nodes are unhealthy
    maxUnhealthyNodeThresholdCount: 3
    # Repair at most 2 unhealthy nodes in parallel
    maxParallelNodesRepairedCount: 2

# Example 3: Comprehensive configuration with custom overrides
- name: comprehensive-repair-ng
  instanceType: g4dn.xlarge  # GPU instance for ML workloads
  desiredCapacity: 4
  minSize: 2
  maxSize: 8
  nodeRepairConfig:
    enabled: true
    # Stop repair actions when 25% of nodes are unhealthy
    maxUnhealthyNodeThresholdPercentage: 25
    # Repair at most 1 unhealthy node at a time to maintain workload availability
    maxParallelNodesRepairedCount: 1
    # Custom repair behavior for specific failure scenarios.
    # repairAction accepts the EKS API values: Replace, Reboot, NoAction.
    nodeRepairConfigOverrides:
    # GPU XID 13 error: replace the node after a 5-minute wait
    - nodeMonitoringCondition: "AcceleratedInstanceNotReady"
      nodeUnhealthyReason: "NvidiaXID13Error"
      minRepairWaitTimeMins: 5
      repairAction: "Replace"
    # Network interface down: reboot the node after a 15-minute wait
    - nodeMonitoringCondition: "NetworkNotReady"
      nodeUnhealthyReason: "InterfaceNotUp"
      minRepairWaitTimeMins: 15
      repairAction: "Reboot"

# Example 4: Conservative repair settings for critical workloads
- name: critical-workload-ng
  instanceType: c5.2xlarge
  desiredCapacity: 6
  minSize: 3
  maxSize: 12
  nodeRepairConfig:
    enabled: true
    # Very conservative settings - stop repair actions when only 10% of nodes are unhealthy
    maxUnhealthyNodeThresholdPercentage: 10
    # Repair only 1 unhealthy node at a time for critical workloads
    maxParallelNodesRepairedCount: 1
    # For critical workloads, wait longer before taking action.
    # repairAction accepts the EKS API values: Replace, Reboot, NoAction.
    nodeRepairConfigOverrides:
    # NOTE(review): c5 instances have no NVIDIA GPU, so this override is
    # illustrative only - confirm whether it should stay on this nodegroup.
    - nodeMonitoringCondition: "AcceleratedInstanceNotReady"
      nodeUnhealthyReason: "NvidiaXID13Error"
      minRepairWaitTimeMins: 30
      repairAction: "Replace"
    # Network interface down: reboot the node after a 45-minute wait
    - nodeMonitoringCondition: "NetworkNotReady"
      nodeUnhealthyReason: "InterfaceNotUp"
      minRepairWaitTimeMins: 45
      repairAction: "Reboot"

# Example 5: Disabled node repair (for comparison)
- name: no-repair-ng
  instanceType: t3.medium
  desiredCapacity: 2
  minSize: 1
  maxSize: 4
  nodeRepairConfig:
    enabled: false

# Additional cluster configuration
vpc:
  cidr: "10.0.0.0/16"
  autoAllocateIPv6: false

# Enable logging for monitoring node repair activities
cloudWatch:
  clusterLogging:
    enableTypes:
    - "api"
    - "audit"
    - "authenticator"
    - "controllerManager"
    - "scheduler"
46 changes: 23 additions & 23 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,23 @@ require (
github.com/Masterminds/semver/v3 v3.4.0
github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2
github.com/aws/aws-sdk-go v1.55.7
github.com/aws/aws-sdk-go-v2 v1.38.3
github.com/aws/aws-sdk-go-v2 v1.39.0
github.com/aws/aws-sdk-go-v2/config v1.29.14
github.com/aws/aws-sdk-go-v2/credentials v1.17.67
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.58.2
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.0
github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.2
github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.2
github.com/aws/aws-sdk-go-v2/credentials v1.18.11
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.0
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.1
github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.3
github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.3
github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3
github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1
github.com/aws/aws-sdk-go-v2/service/eks v1.73.1
github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.2
github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.2
github.com/aws/aws-sdk-go-v2/service/iam v1.47.3
github.com/aws/aws-sdk-go-v2/service/eks v1.74.0
github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.3
github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.3
github.com/aws/aws-sdk-go-v2/service/iam v1.47.4
github.com/aws/aws-sdk-go-v2/service/kms v1.38.3
github.com/aws/aws-sdk-go-v2/service/outposts v1.56.2
github.com/aws/aws-sdk-go-v2/service/ssm v1.64.2
github.com/aws/aws-sdk-go-v2/service/sts v1.33.19
github.com/aws/aws-sdk-go-v2/service/outposts v1.56.3
github.com/aws/aws-sdk-go-v2/service/ssm v1.64.3
github.com/aws/aws-sdk-go-v2/service/sts v1.38.3
github.com/aws/smithy-go v1.23.0
github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20250219002025-c3b5cd3d2fd9
github.com/benjamintf1/unmarshalledmatchers v1.0.0
Expand Down Expand Up @@ -129,22 +129,22 @@ require (
github.com/ashanbrown/makezero v1.2.0 // indirect
github.com/atotto/clipboard v0.1.4 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.7 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.7 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.7 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.33 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.7 // indirect
github.com/aws/aws-sdk-go-v2/service/eventbridge v1.36.12 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.1 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.14 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.7 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.7 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.7 // indirect
github.com/aws/aws-sdk-go-v2/service/pricing v1.32.17 // indirect
github.com/aws/aws-sdk-go-v2/service/route53 v1.48.8 // indirect
github.com/aws/aws-sdk-go-v2/service/s3 v1.77.1 // indirect
github.com/aws/aws-sdk-go-v2/service/sqs v1.37.15 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.29.2 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.3 // indirect
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bkielbasa/cyclop v1.2.3 // indirect
Expand Down
Loading
Loading