From 52e2fb13b154ac1967a8315da5277d985a89f5c5 Mon Sep 17 00:00:00 2001
From: sapphirew
Date: Wed, 10 Sep 2025 15:06:31 -0700
Subject: [PATCH 1/4] feat: Add enhanced node repair configuration support

This commit implements comprehensive enhanced node repair configuration
for EKS managed nodegroups with the following features:

- Support for percentage- and count-based unhealthy node thresholds
- Configurable parallel repair limits (percentage and count)
- Advanced node repair config overrides for specific conditions
- Full CLI flag support for all new parameters
- Complete YAML configuration file support
- Backward compatibility with existing configurations

Key changes:

- Extended API types with new NodeRepairConfigOverride struct
- Added CLI flags for all new parameters
- Updated CloudFormation builder for AWS EKS integration
- Comprehensive unit and integration tests
- Updated documentation and examples
- Enhanced JSON schema validation

CLI Examples:

  eksctl create cluster --enable-node-repair --node-repair-max-unhealthy-percentage=25
  eksctl create nodegroup --enable-node-repair --node-repair-max-parallel-count=2

Config Examples:

  nodeRepairConfig:
    enabled: true
    maxUnhealthyNodeThresholdPercentage: 20
    maxParallelNodesRepairedCount: 2
    nodeRepairConfigOverrides:
    - nodeMonitoringCondition: NetworkNotReady
      nodeUnhealthyReason: InterfaceNotUp
      repairAction: Restart
      minRepairWaitTimeMins: 15
---
 examples/44-enhanced-node-repair.yaml         | 102 ++++
 go.mod                                        | 114 ++++-
 go.sum                                        |  68 ---
 .../enhanced_node_repair_test.go              | 150 ++++++
 .../tests/enhanced_node_repair_test.go        | 474 ++++++++++++++++++
 .../eksctl.io/v1alpha5/assets/schema.json     |  68 ++-
 pkg/apis/eksctl.io/v1alpha5/types.go          |  35 ++
 pkg/apis/eksctl.io/v1alpha5/types_test.go     | 171 +++++++
 pkg/cfn/builder/managed_nodegroup.go          |  32 ++
 pkg/cfn/builder/managed_nodegroup_test.go     | 230 +++++++++
 .../launch_template/enhanced-node-repair.json | 192 +++++++
 pkg/ctl/cmdutils/configfile.go                |  23 +-
 pkg/ctl/cmdutils/configfile_test.go           | 158 ++++++
 pkg/ctl/cmdutils/create_cluster.go            |   6 +
 pkg/ctl/cmdutils/nodegroup_flags.go           |   7 +
 .../eks/aws-eks-nodegroup_noderepairconfig.go |  70 +++
 .../src/usage/nodegroup-node-repair-config.md | 264 +++++++++-
 17 files changed, 2058 insertions(+), 106 deletions(-)
 create mode 100644 examples/44-enhanced-node-repair.yaml
 create mode 100644 integration/tests/enhanced_node_repair/enhanced_node_repair_test.go
 create mode 100644 integration/tests/enhanced_node_repair_test.go
 create mode 100644 pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json

diff --git a/examples/44-enhanced-node-repair.yaml b/examples/44-enhanced-node-repair.yaml
new file mode 100644
index 0000000000..55186b7399
--- /dev/null
+++ b/examples/44-enhanced-node-repair.yaml
@@ -0,0 +1,102 @@
+# An example ClusterConfig that demonstrates the enhanced node repair configuration
+# for EKS managed nodegroups with comprehensive parameters and custom overrides.
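+#
+# A minimal way to preview what this file produces without creating any
+# resources (a sketch; assumes eksctl is run from the repository root):
+#
+#   eksctl create cluster -f examples/44-enhanced-node-repair.yaml --dry-run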
+ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: enhanced-node-repair-cluster + region: us-west-2 + +managedNodeGroups: + # Example 1: Basic node repair with percentage-based thresholds + - name: basic-repair-ng + instanceType: m5.large + desiredCapacity: 3 + minSize: 1 + maxSize: 5 + nodeRepairConfig: + enabled: true + # Trigger repair when 20% of nodes are unhealthy + maxUnhealthyNodeThresholdPercentage: 20 + # Repair at most 15% of nodes in parallel + maxParallelNodesRepairedPercentage: 15 + + # Example 2: Node repair with count-based thresholds + - name: count-based-repair-ng + instanceType: m5.xlarge + desiredCapacity: 10 + minSize: 5 + maxSize: 20 + nodeRepairConfig: + enabled: true + # Trigger repair when 3 nodes are unhealthy + maxUnhealthyNodeThresholdCount: 3 + # Repair at most 2 nodes in parallel + maxParallelNodesRepairedCount: 2 + + # Example 3: Comprehensive configuration with custom overrides + - name: comprehensive-repair-ng + instanceType: g4dn.xlarge # GPU instance for ML workloads + desiredCapacity: 4 + minSize: 2 + maxSize: 8 + nodeRepairConfig: + enabled: true + # Use percentage-based threshold for this larger nodegroup + maxUnhealthyNodeThresholdPercentage: 25 + # Limit parallel repairs to maintain workload availability + maxParallelNodesRepairedCount: 1 + # Custom repair behavior for specific failure scenarios + nodeRepairConfigOverrides: + # Handle GPU-related failures with immediate termination + - nodeMonitoringCondition: "AcceleratedInstanceNotReady" + nodeUnhealthyReason: "NvidiaXID13Error" + minRepairWaitTimeMins: 5 + repairAction: "Terminate" + # Handle network issues with restart first, then terminate + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 15 + repairAction: "Restart" + + # Example 4: Conservative repair settings for critical workloads + - name: critical-workload-ng + instanceType: c5.2xlarge + desiredCapacity: 6 + minSize: 3 + maxSize: 12 + nodeRepairConfig: + enabled: true + # Very conservative thresholds for critical workloads + maxUnhealthyNodeThresholdPercentage: 10 + maxParallelNodesRepairedCount: 1 + nodeRepairConfigOverrides: + # For critical workloads, wait longer before taking action + - nodeMonitoringCondition: "AcceleratedInstanceNotReady" + nodeUnhealthyReason: "NvidiaXID13Error" + minRepairWaitTimeMins: 30 + repairAction: "Terminate" + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 45 + repairAction: "Restart" + + # Example 5: Disabled node repair (for comparison) + - name: no-repair-ng + instanceType: t3.medium + desiredCapacity: 2 + minSize: 1 + maxSize: 4 + nodeRepairConfig: + enabled: false + +# Additional cluster configuration +vpc: + cidr: "10.0.0.0/16" + autoAllocateIPv6: false + +# Enable logging for monitoring node repair activities +cloudWatch: + clusterLogging: + enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"] \ No newline at end of file diff --git a/go.mod b/go.mod index 119a409db5..93c1ca2f2d 100644 --- a/go.mod +++ b/go.mod @@ -8,23 +8,23 @@ require ( github.com/Masterminds/semver/v3 v3.4.0 github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2 github.com/aws/aws-sdk-go v1.55.7 - github.com/aws/aws-sdk-go-v2 v1.38.3 + github.com/aws/aws-sdk-go-v2 v1.39.0 github.com/aws/aws-sdk-go-v2/config v1.29.14 - github.com/aws/aws-sdk-go-v2/credentials v1.17.67 - github.com/aws/aws-sdk-go-v2/service/autoscaling v1.58.2 - 
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.0 - github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.2 - github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.2 + github.com/aws/aws-sdk-go-v2/credentials v1.18.11 + github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.0 + github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.1 + github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.3 + github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.3 github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3 github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1 - github.com/aws/aws-sdk-go-v2/service/eks v1.73.1 - github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.2 - github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.2 - github.com/aws/aws-sdk-go-v2/service/iam v1.47.3 + github.com/aws/aws-sdk-go-v2/service/eks v1.73.2 + github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.3 + github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.3 + github.com/aws/aws-sdk-go-v2/service/iam v1.47.4 github.com/aws/aws-sdk-go-v2/service/kms v1.38.3 - github.com/aws/aws-sdk-go-v2/service/outposts v1.56.2 - github.com/aws/aws-sdk-go-v2/service/ssm v1.64.2 - github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 + github.com/aws/aws-sdk-go-v2/service/outposts v1.56.3 + github.com/aws/aws-sdk-go-v2/service/ssm v1.64.3 + github.com/aws/aws-sdk-go-v2/service/sts v1.38.3 github.com/aws/smithy-go v1.23.0 github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20250219002025-c3b5cd3d2fd9 github.com/benjamintf1/unmarshalledmatchers v1.0.0 @@ -129,22 +129,22 @@ require ( github.com/ashanbrown/makezero v1.2.0 // indirect github.com/atotto/clipboard v0.1.4 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.7 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.7 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.7 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.33 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.7 // indirect github.com/aws/aws-sdk-go-v2/service/eventbridge v1.36.12 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.1 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.14 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.7 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.7 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.7 // indirect github.com/aws/aws-sdk-go-v2/service/pricing v1.32.17 // indirect github.com/aws/aws-sdk-go-v2/service/route53 v1.48.8 // indirect github.com/aws/aws-sdk-go-v2/service/s3 v1.77.1 // indirect github.com/aws/aws-sdk-go-v2/service/sqs v1.37.15 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 // indirect + github.com/aws/aws-sdk-go-v2/service/sso 
v1.29.2 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.3 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bkielbasa/cyclop v1.2.3 // indirect @@ -478,3 +478,71 @@ replace ( k8s.io/sample-cli-plugin => k8s.io/sample-cli-plugin v0.32.3 k8s.io/sample-controller => k8s.io/sample-controller v0.32.3 ) + +replace github.com/aws/aws-sdk-go-v2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2 + +replace github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/aws/protocol/eventstream + +replace github.com/aws/aws-sdk-go-v2/config => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/config + +replace github.com/aws/aws-sdk-go-v2/credentials => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/credentials + +replace github.com/aws/aws-sdk-go-v2/feature/ec2/imds => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/feature/ec2/imds + +replace github.com/aws/aws-sdk-go-v2/internal/configsources => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/configsources + +replace github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/endpoints/v2 + +replace github.com/aws/aws-sdk-go-v2/internal/ini => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/ini + +replace github.com/aws/aws-sdk-go-v2/internal/v4a => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/v4a + +replace github.com/aws/aws-sdk-go-v2/service/autoscaling => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/autoscaling + +replace github.com/aws/aws-sdk-go-v2/service/cloudformation => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cloudformation + +replace github.com/aws/aws-sdk-go-v2/service/cloudtrail => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cloudtrail + +replace github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cloudwatchlogs + +replace github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cognitoidentityprovider + +replace github.com/aws/aws-sdk-go-v2/service/ec2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/ec2 + +replace github.com/aws/aws-sdk-go-v2/service/eks => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/eks + +replace github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/elasticloadbalancing + +replace github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/elasticloadbalancingv2 + +replace github.com/aws/aws-sdk-go-v2/service/eventbridge => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/eventbridge + +replace github.com/aws/aws-sdk-go-v2/service/iam => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/iam + +replace github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/accept-encoding + +replace github.com/aws/aws-sdk-go-v2/service/internal/checksum => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/checksum + +replace github.com/aws/aws-sdk-go-v2/service/internal/presigned-url => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/presigned-url + +replace github.com/aws/aws-sdk-go-v2/service/internal/s3shared => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/s3shared + +replace github.com/aws/aws-sdk-go-v2/service/kms => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/kms + 
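+// NOTE: the replace directives in this block resolve every aws-sdk-go-v2
+// module from a local checkout (absolute paths on a single machine) rather
+// than the published releases pinned in the require block above.
+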
+replace github.com/aws/aws-sdk-go-v2/service/outposts => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/outposts + +replace github.com/aws/aws-sdk-go-v2/service/pricing => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/pricing + +replace github.com/aws/aws-sdk-go-v2/service/route53 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/route53 + +replace github.com/aws/aws-sdk-go-v2/service/s3 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/s3 + +replace github.com/aws/aws-sdk-go-v2/service/sqs => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/sqs + +replace github.com/aws/aws-sdk-go-v2/service/ssm => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/ssm + +replace github.com/aws/aws-sdk-go-v2/service/sso => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/sso + +replace github.com/aws/aws-sdk-go-v2/service/ssooidc => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/ssooidc + +replace github.com/aws/aws-sdk-go-v2/service/sts => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/sts diff --git a/go.sum b/go.sum index 7e119bbdd2..7e30affd59 100644 --- a/go.sum +++ b/go.sum @@ -106,74 +106,6 @@ github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b4 github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2/go.mod h1:RU/lVVsYHNN7Bwr2UmCw5z2aWPcNIHADY49bj082oYM= github.com/aws/aws-sdk-go v1.55.7 h1:UJrkFq7es5CShfBwlWAC8DA077vp8PyVbQd3lqLiztE= github.com/aws/aws-sdk-go v1.55.7/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= -github.com/aws/aws-sdk-go-v2 v1.38.3 h1:B6cV4oxnMs45fql4yRH+/Po/YU+597zgWqvDpYMturk= -github.com/aws/aws-sdk-go-v2 v1.38.3/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 h1:i8p8P4diljCr60PpJp6qZXNlgX4m2yQFpYk+9ZT+J4E= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1/go.mod h1:ddqbooRZYNoJ2dsTwOty16rM+/Aqmk/GOXrK8cg7V00= -github.com/aws/aws-sdk-go-v2/config v1.29.14 h1:f+eEi/2cKCg9pqKBoAIwRGzVb70MRKqWX4dg1BDcSJM= -github.com/aws/aws-sdk-go-v2/config v1.29.14/go.mod h1:wVPHWcIFv3WO89w0rE10gzf17ZYy+UVS1Geq8Iei34g= -github.com/aws/aws-sdk-go-v2/credentials v1.17.67 h1:9KxtdcIA/5xPNQyZRgUSpYOE6j9Bc4+D7nZua0KGYOM= -github.com/aws/aws-sdk-go-v2/credentials v1.17.67/go.mod h1:p3C44m+cfnbv763s52gCqrjaqyPikj9Sg47kUVaNZQQ= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 h1:uF68eJA6+S9iVr9WgX1NaRGyQ/6MdIyc4JNUo6TN1FA= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6/go.mod h1:qlPeVZCGPiobx8wb1ft0GHT5l+dc6ldnwInDFaMvC7Y= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 h1:pa1DEC6JoI0zduhZePp3zmhWvk/xxm4NB8Hy/Tlsgos= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6/go.mod h1:gxEjPebnhWGJoaDdtDkA0JX46VRg1wcTHYe63OfX5pE= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.33 h1:/frG8aV09yhCVSOEC2pzktflJJO48NwY3xntHBwxHiA= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.33/go.mod h1:8vwASlAcV366M+qxZnjNzCjeastk1Rt1bpSRaGZanGU= -github.com/aws/aws-sdk-go-v2/service/autoscaling v1.58.2 h1:z6A7RKbrhDiVp5wlV/MgZ03uOv//yLM228nY9Clw2Ds= 
-github.com/aws/aws-sdk-go-v2/service/autoscaling v1.58.2/go.mod h1:ailCQb+KhHZcMFd/VstivWtcNizcI5lpHxzbk6FI2dM= -github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.0 h1:zDKnCvsZ21fO1oCx1Dj+QofcU2MABkM9gdb1278an+Y= -github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.0/go.mod h1:wkKFqGoZf9Asi1eKuWbz7SEx0RtCq4+drWwHKzizP9o= -github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.2 h1:dlGpx2aVrU8Kjksdo0H9JqC0DrDOctTsLsbOivy722s= -github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.2/go.mod h1:jl4HqKy8wA2nlM/K0X4evl9CIPtXBlBIk5CJFKQqGms= -github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.2 h1:TSNLZXt7ipIV+Q+GZAQ8dUxYUDsMX2/Atrn/YuPF3zI= -github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.2/go.mod h1:mSt0uBAxUj2dnagbjc7p+Jh68SSwgDTNzMKUjchDiOY= -github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3 h1:4U9dpQZTvJ0Mi1qn8L1hRJ4igFCQYEjwUuOmYkWM5tE= -github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3/go.mod h1:ygltZT++6Wn2uG4+tqE0NW1MkdEtb5W2O/CFc0xJX/g= -github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1 h1:+4A9SDduLZFlDeXWRmfQ6r8kyEJZQfK6lcg+KwdvWrI= -github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1/go.mod h1:ouvGEfHbLaIlWwpDpOVWPWR+YwO0HDv3vm5tYLq8ImY= -github.com/aws/aws-sdk-go-v2/service/eks v1.73.1 h1:Txq5jxY/ao+2Vx/kX9+65WTqkzCnxSlXnwIj+Cr/fng= -github.com/aws/aws-sdk-go-v2/service/eks v1.73.1/go.mod h1:+hYFg3laewH0YCfJRv+o5R3bradDKmFIm/uaiaD1U7U= -github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.2 h1:L71eE74x591WhKIfNvjc+qE6WcON6bb1KiDwymf0dVk= -github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.2/go.mod h1:RpnUpPKuFd5ViTyiAlru/sygnOJ4tw7elKm7lyQerfk= -github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.2 h1:5NXAi1QdV3DT6nyASf94+E3i1jZ6zuFPb0lM0yDSMvw= -github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.2/go.mod h1:Gk3uvYKSpiM998WXOreKOsk6PHkkIMp1xGgYFDlZLAA= -github.com/aws/aws-sdk-go-v2/service/eventbridge v1.36.12 h1:uH6GOnGSvVN9MCk6o3+HvZFpdqL7AzJKNOTM/6l+3/s= -github.com/aws/aws-sdk-go-v2/service/eventbridge v1.36.12/go.mod h1:6qtp53AQg7KEeYrsp430PNlmVVO9qK0Xw8nddE1y+ow= -github.com/aws/aws-sdk-go-v2/service/iam v1.47.3 h1:BDkM6KWoryEstnb0fTg5Ip+WsxAph/aCNqwws/sS5yE= -github.com/aws/aws-sdk-go-v2/service/iam v1.47.3/go.mod h1:5q4IwllQ9vIoq7bk8dPvPbT3LQCky+4NgV7vKwAbaEs= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.1 h1:7SuukGpyIgF5EiAbf1dZRxP+xSnY1WjiHBjL08fjJeE= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.1/go.mod h1:k+Vce/8R28tSozjdWphkrNhK8zLmdS9RgiDNZl6p8Rw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.14 h1:fgdkfsxTehqPcIQa24G/Omwv9RocTq2UcONNX/OnrZI= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.14/go.mod h1:wMxQ3OE8fiM8z2YRAeb2J8DLTTWMvRyYYuQOs26AbTQ= -github.com/aws/aws-sdk-go-v2/service/kms v1.38.3 h1:RivOtUH3eEu6SWnUMFHKAW4MqDOzWn1vGQ3S38Y5QMg= -github.com/aws/aws-sdk-go-v2/service/kms v1.38.3/go.mod h1:cQn6tAF77Di6m4huxovNM7NVAozWTZLsDRp9t8Z/WYk= -github.com/aws/aws-sdk-go-v2/service/outposts v1.56.2 
h1:VwNFKyLlOtvXBPLja/Q8hs1o5vtLn+y9L5VjCwGh8rQ= -github.com/aws/aws-sdk-go-v2/service/outposts v1.56.2/go.mod h1:m6oPE9UnVlxKJl3Z05N23MHUCy25Cu5zzU62QQqZRvE= -github.com/aws/aws-sdk-go-v2/service/pricing v1.32.17 h1:EtZFyL/uhaXlHjIwHW0KSJvppg+Ie1fzQ3wEXLEUj0I= -github.com/aws/aws-sdk-go-v2/service/pricing v1.32.17/go.mod h1:l7bufyRvU+8mY0Z1BNWbWvjr59dlj9YrLKmeiz5CJ30= -github.com/aws/aws-sdk-go-v2/service/route53 v1.48.8 h1:abeu0IVRqYXSts7Tl1Yoi/BxC59xdXYX0uVSN0fbPOk= -github.com/aws/aws-sdk-go-v2/service/route53 v1.48.8/go.mod h1:bOsuAIYHQbL+AqCldJ286MeljQL1sjUVGlpz9JMxCRM= -github.com/aws/aws-sdk-go-v2/service/s3 v1.77.1 h1:5bI9tJL2Z0FGFtp/LPDv0eyliFBHCn7LAhqpQuL+7kk= -github.com/aws/aws-sdk-go-v2/service/s3 v1.77.1/go.mod h1:njj3tSJONkfdLt4y6X8pyqeM6sJLNZxmzctKKV+n1GM= -github.com/aws/aws-sdk-go-v2/service/sqs v1.37.15 h1:KRXf9/NWjoRgj2WJbX13GNjBPQ1SxUYLnIfXTz08mWs= -github.com/aws/aws-sdk-go-v2/service/sqs v1.37.15/go.mod h1:1CY54O4jz8BzgH2d6KyrzKWr2bAoqKsqUv2YZUGwMLE= -github.com/aws/aws-sdk-go-v2/service/ssm v1.64.2 h1:6P4W42RUTZixRG6TgfRB8KlsqNzHtvBhs6sTbkVPZvk= -github.com/aws/aws-sdk-go-v2/service/ssm v1.64.2/go.mod h1:wtxdacy3oO5sHO03uOtk8HMGfgo1gBHKwuJdYM220i0= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 h1:1Gw+9ajCV1jogloEv1RRnvfRFia2cL6c9cuKV2Ps+G8= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.3/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 h1:hXmVKytPfTy5axZ+fYbR5d0cFmC3JvwLm5kM83luako= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 h1:1XuUZ8mYJw9B6lzAkXhqHlJd/XvaX32evhproijJEZY= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.19/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4= github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE= github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20250219002025-c3b5cd3d2fd9 h1:fopAsvwV3w+MGIX3oWqq0YmwL+lr/ik+wXt0gyDaEEY= diff --git a/integration/tests/enhanced_node_repair/enhanced_node_repair_test.go b/integration/tests/enhanced_node_repair/enhanced_node_repair_test.go new file mode 100644 index 0000000000..64ffa4fee4 --- /dev/null +++ b/integration/tests/enhanced_node_repair/enhanced_node_repair_test.go @@ -0,0 +1,150 @@ +//go:build integration +// +build integration + +package enhanced_node_repair + +import ( + "fmt" + "os" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + . "github.com/weaveworks/eksctl/integration/runner" + "github.com/weaveworks/eksctl/integration/tests" + "github.com/weaveworks/eksctl/pkg/testutils" +) + +var params *tests.Params + +func init() { + // Call testing.Init() prior to tests.NewParams(), as otherwise -test.* will not be recognised. 
See also: https://golang.org/doc/go1.13#testing + testing.Init() + params = tests.NewParamsWithGivenClusterName("enhanced-node-repair", "test-enhanced-node-repair") +} + +func TestEnhancedNodeRepair(t *testing.T) { + testutils.RegisterAndRun(t) +} + +var _ = Describe("(Integration) Enhanced Node Repair Configuration", func() { + + Context("CloudFormation template generation", func() { + It("should generate correct CloudFormation template with CLI flags", func() { + By("testing CLI flags generate correct CloudFormation") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--name", "test-cli-template", + "--region", params.Region, + "--managed", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=25", + "--node-repair-max-parallel-count=2", + "--dry-run", + ) + Expect(cmd).To(RunSuccessfully()) + }) + + It("should generate correct CloudFormation template with YAML config", func() { + By("creating temporary config file") + configFile := fmt.Sprintf("/tmp/test-enhanced-node-repair-%d.yaml", time.Now().Unix()) + yamlConfig := fmt.Sprintf(` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: test-yaml-template + region: %s + +managedNodeGroups: +- name: enhanced-ng + instanceType: t3.medium + desiredCapacity: 2 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 20 + maxParallelNodesRepairedPercentage: 15 + nodeRepairConfigOverrides: + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 15 + repairAction: "Restart" +`, params.Region) + + err := os.WriteFile(configFile, []byte(yamlConfig), 0644) + Expect(err).NotTo(HaveOccurred()) + defer os.Remove(configFile) + + By("testing YAML config generates correct CloudFormation") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--config-file", configFile, + "--dry-run", + ) + Expect(cmd).To(RunSuccessfully()) + }) + + It("should validate backward compatibility with existing config", func() { + By("testing existing node repair config still works") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--name", "test-backward-compat", + "--region", params.Region, + "--managed", + "--enable-node-repair", + "--dry-run", + ) + Expect(cmd).To(RunSuccessfully()) + }) + }) + + Context("error handling", func() { + It("should handle invalid CLI flag combinations gracefully", func() { + By("testing with unmanaged nodegroup (should fail)") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--name", "test-error-handling", + "--region", params.Region, + "--managed=false", + "--enable-node-repair", + "--dry-run", + ) + Expect(cmd).NotTo(RunSuccessfully()) + Expect(cmd).To(RunWithError(ContainSubstring("only valid with managed nodegroups"))) + }) + + It("should handle invalid YAML configuration gracefully", func() { + By("creating config file with invalid node repair config") + configFile := fmt.Sprintf("/tmp/test-invalid-config-%d.yaml", time.Now().Unix()) + invalidConfig := fmt.Sprintf(` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: test-invalid + region: %s + +nodeGroups: +- name: unmanaged-ng + instanceType: t3.medium + nodeRepairConfig: + enabled: true +`, params.Region) + + err := os.WriteFile(configFile, []byte(invalidConfig), 0644) + Expect(err).NotTo(HaveOccurred()) + defer os.Remove(configFile) + + By("testing invalid config is rejected") + cmd := params.EksctlCreateCmd.WithArgs( + "cluster", + "--config-file", configFile, + "--dry-run", + ) + // This should fail because nodeRepairConfig is 
not supported for unmanaged nodegroups
+			Expect(cmd).NotTo(RunSuccessfully())
+		})
+	})
+})
\ No newline at end of file
diff --git a/integration/tests/enhanced_node_repair_test.go b/integration/tests/enhanced_node_repair_test.go
new file mode 100644
index 0000000000..c2297ce950
--- /dev/null
+++ b/integration/tests/enhanced_node_repair_test.go
@@ -0,0 +1,474 @@
+//go:build integration
+// +build integration
+
+package tests_test
+
+import (
+	"bytes"
+	"os"
+	"os/exec"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestEnhancedNodeRepairCLIFlags tests that the enhanced node repair CLI flags are properly parsed
+func TestEnhancedNodeRepairCLIFlags(t *testing.T) {
+	tests := []struct {
+		name     string
+		args     []string
+		expected []string // Expected strings in the dry-run output
+	}{
+		{
+			name: "basic node repair",
+			args: []string{
+				"create", "cluster",
+				"--name", "test-basic-repair",
+				"--enable-node-repair",
+				"--dry-run",
+			},
+			expected: []string{
+				"NodeRepairConfig",
+				"Enabled\": true",
+			},
+		},
+		{
+			name: "node repair with percentage thresholds",
+			args: []string{
+				"create", "cluster",
+				"--name", "test-percentage-repair",
+				"--enable-node-repair",
+				"--node-repair-max-unhealthy-percentage=25",
+				"--node-repair-max-parallel-percentage=20",
+				"--dry-run",
+			},
+			expected: []string{
+				"NodeRepairConfig",
+				"Enabled\": true",
+				"MaxUnhealthyNodeThresholdPercentage\": 25",
+				"MaxParallelNodesRepairedPercentage\": 20",
+			},
+		},
+		{
+			name: "node repair with count thresholds",
+			args: []string{
+				"create", "cluster",
+				"--name", "test-count-repair",
+				"--enable-node-repair",
+				"--node-repair-max-unhealthy-count=5",
+				"--node-repair-max-parallel-count=2",
+				"--dry-run",
+			},
+			expected: []string{
+				"NodeRepairConfig",
+				"Enabled\": true",
+				"MaxUnhealthyNodeThresholdCount\": 5",
+				"MaxParallelNodesRepairedCount\": 2",
+			},
+		},
+		{
+			name: "nodegroup with node repair flags",
+			args: []string{
+				"create", "nodegroup",
+				"--cluster", "existing-cluster",
+				"--name", "test-ng-repair",
+				"--enable-node-repair",
+				"--node-repair-max-unhealthy-percentage=30",
+				"--node-repair-max-parallel-count=1",
+				"--dry-run",
+			},
+			expected: []string{
+				"NodeRepairConfig",
+				"Enabled\": true",
+				"MaxUnhealthyNodeThresholdPercentage\": 30",
+				"MaxParallelNodesRepairedCount\": 1",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Run eksctl with the test arguments; the binary sits two
+			// directories up, matching the other tests in this file.
+			cmd := exec.Command("../../eksctl", tt.args...)
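+			// The dry-run output may land on either stream, so the assertions
+			// below search stdout and stderr combined.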
+			var stdout, stderr bytes.Buffer
+			cmd.Stdout = &stdout
+			cmd.Stderr = &stderr
+
+			err := cmd.Run()
+			if err != nil {
+				// For dry-run, we expect it to fail due to missing AWS credentials or cluster
+				// but we should still get the configuration output
+				t.Logf("Command failed as expected (dry-run): %v", err)
+				t.Logf("Stderr: %s", stderr.String())
+			}
+
+			output := stdout.String() + stderr.String()
+			t.Logf("Full output: %s", output)
+
+			// Check that all expected strings are present in the output
+			for _, expected := range tt.expected {
+				assert.Contains(t, output, expected, "Expected string not found in output: %s", expected)
+			}
+		})
+	}
+}
+
+// TestEnhancedNodeRepairConfigFile tests that enhanced node repair configuration files are properly parsed
+func TestEnhancedNodeRepairConfigFile(t *testing.T) {
+	tests := []struct {
+		name       string
+		configFile string
+		expected   []string
+	}{
+		{
+			name:       "basic config file",
+			configFile: "../../examples/44-node-repair.yaml",
+			expected: []string{
+				"NodeRepairConfig",
+				"Enabled\": true",
+			},
+		},
+		{
+			name:       "enhanced config file",
+			configFile: "../../examples/44-enhanced-node-repair.yaml",
+			expected: []string{
+				"NodeRepairConfig",
+				"maxUnhealthyNodeThresholdPercentage",
+				"maxParallelNodesRepairedPercentage",
+				"nodeRepairConfigOverrides",
+				"AcceleratedInstanceNotReady",
+				"NvidiaXID13Error",
+				"NetworkNotReady",
+				"InterfaceNotUp",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Run eksctl with the config file
+			cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", tt.configFile, "--dry-run")
+			var stdout, stderr bytes.Buffer
+			cmd.Stdout = &stdout
+			cmd.Stderr = &stderr
+
+			err := cmd.Run()
+			if err != nil {
+				// For dry-run, we expect it to fail due to missing AWS credentials
+				// but we should still get the configuration output
+				t.Logf("Command failed as expected (dry-run): %v", err)
+			}
+
+			output := stdout.String() + stderr.String()
+			t.Logf("Full output: %s", output)
+
+			// Check that all expected strings are present in the output
+			for _, expected := range tt.expected {
+				assert.Contains(t, output, expected, "Expected string not found in output: %s", expected)
+			}
+		})
+	}
+}
+
+// TestEnhancedNodeRepairCLIHelp tests that the CLI help includes the new flags
+func TestEnhancedNodeRepairCLIHelp(t *testing.T) {
+	cmd := exec.Command("../../eksctl", "create", "cluster", "--help")
+	var stdout bytes.Buffer
+	cmd.Stdout = &stdout
+
+	err := cmd.Run()
+	require.NoError(t, err, "Help command should not fail")
+
+	output := stdout.String()
+
+	// Check that all new flags are documented in help
+	expectedFlags := []string{
+		"--enable-node-repair",
+		"--node-repair-max-unhealthy-percentage",
+		"--node-repair-max-unhealthy-count",
+		"--node-repair-max-parallel-percentage",
+		"--node-repair-max-parallel-count",
+	}
+
+	for _, flag := range expectedFlags {
+		assert.Contains(t, output, flag, "Flag not found in help output: %s", flag)
+	}
+
+	// Check that flags have proper descriptions
+	assert.Contains(t, output, "managed nodegroups only", "Flags should indicate they're for managed nodegroups only")
+}
+
+// TestEnhancedNodeRepairBackwardCompatibility tests that existing configurations still work
+func TestEnhancedNodeRepairBackwardCompatibility(t *testing.T) {
+	// Test that the original example still works
+	cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", "../../examples/44-node-repair.yaml", "--dry-run")
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	
cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + // Expected to fail due to missing AWS credentials, but should parse config + t.Logf("Command failed as expected (dry-run): %v", err) + } + + output := stdout.String() + stderr.String() + + // Should not contain any parsing errors + assert.NotContains(t, strings.ToLower(output), "unknown field", "Should not have unknown field errors") + assert.NotContains(t, strings.ToLower(output), "invalid", "Should not have invalid field errors") + + // Should contain the basic node repair config + assert.Contains(t, output, "nodeRepairConfig", "Should contain nodeRepairConfig") +} + +// TestEnhancedNodeRepairSchemaValidation tests that the schema includes new fields +func TestEnhancedNodeRepairSchemaValidation(t *testing.T) { + cmd := exec.Command("../../eksctl", "utils", "schema") + var stdout bytes.Buffer + cmd.Stdout = &stdout + + err := cmd.Run() + require.NoError(t, err, "Schema command should not fail") + + output := stdout.String() + + // Check that the schema includes all new fields + expectedFields := []string{ + "NodeGroupNodeRepairConfig", + "maxUnhealthyNodeThresholdPercentage", + "maxUnhealthyNodeThresholdCount", + "maxParallelNodesRepairedPercentage", + "maxParallelNodesRepairedCount", + "nodeRepairConfigOverrides", + } + + for _, field := range expectedFields { + assert.Contains(t, output, field, "Schema should include field: %s", field) + } +} + + + +// TestEnhancedNodeRepairErrorHandling tests error handling for invalid configurations +func TestEnhancedNodeRepairErrorHandling(t *testing.T) { + tests := []struct { + name string + args []string + expectedError string + }{ + { + name: "conflicting percentage and count thresholds", + args: []string{ + "create", "cluster", + "--name", "test-conflict", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=25", + "--node-repair-max-unhealthy-count=5", + "--dry-run", + }, + expectedError: "cannot specify both percentage and count", + }, + { + name: "conflicting parallel percentage and count", + args: []string{ + "create", "cluster", + "--name", "test-parallel-conflict", + "--enable-node-repair", + "--node-repair-max-parallel-percentage=20", + "--node-repair-max-parallel-count=2", + "--dry-run", + }, + expectedError: "cannot specify both percentage and count", + }, + { + name: "invalid percentage value too high", + args: []string{ + "create", "cluster", + "--name", "test-invalid-percentage", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=150", + "--dry-run", + }, + expectedError: "percentage must be between 1 and 100", + }, + { + name: "invalid percentage value zero", + args: []string{ + "create", "cluster", + "--name", "test-zero-percentage", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=0", + "--dry-run", + }, + expectedError: "percentage must be between 1 and 100", + }, + { + name: "invalid count value zero", + args: []string{ + "create", "cluster", + "--name", "test-zero-count", + "--enable-node-repair", + "--node-repair-max-unhealthy-count=0", + "--dry-run", + }, + expectedError: "count must be greater than 0", + }, + { + name: "node repair flags without enable flag", + args: []string{ + "create", "cluster", + "--name", "test-no-enable", + "--node-repair-max-unhealthy-percentage=25", + "--dry-run", + }, + expectedError: "node repair flags require --enable-node-repair", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Run eksctl with the test arguments + cmd := exec.Command("../../eksctl", 
tt.args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + require.Error(t, err, "Command should fail with invalid configuration") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check that the expected error message is present + assert.Contains(t, strings.ToLower(output), strings.ToLower(tt.expectedError), + "Expected error message not found: %s", tt.expectedError) + }) + } +} + +// TestEnhancedNodeRepairConfigFileErrorHandling tests error handling for invalid config files +func TestEnhancedNodeRepairConfigFileErrorHandling(t *testing.T) { + // Create a temporary invalid config file + invalidConfig := ` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: test-invalid-config + region: us-west-2 +managedNodeGroups: +- name: ng-1 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 25 + maxUnhealthyNodeThresholdCount: 5 # This conflicts with percentage + maxParallelNodesRepairedPercentage: 150 # Invalid percentage > 100 +` + + tmpFile, err := os.CreateTemp("", "invalid-config-*.yaml") + require.NoError(t, err) + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString(invalidConfig) + require.NoError(t, err) + tmpFile.Close() + + // Run eksctl with the invalid config file + cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", tmpFile.Name(), "--dry-run") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err = cmd.Run() + require.Error(t, err, "Command should fail with invalid configuration") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check for validation errors + expectedErrors := []string{ + "cannot specify both percentage and count", + "percentage must be between 1 and 100", + } + + for _, expectedError := range expectedErrors { + assert.Contains(t, strings.ToLower(output), strings.ToLower(expectedError), + "Expected error message not found: %s", expectedError) + } +} + +// TestEnhancedNodeRepairUnmanagedNodegroupError tests that node repair flags are rejected for unmanaged nodegroups +func TestEnhancedNodeRepairUnmanagedNodegroupError(t *testing.T) { + // Create a config with unmanaged nodegroup and node repair config + invalidConfig := ` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: test-unmanaged-repair + region: us-west-2 +nodeGroups: # Unmanaged nodegroup +- name: ng-1 + nodeRepairConfig: # This should be invalid for unmanaged nodegroups + enabled: true +` + + tmpFile, err := os.CreateTemp("", "unmanaged-repair-*.yaml") + require.NoError(t, err) + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString(invalidConfig) + require.NoError(t, err) + tmpFile.Close() + + // Run eksctl with the invalid config file + cmd := exec.Command("../../eksctl", "create", "cluster", "--config-file", tmpFile.Name(), "--dry-run") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err = cmd.Run() + require.Error(t, err, "Command should fail with node repair config on unmanaged nodegroup") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Check that the error mentions managed nodegroups only + assert.Contains(t, strings.ToLower(output), "managed nodegroups only", + "Should indicate that node repair is for managed nodegroups only") +} + +// TestEnhancedNodeRepairValidationRecovery tests that validation errors don't leave resources in inconsistent states 
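+// It passes an out-of-range percentage together with --dry-run, so the command
+// must fail during flag validation, before any CloudFormation work begins.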
+func TestEnhancedNodeRepairValidationRecovery(t *testing.T) { + // This test ensures that when validation fails, no partial resources are created + // Since we're using --dry-run, we're mainly testing that the validation happens early + // and doesn't proceed to resource creation + + cmd := exec.Command("../../eksctl", + "create", "cluster", + "--name", "test-validation-recovery", + "--enable-node-repair", + "--node-repair-max-unhealthy-percentage=200", // Invalid percentage + "--dry-run", + ) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + require.Error(t, err, "Command should fail early with validation error") + + output := stdout.String() + stderr.String() + t.Logf("Full output: %s", output) + + // Ensure that validation happens before any resource creation attempts + assert.Contains(t, strings.ToLower(output), "percentage must be between 1 and 100", + "Should show validation error") + + // Should not contain CloudFormation template generation messages + assert.NotContains(t, strings.ToLower(output), "cloudformation template", + "Should not proceed to CloudFormation template generation") + assert.NotContains(t, strings.ToLower(output), "creating stack", + "Should not proceed to stack creation") +} \ No newline at end of file diff --git a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json index 433939a36e..f63b11eca7 100755 --- a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json +++ b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json @@ -2301,10 +2301,43 @@ "type": "boolean", "description": "Enables the auto repair feature for the nodegroup", "x-intellij-html-description": "Enables the auto repair feature for the nodegroup" + }, + "maxParallelNodesRepairedCount": { + "type": "integer", + "description": "specifies the maximum count of nodes that can be repaired in parallel", + "x-intellij-html-description": "specifies the maximum count of nodes that can be repaired in parallel" + }, + "maxParallelNodesRepairedPercentage": { + "type": "integer", + "description": "specifies the maximum percentage of nodes that can be repaired in parallel", + "x-intellij-html-description": "specifies the maximum percentage of nodes that can be repaired in parallel" + }, + "maxUnhealthyNodeThresholdCount": { + "type": "integer", + "description": "specifies the maximum count of unhealthy nodes", + "x-intellij-html-description": "specifies the maximum count of unhealthy nodes" + }, + "maxUnhealthyNodeThresholdPercentage": { + "type": "integer", + "description": "specifies the maximum percentage of unhealthy nodes", + "x-intellij-html-description": "specifies the maximum percentage of unhealthy nodes" + }, + "nodeRepairConfigOverrides": { + "items": { + "$ref": "#/definitions/NodeRepairConfigOverride" + }, + "type": "array", + "description": "specifies custom repair behavior for specific conditions", + "x-intellij-html-description": "specifies custom repair behavior for specific conditions" } }, "preferredOrder": [ - "enabled" + "enabled", + "maxUnhealthyNodeThresholdPercentage", + "maxUnhealthyNodeThresholdCount", + "maxParallelNodesRepairedPercentage", + "maxParallelNodesRepairedCount", + "nodeRepairConfigOverrides" ], "additionalProperties": false, "description": "contains the auto repair configuration for the nodegroup", @@ -2430,6 +2463,39 @@ "description": "contains the configuration for updating NodeGroups.", "x-intellij-html-description": "contains the configuration for updating NodeGroups." 
}, + "NodeRepairConfigOverride": { + "properties": { + "minRepairWaitTimeMins": { + "type": "integer", + "description": "specifies the minimum wait time before repair in minutes", + "x-intellij-html-description": "specifies the minimum wait time before repair in minutes" + }, + "nodeMonitoringCondition": { + "type": "string", + "description": "specifies the monitoring condition", + "x-intellij-html-description": "specifies the monitoring condition" + }, + "nodeUnhealthyReason": { + "type": "string", + "description": "specifies the reason for node being unhealthy", + "x-intellij-html-description": "specifies the reason for node being unhealthy" + }, + "repairAction": { + "type": "string", + "description": "specifies the action to take for repair", + "x-intellij-html-description": "specifies the action to take for repair" + } + }, + "preferredOrder": [ + "nodeMonitoringCondition", + "nodeUnhealthyReason", + "minRepairWaitTimeMins", + "repairAction" + ], + "additionalProperties": false, + "description": "defines custom repair behavior for specific node conditions", + "x-intellij-html-description": "defines custom repair behavior for specific node conditions" + }, "OIDCIdentityProvider": { "required": [ "name", diff --git a/pkg/apis/eksctl.io/v1alpha5/types.go b/pkg/apis/eksctl.io/v1alpha5/types.go index 9348269170..958fa0a84e 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types.go +++ b/pkg/apis/eksctl.io/v1alpha5/types.go @@ -1576,6 +1576,41 @@ type ( // Enables the auto repair feature for the nodegroup // +optional Enabled *bool `json:"enabled,omitempty"` + + // MaxUnhealthyNodeThresholdPercentage specifies the maximum percentage of unhealthy nodes + // +optional + MaxUnhealthyNodeThresholdPercentage *int `json:"maxUnhealthyNodeThresholdPercentage,omitempty"` + + // MaxUnhealthyNodeThresholdCount specifies the maximum count of unhealthy nodes + // +optional + MaxUnhealthyNodeThresholdCount *int `json:"maxUnhealthyNodeThresholdCount,omitempty"` + + // MaxParallelNodesRepairedPercentage specifies the maximum percentage of nodes that can be repaired in parallel + // +optional + MaxParallelNodesRepairedPercentage *int `json:"maxParallelNodesRepairedPercentage,omitempty"` + + // MaxParallelNodesRepairedCount specifies the maximum count of nodes that can be repaired in parallel + // +optional + MaxParallelNodesRepairedCount *int `json:"maxParallelNodesRepairedCount,omitempty"` + + // NodeRepairConfigOverrides specifies custom repair behavior for specific conditions + // +optional + NodeRepairConfigOverrides []NodeRepairConfigOverride `json:"nodeRepairConfigOverrides,omitempty"` + } + + // NodeRepairConfigOverride defines custom repair behavior for specific node conditions + NodeRepairConfigOverride struct { + // NodeMonitoringCondition specifies the monitoring condition + NodeMonitoringCondition string `json:"nodeMonitoringCondition"` + + // NodeUnhealthyReason specifies the reason for node being unhealthy + NodeUnhealthyReason string `json:"nodeUnhealthyReason"` + + // MinRepairWaitTimeMins specifies the minimum wait time before repair in minutes + MinRepairWaitTimeMins int `json:"minRepairWaitTimeMins"` + + // RepairAction specifies the action to take for repair + RepairAction string `json:"repairAction"` } ) diff --git a/pkg/apis/eksctl.io/v1alpha5/types_test.go b/pkg/apis/eksctl.io/v1alpha5/types_test.go index eaf58f0bf7..bd7cd0073c 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types_test.go +++ b/pkg/apis/eksctl.io/v1alpha5/types_test.go @@ -1,6 +1,8 @@ package v1alpha5 import ( + "encoding/json" + 
"github.com/aws/aws-sdk-go-v2/aws" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -82,4 +84,173 @@ var _ = Describe("Types", func() { }) }) + Describe("NodeGroupNodeRepairConfig", func() { + var ( + nodeRepairConfig *NodeGroupNodeRepairConfig + ) + + BeforeEach(func() { + nodeRepairConfig = &NodeGroupNodeRepairConfig{} + }) + + Describe("JSON marshaling and unmarshaling", func() { + When("all fields are set", func() { + It("should marshal and unmarshal correctly", func() { + nodeRepairConfig.Enabled = aws.Bool(true) + nodeRepairConfig.MaxUnhealthyNodeThresholdPercentage = aws.Int(20) + nodeRepairConfig.MaxUnhealthyNodeThresholdCount = aws.Int(5) + nodeRepairConfig.MaxParallelNodesRepairedPercentage = aws.Int(15) + nodeRepairConfig.MaxParallelNodesRepairedCount = aws.Int(2) + nodeRepairConfig.NodeRepairConfigOverrides = []NodeRepairConfigOverride{ + { + NodeMonitoringCondition: "AcceleratedInstanceNotReady", + NodeUnhealthyReason: "NvidiaXID13Error", + MinRepairWaitTimeMins: 10, + RepairAction: "Terminate", + }, + { + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 20, + RepairAction: "Restart", + }, + } + + // Test JSON marshaling + jsonData, err := json.Marshal(nodeRepairConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"enabled":true`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxUnhealthyNodeThresholdPercentage":20`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxUnhealthyNodeThresholdCount":5`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxParallelNodesRepairedPercentage":15`)) + Expect(string(jsonData)).To(ContainSubstring(`"maxParallelNodesRepairedCount":2`)) + Expect(string(jsonData)).To(ContainSubstring(`"nodeRepairConfigOverrides"`)) + Expect(string(jsonData)).To(ContainSubstring(`"AcceleratedInstanceNotReady"`)) + Expect(string(jsonData)).To(ContainSubstring(`"NvidiaXID13Error"`)) + + // Test JSON unmarshaling + var unmarshaled NodeGroupNodeRepairConfig + err = json.Unmarshal(jsonData, &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(*unmarshaled.Enabled).To(BeTrue()) + Expect(*unmarshaled.MaxUnhealthyNodeThresholdPercentage).To(Equal(20)) + Expect(*unmarshaled.MaxUnhealthyNodeThresholdCount).To(Equal(5)) + Expect(*unmarshaled.MaxParallelNodesRepairedPercentage).To(Equal(15)) + Expect(*unmarshaled.MaxParallelNodesRepairedCount).To(Equal(2)) + Expect(len(unmarshaled.NodeRepairConfigOverrides)).To(Equal(2)) + Expect(unmarshaled.NodeRepairConfigOverrides[0].NodeMonitoringCondition).To(Equal("AcceleratedInstanceNotReady")) + Expect(unmarshaled.NodeRepairConfigOverrides[0].NodeUnhealthyReason).To(Equal("NvidiaXID13Error")) + Expect(unmarshaled.NodeRepairConfigOverrides[0].MinRepairWaitTimeMins).To(Equal(10)) + Expect(unmarshaled.NodeRepairConfigOverrides[0].RepairAction).To(Equal("Terminate")) + }) + }) + + When("only enabled field is set", func() { + It("should marshal and unmarshal correctly with minimal config", func() { + nodeRepairConfig.Enabled = aws.Bool(true) + + // Test JSON marshaling + jsonData, err := json.Marshal(nodeRepairConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"enabled":true`)) + Expect(string(jsonData)).NotTo(ContainSubstring(`"maxUnhealthyNodeThresholdPercentage"`)) + + // Test JSON unmarshaling + var unmarshaled NodeGroupNodeRepairConfig + err = json.Unmarshal(jsonData, &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(*unmarshaled.Enabled).To(BeTrue()) + 
Expect(unmarshaled.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + Expect(unmarshaled.MaxUnhealthyNodeThresholdCount).To(BeNil()) + Expect(unmarshaled.MaxParallelNodesRepairedPercentage).To(BeNil()) + Expect(unmarshaled.MaxParallelNodesRepairedCount).To(BeNil()) + Expect(len(unmarshaled.NodeRepairConfigOverrides)).To(Equal(0)) + }) + }) + + When("enabled is false", func() { + It("should handle disabled state correctly", func() { + nodeRepairConfig.Enabled = aws.Bool(false) + + jsonData, err := json.Marshal(nodeRepairConfig) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"enabled":false`)) + + var unmarshaled NodeGroupNodeRepairConfig + err = json.Unmarshal(jsonData, &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(*unmarshaled.Enabled).To(BeFalse()) + }) + }) + }) + + Describe("NodeRepairConfigOverride", func() { + var override NodeRepairConfigOverride + + BeforeEach(func() { + override = NodeRepairConfigOverride{ + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 15, + RepairAction: "Restart", + } + }) + + It("should have all required fields", func() { + Expect(override.NodeMonitoringCondition).To(Equal("NetworkNotReady")) + Expect(override.NodeUnhealthyReason).To(Equal("InterfaceNotUp")) + Expect(override.MinRepairWaitTimeMins).To(Equal(15)) + Expect(override.RepairAction).To(Equal("Restart")) + }) + + It("should marshal to JSON correctly", func() { + jsonData, err := json.Marshal(override) + Expect(err).NotTo(HaveOccurred()) + Expect(string(jsonData)).To(ContainSubstring(`"nodeMonitoringCondition":"NetworkNotReady"`)) + Expect(string(jsonData)).To(ContainSubstring(`"nodeUnhealthyReason":"InterfaceNotUp"`)) + Expect(string(jsonData)).To(ContainSubstring(`"minRepairWaitTimeMins":15`)) + Expect(string(jsonData)).To(ContainSubstring(`"repairAction":"Restart"`)) + }) + + It("should unmarshal from JSON correctly", func() { + jsonStr := `{ + "nodeMonitoringCondition": "AcceleratedInstanceNotReady", + "nodeUnhealthyReason": "NvidiaXID13Error", + "minRepairWaitTimeMins": 25, + "repairAction": "Terminate" + }` + + var unmarshaled NodeRepairConfigOverride + err := json.Unmarshal([]byte(jsonStr), &unmarshaled) + Expect(err).NotTo(HaveOccurred()) + Expect(unmarshaled.NodeMonitoringCondition).To(Equal("AcceleratedInstanceNotReady")) + Expect(unmarshaled.NodeUnhealthyReason).To(Equal("NvidiaXID13Error")) + Expect(unmarshaled.MinRepairWaitTimeMins).To(Equal(25)) + Expect(unmarshaled.RepairAction).To(Equal("Terminate")) + }) + }) + + Describe("Pointer field handling", func() { + It("should distinguish between nil and zero values", func() { + // Test nil values + config1 := &NodeGroupNodeRepairConfig{} + Expect(config1.Enabled).To(BeNil()) + Expect(config1.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + + // Test zero values + config2 := &NodeGroupNodeRepairConfig{ + Enabled: aws.Bool(false), + MaxUnhealthyNodeThresholdPercentage: aws.Int(0), + MaxUnhealthyNodeThresholdCount: aws.Int(0), + MaxParallelNodesRepairedPercentage: aws.Int(0), + MaxParallelNodesRepairedCount: aws.Int(0), + } + Expect(config2.Enabled).NotTo(BeNil()) + Expect(*config2.Enabled).To(BeFalse()) + Expect(config2.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil()) + Expect(*config2.MaxUnhealthyNodeThresholdPercentage).To(Equal(0)) + }) + }) + }) + }) diff --git a/pkg/cfn/builder/managed_nodegroup.go b/pkg/cfn/builder/managed_nodegroup.go index a469c433fd..233ffe3790 100644 --- a/pkg/cfn/builder/managed_nodegroup.go +++ 
b/pkg/cfn/builder/managed_nodegroup.go @@ -127,9 +127,41 @@ func (m *ManagedNodeGroupResourceSet) AddAllResources(ctx context.Context) error if m.nodeGroup.NodeRepairConfig != nil { nodeRepairConfig := &gfneks.Nodegroup_NodeRepairConfig{} + if m.nodeGroup.NodeRepairConfig.Enabled != nil { nodeRepairConfig.Enabled = gfnt.NewBoolean(*m.nodeGroup.NodeRepairConfig.Enabled) } + + if m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage != nil { + nodeRepairConfig.MaxUnhealthyNodeThresholdPercentage = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage) + } + + if m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount != nil { + nodeRepairConfig.MaxUnhealthyNodeThresholdCount = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount) + } + + if m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage != nil { + nodeRepairConfig.MaxParallelNodesRepairedPercentage = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage) + } + + if m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount != nil { + nodeRepairConfig.MaxParallelNodesRepairedCount = gfnt.NewInteger(*m.nodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount) + } + + if len(m.nodeGroup.NodeRepairConfig.NodeRepairConfigOverrides) > 0 { + var overrides []gfneks.Nodegroup_NodeRepairConfigOverride + for _, override := range m.nodeGroup.NodeRepairConfig.NodeRepairConfigOverrides { + cfnOverride := gfneks.Nodegroup_NodeRepairConfigOverride{ + NodeMonitoringCondition: gfnt.NewString(override.NodeMonitoringCondition), + NodeUnhealthyReason: gfnt.NewString(override.NodeUnhealthyReason), + MinRepairWaitTimeMins: gfnt.NewInteger(override.MinRepairWaitTimeMins), + RepairAction: gfnt.NewString(override.RepairAction), + } + overrides = append(overrides, cfnOverride) + } + nodeRepairConfig.NodeRepairConfigOverrides = overrides + } + managedResource.NodeRepairConfig = nodeRepairConfig } diff --git a/pkg/cfn/builder/managed_nodegroup_test.go b/pkg/cfn/builder/managed_nodegroup_test.go index 6f9271fc2f..cad176f599 100644 --- a/pkg/cfn/builder/managed_nodegroup_test.go +++ b/pkg/cfn/builder/managed_nodegroup_test.go @@ -6,6 +6,7 @@ import ( "fmt" "testing" + "github.com/aws/aws-sdk-go-v2/aws" "github.com/weaveworks/eksctl/pkg/goformation" gfneks "github.com/weaveworks/eksctl/pkg/goformation/cloudformation/eks" gfnt "github.com/weaveworks/eksctl/pkg/goformation/cloudformation/types" @@ -261,3 +262,232 @@ func subs(ss []string) []*gfnt.Value { } return subs } + +func TestManagedNodeGroupNodeRepairConfig(t *testing.T) { + nodeRepairTests := []struct { + description string + nodeRepairConfig *api.NodeGroupNodeRepairConfig + expectedConfig *gfneks.Nodegroup_NodeRepairConfig + }{ + { + description: "nil node repair config", + nodeRepairConfig: nil, + expectedConfig: nil, + }, + { + description: "enabled only", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + }, + }, + { + description: "disabled only", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Disabled(), + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(false), + }, + }, + { + description: "all threshold and parallel parameters", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + MaxUnhealthyNodeThresholdPercentage: aws.Int(20), + MaxUnhealthyNodeThresholdCount: aws.Int(5), + 
MaxParallelNodesRepairedPercentage: aws.Int(15), + MaxParallelNodesRepairedCount: aws.Int(2), + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + MaxUnhealthyNodeThresholdPercentage: gfnt.NewInteger(20), + MaxUnhealthyNodeThresholdCount: gfnt.NewInteger(5), + MaxParallelNodesRepairedPercentage: gfnt.NewInteger(15), + MaxParallelNodesRepairedCount: gfnt.NewInteger(2), + }, + }, + { + description: "with node repair config overrides", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + NodeRepairConfigOverrides: []api.NodeRepairConfigOverride{ + { + NodeMonitoringCondition: "AcceleratedInstanceNotReady", + NodeUnhealthyReason: "NvidiaXID13Error", + MinRepairWaitTimeMins: 10, + RepairAction: "Terminate", + }, + { + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 20, + RepairAction: "Restart", + }, + }, + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + NodeRepairConfigOverrides: []gfneks.Nodegroup_NodeRepairConfigOverride{ + { + NodeMonitoringCondition: gfnt.NewString("AcceleratedInstanceNotReady"), + NodeUnhealthyReason: gfnt.NewString("NvidiaXID13Error"), + MinRepairWaitTimeMins: gfnt.NewInteger(10), + RepairAction: gfnt.NewString("Terminate"), + }, + { + NodeMonitoringCondition: gfnt.NewString("NetworkNotReady"), + NodeUnhealthyReason: gfnt.NewString("InterfaceNotUp"), + MinRepairWaitTimeMins: gfnt.NewInteger(20), + RepairAction: gfnt.NewString("Restart"), + }, + }, + }, + }, + { + description: "comprehensive configuration", + nodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: api.Enabled(), + MaxUnhealthyNodeThresholdPercentage: aws.Int(25), + MaxParallelNodesRepairedCount: aws.Int(3), + NodeRepairConfigOverrides: []api.NodeRepairConfigOverride{ + { + NodeMonitoringCondition: "NetworkNotReady", + NodeUnhealthyReason: "InterfaceNotUp", + MinRepairWaitTimeMins: 15, + RepairAction: "Restart", + }, + }, + }, + expectedConfig: &gfneks.Nodegroup_NodeRepairConfig{ + Enabled: gfnt.NewBoolean(true), + MaxUnhealthyNodeThresholdPercentage: gfnt.NewInteger(25), + MaxParallelNodesRepairedCount: gfnt.NewInteger(3), + NodeRepairConfigOverrides: []gfneks.Nodegroup_NodeRepairConfigOverride{ + { + NodeMonitoringCondition: gfnt.NewString("NetworkNotReady"), + NodeUnhealthyReason: gfnt.NewString("InterfaceNotUp"), + MinRepairWaitTimeMins: gfnt.NewInteger(15), + RepairAction: gfnt.NewString("Restart"), + }, + }, + }, + }, + } + + for _, tt := range nodeRepairTests { + t.Run(tt.description, func(t *testing.T) { + clusterConfig := api.NewClusterConfig() + clusterConfig.Metadata.Name = "test-cluster" + clusterConfig.Metadata.Region = "us-west-2" + + ng := &api.ManagedNodeGroup{ + NodeGroupBase: &api.NodeGroupBase{ + Name: "test-ng", + InstanceType: "m5.large", + }, + NodeRepairConfig: tt.nodeRepairConfig, + } + + clusterConfig.Status = &api.ClusterStatus{} + err := api.SetManagedNodeGroupDefaults(ng, clusterConfig.Metadata, false) + require.NoError(t, err) + + p := mockprovider.NewMockProvider() + fakeVPCImporter := new(vpcfakes.FakeImporter) + bootstrapper, err := nodebootstrap.NewManagedBootstrapper(clusterConfig, ng) + require.NoError(t, err) + + // Mock subnets and AZ instance support like other tests + mockSubnetsAndAZInstanceSupport(clusterConfig, p, + []string{"us-west-2a"}, + []string{}, // local zones + []ec2types.InstanceType{api.DefaultNodeType}) + + stack := builder.NewManagedNodeGroup(p.EC2(), clusterConfig, ng, nil, 
bootstrapper, false, fakeVPCImporter) + err = stack.AddAllResources(context.Background()) + require.NoError(t, err) + + bytes, err := stack.RenderJSON() + require.NoError(t, err) + + template, err := goformation.ParseJSON(bytes) + require.NoError(t, err) + + // Get the managed nodegroup resource + ngResource, ok := template.Resources[builder.ManagedNodeGroupResourceName] + require.True(t, ok, "ManagedNodeGroup resource should exist") + managedNodeGroup, ok := ngResource.(*gfneks.Nodegroup) + require.True(t, ok, "Resource should be a Nodegroup") + + // Test the node repair config + if tt.expectedConfig == nil { + require.Nil(t, managedNodeGroup.NodeRepairConfig, "NodeRepairConfig should be nil") + } else { + require.NotNil(t, managedNodeGroup.NodeRepairConfig, "NodeRepairConfig should not be nil") + + // Test enabled field + if tt.expectedConfig.Enabled != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.Enabled) + require.Equal(t, tt.expectedConfig.Enabled.Raw(), managedNodeGroup.NodeRepairConfig.Enabled.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.Enabled) + } + + // Test threshold percentage + if tt.expectedConfig.MaxUnhealthyNodeThresholdPercentage != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage) + require.Equal(t, tt.expectedConfig.MaxUnhealthyNodeThresholdPercentage.Raw(), + managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage) + } + + // Test threshold count + if tt.expectedConfig.MaxUnhealthyNodeThresholdCount != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount) + require.Equal(t, tt.expectedConfig.MaxUnhealthyNodeThresholdCount.Raw(), + managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxUnhealthyNodeThresholdCount) + } + + // Test parallel percentage + if tt.expectedConfig.MaxParallelNodesRepairedPercentage != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage) + require.Equal(t, tt.expectedConfig.MaxParallelNodesRepairedPercentage.Raw(), + managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedPercentage) + } + + // Test parallel count + if tt.expectedConfig.MaxParallelNodesRepairedCount != nil { + require.NotNil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount) + require.Equal(t, tt.expectedConfig.MaxParallelNodesRepairedCount.Raw(), + managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount.Raw()) + } else { + require.Nil(t, managedNodeGroup.NodeRepairConfig.MaxParallelNodesRepairedCount) + } + + // Test overrides + require.Equal(t, len(tt.expectedConfig.NodeRepairConfigOverrides), + len(managedNodeGroup.NodeRepairConfig.NodeRepairConfigOverrides)) + + for i, expectedOverride := range tt.expectedConfig.NodeRepairConfigOverrides { + actualOverride := managedNodeGroup.NodeRepairConfig.NodeRepairConfigOverrides[i] + require.Equal(t, expectedOverride.NodeMonitoringCondition.Raw(), + actualOverride.NodeMonitoringCondition.Raw()) + require.Equal(t, expectedOverride.NodeUnhealthyReason.Raw(), + actualOverride.NodeUnhealthyReason.Raw()) + require.Equal(t, expectedOverride.MinRepairWaitTimeMins.Raw(), + actualOverride.MinRepairWaitTimeMins.Raw()) + require.Equal(t, 
expectedOverride.RepairAction.Raw(), + actualOverride.RepairAction.Raw()) + } + } + }) + } +} diff --git a/pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json b/pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json new file mode 100644 index 0000000000..4cf955f12b --- /dev/null +++ b/pkg/cfn/builder/testdata/launch_template/enhanced-node-repair.json @@ -0,0 +1,192 @@ +{ + "LaunchTemplate": { + "Type": "AWS::EC2::LaunchTemplate", + "Properties": { + "LaunchTemplateData": { + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "Iops": 3000, + "Throughput": 125, + "VolumeSize": 80, + "VolumeType": "gp3" + } + } + ], + "MetadataOptions": { + "HttpPutResponseHopLimit": 2, + "HttpTokens": "required" + }, + "SecurityGroupIds": [ + { + "Fn::ImportValue": "eksctl-lt::ClusterSecurityGroupId" + } + ], + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [ + { + "Key": "Name", + "Value": "lt-enhanced-node-repair-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "enhanced-node-repair" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + }, + { + "ResourceType": "volume", + "Tags": [ + { + "Key": "Name", + "Value": "lt-enhanced-node-repair-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "enhanced-node-repair" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + }, + { + "ResourceType": "network-interface", + "Tags": [ + { + "Key": "Name", + "Value": "lt-enhanced-node-repair-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "enhanced-node-repair" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + } + ] + }, + "LaunchTemplateName": { + "Fn::Sub": "${AWS::StackName}" + } + } + }, + "ManagedNodeGroup": { + "Type": "AWS::EKS::Nodegroup", + "Properties": { + "AmiType": "AL2023_x86_64_STANDARD", + "ClusterName": "lt", + "Labels": { + "alpha.eksctl.io/cluster-name": "lt", + "alpha.eksctl.io/nodegroup-name": "enhanced-node-repair" + }, + "InstanceTypes": ["m5.xlarge"], + "NodeRole": { + "Fn::GetAtt": [ + "NodeInstanceRole", + "Arn" + ] + }, + "NodegroupName": "enhanced-node-repair", + "ScalingConfig": { + "DesiredSize": 3, + "MaxSize": 5, + "MinSize": 1 + }, + "Subnets": [ + "subnet-public-us-west-2a" + ], + "Tags": { + "alpha.eksctl.io/nodegroup-name": "enhanced-node-repair", + "alpha.eksctl.io/nodegroup-type": "managed" + }, + "LaunchTemplate": { + "Id": { + "Ref": "LaunchTemplate" + } + }, + "NodeRepairConfig": { + "Enabled": true, + "MaxUnhealthyNodeThresholdPercentage": 20, + "MaxParallelNodesRepairedPercentage": 15, + "NodeRepairConfigOverrides": [ + { + "NodeMonitoringCondition": "AcceleratedInstanceNotReady", + "NodeUnhealthyReason": "NvidiaXID13Error", + "MinRepairWaitTimeMins": 10, + "RepairAction": "Terminate" + }, + { + "NodeMonitoringCondition": "NetworkNotReady", + "NodeUnhealthyReason": "InterfaceNotUp", + "MinRepairWaitTimeMins": 20, + "RepairAction": "Restart" + } + ] + } + } + }, + "NodeInstanceRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": [ + "sts:AssumeRole" + ], + "Effect": "Allow", + "Principal": { + "Service": [ + { + "Fn::FindInMap": [ + "ServicePrincipalPartitionMap", + { + "Ref": "AWS::Partition" + }, + "EC2" + ] + } + ] + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEC2ContainerRegistryPullOnly" + }, + { + "Fn::Sub": 
"arn:${AWS::Partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEKS_CNI_Policy" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + ], + "Path": "/", + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Sub": "${AWS::StackName}/NodeInstanceRole" + } + } + ] + } + } +} \ No newline at end of file diff --git a/pkg/ctl/cmdutils/configfile.go b/pkg/ctl/cmdutils/configfile.go index 7394d4ed15..0a4a787e67 100644 --- a/pkg/ctl/cmdutils/configfile.go +++ b/pkg/ctl/cmdutils/configfile.go @@ -611,10 +611,29 @@ func makeManagedNodegroup(nodeGroup *api.NodeGroup, options CreateManagedNGOptio Spot: options.Spot, InstanceTypes: options.InstanceTypes, } - if options.NodeRepairEnabled { + if options.NodeRepairEnabled || options.NodeRepairMaxUnhealthyPercentage != nil || + options.NodeRepairMaxUnhealthyCount != nil || options.NodeRepairMaxParallelPercentage != nil || + options.NodeRepairMaxParallelCount != nil { + mng.NodeRepairConfig = &api.NodeGroupNodeRepairConfig{ Enabled: &options.NodeRepairEnabled, } + + if options.NodeRepairMaxUnhealthyPercentage != nil && *options.NodeRepairMaxUnhealthyPercentage > 0 { + mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage = options.NodeRepairMaxUnhealthyPercentage + } + + if options.NodeRepairMaxUnhealthyCount != nil && *options.NodeRepairMaxUnhealthyCount > 0 { + mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount = options.NodeRepairMaxUnhealthyCount + } + + if options.NodeRepairMaxParallelPercentage != nil && *options.NodeRepairMaxParallelPercentage > 0 { + mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage = options.NodeRepairMaxParallelPercentage + } + + if options.NodeRepairMaxParallelCount != nil && *options.NodeRepairMaxParallelCount > 0 { + mng.NodeRepairConfig.MaxParallelNodesRepairedCount = options.NodeRepairMaxParallelCount + } } return mng } @@ -627,7 +646,7 @@ func validateManagedNGFlags(cmd *cobra.Command, managed bool) error { if managed { return nil } - flagsValidOnlyWithMNG := []string{"spot", "enable-node-repair", "instance-types"} + flagsValidOnlyWithMNG := []string{"spot", "enable-node-repair", "instance-types", "node-repair-max-unhealthy-percentage", "node-repair-max-unhealthy-count", "node-repair-max-parallel-percentage", "node-repair-max-parallel-count"} if flagName, found := findChangedFlag(cmd, flagsValidOnlyWithMNG); found { return fmt.Errorf("--%s is only valid with managed nodegroups (--managed)", flagName) } diff --git a/pkg/ctl/cmdutils/configfile_test.go b/pkg/ctl/cmdutils/configfile_test.go index 063c032d8a..21f07a9532 100644 --- a/pkg/ctl/cmdutils/configfile_test.go +++ b/pkg/ctl/cmdutils/configfile_test.go @@ -648,6 +648,164 @@ var _ = Describe("cmdutils configfile", func() { }) }) }) + + Context("makeManagedNodegroup with node repair config", func() { + var ( + ng *api.NodeGroup + options CreateManagedNGOptions + ) + + BeforeEach(func() { + ng = &api.NodeGroup{ + NodeGroupBase: &api.NodeGroupBase{ + Name: "test-ng", + }, + } + options = CreateManagedNGOptions{} + }) + + It("should create managed nodegroup without node repair config when not enabled", func() { + options.NodeRepairEnabled = false + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).To(BeNil()) + }) + + It("should create managed nodegroup with basic node repair config when enabled", func() { + options.NodeRepairEnabled = true + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + 
Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(BeNil()) + }) + + It("should create managed nodegroup with threshold percentage", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyPercentage = aws.Int(25) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(Equal(25)) + }) + + It("should create managed nodegroup with threshold count", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyCount = aws.Int(5) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(Equal(5)) + }) + + It("should create managed nodegroup with parallel percentage", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxParallelPercentage = aws.Int(20) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(Equal(20)) + }) + + It("should create managed nodegroup with parallel count", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxParallelCount = aws.Int(3) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(Equal(3)) + }) + + It("should create managed nodegroup with all parameters", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyPercentage = aws.Int(30) + options.NodeRepairMaxUnhealthyCount = aws.Int(10) + options.NodeRepairMaxParallelPercentage = aws.Int(25) + options.NodeRepairMaxParallelCount = aws.Int(4) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(Equal(30)) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(Equal(10)) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(Equal(25)) + 
Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(Equal(4)) + }) + + It("should create node repair config when enabled is false but other parameters are set", func() { + options.NodeRepairEnabled = false + options.NodeRepairMaxUnhealthyPercentage = aws.Int(15) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeFalse()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(Equal(15)) + }) + + It("should ignore zero values for optional parameters", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyPercentage = aws.Int(0) + options.NodeRepairMaxParallelCount = aws.Int(0) + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + // Zero values should be ignored + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(BeNil()) + }) + + It("should handle nil pointers for optional parameters", func() { + options.NodeRepairEnabled = true + options.NodeRepairMaxUnhealthyPercentage = nil + options.NodeRepairMaxUnhealthyCount = nil + options.NodeRepairMaxParallelPercentage = nil + options.NodeRepairMaxParallelCount = nil + + mng := makeManagedNodegroup(ng, options) + + Expect(mng.NodeRepairConfig).NotTo(BeNil()) + Expect(mng.NodeRepairConfig.Enabled).NotTo(BeNil()) + Expect(*mng.NodeRepairConfig.Enabled).To(BeTrue()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxUnhealthyNodeThresholdCount).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedPercentage).To(BeNil()) + Expect(mng.NodeRepairConfig.MaxParallelNodesRepairedCount).To(BeNil()) + }) + }) }) func assertValidClusterEndpoint(endpoints *api.ClusterEndpoints, privateAccess, publicAccess bool) { diff --git a/pkg/ctl/cmdutils/create_cluster.go b/pkg/ctl/cmdutils/create_cluster.go index 31201c0d23..67fdd03294 100644 --- a/pkg/ctl/cmdutils/create_cluster.go +++ b/pkg/ctl/cmdutils/create_cluster.go @@ -43,6 +43,12 @@ type CreateManagedNGOptions struct { Spot bool NodeRepairEnabled bool InstanceTypes []string + + // New node repair configuration options + NodeRepairMaxUnhealthyPercentage *int + NodeRepairMaxUnhealthyCount *int + NodeRepairMaxParallelPercentage *int + NodeRepairMaxParallelCount *int } // CreateNGOptions holds options for creating a nodegroup diff --git a/pkg/ctl/cmdutils/nodegroup_flags.go b/pkg/ctl/cmdutils/nodegroup_flags.go index ca76e33086..dcc4a9fde8 100644 --- a/pkg/ctl/cmdutils/nodegroup_flags.go +++ b/pkg/ctl/cmdutils/nodegroup_flags.go @@ -57,6 +57,13 @@ func AddCommonCreateNodeGroupFlags(fs *pflag.FlagSet, cmd *Cmd, ng *api.NodeGrou fs.BoolVarP(&mngOptions.Managed, "managed", "", true, "Create EKS-managed nodegroup") fs.BoolVar(&mngOptions.Spot, "spot", false, "Create a spot nodegroup (managed nodegroups only)") fs.BoolVar(&mngOptions.NodeRepairEnabled, "enable-node-repair", false, "Enable automatic node repair (managed nodegroups only)") + + // Node repair configuration flags + mngOptions.NodeRepairMaxUnhealthyPercentage = fs.Int("node-repair-max-unhealthy-percentage", 0, 
"Maximum percentage of unhealthy nodes before repair (managed nodegroups only)") + mngOptions.NodeRepairMaxUnhealthyCount = fs.Int("node-repair-max-unhealthy-count", 0, "Maximum count of unhealthy nodes before repair (managed nodegroups only)") + mngOptions.NodeRepairMaxParallelPercentage = fs.Int("node-repair-max-parallel-percentage", 0, "Maximum percentage of nodes to repair in parallel (managed nodegroups only)") + mngOptions.NodeRepairMaxParallelCount = fs.Int("node-repair-max-parallel-count", 0, "Maximum count of nodes to repair in parallel (managed nodegroups only)") + fs.StringSliceVar(&mngOptions.InstanceTypes, "instance-types", nil, "Comma-separated list of instance types (e.g., --instance-types=c3.large,c4.large,c5.large") } diff --git a/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go b/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go index e64996f12f..12d4f89194 100644 --- a/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go +++ b/pkg/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go @@ -15,6 +15,31 @@ type Nodegroup_NodeRepairConfig struct { // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-enabled Enabled *types.Value `json:"Enabled,omitempty"` + // MaxUnhealthyNodeThresholdPercentage AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxunhealthynodethresholdpercentage + MaxUnhealthyNodeThresholdPercentage *types.Value `json:"MaxUnhealthyNodeThresholdPercentage,omitempty"` + + // MaxUnhealthyNodeThresholdCount AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxunhealthynodethresholdcount + MaxUnhealthyNodeThresholdCount *types.Value `json:"MaxUnhealthyNodeThresholdCount,omitempty"` + + // MaxParallelNodesRepairedPercentage AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxparallelnodesrepairedpercentage + MaxParallelNodesRepairedPercentage *types.Value `json:"MaxParallelNodesRepairedPercentage,omitempty"` + + // MaxParallelNodesRepairedCount AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-maxparallelnodesrepairedcount + MaxParallelNodesRepairedCount *types.Value `json:"MaxParallelNodesRepairedCount,omitempty"` + + // NodeRepairConfigOverrides AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfig.html#cfn-eks-nodegroup-noderepairconfig-noderepairconfigurations + NodeRepairConfigOverrides []Nodegroup_NodeRepairConfigOverride `json:"NodeRepairConfigOverrides,omitempty"` + // AWSCloudFormationDeletionPolicy represents a CloudFormation DeletionPolicy AWSCloudFormationDeletionPolicy policies.DeletionPolicy `json:"-"` @@ -35,3 +60,48 @@ type Nodegroup_NodeRepairConfig struct { func (r *Nodegroup_NodeRepairConfig) AWSCloudFormationType() string { return 
"AWS::EKS::Nodegroup.NodeRepairConfig" } + +// Nodegroup_NodeRepairConfigOverride AWS CloudFormation Resource (AWS::EKS::Nodegroup.NodeRepairConfigOverride) +// See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html +type Nodegroup_NodeRepairConfigOverride struct { + + // NodeMonitoringCondition AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-nodemonitoringcondition + NodeMonitoringCondition *types.Value `json:"NodeMonitoringCondition,omitempty"` + + // NodeUnhealthyReason AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-nodeunhealthyreason + NodeUnhealthyReason *types.Value `json:"NodeUnhealthyReason,omitempty"` + + // MinRepairWaitTimeMins AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-minrepairwaittimemins + MinRepairWaitTimeMins *types.Value `json:"MinRepairWaitTimeMins,omitempty"` + + // RepairAction AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-eks-nodegroup-noderepairconfigurations.html#cfn-eks-nodegroup-noderepairconfigurations-repairaction + RepairAction *types.Value `json:"RepairAction,omitempty"` + + // AWSCloudFormationDeletionPolicy represents a CloudFormation DeletionPolicy + AWSCloudFormationDeletionPolicy policies.DeletionPolicy `json:"-"` + + // AWSCloudFormationUpdateReplacePolicy represents a CloudFormation UpdateReplacePolicy + AWSCloudFormationUpdateReplacePolicy policies.UpdateReplacePolicy `json:"-"` + + // AWSCloudFormationDependsOn stores the logical ID of the resources to be created before this resource + AWSCloudFormationDependsOn []string `json:"-"` + + // AWSCloudFormationMetadata stores structured data associated with this resource + AWSCloudFormationMetadata map[string]interface{} `json:"-"` + + // AWSCloudFormationCondition stores the logical ID of the condition that must be satisfied for this resource to be created + AWSCloudFormationCondition string `json:"-"` +} + +// AWSCloudFormationType returns the AWS CloudFormation resource type +func (r *Nodegroup_NodeRepairConfigOverride) AWSCloudFormationType() string { + return "AWS::EKS::Nodegroup.NodeRepairConfigOverride" +} diff --git a/userdocs/src/usage/nodegroup-node-repair-config.md b/userdocs/src/usage/nodegroup-node-repair-config.md index f42ff4b434..3eea048093 100644 --- a/userdocs/src/usage/nodegroup-node-repair-config.md +++ b/userdocs/src/usage/nodegroup-node-repair-config.md @@ -1,47 +1,287 @@ -# Support for Node Repair Config in EKS Managed Nodegroups +# Enhanced Node Repair Configuration for EKS Managed Nodegroups -EKS Managed Nodegroups now supports Node Repair, where the health of managed nodes are monitored, -and unhealthy worker nodes are replaced or rebooted in response. +EKS Managed Nodegroups supports Node Repair, where the health of managed nodes are monitored, +and unhealthy worker nodes are replaced or rebooted in response. 
eksctl now provides comprehensive +configuration options for fine-grained control over node repair behavior. -## Creating a cluster a managed nodegroup with node repair enabled +## Basic Node Repair Configuration -To create a cluster with a managed nodegroup using node repair, pass the `--enable-node-repair` flag: +### Using CLI flags + +To create a cluster with a managed nodegroup using basic node repair: ```shell $ eksctl create cluster --enable-node-repair ``` -To create a managed nodegroup using node repair on an existing cluster: +To create a managed nodegroup with node repair on an existing cluster: ```shell $ eksctl create nodegroup --cluster= --enable-node-repair ``` -To create a cluster with a managed nodegroup using node repair via a config file: +### Using configuration files ```yaml -# node-repair-nodegroup-cluster.yaml ---- +# basic-node-repair.yaml apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig metadata: - name: cluster-44 + name: basic-node-repair-cluster region: us-west-2 managedNodeGroups: - name: ng-1 nodeRepairConfig: enabled: true +``` + +```shell +$ eksctl create cluster -f basic-node-repair.yaml +``` + +## Enhanced Node Repair Configuration + +### Threshold Configuration + +You can configure when node repair is triggered using either percentage or count-based thresholds: + +#### CLI flags for thresholds + +```shell +# Percentage-based thresholds +$ eksctl create cluster --enable-node-repair \ + --node-repair-max-unhealthy-percentage=20 + +# Count-based thresholds +$ eksctl create cluster --enable-node-repair \ + --node-repair-max-unhealthy-count=5 +``` +#### Configuration file for thresholds + +```yaml +managedNodeGroups: +- name: threshold-ng + nodeRepairConfig: + enabled: true + # Trigger repair when 20% of nodes are unhealthy + maxUnhealthyNodeThresholdPercentage: 20 + # Alternative: trigger repair when 3 nodes are unhealthy + # maxUnhealthyNodeThresholdCount: 3 ``` +### Parallel Repair Limits + +Control how many nodes can be repaired simultaneously: + +#### CLI flags for parallel limits + ```shell -$ eksctl create cluster -f node-repair-nodegroup-cluster.yaml +# Percentage-based parallel limits +$ eksctl create cluster --enable-node-repair \ + --node-repair-max-parallel-percentage=15 + +# Count-based parallel limits +$ eksctl create cluster --enable-node-repair \ + --node-repair-max-parallel-count=2 +``` + +#### Configuration file for parallel limits + +```yaml +managedNodeGroups: +- name: parallel-ng + nodeRepairConfig: + enabled: true + # Repair at most 15% of nodes in parallel + maxParallelNodesRepairedPercentage: 15 + # Alternative: repair at most 2 nodes in parallel + # maxParallelNodesRepairedCount: 2 +``` + +### Custom Repair Overrides + +Define specialized repair behavior for specific failure scenarios: + +```yaml +managedNodeGroups: +- name: custom-repair-ng + instanceType: g4dn.xlarge # GPU instances + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 25 + maxParallelNodesRepairedCount: 1 + nodeRepairConfigOverrides: + # Handle GPU-related failures + - nodeMonitoringCondition: "AcceleratedInstanceNotReady" + nodeUnhealthyReason: "NvidiaXID13Error" + minRepairWaitTimeMins: 10 + repairAction: "Terminate" + # Handle network issues + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 20 + repairAction: "Restart" +``` + +## Complete Configuration Examples + +### Example 1: Basic repair with percentage thresholds + +```yaml +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig 
+ +metadata: + name: basic-repair-cluster + region: us-west-2 + +managedNodeGroups: +- name: basic-ng + instanceType: m5.large + desiredCapacity: 3 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 20 + maxParallelNodesRepairedPercentage: 15 +``` + +### Example 2: Conservative repair for critical workloads + +```yaml +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: critical-workload-cluster + region: us-west-2 + +managedNodeGroups: +- name: critical-ng + instanceType: c5.2xlarge + desiredCapacity: 6 + nodeRepairConfig: + enabled: true + # Very conservative settings + maxUnhealthyNodeThresholdPercentage: 10 + maxParallelNodesRepairedCount: 1 + nodeRepairConfigOverrides: + # Wait longer before taking action on critical workloads + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 45 + repairAction: "Restart" +``` + +### Example 3: GPU workload with specialized repair + +```yaml +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: gpu-workload-cluster + region: us-west-2 + +managedNodeGroups: +- name: gpu-ng + instanceType: g4dn.xlarge + desiredCapacity: 4 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 25 + maxParallelNodesRepairedCount: 1 + nodeRepairConfigOverrides: + # GPU failures require immediate termination + - nodeMonitoringCondition: "AcceleratedInstanceNotReady" + nodeUnhealthyReason: "NvidiaXID13Error" + minRepairWaitTimeMins: 5 + repairAction: "Terminate" +``` + +## CLI Reference + +### Node Repair Flags + +| Flag | Description | Example | +|------|-------------|---------| +| `--enable-node-repair` | Enable automatic node repair | `--enable-node-repair` | +| `--node-repair-max-unhealthy-percentage` | Maximum percentage of unhealthy nodes before repair | `--node-repair-max-unhealthy-percentage=20` | +| `--node-repair-max-unhealthy-count` | Maximum count of unhealthy nodes before repair | `--node-repair-max-unhealthy-count=5` | +| `--node-repair-max-parallel-percentage` | Maximum percentage of nodes to repair in parallel | `--node-repair-max-parallel-percentage=15` | +| `--node-repair-max-parallel-count` | Maximum count of nodes to repair in parallel | `--node-repair-max-parallel-count=2` | + +**Note:** Node repair config overrides are only supported through YAML configuration files due to their complexity. 
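+
+For example, the threshold and parallel-repair flags can be combined in a single
+invocation. This is a sketch: `my-cluster` and `repair-ng` are placeholder names,
+and percentage and count variants of the *same* parameter cannot be combined:
+
+```shell
+$ eksctl create nodegroup \
+    --cluster=my-cluster \
+    --name=repair-ng \
+    --managed \
+    --enable-node-repair \
+    --node-repair-max-unhealthy-percentage=20 \
+    --node-repair-max-parallel-count=2
+```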
+ +## Configuration Reference + +### nodeRepairConfig + +| Field | Type | Description | Example | +|-------|------|-------------|---------| +| `enabled` | boolean | Enable/disable node repair | `true` | +| `maxUnhealthyNodeThresholdPercentage` | integer | Percentage threshold for unhealthy nodes | `20` | +| `maxUnhealthyNodeThresholdCount` | integer | Count threshold for unhealthy nodes | `5` | +| `maxParallelNodesRepairedPercentage` | integer | Percentage limit for parallel repairs | `15` | +| `maxParallelNodesRepairedCount` | integer | Count limit for parallel repairs | `2` | +| `nodeRepairConfigOverrides` | array | Custom repair behavior overrides | See examples above | + +### nodeRepairConfigOverrides + +| Field | Type | Description | Valid Values | +|-------|------|-------------|--------------| +| `nodeMonitoringCondition` | string | Monitoring condition | `"AcceleratedInstanceNotReady"`, `"NetworkNotReady"` | +| `nodeUnhealthyReason` | string | Reason for node being unhealthy | `"NvidiaXID13Error"`, `"InterfaceNotUp"` | +| `minRepairWaitTimeMins` | integer | Minimum wait time before repair (minutes) | Any positive integer | +| `repairAction` | string | Action to take for repair | `"Terminate"`, `"Restart"`, `"NoAction"` | + +## Best Practices + +### Choosing Thresholds + +- **Small nodegroups (< 10 nodes)**: Use count-based thresholds for precise control +- **Large nodegroups (≥ 10 nodes)**: Use percentage-based thresholds for scalability +- **Critical workloads**: Use conservative thresholds (10-15%) +- **Development environments**: Use higher thresholds (20-30%) + +### Parallel Repair Limits + +- **High availability requirements**: Limit to 1-2 nodes or 10-15% +- **Batch workloads**: Allow higher parallel repairs (20-25%) +- **GPU workloads**: Limit to 1 node at a time due to cost and setup time + +### Custom Overrides + +- **GPU instances**: Use immediate termination for hardware failures +- **Network issues**: Try restart first, then terminate +- **Critical workloads**: Increase wait times to avoid unnecessary disruptions + +## Troubleshooting + +### Common Issues + +1. **Configuration validation errors**: Ensure parameter values are within valid ranges +2. **Conflicting thresholds**: Don't specify both percentage and count for the same parameter +3. 
**Invalid override values**: Check that monitoring conditions, reasons, and actions are valid + +### Monitoring Node Repair + +Enable CloudWatch logging to monitor node repair activities: + +```yaml +cloudWatch: + clusterLogging: + enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"] ``` -## Further information +## Further Information - [EKS Managed Nodegroup Node Health][eks-user-guide] +- [EKS Node Repair Configuration][eks-node-repair] +- [eksctl Managed Nodegroups][eksctl-managed-nodegroups] [eks-user-guide]: https://docs.aws.amazon.com/eks/latest/userguide/node-health.html +[eks-node-repair]: https://docs.aws.amazon.com/eks/latest/userguide/node-repair.html +[eksctl-managed-nodegroups]: https://eksctl.io/usage/managing-nodegroups/ From 38465360ea05706dae0354d22f0e18605aa7785e Mon Sep 17 00:00:00 2001 From: sapphirew Date: Mon, 22 Sep 2025 16:08:07 -0700 Subject: [PATCH 2/4] bump eks version --- go.mod | 70 +--------------------------------------------------------- go.sum | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 69 deletions(-) diff --git a/go.mod b/go.mod index 93c1ca2f2d..2a9e99dea2 100644 --- a/go.mod +++ b/go.mod @@ -17,7 +17,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.3 github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3 github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1 - github.com/aws/aws-sdk-go-v2/service/eks v1.73.2 + github.com/aws/aws-sdk-go-v2/service/eks v1.74.0 github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.3 github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.3 github.com/aws/aws-sdk-go-v2/service/iam v1.47.4 @@ -478,71 +478,3 @@ replace ( k8s.io/sample-cli-plugin => k8s.io/sample-cli-plugin v0.32.3 k8s.io/sample-controller => k8s.io/sample-controller v0.32.3 ) - -replace github.com/aws/aws-sdk-go-v2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2 - -replace github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/aws/protocol/eventstream - -replace github.com/aws/aws-sdk-go-v2/config => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/config - -replace github.com/aws/aws-sdk-go-v2/credentials => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/credentials - -replace github.com/aws/aws-sdk-go-v2/feature/ec2/imds => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/feature/ec2/imds - -replace github.com/aws/aws-sdk-go-v2/internal/configsources => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/configsources - -replace github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/endpoints/v2 - -replace github.com/aws/aws-sdk-go-v2/internal/ini => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/ini - -replace github.com/aws/aws-sdk-go-v2/internal/v4a => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/internal/v4a - -replace github.com/aws/aws-sdk-go-v2/service/autoscaling => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/autoscaling - -replace github.com/aws/aws-sdk-go-v2/service/cloudformation => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cloudformation - -replace github.com/aws/aws-sdk-go-v2/service/cloudtrail => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cloudtrail - -replace github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cloudwatchlogs - -replace github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider => 
/Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/cognitoidentityprovider - -replace github.com/aws/aws-sdk-go-v2/service/ec2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/ec2 - -replace github.com/aws/aws-sdk-go-v2/service/eks => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/eks - -replace github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/elasticloadbalancing - -replace github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/elasticloadbalancingv2 - -replace github.com/aws/aws-sdk-go-v2/service/eventbridge => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/eventbridge - -replace github.com/aws/aws-sdk-go-v2/service/iam => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/iam - -replace github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/accept-encoding - -replace github.com/aws/aws-sdk-go-v2/service/internal/checksum => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/checksum - -replace github.com/aws/aws-sdk-go-v2/service/internal/presigned-url => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/presigned-url - -replace github.com/aws/aws-sdk-go-v2/service/internal/s3shared => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/internal/s3shared - -replace github.com/aws/aws-sdk-go-v2/service/kms => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/kms - -replace github.com/aws/aws-sdk-go-v2/service/outposts => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/outposts - -replace github.com/aws/aws-sdk-go-v2/service/pricing => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/pricing - -replace github.com/aws/aws-sdk-go-v2/service/route53 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/route53 - -replace github.com/aws/aws-sdk-go-v2/service/s3 => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/s3 - -replace github.com/aws/aws-sdk-go-v2/service/sqs => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/sqs - -replace github.com/aws/aws-sdk-go-v2/service/ssm => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/ssm - -replace github.com/aws/aws-sdk-go-v2/service/sso => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/sso - -replace github.com/aws/aws-sdk-go-v2/service/ssooidc => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/ssooidc - -replace github.com/aws/aws-sdk-go-v2/service/sts => /Users/rhaowang/github/eksctl/aws-sdk-go-v2/service/sts diff --git a/go.sum b/go.sum index 7e30affd59..967b89aabf 100644 --- a/go.sum +++ b/go.sum @@ -106,6 +106,74 @@ github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b4 github.com/aws/amazon-ec2-instance-selector/v3 v3.1.1-0.20250224180552-36eea73b44c2/go.mod h1:RU/lVVsYHNN7Bwr2UmCw5z2aWPcNIHADY49bj082oYM= github.com/aws/aws-sdk-go v1.55.7 h1:UJrkFq7es5CShfBwlWAC8DA077vp8PyVbQd3lqLiztE= github.com/aws/aws-sdk-go v1.55.7/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= +github.com/aws/aws-sdk-go-v2 v1.39.0 h1:xm5WV/2L4emMRmMjHFykqiA4M/ra0DJVSWUkDyBjbg4= +github.com/aws/aws-sdk-go-v2 v1.39.0/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 h1:i8p8P4diljCr60PpJp6qZXNlgX4m2yQFpYk+9ZT+J4E= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1/go.mod h1:ddqbooRZYNoJ2dsTwOty16rM+/Aqmk/GOXrK8cg7V00= +github.com/aws/aws-sdk-go-v2/config v1.29.14 h1:f+eEi/2cKCg9pqKBoAIwRGzVb70MRKqWX4dg1BDcSJM= 
+github.com/aws/aws-sdk-go-v2/config v1.29.14/go.mod h1:wVPHWcIFv3WO89w0rE10gzf17ZYy+UVS1Geq8Iei34g= +github.com/aws/aws-sdk-go-v2/credentials v1.18.11 h1:1Fnb+7Dk96/VYx/uYfzk5sU2V0b0y2RWZROiMZCN/Io= +github.com/aws/aws-sdk-go-v2/credentials v1.18.11/go.mod h1:iuvn9v10dkxU4sDgtTXGWY0MrtkEcmkUmjv4clxhuTc= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.7 h1:Is2tPmieqGS2edBnmOJIbdvOA6Op+rRpaYR60iBAwXM= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.7/go.mod h1:F1i5V5421EGci570yABvpIXgRIBPb5JM+lSkHF6Dq5w= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.7 h1:UCxq0X9O3xrlENdKf1r9eRJoKz/b0AfGkpp3a7FPlhg= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.7/go.mod h1:rHRoJUNUASj5Z/0eqI4w32vKvC7atoWR0jC+IkmVH8k= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.7 h1:Y6DTZUn7ZUC4th9FMBbo8LVE+1fyq3ofw+tRwkUd3PY= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.7/go.mod h1:x3XE6vMnU9QvHN/Wrx2s44kwzV2o2g5x/siw4ZUJ9g8= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.7 h1:BszAktdUo2xlzmYHjWMq70DqJ7cROM8iBd3f6hrpuMQ= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.7/go.mod h1:XJ1yHki/P7ZPuG4fd3f0Pg/dSGA2cTQBCLw82MH2H48= +github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.0 h1:3gdYbEifG0hBOi3j43F/5B5Wln0uzdk6sAZzULZFAUA= +github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.0/go.mod h1:EjcucApl+Do5h3SFDSqYdTd8KA25sWmttgF0J9YXDkc= +github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.1 h1:5HZUkH4sPTJkivr07q4Tu2AGPcttKxLRri8LCstfZs8= +github.com/aws/aws-sdk-go-v2/service/cloudformation v1.66.1/go.mod h1:eTAwEMBFx1uY9cnjh98c1V7GFqftJRb5X3wrUW04BTg= +github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.3 h1:o04FlK/Mkm2UvctYIPOrpgMpLYwFq3WIpceUk0d8c0o= +github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.53.3/go.mod h1:NE9Jd1chPuOVkgPPMkIthFg99iIqlLvZGxI+H3bJB3E= +github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.3 h1:7IR8c3gRjh67jHyUEkBa6cnt6KPAeBVTCpYExTlP0/4= +github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.57.3/go.mod h1:ptJgRWK9opQK1foOTBKUg3PokkKA0/xcTXWIxwliaIY= +github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3 h1:4U9dpQZTvJ0Mi1qn8L1hRJ4igFCQYEjwUuOmYkWM5tE= +github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.51.3/go.mod h1:ygltZT++6Wn2uG4+tqE0NW1MkdEtb5W2O/CFc0xJX/g= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1 h1:+4A9SDduLZFlDeXWRmfQ6r8kyEJZQfK6lcg+KwdvWrI= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1/go.mod h1:ouvGEfHbLaIlWwpDpOVWPWR+YwO0HDv3vm5tYLq8ImY= +github.com/aws/aws-sdk-go-v2/service/eks v1.74.0 h1:GdG6qvpMet2Bs0XQR3O/4RJ8g87bXfPZCIzPBNqkX54= +github.com/aws/aws-sdk-go-v2/service/eks v1.74.0/go.mod h1:FeDTTHze8jWVCZBiMkUYxJ/TQdOpTf9zbJjf0RI0ajo= +github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.3 h1:61XdTI0Yol1blhU1mpj3lyxgZaBaO7EcZrAZ4Ryj+pk= +github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.3/go.mod h1:k1o3miorfzvEEwJJUbM+N+3Th3HhaLYgCUPdphBVMzw= +github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.3 h1:PGutY1v6+O1wOnvKLUoo+jGM9vzghqEouBb29W2hcOs= +github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.50.3/go.mod h1:YXClVP0EJ91D+khPRye/nUxK6/uQOsFEhMTKYiOnnrw= +github.com/aws/aws-sdk-go-v2/service/eventbridge v1.36.12 h1:uH6GOnGSvVN9MCk6o3+HvZFpdqL7AzJKNOTM/6l+3/s= +github.com/aws/aws-sdk-go-v2/service/eventbridge 
v1.36.12/go.mod h1:6qtp53AQg7KEeYrsp430PNlmVVO9qK0Xw8nddE1y+ow= +github.com/aws/aws-sdk-go-v2/service/iam v1.47.4 h1:3jK50qpmtonshV/dumtlzZA/0i8vp8a0KqWThrXnhpI= +github.com/aws/aws-sdk-go-v2/service/iam v1.47.4/go.mod h1:0y7wFmnEg9xTZxjmr2gHQ4xOHpCfrt70lFWTOAkrij4= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 h1:oegbebPEMA/1Jny7kvwejowCaHz1FWZAQ94WXFNCyTM= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1/go.mod h1:kemo5Myr9ac0U9JfSjMo9yHLtw+pECEHsFtJ9tqCEI8= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.7 h1:zmZ8qvtE9chfhBPuKB2aQFxW5F/rpwXUgmcVCgQzqRw= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.7/go.mod h1:vVYfbpd2l+pKqlSIDIOgouxNsGu5il9uDp0ooWb0jys= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.7 h1:mLgc5QIgOy26qyh5bvW+nDoAppxgn3J2WV3m9ewq7+8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.7/go.mod h1:wXb/eQnqt8mDQIQTTmcw58B5mYGxzLGZGK8PWNFZ0BA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.7 h1:u3VbDKUCWarWiU+aIUK4gjTr/wQFXV17y3hgNno9fcA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.7/go.mod h1:/OuMQwhSyRapYxq6ZNpPer8juGNrB4P5Oz8bZ2cgjQE= +github.com/aws/aws-sdk-go-v2/service/kms v1.38.3 h1:RivOtUH3eEu6SWnUMFHKAW4MqDOzWn1vGQ3S38Y5QMg= +github.com/aws/aws-sdk-go-v2/service/kms v1.38.3/go.mod h1:cQn6tAF77Di6m4huxovNM7NVAozWTZLsDRp9t8Z/WYk= +github.com/aws/aws-sdk-go-v2/service/outposts v1.56.3 h1:TkxBxJJ40xug1O/LKAx5gDgIzbne8s5z+01JZsiOdeQ= +github.com/aws/aws-sdk-go-v2/service/outposts v1.56.3/go.mod h1:oEwTEYL6jq3k0aYlGr811o291esaRs5vgUyx7Iw0oIM= +github.com/aws/aws-sdk-go-v2/service/pricing v1.32.17 h1:EtZFyL/uhaXlHjIwHW0KSJvppg+Ie1fzQ3wEXLEUj0I= +github.com/aws/aws-sdk-go-v2/service/pricing v1.32.17/go.mod h1:l7bufyRvU+8mY0Z1BNWbWvjr59dlj9YrLKmeiz5CJ30= +github.com/aws/aws-sdk-go-v2/service/route53 v1.48.8 h1:abeu0IVRqYXSts7Tl1Yoi/BxC59xdXYX0uVSN0fbPOk= +github.com/aws/aws-sdk-go-v2/service/route53 v1.48.8/go.mod h1:bOsuAIYHQbL+AqCldJ286MeljQL1sjUVGlpz9JMxCRM= +github.com/aws/aws-sdk-go-v2/service/s3 v1.77.1 h1:5bI9tJL2Z0FGFtp/LPDv0eyliFBHCn7LAhqpQuL+7kk= +github.com/aws/aws-sdk-go-v2/service/s3 v1.77.1/go.mod h1:njj3tSJONkfdLt4y6X8pyqeM6sJLNZxmzctKKV+n1GM= +github.com/aws/aws-sdk-go-v2/service/sqs v1.37.15 h1:KRXf9/NWjoRgj2WJbX13GNjBPQ1SxUYLnIfXTz08mWs= +github.com/aws/aws-sdk-go-v2/service/sqs v1.37.15/go.mod h1:1CY54O4jz8BzgH2d6KyrzKWr2bAoqKsqUv2YZUGwMLE= +github.com/aws/aws-sdk-go-v2/service/ssm v1.64.3 h1:0vR3D1PTK2s1BDqlIgbSvGSIagR3qlSxWllTzuAImA0= +github.com/aws/aws-sdk-go-v2/service/ssm v1.64.3/go.mod h1:5O20AzpAiVXhRhrJd5Tv9vh1gA5+iYHqAMVc+6t4q7g= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.2 h1:rcoTaYOhGE/zfxE1uR6X5fvj+uKkqeCNRE0rBbiQM34= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.2/go.mod h1:Ql6jE9kyyWI5JHn+61UT/Y5Z0oyVJGmgmJbZD5g4unY= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.3 h1:BSIfeFtU9tlSt8vEYS7KzurMoAuYzYPWhcZiMtxVf2M= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.3/go.mod h1:XclEty74bsGBCr1s0VSaA11hQ4ZidK4viWK7rRfO88I= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.3 h1:yEiZ0ztgji2GsCb/6uQSITXcGdtmWMfLRys0jJFiUkc= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.3/go.mod h1:Z+Gd23v97pX9zK97+tX4ppAgqCt3Z2dIXB02CtBncK8= github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE= github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20250219002025-c3b5cd3d2fd9 
h1:fopAsvwV3w+MGIX3oWqq0YmwL+lr/ik+wXt0gyDaEEY= From e3a3c00a6ae4557e71c4ad2628aaff8d2e83ca89 Mon Sep 17 00:00:00 2001 From: sapphirew Date: Mon, 22 Sep 2025 16:14:12 -0700 Subject: [PATCH 3/4] update user doc for configurable node repair --- examples/44-enhanced-node-repair.yaml | 15 +-- .../eksctl.io/v1alpha5/assets/schema.json | 40 ++++---- pkg/apis/eksctl.io/v1alpha5/types.go | 27 ++++-- pkg/ctl/cmdutils/nodegroup_flags.go | 8 +- .../src/usage/nodegroup-node-repair-config.md | 97 ++++++------------- 5 files changed, 77 insertions(+), 110 deletions(-) diff --git a/examples/44-enhanced-node-repair.yaml b/examples/44-enhanced-node-repair.yaml index 55186b7399..9c9b78f614 100644 --- a/examples/44-enhanced-node-repair.yaml +++ b/examples/44-enhanced-node-repair.yaml @@ -17,9 +17,9 @@ managedNodeGroups: maxSize: 5 nodeRepairConfig: enabled: true - # Trigger repair when 20% of nodes are unhealthy + # Stop repair actions when 20% of nodes are unhealthy maxUnhealthyNodeThresholdPercentage: 20 - # Repair at most 15% of nodes in parallel + # Repair at most 15% of unhealthy nodes in parallel maxParallelNodesRepairedPercentage: 15 # Example 2: Node repair with count-based thresholds @@ -30,9 +30,9 @@ managedNodeGroups: maxSize: 20 nodeRepairConfig: enabled: true - # Trigger repair when 3 nodes are unhealthy + # Stop repair actions when 3 nodes are unhealthy maxUnhealthyNodeThresholdCount: 3 - # Repair at most 2 nodes in parallel + # Repair at most 2 unhealthy nodes in parallel maxParallelNodesRepairedCount: 2 # Example 3: Comprehensive configuration with custom overrides @@ -43,9 +43,9 @@ managedNodeGroups: maxSize: 8 nodeRepairConfig: enabled: true - # Use percentage-based threshold for this larger nodegroup + # Stop repair actions when 25% of nodes are unhealthy maxUnhealthyNodeThresholdPercentage: 25 - # Limit parallel repairs to maintain workload availability + # Repair at most 1 unhealthy node at a time to maintain workload availability maxParallelNodesRepairedCount: 1 # Custom repair behavior for specific failure scenarios nodeRepairConfigOverrides: @@ -68,8 +68,9 @@ managedNodeGroups: maxSize: 12 nodeRepairConfig: enabled: true - # Very conservative thresholds for critical workloads + # Very conservative settings - stop repair actions when only 10% of nodes are unhealthy maxUnhealthyNodeThresholdPercentage: 10 + # Repair only 1 unhealthy node at a time for critical workloads maxParallelNodesRepairedCount: 1 nodeRepairConfigOverrides: # For critical workloads, wait longer before taking action diff --git a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json index f63b11eca7..e2ba76d066 100755 --- a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json +++ b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json @@ -2304,31 +2304,31 @@ }, "maxParallelNodesRepairedCount": { "type": "integer", - "description": "specifies the maximum count of nodes that can be repaired in parallel", - "x-intellij-html-description": "specifies the maximum count of nodes that can be repaired in parallel" + "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedPercentage at the same time", + "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. 
When using this, you cannot also set maxParallelNodesRepairedPercentage at the same time" }, "maxParallelNodesRepairedPercentage": { "type": "integer", - "description": "specifies the maximum percentage of nodes that can be repaired in parallel", - "x-intellij-html-description": "specifies the maximum percentage of nodes that can be repaired in parallel" + "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedCount at the same time", + "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedCount at the same time" }, "maxUnhealthyNodeThresholdCount": { "type": "integer", - "description": "specifies the maximum count of unhealthy nodes", - "x-intellij-html-description": "specifies the maximum count of unhealthy nodes" + "description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdPercentage at the same time", + "x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdPercentage at the same time" }, "maxUnhealthyNodeThresholdPercentage": { "type": "integer", - "description": "specifies the maximum percentage of unhealthy nodes", - "x-intellij-html-description": "specifies the maximum percentage of unhealthy nodes" + "description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdCount at the same time", + "x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdCount at the same time" }, "nodeRepairConfigOverrides": { "items": { "$ref": "#/definitions/NodeRepairConfigOverride" }, "type": "array", - "description": "specifies custom repair behavior for specific conditions", - "x-intellij-html-description": "specifies custom repair behavior for specific conditions" + "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values", + "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. 
If you use this, you must specify all the values" } }, "preferredOrder": [ @@ -2467,23 +2467,23 @@ "properties": { "minRepairWaitTimeMins": { "type": "integer", - "description": "specifies the minimum wait time before repair in minutes", - "x-intellij-html-description": "specifies the minimum wait time before repair in minutes" + "description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific nodeMonitoringCondition and nodeUnhealthyReason", + "x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific nodeMonitoringCondition and nodeUnhealthyReason" }, "nodeMonitoringCondition": { "type": "string", - "description": "specifies the monitoring condition", - "x-intellij-html-description": "specifies the monitoring condition" + "description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to", + "x-intellij-html-description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to" }, "nodeUnhealthyReason": { "type": "string", - "description": "specifies the reason for node being unhealthy", - "x-intellij-html-description": "specifies the reason for node being unhealthy" + "description": "specifies a reason reported by the node monitoring agent that this override would apply to", + "x-intellij-html-description": "specifies a reason reported by the node monitoring agent that this override would apply to" }, "repairAction": { "type": "string", - "description": "specifies the action to take for repair", - "x-intellij-html-description": "specifies the action to take for repair" + "description": "specifies the repair action to take for nodes when all of the specified conditions are met", + "x-intellij-html-description": "specifies the repair action to take for nodes when all of the specified conditions are met" } }, "preferredOrder": [ @@ -2493,8 +2493,8 @@ "repairAction" ], "additionalProperties": false, - "description": "defines custom repair behavior for specific node conditions", - "x-intellij-html-description": "defines custom repair behavior for specific node conditions" + "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values", + "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values" }, "OIDCIdentityProvider": { "required": [ diff --git a/pkg/apis/eksctl.io/v1alpha5/types.go b/pkg/apis/eksctl.io/v1alpha5/types.go index 958fa0a84e..4b2d5a489a 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types.go +++ b/pkg/apis/eksctl.io/v1alpha5/types.go @@ -1577,39 +1577,46 @@ type ( // +optional Enabled *bool `json:"enabled,omitempty"` - // MaxUnhealthyNodeThresholdPercentage specifies the maximum percentage of unhealthy nodes + // MaxUnhealthyNodeThresholdPercentage specifies a percentage threshold of unhealthy nodes, above which node auto + // repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time. 
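+	// As an illustrative reading of this threshold (an assumption based on the
+	// description above, not AWS-verified boundary semantics): with a value of
+	// 20 on a 10-node group, repair actions continue while at most 2 nodes
+	// (20%) are unhealthy and stop once a 3rd node becomes unhealthy.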
// +optional MaxUnhealthyNodeThresholdPercentage *int `json:"maxUnhealthyNodeThresholdPercentage,omitempty"` - // MaxUnhealthyNodeThresholdCount specifies the maximum count of unhealthy nodes + // MaxUnhealthyNodeThresholdCount specifies a count threshold of unhealthy nodes, above which node auto + // repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time. // +optional MaxUnhealthyNodeThresholdCount *int `json:"maxUnhealthyNodeThresholdCount,omitempty"` - // MaxParallelNodesRepairedPercentage specifies the maximum percentage of nodes that can be repaired in parallel + // MaxParallelNodesRepairedPercentage specifies the maximum number of nodes that can be repaired concurrently or in parallel, + // expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time. // +optional MaxParallelNodesRepairedPercentage *int `json:"maxParallelNodesRepairedPercentage,omitempty"` - // MaxParallelNodesRepairedCount specifies the maximum count of nodes that can be repaired in parallel + // MaxParallelNodesRepairedCount specifies the maximum number of nodes that can be repaired concurrently or in parallel, + // expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time. // +optional MaxParallelNodesRepairedCount *int `json:"maxParallelNodesRepairedCount,omitempty"` - // NodeRepairConfigOverrides specifies custom repair behavior for specific conditions + // NodeRepairConfigOverrides specifies granular overrides for specific repair actions. These overrides control the + // repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values. // +optional NodeRepairConfigOverrides []NodeRepairConfigOverride `json:"nodeRepairConfigOverrides,omitempty"` } - // NodeRepairConfigOverride defines custom repair behavior for specific node conditions + // NodeRepairConfigOverride specifies granular overrides for specific repair actions. These overrides control the + // repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values. 
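+	//
+	// A minimal illustrative value (a sketch using the condition, reason, and
+	// action strings from the examples in this change; not an exhaustive or
+	// authoritative list of accepted values):
+	//
+	//	NodeRepairConfigOverride{
+	//		NodeMonitoringCondition: "NetworkNotReady",
+	//		NodeUnhealthyReason:     "InterfaceNotUp",
+	//		MinRepairWaitTimeMins:   15,
+	//		RepairAction:            "Restart",
+	//	}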
NodeRepairConfigOverride struct { - // NodeMonitoringCondition specifies the monitoring condition + // NodeMonitoringCondition specifies an unhealthy condition reported by the node monitoring agent that this override would apply to NodeMonitoringCondition string `json:"nodeMonitoringCondition"` - // NodeUnhealthyReason specifies the reason for node being unhealthy + // NodeUnhealthyReason specifies a reason reported by the node monitoring agent that this override would apply to NodeUnhealthyReason string `json:"nodeUnhealthyReason"` - // MinRepairWaitTimeMins specifies the minimum wait time before repair in minutes + // MinRepairWaitTimeMins specifies the minimum time in minutes to wait before attempting to repair a node + // with this specific NodeMonitoringCondition and NodeUnhealthyReason MinRepairWaitTimeMins int `json:"minRepairWaitTimeMins"` - // RepairAction specifies the action to take for repair + // RepairAction specifies the repair action to take for nodes when all of the specified conditions are met RepairAction string `json:"repairAction"` } ) diff --git a/pkg/ctl/cmdutils/nodegroup_flags.go b/pkg/ctl/cmdutils/nodegroup_flags.go index dcc4a9fde8..180fdb7ac0 100644 --- a/pkg/ctl/cmdutils/nodegroup_flags.go +++ b/pkg/ctl/cmdutils/nodegroup_flags.go @@ -59,10 +59,10 @@ func AddCommonCreateNodeGroupFlags(fs *pflag.FlagSet, cmd *Cmd, ng *api.NodeGrou fs.BoolVar(&mngOptions.NodeRepairEnabled, "enable-node-repair", false, "Enable automatic node repair (managed nodegroups only)") // Node repair configuration flags - mngOptions.NodeRepairMaxUnhealthyPercentage = fs.Int("node-repair-max-unhealthy-percentage", 0, "Maximum percentage of unhealthy nodes before repair (managed nodegroups only)") - mngOptions.NodeRepairMaxUnhealthyCount = fs.Int("node-repair-max-unhealthy-count", 0, "Maximum count of unhealthy nodes before repair (managed nodegroups only)") - mngOptions.NodeRepairMaxParallelPercentage = fs.Int("node-repair-max-parallel-percentage", 0, "Maximum percentage of nodes to repair in parallel (managed nodegroups only)") - mngOptions.NodeRepairMaxParallelCount = fs.Int("node-repair-max-parallel-count", 0, "Maximum count of nodes to repair in parallel (managed nodegroups only)") + mngOptions.NodeRepairMaxUnhealthyPercentage = fs.Int("node-repair-max-unhealthy-percentage", 0, "Percentage threshold of unhealthy nodes, above which node auto repair actions will stop (managed nodegroups only)") + mngOptions.NodeRepairMaxUnhealthyCount = fs.Int("node-repair-max-unhealthy-count", 0, "Count threshold of unhealthy nodes, above which node auto repair actions will stop (managed nodegroups only)") + mngOptions.NodeRepairMaxParallelPercentage = fs.Int("node-repair-max-parallel-percentage", 0, "Maximum percentage of unhealthy nodes that can be repaired concurrently or in parallel (managed nodegroups only)") + mngOptions.NodeRepairMaxParallelCount = fs.Int("node-repair-max-parallel-count", 0, "Maximum count of unhealthy nodes that can be repaired concurrently or in parallel (managed nodegroups only)") fs.StringSliceVar(&mngOptions.InstanceTypes, "instance-types", nil, "Comma-separated list of instance types (e.g., --instance-types=c3.large,c4.large,c5.large") } diff --git a/userdocs/src/usage/nodegroup-node-repair-config.md b/userdocs/src/usage/nodegroup-node-repair-config.md index 3eea048093..2fb7e8fa71 100644 --- a/userdocs/src/usage/nodegroup-node-repair-config.md +++ b/userdocs/src/usage/nodegroup-node-repair-config.md @@ -1,4 +1,4 @@ -# Enhanced Node Repair Configuration for EKS Managed 
Nodegroups
+# Node Repair Configuration for EKS Managed Nodegroups
 
 EKS Managed Nodegroups supports Node Repair, where the health of managed
 nodes is monitored, and unhealthy worker nodes are replaced or rebooted in response. eksctl now provides comprehensive
@@ -45,16 +45,16 @@ $ eksctl create cluster -f basic-node-repair.yaml
 
 ### Threshold Configuration
 
-You can configure when node repair is triggered using either percentage or count-based thresholds:
+You can configure when node repair actions will stop, using either percentage-based or count-based thresholds. **Note: You cannot use both percentage and count thresholds at the same time.**
 
 #### CLI flags for thresholds
 
 ```shell
-# Percentage-based thresholds
+# Percentage-based threshold - repair stops when 20% of nodes are unhealthy
$ eksctl create cluster --enable-node-repair \
  --node-repair-max-unhealthy-percentage=20
 
-# Count-based thresholds
+# Count-based threshold - repair stops when 5 nodes are unhealthy
$ eksctl create cluster --enable-node-repair \
  --node-repair-max-unhealthy-count=5
 ```
@@ -66,24 +66,25 @@ managedNodeGroups:
   - name: threshold-ng
     nodeRepairConfig:
       enabled: true
-      # Trigger repair when 20% of nodes are unhealthy
+      # Stop repair actions when 20% of nodes are unhealthy
       maxUnhealthyNodeThresholdPercentage: 20
-      # Alternative: trigger repair when 3 nodes are unhealthy
+      # Alternative: stop repair actions when 3 nodes are unhealthy
       # maxUnhealthyNodeThresholdCount: 3
+      # Note: Cannot use both percentage and count thresholds simultaneously
 ```
 
 ### Parallel Repair Limits
 
-Control how many nodes can be repaired simultaneously:
+Control the maximum number of nodes that can be repaired in parallel. This gives you finer-grained control over the pace of node replacements. **Note: You cannot use both percentage and count limits at the same time.**
 
 #### CLI flags for parallel limits
 
 ```shell
-# Percentage-based parallel limits
+# Percentage-based parallel limit - repair at most 15% of unhealthy nodes in parallel
$ eksctl create cluster --enable-node-repair \
  --node-repair-max-parallel-percentage=15
 
-# Count-based parallel limits
+# Count-based parallel limit - repair at most 2 unhealthy nodes in parallel
$ eksctl create cluster --enable-node-repair \
  --node-repair-max-parallel-count=2
 ```
@@ -95,15 +96,16 @@ managedNodeGroups:
   - name: parallel-ng
     nodeRepairConfig:
       enabled: true
-      # Repair at most 15% of nodes in parallel
+      # Repair at most 15% of unhealthy nodes in parallel
       maxParallelNodesRepairedPercentage: 15
-      # Alternative: repair at most 2 nodes in parallel
+      # Alternative: repair at most 2 unhealthy nodes in parallel
       # maxParallelNodesRepairedCount: 2
+      # Note: Cannot use both percentage and count limits simultaneously
 ```
 
 ### Custom Repair Overrides
 
-Define specialized repair behavior for specific failure scenarios:
+Specify granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. 
**If you use this, you must specify all the values for each override.** ```yaml managedNodeGroups: @@ -114,12 +116,12 @@ managedNodeGroups: maxUnhealthyNodeThresholdPercentage: 25 maxParallelNodesRepairedCount: 1 nodeRepairConfigOverrides: - # Handle GPU-related failures + # Handle GPU-related failures with immediate termination - nodeMonitoringCondition: "AcceleratedInstanceNotReady" nodeUnhealthyReason: "NvidiaXID13Error" minRepairWaitTimeMins: 10 repairAction: "Terminate" - # Handle network issues + # Handle network issues with restart after waiting - nodeMonitoringCondition: "NetworkNotReady" nodeUnhealthyReason: "InterfaceNotUp" minRepairWaitTimeMins: 20 @@ -219,69 +221,26 @@ managedNodeGroups: ### nodeRepairConfig -| Field | Type | Description | Example | -|-------|------|-------------|---------| -| `enabled` | boolean | Enable/disable node repair | `true` | -| `maxUnhealthyNodeThresholdPercentage` | integer | Percentage threshold for unhealthy nodes | `20` | -| `maxUnhealthyNodeThresholdCount` | integer | Count threshold for unhealthy nodes | `5` | -| `maxParallelNodesRepairedPercentage` | integer | Percentage limit for parallel repairs | `15` | -| `maxParallelNodesRepairedCount` | integer | Count limit for parallel repairs | `2` | -| `nodeRepairConfigOverrides` | array | Custom repair behavior overrides | See examples above | +| Field | Type | Description | Constraints | Example | +|-------|------|-------------|-------------|---------| +| `enabled` | boolean | Enable/disable node repair | - | `true` | +| `maxUnhealthyNodeThresholdPercentage` | integer | Percentage threshold of unhealthy nodes, above which node auto repair actions will stop | Cannot be used with `maxUnhealthyNodeThresholdCount` | `20` | +| `maxUnhealthyNodeThresholdCount` | integer | Count threshold of unhealthy nodes, above which node auto repair actions will stop | Cannot be used with `maxUnhealthyNodeThresholdPercentage` | `5` | +| `maxParallelNodesRepairedPercentage` | integer | Maximum percentage of unhealthy nodes that can be repaired concurrently or in parallel | Cannot be used with `maxParallelNodesRepairedCount` | `15` | +| `maxParallelNodesRepairedCount` | integer | Maximum count of unhealthy nodes that can be repaired concurrently or in parallel | Cannot be used with `maxParallelNodesRepairedPercentage` | `2` | +| `nodeRepairConfigOverrides` | array | Granular overrides for specific repair actions controlling repair action and delay time | All values must be specified for each override | See examples above | ### nodeRepairConfigOverrides | Field | Type | Description | Valid Values | |-------|------|-------------|--------------| -| `nodeMonitoringCondition` | string | Monitoring condition | `"AcceleratedInstanceNotReady"`, `"NetworkNotReady"` | -| `nodeUnhealthyReason` | string | Reason for node being unhealthy | `"NvidiaXID13Error"`, `"InterfaceNotUp"` | -| `minRepairWaitTimeMins` | integer | Minimum wait time before repair (minutes) | Any positive integer | -| `repairAction` | string | Action to take for repair | `"Terminate"`, `"Restart"`, `"NoAction"` | - -## Best Practices - -### Choosing Thresholds - -- **Small nodegroups (< 10 nodes)**: Use count-based thresholds for precise control -- **Large nodegroups (≥ 10 nodes)**: Use percentage-based thresholds for scalability -- **Critical workloads**: Use conservative thresholds (10-15%) -- **Development environments**: Use higher thresholds (20-30%) - -### Parallel Repair Limits - -- **High availability requirements**: Limit to 1-2 nodes or 10-15% -- 
**Batch workloads**: Allow higher parallel repairs (20-25%) -- **GPU workloads**: Limit to 1 node at a time due to cost and setup time - -### Custom Overrides - -- **GPU instances**: Use immediate termination for hardware failures -- **Network issues**: Try restart first, then terminate -- **Critical workloads**: Increase wait times to avoid unnecessary disruptions - -## Troubleshooting - -### Common Issues - -1. **Configuration validation errors**: Ensure parameter values are within valid ranges -2. **Conflicting thresholds**: Don't specify both percentage and count for the same parameter -3. **Invalid override values**: Check that monitoring conditions, reasons, and actions are valid - -### Monitoring Node Repair - -Enable CloudWatch logging to monitor node repair activities: - -```yaml -cloudWatch: - clusterLogging: - enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"] -``` +| `nodeMonitoringCondition` | string | Unhealthy condition reported by the node monitoring agent that this override applies to | `"AcceleratedInstanceNotReady"`, `"NetworkNotReady"` | +| `nodeUnhealthyReason` | string | Reason reported by the node monitoring agent that this override applies to | `"NvidiaXID13Error"`, `"InterfaceNotUp"` | +| `minRepairWaitTimeMins` | integer | Minimum time in minutes to wait before attempting to repair a node with the specified condition and reason | Any positive integer | +| `repairAction` | string | Repair action to take for nodes when all of the specified conditions are met | `"Terminate"`, `"Restart"`, `"NoAction"` | ## Further Information - [EKS Managed Nodegroup Node Health][eks-user-guide] -- [EKS Node Repair Configuration][eks-node-repair] -- [eksctl Managed Nodegroups][eksctl-managed-nodegroups] [eks-user-guide]: https://docs.aws.amazon.com/eks/latest/userguide/node-health.html -[eks-node-repair]: https://docs.aws.amazon.com/eks/latest/userguide/node-repair.html -[eksctl-managed-nodegroups]: https://eksctl.io/usage/managing-nodegroups/ From 3d28712185786af8ef4fee338f34017a34c3dc3c Mon Sep 17 00:00:00 2001 From: sapphirew Date: Fri, 10 Oct 2025 14:49:29 -0700 Subject: [PATCH 4/4] update user docs --- examples/44-enhanced-node-repair.yaml | 103 ------------------ examples/44-node-repair.yaml | 80 +++++++++++++- .../eksctl.io/v1alpha5/assets/schema.json | 28 ++--- .../src/usage/nodegroup-node-repair-config.md | 2 + 4 files changed, 92 insertions(+), 121 deletions(-) delete mode 100644 examples/44-enhanced-node-repair.yaml diff --git a/examples/44-enhanced-node-repair.yaml b/examples/44-enhanced-node-repair.yaml deleted file mode 100644 index 9c9b78f614..0000000000 --- a/examples/44-enhanced-node-repair.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# An example ClusterConfig that demonstrates the enhanced node repair configuration -# for EKS managed nodegroups with comprehensive parameters and custom overrides. 
- -apiVersion: eksctl.io/v1alpha5 -kind: ClusterConfig - -metadata: - name: enhanced-node-repair-cluster - region: us-west-2 - -managedNodeGroups: - # Example 1: Basic node repair with percentage-based thresholds - - name: basic-repair-ng - instanceType: m5.large - desiredCapacity: 3 - minSize: 1 - maxSize: 5 - nodeRepairConfig: - enabled: true - # Stop repair actions when 20% of nodes are unhealthy - maxUnhealthyNodeThresholdPercentage: 20 - # Repair at most 15% of unhealthy nodes in parallel - maxParallelNodesRepairedPercentage: 15 - - # Example 2: Node repair with count-based thresholds - - name: count-based-repair-ng - instanceType: m5.xlarge - desiredCapacity: 10 - minSize: 5 - maxSize: 20 - nodeRepairConfig: - enabled: true - # Stop repair actions when 3 nodes are unhealthy - maxUnhealthyNodeThresholdCount: 3 - # Repair at most 2 unhealthy nodes in parallel - maxParallelNodesRepairedCount: 2 - - # Example 3: Comprehensive configuration with custom overrides - - name: comprehensive-repair-ng - instanceType: g4dn.xlarge # GPU instance for ML workloads - desiredCapacity: 4 - minSize: 2 - maxSize: 8 - nodeRepairConfig: - enabled: true - # Stop repair actions when 25% of nodes are unhealthy - maxUnhealthyNodeThresholdPercentage: 25 - # Repair at most 1 unhealthy node at a time to maintain workload availability - maxParallelNodesRepairedCount: 1 - # Custom repair behavior for specific failure scenarios - nodeRepairConfigOverrides: - # Handle GPU-related failures with immediate termination - - nodeMonitoringCondition: "AcceleratedInstanceNotReady" - nodeUnhealthyReason: "NvidiaXID13Error" - minRepairWaitTimeMins: 5 - repairAction: "Terminate" - # Handle network issues with restart first, then terminate - - nodeMonitoringCondition: "NetworkNotReady" - nodeUnhealthyReason: "InterfaceNotUp" - minRepairWaitTimeMins: 15 - repairAction: "Restart" - - # Example 4: Conservative repair settings for critical workloads - - name: critical-workload-ng - instanceType: c5.2xlarge - desiredCapacity: 6 - minSize: 3 - maxSize: 12 - nodeRepairConfig: - enabled: true - # Very conservative settings - stop repair actions when only 10% of nodes are unhealthy - maxUnhealthyNodeThresholdPercentage: 10 - # Repair only 1 unhealthy node at a time for critical workloads - maxParallelNodesRepairedCount: 1 - nodeRepairConfigOverrides: - # For critical workloads, wait longer before taking action - - nodeMonitoringCondition: "AcceleratedInstanceNotReady" - nodeUnhealthyReason: "NvidiaXID13Error" - minRepairWaitTimeMins: 30 - repairAction: "Terminate" - - nodeMonitoringCondition: "NetworkNotReady" - nodeUnhealthyReason: "InterfaceNotUp" - minRepairWaitTimeMins: 45 - repairAction: "Restart" - - # Example 5: Disabled node repair (for comparison) - - name: no-repair-ng - instanceType: t3.medium - desiredCapacity: 2 - minSize: 1 - maxSize: 4 - nodeRepairConfig: - enabled: false - -# Additional cluster configuration -vpc: - cidr: "10.0.0.0/16" - autoAllocateIPv6: false - -# Enable logging for monitoring node repair activities -cloudWatch: - clusterLogging: - enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"] \ No newline at end of file diff --git a/examples/44-node-repair.yaml b/examples/44-node-repair.yaml index ddbe861a6f..7611dca24f 100644 --- a/examples/44-node-repair.yaml +++ b/examples/44-node-repair.yaml @@ -1,4 +1,5 @@ -# An example ClusterConfig that uses a managed node group with auto repair. 
+# An example ClusterConfig that demonstrates node repair configuration +# for EKS managed nodegroups with various configuration options. apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig @@ -8,6 +9,77 @@ metadata: region: us-west-2 managedNodeGroups: -- name: ng-1 - nodeRepairConfig: - enabled: true + # Example 1: Basic node repair + - name: basic-repair-ng + instanceType: m5.large + desiredCapacity: 3 + nodeRepairConfig: + enabled: true + + # Example 2: Node repair with percentage-based thresholds + - name: percentage-repair-ng + instanceType: m5.large + desiredCapacity: 3 + minSize: 1 + maxSize: 5 + nodeRepairConfig: + enabled: true + # Stop repair actions when 20% of nodes are unhealthy + maxUnhealthyNodeThresholdPercentage: 20 + # Repair at most 15% of unhealthy nodes in parallel + maxParallelNodesRepairedPercentage: 15 + + # Example 3: Node repair with count-based thresholds + - name: count-repair-ng + instanceType: m5.xlarge + desiredCapacity: 10 + minSize: 5 + maxSize: 20 + nodeRepairConfig: + enabled: true + # Stop repair actions when 3 nodes are unhealthy + maxUnhealthyNodeThresholdCount: 3 + # Repair at most 2 unhealthy nodes in parallel + maxParallelNodesRepairedCount: 2 + + # Example 4: GPU workload with custom repair overrides + - name: gpu-repair-ng + instanceType: g4dn.xlarge + desiredCapacity: 4 + minSize: 2 + maxSize: 8 + nodeRepairConfig: + enabled: true + maxUnhealthyNodeThresholdPercentage: 25 + maxParallelNodesRepairedCount: 1 + # Custom repair behavior for specific failure scenarios + nodeRepairConfigOverrides: + # Handle GPU-related failures with immediate termination + - nodeMonitoringCondition: "AcceleratedInstanceNotReady" + nodeUnhealthyReason: "NvidiaXID13Error" + minRepairWaitTimeMins: 5 + repairAction: "Terminate" + # Handle network issues with restart first + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 15 + repairAction: "Restart" + + # Example 5: Conservative repair for critical workloads + - name: critical-repair-ng + instanceType: c5.2xlarge + desiredCapacity: 6 + minSize: 3 + maxSize: 12 + nodeRepairConfig: + enabled: true + # Conservative settings - stop repair when only 10% of nodes are unhealthy + maxUnhealthyNodeThresholdPercentage: 10 + # Repair only 1 node at a time + maxParallelNodesRepairedCount: 1 + nodeRepairConfigOverrides: + # Wait longer before taking action on critical workloads + - nodeMonitoringCondition: "NetworkNotReady" + nodeUnhealthyReason: "InterfaceNotUp" + minRepairWaitTimeMins: 45 + repairAction: "Restart" diff --git a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json index e2ba76d066..5a993fee8c 100755 --- a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json +++ b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json @@ -2304,31 +2304,31 @@ }, "maxParallelNodesRepairedCount": { "type": "integer", - "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedPercentage at the same time", - "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedPercentage at the same time" + "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. 
When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.", + "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time." }, "maxParallelNodesRepairedPercentage": { "type": "integer", - "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedCount at the same time", - "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedCount at the same time" + "description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.", + "x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time." }, "maxUnhealthyNodeThresholdCount": { "type": "integer", - "description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdPercentage at the same time", - "x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdPercentage at the same time" + "description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.", + "x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time." }, "maxUnhealthyNodeThresholdPercentage": { "type": "integer", - "description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdCount at the same time", - "x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdCount at the same time" + "description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.", + "x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time." }, "nodeRepairConfigOverrides": { "items": { "$ref": "#/definitions/NodeRepairConfigOverride" }, "type": "array", - "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. 
If you use this, you must specify all the values", - "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values" + "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.", + "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values." } }, "preferredOrder": [ @@ -2467,8 +2467,8 @@ "properties": { "minRepairWaitTimeMins": { "type": "integer", - "description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific nodeMonitoringCondition and nodeUnhealthyReason", - "x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific nodeMonitoringCondition and nodeUnhealthyReason" + "description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason", + "x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason" }, "nodeMonitoringCondition": { "type": "string", @@ -2493,8 +2493,8 @@ "repairAction" ], "additionalProperties": false, - "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values", - "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values" + "description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.", + "x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values." }, "OIDCIdentityProvider": { "required": [ diff --git a/userdocs/src/usage/nodegroup-node-repair-config.md b/userdocs/src/usage/nodegroup-node-repair-config.md index 2fb7e8fa71..da2d592dbf 100644 --- a/userdocs/src/usage/nodegroup-node-repair-config.md +++ b/userdocs/src/usage/nodegroup-node-repair-config.md @@ -130,6 +130,8 @@ managedNodeGroups: ## Complete Configuration Examples +For a comprehensive example with all configuration options, see [examples/44-node-repair.yaml](https://github.com/eksctl-io/eksctl/blob/main/examples/44-node-repair.yaml). + ### Example 1: Basic repair with percentage thresholds ```yaml
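 # A minimal sketch of such a config (illustrative values mirroring the
 # threshold examples earlier in this document; the cluster name, nodegroup
 # name, and capacity here are hypothetical):
 apiVersion: eksctl.io/v1alpha5
 kind: ClusterConfig
 
 metadata:
   name: repair-demo-cluster
   region: us-west-2
 
 managedNodeGroups:
   - name: percentage-repair-ng
     instanceType: m5.large
     desiredCapacity: 3
     nodeRepairConfig:
       enabled: true
       # Stop repair actions when 20% of nodes are unhealthy
       maxUnhealthyNodeThresholdPercentage: 20
       # Repair at most 15% of unhealthy nodes in parallel
       maxParallelNodesRepairedPercentage: 15
 ```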