From fa9eaa3346993d0bad55af2fd6cf783130d804ff Mon Sep 17 00:00:00 2001 From: pete911 Date: Fri, 19 May 2023 20:54:11 +0100 Subject: [PATCH 1/8] add imds asg target lifecycle hook --- README.md | 2 +- cmd/node-termination-handler.go | 6 + .../templates/daemonset.linux.yaml | 2 + .../templates/daemonset.windows.yaml | 2 + .../aws-node-termination-handler/values.yaml | 3 + pkg/config/config.go | 4 + pkg/config/config_test.go | 7 + pkg/ec2metadata/ec2metadata.go | 24 ++++ pkg/ec2metadata/ec2metadata_test.go | 85 ++++++++++++ .../asglifecycle/asg-lifecycle-monitor.go | 102 ++++++++++++++ .../asg-lifecycle-monitor_internal_test.go | 99 ++++++++++++++ .../asg-lifecycle-monitor_test.go | 125 ++++++++++++++++++ 12 files changed, 460 insertions(+), 1 deletion(-) create mode 100644 pkg/monitor/asglifecycle/asg-lifecycle-monitor.go create mode 100644 pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go create mode 100644 pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go diff --git a/README.md b/README.md index 0c3437b2..6807611f 100644 --- a/README.md +++ b/README.md @@ -75,8 +75,8 @@ Must be deployed as a Kubernetes **Deployment**. Also requires some **additional | Spot Instance Termination Notifications (ITN) | ✅ | ✅ | | Scheduled Events | ✅ | ✅ | | Instance Rebalance Recommendation | ✅ | ✅ | +| ASG Termination Lifecycle Hooks | ✅ | ✅ | | AZ Rebalance Recommendation | ❌ | ✅ | -| ASG Termination Lifecycle Hooks | ❌ | ✅ | | Instance State Change Events | ❌ | ✅ | diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 9f7dcaf1..97f33870 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -16,6 +16,7 @@ package main import ( "context" "fmt" + "github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle" "os" "os/signal" "strings" @@ -50,6 +51,7 @@ import ( const ( scheduledMaintenance = "Scheduled Maintenance" spotITN = "Spot ITN" + asgLifecycle = "ASG Lifecycle" rebalanceRecommendation = "Rebalance Recommendation" sqsEvents = "SQS Event" timeFormat = "2006/01/02 15:04:05" @@ -172,6 +174,10 @@ func main() { imdsSpotMonitor := spotitn.NewSpotInterruptionMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName) monitoringFns[spotITN] = imdsSpotMonitor } + if nthConfig.EnableASGLifecycleDraining { + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName) + monitoringFns[asgLifecycle] = asgLifecycleMonitor + } if nthConfig.EnableScheduledEventDraining { imdsScheduledEventMonitor := scheduledevent.NewScheduledEventMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName) monitoringFns[scheduledMaintenance] = imdsScheduledEventMonitor diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml index 95e4b50f..be6385de 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml @@ -143,6 +143,8 @@ spec: {{- end }} - name: ENABLE_SPOT_INTERRUPTION_DRAINING value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_ASG_LIFECYCLE_DRAINING + value: {{ .Values.enableASGLifecycleDraining | quote }} - name: ENABLE_SCHEDULED_EVENT_DRAINING value: {{ .Values.enableScheduledEventDraining | quote }} - name: ENABLE_REBALANCE_MONITORING diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml index 8a9db7bf..95af69d1 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml @@ -143,6 +143,8 @@ spec: {{- end }} - name: ENABLE_SPOT_INTERRUPTION_DRAINING value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_ASG_LIFECYCLE_DRAINING + value: {{ .Values.enableASGLifecycleDraining | quote }} - name: ENABLE_SCHEDULED_EVENT_DRAINING value: {{ .Values.enableScheduledEventDraining | quote }} - name: ENABLE_REBALANCE_MONITORING diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml index 3a4d5db4..864c5f96 100644 --- a/config/helm/aws-node-termination-handler/values.yaml +++ b/config/helm/aws-node-termination-handler/values.yaml @@ -268,6 +268,9 @@ metadataTries: 3 # enableSpotInterruptionDraining If false, do not drain nodes when the spot interruption termination notice is received. Only used in IMDS mode. enableSpotInterruptionDraining: true +# enableASGLifecycleDraining If false, do not drain nodes when ASG target lifecycle state Terminated is received. Only used in IMDS mode. +enableASGLifecycleDraining: true + # enableScheduledEventDraining If false, do not drain nodes before the maintenance window starts for an EC2 instance scheduled event. Only used in IMDS mode. enableScheduledEventDraining: true diff --git a/pkg/config/config.go b/pkg/config/config.go index 4eabaf00..3fd79c25 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -53,6 +53,8 @@ const ( enableScheduledEventDrainingDefault = true enableSpotInterruptionDrainingConfigKey = "ENABLE_SPOT_INTERRUPTION_DRAINING" enableSpotInterruptionDrainingDefault = true + enableASGLifecycleDrainingConfigKey = "ENABLE_ASG_LIFECYCLE_DRAINING" + enableASGLifecycleDrainingDefault = true enableSQSTerminationDrainingConfigKey = "ENABLE_SQS_TERMINATION_DRAINING" enableSQSTerminationDrainingDefault = false enableRebalanceMonitoringConfigKey = "ENABLE_REBALANCE_MONITORING" @@ -131,6 +133,7 @@ type Config struct { WebhookProxy string EnableScheduledEventDraining bool EnableSpotInterruptionDraining bool + EnableASGLifecycleDraining bool EnableSQSTerminationDraining bool EnableRebalanceMonitoring bool EnableRebalanceDraining bool @@ -193,6 +196,7 @@ func ParseCliArgs() (config Config, err error) { flag.StringVar(&config.WebhookTemplateFile, "webhook-template-file", getEnv(webhookTemplateFileConfigKey, ""), "If specified, replaces the default webhook message template with content from template file.") flag.BoolVar(&config.EnableScheduledEventDraining, "enable-scheduled-event-draining", getBoolEnv(enableScheduledEventDrainingConfigKey, enableScheduledEventDrainingDefault), "If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event") flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received") + flag.BoolVar(&config.EnableASGLifecycleDraining, "enable-asg-lifecycle-draining", getBoolEnv(enableASGLifecycleDrainingConfigKey, enableASGLifecycleDrainingDefault), "If true, drain nodes when the ASG target lifecyle state is Terminated is received") flag.BoolVar(&config.EnableSQSTerminationDraining, "enable-sqs-termination-draining", getBoolEnv(enableSQSTerminationDrainingConfigKey, enableSQSTerminationDrainingDefault), "If true, drain nodes when an SQS termination event is received") flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set \"enableRebalanceDraining\".") flag.BoolVar(&config.EnableRebalanceDraining, "enable-rebalance-draining", getBoolEnv(enableRebalanceDrainingConfigKey, enableRebalanceDrainingDefault), "If true, drain nodes when the rebalance recommendation notice is received") diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index e1f7a1ba..4b051b12 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -36,6 +36,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) { t.Setenv("DRY_RUN", "true") t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "true") t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "false") + t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "false") t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false") t.Setenv("ENABLE_REBALANCE_MONITORING", "true") t.Setenv("ENABLE_REBALANCE_DRAINING", "true") @@ -61,6 +62,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) { h.Equals(t, true, nthConfig.DryRun) h.Equals(t, true, nthConfig.EnableScheduledEventDraining) h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining) + h.Equals(t, false, nthConfig.EnableASGLifecycleDraining) h.Equals(t, false, nthConfig.EnableSQSTerminationDraining) h.Equals(t, true, nthConfig.EnableRebalanceMonitoring) h.Equals(t, true, nthConfig.EnableRebalanceDraining) @@ -96,6 +98,7 @@ func TestParseCliArgsSuccess(t *testing.T) { "--dry-run=true", "--enable-scheduled-event-draining=true", "--enable-spot-interruption-draining=false", + "--enable-asg-lifecycle-draining=false", "--enable-sqs-termination-draining=false", "--enable-rebalance-monitoring=true", "--enable-rebalance-draining=true", @@ -121,6 +124,7 @@ func TestParseCliArgsSuccess(t *testing.T) { h.Equals(t, true, nthConfig.DryRun) h.Equals(t, true, nthConfig.EnableScheduledEventDraining) h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining) + h.Equals(t, false, nthConfig.EnableASGLifecycleDraining) h.Equals(t, false, nthConfig.EnableSQSTerminationDraining) h.Equals(t, true, nthConfig.EnableRebalanceMonitoring) h.Equals(t, true, nthConfig.EnableRebalanceDraining) @@ -151,6 +155,7 @@ func TestParseCliArgsOverrides(t *testing.T) { t.Setenv("DRY_RUN", "false") t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "false") t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "true") + t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "true") t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false") t.Setenv("ENABLE_REBALANCE_MONITORING", "true") t.Setenv("ENABLE_REBALANCE_DRAINING", "true") @@ -174,6 +179,7 @@ func TestParseCliArgsOverrides(t *testing.T) { "--dry-run=true", "--enable-scheduled-event-draining=true", "--enable-spot-interruption-draining=false", + "--enable-asg-lifecycle-draining=false", "--enable-sqs-termination-draining=true", "--enable-rebalance-monitoring=false", "--enable-rebalance-draining=false", @@ -201,6 +207,7 @@ func TestParseCliArgsOverrides(t *testing.T) { h.Equals(t, true, nthConfig.DryRun) h.Equals(t, true, nthConfig.EnableScheduledEventDraining) h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining) + h.Equals(t, false, nthConfig.EnableASGLifecycleDraining) h.Equals(t, true, nthConfig.EnableSQSTerminationDraining) h.Equals(t, false, nthConfig.EnableRebalanceMonitoring) h.Equals(t, false, nthConfig.EnableRebalanceDraining) diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go index f7f5ade8..7449d641 100644 --- a/pkg/ec2metadata/ec2metadata.go +++ b/pkg/ec2metadata/ec2metadata.go @@ -30,6 +30,8 @@ import ( const ( // SpotInstanceActionPath is the context path to spot/instance-action within IMDS SpotInstanceActionPath = "/latest/meta-data/spot/instance-action" + // ASGTargetLifecycleStatePath path to autoscaling target lifecycle state within IMDS + ASGTargetLifecycleStatePath = "/latest/meta-data/autoscaling/target-lifecycle-state" // ScheduledEventPath is the context path to events/maintenance/scheduled within IMDS ScheduledEventPath = "/latest/meta-data/events/maintenance/scheduled" // RebalanceRecommendationPath is the context path to events/recommendations/rebalance within IMDS @@ -193,6 +195,28 @@ func (e *Service) GetRebalanceRecommendationEvent() (rebalanceRec *RebalanceReco return rebalanceRec, nil } +// GetASGTargetLifecycleState retrieves ASG target lifecycle state from imds. State can be empty string +// if the lifecycle hook is not present on the ASG +func (e *Service) GetASGTargetLifecycleState() (state string, err error) { + resp, err := e.Request(ASGTargetLifecycleStatePath) + // 404s are normal when querying for the 'autoscaling/target-lifecycle-state' path and there is no lifecycle hook + if resp != nil && resp.StatusCode == 404 { + return "", nil + } else if resp != nil && (resp.StatusCode < 200 || resp.StatusCode >= 300) { + return "", fmt.Errorf("Metadata request received http status code: %d", resp.StatusCode) + } + if err != nil { + return "", fmt.Errorf("Unable to parse metadata response: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("Unable to parse http response. Status code: %d. %w", resp.StatusCode, err) + } + return string(body), nil +} + // GetMetadataInfo generic function for retrieving ec2 metadata func (e *Service) GetMetadataInfo(path string) (info string, err error) { metadataInfo := "" diff --git a/pkg/ec2metadata/ec2metadata_test.go b/pkg/ec2metadata/ec2metadata_test.go index 6fe58301..4a698c22 100644 --- a/pkg/ec2metadata/ec2metadata_test.go +++ b/pkg/ec2metadata/ec2metadata_test.go @@ -504,6 +504,91 @@ func TestGetRebalanceRecommendationEventRequestFailure(t *testing.T) { h.Assert(t, err != nil, "error expected because no server should be running") } +func TestGetASGTargetLifecycleStateSuccess(t *testing.T) { + requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state" + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100") + if req.URL.String() == "/latest/api/token" { + rw.WriteHeader(200) + _, err := rw.Write([]byte(`token`)) + h.Ok(t, err) + return + } + h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token") + h.Equals(t, req.URL.String(), requestPath) + _, err := rw.Write([]byte("InService")) + h.Ok(t, err) + })) + defer server.Close() + + expectedState := "InService" + + // Use URL from our local test server + imds := ec2metadata.New(server.URL, 1) + + state, err := imds.GetASGTargetLifecycleState() + h.Ok(t, err) + h.Equals(t, expectedState, state) +} + +func TestGetASGTargetLifecycleState404Success(t *testing.T) { + requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state" + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100") + if req.URL.String() == "/latest/api/token" { + rw.WriteHeader(200) + _, err := rw.Write([]byte(`token`)) + h.Ok(t, err) + return + } + h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token") + h.Equals(t, req.URL.String(), requestPath) + rw.WriteHeader(404) + })) + defer server.Close() + + // Use URL from our local test server + imds := ec2metadata.New(server.URL, 1) + + state, err := imds.GetASGTargetLifecycleState() + h.Ok(t, err) + h.Assert(t, state == "", "ASG target lifecycle state should be empty") +} + +func TestGetASGTargetLifecycleState500Failure(t *testing.T) { + requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state" + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100") + if req.URL.String() == "/latest/api/token" { + rw.WriteHeader(200) + _, err := rw.Write([]byte(`token`)) + h.Ok(t, err) + return + } + h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token") + h.Equals(t, req.URL.String(), requestPath) + rw.WriteHeader(500) + })) + defer server.Close() + + // Use URL from our local test server + imds := ec2metadata.New(server.URL, 1) + + _, err := imds.GetASGTargetLifecycleState() + h.Assert(t, err != nil, "error expected on non-200 or non-404 status code") +} + +func TestGetASGTargetLifecycleStateRequestFailure(t *testing.T) { + // Use URL from our local test server + imds := ec2metadata.New("/some-path-that-will-error", 1) + + _, err := imds.GetASGTargetLifecycleState() + h.Assert(t, err != nil, "error expected because no server should be running") +} + func TestGetMetadataServiceRequest404(t *testing.T) { var requestPath string = "/latest/meta-data/instance-type" diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go new file mode 100644 index 00000000..53ed5b1e --- /dev/null +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go @@ -0,0 +1,102 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package asglifecycle + +import ( + "crypto/sha256" + "fmt" + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + "time" +) + +// ASGLifecycleMonitorKind is a const to define this monitor kind +const ASGLifecycleMonitorKind = "ASG_LIFECYCLE_MONITOR" + +// ASGLifecycleMonitor is a struct definition which facilitates monitoring of ASG target lifecycle state from IMDS +type ASGLifecycleMonitor struct { + IMDS *ec2metadata.Service + InterruptionChan chan<- monitor.InterruptionEvent + CancelChan chan<- monitor.InterruptionEvent + NodeName string +} + +// NewASGLifecycleMonitor creates an instance of a ASG lifecycle IMDS monitor +func NewASGLifecycleMonitor(imds *ec2metadata.Service, interruptionChan chan<- monitor.InterruptionEvent, cancelChan chan<- monitor.InterruptionEvent, nodeName string) ASGLifecycleMonitor { + return ASGLifecycleMonitor{ + IMDS: imds, + InterruptionChan: interruptionChan, + CancelChan: cancelChan, + NodeName: nodeName, + } +} + +// Monitor continuously monitors metadata for ASG target lifecycle state and sends interruption events to the passed in channel +func (m ASGLifecycleMonitor) Monitor() error { + interruptionEvent, err := m.checkForASGTargetLifecycleStateNotice() + if err != nil { + return err + } + if interruptionEvent != nil && interruptionEvent.Kind == monitor.ASGLifecycleKind { + m.InterruptionChan <- *interruptionEvent + } + return nil +} + +// Kind denotes the kind of monitor +func (m ASGLifecycleMonitor) Kind() string { + return ASGLifecycleMonitorKind +} + +// checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice +func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) { + state, err := m.IMDS.GetASGTargetLifecycleState() + if err != nil { + return nil, fmt.Errorf("There was a problem checking for ASG target lifecycle state: %w", err) + } + if state != "Terminated" { + // if the state is not "Terminated", we can skip. State can also be empty (no hook configured). + return nil, nil + } + + nodeName := m.NodeName + // there is no time in the response, we just set time to the latest check + interruptionTime := time.Now() + + // There's no EventID returned, so we'll create it using a hash to prevent duplicates. + hash := sha256.New() + if _, err = hash.Write([]byte(fmt.Sprintf("%s:%s", state, interruptionTime))); err != nil { + return nil, fmt.Errorf("There was a problem creating an event ID from the event: %w", err) + } + + return &monitor.InterruptionEvent{ + EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)), + Kind: monitor.ASGLifecycleKind, + Monitor: ASGLifecycleMonitorKind, + StartTime: interruptionTime, + NodeName: nodeName, + Description: "AST tareget lifecycle state received. Instance will be \n", + PreDrainTask: setInterruptionTaint, + }, nil +} + +func setInterruptionTaint(interruptionEvent monitor.InterruptionEvent, n node.Node) error { + err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID) + if err != nil { + return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID, err) + } + + return nil +} diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go new file mode 100644 index 00000000..8532c16a --- /dev/null +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go @@ -0,0 +1,99 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package asglifecycle + +import ( + "context" + "testing" + "time" + + "github.com/rs/zerolog/log" + + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + h "github.com/aws/aws-node-termination-handler/pkg/test" + "github.com/aws/aws-node-termination-handler/pkg/uptime" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/kubectl/pkg/drain" +) + +const nodeName = "NAME" + +func getDrainHelper(client *fake.Clientset) *drain.Helper { + return &drain.Helper{ + Client: client, + Force: true, + GracePeriodSeconds: -1, + IgnoreAllDaemonSets: true, + DeleteEmptyDirData: true, + Timeout: time.Duration(120) * time.Second, + Out: log.Logger, + ErrOut: log.Logger, + } +} + +func TestSetInterruptionTaint(t *testing.T) { + drainEvent := monitor.InterruptionEvent{ + EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", + } + nthConfig := config.Config{ + DryRun: true, + NodeName: nodeName, + } + + client := fake.NewSimpleClientset() + _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, metav1.CreateOptions{}) + h.Ok(t, err) + + tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) + h.Ok(t, err) + + err = setInterruptionTaint(drainEvent, *tNode) + + h.Ok(t, err) +} + +func TestInterruptionTaintAlreadyPresent(t *testing.T) { + drainEvent := monitor.InterruptionEvent{ + EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", + } + nthConfig := config.Config{ + DryRun: false, + NodeName: nodeName, + } + + client := fake.NewSimpleClientset() + newNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: nodeName}, + Spec: v1.NodeSpec{Taints: []v1.Taint{{ + Key: node.RebalanceRecommendationTaint, + Value: drainEvent.EventID[:63], + Effect: v1.TaintEffectNoSchedule, + }, + }}, + } + + _, err := client.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{}) + h.Ok(t, err) + + tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) + h.Ok(t, err) + + err = setInterruptionTaint(drainEvent, *tNode) + + h.Ok(t, err) +} diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go new file mode 100644 index 00000000..cf6ab6ac --- /dev/null +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go @@ -0,0 +1,125 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package asglifecycle_test + +import ( + "github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle" + "net/http" + "net/http/httptest" + "testing" + + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + h "github.com/aws/aws-node-termination-handler/pkg/test" +) + +const ( + expFormattedTime = "2020-10-26 15:15:15 +0000 UTC" + imdsV2TokenPath = "/latest/api/token" + nodeName = "test-node" +) + +var asgTargetLifecycleStateResponse = []byte("InService") + +func TestMonitor_Success(t *testing.T) { + requestPath := ec2metadata.ASGTargetLifecycleStatePath + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + h.Equals(t, req.URL.String(), requestPath) + _, err := rw.Write(asgTargetLifecycleStateResponse) + h.Ok(t, err) + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + go func() { + result := <-drainChan + h.Equals(t, monitor.ASGLifecycleKind, result.Kind) + h.Equals(t, asglifecycle.ASGLifecycleMonitorKind, result.Monitor) + h.Equals(t, expFormattedTime, result.StartTime.String()) + }() + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Ok(t, err) +} + +func TestMonitor_MetadataParseFailure(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Ok(t, err) +} + +func TestMonitor_404Response(t *testing.T) { + requestPath := ec2metadata.ASGTargetLifecycleStatePath + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + h.Equals(t, req.URL.String(), requestPath) + http.Error(rw, "error", http.StatusNotFound) + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Ok(t, err) +} + +func TestMonitor_500Response(t *testing.T) { + requestPath := ec2metadata.ASGTargetLifecycleStatePath + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + h.Equals(t, req.URL.String(), requestPath) + http.Error(rw, "error", http.StatusInternalServerError) + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Assert(t, err != nil, "Failed to return error when 500 response") +} From 2ff59b9eeb0bb1c16f4e6ed3b516b54ac45de514 Mon Sep 17 00:00:00 2001 From: pete911 Date: Tue, 23 May 2023 08:20:00 +0100 Subject: [PATCH 2/8] update comment on target lifecycle 404 response --- pkg/ec2metadata/ec2metadata.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go index 7449d641..f64d0cde 100644 --- a/pkg/ec2metadata/ec2metadata.go +++ b/pkg/ec2metadata/ec2metadata.go @@ -199,7 +199,7 @@ func (e *Service) GetRebalanceRecommendationEvent() (rebalanceRec *RebalanceReco // if the lifecycle hook is not present on the ASG func (e *Service) GetASGTargetLifecycleState() (state string, err error) { resp, err := e.Request(ASGTargetLifecycleStatePath) - // 404s are normal when querying for the 'autoscaling/target-lifecycle-state' path and there is no lifecycle hook + // 404s should not happen, but there can be a case if the instance is brand new and the field is not populated yet if resp != nil && resp.StatusCode == 404 { return "", nil } else if resp != nil && (resp.StatusCode < 200 || resp.StatusCode >= 300) { From be55427dbfbd1dfe5ea84b2148f739ab91774c5e Mon Sep 17 00:00:00 2001 From: Lu-David Date: Tue, 27 Aug 2024 14:53:59 -0700 Subject: [PATCH 3/8] drafted test --- test/e2e/asg-lifecycle-imds-test | 126 +++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100755 test/e2e/asg-lifecycle-imds-test diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test new file mode 100755 index 00000000..0877eb37 --- /dev/null +++ b/test/e2e/asg-lifecycle-imds-test @@ -0,0 +1,126 @@ +#!/bin/bash +set -euo pipefail + +# Available env vars: +# $TMP_DIR +# $CLUSTER_NAME +# $KUBECONFIG +# $NODE_TERMINATION_HANDLER_DOCKER_REPO +# $NODE_TERMINATION_HANDLER_DOCKER_TAG +# $WEBHOOK_DOCKER_REPO +# $WEBHOOK_DOCKER_TAG +# $AEMM_URL +# $AEMM_VERSION + + +function fail_and_exit { + echo "❌ ASG Lifecycle SQS Test failed $CLUSTER_NAME ❌" + exit "${1:-1}" +} + +echo "Starting ASG Lifecycle SQS Test for Node Termination Handler" +START_TIME=$(date -u +"%Y-%m-%dT%TZ") + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +common_helm_args=() + +localstack_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-localstack" + "$SCRIPTPATH/../../config/helm/localstack/" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set defaultRegion="${AWS_REGION}" + --wait +) + +set -x +helm "${localstack_helm_args[@]}" +set +x + +sleep 10 + +RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'" +localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ + -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ + | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') +echo "🥑 Using localstack pod ${localstack_pod}" +run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") +private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') +instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') +echo "🥑 Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}" +set -x +CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" +queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) + +echo "🥑 Created SQS Queue ${queue_url}" + +anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-acth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set nodeSelector."${NTH_CONTROL_LABEL}" + --set tolerations[0].operator=Exists + --set awsAccessKeyID=foo + --set awsSecretAccessKey=bar + --set awsRegion="${AWS_REGION}" + --set awsEndpoint="http://localstack.default" + --set checkTagBeforeDraining=false + --set enableSqsTerminationDraining=true + --set queueURL="${queue_url}" + --wait +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-emtp" + "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" + --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" + --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait +) +[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x + +TAINT_CHECK_CYCLES=15 +TAINT_CHECK_SLEEP=15 + +DEPLOYED=0 + +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "✅ Verified regular-pod-test pod was scheduled and started!" + DEPLOYED=1 + break + fi + echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $DEPLOYED -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 +fi + From 66ca01d24c17225d11ebd93fdfabb379a8640d5f Mon Sep 17 00:00:00 2001 From: Lu-David Date: Wed, 28 Aug 2024 17:48:05 -0700 Subject: [PATCH 4/8] added imds support --- .../asglifecycle/asg-lifecycle-monitor.go | 7 +- test/e2e/asg-lifecycle-imds-test | 135 +++++++++++------- test/e2e/spot-interruption-test | 8 ++ test/k8s-local-cluster-test/run-test | 2 +- 4 files changed, 93 insertions(+), 59 deletions(-) diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go index 53ed5b1e..575161b7 100644 --- a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go @@ -16,10 +16,11 @@ package asglifecycle import ( "crypto/sha256" "fmt" + "time" + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" "github.com/aws/aws-node-termination-handler/pkg/monitor" "github.com/aws/aws-node-termination-handler/pkg/node" - "time" ) // ASGLifecycleMonitorKind is a const to define this monitor kind @@ -82,12 +83,12 @@ func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.I } return &monitor.InterruptionEvent{ - EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)), + EventID: fmt.Sprintf("target-lifecycle-state-terminated-%x", hash.Sum(nil)), Kind: monitor.ASGLifecycleKind, Monitor: ASGLifecycleMonitorKind, StartTime: interruptionTime, NodeName: nodeName, - Description: "AST tareget lifecycle state received. Instance will be \n", + Description: "AST target lifecycle state received. Instance will be terminated\n", PreDrainTask: setInterruptionTaint, }, nil } diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test index 0877eb37..5f273812 100755 --- a/test/e2e/asg-lifecycle-imds-test +++ b/test/e2e/asg-lifecycle-imds-test @@ -1,80 +1,42 @@ #!/bin/bash set -euo pipefail -# Available env vars: -# $TMP_DIR -# $CLUSTER_NAME -# $KUBECONFIG -# $NODE_TERMINATION_HANDLER_DOCKER_REPO -# $NODE_TERMINATION_HANDLER_DOCKER_TAG -# $WEBHOOK_DOCKER_REPO -# $WEBHOOK_DOCKER_TAG -# $AEMM_URL -# $AEMM_VERSION +# The purpose of this end-to-end test is to ensure that nodes with an NTH pod, will cordon and drain when they receive an ASG Termination event +# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting and setting the target-lifecycle-state +# For more details on expected behavior +# Reference: https://docs.aws.amazon.com/autoscaling/ec2/userguide/retrieving-target-lifecycle-state-through-imds.html + +SERVICE_NAME=amazon-ec2-metadata-mock-service function fail_and_exit { - echo "❌ ASG Lifecycle SQS Test failed $CLUSTER_NAME ❌" + echo "❌ ASG Lifecycle IMDS Test failed $CLUSTER_NAME ❌" exit "${1:-1}" } -echo "Starting ASG Lifecycle SQS Test for Node Termination Handler" +echo "Starting ASG Lifecycle IMDS Test for Node Termination Handler" START_TIME=$(date -u +"%Y-%m-%dT%TZ") SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" common_helm_args=() -localstack_helm_args=( - upgrade - --install - --namespace default - "$CLUSTER_NAME-localstack" - "$SCRIPTPATH/../../config/helm/localstack/" - --set nodeSelector."${NTH_CONTROL_LABEL}" - --set defaultRegion="${AWS_REGION}" - --wait -) - -set -x -helm "${localstack_helm_args[@]}" -set +x - -sleep 10 - -RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'" -localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \ - -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \ - | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }') -echo "🥑 Using localstack pod ${localstack_pod}" -run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}") -private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName') -instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId') -echo "🥑 Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}" -set -x -CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}" -queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl) - -echo "🥑 Created SQS Queue ${queue_url}" - anth_helm_args=( upgrade --install --namespace kube-system - "$CLUSTER_NAME-acth" + "$CLUSTER_NAME-nth" "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" - --set nodeSelector."${NTH_CONTROL_LABEL}" - --set tolerations[0].operator=Exists - --set awsAccessKeyID=foo - --set awsSecretAccessKey=bar - --set awsRegion="${AWS_REGION}" - --set awsEndpoint="http://localstack.default" - --set checkTagBeforeDraining=false - --set enableSqsTerminationDraining=true - --set queueURL="${queue_url}" + --set taintNode="true" + --set enableASGLifecycleDraining="true" + --set enableSpotInterruptionDraining="false" + --set enableScheduledEventDraining="false" + --set daemonsetTolerations="" --wait + --force ) [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") @@ -104,11 +66,30 @@ set -x helm "${emtp_helm_args[@]}" set +x +aemm_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-aemm" + "/Volumes/workplace/github/amazon-ec2-metadata-mock/helm/amazon-ec2-metadata-mock" #$AEMM_DL_URL" + --set servicePort="$IMDS_PORT" + --set 'tolerations[0].effect=NoSchedule' + --set 'tolerations[0].operator=Exists' + --set serviceName="$SERVICE_NAME" + --set arguments='{asglifecycle}' + --wait +) +[[ ${#common_helm_args[@]} -gt 0 ]] && + aemm_helm_args+=("${common_helm_args[@]}") + +set -x +retry 5 helm "${aemm_helm_args[@]}" +set +x + TAINT_CHECK_CYCLES=15 TAINT_CHECK_SLEEP=15 DEPLOYED=0 - for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then echo "✅ Verified regular-pod-test pod was scheduled and started!" @@ -124,3 +105,47 @@ if [[ $DEPLOYED -eq 0 ]]; then fail_and_exit 2 fi +# Confirm that IMDS is accessible from script +# Port forwarding allows the script to access the IMDS service running inside the cluster and make POST requests +# Port is cleaned up after script finishes running +# Sleeping ensures that the port is fully etablished before proceeding to the next commands +LOCAL_PORT=8899 +LOCAL_AEMM_URL="http://localhost:$LOCAL_PORT/latest/meta-data/autoscaling/target-lifecycle-state" +kubectl port-forward service/$SERVICE_NAME $LOCAL_PORT:$IMDS_PORT > /dev/null 2>&1 & +PORT_FORWARD_PID=$! +trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR +echo "✅ Local Port $LOCAL_PORT forwards to service $SERVICE_NAME on $IMDS_PORT" +sleep 2 + +# Update the target-lifecycle-state to Terminated +TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL) +echo "✅ Instance Metadata URL is available at $LOCAL_AEMM_URL! Current target-lifecycle-state: $TARGET_LIFECYCLE_STATE" +curl -s -X POST -H "Content-Type: application/json" -d '{"state":"Terminated"}' $LOCAL_AEMM_URL +TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL) +echo "Set target-lifecycle-state to ${TARGET_LIFECYCLE_STATE}" + +# Check that worker node was cordoned and drained +cordoned=0 +test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then + echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" + echo "✅ ASG Lifecycle IMDS Test Passed $CLUSTER_NAME! ✅" + exit 0 + fi + echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +else + echo "❌ regular-pod-test was not evicted" +fi + +fail_and_exit 1 \ No newline at end of file diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test index 36781d21..27a9f026 100755 --- a/test/e2e/spot-interruption-test +++ b/test/e2e/spot-interruption-test @@ -112,6 +112,11 @@ cordoned=0 tainted=0 test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in $(seq 1 $TAINT_CHECK_CYCLES); do + + echo "CURRENT NODES" + echo $(kubectl get nodes) + echo "\n" + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 @@ -125,6 +130,9 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then echo "✅ Verified the regular-pod-test pod was evicted!" echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅" + echo "CURRENT NODES" + echo $(kubectl get nodes) + echo "\n" exit 0 fi echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index fd80d9fd..c5b1886d 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -300,7 +300,7 @@ for assert_script in $ASSERTION_SCRIPTS; do POD_ID=$(get_nth_worker_pod || :) kubectl logs "$POD_ID" --namespace kube-system || : ## Resets cluster to run another test on the same cluster - reset_cluster + # reset_cluster echo "✅ Assertion test $assert_script PASSED! ✅" done From 574c4bda1ea650bbc1f8faa1aed20d2ffbbc4759 Mon Sep 17 00:00:00 2001 From: Lu-David Date: Thu, 29 Aug 2024 16:27:07 -0700 Subject: [PATCH 5/8] cleaned up tests --- test/e2e/asg-lifecycle-imds-test | 35 +++++++++------------------- test/e2e/spot-interruption-test | 8 ------- test/k8s-local-cluster-test/run-test | 2 +- 3 files changed, 12 insertions(+), 33 deletions(-) diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test index 5f273812..ad3a9de4 100755 --- a/test/e2e/asg-lifecycle-imds-test +++ b/test/e2e/asg-lifecycle-imds-test @@ -1,13 +1,20 @@ #!/bin/bash set -euo pipefail - # The purpose of this end-to-end test is to ensure that nodes with an NTH pod, will cordon and drain when they receive an ASG Termination event -# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting and setting the target-lifecycle-state +# through Instance Metadata Service (IMDS) +# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting target-lifecycle-state # For more details on expected behavior # Reference: https://docs.aws.amazon.com/autoscaling/ec2/userguide/retrieving-target-lifecycle-state-through-imds.html -SERVICE_NAME=amazon-ec2-metadata-mock-service +# Available env vars: +# $CLUSTER_NAME +# $NODE_TERMINATION_HANDLER_DOCKER_REPO +# $NODE_TERMINATION_HANDLER_DOCKER_TAG +# $WEBHOOK_DOCKER_REPO +# $WEBHOOK_DOCKER_TAG +# $AEMM_URL +# $IMDS_PORT function fail_and_exit { echo "❌ ASG Lifecycle IMDS Test failed $CLUSTER_NAME ❌" @@ -15,7 +22,6 @@ function fail_and_exit { } echo "Starting ASG Lifecycle IMDS Test for Node Termination Handler" -START_TIME=$(date -u +"%Y-%m-%dT%TZ") SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" @@ -71,11 +77,10 @@ aemm_helm_args=( --install --namespace default "$CLUSTER_NAME-aemm" - "/Volumes/workplace/github/amazon-ec2-metadata-mock/helm/amazon-ec2-metadata-mock" #$AEMM_DL_URL" + "$AEMM_DL_URL" --set servicePort="$IMDS_PORT" --set 'tolerations[0].effect=NoSchedule' --set 'tolerations[0].operator=Exists' - --set serviceName="$SERVICE_NAME" --set arguments='{asglifecycle}' --wait ) @@ -105,24 +110,6 @@ if [[ $DEPLOYED -eq 0 ]]; then fail_and_exit 2 fi -# Confirm that IMDS is accessible from script -# Port forwarding allows the script to access the IMDS service running inside the cluster and make POST requests -# Port is cleaned up after script finishes running -# Sleeping ensures that the port is fully etablished before proceeding to the next commands -LOCAL_PORT=8899 -LOCAL_AEMM_URL="http://localhost:$LOCAL_PORT/latest/meta-data/autoscaling/target-lifecycle-state" -kubectl port-forward service/$SERVICE_NAME $LOCAL_PORT:$IMDS_PORT > /dev/null 2>&1 & -PORT_FORWARD_PID=$! -trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR -echo "✅ Local Port $LOCAL_PORT forwards to service $SERVICE_NAME on $IMDS_PORT" -sleep 2 - -# Update the target-lifecycle-state to Terminated -TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL) -echo "✅ Instance Metadata URL is available at $LOCAL_AEMM_URL! Current target-lifecycle-state: $TARGET_LIFECYCLE_STATE" -curl -s -X POST -H "Content-Type: application/json" -d '{"state":"Terminated"}' $LOCAL_AEMM_URL -TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL) -echo "Set target-lifecycle-state to ${TARGET_LIFECYCLE_STATE}" # Check that worker node was cordoned and drained cordoned=0 diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test index 27a9f026..36781d21 100755 --- a/test/e2e/spot-interruption-test +++ b/test/e2e/spot-interruption-test @@ -112,11 +112,6 @@ cordoned=0 tainted=0 test_node=${TEST_NODE:-$CLUSTER_NAME-worker} for i in $(seq 1 $TAINT_CHECK_CYCLES); do - - echo "CURRENT NODES" - echo $(kubectl get nodes) - echo "\n" - if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then echo "✅ Verified the worker node was cordoned!" cordoned=1 @@ -130,9 +125,6 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then echo "✅ Verified the regular-pod-test pod was evicted!" echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅" - echo "CURRENT NODES" - echo $(kubectl get nodes) - echo "\n" exit 0 fi echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index c5b1886d..fd80d9fd 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -300,7 +300,7 @@ for assert_script in $ASSERTION_SCRIPTS; do POD_ID=$(get_nth_worker_pod || :) kubectl logs "$POD_ID" --namespace kube-system || : ## Resets cluster to run another test on the same cluster - # reset_cluster + reset_cluster echo "✅ Assertion test $assert_script PASSED! ✅" done From 69debdd49e90780adeec890c94e1c13cdbbd03b5 Mon Sep 17 00:00:00 2001 From: Lu-David Date: Wed, 4 Sep 2024 13:33:23 -0700 Subject: [PATCH 6/8] cleaned comments and unit-test --- pkg/monitor/asglifecycle/asg-lifecycle-monitor.go | 2 +- pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go index 575161b7..a623ae96 100644 --- a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go @@ -61,7 +61,7 @@ func (m ASGLifecycleMonitor) Kind() string { return ASGLifecycleMonitorKind } -// checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice +// checkForASGTargetLifecycleStateNotice Checks EC2 instance metadata for a asg lifecycle termination notice func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) { state, err := m.IMDS.GetASGTargetLifecycleState() if err != nil { diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go index 8532c16a..539b3b1a 100644 --- a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go @@ -80,7 +80,7 @@ func TestInterruptionTaintAlreadyPresent(t *testing.T) { newNode := &v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: nodeName}, Spec: v1.NodeSpec{Taints: []v1.Taint{{ - Key: node.RebalanceRecommendationTaint, + Key: node.ASGLifecycleTerminationTaint, Value: drainEvent.EventID[:63], Effect: v1.TaintEffectNoSchedule, }, From 3d0074623f7b83377a22dbe7527e0199bd1e8d3b Mon Sep 17 00:00:00 2001 From: David Lu Date: Mon, 9 Sep 2024 14:59:23 -0700 Subject: [PATCH 7/8] updated aemm mock version --- test/k8s-local-cluster-test/run-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index fd80d9fd..7c820cfe 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -18,7 +18,7 @@ WEBHOOK_DOCKER_IMG="" OVERRIDE_PATH=0 K8S_VERSION="1.30" AEMM_URL="amazon-ec2-metadata-mock-service.default.svc.cluster.local" -AEMM_VERSION="1.8.1" +AEMM_VERSION="1.12.0" AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz" WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local"} From c6d065ccedb677bd47f19f9d58fd163adedd842f Mon Sep 17 00:00:00 2001 From: David Lu Date: Tue, 10 Sep 2024 12:53:02 -0700 Subject: [PATCH 8/8] eks test update AEMM version --- test/eks-cluster-test/run-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test index 949f68d1..a1e3e316 100755 --- a/test/eks-cluster-test/run-test +++ b/test/eks-cluster-test/run-test @@ -6,7 +6,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" PRESERVE=true export TEST_WINDOWS="false" -export AEMM_VERSION="1.8.1" +export AEMM_VERSION="1.12.0" export AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz" export CLUSTER_CONFIG_FILE=$SCRIPTPATH/cluster-spec.yaml