From fa9eaa3346993d0bad55af2fd6cf783130d804ff Mon Sep 17 00:00:00 2001
From: pete911
Date: Fri, 19 May 2023 20:54:11 +0100
Subject: [PATCH 1/8] add imds asg target lifecycle hook
---
README.md | 2 +-
cmd/node-termination-handler.go | 6 +
.../templates/daemonset.linux.yaml | 2 +
.../templates/daemonset.windows.yaml | 2 +
.../aws-node-termination-handler/values.yaml | 3 +
pkg/config/config.go | 4 +
pkg/config/config_test.go | 7 +
pkg/ec2metadata/ec2metadata.go | 24 ++++
pkg/ec2metadata/ec2metadata_test.go | 85 ++++++++++++
.../asglifecycle/asg-lifecycle-monitor.go | 102 ++++++++++++++
.../asg-lifecycle-monitor_internal_test.go | 99 ++++++++++++++
.../asg-lifecycle-monitor_test.go | 125 ++++++++++++++++++
12 files changed, 460 insertions(+), 1 deletion(-)
create mode 100644 pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
create mode 100644 pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go
create mode 100644 pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go
diff --git a/README.md b/README.md
index 0c3437b2..6807611f 100644
--- a/README.md
+++ b/README.md
@@ -75,8 +75,8 @@ Must be deployed as a Kubernetes **Deployment**. Also requires some **additional
| Spot Instance Termination Notifications (ITN) | ✅ | ✅ |
| Scheduled Events | ✅ | ✅ |
| Instance Rebalance Recommendation | ✅ | ✅ |
+| ASG Termination Lifecycle Hooks | ✅ | ✅ |
| AZ Rebalance Recommendation | ❌ | ✅ |
-| ASG Termination Lifecycle Hooks | ❌ | ✅ |
| Instance State Change Events | ❌ | ✅ |
diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go
index 9f7dcaf1..97f33870 100644
--- a/cmd/node-termination-handler.go
+++ b/cmd/node-termination-handler.go
@@ -16,6 +16,7 @@ package main
import (
"context"
"fmt"
+ "github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle"
"os"
"os/signal"
"strings"
@@ -50,6 +51,7 @@ import (
const (
scheduledMaintenance = "Scheduled Maintenance"
spotITN = "Spot ITN"
+ asgLifecycle = "ASG Lifecycle"
rebalanceRecommendation = "Rebalance Recommendation"
sqsEvents = "SQS Event"
timeFormat = "2006/01/02 15:04:05"
@@ -172,6 +174,10 @@ func main() {
imdsSpotMonitor := spotitn.NewSpotInterruptionMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName)
monitoringFns[spotITN] = imdsSpotMonitor
}
+ if nthConfig.EnableASGLifecycleDraining {
+ asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName)
+ monitoringFns[asgLifecycle] = asgLifecycleMonitor
+ }
if nthConfig.EnableScheduledEventDraining {
imdsScheduledEventMonitor := scheduledevent.NewScheduledEventMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName)
monitoringFns[scheduledMaintenance] = imdsScheduledEventMonitor
diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml
index 95e4b50f..be6385de 100644
--- a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml
+++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml
@@ -143,6 +143,8 @@ spec:
{{- end }}
- name: ENABLE_SPOT_INTERRUPTION_DRAINING
value: {{ .Values.enableSpotInterruptionDraining | quote }}
+ - name: ENABLE_ASG_LIFECYCLE_DRAINING
+ value: {{ .Values.enableASGLifecycleDraining | quote }}
- name: ENABLE_SCHEDULED_EVENT_DRAINING
value: {{ .Values.enableScheduledEventDraining | quote }}
- name: ENABLE_REBALANCE_MONITORING
diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml
index 8a9db7bf..95af69d1 100644
--- a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml
+++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml
@@ -143,6 +143,8 @@ spec:
{{- end }}
- name: ENABLE_SPOT_INTERRUPTION_DRAINING
value: {{ .Values.enableSpotInterruptionDraining | quote }}
+ - name: ENABLE_ASG_LIFECYCLE_DRAINING
+ value: {{ .Values.enableASGLifecycleDraining | quote }}
- name: ENABLE_SCHEDULED_EVENT_DRAINING
value: {{ .Values.enableScheduledEventDraining | quote }}
- name: ENABLE_REBALANCE_MONITORING
diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml
index 3a4d5db4..864c5f96 100644
--- a/config/helm/aws-node-termination-handler/values.yaml
+++ b/config/helm/aws-node-termination-handler/values.yaml
@@ -268,6 +268,9 @@ metadataTries: 3
# enableSpotInterruptionDraining If false, do not drain nodes when the spot interruption termination notice is received. Only used in IMDS mode.
enableSpotInterruptionDraining: true
+# enableASGLifecycleDraining If false, do not drain nodes when ASG target lifecycle state Terminated is received. Only used in IMDS mode.
+enableASGLifecycleDraining: true
+
# enableScheduledEventDraining If false, do not drain nodes before the maintenance window starts for an EC2 instance scheduled event. Only used in IMDS mode.
enableScheduledEventDraining: true
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 4eabaf00..3fd79c25 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -53,6 +53,8 @@ const (
enableScheduledEventDrainingDefault = true
enableSpotInterruptionDrainingConfigKey = "ENABLE_SPOT_INTERRUPTION_DRAINING"
enableSpotInterruptionDrainingDefault = true
+ enableASGLifecycleDrainingConfigKey = "ENABLE_ASG_LIFECYCLE_DRAINING"
+ enableASGLifecycleDrainingDefault = true
enableSQSTerminationDrainingConfigKey = "ENABLE_SQS_TERMINATION_DRAINING"
enableSQSTerminationDrainingDefault = false
enableRebalanceMonitoringConfigKey = "ENABLE_REBALANCE_MONITORING"
@@ -131,6 +133,7 @@ type Config struct {
WebhookProxy string
EnableScheduledEventDraining bool
EnableSpotInterruptionDraining bool
+ EnableASGLifecycleDraining bool
EnableSQSTerminationDraining bool
EnableRebalanceMonitoring bool
EnableRebalanceDraining bool
@@ -193,6 +196,7 @@ func ParseCliArgs() (config Config, err error) {
flag.StringVar(&config.WebhookTemplateFile, "webhook-template-file", getEnv(webhookTemplateFileConfigKey, ""), "If specified, replaces the default webhook message template with content from template file.")
flag.BoolVar(&config.EnableScheduledEventDraining, "enable-scheduled-event-draining", getBoolEnv(enableScheduledEventDrainingConfigKey, enableScheduledEventDrainingDefault), "If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event")
flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received")
+ flag.BoolVar(&config.EnableASGLifecycleDraining, "enable-asg-lifecycle-draining", getBoolEnv(enableASGLifecycleDrainingConfigKey, enableASGLifecycleDrainingDefault), "If true, drain nodes when the ASG target lifecyle state is Terminated is received")
flag.BoolVar(&config.EnableSQSTerminationDraining, "enable-sqs-termination-draining", getBoolEnv(enableSQSTerminationDrainingConfigKey, enableSQSTerminationDrainingDefault), "If true, drain nodes when an SQS termination event is received")
flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received. If you'd like to drain the node in addition to cordoning, then also set \"enableRebalanceDraining\".")
flag.BoolVar(&config.EnableRebalanceDraining, "enable-rebalance-draining", getBoolEnv(enableRebalanceDrainingConfigKey, enableRebalanceDrainingDefault), "If true, drain nodes when the rebalance recommendation notice is received")
diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go
index e1f7a1ba..4b051b12 100644
--- a/pkg/config/config_test.go
+++ b/pkg/config/config_test.go
@@ -36,6 +36,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) {
t.Setenv("DRY_RUN", "true")
t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "true")
t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "false")
+ t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "false")
t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false")
t.Setenv("ENABLE_REBALANCE_MONITORING", "true")
t.Setenv("ENABLE_REBALANCE_DRAINING", "true")
@@ -61,6 +62,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) {
h.Equals(t, true, nthConfig.DryRun)
h.Equals(t, true, nthConfig.EnableScheduledEventDraining)
h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining)
+ h.Equals(t, false, nthConfig.EnableASGLifecycleDraining)
h.Equals(t, false, nthConfig.EnableSQSTerminationDraining)
h.Equals(t, true, nthConfig.EnableRebalanceMonitoring)
h.Equals(t, true, nthConfig.EnableRebalanceDraining)
@@ -96,6 +98,7 @@ func TestParseCliArgsSuccess(t *testing.T) {
"--dry-run=true",
"--enable-scheduled-event-draining=true",
"--enable-spot-interruption-draining=false",
+ "--enable-asg-lifecycle-draining=false",
"--enable-sqs-termination-draining=false",
"--enable-rebalance-monitoring=true",
"--enable-rebalance-draining=true",
@@ -121,6 +124,7 @@ func TestParseCliArgsSuccess(t *testing.T) {
h.Equals(t, true, nthConfig.DryRun)
h.Equals(t, true, nthConfig.EnableScheduledEventDraining)
h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining)
+ h.Equals(t, false, nthConfig.EnableASGLifecycleDraining)
h.Equals(t, false, nthConfig.EnableSQSTerminationDraining)
h.Equals(t, true, nthConfig.EnableRebalanceMonitoring)
h.Equals(t, true, nthConfig.EnableRebalanceDraining)
@@ -151,6 +155,7 @@ func TestParseCliArgsOverrides(t *testing.T) {
t.Setenv("DRY_RUN", "false")
t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "false")
t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "true")
+ t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "true")
t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false")
t.Setenv("ENABLE_REBALANCE_MONITORING", "true")
t.Setenv("ENABLE_REBALANCE_DRAINING", "true")
@@ -174,6 +179,7 @@ func TestParseCliArgsOverrides(t *testing.T) {
"--dry-run=true",
"--enable-scheduled-event-draining=true",
"--enable-spot-interruption-draining=false",
+ "--enable-asg-lifecycle-draining=false",
"--enable-sqs-termination-draining=true",
"--enable-rebalance-monitoring=false",
"--enable-rebalance-draining=false",
@@ -201,6 +207,7 @@ func TestParseCliArgsOverrides(t *testing.T) {
h.Equals(t, true, nthConfig.DryRun)
h.Equals(t, true, nthConfig.EnableScheduledEventDraining)
h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining)
+ h.Equals(t, false, nthConfig.EnableASGLifecycleDraining)
h.Equals(t, true, nthConfig.EnableSQSTerminationDraining)
h.Equals(t, false, nthConfig.EnableRebalanceMonitoring)
h.Equals(t, false, nthConfig.EnableRebalanceDraining)
diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go
index f7f5ade8..7449d641 100644
--- a/pkg/ec2metadata/ec2metadata.go
+++ b/pkg/ec2metadata/ec2metadata.go
@@ -30,6 +30,8 @@ import (
const (
// SpotInstanceActionPath is the context path to spot/instance-action within IMDS
SpotInstanceActionPath = "/latest/meta-data/spot/instance-action"
+ // ASGTargetLifecycleStatePath path to autoscaling target lifecycle state within IMDS
+ ASGTargetLifecycleStatePath = "/latest/meta-data/autoscaling/target-lifecycle-state"
// ScheduledEventPath is the context path to events/maintenance/scheduled within IMDS
ScheduledEventPath = "/latest/meta-data/events/maintenance/scheduled"
// RebalanceRecommendationPath is the context path to events/recommendations/rebalance within IMDS
@@ -193,6 +195,28 @@ func (e *Service) GetRebalanceRecommendationEvent() (rebalanceRec *RebalanceReco
return rebalanceRec, nil
}
+// GetASGTargetLifecycleState retrieves ASG target lifecycle state from imds. State can be empty string
+// if the lifecycle hook is not present on the ASG
+func (e *Service) GetASGTargetLifecycleState() (state string, err error) {
+ resp, err := e.Request(ASGTargetLifecycleStatePath)
+ // 404s are normal when querying for the 'autoscaling/target-lifecycle-state' path and there is no lifecycle hook
+ if resp != nil && resp.StatusCode == 404 {
+ return "", nil
+ } else if resp != nil && (resp.StatusCode < 200 || resp.StatusCode >= 300) {
+ return "", fmt.Errorf("Metadata request received http status code: %d", resp.StatusCode)
+ }
+ if err != nil {
+ return "", fmt.Errorf("Unable to parse metadata response: %w", err)
+ }
+ defer resp.Body.Close()
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return "", fmt.Errorf("Unable to parse http response. Status code: %d. %w", resp.StatusCode, err)
+ }
+ return string(body), nil
+}
+
// GetMetadataInfo generic function for retrieving ec2 metadata
func (e *Service) GetMetadataInfo(path string) (info string, err error) {
metadataInfo := ""
diff --git a/pkg/ec2metadata/ec2metadata_test.go b/pkg/ec2metadata/ec2metadata_test.go
index 6fe58301..4a698c22 100644
--- a/pkg/ec2metadata/ec2metadata_test.go
+++ b/pkg/ec2metadata/ec2metadata_test.go
@@ -504,6 +504,91 @@ func TestGetRebalanceRecommendationEventRequestFailure(t *testing.T) {
h.Assert(t, err != nil, "error expected because no server should be running")
}
+func TestGetASGTargetLifecycleStateSuccess(t *testing.T) {
+ requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state"
+
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100")
+ if req.URL.String() == "/latest/api/token" {
+ rw.WriteHeader(200)
+ _, err := rw.Write([]byte(`token`))
+ h.Ok(t, err)
+ return
+ }
+ h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token")
+ h.Equals(t, req.URL.String(), requestPath)
+ _, err := rw.Write([]byte("InService"))
+ h.Ok(t, err)
+ }))
+ defer server.Close()
+
+ expectedState := "InService"
+
+ // Use URL from our local test server
+ imds := ec2metadata.New(server.URL, 1)
+
+ state, err := imds.GetASGTargetLifecycleState()
+ h.Ok(t, err)
+ h.Equals(t, expectedState, state)
+}
+
+func TestGetASGTargetLifecycleState404Success(t *testing.T) {
+ requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state"
+
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100")
+ if req.URL.String() == "/latest/api/token" {
+ rw.WriteHeader(200)
+ _, err := rw.Write([]byte(`token`))
+ h.Ok(t, err)
+ return
+ }
+ h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token")
+ h.Equals(t, req.URL.String(), requestPath)
+ rw.WriteHeader(404)
+ }))
+ defer server.Close()
+
+ // Use URL from our local test server
+ imds := ec2metadata.New(server.URL, 1)
+
+ state, err := imds.GetASGTargetLifecycleState()
+ h.Ok(t, err)
+ h.Assert(t, state == "", "ASG target lifecycle state should be empty")
+}
+
+func TestGetASGTargetLifecycleState500Failure(t *testing.T) {
+ requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state"
+
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100")
+ if req.URL.String() == "/latest/api/token" {
+ rw.WriteHeader(200)
+ _, err := rw.Write([]byte(`token`))
+ h.Ok(t, err)
+ return
+ }
+ h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token")
+ h.Equals(t, req.URL.String(), requestPath)
+ rw.WriteHeader(500)
+ }))
+ defer server.Close()
+
+ // Use URL from our local test server
+ imds := ec2metadata.New(server.URL, 1)
+
+ _, err := imds.GetASGTargetLifecycleState()
+ h.Assert(t, err != nil, "error expected on non-200 or non-404 status code")
+}
+
+func TestGetASGTargetLifecycleStateRequestFailure(t *testing.T) {
+ // Use URL from our local test server
+ imds := ec2metadata.New("/some-path-that-will-error", 1)
+
+ _, err := imds.GetASGTargetLifecycleState()
+ h.Assert(t, err != nil, "error expected because no server should be running")
+}
+
func TestGetMetadataServiceRequest404(t *testing.T) {
var requestPath string = "/latest/meta-data/instance-type"
diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
new file mode 100644
index 00000000..53ed5b1e
--- /dev/null
+++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
@@ -0,0 +1,102 @@
+// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"). You may
+// not use this file except in compliance with the License. A copy of the
+// License is located at
+//
+// http://aws.amazon.com/apache2.0/
+//
+// or in the "license" file accompanying this file. This file is distributed
+// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+// express or implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package asglifecycle
+
+import (
+ "crypto/sha256"
+ "fmt"
+ "github.com/aws/aws-node-termination-handler/pkg/ec2metadata"
+ "github.com/aws/aws-node-termination-handler/pkg/monitor"
+ "github.com/aws/aws-node-termination-handler/pkg/node"
+ "time"
+)
+
+// ASGLifecycleMonitorKind is a const to define this monitor kind
+const ASGLifecycleMonitorKind = "ASG_LIFECYCLE_MONITOR"
+
+// ASGLifecycleMonitor is a struct definition which facilitates monitoring of ASG target lifecycle state from IMDS
+type ASGLifecycleMonitor struct {
+ IMDS *ec2metadata.Service
+ InterruptionChan chan<- monitor.InterruptionEvent
+ CancelChan chan<- monitor.InterruptionEvent
+ NodeName string
+}
+
+// NewASGLifecycleMonitor creates an instance of a ASG lifecycle IMDS monitor
+func NewASGLifecycleMonitor(imds *ec2metadata.Service, interruptionChan chan<- monitor.InterruptionEvent, cancelChan chan<- monitor.InterruptionEvent, nodeName string) ASGLifecycleMonitor {
+ return ASGLifecycleMonitor{
+ IMDS: imds,
+ InterruptionChan: interruptionChan,
+ CancelChan: cancelChan,
+ NodeName: nodeName,
+ }
+}
+
+// Monitor continuously monitors metadata for ASG target lifecycle state and sends interruption events to the passed in channel
+func (m ASGLifecycleMonitor) Monitor() error {
+ interruptionEvent, err := m.checkForASGTargetLifecycleStateNotice()
+ if err != nil {
+ return err
+ }
+ if interruptionEvent != nil && interruptionEvent.Kind == monitor.ASGLifecycleKind {
+ m.InterruptionChan <- *interruptionEvent
+ }
+ return nil
+}
+
+// Kind denotes the kind of monitor
+func (m ASGLifecycleMonitor) Kind() string {
+ return ASGLifecycleMonitorKind
+}
+
+// checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice
+func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) {
+ state, err := m.IMDS.GetASGTargetLifecycleState()
+ if err != nil {
+ return nil, fmt.Errorf("There was a problem checking for ASG target lifecycle state: %w", err)
+ }
+ if state != "Terminated" {
+ // if the state is not "Terminated", we can skip. State can also be empty (no hook configured).
+ return nil, nil
+ }
+
+ nodeName := m.NodeName
+ // there is no time in the response, we just set time to the latest check
+ interruptionTime := time.Now()
+
+ // There's no EventID returned, so we'll create it using a hash to prevent duplicates.
+ hash := sha256.New()
+ if _, err = hash.Write([]byte(fmt.Sprintf("%s:%s", state, interruptionTime))); err != nil {
+ return nil, fmt.Errorf("There was a problem creating an event ID from the event: %w", err)
+ }
+
+ return &monitor.InterruptionEvent{
+ EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
+ Kind: monitor.ASGLifecycleKind,
+ Monitor: ASGLifecycleMonitorKind,
+ StartTime: interruptionTime,
+ NodeName: nodeName,
+ Description: "AST tareget lifecycle state received. Instance will be \n",
+ PreDrainTask: setInterruptionTaint,
+ }, nil
+}
+
+func setInterruptionTaint(interruptionEvent monitor.InterruptionEvent, n node.Node) error {
+ err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID)
+ if err != nil {
+ return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID, err)
+ }
+
+ return nil
+}
diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go
new file mode 100644
index 00000000..8532c16a
--- /dev/null
+++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go
@@ -0,0 +1,99 @@
+// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"). You may
+// not use this file except in compliance with the License. A copy of the
+// License is located at
+//
+// http://aws.amazon.com/apache2.0/
+//
+// or in the "license" file accompanying this file. This file is distributed
+// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+// express or implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package asglifecycle
+
+import (
+ "context"
+ "testing"
+ "time"
+
+ "github.com/rs/zerolog/log"
+
+ "github.com/aws/aws-node-termination-handler/pkg/config"
+ "github.com/aws/aws-node-termination-handler/pkg/monitor"
+ "github.com/aws/aws-node-termination-handler/pkg/node"
+ h "github.com/aws/aws-node-termination-handler/pkg/test"
+ "github.com/aws/aws-node-termination-handler/pkg/uptime"
+ v1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/client-go/kubernetes/fake"
+ "k8s.io/kubectl/pkg/drain"
+)
+
+const nodeName = "NAME"
+
+func getDrainHelper(client *fake.Clientset) *drain.Helper {
+ return &drain.Helper{
+ Client: client,
+ Force: true,
+ GracePeriodSeconds: -1,
+ IgnoreAllDaemonSets: true,
+ DeleteEmptyDirData: true,
+ Timeout: time.Duration(120) * time.Second,
+ Out: log.Logger,
+ ErrOut: log.Logger,
+ }
+}
+
+func TestSetInterruptionTaint(t *testing.T) {
+ drainEvent := monitor.InterruptionEvent{
+ EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
+ }
+ nthConfig := config.Config{
+ DryRun: true,
+ NodeName: nodeName,
+ }
+
+ client := fake.NewSimpleClientset()
+ _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, metav1.CreateOptions{})
+ h.Ok(t, err)
+
+ tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime)
+ h.Ok(t, err)
+
+ err = setInterruptionTaint(drainEvent, *tNode)
+
+ h.Ok(t, err)
+}
+
+func TestInterruptionTaintAlreadyPresent(t *testing.T) {
+ drainEvent := monitor.InterruptionEvent{
+ EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
+ }
+ nthConfig := config.Config{
+ DryRun: false,
+ NodeName: nodeName,
+ }
+
+ client := fake.NewSimpleClientset()
+ newNode := &v1.Node{
+ ObjectMeta: metav1.ObjectMeta{Name: nodeName},
+ Spec: v1.NodeSpec{Taints: []v1.Taint{{
+ Key: node.RebalanceRecommendationTaint,
+ Value: drainEvent.EventID[:63],
+ Effect: v1.TaintEffectNoSchedule,
+ },
+ }},
+ }
+
+ _, err := client.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{})
+ h.Ok(t, err)
+
+ tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime)
+ h.Ok(t, err)
+
+ err = setInterruptionTaint(drainEvent, *tNode)
+
+ h.Ok(t, err)
+}
diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go
new file mode 100644
index 00000000..cf6ab6ac
--- /dev/null
+++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go
@@ -0,0 +1,125 @@
+// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"). You may
+// not use this file except in compliance with the License. A copy of the
+// License is located at
+//
+// http://aws.amazon.com/apache2.0/
+//
+// or in the "license" file accompanying this file. This file is distributed
+// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+// express or implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package asglifecycle_test
+
+import (
+ "github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+
+ "github.com/aws/aws-node-termination-handler/pkg/ec2metadata"
+ "github.com/aws/aws-node-termination-handler/pkg/monitor"
+ h "github.com/aws/aws-node-termination-handler/pkg/test"
+)
+
+const (
+ expFormattedTime = "2020-10-26 15:15:15 +0000 UTC"
+ imdsV2TokenPath = "/latest/api/token"
+ nodeName = "test-node"
+)
+
+var asgTargetLifecycleStateResponse = []byte("InService")
+
+func TestMonitor_Success(t *testing.T) {
+ requestPath := ec2metadata.ASGTargetLifecycleStatePath
+
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ if imdsV2TokenPath == req.URL.String() {
+ rw.WriteHeader(403)
+ return
+ }
+ h.Equals(t, req.URL.String(), requestPath)
+ _, err := rw.Write(asgTargetLifecycleStateResponse)
+ h.Ok(t, err)
+ }))
+ defer server.Close()
+
+ drainChan := make(chan monitor.InterruptionEvent)
+ cancelChan := make(chan monitor.InterruptionEvent)
+ imds := ec2metadata.New(server.URL, 1)
+
+ go func() {
+ result := <-drainChan
+ h.Equals(t, monitor.ASGLifecycleKind, result.Kind)
+ h.Equals(t, asglifecycle.ASGLifecycleMonitorKind, result.Monitor)
+ h.Equals(t, expFormattedTime, result.StartTime.String())
+ }()
+
+ asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName)
+ err := asgLifecycleMonitor.Monitor()
+ h.Ok(t, err)
+}
+
+func TestMonitor_MetadataParseFailure(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ if imdsV2TokenPath == req.URL.String() {
+ rw.WriteHeader(403)
+ return
+ }
+ }))
+ defer server.Close()
+
+ drainChan := make(chan monitor.InterruptionEvent)
+ cancelChan := make(chan monitor.InterruptionEvent)
+ imds := ec2metadata.New(server.URL, 1)
+
+ asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName)
+ err := asgLifecycleMonitor.Monitor()
+ h.Ok(t, err)
+}
+
+func TestMonitor_404Response(t *testing.T) {
+ requestPath := ec2metadata.ASGTargetLifecycleStatePath
+
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ if imdsV2TokenPath == req.URL.String() {
+ rw.WriteHeader(403)
+ return
+ }
+ h.Equals(t, req.URL.String(), requestPath)
+ http.Error(rw, "error", http.StatusNotFound)
+ }))
+ defer server.Close()
+
+ drainChan := make(chan monitor.InterruptionEvent)
+ cancelChan := make(chan monitor.InterruptionEvent)
+ imds := ec2metadata.New(server.URL, 1)
+
+ asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName)
+ err := asgLifecycleMonitor.Monitor()
+ h.Ok(t, err)
+}
+
+func TestMonitor_500Response(t *testing.T) {
+ requestPath := ec2metadata.ASGTargetLifecycleStatePath
+
+ server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+ if imdsV2TokenPath == req.URL.String() {
+ rw.WriteHeader(403)
+ return
+ }
+ h.Equals(t, req.URL.String(), requestPath)
+ http.Error(rw, "error", http.StatusInternalServerError)
+ }))
+ defer server.Close()
+
+ drainChan := make(chan monitor.InterruptionEvent)
+ cancelChan := make(chan monitor.InterruptionEvent)
+ imds := ec2metadata.New(server.URL, 1)
+
+ asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName)
+ err := asgLifecycleMonitor.Monitor()
+ h.Assert(t, err != nil, "Failed to return error when 500 response")
+}
From 2ff59b9eeb0bb1c16f4e6ed3b516b54ac45de514 Mon Sep 17 00:00:00 2001
From: pete911
Date: Tue, 23 May 2023 08:20:00 +0100
Subject: [PATCH 2/8] update comment on target lifecycle 404 response
---
pkg/ec2metadata/ec2metadata.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go
index 7449d641..f64d0cde 100644
--- a/pkg/ec2metadata/ec2metadata.go
+++ b/pkg/ec2metadata/ec2metadata.go
@@ -199,7 +199,7 @@ func (e *Service) GetRebalanceRecommendationEvent() (rebalanceRec *RebalanceReco
// if the lifecycle hook is not present on the ASG
func (e *Service) GetASGTargetLifecycleState() (state string, err error) {
resp, err := e.Request(ASGTargetLifecycleStatePath)
- // 404s are normal when querying for the 'autoscaling/target-lifecycle-state' path and there is no lifecycle hook
+ // 404s should not happen, but there can be a case if the instance is brand new and the field is not populated yet
if resp != nil && resp.StatusCode == 404 {
return "", nil
} else if resp != nil && (resp.StatusCode < 200 || resp.StatusCode >= 300) {
From be55427dbfbd1dfe5ea84b2148f739ab91774c5e Mon Sep 17 00:00:00 2001
From: Lu-David
Date: Tue, 27 Aug 2024 14:53:59 -0700
Subject: [PATCH 3/8] drafted test
---
test/e2e/asg-lifecycle-imds-test | 126 +++++++++++++++++++++++++++++++
1 file changed, 126 insertions(+)
create mode 100755 test/e2e/asg-lifecycle-imds-test
diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test
new file mode 100755
index 00000000..0877eb37
--- /dev/null
+++ b/test/e2e/asg-lifecycle-imds-test
@@ -0,0 +1,126 @@
+#!/bin/bash
+set -euo pipefail
+
+# Available env vars:
+# $TMP_DIR
+# $CLUSTER_NAME
+# $KUBECONFIG
+# $NODE_TERMINATION_HANDLER_DOCKER_REPO
+# $NODE_TERMINATION_HANDLER_DOCKER_TAG
+# $WEBHOOK_DOCKER_REPO
+# $WEBHOOK_DOCKER_TAG
+# $AEMM_URL
+# $AEMM_VERSION
+
+
+function fail_and_exit {
+ echo "❌ ASG Lifecycle SQS Test failed $CLUSTER_NAME ❌"
+ exit "${1:-1}"
+}
+
+echo "Starting ASG Lifecycle SQS Test for Node Termination Handler"
+START_TIME=$(date -u +"%Y-%m-%dT%TZ")
+
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+
+common_helm_args=()
+
+localstack_helm_args=(
+ upgrade
+ --install
+ --namespace default
+ "$CLUSTER_NAME-localstack"
+ "$SCRIPTPATH/../../config/helm/localstack/"
+ --set nodeSelector."${NTH_CONTROL_LABEL}"
+ --set defaultRegion="${AWS_REGION}"
+ --wait
+)
+
+set -x
+helm "${localstack_helm_args[@]}"
+set +x
+
+sleep 10
+
+RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'"
+localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \
+ -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \
+ | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }')
+echo "🥑 Using localstack pod ${localstack_pod}"
+run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}")
+private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName')
+instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId')
+echo "🥑 Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}"
+set -x
+CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}"
+queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl)
+
+echo "🥑 Created SQS Queue ${queue_url}"
+
+anth_helm_args=(
+ upgrade
+ --install
+ --namespace kube-system
+ "$CLUSTER_NAME-acth"
+ "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
+ --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
+ --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
+ --set nodeSelector."${NTH_CONTROL_LABEL}"
+ --set tolerations[0].operator=Exists
+ --set awsAccessKeyID=foo
+ --set awsSecretAccessKey=bar
+ --set awsRegion="${AWS_REGION}"
+ --set awsEndpoint="http://localstack.default"
+ --set checkTagBeforeDraining=false
+ --set enableSqsTerminationDraining=true
+ --set queueURL="${queue_url}"
+ --wait
+)
+[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] &&
+ anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
+[[ ${#common_helm_args[@]} -gt 0 ]] &&
+ anth_helm_args+=("${common_helm_args[@]}")
+
+set -x
+helm "${anth_helm_args[@]}"
+set +x
+
+emtp_helm_args=(
+ upgrade
+ --install
+ --namespace default
+ "$CLUSTER_NAME-emtp"
+ "$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
+ --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
+ --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
+ --wait
+)
+[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] &&
+ emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
+[[ ${#common_helm_args[@]} -gt 0 ]] &&
+ emtp_helm_args+=("${common_helm_args[@]}")
+
+set -x
+helm "${emtp_helm_args[@]}"
+set +x
+
+TAINT_CHECK_CYCLES=15
+TAINT_CHECK_SLEEP=15
+
+DEPLOYED=0
+
+for i in $(seq 1 $TAINT_CHECK_CYCLES); do
+ if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
+ echo "✅ Verified regular-pod-test pod was scheduled and started!"
+ DEPLOYED=1
+ break
+ fi
+ echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
+ sleep $TAINT_CHECK_SLEEP
+done
+
+if [[ $DEPLOYED -eq 0 ]]; then
+ echo "❌ regular-pod-test pod deployment failed"
+ fail_and_exit 2
+fi
+
From 66ca01d24c17225d11ebd93fdfabb379a8640d5f Mon Sep 17 00:00:00 2001
From: Lu-David
Date: Wed, 28 Aug 2024 17:48:05 -0700
Subject: [PATCH 4/8] added imds support
---
.../asglifecycle/asg-lifecycle-monitor.go | 7 +-
test/e2e/asg-lifecycle-imds-test | 135 +++++++++++-------
test/e2e/spot-interruption-test | 8 ++
test/k8s-local-cluster-test/run-test | 2 +-
4 files changed, 93 insertions(+), 59 deletions(-)
diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
index 53ed5b1e..575161b7 100644
--- a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
+++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
@@ -16,10 +16,11 @@ package asglifecycle
import (
"crypto/sha256"
"fmt"
+ "time"
+
"github.com/aws/aws-node-termination-handler/pkg/ec2metadata"
"github.com/aws/aws-node-termination-handler/pkg/monitor"
"github.com/aws/aws-node-termination-handler/pkg/node"
- "time"
)
// ASGLifecycleMonitorKind is a const to define this monitor kind
@@ -82,12 +83,12 @@ func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.I
}
return &monitor.InterruptionEvent{
- EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
+ EventID: fmt.Sprintf("target-lifecycle-state-terminated-%x", hash.Sum(nil)),
Kind: monitor.ASGLifecycleKind,
Monitor: ASGLifecycleMonitorKind,
StartTime: interruptionTime,
NodeName: nodeName,
- Description: "AST tareget lifecycle state received. Instance will be \n",
+ Description: "AST target lifecycle state received. Instance will be terminated\n",
PreDrainTask: setInterruptionTaint,
}, nil
}
diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test
index 0877eb37..5f273812 100755
--- a/test/e2e/asg-lifecycle-imds-test
+++ b/test/e2e/asg-lifecycle-imds-test
@@ -1,80 +1,42 @@
#!/bin/bash
set -euo pipefail
-# Available env vars:
-# $TMP_DIR
-# $CLUSTER_NAME
-# $KUBECONFIG
-# $NODE_TERMINATION_HANDLER_DOCKER_REPO
-# $NODE_TERMINATION_HANDLER_DOCKER_TAG
-# $WEBHOOK_DOCKER_REPO
-# $WEBHOOK_DOCKER_TAG
-# $AEMM_URL
-# $AEMM_VERSION
+# The purpose of this end-to-end test is to ensure that nodes with an NTH pod, will cordon and drain when they receive an ASG Termination event
+# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting and setting the target-lifecycle-state
+# For more details on expected behavior
+# Reference: https://docs.aws.amazon.com/autoscaling/ec2/userguide/retrieving-target-lifecycle-state-through-imds.html
+
+SERVICE_NAME=amazon-ec2-metadata-mock-service
function fail_and_exit {
- echo "❌ ASG Lifecycle SQS Test failed $CLUSTER_NAME ❌"
+ echo "❌ ASG Lifecycle IMDS Test failed $CLUSTER_NAME ❌"
exit "${1:-1}"
}
-echo "Starting ASG Lifecycle SQS Test for Node Termination Handler"
+echo "Starting ASG Lifecycle IMDS Test for Node Termination Handler"
START_TIME=$(date -u +"%Y-%m-%dT%TZ")
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
common_helm_args=()
-localstack_helm_args=(
- upgrade
- --install
- --namespace default
- "$CLUSTER_NAME-localstack"
- "$SCRIPTPATH/../../config/helm/localstack/"
- --set nodeSelector."${NTH_CONTROL_LABEL}"
- --set defaultRegion="${AWS_REGION}"
- --wait
-)
-
-set -x
-helm "${localstack_helm_args[@]}"
-set +x
-
-sleep 10
-
-RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'"
-localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \
- -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \
- | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }')
-echo "🥑 Using localstack pod ${localstack_pod}"
-run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "${RUN_INSTANCE_CMD}")
-private_dns_name=$(echo "${run_instances_resp}" | jq -r '.Instances[] .PrivateDnsName')
-instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId')
-echo "🥑 Started mock EC2 instance ($instance_id) w/ private DNS name: ${private_dns_name}"
-set -x
-CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}"
-queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "${CREATE_SQS_CMD}" | jq -r .QueueUrl)
-
-echo "🥑 Created SQS Queue ${queue_url}"
-
anth_helm_args=(
upgrade
--install
--namespace kube-system
- "$CLUSTER_NAME-acth"
+ "$CLUSTER_NAME-nth"
"$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
+ --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
--set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
--set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
- --set nodeSelector."${NTH_CONTROL_LABEL}"
- --set tolerations[0].operator=Exists
- --set awsAccessKeyID=foo
- --set awsSecretAccessKey=bar
- --set awsRegion="${AWS_REGION}"
- --set awsEndpoint="http://localstack.default"
- --set checkTagBeforeDraining=false
- --set enableSqsTerminationDraining=true
- --set queueURL="${queue_url}"
+ --set taintNode="true"
+ --set enableASGLifecycleDraining="true"
+ --set enableSpotInterruptionDraining="false"
+ --set enableScheduledEventDraining="false"
+ --set daemonsetTolerations=""
--wait
+ --force
)
[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] &&
anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
@@ -104,11 +66,30 @@ set -x
helm "${emtp_helm_args[@]}"
set +x
+aemm_helm_args=(
+ upgrade
+ --install
+ --namespace default
+ "$CLUSTER_NAME-aemm"
+ "/Volumes/workplace/github/amazon-ec2-metadata-mock/helm/amazon-ec2-metadata-mock" #$AEMM_DL_URL"
+ --set servicePort="$IMDS_PORT"
+ --set 'tolerations[0].effect=NoSchedule'
+ --set 'tolerations[0].operator=Exists'
+ --set serviceName="$SERVICE_NAME"
+ --set arguments='{asglifecycle}'
+ --wait
+)
+[[ ${#common_helm_args[@]} -gt 0 ]] &&
+ aemm_helm_args+=("${common_helm_args[@]}")
+
+set -x
+retry 5 helm "${aemm_helm_args[@]}"
+set +x
+
TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15
DEPLOYED=0
-
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
echo "✅ Verified regular-pod-test pod was scheduled and started!"
@@ -124,3 +105,47 @@ if [[ $DEPLOYED -eq 0 ]]; then
fail_and_exit 2
fi
+# Confirm that IMDS is accessible from script
+# Port forwarding allows the script to access the IMDS service running inside the cluster and make POST requests
+# Port is cleaned up after script finishes running
+# Sleeping ensures that the port is fully etablished before proceeding to the next commands
+LOCAL_PORT=8899
+LOCAL_AEMM_URL="http://localhost:$LOCAL_PORT/latest/meta-data/autoscaling/target-lifecycle-state"
+kubectl port-forward service/$SERVICE_NAME $LOCAL_PORT:$IMDS_PORT > /dev/null 2>&1 &
+PORT_FORWARD_PID=$!
+trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR
+echo "✅ Local Port $LOCAL_PORT forwards to service $SERVICE_NAME on $IMDS_PORT"
+sleep 2
+
+# Update the target-lifecycle-state to Terminated
+TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL)
+echo "✅ Instance Metadata URL is available at $LOCAL_AEMM_URL! Current target-lifecycle-state: $TARGET_LIFECYCLE_STATE"
+curl -s -X POST -H "Content-Type: application/json" -d '{"state":"Terminated"}' $LOCAL_AEMM_URL
+TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL)
+echo "Set target-lifecycle-state to ${TARGET_LIFECYCLE_STATE}"
+
+# Check that worker node was cordoned and drained
+cordoned=0
+test_node="${TEST_NODE:-$CLUSTER_NAME-worker}"
+for i in $(seq 1 $TAINT_CHECK_CYCLES); do
+ if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then
+ echo "✅ Verified the worker node was cordoned!"
+ cordoned=1
+ fi
+
+ if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
+ echo "✅ Verified the regular-pod-test pod was evicted!"
+ echo "✅ ASG Lifecycle IMDS Test Passed $CLUSTER_NAME! ✅"
+ exit 0
+ fi
+ echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
+ sleep $TAINT_CHECK_SLEEP
+done
+
+if [[ $cordoned -eq 0 ]]; then
+ echo "❌ Worker node was not cordoned"
+else
+ echo "❌ regular-pod-test was not evicted"
+fi
+
+fail_and_exit 1
\ No newline at end of file
diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test
index 36781d21..27a9f026 100755
--- a/test/e2e/spot-interruption-test
+++ b/test/e2e/spot-interruption-test
@@ -112,6 +112,11 @@ cordoned=0
tainted=0
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
+
+ echo "CURRENT NODES"
+ echo $(kubectl get nodes)
+ echo "\n"
+
if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
echo "✅ Verified the worker node was cordoned!"
cordoned=1
@@ -125,6 +130,9 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
echo "✅ Verified the regular-pod-test pod was evicted!"
echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
+ echo "CURRENT NODES"
+ echo $(kubectl get nodes)
+ echo "\n"
exit 0
fi
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test
index fd80d9fd..c5b1886d 100755
--- a/test/k8s-local-cluster-test/run-test
+++ b/test/k8s-local-cluster-test/run-test
@@ -300,7 +300,7 @@ for assert_script in $ASSERTION_SCRIPTS; do
POD_ID=$(get_nth_worker_pod || :)
kubectl logs "$POD_ID" --namespace kube-system || :
## Resets cluster to run another test on the same cluster
- reset_cluster
+ # reset_cluster
echo "✅ Assertion test $assert_script PASSED! ✅"
done
From 574c4bda1ea650bbc1f8faa1aed20d2ffbbc4759 Mon Sep 17 00:00:00 2001
From: Lu-David
Date: Thu, 29 Aug 2024 16:27:07 -0700
Subject: [PATCH 5/8] cleaned up tests
---
test/e2e/asg-lifecycle-imds-test | 35 +++++++++-------------------
test/e2e/spot-interruption-test | 8 -------
test/k8s-local-cluster-test/run-test | 2 +-
3 files changed, 12 insertions(+), 33 deletions(-)
diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test
index 5f273812..ad3a9de4 100755
--- a/test/e2e/asg-lifecycle-imds-test
+++ b/test/e2e/asg-lifecycle-imds-test
@@ -1,13 +1,20 @@
#!/bin/bash
set -euo pipefail
-
# The purpose of this end-to-end test is to ensure that nodes with an NTH pod, will cordon and drain when they receive an ASG Termination event
-# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting and setting the target-lifecycle-state
+# through Instance Metadata Service (IMDS)
+# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting target-lifecycle-state
# For more details on expected behavior
# Reference: https://docs.aws.amazon.com/autoscaling/ec2/userguide/retrieving-target-lifecycle-state-through-imds.html
-SERVICE_NAME=amazon-ec2-metadata-mock-service
+# Available env vars:
+# $CLUSTER_NAME
+# $NODE_TERMINATION_HANDLER_DOCKER_REPO
+# $NODE_TERMINATION_HANDLER_DOCKER_TAG
+# $WEBHOOK_DOCKER_REPO
+# $WEBHOOK_DOCKER_TAG
+# $AEMM_URL
+# $IMDS_PORT
function fail_and_exit {
echo "❌ ASG Lifecycle IMDS Test failed $CLUSTER_NAME ❌"
@@ -15,7 +22,6 @@ function fail_and_exit {
}
echo "Starting ASG Lifecycle IMDS Test for Node Termination Handler"
-START_TIME=$(date -u +"%Y-%m-%dT%TZ")
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
@@ -71,11 +77,10 @@ aemm_helm_args=(
--install
--namespace default
"$CLUSTER_NAME-aemm"
- "/Volumes/workplace/github/amazon-ec2-metadata-mock/helm/amazon-ec2-metadata-mock" #$AEMM_DL_URL"
+ "$AEMM_DL_URL"
--set servicePort="$IMDS_PORT"
--set 'tolerations[0].effect=NoSchedule'
--set 'tolerations[0].operator=Exists'
- --set serviceName="$SERVICE_NAME"
--set arguments='{asglifecycle}'
--wait
)
@@ -105,24 +110,6 @@ if [[ $DEPLOYED -eq 0 ]]; then
fail_and_exit 2
fi
-# Confirm that IMDS is accessible from script
-# Port forwarding allows the script to access the IMDS service running inside the cluster and make POST requests
-# Port is cleaned up after script finishes running
-# Sleeping ensures that the port is fully etablished before proceeding to the next commands
-LOCAL_PORT=8899
-LOCAL_AEMM_URL="http://localhost:$LOCAL_PORT/latest/meta-data/autoscaling/target-lifecycle-state"
-kubectl port-forward service/$SERVICE_NAME $LOCAL_PORT:$IMDS_PORT > /dev/null 2>&1 &
-PORT_FORWARD_PID=$!
-trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR
-echo "✅ Local Port $LOCAL_PORT forwards to service $SERVICE_NAME on $IMDS_PORT"
-sleep 2
-
-# Update the target-lifecycle-state to Terminated
-TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL)
-echo "✅ Instance Metadata URL is available at $LOCAL_AEMM_URL! Current target-lifecycle-state: $TARGET_LIFECYCLE_STATE"
-curl -s -X POST -H "Content-Type: application/json" -d '{"state":"Terminated"}' $LOCAL_AEMM_URL
-TARGET_LIFECYCLE_STATE=$(curl -s $LOCAL_AEMM_URL)
-echo "Set target-lifecycle-state to ${TARGET_LIFECYCLE_STATE}"
# Check that worker node was cordoned and drained
cordoned=0
diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test
index 27a9f026..36781d21 100755
--- a/test/e2e/spot-interruption-test
+++ b/test/e2e/spot-interruption-test
@@ -112,11 +112,6 @@ cordoned=0
tainted=0
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
-
- echo "CURRENT NODES"
- echo $(kubectl get nodes)
- echo "\n"
-
if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
echo "✅ Verified the worker node was cordoned!"
cordoned=1
@@ -130,9 +125,6 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
echo "✅ Verified the regular-pod-test pod was evicted!"
echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
- echo "CURRENT NODES"
- echo $(kubectl get nodes)
- echo "\n"
exit 0
fi
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test
index c5b1886d..fd80d9fd 100755
--- a/test/k8s-local-cluster-test/run-test
+++ b/test/k8s-local-cluster-test/run-test
@@ -300,7 +300,7 @@ for assert_script in $ASSERTION_SCRIPTS; do
POD_ID=$(get_nth_worker_pod || :)
kubectl logs "$POD_ID" --namespace kube-system || :
## Resets cluster to run another test on the same cluster
- # reset_cluster
+ reset_cluster
echo "✅ Assertion test $assert_script PASSED! ✅"
done
From 69debdd49e90780adeec890c94e1c13cdbbd03b5 Mon Sep 17 00:00:00 2001
From: Lu-David
Date: Wed, 4 Sep 2024 13:33:23 -0700
Subject: [PATCH 6/8] cleaned comments and unit-test
---
pkg/monitor/asglifecycle/asg-lifecycle-monitor.go | 2 +-
pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
index 575161b7..a623ae96 100644
--- a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
+++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go
@@ -61,7 +61,7 @@ func (m ASGLifecycleMonitor) Kind() string {
return ASGLifecycleMonitorKind
}
-// checkForSpotInterruptionNotice Checks EC2 instance metadata for a spot interruption termination notice
+// checkForASGTargetLifecycleStateNotice Checks EC2 instance metadata for a asg lifecycle termination notice
func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) {
state, err := m.IMDS.GetASGTargetLifecycleState()
if err != nil {
diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go
index 8532c16a..539b3b1a 100644
--- a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go
+++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go
@@ -80,7 +80,7 @@ func TestInterruptionTaintAlreadyPresent(t *testing.T) {
newNode := &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: nodeName},
Spec: v1.NodeSpec{Taints: []v1.Taint{{
- Key: node.RebalanceRecommendationTaint,
+ Key: node.ASGLifecycleTerminationTaint,
Value: drainEvent.EventID[:63],
Effect: v1.TaintEffectNoSchedule,
},
From 3d0074623f7b83377a22dbe7527e0199bd1e8d3b Mon Sep 17 00:00:00 2001
From: David Lu
Date: Mon, 9 Sep 2024 14:59:23 -0700
Subject: [PATCH 7/8] updated aemm mock version
---
test/k8s-local-cluster-test/run-test | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test
index fd80d9fd..7c820cfe 100755
--- a/test/k8s-local-cluster-test/run-test
+++ b/test/k8s-local-cluster-test/run-test
@@ -18,7 +18,7 @@ WEBHOOK_DOCKER_IMG=""
OVERRIDE_PATH=0
K8S_VERSION="1.30"
AEMM_URL="amazon-ec2-metadata-mock-service.default.svc.cluster.local"
-AEMM_VERSION="1.8.1"
+AEMM_VERSION="1.12.0"
AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz"
WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local"}
From c6d065ccedb677bd47f19f9d58fd163adedd842f Mon Sep 17 00:00:00 2001
From: David Lu
Date: Tue, 10 Sep 2024 12:53:02 -0700
Subject: [PATCH 8/8] eks test update AEMM version
---
test/eks-cluster-test/run-test | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test
index 949f68d1..a1e3e316 100755
--- a/test/eks-cluster-test/run-test
+++ b/test/eks-cluster-test/run-test
@@ -6,7 +6,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
PRESERVE=true
export TEST_WINDOWS="false"
-export AEMM_VERSION="1.8.1"
+export AEMM_VERSION="1.12.0"
export AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz"
export CLUSTER_CONFIG_FILE=$SCRIPTPATH/cluster-spec.yaml