Enable EventStatus health checks and add an unhealthy instance status metric

AndrewMitchell25 · AndrewMitchell25 · commit fa9fc1bcdfe4 · 2026-04-08T22:47:09.000Z
diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go
@@ -160,6 +160,13 @@ func (c *Controller) reconcileInstanceStatus(ctx context.Context) error {
 
 	errs := make([]error, len(instanceStatuses))
 	workqueue.ParallelizeUntil(ctx, 10, len(instanceStatuses), func(i int) {
+		categories := map[string]bool{}
+		for _, d := range instanceStatuses[i].Details {
+			categories[string(d.Category)] = true
+		}
+		for cat := range categories {
+			InstanceStatusUnhealthy.Inc(map[string]string{categoryLabel: cat})
+		}
 		if err := c.handleMessage(ctx, instancestatusfailure.Message(instanceStatuses[i])); err != nil {
 			errs[i] = fmt.Errorf("handling instance status check message, %w", err)
 		}
diff --git a/pkg/controllers/interruption/metrics.go b/pkg/controllers/interruption/metrics.go
@@ -25,6 +25,7 @@ import (
 const (
 	interruptionSubsystem = "interruption"
 	messageTypeLabel      = "message_type"
+	categoryLabel         = "category"
 )
 
 var (
@@ -59,4 +60,14 @@ var (
 		},
 		[]string{},
 	)
+	InstanceStatusUnhealthy = opmetrics.NewPrometheusCounter(
+		crmetrics.Registry,
+		prometheus.CounterOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: interruptionSubsystem,
+			Name:      "instance_status_unhealthy_total",
+			Help:      "Count of unhealthy instance statuses detected from EC2 DescribeInstanceStatus. Broken down by status check category.",
+		},
+		[]string{categoryLabel},
+	)
 )
diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go
@@ -109,6 +109,7 @@ var _ = BeforeEach(func() {
 	ctx = options.ToContext(ctx, test.Options(test.OptionsFields{InterruptionQueue: lo.ToPtr("test-cluster")}))
 	unavailableOfferingsCache.Flush()
 	sqsapi.Reset()
+	interruption.InstanceStatusUnhealthy.Reset()
 })
 
 var _ = AfterEach(func() {
@@ -316,7 +317,7 @@ var _ = Describe("InterruptionHandling", func() {
 
 			Expect(awsEnv.CapacityReservationProvider.GetAvailableInstanceCount("cr-56fac701cc1951b03")).To(Equal(0))
 		})
-		It("should delete the NodeClaim when an instance is unhealthy due to EC2 status checks", func() {
+		It("should delete the NodeClaim when an instance is unhealthy due to EC2 status checks or scheduled events", func() {
 			ctx = options.ToContext(ctx, test.Options(test.OptionsFields{InterruptionQueue: lo.ToPtr("")}))
 			awsEnv.EC2API.DescribeInstanceStatusOutput.Set(&ec2.DescribeInstanceStatusOutput{
 				InstanceStatuses: []ec2types.InstanceStatus{
@@ -341,6 +342,11 @@ var _ = Describe("InterruptionHandling", func() {
 								},
 							},
 						},
+						Events: []ec2types.InstanceStatusEvent{
+							{
+								Code: ec2types.EventCodeInstanceRetirement,
+							},
+						},
 					},
 				},
 			})
@@ -351,9 +357,15 @@ var _ = Describe("InterruptionHandling", func() {
 				metrics.ReasonLabel: "instance_status_failure",
 				"nodepool":          "default",
 			})
+			ExpectMetricCounterValue(interruption.InstanceStatusUnhealthy, 1, map[string]string{
+				"category": "SystemStatus",
+			})
+			ExpectMetricCounterValue(interruption.InstanceStatusUnhealthy, 1, map[string]string{
+				"category": "EventStatus",
+			})
 			ExpectNotFound(ctx, env.Client, nodeClaim)
 		})
-		It("should NOT delete the NodeClaim when an instance is unhealthy due to a Scheduled Event Status or EBS Status", func() {
+		It("should NOT delete the NodeClaim when an instance is unhealthy due to EBS Status only", func() {
 			ctx = options.ToContext(ctx, test.Options(test.OptionsFields{InterruptionQueue: lo.ToPtr("")}))
 			awsEnv.EC2API.DescribeInstanceStatusOutput.Set(&ec2.DescribeInstanceStatusOutput{
 				InstanceStatuses: []ec2types.InstanceStatus{
@@ -369,11 +381,6 @@ var _ = Describe("InterruptionHandling", func() {
 								},
 							},
 						},
-						Events: []ec2types.InstanceStatusEvent{
-							{
-								Code: ec2types.EventCodeInstanceRetirement,
-							},
-						},
 					},
 				},
 			})
diff --git a/pkg/providers/instancestatus/instancestatus.go b/pkg/providers/instancestatus/instancestatus.go
@@ -33,10 +33,9 @@ type Category string
 const (
 	InstanceStatus = Category("InstanceStatus")
 	SystemStatus   = Category("SystemStatus")
-	// EventStatus is currently ignored since this is already consumed via EventBridge in the Interruption controller.
-	// The handling of maintenance events is currently primitive where we treat all events as instance degradation
-	// with an involuntary replacement. Only consuming events via EventBridge allows some users to opt-out of maintenance
-	// event handling (https://github.com/aws/karpenter-provider-aws/issues/8524).
+	// EventStatus surfaces scheduled maintenance events. These events are also consumed via EventBridge
+	// in the Interruption controller when an SQS queue is configured. Handling of maintenance events is
+	// currently primitive: every event is treated as instance degradation and triggers an involuntary replacement.
 	EventStatus = Category("EventStatus")
 	// EBSStatus check failures are currently ignored until we can differentiate which volumes affect the node vs pods w/ PVCs
 	EBSStatus = Category("EBSStatus")
@@ -96,12 +95,12 @@ func (p DefaultProvider) List(ctx context.Context) ([]HealthStatus, error) {
 			if details.Status != ec2types.StatusTypeFailed {
 				return false
 			}
-			// ignore EBS and Scheduled Event health checks for now
-			if details.Category == EBSStatus || details.Category == EventStatus {
+			// ignore EBS health checks for now
+			if details.Category == EBSStatus {
 				return false
 			}
 			// Do not evaluate against the unhealthy threshold when its a scheduled maintenance event.
-			// Scheduled maintenance events often have a future scheduled time which makes a thershold
+			// Scheduled maintenance events often have a future scheduled time which makes a threshold
 			// difficult to utilize. We take the stance that if there is a scheduled maintenance event,
 			// then there is something wrong with the underlying host that warrants vacating immediately.
 			// This matches how we process scheduled maintenance events from EventBridge.
diff --git a/test/suites/interruption/suite_test.go b/test/suites/interruption/suite_test.go
@@ -267,7 +267,7 @@ var _ = Describe("Interruption", func() {
 		env.EventuallyExpectNotFound(node)
 		env.EventuallyExpectHealthyPodCount(selector, 1)
 	})
-	FIt("should terminate the node when receiving an instance status failure", func() {
+	It("should terminate the node when receiving an instance status failure", func() {
 		numPods := 1
 		dep := coretest.Deployment(coretest.DeploymentOptions{
 			Replicas: int32(numPods),
diff --git a/website/content/en/preview/concepts/disruption.md b/website/content/en/preview/concepts/disruption.md
@@ -220,8 +220,9 @@ Additionally, Karpenter utilizes the [EC2 DescribeInstanceStatus](https://docs.a
 
 * System Status - surfaces failures in the underlying physical host (hardware or software)
 * Instance Status - surfaces failures in the virtual machine 
+* Scheduled Maintenance Events - surfaces upcoming maintenance events that may affect the instance
 
-System and Instance status checks do not require the `--interruption-queue` to be configured, just EC2 DescribeInstanceStatus IAM permissions.
+These status checks do not require the `--interruption-queue` to be configured; they only require EC2 DescribeInstanceStatus IAM permissions.
 
 ### Node Auto Repair