Skip to content

Commit fa9fc1b

Browse files
Enable EventStatus and add a metric
1 parent 9d83ef0 commit fa9fc1b

File tree

6 files changed

+41
-16
lines changed

6 files changed

+41
-16
lines changed

pkg/controllers/interruption/controller.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,13 @@ func (c *Controller) reconcileInstanceStatus(ctx context.Context) error {
160160

161161
errs := make([]error, len(instanceStatuses))
162162
workqueue.ParallelizeUntil(ctx, 10, len(instanceStatuses), func(i int) {
163+
categories := map[string]bool{}
164+
for _, d := range instanceStatuses[i].Details {
165+
categories[string(d.Category)] = true
166+
}
167+
for cat := range categories {
168+
InstanceStatusUnhealthy.Inc(map[string]string{categoryLabel: cat})
169+
}
163170
if err := c.handleMessage(ctx, instancestatusfailure.Message(instanceStatuses[i])); err != nil {
164171
errs[i] = fmt.Errorf("handling instance status check message, %w", err)
165172
}

pkg/controllers/interruption/metrics.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
const (
2626
interruptionSubsystem = "interruption"
2727
messageTypeLabel = "message_type"
28+
categoryLabel = "category"
2829
)
2930

3031
var (
@@ -59,4 +60,14 @@ var (
5960
},
6061
[]string{},
6162
)
63+
InstanceStatusUnhealthy = opmetrics.NewPrometheusCounter(
64+
crmetrics.Registry,
65+
prometheus.CounterOpts{
66+
Namespace: metrics.Namespace,
67+
Subsystem: interruptionSubsystem,
68+
Name: "instance_status_unhealthy_total",
69+
Help: "Count of unhealthy instance statuses detected from EC2 DescribeInstanceStatus. Broken down by status check category.",
70+
},
71+
[]string{categoryLabel},
72+
)
6273
)

pkg/controllers/interruption/suite_test.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ var _ = BeforeEach(func() {
109109
ctx = options.ToContext(ctx, test.Options(test.OptionsFields{InterruptionQueue: lo.ToPtr("test-cluster")}))
110110
unavailableOfferingsCache.Flush()
111111
sqsapi.Reset()
112+
interruption.InstanceStatusUnhealthy.Reset()
112113
})
113114

114115
var _ = AfterEach(func() {
@@ -316,7 +317,7 @@ var _ = Describe("InterruptionHandling", func() {
316317

317318
Expect(awsEnv.CapacityReservationProvider.GetAvailableInstanceCount("cr-56fac701cc1951b03")).To(Equal(0))
318319
})
319-
It("should delete the NodeClaim when an instance is unhealthy due to EC2 status checks", func() {
320+
It("should delete the NodeClaim when an instance is unhealthy due to EC2 status checks or scheduled events", func() {
320321
ctx = options.ToContext(ctx, test.Options(test.OptionsFields{InterruptionQueue: lo.ToPtr("")}))
321322
awsEnv.EC2API.DescribeInstanceStatusOutput.Set(&ec2.DescribeInstanceStatusOutput{
322323
InstanceStatuses: []ec2types.InstanceStatus{
@@ -341,6 +342,11 @@ var _ = Describe("InterruptionHandling", func() {
341342
},
342343
},
343344
},
345+
Events: []ec2types.InstanceStatusEvent{
346+
{
347+
Code: ec2types.EventCodeInstanceRetirement,
348+
},
349+
},
344350
},
345351
},
346352
})
@@ -351,9 +357,15 @@ var _ = Describe("InterruptionHandling", func() {
351357
metrics.ReasonLabel: "instance_status_failure",
352358
"nodepool": "default",
353359
})
360+
ExpectMetricCounterValue(interruption.InstanceStatusUnhealthy, 1, map[string]string{
361+
"category": "SystemStatus",
362+
})
363+
ExpectMetricCounterValue(interruption.InstanceStatusUnhealthy, 1, map[string]string{
364+
"category": "EventStatus",
365+
})
354366
ExpectNotFound(ctx, env.Client, nodeClaim)
355367
})
356-
It("should NOT delete the NodeClaim when an instance is unhealthy due to a Scheduled Event Status or EBS Status", func() {
368+
It("should NOT delete the NodeClaim when an instance is unhealthy due to EBS Status only", func() {
357369
ctx = options.ToContext(ctx, test.Options(test.OptionsFields{InterruptionQueue: lo.ToPtr("")}))
358370
awsEnv.EC2API.DescribeInstanceStatusOutput.Set(&ec2.DescribeInstanceStatusOutput{
359371
InstanceStatuses: []ec2types.InstanceStatus{
@@ -369,11 +381,6 @@ var _ = Describe("InterruptionHandling", func() {
369381
},
370382
},
371383
},
372-
Events: []ec2types.InstanceStatusEvent{
373-
{
374-
Code: ec2types.EventCodeInstanceRetirement,
375-
},
376-
},
377384
},
378385
},
379386
})

pkg/providers/instancestatus/instancestatus.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,9 @@ type Category string
3333
const (
3434
InstanceStatus = Category("InstanceStatus")
3535
SystemStatus = Category("SystemStatus")
36-
// EventStatus is currently ignored since this is already consumed via EventBridge in the Interruption controller.
37-
// The handling of maintenance events is currently primitive where we treat all events as instance degradation
38-
// with an involuntary replacement. Only consuming events via EventBridge allows some users to opt-out of maintenance
39-
// event handling (https://github.com/aws/karpenter-provider-aws/issues/8524).
36+
// EventStatus surfaces scheduled maintenance events. These are also consumed via EventBridge
37+
// in the Interruption controller when an SQS queue is configured. The handling of maintenance events is
38+
// currently primitive where we treat all events as instance degradation with an involuntary replacement.
4039
EventStatus = Category("EventStatus")
4140
// EBSStatus check failures are currently ignored until we can differentiate which volumes affect the node vs pods w/ PVCs
4241
EBSStatus = Category("EBSStatus")
@@ -96,12 +95,12 @@ func (p DefaultProvider) List(ctx context.Context) ([]HealthStatus, error) {
9695
if details.Status != ec2types.StatusTypeFailed {
9796
return false
9897
}
99-
// ignore EBS and Scheduled Event health checks for now
100-
if details.Category == EBSStatus || details.Category == EventStatus {
98+
// ignore EBS health checks for now
99+
if details.Category == EBSStatus {
101100
return false
102101
}
103102
// Do not evaluate against the unhealthy threshold when it's a scheduled maintenance event.
104-
// Scheduled maintenance events often have a future scheduled time which makes a thershold
103+
// Scheduled maintenance events often have a future scheduled time which makes a threshold
105104
// difficult to utilize. We take the stance that if there is a scheduled maintenance event,
106105
// then there is something wrong with the underlying host that warrants vacating immediately.
107106
// This matches how we process scheduled maintenance events from EventBridge.

test/suites/interruption/suite_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ var _ = Describe("Interruption", func() {
267267
env.EventuallyExpectNotFound(node)
268268
env.EventuallyExpectHealthyPodCount(selector, 1)
269269
})
270-
FIt("should terminate the node when receiving an instance status failure", func() {
270+
It("should terminate the node when receiving an instance status failure", func() {
271271
numPods := 1
272272
dep := coretest.Deployment(coretest.DeploymentOptions{
273273
Replicas: int32(numPods),

website/content/en/preview/concepts/disruption.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,9 @@ Additionally, Karpenter utilizes the [EC2 DescribeInstanceStatus](https://docs.a
220220
221221
* System Status - surfaces failures in the underlying physical host (hardware or software)
222222
* Instance Status - surfaces failures in the virtual machine
223+
* Scheduled Maintenance Events - surfaces upcoming maintenance events that may affect the instance
223224
224-
System and Instance status checks do not require the `--interruption-queue` to be configured, just EC2 DescribeInstanceStatus IAM permissions.
225+
These status checks do not require the `--interruption-queue` to be configured, just EC2 DescribeInstanceStatus IAM permissions.
225226
226227
### Node Auto Repair
227228

0 commit comments

Comments
 (0)