
Commit 18ea913

[autoscaling] Add error reasons in the autoscaling conditions (#46063)
### What does this PR do?

Add a codified `Reason` in the DPA Status to allow better error matching and understanding.

### Motivation

### Describe how you validated your changes

There are a significant number of error cases, so testing everything is not required. To test an easy one, you can create an Autoscaler that references an unsupported `apiGroup`.

### Additional Notes

Co-authored-by: vincent.boulineau <vincent.boulineau@datadoghq.com>
1 parent 6bc0136 commit 18ea913

File tree

10 files changed: +268 / -142 lines changed

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2016-present Datadog, Inc.
+
+//go:build kubeapiserver
+
+package autoscaling
+
+import (
+    "fmt"
+)
+
+// ConditionReasonType is a typed string for programmatic condition reasons.
+// Values must be CamelCase with no spaces, following the Kubernetes convention.
+type ConditionReasonType string
+
+const (
+    // ConditionReasonInvalidTargetRef indicates the targetRef API version could not be parsed.
+    ConditionReasonInvalidTargetRef ConditionReasonType = "InvalidTargetRef"
+    // ConditionReasonInvalidTarget indicates the target cannot be autoscaled (e.g. it is the cluster agent itself).
+    ConditionReasonInvalidTarget ConditionReasonType = "InvalidTarget"
+    // ConditionReasonInvalidSpec indicates the DPA spec failed validation (e.g. bad objectives or constraints).
+    ConditionReasonInvalidSpec ConditionReasonType = "InvalidSpec"
+    // ConditionReasonClusterAutoscalerLimitReached indicates the maximum number of DPA objects per cluster has been reached.
+    ConditionReasonClusterAutoscalerLimitReached ConditionReasonType = "ClusterAutoscalerLimitReached"
+    // ConditionReasonRecommendationError indicates Datadog could not compute recommendations.
+    ConditionReasonRecommendationError ConditionReasonType = "RecommendationError"
+    // ConditionReasonLocalRecommenderError indicates the local fallback recommender could not compute recommendations.
+    ConditionReasonLocalRecommenderError ConditionReasonType = "LocalRecommenderError"
+    // ConditionReasonTargetNotFound indicates the scale target could not be found.
+    ConditionReasonTargetNotFound ConditionReasonType = "TargetNotFound"
+    // ConditionReasonScaleFailed indicates the scale operation on the target failed.
+    ConditionReasonScaleFailed ConditionReasonType = "ScaleFailed"
+    // ConditionReasonScalingDisabled indicates scaling is blocked because current replicas is 0.
+    ConditionReasonScalingDisabled ConditionReasonType = "ScalingDisabled"
+    // ConditionReasonPolicyRestricted indicates the applyPolicy or strategy blocks the scaling action.
+    ConditionReasonPolicyRestricted ConditionReasonType = "PolicyRestricted"
+    // ConditionReasonFallbackRestricted indicates the fallback scaling direction is not allowed.
+    ConditionReasonFallbackRestricted ConditionReasonType = "FallbackRestricted"
+    // ConditionReasonUnsupportedTargetKind indicates vertical rollout is not supported for this workload Kind.
+    ConditionReasonUnsupportedTargetKind ConditionReasonType = "UnsupportedTargetKind"
+    // ConditionReasonRolloutFailed indicates a failure when triggering a vertical rollout on the target.
+    ConditionReasonRolloutFailed ConditionReasonType = "RolloutFailed"
+)
+
+// ConditionReason is an interface that errors can implement to provide
+// a programmatic Reason for Kubernetes conditions.
+// When an error implements this interface, Reason() populates the condition's
+// Reason field and Error() populates the Message field.
+// When an error does not implement this interface, Error() populates the
+// Message field and Reason is left empty.
+type ConditionReason interface {
+    Reason() ConditionReasonType
+}
+
+// NewConditionError wraps an existing error with a programmatic reason.
+func NewConditionError(reason ConditionReasonType, err error) error {
+    return &conditionError{reason: reason, err: err}
+}
+
+// NewConditionErrorf creates a new error with a programmatic reason and a formatted message.
+func NewConditionErrorf(reason ConditionReasonType, format string, args ...any) error {
+    return &conditionError{reason: reason, err: fmt.Errorf(format, args...)}
+}
+
+type conditionError struct {
+    reason ConditionReasonType
+    err    error
+}
+
+func (e *conditionError) Error() string { return e.err.Error() }
+func (e *conditionError) Unwrap() error { return e.err }
+func (e *conditionError) Reason() ConditionReasonType { return e.reason }
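To illustrate the contract documented on `ConditionReason` above, here is a minimal, hypothetical sketch (not part of this commit) of how a caller could map a returned error onto a condition's Reason and Message; it assumes the helper sits in the same `autoscaling` package and additionally imports the standard `errors` package:

// conditionFieldsFromError is a hypothetical helper, not part of this diff.
// It surfaces a programmatic Reason when the error provides one, and always
// uses Error() for the Message.
func conditionFieldsFromError(err error) (reason ConditionReasonType, message string) {
    message = err.Error()
    var cr ConditionReason
    if errors.As(err, &cr) {
        reason = cr.Reason()
    }
    // If the error does not implement ConditionReason, reason stays empty.
    return reason, message
}

For example, the error returned by NewConditionErrorf(ConditionReasonInvalidSpec, "Autoscaler fallback cannot be based on custom query objective") would yield Reason "InvalidSpec" with that string as the Message.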

pkg/clusteragent/autoscaling/max_heap.go

Lines changed: 26 additions & 27 deletions
@@ -11,12 +11,8 @@ import (
 	"container/heap"
 	"sync"
 	"time"
-
-	"github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model"
 )
 
-type store = Store[model.PodAutoscalerInternal]
-
 // TimestampKey is a struct that holds a timestamp and key for a `DatadogPodAutoscaler` object
 type TimestampKey struct {
 	Timestamp time.Time
@@ -79,22 +75,25 @@ func (h *MaxTimestampKeyHeap) FindIdx(key string) (int, bool) {
 }
 
 // HashHeap is a struct that holds a MaxHeap and a set of keys that exist in the heap
-type HashHeap struct {
-	MaxHeap MaxTimestampKeyHeap
-	Keys map[string]bool
-	maxSize int
-	mu sync.RWMutex
-	store *store
+type HashHeap[T any] struct {
+	MaxHeap MaxTimestampKeyHeap
+	Keys map[string]bool
+	maxSize int
+	mu sync.RWMutex
+	store *Store[T]
+	creationTimestamp func(*T) time.Time
 }
 
-// NewHashHeap returns a new MaxHeap with the given max size
-func NewHashHeap(maxSize int, store *store) *HashHeap {
-	h := &HashHeap{
-		MaxHeap: *NewMaxHeap(),
-		Keys: make(map[string]bool),
-		maxSize: maxSize,
-		mu: sync.RWMutex{},
-		store: store,
+// NewHashHeap returns a new MaxHeap with the given max size.
+// creationTimestamp is used to extract the creation timestamp from stored objects.
+func NewHashHeap[T any](maxSize int, store *Store[T], creationTimestamp func(*T) time.Time) *HashHeap[T] {
+	h := &HashHeap[T]{
+		MaxHeap: *NewMaxHeap(),
+		Keys: make(map[string]bool),
+		maxSize: maxSize,
+		mu: sync.RWMutex{},
+		store: store,
+		creationTimestamp: creationTimestamp,
 	}
 
 	store.RegisterObserver(Observer{
@@ -107,19 +106,19 @@ func NewHashHeap(maxSize int, store *store) *HashHeap {
 
 // InsertIntoHeap returns true if the key already exists in the max heap or was inserted correctly
 // Used as an ObserverFunc; accept sender as parameter to match ObserverFunc signature
-func (h *HashHeap) InsertIntoHeap(key, _sender string) {
-	// Get object from store
-	podAutoscalerInternal, podAutoscalerInternalFound := h.store.Get(key)
-	if !podAutoscalerInternalFound {
+func (h *HashHeap[T]) InsertIntoHeap(key, _sender string) {
+	obj, found := h.store.Get(key)
+	if !found {
 		return
 	}
 
-	if podAutoscalerInternal.CreationTimestamp().IsZero() {
+	ts := h.creationTimestamp(&obj)
+	if ts.IsZero() {
 		return
 	}
 
 	newTimestampKey := TimestampKey{
-		Timestamp: podAutoscalerInternal.CreationTimestamp(),
+		Timestamp: ts,
 		Key: key,
 	}
 
@@ -147,7 +146,7 @@ func (h *HashHeap) InsertIntoHeap(key, _sender string) {
 
 // DeleteFromHeap removes the given key from the max heap
 // Used as an ObserverFunc; accept sender as parameter to match ObserverFunc signature
-func (h *HashHeap) DeleteFromHeap(key, _sender string) {
+func (h *HashHeap[T]) DeleteFromHeap(key, _sender string) {
 	// Key did not exist in heap, return early
 	if !h.Exists(key) {
 		return
@@ -167,14 +166,14 @@ func (h *HashHeap) DeleteFromHeap(key, _sender string) {
 }
 
 // Exists returns true if the given key exists in the heap
-func (h *HashHeap) Exists(key string) bool {
+func (h *HashHeap[T]) Exists(key string) bool {
 	h.mu.RLock()
 	defer h.mu.RUnlock()
 	_, ok := h.Keys[key]
 	return ok
 }
 
 // MaxSize returns the size limit on the hash heap
-func (h *HashHeap) MaxSize() int {
+func (h *HashHeap[T]) MaxSize() int {
 	return h.maxSize
 }
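
The heap is now generic over the stored type, with the creation timestamp supplied by the caller. A rough, hypothetical sketch of how the workload controller side could wire it up (the surrounding variables are assumed, not shown in this excerpt; the 100-object limit comes from the validation comment in controller.go below):

// Hypothetical wiring: build a HashHeap over PodAutoscalerInternal objects,
// using their creation timestamp for the max-heap ordering.
limitHeap := autoscaling.NewHashHeap(
    100,   // assumed per-cluster DPA limit
    store, // *autoscaling.Store[model.PodAutoscalerInternal]
    func(p *model.PodAutoscalerInternal) time.Time { return p.CreationTimestamp() },
)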

pkg/clusteragent/autoscaling/workload/controller.go

Lines changed: 12 additions & 9 deletions
@@ -54,7 +54,10 @@ var (
 	}
 )
 
-type store = autoscaling.Store[model.PodAutoscalerInternal]
+type (
+	store = autoscaling.Store[model.PodAutoscalerInternal]
+	limitHeap = autoscaling.HashHeap[model.PodAutoscalerInternal]
+)
 
 // Controller for DatadogPodAutoscaler objects
 type Controller struct {
@@ -66,7 +69,7 @@ type Controller struct {
 	eventRecorder record.EventRecorder
 	store *store
 
-	limitHeap *autoscaling.HashHeap
+	limitHeap *limitHeap
 
 	podWatcher PodWatcher
 	horizontalController *horizontalController
@@ -90,7 +93,7 @@ func NewController(
 	store *store,
 	podWatcher PodWatcher,
 	localSender sender.Sender,
-	limitHeap *autoscaling.HashHeap,
+	limitHeap *limitHeap,
 ) (*Controller, error) {
 	c := &Controller{
 		clusterID: clusterID,
@@ -461,7 +464,7 @@ func (c *Controller) validateAutoscaler(podAutoscalerInternal model.PodAutoscale
 	// Check that we are within the limit of 100 DatadogPodAutoscalers
 	key := podAutoscalerInternal.ID()
 	if !c.limitHeap.Exists(key) {
-		return fmt.Errorf("Autoscaler disabled as maximum number per cluster reached (%d)", c.limitHeap.MaxSize())
+		return autoscaling.NewConditionErrorf(autoscaling.ConditionReasonClusterAutoscalerLimitReached, "Autoscaler disabled as maximum number per cluster reached (%d)", c.limitHeap.MaxSize())
 	}
 
 	// Check that targetRef is not set to the cluster agent
@@ -485,7 +488,7 @@ func (c *Controller) validateAutoscaler(podAutoscalerInternal model.PodAutoscale
 	clusterAgentNs := namespace.GetMyNamespace()
 
 	if podAutoscalerInternal.Namespace() == clusterAgentNs && podAutoscalerInternal.Spec().TargetRef.Name == resourceName {
-		return errors.New("Autoscaling target cannot be set to the cluster agent")
+		return autoscaling.NewConditionErrorf(autoscaling.ConditionReasonInvalidTarget, "Autoscaling target cannot be set to the cluster agent")
 	}
 	if err := validateAutoscalerObjectives(podAutoscalerInternal.Spec()); err != nil {
 		return err
@@ -497,7 +500,7 @@ func validateAutoscalerObjectives(spec *datadoghq.DatadogPodAutoscalerSpec) erro
 	if spec.Fallback != nil && len(spec.Fallback.Horizontal.Objectives) > 0 {
 		for _, objective := range spec.Fallback.Horizontal.Objectives {
 			if objective.Type == datadoghqcommon.DatadogPodAutoscalerCustomQueryObjectiveType {
-				return errors.New("Autoscaler fallback cannot be based on custom query objective")
+				return autoscaling.NewConditionErrorf(autoscaling.ConditionReasonInvalidSpec, "Autoscaler fallback cannot be based on custom query objective")
 			}
 		}
 	}
@@ -506,15 +509,15 @@ func validateAutoscalerObjectives(spec *datadoghq.DatadogPodAutoscalerSpec) erro
 		switch objective.Type {
 		case datadoghqcommon.DatadogPodAutoscalerCustomQueryObjectiveType:
 			if objective.CustomQuery == nil {
-				return errors.New("Autoscaler objective type is custom query but customQueryObjective is nil")
+				return autoscaling.NewConditionErrorf(autoscaling.ConditionReasonInvalidSpec, "Autoscaler objective type is custom query but customQueryObjective is nil")
 			}
 		case datadoghqcommon.DatadogPodAutoscalerPodResourceObjectiveType:
 			if objective.PodResource == nil {
-				return fmt.Errorf("autoscaler objective type is %s but podResource is nil", objective.Type)
+				return autoscaling.NewConditionErrorf(autoscaling.ConditionReasonInvalidSpec, "autoscaler objective type is %s but podResource is nil", objective.Type)
 			}
 		case datadoghqcommon.DatadogPodAutoscalerContainerResourceObjectiveType:
 			if objective.ContainerResource == nil {
-				return fmt.Errorf("autoscaler objective type is %s but containerResource is nil", objective.Type)
+				return autoscaling.NewConditionErrorf(autoscaling.ConditionReasonInvalidSpec, "autoscaler objective type is %s but containerResource is nil", objective.Type)
 			}
 		}
 	}

pkg/clusteragent/autoscaling/workload/controller_horizontal.go

Lines changed: 5 additions & 6 deletions
@@ -9,7 +9,6 @@ package workload
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	"math"
 	"slices"
@@ -77,7 +76,7 @@ func (hr *horizontalController) sync(ctx context.Context, podAutoscaler *datadog
 	// Get the current scale of the target resource
 	scale, gr, err := hr.scaler.get(ctx, autoscalerInternal.Namespace(), autoscalerInternal.Spec().TargetRef.Name, gvk)
 	if err != nil {
-		err = fmt.Errorf("failed to get scale subresource for autoscaler %s, err: %w", autoscalerInternal.ID(), err)
+		err = autoscaling.NewConditionError(autoscaling.ConditionReasonTargetNotFound, fmt.Errorf("failed to get scale subresource for autoscaler %s, err: %w", autoscalerInternal.ID(), err))
 		autoscalerInternal.UpdateFromHorizontalAction(nil, err)
 		return autoscaling.Requeue, err
 	}
@@ -127,7 +126,7 @@ func (hr *horizontalController) performScaling(ctx context.Context, podAutoscale
 	scale.Spec.Replicas = horizontalAction.ToReplicas
 	_, err = hr.scaler.update(ctx, gr, scale)
 	if err != nil {
-		err = fmt.Errorf("failed to scale target: %s/%s to %d replicas, err: %w", scale.Namespace, scale.Name, horizontalAction.ToReplicas, err)
+		err = autoscaling.NewConditionError(autoscaling.ConditionReasonScaleFailed, fmt.Errorf("failed to scale target: %s/%s to %d replicas, err: %w", scale.Namespace, scale.Name, horizontalAction.ToReplicas, err))
 		hr.eventRecorder.Event(podAutoscaler, corev1.EventTypeWarning, model.FailedScaleEventReason, err.Error())
 		autoscalerInternal.UpdateFromHorizontalAction(nil, err)
 
@@ -161,7 +160,7 @@ func (hr *horizontalController) computeScaleAction(
 ) (*datadoghqcommon.DatadogPodAutoscalerHorizontalAction, time.Duration, error) {
 	// Check if we scaling has been disabled explicitly
 	if currentDesiredReplicas == 0 {
-		return nil, 0, errors.New("scaling disabled as current replicas is set to 0")
+		return nil, 0, autoscaling.NewConditionErrorf(autoscaling.ConditionReasonScalingDisabled, "scaling disabled as current replicas is set to 0")
 	}
 
 	// Saving original targetDesiredReplicas
@@ -233,7 +232,7 @@ func (hr *horizontalController) computeScaleAction(
 		if autoscalerInternal.Spec().Fallback != nil && !isFallbackScalingDirectionEnabled(autoscalerInternal.Spec().Fallback.Horizontal.Direction, scaleDirection) {
 			limitReason = fmt.Sprintf("scaling disabled as fallback in the scaling direction (%s) is disabled", scaleDirection)
 			log.Debugf("Scaling limited for autoscaler id: %s, scale direction: %s, limit reason: %s", autoscalerInternal.ID(), scaleDirection, limitReason)
-			return nil, 0, errors.New(limitReason)
+			return nil, 0, autoscaling.NewConditionErrorf(autoscaling.ConditionReasonFallbackRestricted, "%s", limitReason)
 		}
 	}
 
@@ -273,7 +272,7 @@ func (hr *horizontalController) computeScaleAction(
 	allowed, reason := isScalingAllowed(autoscalerSpec, source, scaleDirection)
 	if !allowed {
 		log.Debugf("Scaling not allowed for autoscaler id: %s, scale direction: %s, scale reason: %s (would have scaled to %d replicas)", autoscalerInternal.ID(), scaleDirection, reason, horizontalAction.ToReplicas)
-		return nil, 0, errors.New(reason)
+		return nil, 0, autoscaling.NewConditionErrorf(autoscaling.ConditionReasonPolicyRestricted, "%s", reason)
 	}
 
 	return horizontalAction, evalAfter, nil
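
Because `conditionError` implements `Unwrap`, wrapping with `NewConditionError` adds a reason without hiding the original cause, so existing error matching keeps working. A small hedged sketch, with identifiers invented for illustration:

// cause stands in for whatever error the scale client returned (illustrative only).
cause := errors.New("deployments.apps \"web\" not found")
err := autoscaling.NewConditionError(
    autoscaling.ConditionReasonTargetNotFound,
    fmt.Errorf("failed to get scale subresource for autoscaler default/web, err: %w", cause),
)

var cr autoscaling.ConditionReason
_ = errors.As(err, &cr)   // true: cr.Reason() == autoscaling.ConditionReasonTargetNotFound
_ = errors.Is(err, cause) // true: the wrapped cause remains matchable through Unwrap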
