Skip to content

Commit 8308648

Browse files
committed
Emit error event on failed async scale-up
1 parent 06dc935 commit 8308648

File tree

3 files changed

+142
-5
lines changed

3 files changed

+142
-5
lines changed

cluster-autoscaler/core/scaleup/orchestrator/async_initializer.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ import (
2121
"time"
2222

2323
appsv1 "k8s.io/api/apps/v1"
24+
apiv1 "k8s.io/api/core/v1"
2425
"k8s.io/klog/v2"
2526

2627
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
2728
"k8s.io/autoscaler/cluster-autoscaler/context"
29+
"k8s.io/autoscaler/cluster-autoscaler/expander"
2830
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups"
2931
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset"
3032
"k8s.io/autoscaler/cluster-autoscaler/processors/status"
@@ -41,6 +43,7 @@ type AsyncNodeGroupInitializer struct {
4143
mutex sync.Mutex
4244
allTargetSizes map[string]int64
4345
nodeGroup cloudprovider.NodeGroup
46+
triggeringPods []*apiv1.Pod
4447
nodeInfo *framework.NodeInfo
4548
scaleUpExecutor *scaleUpExecutor
4649
taintConfig taints.TaintConfig
@@ -52,7 +55,7 @@ type AsyncNodeGroupInitializer struct {
5255

5356
// NewAsyncNodeGroupInitializer creates a new AsyncNodeGroupInitializer instance.
5457
func NewAsyncNodeGroupInitializer(
55-
nodeGroup cloudprovider.NodeGroup,
58+
option *expander.Option,
5659
nodeInfo *framework.NodeInfo,
5760
scaleUpExecutor *scaleUpExecutor,
5861
taintConfig taints.TaintConfig,
@@ -63,7 +66,8 @@ func NewAsyncNodeGroupInitializer(
6366
) *AsyncNodeGroupInitializer {
6467
return &AsyncNodeGroupInitializer{
6568
allTargetSizes: map[string]int64{},
66-
nodeGroup: nodeGroup,
69+
nodeGroup: option.NodeGroup,
70+
triggeringPods: option.Pods,
6771
nodeInfo: nodeInfo,
6872
scaleUpExecutor: scaleUpExecutor,
6973
taintConfig: taintConfig,
@@ -104,8 +108,7 @@ func (s *AsyncNodeGroupInitializer) ChangeTargetSize(nodeGroup string, delta int
104108
func (s *AsyncNodeGroupInitializer) InitializeNodeGroup(result nodegroups.AsyncNodeGroupCreationResult) {
105109
if result.Error != nil {
106110
klog.Errorf("Async node group creation failed. Async scale-up is cancelled. %v", result.Error)
107-
scaleUpStatus, _ := status.UpdateScaleUpError(&status.ScaleUpStatus{}, errors.ToAutoscalerError(errors.InternalError, result.Error))
108-
s.scaleUpStatusProcessor.Process(s.context, scaleUpStatus)
111+
s.emitScaleUpStatus(&status.ScaleUpStatus{}, errors.ToAutoscalerError(errors.InternalError, result.Error))
109112
return
110113
}
111114
mainCreatedNodeGroup := result.CreationResult.MainCreatedNodeGroup
@@ -144,7 +147,17 @@ func (s *AsyncNodeGroupInitializer) InitializeNodeGroup(result nodegroups.AsyncN
144147
failedNodeGroupIds = append(failedNodeGroupIds, failedNodeGroup.Id())
145148
}
146149
klog.Errorf("Async scale-up for asynchronously created node group failed: %v (node groups: %v)", err, failedNodeGroupIds)
150+
s.emitScaleUpStatus(&status.ScaleUpStatus{
151+
CreateNodeGroupResults: []nodegroups.CreateNodeGroupResult{result.CreationResult},
152+
FailedResizeNodeGroups: failedNodeGroups,
153+
PodsTriggeredScaleUp: s.triggeringPods,
154+
}, err)
147155
return
148156
}
149157
klog.Infof("Initial scale-up succeeded. Scale ups: %v", scaleUpInfos)
150158
}
159+
160+
// emitScaleUpStatus records err on scaleUpStatus via status.UpdateScaleUpError
// and then hands the same status object to the configured scale-up status
// processor together with the initializer's autoscaling context.
//
// The values returned by status.UpdateScaleUpError are intentionally not used:
// the caller-supplied scaleUpStatus pointer is what gets forwarded.
// NOTE(review): this relies on UpdateScaleUpError mutating scaleUpStatus in
// place — confirm against the status package.
func (s *AsyncNodeGroupInitializer) emitScaleUpStatus(scaleUpStatus *status.ScaleUpStatus, err errors.AutoscalerError) {
161+
status.UpdateScaleUpError(scaleUpStatus, err)
162+
s.scaleUpStatusProcessor.Process(s.context, scaleUpStatus)
163+
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package orchestrator
18+
19+
import (
20+
"fmt"
21+
"testing"
22+
23+
"github.com/stretchr/testify/assert"
24+
apiv1 "k8s.io/api/core/v1"
25+
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
26+
testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
27+
"k8s.io/autoscaler/cluster-autoscaler/config"
28+
"k8s.io/autoscaler/cluster-autoscaler/context"
29+
. "k8s.io/autoscaler/cluster-autoscaler/core/test"
30+
"k8s.io/autoscaler/cluster-autoscaler/expander"
31+
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups"
32+
"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups/asyncnodegroups"
33+
"k8s.io/autoscaler/cluster-autoscaler/processors/status"
34+
processorstest "k8s.io/autoscaler/cluster-autoscaler/processors/test"
35+
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
36+
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
37+
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
38+
"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
39+
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
40+
"k8s.io/client-go/kubernetes/fake"
41+
)
42+
43+
// TestNodePoolAsyncInitialization is a table-driven test of
// AsyncNodeGroupInitializer.InitializeNodeGroup. Each case builds an
// initializer for an upcoming node group, feeds it an async creation result
// that maps the created group ("async-ng") to the upcoming one, and then
// checks which scale-ups reached the test cloud provider. For cases with
// failing scale-ups it additionally checks the ScaleUpStatus handed to the
// fake status processor.
func TestNodePoolAsyncInitialization(t *testing.T) {
44+
testCases := []struct {
45+
name string
46+
// failingScaleUps lists node group names for which the provider's
// scale-up callback returns an error.
failingScaleUps map[string]bool
47+
// expectedScaleUps maps node group name to the total increase the
// provider callback is expected to have recorded.
expectedScaleUps map[string]int
48+
}{
49+
{
50+
name: "scale up upcoming node group",
51+
expectedScaleUps: map[string]int{"async-ng": 3},
52+
},
53+
{
54+
name: "failing initial scale up",
55+
failingScaleUps: map[string]bool{"async-ng": true},
56+
},
57+
}
58+
for _, tc := range testCases {
59+
t.Run(tc.name, func(t *testing.T) {
60+
// Accumulates the increases that the scale-up callback accepted.
scaledUpGroups := make(map[string]int)
61+
provider := testprovider.NewTestCloudProvider(
62+
// Scale-up callback: fails for groups listed in tc.failingScaleUps,
// otherwise records the increase.
func(nodeGroup string, increase int) error {
63+
if tc.failingScaleUps[nodeGroup] {
64+
return fmt.Errorf("Simulated error")
65+
}
66+
scaledUpGroups[nodeGroup] += increase
67+
return nil
68+
}, nil)
69+
options := config.AutoscalingOptions{
70+
NodeAutoprovisioningEnabled: true,
71+
AsyncNodeGroupsEnabled: true,
72+
}
73+
listers := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, nil, nil)
74+
// NOTE(review): the local variable `context` shadows the imported
// `context` package for the rest of this closure.
context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listers, provider, nil, nil)
75+
assert.NoError(t, err)
76+
p1 := BuildTestPod("p1", 2, 1000)
77+
upcomingNodeGroup := provider.BuildNodeGroup("upcoming-ng", 0, 100, 0, false, true, "T1", nil)
78+
createdNodeGroup := provider.BuildNodeGroup("async-ng", 0, 100, 0, false, true, "T1", nil)
79+
// The expander option carries both the upcoming node group and the pod
// that triggered the scale-up.
option := expander.Option{
80+
NodeGroup: upcomingNodeGroup,
81+
Pods: []*apiv1.Pod{p1},
82+
}
83+
processors := processorstest.NewTestProcessors(&context)
84+
processors.AsyncNodeGroupStateChecker = &asyncnodegroups.MockAsyncNodeGroupStateChecker{IsUpcomingNodeGroup: map[string]bool{upcomingNodeGroup.Id(): true}}
85+
nodeInfo := framework.NewTestNodeInfo(BuildTestNode("t1", 100, 0))
86+
executor := newScaleUpExecutor(&context, processors.ScaleStateNotifier, processors.AsyncNodeGroupStateChecker)
87+
// Captures the last status passed to Process so it can be asserted on.
scaleUpStatusProcessor := &fakeScaleUpStatusProcessor{}
88+
initializer := NewAsyncNodeGroupInitializer(&option, nodeInfo, executor, taints.TaintConfig{}, nil, scaleUpStatusProcessor, &context, false)
89+
// The target size is registered under the upcoming group's ID, while the
// expected scale-up above is keyed by the created group's name.
initializer.SetTargetSize(upcomingNodeGroup.Id(), 3)
90+
asyncResult := nodegroups.AsyncNodeGroupCreationResult{
91+
CreationResult: nodegroups.CreateNodeGroupResult{MainCreatedNodeGroup: createdNodeGroup},
92+
CreatedToUpcomingMapping: map[string]string{
93+
createdNodeGroup.Id(): upcomingNodeGroup.Id(),
94+
},
95+
}
96+
initializer.InitializeNodeGroup(asyncResult)
97+
// In the failing case expectedScaleUps is nil, so this asserts that no
// scale-up was recorded at all.
// NOTE(review): testify's assert.Equal takes (t, expected, actual); the
// arguments here are in (actual, expected) order, which only affects the
// failure message.
assert.Equal(t, len(scaledUpGroups), len(tc.expectedScaleUps))
98+
for groupName, increase := range tc.expectedScaleUps {
99+
assert.Equal(t, increase, scaledUpGroups[groupName])
100+
}
101+
if len(tc.failingScaleUps) > 0 {
102+
// The expected error is the simulated provider error wrapped as a
// CloudProviderError with the "failed to increase node group size: " prefix.
expectedErr := errors.ToAutoscalerError(errors.CloudProviderError, fmt.Errorf("Simulated error")).AddPrefix("failed to increase node group size: ")
103+
// NOTE(review): same (actual, expected) argument order as above.
assert.Equal(t, scaleUpStatusProcessor.lastStatus, &status.ScaleUpStatus{
104+
Result: status.ScaleUpError,
105+
ScaleUpError: &expectedErr,
106+
CreateNodeGroupResults: []nodegroups.CreateNodeGroupResult{asyncResult.CreationResult},
107+
FailedResizeNodeGroups: []cloudprovider.NodeGroup{createdNodeGroup},
108+
PodsTriggeredScaleUp: option.Pods,
109+
})
110+
}
111+
})
112+
}
113+
}
114+
115+
// fakeScaleUpStatusProcessor is a test double that remembers the most recent
// ScaleUpStatus passed to Process.
type fakeScaleUpStatusProcessor struct {
116+
// lastStatus is the status from the latest Process call; nil until Process
// has been called.
lastStatus *status.ScaleUpStatus
117+
}
118+
119+
// Process stores the given status in lastStatus, overwriting any previous
// value. The autoscaling context argument is ignored.
func (f *fakeScaleUpStatusProcessor) Process(_ *context.AutoscalingContext, status *status.ScaleUpStatus) {
120+
f.lastStatus = status
121+
}
122+
123+
// CleanUp is a no-op.
// NOTE(review): presumably present so the fake satisfies the scale-up status
// processor interface — confirm against the status package.
func (f *fakeScaleUpStatusProcessor) CleanUp() {
124+
}

cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
224224
var scaleUpStatus *status.ScaleUpStatus
225225
oldId := bestOption.NodeGroup.Id()
226226
if o.autoscalingContext.AsyncNodeGroupsEnabled {
227-
initializer := NewAsyncNodeGroupInitializer(bestOption.NodeGroup, nodeInfos[oldId], o.scaleUpExecutor, o.taintConfig, daemonSets, o.processors.ScaleUpStatusProcessor, o.autoscalingContext, allOrNothing)
227+
initializer := NewAsyncNodeGroupInitializer(bestOption, nodeInfos[oldId], o.scaleUpExecutor, o.taintConfig, daemonSets, o.processors.ScaleUpStatusProcessor, o.autoscalingContext, allOrNothing)
228228
createNodeGroupResults, scaleUpStatus, aErr = o.CreateNodeGroupAsync(bestOption, nodeInfos, schedulablePodGroups, podEquivalenceGroups, daemonSets, initializer)
229229
} else {
230230
createNodeGroupResults, scaleUpStatus, aErr = o.CreateNodeGroup(bestOption, nodeInfos, schedulablePodGroups, podEquivalenceGroups, daemonSets)

0 commit comments

Comments
 (0)