Skip to content

Commit ad31545

Browse files
authored
Merge pull request #135 from marek-veber/aro-autoscaler
ARO-25449 - Fix AROMachinePool autoscaling and API version handling
2 parents 2bd3146 + be31b7c commit ad31545

File tree

7 files changed

+775
-13
lines changed

7 files changed

+775
-13
lines changed

exp/api/v1beta2/aromachinepool_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ const (
151151

152152
// NodePoolReadyCondition condition reports on the readiness of the HcpOpenShiftClustersNodePool.
153153
NodePoolReadyCondition clusterv1.ConditionType = "NodePoolReady"
154+
155+
// ReplicasManagedByARO is the value of the CAPI replica manager annotation that maps to the ARO-HCP built-in autoscaler.
156+
ReplicasManagedByARO = "aro-hcp"
154157
)
155158

156159
// +kubebuilder:object:root=true

exp/controllers/arocontrolplane_reconciler.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ func (s *aroControlPlaneService) reconcileResources(ctx context.Context) error {
519519
version = hcpClusterV1.Status.Properties.Version.Id
520520
}
521521
}
522-
} else if apierrors.IsNotFound(err) || meta.IsNoMatchError(err) {
522+
} else if apierrors.IsNotFound(err) || meta.IsNoMatchError(err) || isSchemeError(err) {
523523
// Not found or API version not served, try v1api20251223preview
524524
hcpClusterV2 := &asoredhatopenshiftv1api2025.HcpOpenShiftCluster{}
525525
err = s.kubeclient.Get(ctx, client.ObjectKey{
@@ -542,7 +542,7 @@ func (s *aroControlPlaneService) reconcileResources(ctx context.Context) error {
542542
version = hcpClusterV2.Status.Properties.Version.Id
543543
}
544544
}
545-
} else if apierrors.IsNotFound(err) {
545+
} else if apierrors.IsNotFound(err) || isSchemeError(err) {
546546
// Not found in either version
547547
conditions.Set(s.scope.ControlPlane, metav1.Condition{
548548
Type: string(cplane.HcpClusterReadyCondition),
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"testing"
21+
22+
. "github.com/onsi/gomega"
23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
24+
"k8s.io/utils/ptr"
25+
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
26+
27+
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta2"
28+
)
29+
30+
// TestAutoscalingSpecReplicasUpdate tests that MachinePool.Spec.Replicas is updated
31+
// when autoscaling is enabled and actual node count changes.
32+
func TestAutoscalingSpecReplicasUpdate(t *testing.T) {
33+
tests := []struct {
34+
name string
35+
initialSpecReplicas int32
36+
actualNodeCount int32
37+
hasAutoscaling bool
38+
expectedSpec *int32
39+
}{
40+
{
41+
name: "autoscaling enabled - spec updated to match actual",
42+
initialSpecReplicas: 3,
43+
actualNodeCount: 5,
44+
hasAutoscaling: true,
45+
expectedSpec: ptr.To[int32](5),
46+
},
47+
{
48+
name: "autoscaling disabled - spec not updated",
49+
initialSpecReplicas: 3,
50+
actualNodeCount: 5,
51+
hasAutoscaling: false,
52+
expectedSpec: ptr.To[int32](3),
53+
},
54+
{
55+
name: "autoscaling enabled - scale down",
56+
initialSpecReplicas: 5,
57+
actualNodeCount: 3,
58+
hasAutoscaling: true,
59+
expectedSpec: ptr.To[int32](3),
60+
},
61+
{
62+
name: "autoscaling enabled - no change",
63+
initialSpecReplicas: 5,
64+
actualNodeCount: 5,
65+
hasAutoscaling: true,
66+
expectedSpec: ptr.To[int32](5),
67+
},
68+
{
69+
name: "autoscaling disabled - spec unchanged even with mismatch",
70+
initialSpecReplicas: 3,
71+
actualNodeCount: 10,
72+
hasAutoscaling: false,
73+
expectedSpec: ptr.To[int32](3),
74+
},
75+
}
76+
77+
for _, test := range tests {
78+
t.Run(test.name, func(t *testing.T) {
79+
g := NewGomegaWithT(t)
80+
81+
// Create MachinePool with initial spec
82+
machinePool := &clusterv1.MachinePool{
83+
ObjectMeta: metav1.ObjectMeta{
84+
Name: "test-mp",
85+
Namespace: "default",
86+
},
87+
Spec: clusterv1.MachinePoolSpec{
88+
Replicas: ptr.To(test.initialSpecReplicas),
89+
},
90+
}
91+
92+
// Add autoscaling annotation if enabled
93+
if test.hasAutoscaling {
94+
machinePool.Annotations = map[string]string{
95+
clusterv1.ReplicasManagedByAnnotation: infrav1exp.ReplicasManagedByARO,
96+
}
97+
}
98+
99+
// Simulate the reconciler logic
100+
currentReplicas := test.actualNodeCount
101+
102+
// This is the logic from aromachinepool_reconciler.go lines 288-293
103+
if test.hasAutoscaling {
104+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
105+
machinePool.Spec.Replicas = &currentReplicas
106+
}
107+
}
108+
109+
// Verify the expected outcome
110+
g.Expect(machinePool.Spec.Replicas).To(Equal(test.expectedSpec))
111+
})
112+
}
113+
}
114+
115+
// TestAutoscalingAnnotationBehavior tests the interaction between autoscaling annotation
116+
// and replica management.
117+
func TestAutoscalingAnnotationBehavior(t *testing.T) {
118+
tests := []struct {
119+
name string
120+
annotations map[string]string
121+
actualNodeCount int32
122+
expectSpecUpdated bool
123+
}{
124+
{
125+
name: "ARO autoscaling annotation present",
126+
annotations: map[string]string{
127+
clusterv1.ReplicasManagedByAnnotation: infrav1exp.ReplicasManagedByARO,
128+
},
129+
actualNodeCount: 5,
130+
expectSpecUpdated: true,
131+
},
132+
{
133+
name: "different autoscaler annotation present",
134+
annotations: map[string]string{
135+
clusterv1.ReplicasManagedByAnnotation: "other-autoscaler",
136+
},
137+
actualNodeCount: 5,
138+
expectSpecUpdated: true,
139+
},
140+
{
141+
name: "no annotation",
142+
annotations: map[string]string{},
143+
actualNodeCount: 5,
144+
expectSpecUpdated: false,
145+
},
146+
{
147+
name: "nil annotations",
148+
annotations: nil,
149+
actualNodeCount: 5,
150+
expectSpecUpdated: false,
151+
},
152+
{
153+
name: "other annotations present but not autoscaling",
154+
annotations: map[string]string{
155+
"some.other/annotation": "value",
156+
},
157+
actualNodeCount: 5,
158+
expectSpecUpdated: false,
159+
},
160+
}
161+
162+
for _, test := range tests {
163+
t.Run(test.name, func(t *testing.T) {
164+
g := NewGomegaWithT(t)
165+
166+
machinePool := &clusterv1.MachinePool{
167+
ObjectMeta: metav1.ObjectMeta{
168+
Name: "test-mp",
169+
Namespace: "default",
170+
Annotations: test.annotations,
171+
},
172+
Spec: clusterv1.MachinePoolSpec{
173+
Replicas: ptr.To[int32](3),
174+
},
175+
}
176+
177+
currentReplicas := test.actualNodeCount
178+
179+
// Apply the reconciler logic
180+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
181+
machinePool.Spec.Replicas = &currentReplicas
182+
}
183+
184+
// Verify expectations
185+
if test.expectSpecUpdated {
186+
g.Expect(machinePool.Spec.Replicas).To(Equal(ptr.To(test.actualNodeCount)))
187+
} else {
188+
g.Expect(machinePool.Spec.Replicas).To(Equal(ptr.To[int32](3)))
189+
}
190+
})
191+
}
192+
}
193+
194+
// TestAutoscalingLifecycle tests the complete autoscaling enable/disable lifecycle.
195+
func TestAutoscalingLifecycle(t *testing.T) {
196+
g := NewGomegaWithT(t)
197+
198+
machinePool := &clusterv1.MachinePool{
199+
ObjectMeta: metav1.ObjectMeta{
200+
Name: "test-mp",
201+
Namespace: "default",
202+
Annotations: map[string]string{},
203+
},
204+
Spec: clusterv1.MachinePoolSpec{
205+
Replicas: ptr.To[int32](3),
206+
},
207+
}
208+
209+
// Phase 1: No autoscaling - spec should not be updated
210+
var currentReplicas int32 = 5
211+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
212+
replicas := currentReplicas
213+
machinePool.Spec.Replicas = &replicas
214+
}
215+
g.Expect(*machinePool.Spec.Replicas).To(Equal(int32(3)), "Phase 1: spec should not be updated without annotation")
216+
217+
// Phase 2: Enable autoscaling - add annotation
218+
machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation] = infrav1exp.ReplicasManagedByARO
219+
220+
// Autoscaler scales to 5
221+
currentReplicas = 5
222+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
223+
replicas := currentReplicas
224+
machinePool.Spec.Replicas = &replicas
225+
}
226+
g.Expect(*machinePool.Spec.Replicas).To(Equal(int32(5)), "Phase 2: spec should be updated to 5 with annotation")
227+
228+
// Phase 3: Autoscaler scales to 8
229+
currentReplicas = 8
230+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
231+
replicas := currentReplicas
232+
machinePool.Spec.Replicas = &replicas
233+
}
234+
g.Expect(*machinePool.Spec.Replicas).To(Equal(int32(8)), "Phase 3: spec should be updated to 8")
235+
236+
// Phase 4: Disable autoscaling - remove annotation
237+
delete(machinePool.Annotations, clusterv1.ReplicasManagedByAnnotation)
238+
239+
// Node count changes but spec should not be updated anymore
240+
currentReplicas = 10
241+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
242+
replicas := currentReplicas
243+
machinePool.Spec.Replicas = &replicas
244+
}
245+
g.Expect(*machinePool.Spec.Replicas).To(Equal(int32(8)), "Phase 4: spec should remain at 8 after annotation removed")
246+
}
247+
248+
// TestAutoscalingPreventsScalingDownStatus tests that updating spec.replicas prevents
249+
// the ScalingDown status issue reported by the customer.
250+
func TestAutoscalingPreventsScalingDownStatus(t *testing.T) {
251+
g := NewGomegaWithT(t)
252+
253+
// Customer scenario: Autoscaler increases nodes from 3 to 5
254+
machinePool := &clusterv1.MachinePool{
255+
ObjectMeta: metav1.ObjectMeta{
256+
Name: "customer-mp",
257+
Namespace: "default",
258+
Annotations: map[string]string{
259+
clusterv1.ReplicasManagedByAnnotation: infrav1exp.ReplicasManagedByARO,
260+
},
261+
},
262+
Spec: clusterv1.MachinePoolSpec{
263+
Replicas: ptr.To[int32](3),
264+
},
265+
}
266+
267+
infraMachinePool := &infrav1exp.AROMachinePool{
268+
Status: infrav1exp.AROMachinePoolStatus{
269+
Replicas: 3,
270+
},
271+
}
272+
273+
// Autoscaler creates 2 new nodes
274+
actualNodeCount := int32(5)
275+
276+
// Update status replicas
277+
infraMachinePool.Status.Replicas = actualNodeCount
278+
279+
// Update spec replicas (the fix)
280+
if _, autoscaling := machinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
281+
machinePool.Spec.Replicas = &actualNodeCount
282+
}
283+
284+
// Verify: Spec and Status match, no ScalingDown state
285+
g.Expect(machinePool.Spec.Replicas).To(Equal(ptr.To[int32](5)), "spec should match actual nodes")
286+
g.Expect(infraMachinePool.Status.Replicas).To(Equal(int32(5)), "status should match actual nodes")
287+
g.Expect(*machinePool.Spec.Replicas).To(Equal(infraMachinePool.Status.Replicas), "spec and status should match - no ScalingDown")
288+
}

exp/controllers/aromachinepool_controller.go

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -219,14 +219,32 @@ func (ampr *AROMachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Re
219219
return reconcile.Result{}, err
220220
}
221221

222-
// Upon first create of an ARO service, the node pools are provided to the CreateOrUpdate call. After the initial
223-
// create of the control plane and node pools, the control plane will transition to initialized. After the control
224-
// plane is initialized, we can proceed to reconcile aro machine pools.
225-
if controlPlane.Status.Initialization == nil || !controlPlane.Status.Initialization.ControlPlaneInitialized {
226-
log.Info("AROControlPlane is not initialized")
222+
// For resources mode (ASO-based): NodePools can be created as soon as HcpOpenShiftCluster is ready
223+
// The kubeconfig will be generated after the first NodePool is created
224+
// Check if HcpClusterReady condition is true
225+
hcpClusterReady := false
226+
for _, condition := range controlPlane.Status.Conditions {
227+
if condition.Type == string(cplane.HcpClusterReadyCondition) && condition.Status == metav1.ConditionTrue {
228+
hcpClusterReady = true
229+
break
230+
}
231+
}
232+
233+
if !hcpClusterReady {
234+
log.Info("Waiting for HcpOpenShiftCluster to be ready before creating NodePools")
227235
return reconcile.Result{}, nil
228236
}
229237

238+
// If ControlPlaneInitialized is explicitly false (not nil), respect that
239+
// But if it's nil or true, allow NodePool creation when HcpClusterReady
240+
if controlPlane.Status.Initialization != nil && !controlPlane.Status.Initialization.ControlPlaneInitialized {
241+
// Only block if explicitly set to false AND HcpCluster is ready
242+
// This handles the case where control plane is being deleted or has failed
243+
if controlPlane.Status.Ready {
244+
log.V(4).Info("ControlPlane explicitly marked as not initialized but will proceed since HcpCluster is ready")
245+
}
246+
}
247+
230248
// Create the scope.
231249
acpScope, err := scope.NewAROMachinePoolScope(ctx, scope.AROMachinePoolScopeParams{
232250
Client: ampr.Client,

exp/controllers/aromachinepool_reconciler.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ func (s *aroMachinePoolService) reconcileResources(ctx context.Context) error {
116116
resources, err := mutators.ApplyMutators(
117117
ctx,
118118
s.scope.InfraMachinePool.Spec.Resources,
119-
mutators.SetHcpOpenShiftNodePoolDefaults(s.kubeclient, s.scope.InfraMachinePool, hcpClusterName),
119+
mutators.SetHcpOpenShiftNodePoolDefaults(s.kubeclient, s.scope.InfraMachinePool, hcpClusterName, s.scope.MachinePool),
120120
)
121121
if err != nil {
122122
return errors.Wrap(err, "failed to apply mutators")
@@ -173,8 +173,8 @@ func (s *aroMachinePoolService) reconcileResources(ctx context.Context) error {
173173
}
174174
replicas = nodePoolV1.Status.Properties.Replicas
175175
}
176-
} else if apierrors.IsNotFound(err) || meta.IsNoMatchError(err) {
177-
// Not found or API version not served, try v1api20251223preview
176+
} else if apierrors.IsNotFound(err) || meta.IsNoMatchError(err) || isSchemeError(err) {
177+
// Not found, API version not served, or scheme error - try v1api20251223preview
178178
nodePoolV2 := &asoredhatopenshiftv1api2025.HcpOpenShiftClustersNodePool{}
179179
err = s.kubeclient.Get(ctx, client.ObjectKey{
180180
Namespace: s.scope.InfraMachinePool.Namespace,
@@ -197,7 +197,7 @@ func (s *aroMachinePoolService) reconcileResources(ctx context.Context) error {
197197
}
198198
} else {
199199
// v1api20251223preview also failed
200-
if apierrors.IsNotFound(err) {
200+
if apierrors.IsNotFound(err) || isSchemeError(err) {
201201
// NodePool doesn't exist yet - set a condition and continue
202202
conditions.Set(s.scope.InfraMachinePool, metav1.Condition{
203203
Type: string(infrav1exp.NodePoolReadyCondition),
@@ -284,6 +284,12 @@ func (s *aroMachinePoolService) reconcileResources(ctx context.Context) error {
284284
currentReplicas := int32(len(providerIDs))
285285
if currentReplicas > 0 {
286286
s.scope.InfraMachinePool.Status.Replicas = currentReplicas
287+
288+
// If autoscaling is enabled, update MachinePool.Spec.Replicas to match actual count
289+
// This prevents CAPI from thinking we're in a ScalingDown state when autoscaler scales up
290+
if _, autoscaling := s.scope.MachinePool.Annotations[clusterv1.ReplicasManagedByAnnotation]; autoscaling {
291+
s.scope.MachinePool.Spec.Replicas = &currentReplicas
292+
}
287293
}
288294

289295
log.V(4).Info("populated providerIDList from workload cluster nodes",

0 commit comments

Comments
 (0)