Skip to content

Commit 238b63b

Browse files
committed
reconcile rosaMachinePool version
1 parent 121d0ff commit 238b63b

10 files changed

+282
-20
lines changed

config/crd/bases/controlplane.cluster.x-k8s.io_rosacontrolplanes.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,13 @@ spec:
362362
AKS, EKS, GKE, etc.
363363
type: boolean
364364
failureMessage:
365-
description: ErrorMessage indicates that there is a terminal problem
366-
reconciling the state, and will be set to a descriptive error message.
365+
description: "FailureMessage will be set in the event that there is
366+
a terminal problem reconciling the state and will be set to a descriptive
367+
error message. \n This field should not be set for transitive errors
368+
that a controller faces that are expected to be fixed automatically
369+
over time (like service outages), but instead indicate that something
370+
is fundamentally wrong with the spec or the configuration of the
371+
controller, and that manual intervention is required."
367372
type: string
368373
id:
369374
description: ID is the cluster ID given by ROSA.

config/crd/bases/infrastructure.cluster.x-k8s.io_rosamachinepools.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,11 @@ spec:
9494
type: array
9595
subnet:
9696
type: string
97+
version:
98+
description: Version specifies the OpenShift version of the nodes associated
99+
with this machinepool. ROSAControlPlane version is used if not set.
100+
pattern: ^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$
101+
type: string
97102
required:
98103
- nodePoolName
99104
type: object
@@ -146,6 +151,15 @@ spec:
146151
- type
147152
type: object
148153
type: array
154+
failureMessage:
155+
description: "FailureMessage will be set in the event that there is
156+
a terminal problem reconciling the state and will be set to a descriptive
157+
error message. \n This field should not be set for transitive errors
158+
that a controller faces that are expected to be fixed automatically
159+
over time (like service outages), but instead indicate that something
160+
is fundamentally wrong with the spec or the configuration of the
161+
controller, and that manual intervention is required."
162+
type: string
149163
id:
150164
description: ID is the ID given by ROSA.
151165
type: string

controlplane/rosa/api/v1beta2/rosacontrolplane_types.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -473,8 +473,15 @@ type RosaControlPlaneStatus struct {
473473
// Ready denotes that the ROSAControlPlane API Server is ready to receive requests.
474474
// +kubebuilder:default=false
475475
Ready bool `json:"ready"`
476-
// ErrorMessage indicates that there is a terminal problem reconciling the
477-
// state, and will be set to a descriptive error message.
476+
// FailureMessage will be set in the event that there is a terminal problem
477+
// reconciling the state and will be set to a descriptive error message.
478+
//
479+
// This field should not be set for transitive errors that a controller
480+
// faces that are expected to be fixed automatically over
481+
// time (like service outages), but instead indicate that something is
482+
// fundamentally wrong with the spec or the configuration of
483+
// the controller, and that manual intervention is required.
484+
//
478485
// +optional
479486
FailureMessage *string `json:"failureMessage,omitempty"`
480487
// Conditions specifies the conditions for the managed control plane

controlplane/rosa/controllers/rosacontrolplane_controller.go

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -191,13 +191,16 @@ func (r *ROSAControlPlaneReconciler) reconcileNormal(ctx context.Context, rosaSc
191191
}
192192
defer rosaClient.Close()
193193

194-
isValid, err := validateControlPlaneSpec(rosaClient, rosaScope)
194+
failureMessage, err := validateControlPlaneSpec(rosaClient, rosaScope)
195195
if err != nil {
196196
return ctrl.Result{}, fmt.Errorf("failed to validate ROSAControlPlane.spec: %w", err)
197197
}
198-
if !isValid {
198+
if failureMessage != nil {
199+
rosaScope.ControlPlane.Status.FailureMessage = failureMessage
199200
// don't requeue because input is invalid and manual intervention is needed.
200201
return ctrl.Result{}, nil
202+
} else {
203+
rosaScope.ControlPlane.Status.FailureMessage = nil
201204
}
202205

203206
cluster, err := rosaClient.GetCluster()
@@ -270,7 +273,7 @@ func (r *ROSAControlPlaneReconciler) reconcileNormal(ctx context.Context, rosaSc
270273
DisableUserWorkloadMonitoring(true).
271274
Version(
272275
cmv1.NewVersion().
273-
ID(fmt.Sprintf("openshift-v%s", rosaScope.ControlPlane.Spec.Version)).
276+
ID(rosa.VersionID(rosaScope.ControlPlane.Spec.Version)).
274277
ChannelGroup("stable"),
275278
).
276279
ExpirationTimestamp(time.Now().Add(1 * time.Hour)).
@@ -411,7 +414,7 @@ func (r *ROSAControlPlaneReconciler) reconcileDelete(ctx context.Context, rosaSc
411414

412415
func (r *ROSAControlPlaneReconciler) reconcileClusterVersion(rosaScope *scope.ROSAControlPlaneScope, rosaClient *rosa.RosaClient, cluster *cmv1.Cluster) error {
413416
version := rosaScope.ControlPlane.Spec.Version
414-
if version == cluster.Version().RawID() {
417+
if version == rosa.RawVersionID(cluster.Version()) {
415418
conditions.MarkFalse(rosaScope.ControlPlane, rosacontrolplanev1.ROSAControlPlaneUpgradingCondition, "upgraded", clusterv1.ConditionSeverityInfo, "")
416419
return nil
417420
}
@@ -560,24 +563,20 @@ func (r *ROSAControlPlaneReconciler) reconcileClusterAdminPassword(ctx context.C
560563
return password, nil
561564
}
562565

563-
func validateControlPlaneSpec(rosaClient *rosa.RosaClient, rosaScope *scope.ROSAControlPlaneScope) (bool, error) {
564-
// reset previous message.
565-
rosaScope.ControlPlane.Status.FailureMessage = nil
566-
566+
func validateControlPlaneSpec(rosaClient *rosa.RosaClient, rosaScope *scope.ROSAControlPlaneScope) (*string, error) {
567567
version := rosaScope.ControlPlane.Spec.Version
568568
isSupported, err := rosaClient.IsVersionSupported(version)
569569
if err != nil {
570-
return false, err
570+
return nil, fmt.Errorf("failed to verify if version is supported: %w", err)
571571
}
572572

573573
if !isSupported {
574574
message := fmt.Sprintf("version %s is not supported", version)
575-
rosaScope.ControlPlane.Status.FailureMessage = &message
576-
return false, nil
575+
return &message, nil
577576
}
578577

579578
// TODO: add more input validations
580-
return true, nil
579+
return nil, nil
581580
}
582581

583582
func (r *ROSAControlPlaneReconciler) rosaClusterToROSAControlPlane(log *logger.Logger) handler.MapFunc {

exp/api/v1beta2/conditions_consts.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,10 @@ const (
104104
)
105105

106106
const (
107-
// RosaMachinePoolReadyCondition condition reports on the successful reconciliation of rosa control plane.
107+
// RosaMachinePoolReadyCondition condition reports on the successful reconciliation of rosa machinepool.
108108
RosaMachinePoolReadyCondition clusterv1.ConditionType = "RosaMchinePoolReady"
109+
// RosaMachinePoolUpgradingCondition condition reports whether ROSAMachinePool is upgrading or not.
110+
RosaMachinePoolUpgradingCondition clusterv1.ConditionType = "RosaMchinePoolUpgrading"
109111
// WaitingForRosaControlPlaneReason used when the machine pool is waiting for
110112
// ROSA control plane infrastructure to be ready before proceeding.
111113
WaitingForRosaControlPlaneReason = "WaitingForRosaControlPlane"

exp/api/v1beta2/rosamachinepool_types.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ type RosaMachinePoolSpec struct {
3333
// +kubebuilder:validation:Pattern:=`^[a-z]([-a-z0-9]*[a-z0-9])?$`
3434
NodePoolName string `json:"nodePoolName"`
3535

36+
// Version specifies the OpenShift version of the nodes associated with this machinepool.
37+
// ROSAControlPlane version is used if not set.
38+
//
39+
// +optional
40+
// +kubebuilder:validation:Pattern:=`^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$`
41+
Version string `json:"version,omitempty"`
42+
3643
// AvailabilityZone is an optional field specifying the availability zone where instances of this machine pool should run
3744
// For Multi-AZ clusters, you can create a machine pool in a Single-AZ of your choice.
3845
// +optional
@@ -83,14 +90,23 @@ type RosaMachinePoolStatus struct {
8390
// the cluster
8491
// +kubebuilder:default=false
8592
Ready bool `json:"ready"`
86-
8793
// Replicas is the most recently observed number of replicas.
8894
// +optional
8995
Replicas int32 `json:"replicas"`
90-
9196
// Conditions defines current service state of the managed machine pool
9297
// +optional
9398
Conditions clusterv1.Conditions `json:"conditions,omitempty"`
99+
// FailureMessage will be set in the event that there is a terminal problem
100+
// reconciling the state and will be set to a descriptive error message.
101+
//
102+
// This field should not be set for transitive errors that a controller
103+
// faces that are expected to be fixed automatically over
104+
// time (like service outages), but instead indicate that something is
105+
// fundamentally wrong with the spec or the configuration of
106+
// the controller, and that manual intervention is required.
107+
//
108+
// +optional
109+
FailureMessage *string `json:"failureMessage,omitempty"`
94110

95111
// ID is the ID given by ROSA.
96112
ID string `json:"id,omitempty"`

exp/api/v1beta2/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exp/controllers/rosamachinepool_controller.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import (
55
"fmt"
66
"time"
77

8+
"github.com/blang/semver"
89
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
910
"github.com/pkg/errors"
11+
corev1 "k8s.io/api/core/v1"
1012
apierrors "k8s.io/apimachinery/pkg/api/errors"
1113
"k8s.io/apimachinery/pkg/runtime/schema"
1214
"k8s.io/client-go/tools/record"
@@ -179,6 +181,18 @@ func (r *ROSAMachinePoolReconciler) reconcileNormal(ctx context.Context,
179181
}
180182
defer rosaClient.Close()
181183

184+
failureMessage, err := validateMachinePoolSpec(machinePoolScope)
185+
if err != nil {
186+
return ctrl.Result{}, fmt.Errorf("failed to validate ROSAMachinePool.spec: %w", err)
187+
}
188+
if failureMessage != nil {
189+
machinePoolScope.RosaMachinePool.Status.FailureMessage = failureMessage
190+
// don't requeue because input is invalid and manual intervention is needed.
191+
return ctrl.Result{}, nil
192+
} else {
193+
machinePoolScope.RosaMachinePool.Status.FailureMessage = nil
194+
}
195+
182196
rosaMachinePool := machinePoolScope.RosaMachinePool
183197
machinePool := machinePoolScope.MachinePool
184198
controlPlane := machinePoolScope.ControlPlane
@@ -194,6 +208,10 @@ func (r *ROSAMachinePoolReconciler) reconcileNormal(ctx context.Context,
194208
conditions.MarkTrue(rosaMachinePool, expinfrav1.RosaMachinePoolReadyCondition)
195209
rosaMachinePool.Status.Ready = true
196210

211+
if err := r.reconcileMachinePoolVersion(machinePoolScope, rosaClient, createdNodePool); err != nil {
212+
return ctrl.Result{}, err
213+
}
214+
197215
return ctrl.Result{}, nil
198216
}
199217

@@ -231,6 +249,9 @@ func (r *ROSAMachinePoolReconciler) reconcileNormal(ctx context.Context,
231249
}
232250

233251
npBuilder.AWSNodePool(cmv1.NewAWSNodePool().InstanceType(rosaMachinePool.Spec.InstanceType))
252+
if rosaMachinePool.Spec.Version != "" {
253+
npBuilder.Version(cmv1.NewVersion().ID(rosa.VersionID(rosaMachinePool.Spec.Version)))
254+
}
234255

235256
nodePoolSpec, err := npBuilder.Build()
236257
if err != nil {
@@ -274,6 +295,69 @@ func (r *ROSAMachinePoolReconciler) reconcileDelete(
274295
return nil
275296
}
276297

298+
func (r *ROSAMachinePoolReconciler) reconcileMachinePoolVersion(machinePoolScope *scope.RosaMachinePoolScope, rosaClient *rosa.RosaClient, nodePool *cmv1.NodePool) error {
299+
version := machinePoolScope.RosaMachinePool.Spec.Version
300+
if version == "" {
301+
version = machinePoolScope.ControlPlane.Spec.Version
302+
}
303+
304+
if version == rosa.RawVersionID(nodePool.Version()) {
305+
conditions.MarkFalse(machinePoolScope.RosaMachinePool, expinfrav1.RosaMachinePoolUpgradingCondition, "upgraded", clusterv1.ConditionSeverityInfo, "")
306+
return nil
307+
}
308+
309+
clusterID := *machinePoolScope.ControlPlane.Status.ID
310+
scheduledUpgrade, err := rosaClient.CheckNodePoolExistingScheduledUpgrade(clusterID, nodePool)
311+
if err != nil {
312+
return fmt.Errorf("failed to get existing scheduled upgrades: %w", err)
313+
}
314+
315+
if scheduledUpgrade == nil {
316+
scheduledUpgrade, err = rosaClient.ScheduleNodePoolUpgrade(clusterID, nodePool, version, time.Now())
317+
if err != nil {
318+
return fmt.Errorf("failed to schedule nodePool upgrade to version %s: %w", version, err)
319+
}
320+
}
321+
322+
condition := &clusterv1.Condition{
323+
Type: expinfrav1.RosaMachinePoolUpgradingCondition,
324+
Status: corev1.ConditionTrue,
325+
Reason: string(scheduledUpgrade.State().Value()),
326+
Message: fmt.Sprintf("Upgrading to version %s", scheduledUpgrade.Version()),
327+
}
328+
conditions.Set(machinePoolScope.RosaMachinePool, condition)
329+
330+
// if nodePool is already upgrading to another version we need to wait until the current upgrade is finished, return an error to requeue and try later.
331+
if scheduledUpgrade.Version() != version {
332+
return fmt.Errorf("there is already a %s upgrade to version %s", scheduledUpgrade.State().Value(), scheduledUpgrade.Version())
333+
}
334+
335+
return nil
336+
}
337+
338+
func validateMachinePoolSpec(machinePoolScope *scope.RosaMachinePoolScope) (*string, error) {
339+
if machinePoolScope.RosaMachinePool.Spec.Version == "" {
340+
return nil, nil
341+
}
342+
343+
version, err := semver.Parse(machinePoolScope.RosaMachinePool.Spec.Version)
344+
if err != nil {
345+
return nil, fmt.Errorf("failed to parse MachinePool version: %w", err)
346+
}
347+
minSupportedVersion, maxSupportedVersion, err := rosa.MachinePoolSupportedVersionsRange(machinePoolScope.ControlPlane.Spec.Version)
348+
if err != nil {
349+
return nil, fmt.Errorf("failed to get supported machinePool versions range: %w", err)
350+
}
351+
352+
if version.GT(*maxSupportedVersion) || version.LT(*minSupportedVersion) {
353+
message := fmt.Sprintf("version %s is not supported, should be in the range: >= %s and <= %s", version, minSupportedVersion, maxSupportedVersion)
354+
return &message, nil
355+
}
356+
357+
// TODO: add more input validations
358+
return nil, nil
359+
}
360+
277361
func rosaControlPlaneToRosaMachinePoolMapFunc(c client.Client, gvk schema.GroupVersionKind, log logger.Wrapper) handler.MapFunc {
278362
return func(ctx context.Context, o client.Object) []reconcile.Request {
279363
rosaControlPlane, ok := o.(*rosacontrolplanev1.ROSAControlPlane)

pkg/rosa/nodepools.go

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
package rosa
22

3-
import cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
3+
import (
4+
"time"
5+
6+
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
7+
)
48

59
// CreateNodePool adds a new node pool to the cluster.
610
func (c *RosaClient) CreateNodePool(clusterID string, nodePool *cmv1.NodePool) (*cmv1.NodePool, error) {
@@ -70,3 +74,73 @@ func (c *RosaClient) DeleteNodePool(clusterID string, nodePoolID string) error {
7074
}
7175
return nil
7276
}
77+
78+
// CheckNodePoolExistingScheduledUpgrade checks and returns the current upgrade schedule for the nodePool if any.
79+
func (c *RosaClient) CheckNodePoolExistingScheduledUpgrade(clusterID string, nodePool *cmv1.NodePool) (*cmv1.NodePoolUpgradePolicy, error) {
80+
upgradePolicies, err := c.getNodePoolUpgradePolicies(clusterID, nodePool.ID())
81+
if err != nil {
82+
return nil, err
83+
}
84+
for _, upgradePolicy := range upgradePolicies {
85+
if upgradePolicy.UpgradeType() == cmv1.UpgradeTypeNodePool {
86+
return upgradePolicy, nil
87+
}
88+
}
89+
return nil, nil
90+
}
91+
92+
// ScheduleNodePoolUpgrade schedules a new nodePool upgrade to the specified version at the specified time.
93+
func (c *RosaClient) ScheduleNodePoolUpgrade(clusterID string, nodePool *cmv1.NodePool, version string, nextRun time.Time) (*cmv1.NodePoolUpgradePolicy, error) {
94+
// earliestNextRun is set to at least 5 min from now by the OCM API.
95+
// we set it to 6 min here to account for latency.
96+
earliestNextRun := time.Now().Add(time.Minute * 6)
97+
if nextRun.Before(earliestNextRun) {
98+
nextRun = earliestNextRun
99+
}
100+
101+
upgradePolicy, err := cmv1.NewNodePoolUpgradePolicy().
102+
UpgradeType(cmv1.UpgradeTypeNodePool).
103+
NodePoolID(nodePool.ID()).
104+
ScheduleType(cmv1.ScheduleTypeManual).
105+
Version(version).
106+
NextRun(nextRun).
107+
Build()
108+
if err != nil {
109+
return nil, err
110+
}
111+
112+
response, err := c.ocm.ClustersMgmt().V1().
113+
Clusters().Cluster(clusterID).
114+
NodePools().
115+
NodePool(nodePool.ID()).UpgradePolicies().
116+
Add().Body(upgradePolicy).
117+
Send()
118+
if err != nil {
119+
return nil, handleErr(response.Error(), err)
120+
}
121+
122+
return response.Body(), nil
123+
}
124+
125+
func (c *RosaClient) getNodePoolUpgradePolicies(clusterID string, nodePoolID string) (nodePoolUpgradePolicies []*cmv1.NodePoolUpgradePolicy, err error) {
126+
collection := c.ocm.ClustersMgmt().V1().
127+
Clusters().
128+
Cluster(clusterID).NodePools().NodePool(nodePoolID).UpgradePolicies()
129+
page := 1
130+
size := 100
131+
for {
132+
response, err := collection.List().
133+
Page(page).
134+
Size(size).
135+
Send()
136+
if err != nil {
137+
return nil, handleErr(response.Error(), err)
138+
}
139+
nodePoolUpgradePolicies = append(nodePoolUpgradePolicies, response.Items().Slice()...)
140+
if response.Size() < size {
141+
break
142+
}
143+
page++
144+
}
145+
return
146+
}

0 commit comments

Comments
 (0)