Skip to content

Commit e2bab2f

Browse files
authored
Merge pull request #8595 from ykakarap/pr-double-rollout_ms-preflight-check
✨ MS preflight checks to improve cluster stability
2 parents 89429b8 + 5b6a0f7 commit e2bab2f

20 files changed

+1061
-74
lines changed

api/v1beta1/common_types.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,18 @@ const (
117117
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
118118
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"
119119

120+
// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
121+
// preflight checks that should be skipped during the MachineSet reconciliation.
122+
// Supported items are:
123+
// - KubeadmVersion (skips the kubeadm version skew preflight check)
124+
// - KubernetesVersion (skips the kubernetes version skew preflight check)
125+
// - ControlPlaneStable (skips checking that the control plane is neither provisioning nor upgrading)
126+
// - All (skips all preflight checks)
127+
// Example: "machineset.cluster.x-k8s.io/skip-preflight-checks": "ControlPlaneStable,KubernetesVersion".
128+
// Note: The annotation can also be set on a MachineDeployment as MachineDeployment annotations are synced to
129+
// the MachineSet.
130+
MachineSetSkipPreflightChecksAnnotation = "machineset.cluster.x-k8s.io/skip-preflight-checks"
131+
120132
// ClusterSecretType defines the type of secret created by core components.
121133
// Note: This is used by core CAPI, CAPBK, and KCP to determine whether a secret is created by the controllers
122134
// themselves or supplied by the user (e.g. bring your own certificates).
@@ -173,6 +185,38 @@ const (
173185
VariableDefinitionFromInline = "inline"
174186
)
175187

188+
// MachineSetPreflightCheck defines a valid MachineSet preflight check.
189+
type MachineSetPreflightCheck string
190+
191+
const (
192+
// MachineSetPreflightCheckAll can be used to represent all the MachineSet preflight checks.
193+
MachineSetPreflightCheckAll MachineSetPreflightCheck = "All"
194+
195+
// MachineSetPreflightCheckKubeadmVersionSkew is the name of the preflight check
196+
// that verifies if the machine being created or remediated for the MachineSet conforms to the kubeadm version
197+
// skew policy that requires the machine to be at the same version as the control plane.
198+
// Note: This is a stopgap while the root cause of the problem is fixed in kubeadm; this check will become
199+
// a no-op when this check will be available in kubeadm, and then eventually be dropped when all the
200+
// supported Kuberenetes/kubeadm versions have implemented the fix.
201+
// The preflight check is only run if a ControlPlane is used (controlPlaneRef must exist in the Cluster),
202+
// the ControlPlane has a version, the MachineSet has a version and the MachineSet uses the Kubeadm bootstrap
203+
// provider.
204+
MachineSetPreflightCheckKubeadmVersionSkew MachineSetPreflightCheck = "KubeadmVersionSkew"
205+
206+
// MachineSetPreflightCheckKubernetesVersionSkew is the name of the preflight check that verifies
207+
// if the machines being created or remediated for the MachineSet conform to the Kubernetes version skew policy
208+
// that requires the machines to be at a version that is not more than 2 minor lower than the ControlPlane version.
209+
// The preflight check is only run if a ControlPlane is used (controlPlaneRef must exist in the Cluster),
210+
// the ControlPlane has a version and the MachineSet has a version.
211+
MachineSetPreflightCheckKubernetesVersionSkew MachineSetPreflightCheck = "KubernetesVersionSkew"
212+
213+
// MachineSetPreflightCheckControlPlaneIsStable is the name of the preflight check
214+
// that verifies if the control plane is not provisioning and not upgrading.
215+
// The preflight check is only run if a ControlPlane is used (controlPlaneRef must exist in the Cluster)
216+
// and the ControlPlane has a version.
217+
MachineSetPreflightCheckControlPlaneIsStable MachineSetPreflightCheck = "ControlPlaneIsStable"
218+
)
219+
176220
// NodeUninitializedTaint can be added to Nodes at creation by the bootstrap provider, e.g. the
177221
// KubeadmBootstrap provider will add the taint.
178222
// This taint is used to prevent workloads to be scheduled on Nodes before the node is initialized by Cluster API.

api/v1beta1/machinedeployment_webhook.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
"sigs.k8s.io/controller-runtime/pkg/webhook"
3737
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
3838

39+
"sigs.k8s.io/cluster-api/feature"
3940
"sigs.k8s.io/cluster-api/util/version"
4041
)
4142

@@ -214,6 +215,14 @@ func (m *MachineDeployment) validate(old *MachineDeployment) error {
214215
)
215216
}
216217

218+
// MachineSet preflight checks that should be skipped could also be set as annotation on the MachineDeployment
219+
// since MachineDeployment annotations are synced to the MachineSet.
220+
if feature.Gates.Enabled(feature.MachineSetPreflightChecks) {
221+
if err := validateSkippedMachineSetPreflightChecks(m); err != nil {
222+
allErrs = append(allErrs, err)
223+
}
224+
}
225+
217226
if old != nil && old.Spec.ClusterName != m.Spec.ClusterName {
218227
allErrs = append(
219228
allErrs,

api/v1beta1/machineset_webhook.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@ import (
2424
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2525
"k8s.io/apimachinery/pkg/labels"
2626
"k8s.io/apimachinery/pkg/runtime"
27+
"k8s.io/apimachinery/pkg/util/sets"
2728
"k8s.io/apimachinery/pkg/util/validation/field"
2829
ctrl "sigs.k8s.io/controller-runtime"
30+
"sigs.k8s.io/controller-runtime/pkg/client"
2931
"sigs.k8s.io/controller-runtime/pkg/webhook"
3032
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
3133

34+
"sigs.k8s.io/cluster-api/feature"
3235
capilabels "sigs.k8s.io/cluster-api/internal/labels"
3336
"sigs.k8s.io/cluster-api/util/version"
3437
)
@@ -120,6 +123,12 @@ func (m *MachineSet) validate(old *MachineSet) error {
120123
)
121124
}
122125

126+
if feature.Gates.Enabled(feature.MachineSetPreflightChecks) {
127+
if err := validateSkippedMachineSetPreflightChecks(m); err != nil {
128+
allErrs = append(allErrs, err)
129+
}
130+
}
131+
123132
if old != nil && old.Spec.ClusterName != m.Spec.ClusterName {
124133
allErrs = append(
125134
allErrs,
@@ -149,3 +158,37 @@ func (m *MachineSet) validate(old *MachineSet) error {
149158

150159
return apierrors.NewInvalid(GroupVersion.WithKind("MachineSet").GroupKind(), m.Name, allErrs)
151160
}
161+
162+
func validateSkippedMachineSetPreflightChecks(o client.Object) *field.Error {
163+
if o == nil {
164+
return nil
165+
}
166+
skip := o.GetAnnotations()[MachineSetSkipPreflightChecksAnnotation]
167+
if skip == "" {
168+
return nil
169+
}
170+
171+
supported := sets.New[MachineSetPreflightCheck](
172+
MachineSetPreflightCheckAll,
173+
MachineSetPreflightCheckKubeadmVersionSkew,
174+
MachineSetPreflightCheckKubernetesVersionSkew,
175+
MachineSetPreflightCheckControlPlaneIsStable,
176+
)
177+
178+
skippedList := strings.Split(skip, ",")
179+
invalid := []MachineSetPreflightCheck{}
180+
for i := range skippedList {
181+
skipped := MachineSetPreflightCheck(strings.TrimSpace(skippedList[i]))
182+
if !supported.Has(skipped) {
183+
invalid = append(invalid, skipped)
184+
}
185+
}
186+
if len(invalid) > 0 {
187+
return field.Invalid(
188+
field.NewPath("metadata", "annotations", MachineSetSkipPreflightChecksAnnotation),
189+
invalid,
190+
fmt.Sprintf("skipped preflight check(s) must be among: %v", sets.List(supported)),
191+
)
192+
}
193+
return nil
194+
}

api/v1beta1/machineset_webhook_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,3 +228,73 @@ func TestMachineSetVersionValidation(t *testing.T) {
228228
})
229229
}
230230
}
231+
232+
func TestValidateSkippedMachineSetPreflightChecks(t *testing.T) {
233+
tests := []struct {
234+
name string
235+
ms *MachineSet
236+
expectErr bool
237+
}{
238+
{
239+
name: "should pass if the machine set skip preflight checks annotation is not set",
240+
ms: &MachineSet{},
241+
expectErr: false,
242+
},
243+
{
244+
name: "should pass if not preflight checks are skipped",
245+
ms: &MachineSet{
246+
ObjectMeta: metav1.ObjectMeta{
247+
Annotations: map[string]string{
248+
MachineSetSkipPreflightChecksAnnotation: "",
249+
},
250+
},
251+
},
252+
expectErr: false,
253+
},
254+
{
255+
name: "should pass if only valid preflight checks are skipped (single)",
256+
ms: &MachineSet{
257+
ObjectMeta: metav1.ObjectMeta{
258+
Annotations: map[string]string{
259+
MachineSetSkipPreflightChecksAnnotation: string(MachineSetPreflightCheckKubeadmVersionSkew),
260+
},
261+
},
262+
},
263+
expectErr: false,
264+
},
265+
{
266+
name: "should pass if only valid preflight checks are skipped (multiple)",
267+
ms: &MachineSet{
268+
ObjectMeta: metav1.ObjectMeta{
269+
Annotations: map[string]string{
270+
MachineSetSkipPreflightChecksAnnotation: string(MachineSetPreflightCheckKubeadmVersionSkew) + "," + string(MachineSetPreflightCheckControlPlaneIsStable),
271+
},
272+
},
273+
},
274+
expectErr: false,
275+
},
276+
{
277+
name: "should fail if invalid preflight checks are skipped",
278+
ms: &MachineSet{
279+
ObjectMeta: metav1.ObjectMeta{
280+
Annotations: map[string]string{
281+
MachineSetSkipPreflightChecksAnnotation: string(MachineSetPreflightCheckKubeadmVersionSkew) + ",invalid-preflight-check-name",
282+
},
283+
},
284+
},
285+
expectErr: true,
286+
},
287+
}
288+
289+
for _, tt := range tests {
290+
t.Run(tt.name, func(t *testing.T) {
291+
g := NewWithT(t)
292+
err := validateSkippedMachineSetPreflightChecks(tt.ms)
293+
if tt.expectErr {
294+
g.Expect(err).NotTo(BeNil())
295+
} else {
296+
g.Expect(err).To(BeNil())
297+
}
298+
})
299+
}
300+
}

config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ spec:
2222
args:
2323
- "--leader-elect"
2424
- "--metrics-bind-addr=localhost:8080"
25-
- "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=false},ClusterResourceSet=${EXP_CLUSTER_RESOURCE_SET:=false},ClusterTopology=${CLUSTER_TOPOLOGY:=false},RuntimeSDK=${EXP_RUNTIME_SDK:=false}"
25+
- "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=false},ClusterResourceSet=${EXP_CLUSTER_RESOURCE_SET:=false},ClusterTopology=${CLUSTER_TOPOLOGY:=false},RuntimeSDK=${EXP_RUNTIME_SDK:=false},MachineSetPreflightChecks=${EXP_MACHINE_SET_PREFLIGHT_CHECKS:=false}"
2626
image: controller:latest
2727
name: manager
2828
env:

docs/book/src/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
- [Healthchecking](./tasks/automated-machine-management/healthchecking.md)
2727
- [Experimental Features](./tasks/experimental-features/experimental-features.md)
2828
- [MachinePools](./tasks/experimental-features/machine-pools.md)
29+
- [MachineSetPreflightChecks](./tasks/experimental-features/machineset-preflight-checks.md)
2930
- [ClusterResourceSet](./tasks/experimental-features/cluster-resource-set.md)
3031
- [ClusterClass](./tasks/experimental-features/cluster-class/index.md)
3132
- [Writing a ClusterClass](./tasks/experimental-features/cluster-class/write-clusterclass.md)

docs/book/src/developer/testing.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,7 @@ kustomize_substitutions:
267267
EXP_CLUSTER_RESOURCE_SET: "true"
268268
EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true"
269269
EXP_RUNTIME_SDK: "true"
270+
EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true"
270271
```
271272
272273
</aside>

docs/book/src/developer/tilt.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ kustomize_substitutions:
110110
EXP_CLUSTER_RESOURCE_SET: "true"
111111
EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true"
112112
EXP_RUNTIME_SDK: "true"
113+
EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true"
113114
```
114115

115116
{{#tabs name:"tab-tilt-kustomize-substitution" tabs:"AWS,Azure,DigitalOcean,GCP,vSphere"}}

0 commit comments

Comments
 (0)