Skip to content

Commit 3433393

Browse files
authored
Merge pull request #8669 from ykakarap/pr-ms-preflight-checks_condition
🌱 surface failed preflight checks on MachineSet in `MachinesCreated` condition
2 parents 44406cf + 031cb49 commit 3433393

File tree

5 files changed

+287
-57
lines changed

5 files changed

+287
-57
lines changed

api/v1beta1/condition_consts.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ const (
243243
// MachinesReadyCondition reports an aggregate of current status of the machines controlled by the MachineSet.
244244
MachinesReadyCondition ConditionType = "MachinesReady"
245245

246+
// PreflightCheckFailedReason (Severity=Error) documents a MachineSet failing preflight checks
247+
// to create machine(s).
248+
PreflightCheckFailedReason = "PreflightCheckFailed"
249+
246250
// BootstrapTemplateCloningFailedReason (Severity=Error) documents a MachineSet failing to
247251
// clone the bootstrap template.
248252
BootstrapTemplateCloningFailedReason = "BootstrapTemplateCloningFailed"

internal/controllers/machineset/machineset_controller.go

Lines changed: 81 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -296,51 +296,13 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,
296296
filteredMachines = append(filteredMachines, machine)
297297
}
298298

299-
// Remediate failed Machines by deleting them.
300-
var errs []error
301-
machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
302-
for _, machine := range filteredMachines {
303-
// filteredMachines contains machines in deleting status to calculate correct status.
304-
// skip remediation for those in deleting status.
305-
if !machine.DeletionTimestamp.IsZero() {
306-
continue
307-
}
308-
if conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) {
309-
machinesToRemediate = append(machinesToRemediate, machine)
310-
}
311-
}
312-
313299
result := ctrl.Result{}
314-
if len(machinesToRemediate) > 0 {
315-
preflightChecksResult, err := r.runPreflightChecks(ctx, cluster, machineSet, "Machine Remediation")
316-
if err != nil {
317-
return ctrl.Result{}, err
318-
}
319-
// Delete the machines only if the preflight checks have passed. Do not delete machines if we cannot
320-
// guarantee creating new machines.
321-
if preflightChecksResult.IsZero() {
322-
for _, machine := range machinesToRemediate {
323-
log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller")
324-
patch := client.MergeFrom(machine.DeepCopy())
325-
if err := r.Client.Delete(ctx, machine); err != nil {
326-
errs = append(errs, errors.Wrap(err, "failed to delete"))
327-
continue
328-
}
329-
conditions.MarkTrue(machine, clusterv1.MachineOwnerRemediatedCondition)
330-
if err := r.Client.Status().Patch(ctx, machine, patch); err != nil && !apierrors.IsNotFound(err) {
331-
errs = append(errs, errors.Wrap(err, "failed to update status"))
332-
}
333-
}
334-
} else {
335-
result = util.LowestNonZeroResult(result, preflightChecksResult)
336-
}
337-
}
338300

339-
err = kerrors.NewAggregate(errs)
301+
reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, filteredMachines)
340302
if err != nil {
341-
log.Info("Failed while deleting unhealthy machines", "err", err)
342-
return ctrl.Result{}, errors.Wrap(err, "failed to remediate machines")
303+
return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines")
343304
}
305+
result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult)
344306

345307
if err := r.syncMachines(ctx, machineSet, filteredMachines); err != nil {
346308
return ctrl.Result{}, errors.Wrap(err, "failed to update Machines")
@@ -482,8 +444,13 @@ func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluste
482444
}
483445
}
484446

485-
result, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
447+
result, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
486448
if err != nil || !result.IsZero() {
449+
if err != nil {
450+
// If the error is not nil, use it as the message for the condition.
451+
preflightCheckErrMessage = err.Error()
452+
}
453+
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
487454
return result, err
488455
}
489456

@@ -991,6 +958,78 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus
991958
return node, nil
992959
}
993960

961+
func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) {
962+
log := ctrl.LoggerFrom(ctx)
963+
// List all unhealthy machines.
964+
machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
965+
for _, m := range filteredMachines {
966+
// filteredMachines contains machines in deleting status to calculate correct status.
967+
// skip remediation for those in deleting status.
968+
if !m.DeletionTimestamp.IsZero() {
969+
continue
970+
}
971+
if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) {
972+
machinesToRemediate = append(machinesToRemediate, m)
973+
}
974+
}
975+
976+
// If there are no machines to remediate return early.
977+
if len(machinesToRemediate) == 0 {
978+
return ctrl.Result{}, nil
979+
}
980+
981+
preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation")
982+
if err != nil {
983+
// If err is not nil, use its message as the preflightCheckErrMessage.
984+
preflightCheckErrMessage = err.Error()
985+
}
986+
987+
preflightChecksFailed := err != nil || !preflightChecksResult.IsZero()
988+
if preflightChecksFailed {
989+
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
990+
// WaitingForRemediationReason as the reason.
991+
var errs []error
992+
for _, m := range machinesToRemediate {
993+
patchHelper, err := patch.NewHelper(m, r.Client)
994+
if err != nil {
995+
errs = append(errs, errors.Wrapf(err, "failed to create patch helper for Machine %s", klog.KObj(m)))
996+
continue
997+
}
998+
conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage)
999+
if err := patchHelper.Patch(ctx, m); err != nil {
1000+
errs = append(errs, errors.Wrapf(err, "failed to patch Machine %s", klog.KObj(m)))
1001+
}
1002+
}
1003+
1004+
if len(errs) > 0 {
1005+
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines")
1006+
}
1007+
return preflightChecksResult, nil
1008+
}
1009+
1010+
// PreflightChecks passed, so it is safe to remediate unhealthy machines.
1011+
// Remediate unhealthy machines by deleting them.
1012+
var errs []error
1013+
for _, m := range machinesToRemediate {
1014+
log.Info(fmt.Sprintf("Deleting Machine %s because it was marked as unhealthy by the MachineHealthCheck controller", klog.KObj(m)))
1015+
patch := client.MergeFrom(m.DeepCopy())
1016+
if err := r.Client.Delete(ctx, m); err != nil {
1017+
errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
1018+
continue
1019+
}
1020+
conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
1021+
if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) {
1022+
errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m)))
1023+
}
1024+
}
1025+
1026+
if len(errs) > 0 {
1027+
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")
1028+
}
1029+
1030+
return ctrl.Result{}, nil
1031+
}
1032+
9941033
func reconcileExternalTemplateReference(ctx context.Context, c client.Client, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error {
9951034
if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
9961035
return nil

internal/controllers/machineset/machineset_controller_test.go

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,15 @@ import (
2626
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2727
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2828
"k8s.io/client-go/tools/record"
29+
utilfeature "k8s.io/component-base/featuregate/testing"
2930
"k8s.io/utils/pointer"
3031
"sigs.k8s.io/controller-runtime/pkg/client"
3132
"sigs.k8s.io/controller-runtime/pkg/client/fake"
3233
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3334

3435
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3536
"sigs.k8s.io/cluster-api/controllers/external"
37+
"sigs.k8s.io/cluster-api/feature"
3638
"sigs.k8s.io/cluster-api/internal/contract"
3739
"sigs.k8s.io/cluster-api/internal/test/builder"
3840
"sigs.k8s.io/cluster-api/internal/util/ssa"
@@ -1326,6 +1328,190 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) {
13261328
}, 5*time.Second).Should(Succeed())
13271329
}
13281330

1331+
func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
1332+
t.Run("should delete unhealthy machines if preflight checks pass", func(t *testing.T) {
1333+
defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)()
1334+
1335+
g := NewWithT(t)
1336+
1337+
controlPlaneStable := builder.ControlPlane("default", "cp1").
1338+
WithVersion("v1.26.2").
1339+
WithStatusFields(map[string]interface{}{
1340+
"status.version": "v1.26.2",
1341+
}).
1342+
Build()
1343+
cluster := &clusterv1.Cluster{
1344+
ObjectMeta: metav1.ObjectMeta{
1345+
Name: "test-cluster",
1346+
Namespace: "default",
1347+
},
1348+
Spec: clusterv1.ClusterSpec{
1349+
ControlPlaneRef: contract.ObjToRef(controlPlaneStable),
1350+
},
1351+
}
1352+
machineSet := &clusterv1.MachineSet{}
1353+
1354+
unhealthyMachine := &clusterv1.Machine{
1355+
ObjectMeta: metav1.ObjectMeta{
1356+
Name: "unhealthy-machine",
1357+
Namespace: "default",
1358+
},
1359+
Status: clusterv1.MachineStatus{
1360+
Conditions: []clusterv1.Condition{
1361+
{
1362+
Type: clusterv1.MachineOwnerRemediatedCondition,
1363+
Status: corev1.ConditionFalse,
1364+
},
1365+
},
1366+
},
1367+
}
1368+
healthyMachine := &clusterv1.Machine{
1369+
ObjectMeta: metav1.ObjectMeta{
1370+
Name: "healthy-machine",
1371+
Namespace: "default",
1372+
},
1373+
}
1374+
1375+
machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine}
1376+
1377+
fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneStable, unhealthyMachine, healthyMachine).Build()
1378+
r := &Reconciler{Client: fakeClient}
1379+
_, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines)
1380+
g.Expect(err).To(BeNil())
1381+
// Verify the unhealthy machine is deleted.
1382+
m := &clusterv1.Machine{}
1383+
err = r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)
1384+
g.Expect(apierrors.IsNotFound(err)).To(BeTrue())
1385+
// Verify the healthy machine is not deleted.
1386+
m = &clusterv1.Machine{}
1387+
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).Should(Succeed())
1388+
})
1389+
1390+
t.Run("should update the unhealthy machine MachineOwnerRemediated condition if preflight checks did not pass", func(t *testing.T) {
1391+
defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)()
1392+
1393+
g := NewWithT(t)
1394+
1395+
// An upgrading control plane should cause the preflight checks to not pass.
1396+
controlPlaneUpgrading := builder.ControlPlane("default", "cp1").
1397+
WithVersion("v1.26.2").
1398+
WithStatusFields(map[string]interface{}{
1399+
"status.version": "v1.25.2",
1400+
}).
1401+
Build()
1402+
cluster := &clusterv1.Cluster{
1403+
ObjectMeta: metav1.ObjectMeta{
1404+
Name: "test-cluster",
1405+
Namespace: "default",
1406+
},
1407+
Spec: clusterv1.ClusterSpec{
1408+
ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading),
1409+
},
1410+
}
1411+
machineSet := &clusterv1.MachineSet{}
1412+
1413+
unhealthyMachine := &clusterv1.Machine{
1414+
ObjectMeta: metav1.ObjectMeta{
1415+
Name: "unhealthy-machine",
1416+
Namespace: "default",
1417+
},
1418+
Status: clusterv1.MachineStatus{
1419+
Conditions: []clusterv1.Condition{
1420+
{
1421+
Type: clusterv1.MachineOwnerRemediatedCondition,
1422+
Status: corev1.ConditionFalse,
1423+
},
1424+
},
1425+
},
1426+
}
1427+
healthyMachine := &clusterv1.Machine{
1428+
ObjectMeta: metav1.ObjectMeta{
1429+
Name: "healthy-machine",
1430+
Namespace: "default",
1431+
},
1432+
}
1433+
1434+
machines := []*clusterv1.Machine{unhealthyMachine, healthyMachine}
1435+
fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneUpgrading, unhealthyMachine, healthyMachine).WithStatusSubresource(&clusterv1.Machine{}).Build()
1436+
r := &Reconciler{Client: fakeClient}
1437+
_, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines)
1438+
g.Expect(err).To(BeNil())
1439+
1440+
// Verify the unhealthy machine has the updated condition.
1441+
condition := clusterv1.MachineOwnerRemediatedCondition
1442+
m := &clusterv1.Machine{}
1443+
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(unhealthyMachine), m)).To(Succeed())
1444+
g.Expect(conditions.Has(m, condition)).
1445+
To(BeTrue(), "Machine should have the %s condition set", condition)
1446+
machineOwnerRemediatedCondition := conditions.Get(m, condition)
1447+
g.Expect(machineOwnerRemediatedCondition.Status).
1448+
To(Equal(corev1.ConditionFalse), "%s condition status should be false", condition)
1449+
g.Expect(machineOwnerRemediatedCondition.Reason).
1450+
To(Equal(clusterv1.WaitingForRemediationReason), "%s condition should have reason %s", condition, clusterv1.WaitingForRemediationReason)
1451+
1452+
// Verify the healthy machine continues to not have the MachineOwnerRemediated condition.
1453+
m = &clusterv1.Machine{}
1454+
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(healthyMachine), m)).To(Succeed())
1455+
g.Expect(conditions.Has(m, condition)).
1456+
To(BeFalse(), "Machine should not have the %s condition set", condition)
1457+
})
1458+
}
1459+
1460+
func TestMachineSetReconciler_syncReplicas(t *testing.T) {
1461+
t.Run("should hold off on creating new machines when preflight checks do not pass", func(t *testing.T) {
1462+
defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)()
1463+
1464+
g := NewWithT(t)
1465+
1466+
// An upgrading control plane should cause the preflight checks to not pass.
1467+
controlPlaneUpgrading := builder.ControlPlane("default", "test-cp").
1468+
WithVersion("v1.26.2").
1469+
WithStatusFields(map[string]interface{}{
1470+
"status.version": "v1.25.2",
1471+
}).
1472+
Build()
1473+
cluster := &clusterv1.Cluster{
1474+
ObjectMeta: metav1.ObjectMeta{
1475+
Name: "test-cluster",
1476+
Namespace: "default",
1477+
},
1478+
Spec: clusterv1.ClusterSpec{
1479+
ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading),
1480+
},
1481+
}
1482+
machineSet := &clusterv1.MachineSet{
1483+
ObjectMeta: metav1.ObjectMeta{
1484+
Name: "test-machineset",
1485+
Namespace: "default",
1486+
},
1487+
Spec: clusterv1.MachineSetSpec{
1488+
Replicas: pointer.Int32(1),
1489+
},
1490+
}
1491+
1492+
fakeClient := fake.NewClientBuilder().WithObjects(controlPlaneUpgrading, machineSet).WithStatusSubresource(&clusterv1.MachineSet{}).Build()
1493+
r := &Reconciler{Client: fakeClient}
1494+
result, err := r.syncReplicas(ctx, cluster, machineSet, nil)
1495+
g.Expect(err).To(BeNil())
1496+
g.Expect(result.IsZero()).To(BeFalse(), "syncReplicas should not return a 'zero' result")
1497+
1498+
// Verify the proper condition is set on the MachineSet.
1499+
condition := clusterv1.MachinesCreatedCondition
1500+
g.Expect(conditions.Has(machineSet, condition)).
1501+
To(BeTrue(), "MachineSet should have the %s condition set", condition)
1502+
machinesCreatedCondition := conditions.Get(machineSet, condition)
1503+
g.Expect(machinesCreatedCondition.Status).
1504+
To(Equal(corev1.ConditionFalse), "%s condition status should be %s", condition, corev1.ConditionFalse)
1505+
g.Expect(machinesCreatedCondition.Reason).
1506+
To(Equal(clusterv1.PreflightCheckFailedReason), "%s condition reason should be %s", condition, clusterv1.PreflightCheckFailedReason)
1507+
1508+
// Verify no new Machines are created.
1509+
machineList := &clusterv1.MachineList{}
1510+
g.Expect(r.Client.List(ctx, machineList)).To(Succeed())
1511+
g.Expect(machineList.Items).To(BeEmpty(), "There should not be any machines")
1512+
})
1513+
}
1514+
13291515
func TestComputeDesiredMachine(t *testing.T) {
13301516
duration5s := &metav1.Duration{Duration: 5 * time.Second}
13311517
duration10s := &metav1.Duration{Duration: 10 * time.Second}

0 commit comments

Comments
 (0)