Skip to content

Commit 7a29c87

Browse files
aaronfern and acumino authored
Add taint for critical components not ready after update (#1017) (#1018)
* Add taint for critical components not ready after update

  After a successful in-place update, the controller now adds the
  'critical components not ready' taint to the node. This prevents pods
  from being scheduled until critical component pods are ready,
  improving node readiness handling.

* Remove PreferNoSchedule taint after in-place update

  After a successful in-place update, the PreferNoSchedule taint is now
  removed from the node if present. This ensures that nodes are properly
  untainted and available for scheduling as expected.

Co-authored-by: Sonu Kumar Singh <sksgkpvks@gmail.com>
1 parent 76e93e3 commit 7a29c87

File tree

3 files changed

+19
-6
lines changed

3 files changed

+19
-6
lines changed

pkg/controller/deployment_inplace.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"context"
99
"fmt"
1010
"maps"
11+
"slices"
1112
"sort"
1213

1314
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
@@ -69,7 +70,7 @@ func (dc *controller) rolloutInPlace(ctx context.Context, d *v1alpha1.MachineDep
6970
oldMachineSets, &v1.Taint{
7071
Key: PreferNoScheduleKey,
7172
Value: "True",
72-
Effect: "PreferNoSchedule",
73+
Effect: v1.TaintEffectPreferNoSchedule,
7374
},
7475
)
7576
if err != nil {
@@ -193,6 +194,18 @@ func (dc *controller) syncMachineSets(ctx context.Context, oldMachineSets []*v1a
193194
// uncordon the node since the inplace update is successful.
194195
node.Spec.Unschedulable = false
195196

197+
// remove the PreferNoSchedule taint if it exists which was added during the inplace update.
198+
node.Spec.Taints = slices.DeleteFunc(node.Spec.Taints, func(t v1.Taint) bool {
199+
return t.Key == PreferNoScheduleKey && t.Value == "True" && t.Effect == v1.TaintEffectPreferNoSchedule
200+
})
201+
202+
// add the critical components not ready taint to the node. This is to ensure that
203+
// workload pods are not scheduled on the node until the critical components pods are ready.
204+
node.Spec.Taints = append(node.Spec.Taints, v1.Taint{
205+
Key: machineutils.TaintNodeCriticalComponentsNotReady,
206+
Effect: v1.TaintEffectNoSchedule,
207+
})
208+
196209
_, err = dc.targetCoreClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})
197210
if err != nil {
198211
return fmt.Errorf("failed to remove inplace labels/annotations and uncordon node %s: %w", node.Name, err)

pkg/util/provider/machinecontroller/controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ import (
4444
)
4545

4646
const (
47-
// MCMFinalizerName is the finalizer used to tag dependecies before deletion
47+
// MCMFinalizerName is the finalizer used to tag dependencies before deletion
4848
// of the object. This finalizer is carried over from the MCM
4949
MCMFinalizerName = "machine.sapcloud.io/machine-controller-manager"
5050
// MCFinalizerName is the finalizer created for the external

pkg/util/provider/machinecontroller/machine_util.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -954,8 +954,8 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
954954
// if the label update successful or failed, then skip the timeout check
955955
if node != nil && metav1.HasLabel(node.ObjectMeta, v1alpha1.LabelKeyNodeUpdateResult) {
956956
if node.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful && clone.Status.CurrentStatus.Phase != v1alpha1.MachineInPlaceUpdateSuccessful {
957-
description = fmt.Sprintf("Machine %s successfully updated dependecies", machine.Name)
958-
klog.V(2).Infof("%s with backing node %q and providerID %q sucessfully update the dependecies", description, getNodeName(machine), getProviderID(machine))
957+
description = fmt.Sprintf("Machine %s successfully updated dependencies", machine.Name)
958+
klog.V(2).Infof("%s with backing node %q and providerID %q sucessfully update the dependencies", description, getNodeName(machine), getProviderID(machine))
959959
clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
960960
Phase: v1alpha1.MachineInPlaceUpdateSuccessful,
961961
LastUpdateTime: metav1.Now(),
@@ -968,8 +968,8 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
968968
}
969969
cloneDirty = true
970970
} else if node.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateFailed && clone.Status.CurrentStatus.Phase != v1alpha1.MachineInPlaceUpdateFailed {
971-
description = fmt.Sprintf("Machine %s failed to update dependecies: %s", machine.Name, node.Annotations[v1alpha1.AnnotationKeyMachineUpdateFailedReason])
972-
klog.V(2).Infof("%s with backing node %q and providerID %q failed to update dependecies", description, getNodeName(machine), getProviderID(machine))
971+
description = fmt.Sprintf("Machine %s failed to update dependencies: %s", machine.Name, node.Annotations[v1alpha1.AnnotationKeyMachineUpdateFailedReason])
972+
klog.V(2).Infof("%s with backing node %q and providerID %q failed to update dependencies", description, getNodeName(machine), getProviderID(machine))
973973
clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
974974
Phase: v1alpha1.MachineInPlaceUpdateFailed,
975975
LastUpdateTime: metav1.Now(),

0 commit comments

Comments (0)