Skip to content

Commit 3c002c1

Browse files
authored
Check and remove PreferNoSchedule taint from nodes after rolling update (#1001)
1 parent 33cab8f commit 3c002c1

File tree

5 files changed

+288
-295
lines changed

5 files changed

+288
-295
lines changed

pkg/controller/deployment_rollback.go

Lines changed: 8 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,9 @@ package controller
2525
import (
2626
"context"
2727
"fmt"
28-
"time"
29-
3028
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
31-
"github.com/gardener/machine-controller-manager/pkg/util/nodeops"
3229
v1 "k8s.io/api/core/v1"
3330
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34-
"k8s.io/apimachinery/pkg/labels"
3531
"k8s.io/apimachinery/pkg/types"
3632
"k8s.io/klog/v2"
3733
)
@@ -64,17 +60,17 @@ func (dc *controller) rollback(ctx context.Context, d *v1alpha1.MachineDeploymen
6460
klog.V(4).Infof("Found machine set %q with desired revision %d", is.Name, v)
6561

6662
// Remove PreferNoSchedule taints from nodes which were backing the machineSet
67-
err = dc.removeTaintNodesBackingMachineSet(
68-
ctx,
69-
is,
70-
&v1.Taint{
63+
if _, exists := is.Annotations[PreferNoScheduleKey]; exists {
64+
taint := &v1.Taint{
7165
Key: PreferNoScheduleKey,
7266
Value: "True",
7367
Effect: "PreferNoSchedule",
74-
},
75-
)
76-
if err != nil {
77-
klog.Warningf("Failed to remove taints %s off nodes. Error: %s", PreferNoScheduleKey, err)
68+
}
69+
70+
if err := dc.removeTaintNodesBackingMachineSet(ctx, is, taint); err != nil {
71+
klog.Errorf("Failed to remove taints %s from all nodes. Error: %v", PreferNoScheduleKey, err)
72+
return err
73+
}
7874
}
7975

8076
// rollback by copying podTemplate.Spec from the machine set
@@ -139,93 +135,3 @@ func (dc *controller) updateMachineDeploymentAndClearRollbackTo(ctx context.Cont
139135
_, err := dc.controlMachineClient.MachineDeployments(d.Namespace).Update(ctx, d, metav1.UpdateOptions{})
140136
return err
141137
}
142-
143-
// removeTaintNodesBackingMachineSet removes taints from all nodes backing the machineSets
144-
func (dc *controller) removeTaintNodesBackingMachineSet(ctx context.Context, machineSet *v1alpha1.MachineSet, taint *v1.Taint) error {
145-
146-
if _, exists := machineSet.Annotations[taint.Key]; !exists {
147-
// No taint exists
148-
klog.Warningf("No taint exists on machineSet: %s. Hence not removing.", machineSet.Name)
149-
return nil
150-
}
151-
152-
klog.V(2).Infof("Trying to untaint MachineSet object %q with %s to enable scheduling of pods", machineSet.Name, taint.Key)
153-
selector, err := metav1.LabelSelectorAsSelector(machineSet.Spec.Selector)
154-
if err != nil {
155-
return err
156-
}
157-
158-
// list all machines to include the machines that don't match the ms`s selector
159-
// anymore but has the stale controller ref.
160-
// TODO: Do the List and Filter in a single pass, or use an index.
161-
filteredMachines, err := dc.machineLister.List(labels.Everything())
162-
if err != nil {
163-
return err
164-
}
165-
// NOTE: filteredMachines are pointing to objects from cache - if you need to
166-
// modify them, you need to copy it first.
167-
filteredMachines, err = dc.claimMachines(ctx, machineSet, selector, filteredMachines)
168-
if err != nil {
169-
return err
170-
}
171-
172-
// Iterate through all machines and remove the PreferNoSchedule taint
173-
// to avoid scheduling on older machines
174-
for _, machine := range filteredMachines {
175-
if machine.Labels[v1alpha1.NodeLabelKey] != "" {
176-
node, err := dc.targetCoreClient.CoreV1().Nodes().Get(ctx, machine.Labels[v1alpha1.NodeLabelKey], metav1.GetOptions{})
177-
if err != nil {
178-
klog.Warningf("Node taint removal failed for node: %s, Error: %s", machine.Labels[v1alpha1.NodeLabelKey], err)
179-
continue
180-
}
181-
if err := nodeops.RemoveTaintOffNode(
182-
ctx,
183-
dc.targetCoreClient,
184-
machine.Labels[v1alpha1.NodeLabelKey],
185-
node,
186-
taint,
187-
); err != nil {
188-
klog.Warningf("Node taint removal failed for node: %s, Error: %s", machine.Labels[v1alpha1.NodeLabelKey], err)
189-
}
190-
}
191-
}
192-
193-
retryDeadline := time.Now().Add(maxRetryDeadline)
194-
for {
195-
machineSet, err = dc.controlMachineClient.MachineSets(machineSet.Namespace).Get(ctx, machineSet.Name, metav1.GetOptions{})
196-
if err != nil {
197-
if time.Now().Before(retryDeadline) {
198-
klog.Warningf("Unable to fetch MachineSet object %s, Error: %+v", machineSet.Name, err)
199-
time.Sleep(conflictRetryInterval)
200-
continue
201-
} else {
202-
// Timeout occurred
203-
klog.Errorf("Timeout occurred: Unable to fetch MachineSet object %s, Error: %+v", machineSet.Name, err)
204-
return err
205-
}
206-
}
207-
208-
msCopy := machineSet.DeepCopy()
209-
delete(msCopy.Annotations, taint.Key)
210-
211-
machineSet, err = dc.controlMachineClient.MachineSets(msCopy.Namespace).Update(ctx, msCopy, metav1.UpdateOptions{})
212-
213-
if err != nil {
214-
if time.Now().Before(retryDeadline) {
215-
klog.Warningf("Unable to update MachineSet object %s, Error: %+v", machineSet.Name, err)
216-
time.Sleep(conflictRetryInterval)
217-
continue
218-
} else {
219-
// Timeout occurred
220-
klog.Errorf("Timeout occurred: Unable to update MachineSet object %s, Error: %+v", machineSet.Name, err)
221-
return err
222-
}
223-
}
224-
225-
// Break out of loop when update succeeds
226-
break
227-
}
228-
klog.V(2).Infof("Removed taint %s from MachineSet object %q", taint.Key, machineSet.Name)
229-
230-
return nil
231-
}

pkg/controller/deployment_rollback_test.go

Lines changed: 0 additions & 193 deletions
This file was deleted.

pkg/controller/deployment_rolling.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,20 @@ func (dc *controller) rolloutRolling(ctx context.Context, d *v1alpha1.MachineDep
115115
return err
116116
}
117117
}
118+
// This check is needed because sometimes we can backtrack on a deployment, and this taint can stick
119+
// See https://github.com/gardener/machine-controller-manager/issues/989 for more details
120+
if _, ok := newIS.Annotations[PreferNoScheduleKey]; ok {
121+
taint := &v1.Taint{
122+
Key: PreferNoScheduleKey,
123+
Value: "True",
124+
Effect: "PreferNoSchedule",
125+
}
126+
127+
if err := dc.removeTaintNodesBackingMachineSet(ctx, newIS, taint); err != nil {
128+
klog.Errorf("Failed to remove taints %s from nodes. Error: %v", PreferNoScheduleKey, err)
129+
return err
130+
}
131+
}
118132
if err := dc.cleanupMachineDeployment(ctx, oldISs, d); err != nil {
119133
return err
120134
}

0 commit comments

Comments
 (0)