Skip to content

Commit c599dc6

Browse files
authored
Merge pull request #7874 from norbertcyran/threadsafe-add-taints
Make AddTaints and CleanTaints thread safe
2 parents e893521 + 83a2b64 commit c599dc6

File tree

6 files changed

+71
-68
lines changed

6 files changed

+71
-68
lines changed

cluster-autoscaler/core/scaledown/actuation/actuator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ func (a *Actuator) scaleDownNodeToReport(node *apiv1.Node, drain bool) (*status.
391391

392392
// taintNode taints the node with NoSchedule to prevent new pods scheduling on it.
393393
func (a *Actuator) taintNode(node *apiv1.Node) error {
394-
if err := taints.MarkToBeDeleted(node, a.ctx.ClientSet, a.ctx.CordonNodeBeforeTerminate); err != nil {
394+
if _, err := taints.MarkToBeDeleted(node, a.ctx.ClientSet, a.ctx.CordonNodeBeforeTerminate); err != nil {
395395
a.ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", "failed to mark the node as toBeDeleted/unschedulable: %v", err)
396396
return errors.ToAutoscalerError(errors.ApiCallError, err)
397397
}

cluster-autoscaler/core/scaledown/actuation/actuator_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,9 +1302,7 @@ func runStartDeletionTest(t *testing.T, tc startDeletionTestCase, force bool) {
13021302
// Verify ScaleDownNodes looks as expected.
13031303
ignoreSdNodeOrder := cmpopts.SortSlices(func(a, b *status.ScaleDownNode) bool { return a.Node.Name < b.Node.Name })
13041304
cmpNg := cmp.Comparer(func(a, b *testprovider.TestNodeGroup) bool { return a.Id() == b.Id() })
1305-
// Nodes will have deletion taints, skipping them here since we check them later
1306-
ignoreTaints := cmpopts.IgnoreFields(apiv1.NodeSpec{}, "Taints")
1307-
statusCmpOpts := cmp.Options{ignoreSdNodeOrder, cmpNg, cmpopts.EquateEmpty(), ignoreTaints}
1305+
statusCmpOpts := cmp.Options{ignoreSdNodeOrder, cmpNg, cmpopts.EquateEmpty()}
13081306
if diff := cmp.Diff(wantScaleDownNodes, gotScaleDownNodes, statusCmpOpts); diff != "" {
13091307
t.Errorf("StartDeletion scaled down nodes diff (-want +got):\n%s", diff)
13101308
}

cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ func CleanUpAndRecordFailedScaleDownEvent(ctx *context.AutoscalingContext, node
206206
klog.Errorf("Scale-down: "+logMsgFormat+", %v, status error: %v", node.Name, errMsg, status.Err)
207207
}
208208
ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", eventMsgFormat+": %v", status.Err)
209-
taints.CleanToBeDeleted(node.DeepCopy(), ctx.ClientSet, ctx.CordonNodeBeforeTerminate)
209+
taints.CleanToBeDeleted(node, ctx.ClientSet, ctx.CordonNodeBeforeTerminate)
210210
nodeDeletionTracker.EndDeletion(nodeGroupId, node.Name, status)
211211
}
212212

cluster-autoscaler/core/scaledown/actuation/softtaint.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func UpdateSoftDeletionTaints(context *context.AutoscalingContext, uneededNodes,
6060
continue
6161
}
6262
b.processWithinBudget(func() {
63-
err := taints.MarkDeletionCandidate(node, context.ClientSet)
63+
_, err := taints.MarkDeletionCandidate(node, context.ClientSet)
6464
if err != nil {
6565
errors = append(errors, err)
6666
klog.Warningf("Soft taint on %s adding error %v", node.Name, err)

cluster-autoscaler/utils/taints/taints.go

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package taints
1919
import (
2020
"context"
2121
"fmt"
22+
"slices"
2223
"strconv"
2324
"strings"
2425
"time"
@@ -160,7 +161,7 @@ func taintKeys(taints []apiv1.Taint) []string {
160161
}
161162

162163
// MarkToBeDeleted sets a taint that makes the node unschedulable.
163-
func MarkToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode bool) error {
164+
func MarkToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode bool) (*apiv1.Node, error) {
164165
taint := apiv1.Taint{
165166
Key: ToBeDeletedTaint,
166167
Value: fmt.Sprint(time.Now().Unix()),
@@ -170,7 +171,7 @@ func MarkToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode
170171
}
171172

172173
// MarkDeletionCandidate sets a soft taint that makes the node preferably unschedulable.
173-
func MarkDeletionCandidate(node *apiv1.Node, client kube_client.Interface) error {
174+
func MarkDeletionCandidate(node *apiv1.Node, client kube_client.Interface) (*apiv1.Node, error) {
174175
taint := apiv1.Taint{
175176
Key: DeletionCandidateTaint,
176177
Value: fmt.Sprint(time.Now().Unix()),
@@ -179,8 +180,8 @@ func MarkDeletionCandidate(node *apiv1.Node, client kube_client.Interface) error
179180
return AddTaints(node, client, []apiv1.Taint{taint}, false)
180181
}
181182

182-
// AddTaints sets the specified taints on the node.
183-
func AddTaints(node *apiv1.Node, client kube_client.Interface, taints []apiv1.Taint, cordonNode bool) error {
183+
// AddTaints sets the specified taints on the node and returns an updated copy of the node.
184+
func AddTaints(node *apiv1.Node, client kube_client.Interface, taints []apiv1.Taint, cordonNode bool) (*apiv1.Node, error) {
184185
retryDeadline := time.Now().Add(maxRetryDeadline)
185186
freshNode := node.DeepCopy()
186187
var err error
@@ -191,7 +192,7 @@ func AddTaints(node *apiv1.Node, client kube_client.Interface, taints []apiv1.Ta
191192
freshNode, err = client.CoreV1().Nodes().Get(context.TODO(), node.Name, metav1.GetOptions{})
192193
if err != nil || freshNode == nil {
193194
klog.Warningf("Error while adding %v taints on node %v: %v", strings.Join(taintKeys(taints), ","), node.Name, err)
194-
return fmt.Errorf("failed to get node %v: %v", node.Name, err)
195+
return nil, fmt.Errorf("failed to get node %v: %v", node.Name, err)
195196
}
196197
}
197198

@@ -201,7 +202,7 @@ func AddTaints(node *apiv1.Node, client kube_client.Interface, taints []apiv1.Ta
201202
refresh = true
202203
continue
203204
}
204-
return nil
205+
return freshNode, nil
205206
}
206207
_, err = client.CoreV1().Nodes().Update(context.TODO(), freshNode, metav1.UpdateOptions{})
207208
if err != nil && errors.IsConflict(err) && time.Now().Before(retryDeadline) {
@@ -212,11 +213,10 @@ func AddTaints(node *apiv1.Node, client kube_client.Interface, taints []apiv1.Ta
212213

213214
if err != nil {
214215
klog.Warningf("Error while adding %v taints on node %v: %v", strings.Join(taintKeys(taints), ","), node.Name, err)
215-
return err
216+
return nil, err
216217
}
217218
klog.V(1).Infof("Successfully added %v on node %v", strings.Join(taintKeys(taints), ","), node.Name)
218-
node.Spec.Taints = append([]apiv1.Taint{}, freshNode.Spec.Taints...)
219-
return nil
219+
return freshNode, nil
220220
}
221221
}
222222

@@ -286,17 +286,17 @@ func GetTaintTime(node *apiv1.Node, taintKey string) (*time.Time, error) {
286286
}
287287

288288
// CleanToBeDeleted cleans CA's NoSchedule taint from a node.
289-
func CleanToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode bool) (bool, error) {
289+
func CleanToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode bool) (*apiv1.Node, error) {
290290
return CleanTaints(node, client, []string{ToBeDeletedTaint}, cordonNode)
291291
}
292292

293293
// CleanDeletionCandidate cleans CA's soft NoSchedule taint from a node.
294-
func CleanDeletionCandidate(node *apiv1.Node, client kube_client.Interface) (bool, error) {
294+
func CleanDeletionCandidate(node *apiv1.Node, client kube_client.Interface) (*apiv1.Node, error) {
295295
return CleanTaints(node, client, []string{DeletionCandidateTaint}, false)
296296
}
297297

298-
// CleanTaints cleans the specified taints from a node.
299-
func CleanTaints(node *apiv1.Node, client kube_client.Interface, taintKeys []string, cordonNode bool) (bool, error) {
298+
// CleanTaints cleans the specified taints from a node and returns an updated copy of the node.
299+
func CleanTaints(node *apiv1.Node, client kube_client.Interface, taintKeys []string, cordonNode bool) (*apiv1.Node, error) {
300300
retryDeadline := time.Now().Add(maxRetryDeadline)
301301
freshNode := node.DeepCopy()
302302
var err error
@@ -307,7 +307,7 @@ func CleanTaints(node *apiv1.Node, client kube_client.Interface, taintKeys []str
307307
freshNode, err = client.CoreV1().Nodes().Get(context.TODO(), node.Name, metav1.GetOptions{})
308308
if err != nil || freshNode == nil {
309309
klog.Warningf("Error while removing %v taints from node %v: %v", strings.Join(taintKeys, ","), node.Name, err)
310-
return false, fmt.Errorf("failed to get node %v: %v", node.Name, err)
310+
return nil, fmt.Errorf("failed to get node %v: %v", node.Name, err)
311311
}
312312
}
313313
newTaints := make([]apiv1.Taint, 0)
@@ -330,7 +330,7 @@ func CleanTaints(node *apiv1.Node, client kube_client.Interface, taintKeys []str
330330
refresh = true
331331
continue
332332
}
333-
return false, nil
333+
return freshNode, nil
334334
}
335335

336336
freshNode.Spec.Taints = newTaints
@@ -348,11 +348,10 @@ func CleanTaints(node *apiv1.Node, client kube_client.Interface, taintKeys []str
348348

349349
if err != nil {
350350
klog.Warningf("Error while releasing %v taints on node %v: %v", strings.Join(taintKeys, ","), node.Name, err)
351-
return false, err
351+
return nil, err
352352
}
353353
klog.V(1).Infof("Successfully released %v on node %v", strings.Join(taintKeys, ","), node.Name)
354-
node.Spec.Taints = append([]apiv1.Taint{}, freshNode.Spec.Taints...)
355-
return true, nil
354+
return freshNode, nil
356355
}
357356
}
358357

@@ -376,11 +375,11 @@ func CleanAllTaints(nodes []*apiv1.Node, client kube_client.Interface, recorder
376375
if !taintsPresent {
377376
continue
378377
}
379-
cleaned, err := CleanTaints(node, client, taintKeys, cordonNode)
378+
updatedNode, err := CleanTaints(node, client, taintKeys, cordonNode)
380379
if err != nil {
381380
recorder.Eventf(node, apiv1.EventTypeWarning, "ClusterAutoscalerCleanup",
382381
"failed to clean %v on node %v: %v", strings.Join(taintKeys, ","), node.Name, err)
383-
} else if cleaned {
382+
} else if node != nil && updatedNode != nil && !slices.Equal(updatedNode.Spec.Taints, node.Spec.Taints) {
384383
recorder.Eventf(node, apiv1.EventTypeNormal, "ClusterAutoscalerCleanup",
385384
"removed %v taints from node %v", strings.Join(taintKeys, ","), node.Name)
386385
}

cluster-autoscaler/utils/taints/taints_test.go

Lines changed: 48 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package taints
1919
import (
2020
"context"
2121
"fmt"
22+
"slices"
2223
"strconv"
2324
"sync/atomic"
2425
"testing"
@@ -44,7 +45,7 @@ func TestMarkNodes(t *testing.T) {
4445
defer setConflictRetryInterval(setConflictRetryInterval(time.Millisecond))
4546
node := BuildTestNode("node", 1000, 1000)
4647
fakeClient := buildFakeClientWithConflicts(t, node)
47-
err := MarkToBeDeleted(node, fakeClient, false)
48+
_, err := MarkToBeDeleted(node, fakeClient, false)
4849
assert.NoError(t, err)
4950

5051
updatedNode := getNode(t, fakeClient, "node")
@@ -56,7 +57,7 @@ func TestSoftMarkNodes(t *testing.T) {
5657
defer setConflictRetryInterval(setConflictRetryInterval(time.Millisecond))
5758
node := BuildTestNode("node", 1000, 1000)
5859
fakeClient := buildFakeClientWithConflicts(t, node)
59-
err := MarkDeletionCandidate(node, fakeClient)
60+
_, err := MarkDeletionCandidate(node, fakeClient)
6061
assert.NoError(t, err)
6162

6263
updatedNode := getNode(t, fakeClient, "node")
@@ -115,7 +116,7 @@ func TestQueryNodes(t *testing.T) {
115116
defer setConflictRetryInterval(setConflictRetryInterval(time.Millisecond))
116117
node := BuildTestNode("node", 1000, 1000)
117118
fakeClient := buildFakeClientWithConflicts(t, node)
118-
err := MarkToBeDeleted(node, fakeClient, false)
119+
_, err := MarkToBeDeleted(node, fakeClient, false)
119120
assert.NoError(t, err)
120121

121122
updatedNode := getNode(t, fakeClient, "node")
@@ -131,7 +132,7 @@ func TestSoftQueryNodes(t *testing.T) {
131132
defer setConflictRetryInterval(setConflictRetryInterval(time.Millisecond))
132133
node := BuildTestNode("node", 1000, 1000)
133134
fakeClient := buildFakeClientWithConflicts(t, node)
134-
err := MarkDeletionCandidate(node, fakeClient)
135+
_, err := MarkDeletionCandidate(node, fakeClient)
135136
assert.NoError(t, err)
136137

137138
updatedNode := getNode(t, fakeClient, "node")
@@ -161,20 +162,21 @@ func TestCleanNodes(t *testing.T) {
161162
addTaintsToSpec(node, taints, false)
162163
fakeClient := buildFakeClientWithConflicts(t, node)
163164

164-
updatedNode := getNode(t, fakeClient, "node")
165-
assert.True(t, HasToBeDeletedTaint(updatedNode))
166-
assert.True(t, HasTaint(updatedNode, "other-taint"))
167-
assert.False(t, updatedNode.Spec.Unschedulable)
165+
apiNode := getNode(t, fakeClient, "node")
166+
assert.True(t, HasToBeDeletedTaint(apiNode))
167+
assert.True(t, HasTaint(apiNode, "other-taint"))
168+
assert.False(t, apiNode.Spec.Unschedulable)
168169

169-
cleaned, err := CleanToBeDeleted(node, fakeClient, false)
170+
updatedNode, err := CleanToBeDeleted(node, fakeClient, false)
171+
cleaned := !slices.Equal(updatedNode.Spec.Taints, node.Spec.Taints)
170172
assert.True(t, cleaned)
171173
assert.NoError(t, err)
172174

173-
updatedNode = getNode(t, fakeClient, "node")
175+
apiNode = getNode(t, fakeClient, "node")
174176
assert.NoError(t, err)
175-
assert.False(t, HasToBeDeletedTaint(updatedNode))
176-
assert.True(t, HasTaint(updatedNode, "other-taint"))
177-
assert.False(t, updatedNode.Spec.Unschedulable)
177+
assert.False(t, HasToBeDeletedTaint(apiNode))
178+
assert.True(t, HasTaint(apiNode, "other-taint"))
179+
assert.False(t, apiNode.Spec.Unschedulable)
178180
}
179181

180182
func TestCleanNodesWithCordon(t *testing.T) {
@@ -195,20 +197,21 @@ func TestCleanNodesWithCordon(t *testing.T) {
195197
addTaintsToSpec(node, taints, true)
196198
fakeClient := buildFakeClientWithConflicts(t, node)
197199

198-
updatedNode := getNode(t, fakeClient, "node")
199-
assert.True(t, HasToBeDeletedTaint(updatedNode))
200-
assert.True(t, HasTaint(updatedNode, "other-taint"))
201-
assert.True(t, updatedNode.Spec.Unschedulable)
200+
apiNode := getNode(t, fakeClient, "node")
201+
assert.True(t, HasToBeDeletedTaint(apiNode))
202+
assert.True(t, HasTaint(apiNode, "other-taint"))
203+
assert.True(t, apiNode.Spec.Unschedulable)
202204

203-
cleaned, err := CleanToBeDeleted(node, fakeClient, true)
205+
updatedNode, err := CleanToBeDeleted(node, fakeClient, true)
206+
cleaned := !slices.Equal(updatedNode.Spec.Taints, node.Spec.Taints)
204207
assert.True(t, cleaned)
205208
assert.NoError(t, err)
206209

207-
updatedNode = getNode(t, fakeClient, "node")
210+
apiNode = getNode(t, fakeClient, "node")
208211
assert.NoError(t, err)
209-
assert.False(t, HasToBeDeletedTaint(updatedNode))
210-
assert.True(t, HasTaint(updatedNode, "other-taint"))
211-
assert.False(t, updatedNode.Spec.Unschedulable)
212+
assert.False(t, HasToBeDeletedTaint(apiNode))
213+
assert.True(t, HasTaint(apiNode, "other-taint"))
214+
assert.False(t, apiNode.Spec.Unschedulable)
212215
}
213216

214217
func TestCleanNodesWithCordonOnOff(t *testing.T) {
@@ -229,20 +232,21 @@ func TestCleanNodesWithCordonOnOff(t *testing.T) {
229232
addTaintsToSpec(node, taints, true)
230233
fakeClient := buildFakeClientWithConflicts(t, node)
231234

232-
updatedNode := getNode(t, fakeClient, "node")
233-
assert.True(t, HasToBeDeletedTaint(updatedNode))
234-
assert.True(t, HasTaint(updatedNode, "other-taint"))
235-
assert.True(t, updatedNode.Spec.Unschedulable)
235+
apiNode := getNode(t, fakeClient, "node")
236+
assert.True(t, HasToBeDeletedTaint(apiNode))
237+
assert.True(t, HasTaint(apiNode, "other-taint"))
238+
assert.True(t, apiNode.Spec.Unschedulable)
236239

237-
cleaned, err := CleanToBeDeleted(node, fakeClient, false)
240+
updatedNode, err := CleanToBeDeleted(node, fakeClient, false)
241+
cleaned := !slices.Equal(updatedNode.Spec.Taints, node.Spec.Taints)
238242
assert.True(t, cleaned)
239243
assert.NoError(t, err)
240244

241-
updatedNode = getNode(t, fakeClient, "node")
245+
apiNode = getNode(t, fakeClient, "node")
242246
assert.NoError(t, err)
243-
assert.False(t, HasToBeDeletedTaint(updatedNode))
244-
assert.True(t, HasTaint(updatedNode, "other-taint"))
245-
assert.True(t, updatedNode.Spec.Unschedulable)
247+
assert.False(t, HasToBeDeletedTaint(apiNode))
248+
assert.True(t, HasTaint(apiNode, "other-taint"))
249+
assert.True(t, apiNode.Spec.Unschedulable)
246250
}
247251

248252
func TestSoftCleanNodes(t *testing.T) {
@@ -263,18 +267,19 @@ func TestSoftCleanNodes(t *testing.T) {
263267
addTaintsToSpec(node, taints, false)
264268
fakeClient := buildFakeClientWithConflicts(t, node)
265269

266-
updatedNode := getNode(t, fakeClient, "node")
267-
assert.True(t, HasDeletionCandidateTaint(updatedNode))
268-
assert.True(t, HasTaint(updatedNode, "other-taint"))
270+
apiNode := getNode(t, fakeClient, "node")
271+
assert.True(t, HasDeletionCandidateTaint(apiNode))
272+
assert.True(t, HasTaint(apiNode, "other-taint"))
269273

270-
cleaned, err := CleanDeletionCandidate(node, fakeClient)
274+
updatedNode, err := CleanDeletionCandidate(node, fakeClient)
275+
cleaned := !slices.Equal(updatedNode.Spec.Taints, node.Spec.Taints)
271276
assert.True(t, cleaned)
272277
assert.NoError(t, err)
273278

274-
updatedNode = getNode(t, fakeClient, "node")
279+
apiNode = getNode(t, fakeClient, "node")
275280
assert.NoError(t, err)
276-
assert.False(t, HasDeletionCandidateTaint(updatedNode))
277-
assert.True(t, HasTaint(updatedNode, "other-taint"))
281+
assert.False(t, HasDeletionCandidateTaint(apiNode))
282+
assert.True(t, HasTaint(apiNode, "other-taint"))
278283
}
279284

280285
func TestCleanAllToBeDeleted(t *testing.T) {
@@ -768,11 +773,11 @@ func TestAddTaints(t *testing.T) {
768773
Effect: apiv1.TaintEffectNoSchedule,
769774
}
770775
}
771-
err := AddTaints(n, fakeClient, newTaints, false)
776+
updatedNode, err := AddTaints(n, fakeClient, newTaints, false)
772777
assert.NoError(t, err)
773778
apiNode := getNode(t, fakeClient, "node")
774779
for _, want := range tc.wantTaints {
775-
assert.True(t, HasTaint(n, want))
780+
assert.True(t, HasTaint(updatedNode, want))
776781
assert.True(t, HasTaint(apiNode, want))
777782
}
778783
})
@@ -829,7 +834,8 @@ func TestCleanTaints(t *testing.T) {
829834
n.Spec.Taints = append([]apiv1.Taint{}, existingTaints...)
830835
fakeClient := buildFakeClient(t, n)
831836

832-
modified, err := CleanTaints(n, fakeClient, tc.taintsToRemove, false)
837+
updatedNode, err := CleanTaints(n, fakeClient, tc.taintsToRemove, false)
838+
modified := !slices.Equal(updatedNode.Spec.Taints, n.Spec.Taints)
833839
assert.NoError(t, err)
834840
assert.Equal(t, tc.wantModified, modified)
835841

@@ -842,7 +848,7 @@ func TestCleanTaints(t *testing.T) {
842848

843849
for _, removed := range tc.taintsToRemove {
844850
assert.False(t, HasTaint(apiNode, removed))
845-
assert.False(t, HasTaint(n, removed), "Taint %s should have been removed from local node object", removed)
851+
assert.False(t, HasTaint(updatedNode, removed), "Taint %s should have been removed from local node object", removed)
846852
}
847853
})
848854
}

0 commit comments

Comments
 (0)