Commit 4f98ba1

Merge pull request #7647 from jbtk/logging
Improve logging on abandoning node scale down
2 parents cf115af + 133fc60 commit 4f98ba1

File tree

4 files changed, +37 -26 lines

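In summary, the patch threads a logAsWarning flag through the scale-down abort path and adds error-level convenience wrappers (AbortNodeDeletionDueToError, CleanUpAndRecordErrorForFailedScaleDownEvent), so that expected reasons for abandoning a node's scale-down (for example, an "empty" node that picked up new pods) are logged at warning severity, while unexpected internal failures stay at error severity. A minimal sketch of the pattern, assuming simplified signatures and using klog directly (not the actual autoscaler types):

package main

import (
	"flag"

	"k8s.io/klog/v2"
)

// abortNodeDeletion mirrors the shape of the new API: callers that hit an
// expected condition pass logAsWarning=true, everything else is an error.
// The signature is simplified for illustration; the real method also takes
// the node, node group id, drain flag and a NodeDeleteResult.
func abortNodeDeletion(nodeName, reason string, logAsWarning bool) {
	if logAsWarning {
		klog.Warningf("Scale-down: couldn't delete node %q: %s", nodeName, reason)
	} else {
		klog.Errorf("Scale-down: couldn't delete node %q: %s", nodeName, reason)
	}
}

// abortNodeDeletionDueToError is the error-level convenience wrapper,
// analogous to AbortNodeDeletionDueToError in this commit.
func abortNodeDeletionDueToError(nodeName, reason string) {
	abortNodeDeletion(nodeName, reason, false)
}

func main() {
	klog.InitFlags(nil)
	flag.Parse()
	defer klog.Flush()

	// Expected condition: demoted to a warning.
	abortNodeDeletion("node-1", "node is not empty", true)
	// Unexpected internal error: stays an error.
	abortNodeDeletionDueToError("node-2", "failed to create delete snapshot")
}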

cluster-autoscaler/core/scaledown/actuation/actuator.go

Lines changed: 5 additions & 10 deletions

@@ -289,21 +289,19 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
 
 	clusterSnapshot, err := a.createSnapshot(nodes)
 	if err != nil {
-		klog.Errorf("Scale-down: couldn't create delete snapshot, err: %v", err)
 		nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "createSnapshot returned error %v", err)}
 		for _, node := range nodes {
-			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to create delete snapshot", nodeDeleteResult)
+			a.nodeDeletionScheduler.AbortNodeDeletionDueToError(node, nodeGroup.Id(), drain, "failed to create delete snapshot", nodeDeleteResult)
 		}
 		return
 	}
 
 	if drain {
 		pdbs, err := a.ctx.PodDisruptionBudgetLister().List()
 		if err != nil {
-			klog.Errorf("Scale-down: couldn't fetch pod disruption budgets, err: %v", err)
 			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "podDisruptionBudgetLister.List returned error %v", err)}
 			for _, node := range nodes {
-				a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to fetch pod disruption budgets", nodeDeleteResult)
+				a.nodeDeletionScheduler.AbortNodeDeletionDueToError(node, nodeGroup.Id(), drain, "failed to fetch pod disruption budgets", nodeDeleteResult)
 			}
 			return
 		}
@@ -319,24 +317,21 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
 	for _, node := range nodes {
 		nodeInfo, err := clusterSnapshot.GetNodeInfo(node.Name)
 		if err != nil {
-			klog.Errorf("Scale-down: can't retrieve node %q from snapshot, err: %v", node.Name, err)
 			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "nodeInfos.Get for %q returned error: %v", node.Name, err)}
-			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to get node info", nodeDeleteResult)
+			a.nodeDeletionScheduler.AbortNodeDeletionDueToError(node, nodeGroup.Id(), drain, "failed to get node info", nodeDeleteResult)
 			continue
 		}
 
 		podsToRemove, _, _, err := simulator.GetPodsToMove(nodeInfo, a.deleteOptions, a.drainabilityRules, registry, remainingPdbTracker, time.Now())
 		if err != nil {
-			klog.Errorf("Scale-down: couldn't delete node %q, err: %v", node.Name, err)
 			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "GetPodsToMove for %q returned error: %v", node.Name, err)}
-			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to get pods to move on node", nodeDeleteResult)
+			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to get pods to move on node", nodeDeleteResult, true)
 			continue
 		}
 
 		if !drain && len(podsToRemove) != 0 {
-			klog.Errorf("Scale-down: couldn't delete empty node %q, new pods got scheduled", node.Name)
 			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "failed to delete empty node %q, new pods scheduled", node.Name)}
-			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "node is not empty", nodeDeleteResult)
+			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "node is not empty", nodeDeleteResult, true)
 			continue
 		}
 
cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go

Lines changed: 20 additions & 9 deletions

@@ -90,7 +90,7 @@ func (d *NodeDeletionBatcher) deleteNodesAndRegisterStatus(nodes []*apiv1.Node,
 	for _, node := range nodes {
 		if err != nil {
 			result := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: err}
-			CleanUpAndRecordFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result)
+			CleanUpAndRecordErrorForFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result)
 		} else {
 			RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.scaleStateNotifier, node, nodeGroup, drain, d.nodeDeletionTracker)
 		}
@@ -135,7 +135,7 @@ func (d *NodeDeletionBatcher) remove(nodeGroupId string) error {
 		drain := drainedNodeDeletions[node.Name]
 		if err != nil {
 			result = status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: err}
-			CleanUpAndRecordFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result)
+			CleanUpAndRecordErrorForFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result)
 		} else {
 			RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.scaleStateNotifier, node, nodeGroup, drain, d.nodeDeletionTracker)
 		}
@@ -185,16 +185,27 @@ func IsNodeBeingDeleted(node *apiv1.Node, timestamp time.Time) bool {
 	return deleteTime != nil && (timestamp.Sub(*deleteTime) < MaxCloudProviderNodeDeletionTime || timestamp.Sub(*deleteTime) < MaxKubernetesEmptyNodeDeletionTime)
 }
 
-// CleanUpAndRecordFailedScaleDownEvent record failed scale down event and log an error.
-func CleanUpAndRecordFailedScaleDownEvent(ctx *context.AutoscalingContext, node *apiv1.Node, nodeGroupId string, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, errMsg string, status status.NodeDeleteResult) {
-	if drain {
-		klog.Errorf("Scale-down: couldn't delete node %q with drain, %v, status error: %v", node.Name, errMsg, status.Err)
-		ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", "failed to drain and delete node: %v", status.Err)
+// CleanUpAndRecordErrorForFailedScaleDownEvent record failed scale down event and log an error.
+func CleanUpAndRecordErrorForFailedScaleDownEvent(ctx *context.AutoscalingContext, node *apiv1.Node, nodeGroupId string, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, errMsg string, status status.NodeDeleteResult) {
+	CleanUpAndRecordFailedScaleDownEvent(ctx, node, nodeGroupId, drain, nodeDeletionTracker, errMsg, status, false)
+}
 
+// CleanUpAndRecordFailedScaleDownEvent record failed scale down event and log a warning or an error.
+func CleanUpAndRecordFailedScaleDownEvent(ctx *context.AutoscalingContext, node *apiv1.Node, nodeGroupId string, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, errMsg string, status status.NodeDeleteResult, logAsWarning bool) {
+	var logMsgFormat, eventMsgFormat string
+	if drain {
+		logMsgFormat = "couldn't delete node %q with drain"
+		eventMsgFormat = "failed to drain and delete node"
+	} else {
+		logMsgFormat = "couldn't delete empty node %q"
+		eventMsgFormat = "failed to delete empty node"
+	}
+	if logAsWarning {
+		klog.Warningf("Scale-down: "+logMsgFormat+", %v, status error: %v", node.Name, errMsg, status.Err)
 	} else {
-		klog.Errorf("Scale-down: couldn't delete empty node, %v, status error: %v", errMsg, status.Err)
-		ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", "failed to delete empty node: %v", status.Err)
+		klog.Errorf("Scale-down: "+logMsgFormat+", %v, status error: %v", node.Name, errMsg, status.Err)
 	}
+	ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", eventMsgFormat+": %v", status.Err)
 	taints.CleanToBeDeleted(node, ctx.ClientSet, ctx.CordonNodeBeforeTerminate)
 	nodeDeletionTracker.EndDeletion(nodeGroupId, node.Name, status)
 }
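The refactored helper builds a single log format and event message from the drain flag and only switches the klog severity on logAsWarning; as a side effect, the empty-node message now also carries the node name, which the old error path left out. A small, self-contained sketch of that message composition, using fmt in place of klog and the event recorder (illustrative only):

package main

import (
	"errors"
	"fmt"
)

// buildMessages mirrors how the patched CleanUpAndRecordFailedScaleDownEvent
// derives its log and event formats from the drain flag.
func buildMessages(drain bool) (logMsgFormat, eventMsgFormat string) {
	if drain {
		return "couldn't delete node %q with drain", "failed to drain and delete node"
	}
	return "couldn't delete empty node %q", "failed to delete empty node"
}

func main() {
	statusErr := errors.New("simulated status error")

	// Abandoning an "empty" node that is no longer empty: drain=false, logAsWarning=true.
	logMsgFormat, eventMsgFormat := buildMessages(false)
	severity := "W" // klog.Warningf in the real code; klog.Errorf when logAsWarning is false
	fmt.Printf(severity+" Scale-down: "+logMsgFormat+", %v, status error: %v\n",
		"node-1", "node is not empty", statusErr)

	// The Kubernetes event is always recorded as a Warning-type ScaleDownFailed event.
	fmt.Printf("event ScaleDownFailed: "+eventMsgFormat+": %v\n", statusErr)
}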

cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go

Lines changed: 11 additions & 6 deletions

@@ -80,7 +80,7 @@ func (ds *GroupDeletionScheduler) ScheduleDeletion(nodeInfo *framework.NodeInfo,
 	opts, err := nodeGroup.GetOptions(ds.ctx.NodeGroupDefaults)
 	if err != nil && err != cloudprovider.ErrNotImplemented {
 		nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "GetOptions returned error %v", err)}
-		ds.AbortNodeDeletion(nodeInfo.Node(), nodeGroup.Id(), drain, "failed to get autoscaling options for a node group", nodeDeleteResult)
+		ds.AbortNodeDeletionDueToError(nodeInfo.Node(), nodeGroup.Id(), drain, "failed to get autoscaling options for a node group", nodeDeleteResult)
 		return
 	}
 	if opts == nil {
@@ -89,7 +89,7 @@ func (ds *GroupDeletionScheduler) ScheduleDeletion(nodeInfo *framework.NodeInfo,
 
 	nodeDeleteResult := ds.prepareNodeForDeletion(nodeInfo, drain)
 	if nodeDeleteResult.Err != nil {
-		ds.AbortNodeDeletion(nodeInfo.Node(), nodeGroup.Id(), drain, "prepareNodeForDeletion failed", nodeDeleteResult)
+		ds.AbortNodeDeletion(nodeInfo.Node(), nodeGroup.Id(), drain, "prepareNodeForDeletion failed", nodeDeleteResult, true)
 		return
 	}
 
@@ -122,7 +122,7 @@ func (ds *GroupDeletionScheduler) addToBatcher(nodeInfo *framework.NodeInfo, nod
 	if atomic {
 		if ds.failuresForGroup[nodeGroup.Id()] {
 			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: errors.NewAutoscalerError(errors.TransientError, "couldn't scale down other nodes in this node group")}
-			CleanUpAndRecordFailedScaleDownEvent(ds.ctx, nodeInfo.Node(), nodeGroup.Id(), drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
+			CleanUpAndRecordErrorForFailedScaleDownEvent(ds.ctx, nodeInfo.Node(), nodeGroup.Id(), drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
 			delete(ds.nodeQueue, nodeGroup.Id())
 		}
 		if len(ds.nodeQueue[nodeGroup.Id()]) < batchSize {
@@ -134,18 +134,23 @@ func (ds *GroupDeletionScheduler) addToBatcher(nodeInfo *framework.NodeInfo, nod
 	ds.nodeQueue[nodeGroup.Id()] = []*apiv1.Node{}
 }
 
+// AbortNodeDeletionDueToError frees up a node that couldn't be deleted successfully. If it was a part of a group, the same is applied for other nodes queued for deletion.
+func (ds *GroupDeletionScheduler) AbortNodeDeletionDueToError(node *apiv1.Node, nodeGroupId string, drain bool, errMsg string, result status.NodeDeleteResult) {
+	ds.AbortNodeDeletion(node, nodeGroupId, drain, errMsg, result, false)
+}
+
 // AbortNodeDeletion frees up a node that couldn't be deleted successfully. If it was a part of a group, the same is applied for other nodes queued for deletion.
-func (ds *GroupDeletionScheduler) AbortNodeDeletion(node *apiv1.Node, nodeGroupId string, drain bool, errMsg string, result status.NodeDeleteResult) {
+func (ds *GroupDeletionScheduler) AbortNodeDeletion(node *apiv1.Node, nodeGroupId string, drain bool, errMsg string, result status.NodeDeleteResult, logAsWarning bool) {
 	ds.Lock()
 	defer ds.Unlock()
 	ds.failuresForGroup[nodeGroupId] = true
-	CleanUpAndRecordFailedScaleDownEvent(ds.ctx, node, nodeGroupId, drain, ds.nodeDeletionTracker, errMsg, result)
+	CleanUpAndRecordFailedScaleDownEvent(ds.ctx, node, nodeGroupId, drain, ds.nodeDeletionTracker, errMsg, result, logAsWarning)
 	for _, otherNode := range ds.nodeQueue[nodeGroupId] {
 		if otherNode == node {
 			continue
 		}
 		nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: errors.NewAutoscalerError(errors.TransientError, "couldn't scale down other nodes in this node group")}
-		CleanUpAndRecordFailedScaleDownEvent(ds.ctx, otherNode, nodeGroupId, drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
+		CleanUpAndRecordFailedScaleDownEvent(ds.ctx, otherNode, nodeGroupId, drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult, logAsWarning)
 	}
 	delete(ds.nodeQueue, nodeGroupId)
 }
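Note that AbortNodeDeletion forwards logAsWarning to the cleanup of every other node still queued for the same node group, so one expected failure demotes the whole group's abandonment messages to the same severity. A hypothetical, simplified sketch of that propagation (plain fmt logging, not the real GroupDeletionScheduler):

package main

import "fmt"

// abortGroup is a simplified stand-in for GroupDeletionScheduler.AbortNodeDeletion:
// the severity chosen for the failed node is reused when cleaning up every other
// node that was still queued for deletion in the same node group.
func abortGroup(failedNode string, queued []string, reason string, logAsWarning bool) {
	logf := func(format string, args ...interface{}) {
		prefix := "E" // stands in for klog.Errorf
		if logAsWarning {
			prefix = "W" // stands in for klog.Warningf
		}
		fmt.Printf(prefix+" "+format+"\n", args...)
	}
	logf("Scale-down: couldn't delete node %q, %v", failedNode, reason)
	for _, other := range queued {
		if other == failedNode {
			continue
		}
		logf("Scale-down: couldn't delete node %q, %v", other, "scale down failed for node group as a whole")
	}
}

func main() {
	// One expected failure ("node is not empty") demotes the whole group's messages to warnings.
	abortGroup("node-1", []string{"node-1", "node-2", "node-3"}, "node is not empty", true)
}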

cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go

Lines changed: 1 addition & 1 deletion

@@ -173,7 +173,7 @@ func TestScheduleDeletion(t *testing.T) {
 			for _, bucket := range ti.toAbort {
 				for _, node := range bucket.Nodes {
 					nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}
-					scheduler.AbortNodeDeletion(node, bucket.Group.Id(), false, "simulated abort", nodeDeleteResult)
+					scheduler.AbortNodeDeletionDueToError(node, bucket.Group.Id(), false, "simulated abort", nodeDeleteResult)
 				}
 			}
 			if err := scheduleAll(ti.toScheduleAfterAbort, scheduler); err != nil {
