You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
- Remove double logging of scale-down errors: AbortNodeDeletion
already logs this information.
- Change abort node deletion to log a warning instead of an error when
the scheduler manages to place a pod on a node that is about to be
scaled down. This is a known race condition that should not result in
logged errors, as they are misleading to on-callers (example:
http://b/385203969#comment12).
nodeDeleteResult:= status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: errors.NewAutoscalerError(errors.TransientError, "couldn't scale down other nodes in this node group")}
125
-
CleanUpAndRecordFailedScaleDownEvent(ds.ctx, nodeInfo.Node(), nodeGroup.Id(), drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
125
+
CleanUpAndRecordErrorForFailedScaleDownEvent(ds.ctx, nodeInfo.Node(), nodeGroup.Id(), drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
// AbortNodeDeletionDueToError frees up a node that couldn't be deleted successfully. If it was a part of a group, the same is applied for other nodes queued for deletion.
138
+
func (ds*GroupDeletionScheduler) AbortNodeDeletionDueToError(node*apiv1.Node, nodeGroupIdstring, drainbool, errMsgstring, result status.NodeDeleteResult) {
// AbortNodeDeletion frees up a node that couldn't be deleted successfully. If it was a part of a group, the same is applied for other nodes queued for deletion.
138
-
func (ds*GroupDeletionScheduler) AbortNodeDeletion(node*apiv1.Node, nodeGroupIdstring, drainbool, errMsgstring, result status.NodeDeleteResult) {
143
+
func (ds*GroupDeletionScheduler) AbortNodeDeletion(node*apiv1.Node, nodeGroupIdstring, drainbool, errMsgstring, result status.NodeDeleteResult, logAsWarningbool) {
nodeDeleteResult:= status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: errors.NewAutoscalerError(errors.TransientError, "couldn't scale down other nodes in this node group")}
148
-
CleanUpAndRecordFailedScaleDownEvent(ds.ctx, otherNode, nodeGroupId, drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
153
+
CleanUpAndRecordFailedScaleDownEvent(ds.ctx, otherNode, nodeGroupId, drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult, logAsWarning)
0 commit comments