Skip to content

Commit d9d2e0f

Browse files
authored
Merge pull request #8387 from norbertcyran/force-delete-failed-nodes
Force delete nodes with errors
2 parents 9b8558e + 5618f9a commit d9d2e0f

File tree

4 files changed

+373
-301
lines changed

4 files changed

+373
-301
lines changed

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,8 @@ type AutoscalingOptions struct {
307307
CheckCapacityProvisioningRequestBatchTimebox time.Duration
308308
// ForceDeleteLongUnregisteredNodes controls whether min size constraints are ignored during removal of long unregistered nodes.
309309
ForceDeleteLongUnregisteredNodes bool
310+
// ForceDeleteFailedNodes controls whether min size constraints are ignored during removal of failed nodes.
311+
ForceDeleteFailedNodes bool
310312
// DynamicResourceAllocationEnabled configures whether logic for handling DRA objects is enabled.
311313
DynamicResourceAllocationEnabled bool
312314
// ClusterSnapshotParallelism is the maximum parallelism of cluster snapshot creation.

cluster-autoscaler/config/flags/flags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ var (
223223
checkCapacityProvisioningRequestMaxBatchSize = flag.Int("check-capacity-provisioning-request-max-batch-size", 10, "Maximum number of provisioning requests to process in a single batch.")
224224
checkCapacityProvisioningRequestBatchTimebox = flag.Duration("check-capacity-provisioning-request-batch-timebox", 10*time.Second, "Maximum time to process a batch of provisioning requests.")
225225
forceDeleteLongUnregisteredNodes = flag.Bool("force-delete-unregistered-nodes", false, "Whether to enable force deletion of long unregistered nodes, regardless of the min size of the node group the belong to.")
226+
forceDeleteFailedNodes = flag.Bool("force-delete-failed-nodes", false, "Whether to enable force deletion of failed nodes, regardless of the min size of the node group the belong to.")
226227
enableDynamicResourceAllocation = flag.Bool("enable-dynamic-resource-allocation", false, "Whether logic for handling DRA (Dynamic Resource Allocation) objects is enabled.")
227228
clusterSnapshotParallelism = flag.Int("cluster-snapshot-parallelism", 16, "Maximum parallelism of cluster snapshot creation.")
228229
checkCapacityProcessorInstance = flag.String("check-capacity-processor-instance", "", "Name of the processor instance. Only ProvisioningRequests that define this name in their parameters with the key \"processorInstance\" will be processed by this CA instance. It only refers to check capacity ProvisioningRequests, but if not empty, best-effort atomic ProvisioningRequests processing is disabled in this instance. Not recommended: Until CA 1.35, ProvisioningRequests with this name as prefix in their class will be also processed.")
@@ -391,6 +392,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
391392
CheckCapacityProvisioningRequestMaxBatchSize: *checkCapacityProvisioningRequestMaxBatchSize,
392393
CheckCapacityProvisioningRequestBatchTimebox: *checkCapacityProvisioningRequestBatchTimebox,
393394
ForceDeleteLongUnregisteredNodes: *forceDeleteLongUnregisteredNodes,
395+
ForceDeleteFailedNodes: *forceDeleteFailedNodes,
394396
DynamicResourceAllocationEnabled: *enableDynamicResourceAllocation,
395397
ClusterSnapshotParallelism: *clusterSnapshotParallelism,
396398
CheckCapacityProcessorInstance: *checkCapacityProcessorInstance,

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -883,8 +883,13 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
883883
nodeGroup := nodeGroups[nodeGroupId]
884884
if nodeGroup == nil {
885885
err = fmt.Errorf("node group %s not found", nodeGroupId)
886-
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil {
887-
if len(nodesToDelete) > 0 {
886+
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil && len(nodesToDelete) > 0 {
887+
if a.ForceDeleteFailedNodes {
888+
err = nodeGroup.ForceDeleteNodes(nodesToDelete)
889+
if errors.Is(err, cloudprovider.ErrNotImplemented) {
890+
err = nodeGroup.DeleteNodes(nodesToDelete)
891+
}
892+
} else {
888893
err = nodeGroup.DeleteNodes(nodesToDelete)
889894
}
890895
}

0 commit comments

Comments
 (0)