diff --git a/cluster-autoscaler/cloudprovider/oci/common/consts.go b/cluster-autoscaler/cloudprovider/oci/common/consts.go new file mode 100644 index 000000000000..ee2e9dfb21c1 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/oci/common/consts.go @@ -0,0 +1,8 @@ +package common + +const ( + // InstanceStateUnfulfilled is a status indicating that the pool was unable to fulfill the operation + InstanceStateUnfulfilled = "Unfulfilled" + // InstanceIDUnfulfilled is the generic placeholder name for upcoming instances + InstanceIDUnfulfilled = "instance_placeholder" +) diff --git a/cluster-autoscaler/cloudprovider/oci/common/oci_ref.go b/cluster-autoscaler/cloudprovider/oci/common/oci_ref.go index df5f3ec488e8..9a91bf463ccd 100644 --- a/cluster-autoscaler/cloudprovider/oci/common/oci_ref.go +++ b/cluster-autoscaler/cloudprovider/oci/common/oci_ref.go @@ -5,9 +5,11 @@ Copyright 2021-2023 Oracle and/or its affiliates. package common import ( + "strings" + apiv1 "k8s.io/api/core/v1" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/instancepools/consts" - "strings" ) // OciRef contains s reference to some entity in OCI world. @@ -87,9 +89,9 @@ func getNodeExternalAddress(node *apiv1.Node) string { func getNodeInstancePoolID(node *apiv1.Node) string { // Handle unfilled instance placeholder (instances that have yet to be created) - if strings.Contains(node.Name, consts.InstanceIDUnfulfilled) { + if strings.Contains(node.Name, InstanceIDUnfulfilled) { instIndex := strings.LastIndex(node.Name, "-") - return strings.Replace(node.Name[:instIndex], consts.InstanceIDUnfulfilled, "", 1) + return strings.Replace(node.Name[:instIndex], InstanceIDUnfulfilled, "", 1) } poolIDPrefixLabel, _ := node.Labels[consts.InstancePoolIDLabelPrefix] @@ -111,8 +113,14 @@ func getNodeInstanceID(node *apiv1.Node) string { } // Handle unfilled instance placeholder (instances that have yet to be created) - if strings.Contains(node.Name, consts.InstanceIDUnfulfilled) { + if strings.Contains(node.Name, InstanceIDUnfulfilled) { return node.Name + } else if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeUnregistered { + // Placeholder node created by CA for a node that has not registered itself yet. + return InstanceIDUnfulfilled + } else if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeCreateError { + // Placeholder node created by CA for a node cannot be created because of an error. + return InstanceIDUnfulfilled } instancePrefixLabel, _ := node.Labels[consts.InstanceIDLabelPrefix] diff --git a/cluster-autoscaler/cloudprovider/oci/instancepools/consts/annotations.go b/cluster-autoscaler/cloudprovider/oci/instancepools/consts/annotations.go index 5851e0ed0153..ebdc8212329e 100644 --- a/cluster-autoscaler/cloudprovider/oci/instancepools/consts/annotations.go +++ b/cluster-autoscaler/cloudprovider/oci/instancepools/consts/annotations.go @@ -5,8 +5,9 @@ Copyright 2021-2023 Oracle and/or its affiliates. package consts import ( - v1 "k8s.io/api/core/v1" "time" + + v1 "k8s.io/api/core/v1" ) const ( @@ -46,10 +47,6 @@ const ( OciInstancePoolResourceIdent = "instancepool" // OciInstancePoolLaunchOp is an instance pools operation type OciInstancePoolLaunchOp = "LaunchInstancesInPool" - // InstanceStateUnfulfilled is a status indicating that the instance pool was unable to fulfill the operation - InstanceStateUnfulfilled = "Unfulfilled" - // InstanceIDUnfulfilled is the generic placeholder name for upcoming instances - InstanceIDUnfulfilled = "instance_placeholder" // OciInstancePoolIDNonPoolMember indicates a kubernetes node doesn't belong to any OCI Instance Pool. OciInstancePoolIDNonPoolMember = "non_pool_member" diff --git a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_cache.go b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_cache.go index 72bf533879dc..30f382416107 100644 --- a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_cache.go +++ b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_cache.go @@ -132,7 +132,7 @@ func (c *instancePoolCache) rebuild(staticInstancePools map[string]*InstancePool if unrecoverableErrorMsg != "" { klog.V(4).Infof("Creating placeholder instances for %s.", *getInstancePoolResp.InstancePool.DisplayName) for i := len(*c.instanceSummaryCache[id]); i < *c.poolCache[id].Size; i++ { - c.addUnfulfilledInstanceToCache(id, fmt.Sprintf("%s%s-%d", consts.InstanceIDUnfulfilled, + c.addUnfulfilledInstanceToCache(id, fmt.Sprintf("%s%s-%d", ocicommon.InstanceIDUnfulfilled, *getInstancePoolResp.InstancePool.Id, i), *getInstancePoolResp.InstancePool.CompartmentId, fmt.Sprintf("%s-%d", *getInstancePoolResp.InstancePool.DisplayName, i)) } @@ -152,7 +152,7 @@ func (c *instancePoolCache) addUnfulfilledInstanceToCache(instancePoolID, instan *c.instanceSummaryCache[instancePoolID] = append(*c.instanceSummaryCache[instancePoolID], core.InstanceSummary{ Id: common.String(instanceID), CompartmentId: common.String(compartmentID), - State: common.String(consts.InstanceStateUnfulfilled), + State: common.String(ocicommon.InstanceStateUnfulfilled), DisplayName: common.String(name), }) } @@ -168,7 +168,7 @@ func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, i } var err error - if strings.Contains(instanceID, consts.InstanceIDUnfulfilled) { + if strings.Contains(instanceID, ocicommon.InstanceIDUnfulfilled) { // For an unfulfilled instance, reduce the target size of the instance pool and remove the placeholder instance from cache. err = c.setSize(instancePool.Id(), *c.poolCache[instancePool.Id()].Size-1) } else { @@ -201,9 +201,9 @@ func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, i func (c *instancePoolCache) findInstanceByDetails(ociInstance ocicommon.OciRef) (*ocicommon.OciRef, error) { // Unfilled instance placeholder - if strings.Contains(ociInstance.Name, consts.InstanceIDUnfulfilled) { + if strings.Contains(ociInstance.Name, ocicommon.InstanceIDUnfulfilled) { instIndex := strings.LastIndex(ociInstance.Name, "-") - ociInstance.InstancePoolID = strings.Replace(ociInstance.Name[:instIndex], consts.InstanceIDUnfulfilled, "", 1) + ociInstance.InstancePoolID = strings.Replace(ociInstance.Name[:instIndex], ocicommon.InstanceIDUnfulfilled, "", 1) return &ociInstance, nil } // Minimum amount of information we need to make a positive match diff --git a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_manager.go b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_manager.go index 6aa36f0509e0..4e40d0f37fc8 100644 --- a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_manager.go +++ b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_instance_pool_manager.go @@ -6,12 +6,13 @@ package instancepools import ( "fmt" - npconsts "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts" "os" "strconv" "strings" "time" + npconsts "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts" + ocicommon "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/common" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/instancepools/consts" @@ -284,11 +285,11 @@ func (m *InstancePoolManagerImpl) GetInstancePoolNodes(ip InstancePoolNodeGroup) status.State = cloudprovider.InstanceDeleting case string(core.InstanceLifecycleStateStopping): status.State = cloudprovider.InstanceDeleting - case consts.InstanceStateUnfulfilled: + case ocicommon.InstanceStateUnfulfilled: status.State = cloudprovider.InstanceCreating status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ ErrorClass: cloudprovider.OutOfResourcesErrorClass, - ErrorCode: consts.InstanceStateUnfulfilled, + ErrorCode: ocicommon.InstanceStateUnfulfilled, ErrorMessage: "OCI cannot provision additional instances for this instance pool. Review quota and/or capacity.", } } diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/cache.go b/cluster-autoscaler/cloudprovider/oci/nodepools/cache.go index 7270be5ee1be..1efbc178001f 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/cache.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/cache.go @@ -9,6 +9,7 @@ import ( "net/http" "sync" + ocicommon "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/common" "k8s.io/klog/v2" "github.com/pkg/errors" @@ -70,16 +71,23 @@ func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodep // removeInstance tries to remove the instance from the node pool. func (c *nodePoolCache) removeInstance(nodePoolID, instanceID string, nodeName string) error { - c.mu.Lock() - defer c.mu.Unlock() - if instanceID == "" { klog.Errorf("Node %s doesn't have an instance id so it can't be deleted.", nodeName) klog.Errorf("This could be due to a Compute Instance issue in OCI such as Out Of Host Capacity error. Check the instance status on OCI Console.") return errors.Errorf("Node %s doesn't have an instance id so it can't be deleted.", nodeName) + } else if ocicommon.InstanceIDUnfulfilled == instanceID { + // Remove an unprovisioned instance caused by capacity or quota issues so that it does not prevent or delay the + // autoscaler from attempting to scale a different pool that meets the scheduling requirements. + size, err := c.getSize(nodePoolID) + if err != nil { + return err + } + return c.setSize(nodePoolID, size-1) } klog.Infof("Deleting instance %q from node pool %q", instanceID, nodePoolID) + c.mu.Lock() + defer c.mu.Unlock() // always try to remove the instance. This call is idempotent scaleDown := true @@ -142,8 +150,13 @@ func (c *nodePoolCache) getByInstance(instanceID string) (*oke.NodePool, error) for _, nodePool := range c.cache { for _, node := range nodePool.Nodes { + // Either the IDs match or we're looking for an unfulfilled instance, and we've found a node pool with a + // node that cannot be created because of an error. if *node.Id == instanceID { return nodePool, nil + } else if ocicommon.InstanceIDUnfulfilled == instanceID && + *node.Id == "" && oke.NodeLifecycleStateCreating == node.LifecycleState && node.NodeError != nil { + return nodePool, nil } } }