Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/common/consts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package common

// Shared markers for instances that a pool has been asked for but has not
// (yet) been able to provide.
const (
	// InstanceIDUnfulfilled is the generic placeholder name given to
	// upcoming instances that do not exist yet.
	InstanceIDUnfulfilled = "instance_placeholder"

	// InstanceStateUnfulfilled is the status reported when the pool was
	// unable to fulfill the requested operation.
	InstanceStateUnfulfilled = "Unfulfilled"
)
16 changes: 12 additions & 4 deletions cluster-autoscaler/cloudprovider/oci/common/oci_ref.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ Copyright 2021-2023 Oracle and/or its affiliates.
package common

import (
"strings"

apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/instancepools/consts"
"strings"
)

// OciRef contains a reference to some entity in the OCI world.
Expand Down Expand Up @@ -87,9 +89,9 @@ func getNodeExternalAddress(node *apiv1.Node) string {
func getNodeInstancePoolID(node *apiv1.Node) string {

// Handle unfilled instance placeholder (instances that have yet to be created)
if strings.Contains(node.Name, consts.InstanceIDUnfulfilled) {
if strings.Contains(node.Name, InstanceIDUnfulfilled) {
instIndex := strings.LastIndex(node.Name, "-")
return strings.Replace(node.Name[:instIndex], consts.InstanceIDUnfulfilled, "", 1)
return strings.Replace(node.Name[:instIndex], InstanceIDUnfulfilled, "", 1)
}

poolIDPrefixLabel, _ := node.Labels[consts.InstancePoolIDLabelPrefix]
Expand All @@ -111,8 +113,14 @@ func getNodeInstanceID(node *apiv1.Node) string {
}

// Handle unfilled instance placeholder (instances that have yet to be created)
if strings.Contains(node.Name, consts.InstanceIDUnfulfilled) {
if strings.Contains(node.Name, InstanceIDUnfulfilled) {
return node.Name
} else if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeUnregistered {
// Placeholder node created by CA for a node that has not registered itself yet.
return InstanceIDUnfulfilled
} else if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeCreateError {
// Placeholder node created by CA for a node that cannot be created because of an error.
return InstanceIDUnfulfilled
}

instancePrefixLabel, _ := node.Labels[consts.InstanceIDLabelPrefix]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ Copyright 2021-2023 Oracle and/or its affiliates.
package consts

import (
v1 "k8s.io/api/core/v1"
"time"

v1 "k8s.io/api/core/v1"
)

const (
Expand Down Expand Up @@ -46,10 +47,6 @@ const (
OciInstancePoolResourceIdent = "instancepool"
// OciInstancePoolLaunchOp is an instance pools operation type
OciInstancePoolLaunchOp = "LaunchInstancesInPool"
// InstanceStateUnfulfilled is a status indicating that the instance pool was unable to fulfill the operation
InstanceStateUnfulfilled = "Unfulfilled"
// InstanceIDUnfulfilled is the generic placeholder name for upcoming instances
InstanceIDUnfulfilled = "instance_placeholder"

// OciInstancePoolIDNonPoolMember indicates a kubernetes node doesn't belong to any OCI Instance Pool.
OciInstancePoolIDNonPoolMember = "non_pool_member"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ func (c *instancePoolCache) rebuild(staticInstancePools map[string]*InstancePool
if unrecoverableErrorMsg != "" {
klog.V(4).Infof("Creating placeholder instances for %s.", *getInstancePoolResp.InstancePool.DisplayName)
for i := len(*c.instanceSummaryCache[id]); i < *c.poolCache[id].Size; i++ {
c.addUnfulfilledInstanceToCache(id, fmt.Sprintf("%s%s-%d", consts.InstanceIDUnfulfilled,
c.addUnfulfilledInstanceToCache(id, fmt.Sprintf("%s%s-%d", ocicommon.InstanceIDUnfulfilled,
*getInstancePoolResp.InstancePool.Id, i), *getInstancePoolResp.InstancePool.CompartmentId,
fmt.Sprintf("%s-%d", *getInstancePoolResp.InstancePool.DisplayName, i))
}
Expand All @@ -152,7 +152,7 @@ func (c *instancePoolCache) addUnfulfilledInstanceToCache(instancePoolID, instan
*c.instanceSummaryCache[instancePoolID] = append(*c.instanceSummaryCache[instancePoolID], core.InstanceSummary{
Id: common.String(instanceID),
CompartmentId: common.String(compartmentID),
State: common.String(consts.InstanceStateUnfulfilled),
State: common.String(ocicommon.InstanceStateUnfulfilled),
DisplayName: common.String(name),
})
}
Expand All @@ -168,7 +168,7 @@ func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, i
}

var err error
if strings.Contains(instanceID, consts.InstanceIDUnfulfilled) {
if strings.Contains(instanceID, ocicommon.InstanceIDUnfulfilled) {
// For an unfulfilled instance, reduce the target size of the instance pool and remove the placeholder instance from cache.
err = c.setSize(instancePool.Id(), *c.poolCache[instancePool.Id()].Size-1)
} else {
Expand Down Expand Up @@ -201,9 +201,9 @@ func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, i
func (c *instancePoolCache) findInstanceByDetails(ociInstance ocicommon.OciRef) (*ocicommon.OciRef, error) {

// Unfilled instance placeholder
if strings.Contains(ociInstance.Name, consts.InstanceIDUnfulfilled) {
if strings.Contains(ociInstance.Name, ocicommon.InstanceIDUnfulfilled) {
instIndex := strings.LastIndex(ociInstance.Name, "-")
ociInstance.InstancePoolID = strings.Replace(ociInstance.Name[:instIndex], consts.InstanceIDUnfulfilled, "", 1)
ociInstance.InstancePoolID = strings.Replace(ociInstance.Name[:instIndex], ocicommon.InstanceIDUnfulfilled, "", 1)
return &ociInstance, nil
}
// Minimum amount of information we need to make a positive match
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ package instancepools

import (
"fmt"
npconsts "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"
"os"
"strconv"
"strings"
"time"

npconsts "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"

ocicommon "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/common"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/instancepools/consts"

Expand Down Expand Up @@ -284,11 +285,11 @@ func (m *InstancePoolManagerImpl) GetInstancePoolNodes(ip InstancePoolNodeGroup)
status.State = cloudprovider.InstanceDeleting
case string(core.InstanceLifecycleStateStopping):
status.State = cloudprovider.InstanceDeleting
case consts.InstanceStateUnfulfilled:
case ocicommon.InstanceStateUnfulfilled:
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: consts.InstanceStateUnfulfilled,
ErrorCode: ocicommon.InstanceStateUnfulfilled,
ErrorMessage: "OCI cannot provision additional instances for this instance pool. Review quota and/or capacity.",
}
}
Expand Down
17 changes: 14 additions & 3 deletions cluster-autoscaler/cloudprovider/oci/nodepools/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"net/http"
"sync"

ocicommon "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/common"
"k8s.io/klog/v2"

"github.com/pkg/errors"
Expand Down Expand Up @@ -70,16 +71,23 @@ func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodep

// removeInstance tries to remove the instance from the node pool.
func (c *nodePoolCache) removeInstance(nodePoolID, instanceID string, nodeName string) error {
c.mu.Lock()
defer c.mu.Unlock()

if instanceID == "" {
klog.Errorf("Node %s doesn't have an instance id so it can't be deleted.", nodeName)
klog.Errorf("This could be due to a Compute Instance issue in OCI such as Out Of Host Capacity error. Check the instance status on OCI Console.")
return errors.Errorf("Node %s doesn't have an instance id so it can't be deleted.", nodeName)
} else if ocicommon.InstanceIDUnfulfilled == instanceID {
// Remove an unprovisioned instance caused by capacity or quota issues so that it does not prevent or delay the
// autoscaler from attempting to scale a different pool that meets the scheduling requirements.
size, err := c.getSize(nodePoolID)
if err != nil {
return err
}
return c.setSize(nodePoolID, size-1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this code change, would we ever have a situation where the above if-condition (instanceID == "") is true?

If yes, should we decrement the size of the nodepool there as well?

Copy link
Contributor Author

@jlamillan jlamillan Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This proposed code change is handling a specific scenario where the Cluster Autoscaler has requested that we remove an unregistered node whose underlying compute instance ID cannot be established. Outside of that scenario, I am not sure why a compute ID could not be determined, or whether decrementing the size of the node-pool is the right thing to do.

}

klog.Infof("Deleting instance %q from node pool %q", instanceID, nodePoolID)
c.mu.Lock()
defer c.mu.Unlock()

// always try to remove the instance. This call is idempotent
scaleDown := true
Expand Down Expand Up @@ -142,8 +150,11 @@ func (c *nodePoolCache) getByInstance(instanceID string) (*oke.NodePool, error)

for _, nodePool := range c.cache {
for _, node := range nodePool.Nodes {
// Either the IDs match or we're looking for an unfulfilled instance, and we've found an unfulfilled node.
if *node.Id == instanceID {
return nodePool, nil
} else if ocicommon.InstanceIDUnfulfilled == instanceID && *node.Id == "" {
return nodePool, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible that this line is going to cause us to return the wrong node pool?

For instance if a user used balance-similar-node-groups, and 3 node pools scaled up all at once with "unfulfilled instances".

But when we do getByInstance, will the "unfulfilled instances" return the first node pool every time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pushed a commit to only match an unfulfilled instance to a node pool if that pool has a node in CREATING status with an associated error.

In short, if a node in a pool can't be created due to an error, we treat that pool as the match to an unfulfilled instance.

}
}
}
Expand Down