Skip to content

Commit bec784a

Browse files
authored
Merge pull request #8358 from k8s-infra-cherrypick-robot/cherry-pick-8315-to-cluster-autoscaler-release-1.31
[cluster-autoscaler-release-1.31] Handle Out of host capacity scenario in OCI nodepools
2 parents cfc67a1 + 457c853 commit bec784a

File tree

3 files changed

+32
-12
lines changed

3 files changed

+32
-12
lines changed

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,7 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
519519

520520
nodePool, err := m.nodePoolCache.get(np.Id())
521521
if err != nil {
522+
klog.Error(err, "error while performing GetNodePoolNodes call")
522523
return nil, err
523524
}
524525

@@ -527,10 +528,14 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
527528

528529
if node.NodeError != nil {
529530

531+
// We should move away from the approach of determining a node error as an Out of host capacity
532+
// through string comparison. An error code specifically for Out of host capacity must be set
533+
// and returned in the API response.
530534
errorClass := cloudprovider.OtherErrorClass
531535
if *node.NodeError.Code == "LimitExceeded" ||
532-
(*node.NodeError.Code == "InternalServerError" &&
533-
strings.Contains(*node.NodeError.Message, "quota")) {
536+
*node.NodeError.Code == "QuotaExceeded" ||
537+
(*node.NodeError.Code == "InternalError" &&
538+
strings.Contains(*node.NodeError.Message, "Out of host capacity")) {
534539
errorClass = cloudprovider.OutOfResourcesErrorClass
535540
}
536541

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,8 @@ func TestGetNodePoolNodes(t *testing.T) {
119119
{
120120
Id: common.String("node8"),
121121
NodeError: &oke.NodeError{
122-
Code: common.String("InternalServerError"),
123-
Message: common.String("blah blah quota exceeded blah blah"),
122+
Code: common.String("InternalError"),
123+
Message: common.String("blah blah Out of host capacity blah blah"),
124124
},
125125
},
126126
},
@@ -179,8 +179,8 @@ func TestGetNodePoolNodes(t *testing.T) {
179179
State: cloudprovider.InstanceCreating,
180180
ErrorInfo: &cloudprovider.InstanceErrorInfo{
181181
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
182-
ErrorCode: "InternalServerError",
183-
ErrorMessage: "blah blah quota exceeded blah blah",
182+
ErrorCode: "InternalError",
183+
ErrorMessage: "blah blah Out of host capacity blah blah",
184184
},
185185
},
186186
},

cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,27 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
209209
}
210210
}
211211
klog.V(4).Infof("DECREASE_TARGET_CHECK_VIA_COMPUTE: %v", decreaseTargetCheckViaComputeBool)
212+
np.manager.InvalidateAndRefreshCache()
213+
nodes, err := np.manager.GetNodePoolNodes(np)
214+
if err != nil {
215+
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
216+
return err
217+
}
218+
// We do not have an OCI API that allows us to delete a node without a compute instance. So we rely on
219+
// the below approach to determine the number of running instances in a nodepool from the compute API and
220+
// update the size of the nodepool accordingly. We should move away from this approach once we have an API
221+
// to delete a specific node without a compute instance.
222+
if !decreaseTargetCheckViaComputeBool {
223+
for _, node := range nodes {
224+
if node.Status != nil && node.Status.ErrorInfo != nil {
225+
if node.Status.ErrorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass {
226+
klog.Infof("Using Compute to calculate nodepool size as nodepool may contain nodes without a compute instance.")
227+
decreaseTargetCheckViaComputeBool = true
228+
break
229+
}
230+
}
231+
}
232+
}
212233
var nodesLen int
213234
if decreaseTargetCheckViaComputeBool {
214235
nodesLen, err = np.manager.GetExistingNodePoolSizeViaCompute(np)
@@ -217,12 +238,6 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
217238
return err
218239
}
219240
} else {
220-
np.manager.InvalidateAndRefreshCache()
221-
nodes, err := np.manager.GetNodePoolNodes(np)
222-
if err != nil {
223-
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
224-
return err
225-
}
226241
nodesLen = len(nodes)
227242
}
228243

0 commit comments

Comments
 (0)