Skip to content

Commit f83faea

Browse files
authored
fix: not failing the main loop when one NodeGroup fails on TemplateNodeInfo() (#8402) (#8422)
* fix: not failing the main loop when one NodeGroup fails on TemplateNodeInfo()
* test: add a unit test
1 parent 4dbf1a7 commit f83faea

File tree

2 files changed

+47
-8
lines changed

2 files changed

+47
-8
lines changed

cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -154,12 +154,10 @@ func (p *MixedTemplateNodeInfoProvider) Process(ctx *context.AutoscalingContext,
154154
// working nodes in the node groups. By default CA tries to use a real-world example.
155155
nodeInfo, err := utils.GetNodeInfoFromTemplate(nodeGroup, daemonsets, taintConfig)
156156
if err != nil {
157-
if err == cloudprovider.ErrNotImplemented {
158-
continue
159-
} else {
157+
if err != cloudprovider.ErrNotImplemented {
160158
klog.Errorf("Unable to build proper template node for %s: %v", id, err)
161-
return map[string]*schedulerframework.NodeInfo{}, errors.ToAutoscalerError(errors.CloudProviderError, err)
162159
}
160+
continue
163161
}
164162
result[id] = nodeInfo
165163
}

cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor_test.go

Lines changed: 45 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,16 +20,17 @@ import (
2020
"testing"
2121
"time"
2222

23+
"github.com/stretchr/testify/assert"
24+
25+
appsv1 "k8s.io/api/apps/v1"
26+
apiv1 "k8s.io/api/core/v1"
2327
testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
2428
"k8s.io/autoscaler/cluster-autoscaler/context"
29+
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
2530
"k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker"
2631
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
2732
"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
2833
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
29-
30-
"github.com/stretchr/testify/assert"
31-
appsv1 "k8s.io/api/apps/v1"
32-
apiv1 "k8s.io/api/core/v1"
3334
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
3435
)
3536

@@ -290,6 +291,46 @@ func TestGetNodeInfosCacheExpired(t *testing.T) {
290291

291292
}
292293

294+
// TestProcessHandlesTemplateNodeInfoErrors checks that Process still returns
// a result, without an error, when a template NodeInfo is available for only
// one of two registered node groups. The test registers node groups "ng1" and
// "ng2", supplies a NodeInfo for "ng2" only, and expects the returned map to
// contain exactly "ng2".
func TestProcessHandlesTemplateNodeInfoErrors(t *testing.T) {
295+
now := time.Now()
296+
297+
tn := BuildTestNode("tn", 1000, 1000)
298+
tni := schedulerframework.NewNodeInfo()
299+
tni.SetNode(tn)
300+
301+
// Only "ng2" has an entry in the NodeInfo map passed to the test provider;
// presumably this map backs TemplateNodeInfo() per node group — TODO confirm
// against the test cloud provider.
provider := testprovider.NewTestAutoprovisioningCloudProvider(
302+
nil, nil, nil, nil, nil, map[string]*schedulerframework.NodeInfo{"ng2": tni})
303+
304+
provider.AddNodeGroup("ng1", 0, 10, 0)
305+
provider.AddNodeGroup("ng2", 0, 10, 0)
306+
307+
podLister := kube_util.NewTestPodLister([]*apiv1.Pod{})
308+
registry := kube_util.NewListerRegistry(nil, nil, podLister, nil, nil, nil, nil, nil, nil)
309+
310+
predicateChecker, err := predicatechecker.NewTestPredicateChecker()
311+
assert.NoError(t, err)
312+
313+
ctx := context.AutoscalingContext{
314+
CloudProvider: provider,
315+
ClusterSnapshot: clustersnapshot.NewBasicClusterSnapshot(),
316+
PredicateChecker: predicateChecker,
317+
AutoscalingKubeClients: context.AutoscalingKubeClients{
318+
ListerRegistry: registry,
319+
},
320+
}
321+
322+
// Process is called with empty node and DaemonSet lists, so no real nodes
// are supplied for either group.
res, err := NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&ctx, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
323+
324+
// Should not fail despite ng1 error - continues processing
325+
assert.NoError(t, err)
326+
assert.Equal(t, 1, len(res))
327+
328+
_, found := res["ng2"]
329+
assert.True(t, found)
330+
_, found = res["ng1"]
331+
assert.False(t, found) // ng1 skipped due to template error
332+
}
333+
293334
func assertEqualNodeCapacities(t *testing.T, expected, actual *apiv1.Node) {
294335
t.Helper()
295336
assert.NotEqual(t, actual.Status, nil, "")

0 commit comments

Comments
 (0)