
Commit b690a99

[cluster-autoscaler-1.32]: prevent panic in SimulateNodeRemoval by handling missing node info
Parent: 7417da3


2 files changed: +205, -0 lines


cluster-autoscaler/simulator/cluster.go

Lines changed: 10 additions & 0 deletions
@@ -17,10 +17,12 @@ limitations under the License.
 package simulator
 
 import (
+	"errors"
 	"fmt"
 	"time"
 
 	apiv1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/drainability/rules"
@@ -91,6 +93,8 @@ const (
 	BlockedByPod
 	// UnexpectedError - node can't be removed because of an unexpected error.
 	UnexpectedError
+	// NoNodeInfo - node can't be removed because it doesn't have any node info in the cluster snapshot.
+	NoNodeInfo
 )
 
 // RemovalSimulator is a helper object for simulating node removal scenarios.
@@ -151,6 +155,12 @@ func (r *RemovalSimulator) SimulateNodeRemoval(
 	nodeInfo, err := r.clusterSnapshot.GetNodeInfo(nodeName)
 	if err != nil {
 		klog.Errorf("Can't retrieve node %s from snapshot, err: %v", nodeName, err)
+		unremovableReason := UnexpectedError
+		if errors.Is(err, clustersnapshot.ErrNodeNotFound) {
+			unremovableReason = NoNodeInfo
+		}
+		unremovableNode := &UnremovableNode{Node: &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, Reason: unremovableReason}
+		return nil, unremovableNode
 	}
 	klog.V(2).Infof("Simulating node %s removal", nodeName)
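
Before this patch, the error branch above only logged the failed GetNodeInfo lookup and continued, leaving the rest of the function to run against a missing nodeInfo, which is the panic the commit title refers to. The function now returns early with an UnremovableNode whose reason distinguishes a node that is simply absent from the snapshot (NoNodeInfo) from any other lookup failure (UnexpectedError). The following is an illustrative sketch only, not part of the commit: a hypothetical helper (classifyCandidates, with made-up candidates/destinations inputs) placed alongside these types in the simulator package, showing how a caller could branch on the new reason.

// Illustrative sketch, not part of this commit: a hypothetical caller of
// SimulateNodeRemoval that treats NoNodeInfo as a skippable condition rather
// than a fatal one. classifyCandidates, candidates and destinations are
// made-up names for this example.
package simulator

import (
	"time"

	klog "k8s.io/klog/v2"
)

func classifyCandidates(r *RemovalSimulator, candidates []string, destinations map[string]bool) []NodeToBeRemoved {
	var removable []NodeToBeRemoved
	for _, name := range candidates {
		// Same call shape as in the new test below; the last argument is left nil here.
		toRemove, unremovable := r.SimulateNodeRemoval(name, destinations, time.Now(), nil)
		if toRemove != nil {
			removable = append(removable, *toRemove)
			continue
		}
		if unremovable == nil {
			continue
		}
		switch unremovable.Reason {
		case NoNodeInfo:
			// The candidate has no node info in the cluster snapshot (for example,
			// it disappeared between listing candidates and simulating); skip it.
			klog.V(4).Infof("Skipping %s: no node info in cluster snapshot", name)
		case UnexpectedError:
			klog.Warningf("Unexpected error while simulating removal of %s", name)
		default:
			klog.V(4).Infof("Node %s is unremovable, reason: %v", name, unremovable.Reason)
		}
	}
	return removable
}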

cluster-autoscaler/simulator/cluster_test.go

Lines changed: 195 additions & 0 deletions
@@ -258,6 +258,201 @@ func TestFindNodesToRemove(t *testing.T) {
 	}
 }
 
+type simulateNodeRemovalTestConfig struct {
+	name        string
+	pods        []*apiv1.Pod
+	allNodes    []*apiv1.Node
+	nodeName    string
+	toRemove    *NodeToBeRemoved
+	unremovable *UnremovableNode
+}
+
+func TestSimulateNodeRemoval(t *testing.T) {
+	emptyNode := BuildTestNode("n1", 1000, 2000000)
+
+	// two small pods backed by ReplicaSet
+	drainableNode := BuildTestNode("n2", 1000, 2000000)
+	drainableNodeInfo := framework.NewTestNodeInfo(drainableNode)
+
+	// one small pod, not backed by anything
+	nonDrainableNode := BuildTestNode("n3", 1000, 2000000)
+	nonDrainableNodeInfo := framework.NewTestNodeInfo(nonDrainableNode)
+
+	// one very large pod
+	fullNode := BuildTestNode("n4", 1000, 2000000)
+	fullNodeInfo := framework.NewTestNodeInfo(fullNode)
+
+	// noExistNode doesn't have any node info in the cluster snapshot.
+	noExistNode := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "n5"}}
+
+	SetNodeReadyState(emptyNode, true, time.Time{})
+	SetNodeReadyState(drainableNode, true, time.Time{})
+	SetNodeReadyState(nonDrainableNode, true, time.Time{})
+	SetNodeReadyState(fullNode, true, time.Time{})
+
+	replicas := int32(5)
+	replicaSets := []*appsv1.ReplicaSet{
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "rs",
+				Namespace: "default",
+			},
+			Spec: appsv1.ReplicaSetSpec{
+				Replicas: &replicas,
+			},
+		},
+	}
+	rsLister, err := kube_util.NewTestReplicaSetLister(replicaSets)
+	assert.NoError(t, err)
+	registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
+
+	ownerRefs := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")
+
+	pod1 := BuildTestPod("p1", 100, 100000)
+	pod1.OwnerReferences = ownerRefs
+	pod1.Spec.NodeName = "n2"
+	drainableNodeInfo.AddPod(&framework.PodInfo{Pod: pod1})
+
+	pod2 := BuildTestPod("p2", 100, 100000)
+	pod2.OwnerReferences = ownerRefs
+	pod2.Spec.NodeName = "n2"
+	drainableNodeInfo.AddPod(&framework.PodInfo{Pod: pod2})
+
+	pod3 := BuildTestPod("p3", 100, 100000)
+	pod3.Spec.NodeName = "n3"
+	nonDrainableNodeInfo.AddPod(&framework.PodInfo{Pod: pod3})
+
+	pod4 := BuildTestPod("p4", 1000, 100000)
+	pod4.Spec.NodeName = "n4"
+	fullNodeInfo.AddPod(&framework.PodInfo{Pod: pod4})
+
+	clusterSnapshot := testsnapshot.NewTestSnapshotOrDie(t)
+
+	topoNode1 := BuildTestNode("topo-n1", 1000, 2000000)
+	topoNode2 := BuildTestNode("topo-n2", 1000, 2000000)
+	topoNode3 := BuildTestNode("topo-n3", 1000, 2000000)
+	topoNode1.Labels = map[string]string{"kubernetes.io/hostname": "topo-n1"}
+	topoNode2.Labels = map[string]string{"kubernetes.io/hostname": "topo-n2"}
+	topoNode3.Labels = map[string]string{"kubernetes.io/hostname": "topo-n3"}
+
+	SetNodeReadyState(topoNode1, true, time.Time{})
+	SetNodeReadyState(topoNode2, true, time.Time{})
+	SetNodeReadyState(topoNode3, true, time.Time{})
+
+	minDomains := int32(2)
+	maxSkew := int32(1)
+	topoConstraint := apiv1.TopologySpreadConstraint{
+		MaxSkew:           maxSkew,
+		TopologyKey:       "kubernetes.io/hostname",
+		WhenUnsatisfiable: apiv1.DoNotSchedule,
+		MinDomains:        &minDomains,
+		LabelSelector: &metav1.LabelSelector{
+			MatchLabels: map[string]string{
+				"app": "topo-app",
+			},
+		},
+	}
+
+	pod5 := BuildTestPod("p5", 100, 100000)
+	pod5.Labels = map[string]string{"app": "topo-app"}
+	pod5.OwnerReferences = ownerRefs
+	pod5.Spec.NodeName = "topo-n1"
+	pod5.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}
+
+	pod6 := BuildTestPod("p6", 100, 100000)
+	pod6.Labels = map[string]string{"app": "topo-app"}
+	pod6.OwnerReferences = ownerRefs
+	pod6.Spec.NodeName = "topo-n2"
+	pod6.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}
+
+	pod7 := BuildTestPod("p7", 100, 100000)
+	pod7.Labels = map[string]string{"app": "topo-app"}
+	pod7.OwnerReferences = ownerRefs
+	pod7.Spec.NodeName = "topo-n3"
+	pod7.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}
+
+	blocker1 := BuildTestPod("blocker1", 100, 100000)
+	blocker1.Spec.NodeName = "topo-n2"
+	blocker2 := BuildTestPod("blocker2", 100, 100000)
+	blocker2.Spec.NodeName = "topo-n3"
+
+	tests := []simulateNodeRemovalTestConfig{
+		{
+			name:        "just an empty node, should be removed",
+			nodeName:    emptyNode.Name,
+			allNodes:    []*apiv1.Node{emptyNode},
+			toRemove:    &NodeToBeRemoved{Node: emptyNode},
+			unremovable: nil,
+		},
+		{
+			name:     "just a drainable node, but nowhere for pods to go to",
+			pods:     []*apiv1.Pod{pod1, pod2},
+			nodeName: drainableNode.Name,
+			allNodes: []*apiv1.Node{drainableNode},
+			toRemove: nil,
+			unremovable: &UnremovableNode{
+				Node:   drainableNode,
+				Reason: NoPlaceToMovePods,
+			},
+		},
+		{
+			name:        "drainable node, and a mostly empty node that can take its pods",
+			pods:        []*apiv1.Pod{pod1, pod2, pod3},
+			nodeName:    drainableNode.Name,
+			allNodes:    []*apiv1.Node{drainableNode, nonDrainableNode},
+			toRemove:    &NodeToBeRemoved{Node: drainableNode, PodsToReschedule: []*apiv1.Pod{pod1, pod2}},
+			unremovable: nil,
+		},
+		{
+			name:        "drainable node, and a full node that cannot fit anymore pods",
+			pods:        []*apiv1.Pod{pod1, pod2, pod4},
+			nodeName:    drainableNode.Name,
+			allNodes:    []*apiv1.Node{drainableNode, fullNode},
+			toRemove:    nil,
+			unremovable: &UnremovableNode{Node: drainableNode, Reason: NoPlaceToMovePods},
+		},
+		{
+			name:        "4 nodes, 1 empty, 1 drainable",
+			pods:        []*apiv1.Pod{pod1, pod2, pod3, pod4},
+			nodeName:    emptyNode.Name,
+			allNodes:    []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode},
+			toRemove:    &NodeToBeRemoved{Node: emptyNode},
+			unremovable: nil,
+		},
+		{
+			name:        "topology spread constraint test - one node should be removable",
+			pods:        []*apiv1.Pod{pod5, pod6, pod7, blocker1, blocker2},
+			allNodes:    []*apiv1.Node{topoNode1, topoNode2, topoNode3},
+			nodeName:    topoNode1.Name,
+			toRemove:    &NodeToBeRemoved{Node: topoNode1, PodsToReschedule: []*apiv1.Pod{pod5}},
+			unremovable: nil,
+		},
+		{
+			name:        "candidate not in clusterSnapshot should be marked unremovable",
+			nodeName:    noExistNode.Name,
+			allNodes:    []*apiv1.Node{},
+			pods:        []*apiv1.Pod{},
+			toRemove:    nil,
+			unremovable: &UnremovableNode{Node: noExistNode, Reason: NoNodeInfo},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			destinations := make(map[string]bool)
+			for _, node := range test.allNodes {
+				destinations[node.Name] = true
+			}
+			clustersnapshot.InitializeClusterSnapshotOrDie(t, clusterSnapshot, test.allNodes, test.pods)
+			r := NewRemovalSimulator(registry, clusterSnapshot, testDeleteOptions(), nil, false)
+			toRemove, unremovable := r.SimulateNodeRemoval(test.nodeName, destinations, time.Now(), nil)
+			fmt.Printf("Test scenario: %s, toRemove=%v, unremovable=%v\n", test.name, toRemove, unremovable)
+			assert.Equal(t, test.toRemove, toRemove)
+			assert.Equal(t, test.unremovable, unremovable)
+		})
+	}
+}
+
 func testDeleteOptions() options.NodeDeleteOptions {
 	return options.NodeDeleteOptions{
 		SkipNodesWithSystemPods: true,
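
To run just the new test, something like the following should work, assuming the Go module root is the cluster-autoscaler directory of this repository (the -run and -v flags are standard go test options, not taken from the commit):

	go test ./simulator/... -run TestSimulateNodeRemoval -v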
