
Commit 48dfe75

Merge pull request #8449 from kincoy/fix/simulate-removal-nonexistent-node
fix(cluster-autoscaler): prevent panic in SimulateNodeRemoval by handling missing node info
2 parents: a2934f7 + 1b7f6b0 · commit 48dfe75

File tree: 2 files changed, +248 additions, -0 deletions

cluster-autoscaler/simulator/cluster.go

Lines changed: 10 additions & 0 deletions
@@ -17,10 +17,12 @@ limitations under the License.
 package simulator
 
 import (
+	"errors"
 	"fmt"
 	"time"
 
 	apiv1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator/drainability/rules"
@@ -91,6 +93,8 @@ const (
 	BlockedByPod
 	// UnexpectedError - node can't be removed because of an unexpected error.
 	UnexpectedError
+	// NoNodeInfo - node can't be removed because it doesn't have any node info in the cluster snapshot.
+	NoNodeInfo
 )
 
 // RemovalSimulator is a helper object for simulating node removal scenarios.
@@ -128,6 +132,12 @@ func (r *RemovalSimulator) SimulateNodeRemoval(
 	nodeInfo, err := r.clusterSnapshot.GetNodeInfo(nodeName)
 	if err != nil {
 		klog.Errorf("Can't retrieve node %s from snapshot, err: %v", nodeName, err)
+		unremovableReason := UnexpectedError
+		if errors.Is(err, clustersnapshot.ErrNodeNotFound) {
+			unremovableReason = NoNodeInfo
+		}
+		unremovableNode := &UnremovableNode{Node: &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, Reason: unremovableReason}
+		return nil, unremovableNode
 	}
 	klog.V(2).Infof("Simulating node %s removal", nodeName)

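The change above stops SimulateNodeRemoval from proceeding with a nil nodeInfo when the candidate node is absent from the cluster snapshot: the error is inspected with errors.Is, and the caller receives an UnremovableNode whose Reason is NoNodeInfo rather than UnexpectedError. Below is a minimal caller-side sketch of how that result might be consumed; it is not part of this commit. The handleSimulation helper name and the klog/v2 import path are illustrative assumptions, while the SimulateNodeRemoval signature, the return types, and the reason constants come from this diff and the test file that follows.

package simulator

import (
	"time"

	klog "k8s.io/klog/v2"
)

// handleSimulation is an illustrative (hypothetical) helper showing how a caller
// could branch on the NoNodeInfo reason returned for a node that has no node info
// in the cluster snapshot, instead of hitting a nil nodeInfo.
func handleSimulation(r *RemovalSimulator, nodeName string, destinations map[string]bool) *NodeToBeRemoved {
	toRemove, unremovable := r.SimulateNodeRemoval(nodeName, destinations, time.Now(), nil)
	if unremovable != nil {
		switch unremovable.Reason {
		case NoNodeInfo:
			// The candidate is not in the snapshot; skip it quietly.
			klog.V(2).Infof("node %s has no node info in the cluster snapshot, skipping", nodeName)
		default:
			// Any other reason (e.g. UnexpectedError, NoPlaceToMovePods) is logged and skipped as well.
			klog.Warningf("node %s is unremovable, reason: %v", nodeName, unremovable.Reason)
		}
		return nil
	}
	return toRemove
}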
Lines changed: 238 additions & 0 deletions
@@ -0,0 +1,238 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package simulator

import (
	"fmt"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"

	appsv1 "k8s.io/api/apps/v1"
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot/testsnapshot"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
	"k8s.io/autoscaler/cluster-autoscaler/simulator/options"
	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
	. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
)

type simulateNodeRemovalTestConfig struct {
	name        string
	pods        []*apiv1.Pod
	allNodes    []*apiv1.Node
	nodeName    string
	toRemove    *NodeToBeRemoved
	unremovable *UnremovableNode
}

func TestSimulateNodeRemoval(t *testing.T) {
	emptyNode := BuildTestNode("n1", 1000, 2000000)

	// two small pods backed by ReplicaSet
	drainableNode := BuildTestNode("n2", 1000, 2000000)
	drainableNodeInfo := framework.NewTestNodeInfo(drainableNode)

	// one small pod, not backed by anything
	nonDrainableNode := BuildTestNode("n3", 1000, 2000000)
	nonDrainableNodeInfo := framework.NewTestNodeInfo(nonDrainableNode)

	// one very large pod
	fullNode := BuildTestNode("n4", 1000, 2000000)
	fullNodeInfo := framework.NewTestNodeInfo(fullNode)

	// noExistNode doesn't have any node info in the cluster snapshot.
	noExistNode := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "n5"}}

	SetNodeReadyState(emptyNode, true, time.Time{})
	SetNodeReadyState(drainableNode, true, time.Time{})
	SetNodeReadyState(nonDrainableNode, true, time.Time{})
	SetNodeReadyState(fullNode, true, time.Time{})

	replicas := int32(5)
	replicaSets := []*appsv1.ReplicaSet{
		{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "rs",
				Namespace: "default",
			},
			Spec: appsv1.ReplicaSetSpec{
				Replicas: &replicas,
			},
		},
	}
	rsLister, err := kube_util.NewTestReplicaSetLister(replicaSets)
	assert.NoError(t, err)
	registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, rsLister, nil)

	ownerRefs := GenerateOwnerReferences("rs", "ReplicaSet", "extensions/v1beta1", "")

	pod1 := BuildTestPod("p1", 100, 100000)
	pod1.OwnerReferences = ownerRefs
	pod1.Spec.NodeName = "n2"
	drainableNodeInfo.AddPod(&framework.PodInfo{Pod: pod1})

	pod2 := BuildTestPod("p2", 100, 100000)
	pod2.OwnerReferences = ownerRefs
	pod2.Spec.NodeName = "n2"
	drainableNodeInfo.AddPod(&framework.PodInfo{Pod: pod2})

	pod3 := BuildTestPod("p3", 100, 100000)
	pod3.Spec.NodeName = "n3"
	nonDrainableNodeInfo.AddPod(&framework.PodInfo{Pod: pod3})

	pod4 := BuildTestPod("p4", 1000, 100000)
	pod4.Spec.NodeName = "n4"
	fullNodeInfo.AddPod(&framework.PodInfo{Pod: pod4})

	clusterSnapshot := testsnapshot.NewTestSnapshotOrDie(t)

	topoNode1 := BuildTestNode("topo-n1", 1000, 2000000)
	topoNode2 := BuildTestNode("topo-n2", 1000, 2000000)
	topoNode3 := BuildTestNode("topo-n3", 1000, 2000000)
	topoNode1.Labels = map[string]string{"kubernetes.io/hostname": "topo-n1"}
	topoNode2.Labels = map[string]string{"kubernetes.io/hostname": "topo-n2"}
	topoNode3.Labels = map[string]string{"kubernetes.io/hostname": "topo-n3"}

	SetNodeReadyState(topoNode1, true, time.Time{})
	SetNodeReadyState(topoNode2, true, time.Time{})
	SetNodeReadyState(topoNode3, true, time.Time{})

	minDomains := int32(2)
	maxSkew := int32(1)
	topoConstraint := apiv1.TopologySpreadConstraint{
		MaxSkew:           maxSkew,
		TopologyKey:       "kubernetes.io/hostname",
		WhenUnsatisfiable: apiv1.DoNotSchedule,
		MinDomains:        &minDomains,
		LabelSelector: &metav1.LabelSelector{
			MatchLabels: map[string]string{
				"app": "topo-app",
			},
		},
	}

	pod5 := BuildTestPod("p5", 100, 100000)
	pod5.Labels = map[string]string{"app": "topo-app"}
	pod5.OwnerReferences = ownerRefs
	pod5.Spec.NodeName = "topo-n1"
	pod5.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}

	pod6 := BuildTestPod("p6", 100, 100000)
	pod6.Labels = map[string]string{"app": "topo-app"}
	pod6.OwnerReferences = ownerRefs
	pod6.Spec.NodeName = "topo-n2"
	pod6.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}

	pod7 := BuildTestPod("p7", 100, 100000)
	pod7.Labels = map[string]string{"app": "topo-app"}
	pod7.OwnerReferences = ownerRefs
	pod7.Spec.NodeName = "topo-n3"
	pod7.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}

	blocker1 := BuildTestPod("blocker1", 100, 100000)
	blocker1.Spec.NodeName = "topo-n2"
	blocker2 := BuildTestPod("blocker2", 100, 100000)
	blocker2.Spec.NodeName = "topo-n3"

	tests := []simulateNodeRemovalTestConfig{
		{
			name:        "just an empty node, should be removed",
			nodeName:    emptyNode.Name,
			allNodes:    []*apiv1.Node{emptyNode},
			toRemove:    &NodeToBeRemoved{Node: emptyNode},
			unremovable: nil,
		},
		{
			name:     "just a drainable node, but nowhere for pods to go to",
			pods:     []*apiv1.Pod{pod1, pod2},
			nodeName: drainableNode.Name,
			allNodes: []*apiv1.Node{drainableNode},
			toRemove: nil,
			unremovable: &UnremovableNode{
				Node:   drainableNode,
				Reason: NoPlaceToMovePods,
			},
		},
		{
			name:        "drainable node, and a mostly empty node that can take its pods",
			pods:        []*apiv1.Pod{pod1, pod2, pod3},
			nodeName:    drainableNode.Name,
			allNodes:    []*apiv1.Node{drainableNode, nonDrainableNode},
			toRemove:    &NodeToBeRemoved{Node: drainableNode, PodsToReschedule: []*apiv1.Pod{pod1, pod2}},
			unremovable: nil,
		},
		{
			name:        "drainable node, and a full node that cannot fit anymore pods",
			pods:        []*apiv1.Pod{pod1, pod2, pod4},
			nodeName:    drainableNode.Name,
			allNodes:    []*apiv1.Node{drainableNode, fullNode},
			toRemove:    nil,
			unremovable: &UnremovableNode{Node: drainableNode, Reason: NoPlaceToMovePods},
		},
		{
			name:        "4 nodes, 1 empty, 1 drainable",
			pods:        []*apiv1.Pod{pod1, pod2, pod3, pod4},
			nodeName:    emptyNode.Name,
			allNodes:    []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode},
			toRemove:    &NodeToBeRemoved{Node: emptyNode},
			unremovable: nil,
		},
		{
			name:        "topology spread constraint test - one node should be removable",
			pods:        []*apiv1.Pod{pod5, pod6, pod7, blocker1, blocker2},
			allNodes:    []*apiv1.Node{topoNode1, topoNode2, topoNode3},
			nodeName:    topoNode1.Name,
			toRemove:    &NodeToBeRemoved{Node: topoNode1, PodsToReschedule: []*apiv1.Pod{pod5}},
			unremovable: nil,
		},
		{
			name:        "candidate not in clusterSnapshot should be marked unremovable",
			nodeName:    noExistNode.Name,
			allNodes:    []*apiv1.Node{},
			pods:        []*apiv1.Pod{},
			toRemove:    nil,
			unremovable: &UnremovableNode{Node: noExistNode, Reason: NoNodeInfo},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			destinations := make(map[string]bool)
			for _, node := range test.allNodes {
				destinations[node.Name] = true
			}
			clustersnapshot.InitializeClusterSnapshotOrDie(t, clusterSnapshot, test.allNodes, test.pods)
			r := NewRemovalSimulator(registry, clusterSnapshot, testDeleteOptions(), nil, false)
			toRemove, unremovable := r.SimulateNodeRemoval(test.nodeName, destinations, time.Now(), nil)
			fmt.Printf("Test scenario: %s, toRemove=%v, unremovable=%v\n", test.name, toRemove, unremovable)
			assert.Equal(t, test.toRemove, toRemove)
			assert.Equal(t, test.unremovable, unremovable)
		})
	}
}

func testDeleteOptions() options.NodeDeleteOptions {
	return options.NodeDeleteOptions{
		SkipNodesWithSystemPods:           true,
		SkipNodesWithLocalStorage:         true,
		SkipNodesWithCustomControllerPods: true,
	}
}
