Skip to content

Commit d6f09f7

Browse files
Fix UpdateSnapshot when Node is partially removed
Change-Id: I5b459e9ea67020183c87d1ce0a2380efb8cc3e05
1 parent db9f1e9 commit d6f09f7

File tree

3 files changed

+35
-23
lines changed

3 files changed

+35
-23
lines changed

pkg/scheduler/internal/cache/cache.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,8 @@ func (cache *schedulerCache) Dump() *Dump {
196196

197197
// UpdateSnapshot takes a snapshot of cached NodeInfo map. This is called at
198198
// beginning of every scheduling cycle.
199+
// The snapshot only includes Nodes that are not deleted at the time this function is called.
200+
// nodeinfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
199201
// This function tracks generation number of NodeInfo and updates only the
200202
// entries of an existing snapshot that have changed after the snapshot was taken.
201203
func (cache *schedulerCache) UpdateSnapshot(nodeSnapshot *Snapshot) error {
@@ -256,7 +258,10 @@ func (cache *schedulerCache) UpdateSnapshot(nodeSnapshot *Snapshot) error {
256258
nodeSnapshot.generation = cache.headNode.info.Generation
257259
}
258260

259-
if len(nodeSnapshot.nodeInfoMap) > len(cache.nodes) {
261+
// Comparing to pods in nodeTree.
262+
// Deleted nodes get removed from the tree, but they might remain in the nodes map
263+
// if they still have non-deleted Pods.
264+
if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
260265
cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
261266
updateAllLists = true
262267
}
@@ -318,12 +323,12 @@ func (cache *schedulerCache) updateNodeInfoSnapshotList(snapshot *Snapshot, upda
318323

319324
// If certain nodes were deleted after the last snapshot was taken, we should remove them from the snapshot.
320325
func (cache *schedulerCache) removeDeletedNodesFromSnapshot(snapshot *Snapshot) {
321-
toDelete := len(snapshot.nodeInfoMap) - len(cache.nodes)
326+
toDelete := len(snapshot.nodeInfoMap) - cache.nodeTree.numNodes
322327
for name := range snapshot.nodeInfoMap {
323328
if toDelete <= 0 {
324329
break
325330
}
326-
if _, ok := cache.nodes[name]; !ok {
331+
if n, ok := cache.nodes[name]; !ok || n.info.Node() == nil {
327332
delete(snapshot.nodeInfoMap, name)
328333
toDelete--
329334
}

pkg/scheduler/internal/cache/cache_test.go

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,66 +1257,66 @@ func TestSchedulerCache_UpdateSnapshot(t *testing.T) {
12571257

12581258
var cache *schedulerCache
12591259
var snapshot *Snapshot
1260-
type operation = func()
1260+
type operation = func(t *testing.T)
12611261

12621262
addNode := func(i int) operation {
1263-
return func() {
1263+
return func(t *testing.T) {
12641264
if err := cache.AddNode(nodes[i]); err != nil {
12651265
t.Error(err)
12661266
}
12671267
}
12681268
}
12691269
removeNode := func(i int) operation {
1270-
return func() {
1270+
return func(t *testing.T) {
12711271
if err := cache.RemoveNode(nodes[i]); err != nil {
12721272
t.Error(err)
12731273
}
12741274
}
12751275
}
12761276
updateNode := func(i int) operation {
1277-
return func() {
1277+
return func(t *testing.T) {
12781278
if err := cache.UpdateNode(nodes[i], updatedNodes[i]); err != nil {
12791279
t.Error(err)
12801280
}
12811281
}
12821282
}
12831283
addPod := func(i int) operation {
1284-
return func() {
1284+
return func(t *testing.T) {
12851285
if err := cache.AddPod(pods[i]); err != nil {
12861286
t.Error(err)
12871287
}
12881288
}
12891289
}
12901290
addPodWithAffinity := func(i int) operation {
1291-
return func() {
1291+
return func(t *testing.T) {
12921292
if err := cache.AddPod(podsWithAffinity[i]); err != nil {
12931293
t.Error(err)
12941294
}
12951295
}
12961296
}
12971297
removePod := func(i int) operation {
1298-
return func() {
1298+
return func(t *testing.T) {
12991299
if err := cache.RemovePod(pods[i]); err != nil {
13001300
t.Error(err)
13011301
}
13021302
}
13031303
}
13041304
removePodWithAffinity := func(i int) operation {
1305-
return func() {
1305+
return func(t *testing.T) {
13061306
if err := cache.RemovePod(podsWithAffinity[i]); err != nil {
13071307
t.Error(err)
13081308
}
13091309
}
13101310
}
13111311
updatePod := func(i int) operation {
1312-
return func() {
1312+
return func(t *testing.T) {
13131313
if err := cache.UpdatePod(pods[i], updatedPods[i]); err != nil {
13141314
t.Error(err)
13151315
}
13161316
}
13171317
}
13181318
updateSnapshot := func() operation {
1319-
return func() {
1319+
return func(t *testing.T) {
13201320
cache.UpdateSnapshot(snapshot)
13211321
if err := compareCacheWithNodeInfoSnapshot(t, cache, snapshot); err != nil {
13221322
t.Error(err)
@@ -1434,8 +1434,9 @@ func TestSchedulerCache_UpdateSnapshot(t *testing.T) {
14341434
{
14351435
name: "Remove node before its pods",
14361436
operations: []operation{
1437-
addNode(0), addNode(1), addPod(1), addPod(11),
1438-
removeNode(1), updatePod(1), updatePod(11), removePod(1), removePod(11),
1437+
addNode(0), addNode(1), addPod(1), addPod(11), updateSnapshot(),
1438+
removeNode(1), updateSnapshot(),
1439+
updatePod(1), updatePod(11), removePod(1), removePod(11),
14391440
},
14401441
expected: []*v1.Node{nodes[0]},
14411442
},
@@ -1471,7 +1472,7 @@ func TestSchedulerCache_UpdateSnapshot(t *testing.T) {
14711472
snapshot = NewEmptySnapshot()
14721473

14731474
for _, op := range test.operations {
1474-
op()
1475+
op(t)
14751476
}
14761477

14771478
if len(test.expected) != len(cache.nodes) {
@@ -1508,18 +1509,22 @@ func TestSchedulerCache_UpdateSnapshot(t *testing.T) {
15081509

15091510
func compareCacheWithNodeInfoSnapshot(t *testing.T, cache *schedulerCache, snapshot *Snapshot) error {
15101511
// Compare the map.
1511-
if len(snapshot.nodeInfoMap) != len(cache.nodes) {
1512-
return fmt.Errorf("unexpected number of nodes in the snapshot. Expected: %v, got: %v", len(cache.nodes), len(snapshot.nodeInfoMap))
1512+
if len(snapshot.nodeInfoMap) != cache.nodeTree.numNodes {
1513+
return fmt.Errorf("unexpected number of nodes in the snapshot. Expected: %v, got: %v", cache.nodeTree.numNodes, len(snapshot.nodeInfoMap))
15131514
}
15141515
for name, ni := range cache.nodes {
1515-
if !reflect.DeepEqual(snapshot.nodeInfoMap[name], ni.info) {
1516-
return fmt.Errorf("unexpected node info for node %q. Expected: %v, got: %v", name, ni.info, snapshot.nodeInfoMap[name])
1516+
want := ni.info
1517+
if want.Node() == nil {
1518+
want = nil
1519+
}
1520+
if !reflect.DeepEqual(snapshot.nodeInfoMap[name], want) {
1521+
return fmt.Errorf("unexpected node info for node %q.Expected:\n%v, got:\n%v", name, ni.info, snapshot.nodeInfoMap[name])
15171522
}
15181523
}
15191524

15201525
// Compare the lists.
1521-
if len(snapshot.nodeInfoList) != len(cache.nodes) {
1522-
return fmt.Errorf("unexpected number of nodes in NodeInfoList. Expected: %v, got: %v", len(cache.nodes), len(snapshot.nodeInfoList))
1526+
if len(snapshot.nodeInfoList) != cache.nodeTree.numNodes {
1527+
return fmt.Errorf("unexpected number of nodes in NodeInfoList. Expected: %v, got: %v", cache.nodeTree.numNodes, len(snapshot.nodeInfoList))
15231528
}
15241529

15251530
expectedNodeInfoList := make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)

pkg/scheduler/internal/cache/interface.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ limitations under the License.
1717
package cache
1818

1919
import (
20-
"k8s.io/api/core/v1"
20+
v1 "k8s.io/api/core/v1"
2121
framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
2222
)
2323

@@ -99,6 +99,8 @@ type Cache interface {
9999
// UpdateSnapshot updates the passed infoSnapshot to the current contents of Cache.
100100
// The node info contains aggregated information of pods scheduled (including assumed to be)
101101
// on this node.
102+
// The snapshot only includes Nodes that are not deleted at the time this function is called.
103+
// nodeinfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
102104
UpdateSnapshot(nodeSnapshot *Snapshot) error
103105

104106
// Dump produces a dump of the current cache.

0 commit comments

Comments
 (0)