Commit 421436a

Merge pull request kubernetes#127473 from dom4ha/fine-grain-qhints-fit
feature(scheduler): more fine-grained Node QHint for NodeResourceFit plugin
2 parents c89205f + 903b1f7 commit 421436a
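To make the change easier to follow before reading the per-file diffs, here is a minimal, self-contained Go sketch of the decision flow that isSchedulableAfterNodeChange now implements. The types below (podRequest, nodeResources, queueingHint) and the CPU/memory-only resource set are illustrative stand-ins, not the scheduler framework's actual API; the point is only that a node update which doesn't increase any requested resource now maps to QueueSkip, while a node add or an increase in a requested resource maps to Queue.

package main

import "fmt"

// queueingHint mirrors the scheduler's Queue / QueueSkip outcome (simplified stand-in,
// not the real framework.QueueingHint type).
type queueingHint string

const (
	queue     queueingHint = "Queue"
	queueSkip queueingHint = "QueueSkip"
)

// nodeResources is a toy stand-in for a node's allocatable resources.
type nodeResources struct {
	MilliCPU, Memory int64
}

// podRequest is a toy stand-in for the pod's resource request.
type podRequest struct {
	MilliCPU, Memory int64
}

func fits(p podRequest, n nodeResources) bool {
	return p.MilliCPU <= n.MilliCPU && p.Memory <= n.Memory
}

// hintAfterNodeChange reproduces the shape of the new logic: skip if the pod still
// doesn't fit, queue on a node add, and on a node update queue only if a requested
// resource actually increased.
func hintAfterNodeChange(p podRequest, original *nodeResources, modified nodeResources) queueingHint {
	if !fits(p, modified) {
		return queueSkip // the pod won't fit the modified node anyway
	}
	if original == nil {
		return queue // node add: unblock scheduling
	}
	increased := (p.MilliCPU > 0 && modified.MilliCPU > original.MilliCPU) ||
		(p.Memory > 0 && modified.Memory > original.Memory)
	if !increased {
		return queueSkip // unrelated update: the fit assessment can't have changed
	}
	return queue
}

func main() {
	pod := podRequest{MilliCPU: 4000}
	old := nodeResources{MilliCPU: 4000, Memory: 0}
	// Memory-only bump: the pod fits, but no requested resource increased -> QueueSkip.
	fmt.Println(hintAfterNodeChange(pod, &old, nodeResources{MilliCPU: 4000, Memory: 4000}))
	// CPU bump: a requested resource increased -> Queue.
	fmt.Println(hintAfterNodeChange(pod, &old, nodeResources{MilliCPU: 5000, Memory: 0}))
}

In the real plugin the same check is made against NodeInfo.Allocatable and also covers ephemeral storage, scalar (extended) resources, and the allowed pod number, as the fit.go diff below shows.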

File tree

3 files changed: +283 -16 lines changed


pkg/scheduler/framework/plugins/noderesources/fit.go

Lines changed: 55 additions & 8 deletions
@@ -363,21 +363,68 @@ func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod
 }
 
 // isSchedulableAfterNodeChange is invoked whenever a node added or changed. It checks whether
-// that change made a previously unschedulable pod schedulable.
+// that change could make a previously unschedulable pod schedulable.
 func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
-    _, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
+    originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
     if err != nil {
         return framework.Queue, err
     }
-    // TODO: also check if the original node meets the pod's resource requestments once preCheck is completely removed.
-    // See: https://github.com/kubernetes/kubernetes/issues/110175
-    if isFit(pod, modifiedNode) {
-        logger.V(5).Info("node was updated, and may fit with the pod's resource requestments", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+    // Leave the pod in the queue, since it won't fit into the modified node anyway.
+    if !isFit(pod, modifiedNode) {
+        logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+        return framework.QueueSkip, nil
+    }
+    // The pod will fit, so since it's a node add, unblock scheduling.
+    if originalNode == nil {
+        logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
         return framework.Queue, nil
     }
+    // The pod will fit, but since there was no increase in available resources, the change won't make the pod schedulable.
+    if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode) {
+        logger.V(5).Info("node was updated, but the update hasn't changed whether the pod's resource requests fit", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+        return framework.QueueSkip, nil
+    }
+
+    logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+    return framework.Queue, nil
+}
+
+// haveAnyRequestedResourcesIncreased returns true if any of the resources requested by the pod have increased, or if the allowed pod number has increased.
+func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node) bool {
+    podRequest := computePodResourceRequest(pod)
+    originalNodeInfo := framework.NewNodeInfo()
+    originalNodeInfo.SetNode(originalNode)
+    modifiedNodeInfo := framework.NewNodeInfo()
+    modifiedNodeInfo.SetNode(modifiedNode)
+
+    if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
+        return true
+    }
+
+    if podRequest.MilliCPU == 0 &&
+        podRequest.Memory == 0 &&
+        podRequest.EphemeralStorage == 0 &&
+        len(podRequest.ScalarResources) == 0 {
+        return false
+    }
+
+    if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
+        (podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
+        (podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
+        return true
+    }
+
+    for rName, rQuant := range podRequest.ScalarResources {
+        // Skip in case the request quantity is zero.
+        if rQuant == 0 {
+            continue
+        }
 
-    logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
-    return framework.QueueSkip, nil
+        if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
+            return true
+        }
+    }
+    return false
 }
 
 // isFit checks if the pod fits the node. If the node is nil, it returns false.
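One detail of the scalar-resource loop above worth calling out: it relies on Go's zero-value map lookups, so a resource missing from the original node's Allocatable.ScalarResources reads as 0, and any positive quantity on the modified node counts as an increase, but only for resources the pod actually requests. A small stand-alone sketch of that behavior, using plain map[string]int64 values and hypothetical resource names rather than the framework's Resource type:

package main

import "fmt"

// anyRequestedScalarIncreased illustrates (outside the scheduler) the property the
// ScalarResources loop relies on: a missing key in a Go map reads as the zero value,
// so a resource absent from the original node is treated as quantity 0.
func anyRequestedScalarIncreased(requested, original, modified map[string]int64) bool {
	for name, quantity := range requested {
		if quantity == 0 {
			continue // zero-quantity requests are ignored, as in the plugin
		}
		// original[name] is 0 when the resource didn't exist on the original node.
		if modified[name] > original[name] {
			return true
		}
	}
	return false
}

func main() {
	requested := map[string]int64{"example.com/gpu": 1}
	original := map[string]int64{}                     // resource not present before the update
	modified := map[string]int64{"example.com/gpu": 2} // added by the node update
	fmt.Println(anyRequestedScalarIncreased(requested, original, modified)) // true
	// An increase in a resource the pod doesn't request changes nothing.
	fmt.Println(anyRequestedScalarIncreased(requested, original, map[string]int64{"example.com/fpga": 4})) // false
}

This is also why the new "skip-queue-on-node-unrelated-changes" test case below adds extendedResourceB to the modified node and still expects QueueSkip: the pod doesn't request that resource, so its increase is ignored.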

pkg/scheduler/framework/plugins/noderesources/fit_test.go

Lines changed: 164 additions & 8 deletions
@@ -1276,14 +1276,37 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
             }).Obj(),
             expectedHint: framework.Queue,
         },
-        // uncomment this case when the isSchedulableAfterNodeChange also check the
-        // original node's resources.
-        // "skip-queue-on-node-unrelated-changes": {
-        //     pod:          &v1.Pod{},
-        //     oldObj:       st.MakeNode().Obj(),
-        //     newObj:       st.MakeNode().Label("foo", "bar").Obj(),
-        //     expectedHint: framework.QueueSkip,
-        // },
+        "skip-queue-on-node-unrelated-changes": {
+            pod: newResourcePod(framework.Resource{
+                Memory: 2,
+                ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
+            }),
+            oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceMemory: "2",
+                extendedResourceA: "2",
+            }).Obj(),
+            newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceMemory: "2",
+                extendedResourceA: "1",
+                extendedResourceB: "2",
+            }).Obj(),
+            expectedHint: framework.QueueSkip,
+        },
+        "queue-on-pod-requested-resources-increase": {
+            pod: newResourcePod(framework.Resource{
+                Memory: 2,
+                ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
+            }),
+            oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceMemory: "2",
+                extendedResourceA: "1",
+            }).Obj(),
+            newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceMemory: "2",
+                extendedResourceA: "2",
+            }).Obj(),
+            expectedHint: framework.Queue,
+        },
         "skip-queue-on-node-changes-from-suitable-to-unsuitable": {
             pod: newResourcePod(framework.Resource{
                 Memory: 2,
@@ -1364,3 +1387,136 @@ func TestIsFit(t *testing.T) {
         })
     }
 }
+
+func TestHaveAnyRequestedResourcesIncreased(t *testing.T) {
+    testCases := map[string]struct {
+        pod          *v1.Pod
+        originalNode *v1.Node
+        modifiedNode *v1.Node
+        expected     bool
+    }{
+        "no-requested-resources": {
+            pod: newResourcePod(framework.Resource{}),
+            originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourcePods: "1",
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "1",
+                v1.ResourceEphemeralStorage: "1",
+                extendedResourceA: "1",
+            }).Obj(),
+            modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourcePods: "1",
+                v1.ResourceCPU: "2",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "2",
+                extendedResourceA: "2",
+            }).Obj(),
+            expected: false,
+        },
+        "no-requested-resources-pods-increased": {
+            pod: newResourcePod(framework.Resource{}),
+            originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourcePods: "1",
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "1",
+                v1.ResourceEphemeralStorage: "1",
+                extendedResourceA: "1",
+            }).Obj(),
+            modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourcePods: "2",
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "1",
+                v1.ResourceEphemeralStorage: "1",
+                extendedResourceA: "1",
+            }).Obj(),
+            expected: true,
+        },
+        "requested-resources-decreased": {
+            pod: newResourcePod(framework.Resource{
+                MilliCPU: 2,
+                Memory: 2,
+                EphemeralStorage: 2,
+                ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
+            }),
+            originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "3",
+                extendedResourceA: "4",
+            }).Obj(),
+            modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "1",
+                extendedResourceA: "1",
+            }).Obj(),
+            expected: false,
+        },
+        "requested-resources-increased": {
+            pod: newResourcePod(framework.Resource{
+                MilliCPU: 2,
+                Memory: 2,
+                EphemeralStorage: 2,
+                ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
+            }),
+            originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "3",
+                extendedResourceA: "4",
+            }).Obj(),
+            modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "3",
+                v1.ResourceMemory: "4",
+                v1.ResourceEphemeralStorage: "3",
+                extendedResourceA: "4",
+            }).Obj(),
+            expected: true,
+        },
+        "non-requested-resources-decreased": {
+            pod: newResourcePod(framework.Resource{
+                MilliCPU: 2,
+                Memory: 2,
+            }),
+            originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "3",
+                extendedResourceA: "4",
+            }).Obj(),
+            modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "1",
+                extendedResourceA: "1",
+            }).Obj(),
+            expected: false,
+        },
+        "non-requested-resources-increased": {
+            pod: newResourcePod(framework.Resource{
+                MilliCPU: 2,
+                Memory: 2,
+            }),
+            originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "3",
+                extendedResourceA: "4",
+            }).Obj(),
+            modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+                v1.ResourceCPU: "1",
+                v1.ResourceMemory: "2",
+                v1.ResourceEphemeralStorage: "5",
+                extendedResourceA: "6",
+            }).Obj(),
+            expected: false,
+        },
+    }
+    for name, tc := range testCases {
+        t.Run(name, func(t *testing.T) {
+            if got := haveAnyRequestedResourcesIncreased(tc.pod, tc.originalNode, tc.modifiedNode); got != tc.expected {
+                t.Errorf("expected: %v, got: %v", tc.expected, got)
+            }
+        })
+    }
+}

test/integration/scheduler/queue_test.go

Lines changed: 64 additions & 0 deletions
@@ -370,6 +370,70 @@ func TestCoreResourceEnqueue(t *testing.T) {
             },
             wantRequeuedPods: sets.New("pod1"),
         },
+        {
+            name: "Pod rejected by the NodeResourcesFit plugin isn't requeued when a Node is updated without increase in the requested resources",
+            initialNodes: []*v1.Node{
+                st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
+                st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
+            },
+            pods: []*v1.Pod{
+                // - Pod1 requests the available amount of CPU (on fake-node1), but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin will register QHints because it rejected fake-node2.
+                st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+            },
+            triggerFn: func(testCtx *testutils.TestContext) error {
+                // Trigger a NodeUpdate event that increases the unrelated (not requested) memory capacity of fake-node1, which should not requeue Pod1.
+                if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4", v1.ResourceMemory: "4000"}).Obj(), metav1.UpdateOptions{}); err != nil {
+                    return fmt.Errorf("failed to update fake-node: %w", err)
+                }
+                return nil
+            },
+            wantRequeuedPods: sets.Set[string]{},
+            enableSchedulingQueueHint: []bool{true},
+        },
+        {
+            name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the requested resources",
+            initialNodes: []*v1.Node{
+                st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
+                st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
+            },
+            pods: []*v1.Pod{
+                // - Pod1 requests the available amount of CPU (on fake-node1), but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin will register QHints because it rejected fake-node2.
+                st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+            },
+            triggerFn: func(testCtx *testutils.TestContext) error {
+                // Trigger a NodeUpdate event that increases the requested CPU capacity of fake-node1, which should requeue Pod1.
+                if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "5"}).Obj(), metav1.UpdateOptions{}); err != nil {
+                    return fmt.Errorf("failed to update fake-node: %w", err)
+                }
+                return nil
+            },
+            wantRequeuedPods: sets.New("pod1"),
+            enableSchedulingQueueHint: []bool{true},
+        },
+        {
+            name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the allowed pods number",
+            initialNodes: []*v1.Node{
+                st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "2"}).Obj(),
+                st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "1"}).Label("group", "b").Obj(),
+            },
+            initialPods: []*v1.Pod{
+                // - Pod1 will be scheduled on fake-node2 because of the affinity label.
+                st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Node("fake-node2").Obj(),
+            },
+            pods: []*v1.Pod{
+                // - Pod2 is unschedulable because Pod1 saturated the ResourcePods limit of fake-node2. Note that the NodeResourcesFit plugin will register QHints because it rejected fake-node2.
+                st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+            },
+            triggerFn: func(testCtx *testutils.TestContext) error {
+                // Trigger a NodeUpdate event that increases the allowed Pods number of fake-node1, which should requeue Pod2.
+                if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "3"}).Obj(), metav1.UpdateOptions{}); err != nil {
+                    return fmt.Errorf("failed to update fake-node: %w", err)
+                }
+                return nil
+            },
+            wantRequeuedPods: sets.New("pod2"),
+            enableSchedulingQueueHint: []bool{true},
+        },
         {
             name: "Updating pod condition doesn't retry scheduling if the Pod was rejected by TaintToleration",
             initialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj()},
