Skip to content

Commit 960e398

Browse files
authored
Merge pull request kubernetes#127444 from dom4ha/fine-grained-qhints
Fine grain QueueHints for NodeAffinity plugin
2 parents 5ebd0da + c7db4bb commit 960e398

File tree

4 files changed

+70
-11
lines changed

4 files changed

+70
-11
lines changed

pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.Cluster
104104
// isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether
105105
// that change made a previously unschedulable pod schedulable.
106106
func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
107-
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
107+
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
108108
if err != nil {
109109
return framework.Queue, err
110110
}
@@ -119,16 +119,27 @@ func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1
119119
if err != nil {
120120
return framework.Queue, err
121121
}
122-
if isMatched {
123-
logger.V(4).Info("node was created or updated, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
122+
if !isMatched {
123+
logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
124+
return framework.QueueSkip, nil
125+
}
126+
// Since the node was added and it matches the pod's affinity criteria, we can unblock it.
127+
if originalNode == nil {
128+
logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
124129
return framework.Queue, nil
125130
}
126-
127-
// TODO: also check if the original node meets the pod's requestments once preCheck is completely removed.
128-
// See: https://github.com/kubernetes/kubernetes/issues/110175
129-
130-
logger.V(4).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
131-
return framework.QueueSkip, nil
131+
// At this point we know the operation is update so we can narrow down the criteria to unmatch -> match changes only
132+
// (necessary affinity label was added to the node in this case).
133+
wasMatched, err := requiredNodeAffinity.Match(originalNode)
134+
if err != nil {
135+
return framework.Queue, err
136+
}
137+
if wasMatched {
138+
logger.V(5).Info("node updated, but the pod's NodeAffinity hasn't changed", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
139+
return framework.QueueSkip, nil
140+
}
141+
logger.V(5).Info("node was updated and the pod's NodeAffinity changed to matched", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
142+
return framework.Queue, nil
132143
}
133144

134145
// PreFilter builds and writes cycle state used by Filter.

pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,13 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
12971297
newObj: st.MakeNode().Label("foo", "bar").Obj(),
12981298
expectedHint: framework.Queue,
12991299
},
1300+
"skip-unrelated-change-that-keeps-pod-schedulable": {
1301+
args: &config.NodeAffinityArgs{},
1302+
pod: podWithNodeAffinity.Obj(),
1303+
oldObj: st.MakeNode().Label("foo", "bar").Obj(),
1304+
newObj: st.MakeNode().Capacity(nil).Label("foo", "bar").Obj(),
1305+
expectedHint: framework.QueueSkip,
1306+
},
13001307
"skip-queue-on-add-scheduler-enforced-node-affinity": {
13011308
args: &config.NodeAffinityArgs{
13021309
AddedAffinity: &v1.NodeAffinity{

pkg/scheduler/framework/types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ const (
146146

147147
type ClusterEventWithHint struct {
148148
Event ClusterEvent
149-
// QueueingHintFn is executed for the plugin rejected by this plugin when the above Event happens,
149+
// QueueingHintFn is executed for the Pod rejected by this plugin when the above Event happens,
150150
// and filters out events to reduce useless retry of Pod's scheduling.
151151
// It's an optional field. If not set,
152152
// the scheduling of Pods will be always retried with backoff when this Event happens.

test/integration/scheduler/queue_test.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,48 @@ func TestCoreResourceEnqueue(t *testing.T) {
271271
// It causes pod1 to be requeued.
272272
// It causes pod2 not to be requeued.
273273
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node1").Label("group", "b").Obj(), metav1.UpdateOptions{}); err != nil {
274-
return fmt.Errorf("failed to remove taints off the node: %w", err)
274+
return fmt.Errorf("failed to update the pod: %w", err)
275+
}
276+
return nil
277+
},
278+
wantRequeuedPods: sets.New("pod1"),
279+
},
280+
{
281+
name: "Pod rejected by the NodeAffinity plugin is not requeued when an updated Node haven't changed the 'match' verdict",
282+
initialNodes: []*v1.Node{
283+
st.MakeNode().Name("node1").Label("group", "a").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(),
284+
st.MakeNode().Name("node2").Label("group", "b").Obj()},
285+
pods: []*v1.Pod{
286+
// - The initial pod would be accepted by the NodeAffinity plugin for node1, but will be blocked by the NodeResources plugin.
287+
// - The pod will be blocked by the NodeAffinity plugin for node2, therefore we know NodeAffinity will be queried for qhint for both testing nodes.
288+
st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"a"}, st.NodeSelectorTypeMatchExpressions).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Obj(),
289+
},
290+
triggerFn: func(testCtx *testutils.TestContext) error {
291+
// Trigger a NodeUpdate event to add new label.
292+
// It won't cause pod to be requeued, because there was a match already before the update, meaning this plugin wasn't blocking the scheduling.
293+
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("node1").Label("group", "a").Label("node", "fake-node").Obj(), metav1.UpdateOptions{}); err != nil {
294+
return fmt.Errorf("failed to update the pod: %w", err)
295+
}
296+
return nil
297+
},
298+
wantRequeuedPods: sets.Set[string]{},
299+
enableSchedulingQueueHint: []bool{true},
300+
},
301+
{
302+
name: "Pod rejected by the NodeAffinity plugin is requeued when a Node is added",
303+
initialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("group", "a").Obj()},
304+
pods: []*v1.Pod{
305+
// - Pod1 will be rejected by the NodeAffinity plugin.
306+
st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
307+
// - Pod2 will be rejected by the NodeAffinity plugin.
308+
st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"c"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
309+
},
310+
triggerFn: func(testCtx *testutils.TestContext) error {
311+
// Trigger a NodeAdd event with the awaited label.
312+
// It causes pod1 to be requeued.
313+
// It causes pod2 not to be requeued.
314+
if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("group", "b").Obj(), metav1.CreateOptions{}); err != nil {
315+
return fmt.Errorf("failed to update the pod: %w", err)
275316
}
276317
return nil
277318
},

0 commit comments

Comments
 (0)