Skip to content

Commit c7db4bb

Browse files
committed
Fine grain QueueHints for nodeaffinity plugin.
Skip queue on unrelated change that keeps pod schedulable when QueueHints are enabled. Split add from QHints disabled case Remove case when QHints are disabled Remove two GHint alternatives in unit tests more fine-grained Node QHint for NodeResourceFit plugin Return early when updated Node causes unmatch Revert "more fine-grained Node QHint for NodeResourceFit plugin" This reverts commit dfbceb6. Add integration test for requeue of a pod previously rejected by NodeAffinity plugin when a suitable Node is added Add integratin test for a Node update operation that does not trigger requeue in NodeAffinity plugin Remove innacurrate comment Apply review comments
1 parent 996e674 commit c7db4bb

File tree

4 files changed

+70
-11
lines changed

4 files changed

+70
-11
lines changed

pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.Cluster
104104
// isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether
105105
// that change made a previously unschedulable pod schedulable.
106106
func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
107-
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
107+
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
108108
if err != nil {
109109
return framework.Queue, err
110110
}
@@ -119,16 +119,27 @@ func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1
119119
if err != nil {
120120
return framework.Queue, err
121121
}
122-
if isMatched {
123-
logger.V(4).Info("node was created or updated, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
122+
if !isMatched {
123+
logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
124+
return framework.QueueSkip, nil
125+
}
126+
// Since the node was added and it matches the pod's affinity criteria, we can unblock it.
127+
if originalNode == nil {
128+
logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
124129
return framework.Queue, nil
125130
}
126-
127-
// TODO: also check if the original node meets the pod's requestments once preCheck is completely removed.
128-
// See: https://github.com/kubernetes/kubernetes/issues/110175
129-
130-
logger.V(4).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
131-
return framework.QueueSkip, nil
131+
// At this point we know the operation is update so we can narrow down the criteria to unmatch -> match changes only
132+
// (necessary affinity label was added to the node in this case).
133+
wasMatched, err := requiredNodeAffinity.Match(originalNode)
134+
if err != nil {
135+
return framework.Queue, err
136+
}
137+
if wasMatched {
138+
logger.V(5).Info("node updated, but the pod's NodeAffinity hasn't changed", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
139+
return framework.QueueSkip, nil
140+
}
141+
logger.V(5).Info("node was updated and the pod's NodeAffinity changed to matched", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
142+
return framework.Queue, nil
132143
}
133144

134145
// PreFilter builds and writes cycle state used by Filter.

pkg/scheduler/framework/plugins/nodeaffinity/node_affinity_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,13 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
12971297
newObj: st.MakeNode().Label("foo", "bar").Obj(),
12981298
expectedHint: framework.Queue,
12991299
},
1300+
"skip-unrelated-change-that-keeps-pod-schedulable": {
1301+
args: &config.NodeAffinityArgs{},
1302+
pod: podWithNodeAffinity.Obj(),
1303+
oldObj: st.MakeNode().Label("foo", "bar").Obj(),
1304+
newObj: st.MakeNode().Capacity(nil).Label("foo", "bar").Obj(),
1305+
expectedHint: framework.QueueSkip,
1306+
},
13001307
"skip-queue-on-add-scheduler-enforced-node-affinity": {
13011308
args: &config.NodeAffinityArgs{
13021309
AddedAffinity: &v1.NodeAffinity{

pkg/scheduler/framework/types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ const (
146146

147147
type ClusterEventWithHint struct {
148148
Event ClusterEvent
149-
// QueueingHintFn is executed for the plugin rejected by this plugin when the above Event happens,
149+
// QueueingHintFn is executed for the Pod rejected by this plugin when the above Event happens,
150150
// and filters out events to reduce useless retry of Pod's scheduling.
151151
// It's an optional field. If not set,
152152
// the scheduling of Pods will be always retried with backoff when this Event happens.

test/integration/scheduler/queue_test.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,48 @@ func TestCoreResourceEnqueue(t *testing.T) {
271271
// It causes pod1 to be requeued.
272272
// It causes pod2 not to be requeued.
273273
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("fake-node1").Label("group", "b").Obj(), metav1.UpdateOptions{}); err != nil {
274-
return fmt.Errorf("failed to remove taints off the node: %w", err)
274+
return fmt.Errorf("failed to update the pod: %w", err)
275+
}
276+
return nil
277+
},
278+
wantRequeuedPods: sets.New("pod1"),
279+
},
280+
{
281+
name: "Pod rejected by the NodeAffinity plugin is not requeued when an updated Node haven't changed the 'match' verdict",
282+
initialNodes: []*v1.Node{
283+
st.MakeNode().Name("node1").Label("group", "a").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(),
284+
st.MakeNode().Name("node2").Label("group", "b").Obj()},
285+
pods: []*v1.Pod{
286+
// - The initial pod would be accepted by the NodeAffinity plugin for node1, but will be blocked by the NodeResources plugin.
287+
// - The pod will be blocked by the NodeAffinity plugin for node2, therefore we know NodeAffinity will be queried for qhint for both testing nodes.
288+
st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"a"}, st.NodeSelectorTypeMatchExpressions).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Container("image").Obj(),
289+
},
290+
triggerFn: func(testCtx *testutils.TestContext) error {
291+
// Trigger a NodeUpdate event to add new label.
292+
// It won't cause pod to be requeued, because there was a match already before the update, meaning this plugin wasn't blocking the scheduling.
293+
if _, err := testCtx.ClientSet.CoreV1().Nodes().Update(testCtx.Ctx, st.MakeNode().Name("node1").Label("group", "a").Label("node", "fake-node").Obj(), metav1.UpdateOptions{}); err != nil {
294+
return fmt.Errorf("failed to update the pod: %w", err)
295+
}
296+
return nil
297+
},
298+
wantRequeuedPods: sets.Set[string]{},
299+
enableSchedulingQueueHint: []bool{true},
300+
},
301+
{
302+
name: "Pod rejected by the NodeAffinity plugin is requeued when a Node is added",
303+
initialNodes: []*v1.Node{st.MakeNode().Name("fake-node1").Label("group", "a").Obj()},
304+
pods: []*v1.Pod{
305+
// - Pod1 will be rejected by the NodeAffinity plugin.
306+
st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
307+
// - Pod2 will be rejected by the NodeAffinity plugin.
308+
st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"c"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
309+
},
310+
triggerFn: func(testCtx *testutils.TestContext) error {
311+
// Trigger a NodeAdd event with the awaited label.
312+
// It causes pod1 to be requeued.
313+
// It causes pod2 not to be requeued.
314+
if _, err := testCtx.ClientSet.CoreV1().Nodes().Create(testCtx.Ctx, st.MakeNode().Name("fake-node2").Label("group", "b").Obj(), metav1.CreateOptions{}); err != nil {
315+
return fmt.Errorf("failed to update the pod: %w", err)
275316
}
276317
return nil
277318
},

0 commit comments

Comments
 (0)