Skip to content

Commit 04bba3c

Browse files
authored
Merge pull request kubernetes#127427 from sanposhiho/bug-nodeunsched
bug(nodeunschedulable): register missing Pod event for NodeUnschedulable plugin
2 parents b005b40 + 4959819 commit 04bba3c

File tree

3 files changed

+111
-4
lines changed

3 files changed

+111
-4
lines changed

pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,56 @@ const (
5151
// EventsToRegister returns the possible events that may make a Pod
5252
// failed by this plugin schedulable.
5353
func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
54-
// preCheck is not used when QHint is enabled.
54+
if !pl.enableSchedulingQueueHint {
55+
return []framework.ClusterEventWithHint{
56+
// A note about UpdateNodeLabel event:
57+
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
58+
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
59+
// See: https://github.com/kubernetes/kubernetes/issues/109437
60+
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
61+
}, nil
62+
}
63+
5564
return []framework.ClusterEventWithHint{
65+
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeLabel event.
5666
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
67+
// When the QueueingHint feature is enabled,
68+
// the scheduling queue uses Pod/Update Queueing Hint
69+
// to determine whether a Pod's update makes the Pod schedulable or not.
70+
// https://github.com/kubernetes/kubernetes/pull/122234
71+
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
5772
}, nil
5873
}
5974

75+
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed.
76+
func (pl *NodeUnschedulable) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
77+
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
78+
if err != nil {
79+
return framework.Queue, err
80+
}
81+
82+
if pod.UID == modifiedPod.UID {
83+
// Note: we don't need to check oldPod tolerations the taint because:
84+
// - Taint can be added, but can't be modified nor removed.
85+
// - If the Pod already has the toleration, it shouldn't have rejected by this plugin in the first place.
86+
// Meaning, here this Pod has been rejected by this plugin, and hence it shouldn't have the toleration yet.
87+
if v1helper.TolerationsTolerateTaint(modifiedPod.Spec.Tolerations, &v1.Taint{
88+
Key: v1.TaintNodeUnschedulable,
89+
Effect: v1.TaintEffectNoSchedule,
90+
}) {
91+
// This update makes the pod tolerate the unschedulable taint.
92+
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
93+
return framework.Queue, nil
94+
}
95+
logger.V(5).Info("a new toleration is added for the unschedulable Pod, but it's an unrelated toleration", "pod", klog.KObj(modifiedPod))
96+
return framework.QueueSkip, nil
97+
}
98+
99+
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
100+
101+
return framework.QueueSkip, nil
102+
}
103+
60104
// isSchedulableAfterNodeChange is invoked for all node events reported by
61105
// an informer. It checks whether that change made a previously unschedulable
62106
// pod schedulable.

pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable_test.go

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
v1 "k8s.io/api/core/v1"
2424
"k8s.io/kubernetes/pkg/scheduler/framework"
2525
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
26+
st "k8s.io/kubernetes/pkg/scheduler/testing"
2627
"k8s.io/kubernetes/test/utils/ktesting"
2728
)
2829

@@ -96,14 +97,14 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) {
9697
expectedErr bool
9798
}{
9899
{
99-
name: "backoff-wrong-new-object",
100+
name: "error-wrong-new-object",
100101
pod: &v1.Pod{},
101102
newObj: "not-a-node",
102103
expectedHint: framework.Queue,
103104
expectedErr: true,
104105
},
105106
{
106-
name: "backoff-wrong-old-object",
107+
name: "error-wrong-old-object",
107108
pod: &v1.Pod{},
108109
newObj: &v1.Node{
109110
Spec: v1.NodeSpec{
@@ -207,3 +208,64 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) {
207208
})
208209
}
209210
}
211+
212+
func TestIsSchedulableAfterPodTolerationChange(t *testing.T) {
213+
testCases := []struct {
214+
name string
215+
pod *v1.Pod
216+
oldObj, newObj interface{}
217+
expectedHint framework.QueueingHint
218+
expectedErr bool
219+
}{
220+
{
221+
name: "error-wrong-new-object",
222+
pod: &v1.Pod{},
223+
newObj: "not-a-pod",
224+
expectedHint: framework.Queue,
225+
expectedErr: true,
226+
},
227+
{
228+
name: "error-wrong-old-object",
229+
pod: &v1.Pod{},
230+
newObj: &v1.Pod{},
231+
oldObj: "not-a-pod",
232+
expectedHint: framework.Queue,
233+
expectedErr: true,
234+
},
235+
{
236+
name: "skip-different-pod-update",
237+
pod: st.MakePod().UID("uid").Obj(),
238+
newObj: st.MakePod().UID("different-uid").Toleration(v1.TaintNodeUnschedulable).Obj(),
239+
oldObj: st.MakePod().UID("different-uid").Obj(),
240+
expectedHint: framework.QueueSkip,
241+
},
242+
{
243+
name: "queue-when-the-unsched-pod-gets-toleration",
244+
pod: st.MakePod().UID("uid").Obj(),
245+
newObj: st.MakePod().UID("uid").Toleration(v1.TaintNodeUnschedulable).Obj(),
246+
oldObj: st.MakePod().UID("uid").Obj(),
247+
expectedHint: framework.Queue,
248+
},
249+
{
250+
name: "skip-when-the-unsched-pod-gets-unrelated-toleration",
251+
pod: st.MakePod().UID("uid").Obj(),
252+
newObj: st.MakePod().UID("uid").Toleration("unrelated-key").Obj(),
253+
oldObj: st.MakePod().UID("uid").Obj(),
254+
expectedHint: framework.QueueSkip,
255+
},
256+
}
257+
258+
for _, testCase := range testCases {
259+
t.Run(testCase.name, func(t *testing.T) {
260+
logger, _ := ktesting.NewTestContext(t)
261+
pl := &NodeUnschedulable{}
262+
got, err := pl.isSchedulableAfterPodTolerationChange(logger, testCase.pod, testCase.oldObj, testCase.newObj)
263+
if err != nil && !testCase.expectedErr {
264+
t.Errorf("unexpected error: %v", err)
265+
}
266+
if got != testCase.expectedHint {
267+
t.Errorf("isSchedulableAfterNodeChange() = %v, want %v", got, testCase.expectedHint)
268+
}
269+
})
270+
}
271+
}

pkg/scheduler/framework/types.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ const (
6969
UpdatePodLabel
7070
// UpdatePodScaleDown is an update for pod's scale down (i.e., any resource request is reduced).
7171
UpdatePodScaleDown
72-
// UpdatePodTolerations is an update for pod's tolerations.
72+
// UpdatePodTolerations is an addition for pod's tolerations.
73+
// (Due to API validation, we can add, but cannot modify or remove tolerations.)
7374
UpdatePodTolerations
7475
// UpdatePodSchedulingGatesEliminated is an update for pod's scheduling gates, which eliminates all scheduling gates in the Pod.
7576
UpdatePodSchedulingGatesEliminated

0 commit comments

Comments
 (0)