
Commit aa73f31

Merge pull request kubernetes#122292 from sanposhiho/nodeupdate

register Node/UpdateTaint event to plugins which has Node/Add only and doesn't have Node/UpdateTaint

2 parents: 065a0f2 + 2b56de4

13 files changed, +267 −92 lines

pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go

Lines changed: 10 additions & 1 deletion
@@ -396,7 +396,16 @@ func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint
 	{Event: framework.ClusterEvent{Resource: framework.PodSchedulingContext, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPodSchedulingContextChange},
 	// A resource might depend on node labels for topology filtering.
 	// A new or updated node may make pods schedulable.
-	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+	//
+	// A note about UpdateNodeTaint event:
+	// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+	// As a common problematic scenario,
+	// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+	// In such cases, this plugin may miss some events that actually make pods schedulable.
+	// As a workaround, we add UpdateNodeTaint event to catch the case.
+	// We can remove UpdateNodeTaint when we remove the preCheck feature.
+	// See: https://github.com/kubernetes/kubernetes/issues/110175
+	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 	// A pod might be waiting for a class to get created or modified.
 	{Event: framework.ClusterEvent{Resource: framework.ResourceClass, ActionType: framework.Add | framework.Update}},
 }
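The comment added to each plugin refers to a concrete failure mode. Below is a minimal, hedged sketch of that scenario, not part of the commit (the taint key and effect are assumptions): a node joins the cluster while still tainted as not ready, preCheck filters the NodeAdd event out, and the only event the scheduler later observes for the node becoming usable is the taint removal, i.e. UpdateNodeTaint.

// Illustrative only: the "node added but not ready" case described in the
// comments above. The taint key/effect below are assumptions for the sketch.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// The node as it first appears: tainted, so preCheck treats it as
	// unusable and the NodeAdd event never reaches the plugins' hints.
	added := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "node-1"},
		Spec: v1.NodeSpec{
			Taints: []v1.Taint{{
				Key:    v1.TaintNodeNotReady, // "node.kubernetes.io/not-ready"
				Effect: v1.TaintEffectNoSchedule,
			}},
		},
	}

	// Later the taint is removed. The scheduler sees this transition only as
	// an UpdateNodeTaint event, which is why the plugins now register for it.
	ready := added.DeepCopy()
	ready.Spec.Taints = nil

	fmt.Printf("taints before: %d, after: %d\n", len(added.Spec.Taints), len(ready.Spec.Taints))
}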

pkg/scheduler/framework/plugins/interpodaffinity/plugin.go

Lines changed: 10 additions & 1 deletion
@@ -64,8 +64,17 @@ func (pl *InterPodAffinity) EventsToRegister() []framework.ClusterEventWithHint
 	// an unschedulable Pod schedulable.
 	// - Add. An unschedulable Pod may fail due to violating pod-affinity constraints,
 	// adding an assigned Pod may make it schedulable.
+	//
+	// A note about UpdateNodeTaint event:
+	// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+	// As a common problematic scenario,
+	// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+	// In such cases, this plugin may miss some events that actually make pods schedulable.
+	// As a workaround, we add UpdateNodeTaint event to catch the case.
+	// We can remove UpdateNodeTaint when we remove the preCheck feature.
+	// See: https://github.com/kubernetes/kubernetes/issues/110175
 	{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}},
-	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 	}
 }

pkg/scheduler/framework/plugins/podtopologyspread/plugin.go

Lines changed: 10 additions & 1 deletion
@@ -146,7 +146,16 @@ func (pl *PodTopologySpread) EventsToRegister() []framework.ClusterEventWithHint
 	{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}, QueueingHintFn: pl.isSchedulableAfterPodChange},
 	// Node add|delete|update maybe lead an topology key changed,
 	// and make these pod in scheduling schedulable or unschedulable.
-	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.Update}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
+	//
+	// A note about UpdateNodeTaint event:
+	// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+	// As a common problematic scenario,
+	// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+	// In such cases, this plugin may miss some events that actually make pods schedulable.
+	// As a workaround, we add UpdateNodeTaint event to catch the case.
+	// We can remove UpdateNodeTaint when we remove the preCheck feature.
+	// See: https://github.com/kubernetes/kubernetes/issues/110175
+	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.UpdateNodeLabel | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
 	}
 }

pkg/scheduler/framework/plugins/volumebinding/volume_binding.go

Lines changed: 10 additions & 1 deletion
@@ -103,7 +103,16 @@ func (pl *VolumeBinding) EventsToRegister() []framework.ClusterEventWithHint {
 	// Pods may fail to find available PVs because the node labels do not
 	// match the storage class's allowed topologies or PV's node affinity.
 	// A new or updated node may make pods schedulable.
-	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+	//
+	// A note about UpdateNodeTaint event:
+	// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+	// As a common problematic scenario,
+	// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+	// In such cases, this plugin may miss some events that actually make pods schedulable.
+	// As a workaround, we add UpdateNodeTaint event to catch the case.
+	// We can remove UpdateNodeTaint when we remove the preCheck feature.
+	// See: https://github.com/kubernetes/kubernetes/issues/110175
+	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 	// We rely on CSI node to translate in-tree PV to CSI.
 	{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}},
 	// When CSIStorageCapacity is enabled, pods may become schedulable

pkg/scheduler/framework/plugins/volumezone/volume_zone.go

Lines changed: 10 additions & 1 deletion
@@ -280,7 +280,16 @@ func (pl *VolumeZone) EventsToRegister() []framework.ClusterEventWithHint {
 	// Due to immutable field `storageClass.volumeBindingMode`, storageClass update events are ignored.
 	{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add}},
 	// A new node or updating a node's volume zone labels may make a pod schedulable.
-	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+	//
+	// A note about UpdateNodeTaint event:
+	// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+	// As a common problematic scenario,
+	// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
+	// In such cases, this plugin may miss some events that actually make pods schedulable.
+	// As a workaround, we add UpdateNodeTaint event to catch the case.
+	// We can remove UpdateNodeTaint when we remove the preCheck feature.
+	// See: https://github.com/kubernetes/kubernetes/issues/110175
+	{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
 	// A new pvc may make a pod schedulable.
 	// Due to fields are immutable except `spec.resources`, pvc update events are ignored.
 	{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}},

pkg/scheduler/framework/types.go

Lines changed: 10 additions & 1 deletion
@@ -72,7 +72,16 @@ const (
 	// - a Pod that is deleted
 	// - a Pod that was assumed, but gets un-assumed due to some errors in the binding cycle.
 	// - an existing Pod that was unscheduled but gets scheduled to a Node.
-	Pod GVK = "Pod"
+	Pod GVK = "Pod"
+	// A note about NodeAdd event and UpdateNodeTaint event:
+	// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
+	// It's definitely not something expected for plugin developers,
+	// and registering UpdateNodeTaint event is the only mitigation for now.
+	// So, kube-scheduler registers UpdateNodeTaint event for plugins that have NodeAdded event, but don't have UpdateNodeTaint event.
+	// It has a bad impact on requeuing efficiency, but it's a lot better than some Pods being stuck in the
+	// unschedulable pod pool.
+	// This behavior will be removed when we remove the preCheck feature.
+	// See: https://github.com/kubernetes/kubernetes/issues/110175
 	Node GVK = "Node"
 	PersistentVolume GVK = "PersistentVolume"
 	PersistentVolumeClaim GVK = "PersistentVolumeClaim"
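For readers unfamiliar with how these registrations are matched, here is a hedged sketch (not part of the commit): ActionType is a bit mask, so a registration such as Add | UpdateNodeLabel | UpdateNodeTaint matches any incoming event whose bit is set.

// Hedged sketch: ActionType values are bit flags, so combined registrations
// are matched with a bitwise AND. Not part of the commit.
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/scheduler/framework"
)

func main() {
	registered := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint

	// An incoming UpdateNodeTaint event intersects the registered mask,
	// so the plugin is considered when requeueing unschedulable pods.
	fmt.Println(registered&framework.UpdateNodeTaint != 0) // true

	// An UpdateNodeAllocatable event does not, so the plugin is skipped.
	fmt.Println(registered&framework.UpdateNodeAllocatable != 0) // false
}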

pkg/scheduler/internal/queue/scheduling_queue.go

Lines changed: 13 additions & 0 deletions
@@ -118,6 +118,7 @@ type SchedulingQueue interface {
 	AssignedPodAdded(logger klog.Logger, pod *v1.Pod)
 	AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod)
 	PendingPods() ([]*v1.Pod, string)
+	PodsInActiveQ() []*v1.Pod
 	// Close closes the SchedulingQueue so that the goroutine which is
 	// waiting to pop items can exit gracefully.
 	Close()
@@ -1227,6 +1228,18 @@ func (p *PriorityQueue) getUnschedulablePodsWithMatchingAffinityTerm(logger klog
 	return podsToMove
 }
 
+// PodsInActiveQ returns all the Pods in the activeQ.
+// This function is only used in tests.
+func (p *PriorityQueue) PodsInActiveQ() []*v1.Pod {
+	p.lock.RLock()
+	defer p.lock.RUnlock()
+	var result []*v1.Pod
+	for _, pInfo := range p.activeQ.List() {
+		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
+	}
+	return result
+}
+
 var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"
 
 // PendingPods returns all the pending pods in the queue; accompanied by a debugging string
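PodsInActiveQ is added for tests; a hedged sketch of the kind of helper a test could build on top of it follows (the function and package name below are illustrative, not from the commit).

// Illustrative test helper, not part of the commit: report whether a pod has
// been moved (back) into the activeQ, e.g. after an UpdateNodeTaint event.
package queuetest

import (
	internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
)

func podIsInActiveQ(q internalqueue.SchedulingQueue, podName string) bool {
	for _, p := range q.PodsInActiveQ() {
		if p.Name == podName {
			return true
		}
	}
	return false
}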

pkg/scheduler/schedule_one.go

Lines changed: 2 additions & 2 deletions
@@ -62,8 +62,8 @@ const (
 	numberOfHighestScoredNodesToReport = 3
 )
 
-// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
-func (sched *Scheduler) scheduleOne(ctx context.Context) {
+// ScheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
+func (sched *Scheduler) ScheduleOne(ctx context.Context) {
 	logger := klog.FromContext(ctx)
 	podInfo, err := sched.NextPod(logger)
 	if err != nil {
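The only change here is the rename from scheduleOne to ScheduleOne, exporting the method so code outside the package (notably tests) can drive one scheduling cycle at a time. A hedged sketch of what that enables (package and function names below are illustrative):

// Illustrative only: with ScheduleOne exported, a test in another package can
// schedule exactly one pod per call instead of starting the full Run loop.
package schedtest

import (
	"context"

	"k8s.io/kubernetes/pkg/scheduler"
)

func runOneCycle(ctx context.Context, sched *scheduler.Scheduler) {
	// Pops the next pod from activeQ and runs one scheduling/binding cycle.
	sched.ScheduleOne(ctx)
}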

pkg/scheduler/schedule_one_test.go

Lines changed: 7 additions & 7 deletions
@@ -811,7 +811,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	sched.scheduleOne(ctx)
+	sched.ScheduleOne(ctx)
 	<-called
 	if e, a := item.expectAssumedPod, gotAssumedPod; !reflect.DeepEqual(e, a) {
 		t.Errorf("assumed pod: wanted %v, got %v", e, a)
@@ -884,7 +884,7 @@ func TestSchedulerNoPhantomPodAfterExpire(t *testing.T) {
 	// We use conflicted pod ports to incur fit predicate failure if first pod not removed.
 	secondPod := podWithPort("bar", "", 8080)
 	queuedPodStore.Add(secondPod)
-	scheduler.scheduleOne(ctx)
+	scheduler.ScheduleOne(ctx)
 	select {
 	case b := <-bindingChan:
 		expectBinding := &v1.Binding{
@@ -921,7 +921,7 @@ func TestSchedulerNoPhantomPodAfterDelete(t *testing.T) {
 	// queuedPodStore: [bar:8080]
 	// cache: [(assumed)foo:8080]
 
-	scheduler.scheduleOne(ctx)
+	scheduler.ScheduleOne(ctx)
 	select {
 	case err := <-errChan:
 		expectErr := &framework.FitError{
@@ -954,7 +954,7 @@
 	}
 
 	queuedPodStore.Add(secondPod)
-	scheduler.scheduleOne(ctx)
+	scheduler.ScheduleOne(ctx)
 	select {
 	case b := <-bindingChan:
 		expectBinding := &v1.Binding{
@@ -1030,7 +1030,7 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	scheduler, _, errChan := setupTestScheduler(ctx, t, queuedPodStore, scache, informerFactory, nil, fns...)
 
 	queuedPodStore.Add(podWithTooBigResourceRequests)
-	scheduler.scheduleOne(ctx)
+	scheduler.ScheduleOne(ctx)
 	select {
 	case err := <-errChan:
 		expectErr := &framework.FitError{
@@ -1160,7 +1160,7 @@ func TestSchedulerWithVolumeBinding(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	s.scheduleOne(ctx)
+	s.ScheduleOne(ctx)
 	// Wait for pod to succeed or fail scheduling
 	select {
 	case <-eventChan:
@@ -3481,7 +3481,7 @@ func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, queue
 	// queuedPodStore: [foo:8080]
 	// cache: []
 
-	scheduler.scheduleOne(ctx)
+	scheduler.ScheduleOne(ctx)
 	// queuedPodStore: []
 	// cache: [(assumed)foo:8080]
pkg/scheduler/scheduler.go

Lines changed: 31 additions & 1 deletion
@@ -389,17 +389,47 @@ func buildQueueingHintMap(es []framework.EnqueueExtensions) internalqueue.Queuei
 		// cannot be moved by any regular cluster event.
 		// So, we can just ignore such EventsToRegister here.
 
+		registerNodeAdded := false
+		registerNodeTaintUpdated := false
 		for _, event := range events {
 			fn := event.QueueingHintFn
 			if fn == nil || !utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
 				fn = defaultQueueingHintFn
 			}
 
+			if event.Event.Resource == framework.Node {
+				if event.Event.ActionType&framework.Add != 0 {
+					registerNodeAdded = true
+				}
+				if event.Event.ActionType&framework.UpdateNodeTaint != 0 {
+					registerNodeTaintUpdated = true
+				}
+			}
+
 			queueingHintMap[event.Event] = append(queueingHintMap[event.Event], &internalqueue.QueueingHintFunction{
 				PluginName:     e.Name(),
 				QueueingHintFn: fn,
 			})
 		}
+		if registerNodeAdded && !registerNodeTaintUpdated {
+			// Temporary fix for the issue https://github.com/kubernetes/kubernetes/issues/109437
+			// NodeAdded QueueingHint isn't always called because of preCheck.
+			// It's definitely not something expected for plugin developers,
+			// and registering UpdateNodeTaint event is the only mitigation for now.
+			//
+			// So, here we register UpdateNodeTaint event for plugins that have NodeAdded event, but don't have UpdateNodeTaint event.
+			// It has a bad impact on requeuing efficiency, but it's a lot better than some Pods being stuck in the
+			// unschedulable pod pool.
+			// This behavior will be removed when we remove the preCheck feature.
+			// See: https://github.com/kubernetes/kubernetes/issues/110175
+			queueingHintMap[framework.ClusterEvent{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}] =
+				append(queueingHintMap[framework.ClusterEvent{Resource: framework.Node, ActionType: framework.UpdateNodeTaint}],
+					&internalqueue.QueueingHintFunction{
+						PluginName:     e.Name(),
+						QueueingHintFn: defaultQueueingHintFn,
+					},
+				)
+		}
 	}
 	return queueingHintMap
 }
@@ -415,7 +445,7 @@ func (sched *Scheduler) Run(ctx context.Context) {
 	// If there are no new pods to schedule, it will be hanging there
 	// and if done in this goroutine it will be blocking closing
 	// SchedulingQueue, in effect causing a deadlock on shutdown.
-	go wait.UntilWithContext(ctx, sched.scheduleOne, 0)
+	go wait.UntilWithContext(ctx, sched.ScheduleOne, 0)
 
 	<-ctx.Done()
 	sched.SchedulingQueue.Close()
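A hedged sketch (not part of the commit) of the condition buildQueueingHintMap now checks: a plugin that registers only {Node, Add} triggers the workaround and picks up a default hint for UpdateNodeTaint, while a plugin that already registers UpdateNodeTaint is left alone.

// Illustrative only: mirrors the registerNodeAdded/registerNodeTaintUpdated
// bookkeeping above for a single plugin's registrations.
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/scheduler/framework"
)

func needsTaintWorkaround(events []framework.ClusterEventWithHint) bool {
	registersNodeAdd, registersTaintUpdate := false, false
	for _, e := range events {
		if e.Event.Resource != framework.Node {
			continue
		}
		registersNodeAdd = registersNodeAdd || e.Event.ActionType&framework.Add != 0
		registersTaintUpdate = registersTaintUpdate || e.Event.ActionType&framework.UpdateNodeTaint != 0
	}
	return registersNodeAdd && !registersTaintUpdate
}

func main() {
	onlyAdd := []framework.ClusterEventWithHint{
		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add}},
	}
	addAndTaint := []framework.ClusterEventWithHint{
		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}},
	}
	fmt.Println(needsTaintWorkaround(onlyAdd))     // true: gets a default UpdateNodeTaint hint
	fmt.Println(needsTaintWorkaround(addAndTaint)) // false: already covered
}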
