@@ -100,11 +100,10 @@ func (f *FitError) Error() string {
100
100
// TODO: Rename this type.
101
101
type ScheduleAlgorithm interface {
102
102
Schedule (context.Context , * profile.Profile , * framework.CycleState , * v1.Pod ) (scheduleResult ScheduleResult , err error )
103
- // Preempt receives scheduling errors for a pod and tries to create room for
103
+ // Preempt receives the scheduling filter result (NodeToStatusMap) for a pod and tries to create room for
104
104
// the pod by preempting lower priority pods if possible.
105
- // It returns the node where preemption happened, a list of preempted pods, a
106
- // list of pods whose nominated node name should be removed, and error if any.
107
- Preempt (context.Context , * profile.Profile , * framework.CycleState , * v1.Pod , error ) (selectedNode string , preemptedPods []* v1.Pod , cleanupNominatedPods []* v1.Pod , err error )
105
+ // It returns the node where preemption happened, and an error if any.
106
+ Preempt (context.Context , * profile.Profile , * framework.CycleState , * v1.Pod , framework.NodeToStatusMap ) (selectedNode string , err error )
108
107
// Extenders returns a slice of extender config. This is exposed for
109
108
// testing.
110
109
Extenders () []framework.Extender
@@ -249,29 +248,35 @@ func (g *genericScheduler) selectHost(nodeScoreList framework.NodeScoreList) (st
249
248
// other pods with the same priority. The nominated pod prevents other pods from
250
249
// using the nominated resources and the nominated pod could take a long time
251
250
// before it is retried after many other pending pods.
252
- func (g * genericScheduler ) Preempt (ctx context.Context , prof * profile.Profile , state * framework.CycleState , pod * v1.Pod , scheduleErr error ) (string , []* v1.Pod , []* v1.Pod , error ) {
253
- // Scheduler may return various types of errors. Consider preemption only if
254
- // the error is of type FitError.
255
- fitError , ok := scheduleErr .(* FitError )
256
- if ! ok || fitError == nil {
257
- return "" , nil , nil , nil
251
+ func (g * genericScheduler ) Preempt (ctx context.Context , prof * profile.Profile , state * framework.CycleState , pod * v1.Pod , m framework.NodeToStatusMap ) (string , error ) {
252
+ cs := prof .ClientSet ()
253
+ // TODO(Huang-Wei): get pod from informer cache instead of API server.
254
+ pod , err := util .GetUpdatedPod (cs , pod )
255
+ if err != nil {
256
+ klog .Errorf ("Error getting the updated preemptor pod object: %v" , err )
257
+ return "" , err
258
258
}
259
+
259
260
if ! podEligibleToPreemptOthers (pod , g .nodeInfoSnapshot .NodeInfos ()) {
260
261
klog .V (5 ).Infof ("Pod %v/%v is not eligible for more preemption." , pod .Namespace , pod .Name )
261
- return "" , nil , nil , nil
262
+ return "" , nil
262
263
}
263
264
allNodes , err := g .nodeInfoSnapshot .NodeInfos ().List ()
264
265
if err != nil {
265
- return "" , nil , nil , err
266
+ return "" , err
266
267
}
267
268
if len (allNodes ) == 0 {
268
- return "" , nil , nil , ErrNoNodesAvailable
269
+ return "" , ErrNoNodesAvailable
269
270
}
270
- potentialNodes := nodesWherePreemptionMightHelp (allNodes , fitError )
271
+ potentialNodes := nodesWherePreemptionMightHelp (allNodes , m )
271
272
if len (potentialNodes ) == 0 {
272
273
klog .V (3 ).Infof ("Preemption will not help schedule pod %v/%v on any node." , pod .Namespace , pod .Name )
273
274
// In this case, we should clean-up any existing nominated node name of the pod.
274
- return "" , nil , []* v1.Pod {pod }, nil
275
+ if err := util .ClearNominatedNodeName (cs , pod ); err != nil {
276
+ klog .Errorf ("Cannot clear 'NominatedNodeName' field of pod %v/%v: %v" , pod .Namespace , pod .Name , err )
277
+ // We do not return as this error is not critical.
278
+ }
279
+ return "" , nil
275
280
}
276
281
if klog .V (5 ).Enabled () {
277
282
var sample []string
@@ -284,33 +289,52 @@ func (g *genericScheduler) Preempt(ctx context.Context, prof *profile.Profile, s
284
289
if g .pdbLister != nil {
285
290
pdbs , err = g .pdbLister .List (labels .Everything ())
286
291
if err != nil {
287
- return "" , nil , nil , err
292
+ return "" , err
288
293
}
289
294
}
290
295
nodeNameToVictims , err := selectNodesForPreemption (ctx , prof , g .podNominator , state , pod , potentialNodes , pdbs )
291
296
if err != nil {
292
- return "" , nil , nil , err
297
+ return "" , err
293
298
}
294
299
295
300
// We will only check nodeNameToVictims with extenders that support preemption.
296
301
// Extenders which do not support preemption may later prevent preemptor from being scheduled on the nominated
297
302
// node. In that case, scheduler will find a different host for the preemptor in subsequent scheduling cycles.
298
303
nodeNameToVictims , err = g .processPreemptionWithExtenders (pod , nodeNameToVictims )
299
304
if err != nil {
300
- return "" , nil , nil , err
305
+ return "" , err
301
306
}
302
307
303
308
candidateNode := pickOneNodeForPreemption (nodeNameToVictims )
304
309
if len (candidateNode ) == 0 {
305
- return "" , nil , nil , nil
310
+ return "" , nil
306
311
}
307
312
313
+ victims := nodeNameToVictims [candidateNode ].Pods
314
+ for _ , victim := range victims {
315
+ if err := util .DeletePod (cs , victim ); err != nil {
316
+ klog .Errorf ("Error preempting pod %v/%v: %v" , victim .Namespace , victim .Name , err )
317
+ return "" , err
318
+ }
319
+ // If the victim is a WaitingPod, send a reject message to the PermitPlugin
320
+ if waitingPod := prof .GetWaitingPod (victim .UID ); waitingPod != nil {
321
+ waitingPod .Reject ("preempted" )
322
+ }
323
+ prof .Recorder .Eventf (victim , pod , v1 .EventTypeNormal , "Preempted" , "Preempting" , "Preempted by %v/%v on node %v" , pod .Namespace , pod .Name , candidateNode )
324
+ }
325
+ metrics .PreemptionVictims .Observe (float64 (len (victims )))
326
+
308
327
// Lower priority pods nominated to run on this node, may no longer fit on
309
328
// this node. So, we should remove their nomination. Removing their
310
329
// nomination updates these pods and moves them to the active queue. It
311
330
// lets scheduler find another place for them.
312
331
nominatedPods := g .getLowerPriorityNominatedPods (pod , candidateNode )
313
- return candidateNode , nodeNameToVictims [candidateNode ].Pods , nominatedPods , nil
332
+ if err := util .ClearNominatedNodeName (cs , nominatedPods ... ); err != nil {
333
+ klog .Errorf ("Cannot clear 'NominatedNodeName' field: %v" , err )
334
+ // We do not return as this error is not critical.
335
+ }
336
+
337
+ return candidateNode , nil
314
338
}
315
339
316
340
// processPreemptionWithExtenders processes preemption with extenders
@@ -1041,13 +1065,13 @@ func selectVictimsOnNode(
1041
1065
1042
1066
// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
1043
1067
// that may be satisfied by removing pods from the node.
1044
- func nodesWherePreemptionMightHelp (nodes []* framework.NodeInfo , fitErr * FitError ) []* framework.NodeInfo {
1068
+ func nodesWherePreemptionMightHelp (nodes []* framework.NodeInfo , m framework. NodeToStatusMap ) []* framework.NodeInfo {
1045
1069
var potentialNodes []* framework.NodeInfo
1046
1070
for _ , node := range nodes {
1047
1071
name := node .Node ().Name
1048
1072
// We rely on the status reported by each plugin - 'Unschedulable' or 'UnschedulableAndUnresolvable'
1049
1073
// to determine whether preemption may help or not on the node.
1050
- if fitErr . FilteredNodesStatuses [name ].Code () == framework .UnschedulableAndUnresolvable {
1074
+ if m [name ].Code () == framework .UnschedulableAndUnresolvable {
1051
1075
continue
1052
1076
}
1053
1077
potentialNodes = append (potentialNodes , node )
0 commit comments