Skip to content

Commit 68f7a7c

Browse files
committed
bugfix(scheduler): preemption picks wrong victim node with higher priority pod on it.
Introducing pdb to preemption had disrupted the orderliness of pods in the victims, which would leads picking wrong victim node with higher priority pod on it.
1 parent ae53151 commit 68f7a7c

File tree

3 files changed

+105
-0
lines changed

3 files changed

+105
-0
lines changed

pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ func (pl *DefaultPreemption) SelectVictimsOnNode(
191191
}
192192
var victims []*v1.Pod
193193
numViolatingVictim := 0
194+
// Sort potentialVictims by pod priority from high to low, which ensures to
195+
// reprieve higher priority pods first.
194196
sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
195197
// Try to reprieve as many pods as possible. We first try to reprieve the PDB
196198
// violating victims and then other non-violating ones. In both cases, we start
@@ -225,6 +227,11 @@ func (pl *DefaultPreemption) SelectVictimsOnNode(
225227
return nil, 0, framework.AsStatus(err)
226228
}
227229
}
230+
231+
// Sort victims after reprieving pods to keep the pods in the victims sorted in order of priority from high to low.
232+
if len(violatingVictims) != 0 && len(nonViolatingVictims) != 0 {
233+
sort.Slice(victims, func(i, j int) bool { return util.MoreImportantPod(victims[i], victims[j]) })
234+
}
228235
return victims, numViolatingVictim, framework.NewStatus(framework.Success)
229236
}
230237

pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24+
"math"
2425
"math/rand"
2526
"sort"
2627
"strings"
@@ -144,6 +145,12 @@ func (pl *TestPlugin) Filter(ctx context.Context, state *framework.CycleState, p
144145
return nil
145146
}
146147

148+
const (
149+
LabelKeyIsViolatingPDB = "test.kubernetes.io/is-violating-pdb"
150+
LabelValueViolatingPDB = "violating"
151+
LabelValueNonViolatingPDB = "non-violating"
152+
)
153+
147154
func TestPostFilter(t *testing.T) {
148155
metrics.Register()
149156
onePodRes := map[v1.ResourceName]string{v1.ResourcePods: "1"}
@@ -152,6 +159,7 @@ func TestPostFilter(t *testing.T) {
152159
name string
153160
pod *v1.Pod
154161
pods []*v1.Pod
162+
pdbs []*policy.PodDisruptionBudget
155163
nodes []*v1.Node
156164
filteredNodesStatuses *framework.NodeToStatus
157165
extender framework.Extender
@@ -234,6 +242,29 @@ func TestPostFilter(t *testing.T) {
234242
wantResult: framework.NewPostFilterResultWithNominatedNode("node2"),
235243
wantStatus: framework.NewStatus(framework.Success),
236244
},
245+
{
246+
name: "pod can be made schedulable on minHighestPriority node",
247+
pod: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(veryHighPriority).Obj(),
248+
pods: []*v1.Pod{
249+
st.MakePod().Name("p1").UID("p1").Label(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).Namespace(v1.NamespaceDefault).Priority(highPriority).Node("node1").Obj(),
250+
st.MakePod().Name("p2").UID("p2").Label(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).Namespace(v1.NamespaceDefault).Priority(lowPriority).Node("node1").Obj(),
251+
st.MakePod().Name("p3").UID("p3").Label(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).Namespace(v1.NamespaceDefault).Priority(midPriority).Node("node2").Obj(),
252+
},
253+
pdbs: []*policy.PodDisruptionBudget{
254+
st.MakePDB().Name("violating-pdb").Namespace(v1.NamespaceDefault).MatchLabel(LabelKeyIsViolatingPDB, LabelValueViolatingPDB).MinAvailable("100%").Obj(),
255+
st.MakePDB().Name("non-violating-pdb").Namespace(v1.NamespaceDefault).MatchLabel(LabelKeyIsViolatingPDB, LabelValueNonViolatingPDB).MinAvailable("0").DisruptionsAllowed(math.MaxInt32).Obj(),
256+
},
257+
nodes: []*v1.Node{
258+
st.MakeNode().Name("node1").Capacity(onePodRes).Obj(),
259+
st.MakeNode().Name("node2").Capacity(onePodRes).Obj(),
260+
},
261+
filteredNodesStatuses: framework.NewNodeToStatus(map[string]*framework.Status{
262+
"node1": framework.NewStatus(framework.Unschedulable),
263+
"node2": framework.NewStatus(framework.Unschedulable),
264+
}, framework.NewStatus(framework.UnschedulableAndUnresolvable)),
265+
wantResult: framework.NewPostFilterResultWithNominatedNode("node2"),
266+
wantStatus: framework.NewStatus(framework.Success),
267+
},
237268
{
238269
name: "preemption result filtered out by extenders",
239270
pod: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(),
@@ -365,6 +396,13 @@ func TestPostFilter(t *testing.T) {
365396
for i := range tt.pods {
366397
podInformer.GetStore().Add(tt.pods[i])
367398
}
399+
pdbInformer := informerFactory.Policy().V1().PodDisruptionBudgets().Informer()
400+
for i := range tt.pdbs {
401+
if err := pdbInformer.GetStore().Add(tt.pdbs[i]); err != nil {
402+
t.Fatal(err)
403+
}
404+
}
405+
368406
// Register NodeResourceFit as the Filter & PreFilter plugin.
369407
registeredPlugins := []tf.RegisterPluginFunc{
370408
tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),

pkg/scheduler/testing/wrappers.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@ import (
2222
"time"
2323

2424
v1 "k8s.io/api/core/v1"
25+
policy "k8s.io/api/policy/v1"
2526
resourceapi "k8s.io/api/resource/v1alpha3"
2627
storagev1 "k8s.io/api/storage/v1"
2728
"k8s.io/apimachinery/pkg/api/resource"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
"k8s.io/apimachinery/pkg/runtime/schema"
3031
"k8s.io/apimachinery/pkg/types"
32+
"k8s.io/apimachinery/pkg/util/intstr"
3133
imageutils "k8s.io/kubernetes/test/utils/image"
3234
"k8s.io/utils/ptr"
3335
)
@@ -231,6 +233,64 @@ func (c *ContainerWrapper) RestartPolicy(restartPolicy v1.ContainerRestartPolicy
231233
return c
232234
}
233235

236+
// PodDisruptionBudgetWrapper wraps a PodDisruptionBudget inside.
237+
type PodDisruptionBudgetWrapper struct {
238+
policy.PodDisruptionBudget
239+
}
240+
241+
// MakePDB creates a PodDisruptionBudget wrapper.
242+
func MakePDB() *PodDisruptionBudgetWrapper {
243+
return &PodDisruptionBudgetWrapper{policy.PodDisruptionBudget{}}
244+
}
245+
246+
// Obj returns the inner PodDisruptionBudget.
247+
func (p *PodDisruptionBudgetWrapper) Obj() *policy.PodDisruptionBudget {
248+
return &p.PodDisruptionBudget
249+
}
250+
251+
// Name sets `name` as the name of the inner PodDisruptionBudget.
252+
func (p *PodDisruptionBudgetWrapper) Name(name string) *PodDisruptionBudgetWrapper {
253+
p.SetName(name)
254+
return p
255+
}
256+
257+
// Namespace sets `namespace` as the namespace of the inner PodDisruptionBudget.
258+
func (p *PodDisruptionBudgetWrapper) Namespace(namespace string) *PodDisruptionBudgetWrapper {
259+
p.SetNamespace(namespace)
260+
return p
261+
}
262+
263+
// MinAvailable sets `minAvailable` to the inner PodDisruptionBudget.Spec.MinAvailable.
264+
func (p *PodDisruptionBudgetWrapper) MinAvailable(minAvailable string) *PodDisruptionBudgetWrapper {
265+
p.Spec.MinAvailable = &intstr.IntOrString{
266+
Type: intstr.String,
267+
StrVal: minAvailable,
268+
}
269+
return p
270+
}
271+
272+
// MatchLabel adds a {key,value} to the inner PodDisruptionBudget.Spec.Selector.MatchLabels.
273+
func (p *PodDisruptionBudgetWrapper) MatchLabel(key, value string) *PodDisruptionBudgetWrapper {
274+
selector := p.Spec.Selector
275+
if selector == nil {
276+
selector = &metav1.LabelSelector{}
277+
}
278+
matchLabels := selector.MatchLabels
279+
if matchLabels == nil {
280+
matchLabels = map[string]string{}
281+
}
282+
matchLabels[key] = value
283+
selector.MatchLabels = matchLabels
284+
p.Spec.Selector = selector
285+
return p
286+
}
287+
288+
// DisruptionsAllowed sets `disruptionsAllowed` to the inner PodDisruptionBudget.Status.DisruptionsAllowed.
289+
func (p *PodDisruptionBudgetWrapper) DisruptionsAllowed(disruptionsAllowed int32) *PodDisruptionBudgetWrapper {
290+
p.Status.DisruptionsAllowed = disruptionsAllowed
291+
return p
292+
}
293+
234294
// PodWrapper wraps a Pod inside.
235295
type PodWrapper struct{ v1.Pod }
236296

0 commit comments

Comments
 (0)