
Commit 5581497

Add a test that reproduces the race condition between setting a pod's nominated node name and the scheduling cycle of other pods
1 parent aee1ab3 commit 5581497

2 files changed, +144 −19 lines


test/integration/scheduler/preemption_test.go

Lines changed: 132 additions & 18 deletions
@@ -33,6 +33,7 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	utilfeaturetesting "k8s.io/apiserver/pkg/util/feature/testing"
 	clientset "k8s.io/client-go/kubernetes"
+	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
 	"k8s.io/kubernetes/pkg/features"
 	_ "k8s.io/kubernetes/pkg/scheduler/algorithmprovider"
 	testutils "k8s.io/kubernetes/test/utils"
@@ -73,7 +74,7 @@ func TestPreemption(t *testing.T) {
 
 	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
 		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)},
+		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
 	}
 
 	tests := []struct {
@@ -91,7 +92,7 @@ func TestPreemption(t *testing.T) {
 				Priority: &lowPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 				},
 			}),
 		},
@@ -101,7 +102,7 @@ func TestPreemption(t *testing.T) {
 				Priority: &highPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 				},
 			}),
 			preemptedPodIndexes: map[int]struct{}{0: {}},
@@ -237,7 +238,7 @@ func TestPreemption(t *testing.T) {
 	nodeRes := &v1.ResourceList{
 		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
 		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
 	}
 	node, err := createNode(context.clientSet, "node1", nodeRes)
 	if err != nil {
@@ -313,7 +314,7 @@ func TestDisablePreemption(t *testing.T) {
 				Priority: &lowPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 				},
 			}),
 		},
@@ -323,7 +324,7 @@ func TestDisablePreemption(t *testing.T) {
 				Priority: &highPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 				},
 			}),
 		},
@@ -333,7 +334,7 @@ func TestDisablePreemption(t *testing.T) {
 	nodeRes := &v1.ResourceList{
 		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
 		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
 	}
 	_, err := createNode(context.clientSet, "node1", nodeRes)
 	if err != nil {
@@ -375,7 +376,7 @@ func TestDisablePreemption(t *testing.T) {
 func mkPriorityPodWithGrace(tc *TestContext, name string, priority int32, grace int64) *v1.Pod {
 	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
 		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)},
+		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
 	}
 	pod := initPausePod(tc.clientSet, &pausePodConfig{
 		Name: name,
@@ -420,7 +421,7 @@ func TestPreemptionStarvation(t *testing.T) {
 			Priority: &highPriority,
 			Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 				v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
-				v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+				v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 			},
 		}),
 	},
@@ -430,7 +431,7 @@ func TestPreemptionStarvation(t *testing.T) {
 	nodeRes := &v1.ResourceList{
 		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
 		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
 	}
 	_, err := createNode(context.clientSet, "node1", nodeRes)
 	if err != nil {
@@ -490,6 +491,119 @@ func TestPreemptionStarvation(t *testing.T) {
 	}
 }
 
+// TestPreemptionRaces tests that other scheduling events and operations do not
+// race with the preemption process.
+func TestPreemptionRaces(t *testing.T) {
+	// Initialize scheduler.
+	context := initTest(t, "preemption-race")
+	defer cleanupTest(t, context)
+	cs := context.clientSet
+
+	tests := []struct {
+		description       string
+		numInitialPods    int // Pods created and executed before running preemptor
+		numAdditionalPods int // Pods created after creating the preemptor
+		numRepetitions    int // Repeat the tests to check races
+		preemptor         *v1.Pod
+	}{
+		{
+			// This test ensures that while the preempting pod is waiting for the victims
+			// to terminate, other lower priority pods are not scheduled into the space
+			// freed by preemption while the higher priority pod is not scheduled yet.
+			description:       "ensures that other pods are not scheduled while preemptor is being marked as nominated (issue #72124)",
+			numInitialPods:    2,
+			numAdditionalPods: 50,
+			numRepetitions:    10,
+			preemptor: initPausePod(cs, &pausePodConfig{
+				Name:      "preemptor-pod",
+				Namespace: context.ns.Name,
+				Priority:  &highPriority,
+				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
+					v1.ResourceCPU:    *resource.NewMilliQuantity(4900, resource.DecimalSI),
+					v1.ResourceMemory: *resource.NewQuantity(4900, resource.DecimalSI)},
+				},
+			}),
+		},
+	}
+
+	// Create a node with some resources and a label.
+	nodeRes := &v1.ResourceList{
+		v1.ResourcePods:   *resource.NewQuantity(100, resource.DecimalSI),
+		v1.ResourceCPU:    *resource.NewMilliQuantity(5000, resource.DecimalSI),
+		v1.ResourceMemory: *resource.NewQuantity(5000, resource.DecimalSI),
+	}
+	_, err := createNode(context.clientSet, "node1", nodeRes)
+	if err != nil {
+		t.Fatalf("Error creating nodes: %v", err)
+	}
+
+	for _, test := range tests {
+		if test.numRepetitions <= 0 {
+			test.numRepetitions = 1
+		}
+		for n := 0; n < test.numRepetitions; n++ {
+			initialPods := make([]*v1.Pod, test.numInitialPods)
+			additionalPods := make([]*v1.Pod, test.numAdditionalPods)
+			// Create and run the initial pods.
+			for i := 0; i < test.numInitialPods; i++ {
+				initialPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
+				if err != nil {
+					t.Fatalf("Test [%v]: Error creating pause pod: %v", test.description, err)
+				}
+			}
+			// Make sure that the initial pods are all scheduled.
+			for _, p := range initialPods {
+				if err := waitForPodToSchedule(cs, p); err != nil {
+					t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
+				}
+			}
+			// Create the preemptor.
+			klog.Info("Creating the preemptor pod...")
+			preemptor, err := createPausePod(cs, test.preemptor)
+			if err != nil {
+				t.Errorf("Error while creating the preempting pod: %v", err)
+			}
+
+			klog.Info("Creating additional pods...")
+			for i := 0; i < test.numAdditionalPods; i++ {
+				additionalPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(context, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
+				if err != nil {
+					t.Fatalf("Test [%v]: Error creating pending pod: %v", test.description, err)
+				}
+			}
+			// Check that the preemptor pod gets a nominated node name.
+			if err := waitForNominatedNodeName(cs, preemptor); err != nil {
+				t.Errorf("Test [%v]: NominatedNodeName annotation was not set for pod %v/%v: %v", test.description, preemptor.Namespace, preemptor.Name, err)
+			}
+			// Make sure that the preemptor is scheduled after preemptions.
+			if err := waitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
+				t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
+			}
+
+			klog.Info("Check unschedulable pods still exists and were never scheduled...")
+			for _, p := range additionalPods {
+				pod, err := cs.CoreV1().Pods(p.Namespace).Get(p.Name, metav1.GetOptions{})
+				if err != nil {
+					t.Errorf("Error in getting Pod %v/%v info: %v", p.Namespace, p.Name, err)
+				}
+				if len(pod.Spec.NodeName) > 0 {
+					t.Errorf("Pod %v/%v is already scheduled", p.Namespace, p.Name)
+				}
+				_, cond := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
+				if cond != nil && cond.Status != v1.ConditionFalse {
+					t.Errorf("Pod %v/%v is no longer unschedulable: %v", p.Namespace, p.Name, err)
+				}
+			}
+			// Cleanup
+			klog.Info("Cleaning up all pods...")
+			allPods := additionalPods
+			allPods = append(allPods, initialPods...)
+			allPods = append(allPods, preemptor)
+			cleanupPods(cs, t, allPods)
+		}
+	}
+}
+
 // TestNominatedNodeCleanUp checks that when there are nominated pods on a
 // node and a higher priority pod is nominated to run on the node, the nominated
 // node name of the lower priority pods is cleared.
@@ -515,7 +629,7 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 	nodeRes := &v1.ResourceList{
 		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
 		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
 	}
 	_, err := createNode(context.clientSet, "node1", nodeRes)
 	if err != nil {
@@ -543,7 +657,7 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 		Priority: &mediumPriority,
 		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 			v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
-			v1.ResourceMemory: *resource.NewQuantity(400, resource.BinarySI)},
+			v1.ResourceMemory: *resource.NewQuantity(400, resource.DecimalSI)},
 		},
 	})
 	medPriPod, err := createPausePod(cs, podConf)
@@ -561,7 +675,7 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 		Priority: &highPriority,
 		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 			v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
-			v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+			v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 		},
 	})
 	highPriPod, err := createPausePod(cs, podConf)
@@ -626,12 +740,12 @@ func TestPDBInPreemption(t *testing.T) {
 
 	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
 		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)},
+		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
 	}
 	defaultNodeRes := &v1.ResourceList{
 		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
 		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-		v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+		v1.ResourceMemory: *resource.NewQuantity(500, resource.DecimalSI),
 	}
 
 	type nodeConfig struct {
@@ -683,7 +797,7 @@ func TestPDBInPreemption(t *testing.T) {
 				Priority: &highPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 				},
 			}),
 			preemptedPodIndexes: map[int]struct{}{2: {}},
@@ -721,7 +835,7 @@ func TestPDBInPreemption(t *testing.T) {
 				Priority: &highPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
 				},
 			}),
 			preemptedPodIndexes: map[int]struct{}{1: {}},
@@ -801,7 +915,7 @@ func TestPDBInPreemption(t *testing.T) {
 				Priority: &highPriority,
 				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
 					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
-					v1.ResourceMemory: *resource.NewQuantity(400, resource.BinarySI)},
+					v1.ResourceMemory: *resource.NewQuantity(400, resource.DecimalSI)},
 				},
 			}),
 			// The third node is chosen because PDB is not violated for node 3 and the victims have lower priority than node-2.
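Note on the new test: TestPreemptionRaces relies on the scheduler publishing its preemption decision on the preemptor pod (Status.NominatedNodeName) before the victims have finished terminating, and waitForNominatedNodeName polls for exactly that. The sketch below only illustrates the kind of condition function such a wait can be built from with client-go; the function name, package placement, and usage interval are assumptions, not part of this commit.

// Illustrative sketch only -- not part of this commit. The real
// waitForNominatedNodeName helper lives in test/integration/scheduler/util.go.
package scheduler // package name assumed to match the test package above

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
)

// podHasNominatedNodeName reports whether the scheduler has recorded a
// preemption decision for the pod. The scheduler sets Status.NominatedNodeName
// before the victims finish terminating, which is what TestPreemptionRaces
// waits for before asserting that no lower priority pod slipped into the
// freed space.
func podHasNominatedNodeName(cs clientset.Interface, ns, name string) wait.ConditionFunc {
	return func() (bool, error) {
		pod, err := cs.CoreV1().Pods(ns).Get(name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		return len(pod.Status.NominatedNodeName) > 0, nil
	}
}

// Example use (interval is an assumption):
//   err := wait.Poll(100*time.Millisecond, wait.ForeverTestTimeout,
//       podHasNominatedNodeName(cs, preemptor.Namespace, preemptor.Name))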

test/integration/scheduler/util.go

Lines changed: 12 additions & 1 deletion
@@ -582,6 +582,17 @@ func podScheduled(c clientset.Interface, podNamespace, podName string) wait.Cond
 	}
 }
 
+// podSchedulableCondition returns the PodScheduled condition of the given pod,
+// or nil if that condition has not been set yet.
+func podSchedulableCondition(c clientset.Interface, podNamespace, podName string) (*v1.PodCondition, error) {
+	pod, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
+	if err != nil {
+		return nil, err
+	}
+	_, cond := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
+	return cond, nil
+}
+
 // podUnschedulable returns a condition function that returns true if the given pod
 // gets unschedulable status.
 func podUnschedulable(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
@@ -710,7 +721,7 @@ func cleanupPods(cs clientset.Interface, t *testing.T, pods []*v1.Pod) {
 		}
 	}
 	for _, p := range pods {
-		if err := wait.Poll(time.Second, wait.ForeverTestTimeout,
+		if err := wait.Poll(time.Millisecond, wait.ForeverTestTimeout,
 			podDeleted(cs, p.Namespace, p.Name)); err != nil {
 			t.Errorf("error while waiting for pod %v/%v to get deleted: %v", p.Namespace, p.Name, err)
 		}
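The new podSchedulableCondition helper returns the pod's PodScheduled condition directly, so small polling assertions can be layered on top of it. A minimal sketch, assuming the same package and imports as util.go above; the wrapper name and timeouts are illustrative, not part of the commit. TestPreemptionRaces performs the equivalent check inline by fetching the pod and calling podutil.GetPodCondition on its status.

// waitForSchedulingAttempt is an illustrative wrapper (not part of this
// commit): it blocks until the scheduler has recorded any PodScheduled
// condition on the pod, i.e. the pod has been through at least one scheduling
// cycle, whether it was bound or marked Unschedulable.
func waitForSchedulingAttempt(cs clientset.Interface, podNamespace, podName string) error {
	return wait.Poll(100*time.Millisecond, 30*time.Second, func() (bool, error) {
		cond, err := podSchedulableCondition(cs, podNamespace, podName)
		if err != nil {
			return false, err
		}
		return cond != nil, nil
	})
}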
