Commit 99b301d

ravisantoshgudimetla authored and damemi committed

Fix preemption race conditions on heavily utilized nodes

1 parent 50f9ea7 · commit 99b301d

File tree: 1 file changed, +80 -20 lines

test/e2e/scheduling/preemption.go (80 additions, 20 deletions)
@@ -34,6 +34,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/watch"
 	clientset "k8s.io/client-go/kubernetes"
+	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/apis/scheduling"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
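The only new import is the QoS helper. For context, here is a minimal illustrative sketch, not part of the commit (the countsTowardUsage function, the sample pod, and the main wrapper are invented for the example, and the sketch assumes it is built inside the kubernetes repo so the internal helper package resolves), of what the import is used for later in the diff: classifying pods so that BestEffort pods, which declare no requests, are excluded when summing how much of a node is already requested.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)

// countsTowardUsage reports whether a pod's requests should be added to a
// node's requested-resource total. BestEffort pods set no requests or limits,
// so counting them would add nothing meaningful.
func countsTowardUsage(pod *v1.Pod) bool {
	return v1qos.GetPodQOS(pod) != v1.PodQOSBestEffort
}

func main() {
	// A pod whose containers set no requests or limits is classified BestEffort.
	bestEffort := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{Name: "c"}}}}
	fmt.Println(countsTowardUsage(bestEffort)) // false
}
```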
@@ -103,57 +104,77 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 		var podRes v1.ResourceList
 		// Create one pod per node that uses a lot of the node's resources.
 		ginkgo.By("Create pods that use 60% of node resources.")
-		pods := make([]*v1.Pod, len(nodeList.Items))
+		pods := make([]*v1.Pod, 0, len(nodeList.Items))
+		allPods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
+		framework.ExpectNoError(err)
 		for i, node := range nodeList.Items {
+			currentCPUUsage, currentMemUsage := getCurrentPodUsageOnTheNode(node.Name, allPods.Items, podRequestedResource)
+			framework.Logf("Current cpu and memory usage %v, %v", currentCPUUsage, currentMemUsage)
 			cpuAllocatable, found := node.Status.Allocatable["cpu"]
 			framework.ExpectEqual(found, true)
-			milliCPU := cpuAllocatable.MilliValue() * 40 / 100
+			milliCPU := cpuAllocatable.MilliValue()
+			milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6))
 			memAllocatable, found := node.Status.Allocatable["memory"]
 			framework.ExpectEqual(found, true)
-			memory := memAllocatable.Value() * 60 / 100
+			memory := memAllocatable.Value()
+			memory = int64(float64(memory-currentMemUsage) * float64(0.6))
+			// If a node is already heavily utilized let not's create a pod there.
+			if milliCPU <= 0 || memory <= 0 {
+				framework.Logf("Node is heavily utilized, let's not create a pod here")
+				continue
+			}
 			podRes = v1.ResourceList{}
 			podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
 			podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)

 			// make the first pod low priority and the rest medium priority.
 			priorityName := mediumPriorityClassName
-			if i == 0 {
+			if len(pods) == 0 {
 				priorityName = lowPriorityClassName
 			}
-			pods[i] = createPausePod(f, pausePodConfig{
+			pods = append(pods, createPausePod(f, pausePodConfig{
 				Name: fmt.Sprintf("pod%d-%v", i, priorityName),
 				PriorityClassName: priorityName,
 				Resources: &v1.ResourceRequirements{
 					Requests: podRes,
 				},
-			})
+				NodeName: node.Name,
+			}))
 			framework.Logf("Created pod: %v", pods[i].Name)
 		}
+		if len(pods) < 2 {
+			framework.Failf("We need at least two pods to be created but" +
+				"all nodes are already heavily utilized, so preemption tests cannot be run")
+		}
 		ginkgo.By("Wait for pods to be scheduled.")
 		for _, pod := range pods {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod))
 		}

-		ginkgo.By("Run a high priority pod that use 60% of a node resources.")
-		// Create a high priority pod and make sure it is scheduled.
+		// Set the pod request to the first pod's resources (should be low priority pod)
+		podRes = pods[0].Spec.Containers[0].Resources.Requests
+
+		ginkgo.By("Run a high priority pod that has same requirements as that of lower priority pod")
+		// Create a high priority pod and make sure it is scheduled on the same node as the low priority pod.
 		runPausePod(f, pausePodConfig{
 			Name: "preemptor-pod",
 			PriorityClassName: highPriorityClassName,
 			Resources: &v1.ResourceRequirements{
 				Requests: podRes,
 			},
+			NodeName: pods[0].Spec.NodeName,
 		})
-		// Make sure that the lowest priority pod is deleted.
+
 		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{})
-		podDeleted := (err != nil && apierrors.IsNotFound(err)) ||
+		podPreempted := (err != nil && apierrors.IsNotFound(err)) ||
 			(err == nil && preemptedPod.DeletionTimestamp != nil)
-		framework.ExpectEqual(podDeleted, true)
-		// Other pods (mid priority ones) should be present.
 		for i := 1; i < len(pods); i++ {
 			livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
 		}
+
+		framework.ExpectEqual(podPreempted, true)
 	})

 	// This test verifies that when a critical pod is created and no node with
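The hunk above replaces the old fixed request (40% of allocatable CPU, 60% of allocatable memory) with 60% of whatever headroom remains after the requests already bound to the node, and skips nodes that have no headroom left. A minimal standalone sketch of that sizing rule, with the fillerRequest function and the sample numbers invented for illustration:

```go
package main

import "fmt"

// fillerRequest returns the CPU (milli) and memory (bytes) a filler pod should
// request on a node, or ok=false when the node is already too full to host one.
func fillerRequest(allocMilliCPU, usedMilliCPU, allocMemBytes, usedMemBytes int64) (cpu, mem int64, ok bool) {
	cpu = int64(float64(allocMilliCPU-usedMilliCPU) * 0.6)
	mem = int64(float64(allocMemBytes-usedMemBytes) * 0.6)
	if cpu <= 0 || mem <= 0 {
		return 0, 0, false // heavily utilized node: create no filler pod here
	}
	return cpu, mem, true
}

func main() {
	// A 4-core node with 1500m already requested: the filler pod asks for 60%
	// of the remaining 2500m = 1500m, and analogously for memory.
	cpu, mem, ok := fillerRequest(4000, 1500, 8<<30, 2<<30)
	fmt.Println(cpu, mem, ok) // 1500 3865470566 true
}
```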
@@ -163,21 +184,32 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 		var podRes v1.ResourceList
 		// Create one pod per node that uses a lot of the node's resources.
 		ginkgo.By("Create pods that use 60% of node resources.")
-		pods := make([]*v1.Pod, len(nodeList.Items))
+		pods := make([]*v1.Pod, 0, len(nodeList.Items))
+		allPods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
+		framework.ExpectNoError(err)
 		for i, node := range nodeList.Items {
+			currentCPUUsage, currentMemUsage := getCurrentPodUsageOnTheNode(node.Name, allPods.Items, podRequestedResource)
+			framework.Logf("Current cpu usage and memory usage is %v, %v", currentCPUUsage, currentMemUsage)
 			cpuAllocatable, found := node.Status.Allocatable["cpu"]
 			framework.ExpectEqual(found, true)
-			milliCPU := cpuAllocatable.MilliValue() * 40 / 100
+			milliCPU := cpuAllocatable.MilliValue()
+			milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6))
 			memAllocatable, found := node.Status.Allocatable["memory"]
 			framework.ExpectEqual(found, true)
-			memory := memAllocatable.Value() * 60 / 100
+			memory := memAllocatable.Value()
+			memory = int64(float64(memory-currentMemUsage) * float64(0.6))
 			podRes = v1.ResourceList{}
+			// If a node is already heavily utilized let not's create a pod there.
+			if milliCPU <= 0 || memory <= 0 {
+				framework.Logf("Node is heavily utilized, let's not create a pod there")
+				continue
+			}
 			podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
 			podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)

 			// make the first pod low priority and the rest medium priority.
 			priorityName := mediumPriorityClassName
-			if i == 0 {
+			if len(pods) == 0 {
 				priorityName = lowPriorityClassName
 			}
 			pods[i] = createPausePod(f, pausePodConfig{
@@ -186,15 +218,22 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 				Resources: &v1.ResourceRequirements{
 					Requests: podRes,
 				},
+				NodeName: node.Name,
 			})
 			framework.Logf("Created pod: %v", pods[i].Name)
 		}
+		if len(pods) < 2 {
+			framework.Skipf("We need at least two pods to be created but" +
+				"all nodes are already heavily utilized, so preemption tests cannot be run")
+		}
 		ginkgo.By("Wait for pods to be scheduled.")
 		for _, pod := range pods {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod))
 		}

-		ginkgo.By("Run a critical pod that use 60% of a node resources.")
+		// We want this pod to be preempted
+		podRes = pods[0].Spec.Containers[0].Resources.Requests
+		ginkgo.By("Run a critical pod that use same resources as that of a lower priority pod")
 		// Create a critical pod and make sure it is scheduled.
 		defer func() {
 			// Clean-up the critical pod
@@ -211,18 +250,25 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 			Resources: &v1.ResourceRequirements{
 				Requests: podRes,
 			},
+			NodeName: pods[0].Spec.NodeName,
 		})
+
+		defer func() {
+			// Clean-up the critical pod
+			err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete("critical-pod", metav1.NewDeleteOptions(0))
+			framework.ExpectNoError(err)
+		}()
 		// Make sure that the lowest priority pod is deleted.
 		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{})
-		podDeleted := (err != nil && apierrors.IsNotFound(err)) ||
+		podPreempted := (err != nil && apierrors.IsNotFound(err)) ||
 			(err == nil && preemptedPod.DeletionTimestamp != nil)
-		framework.ExpectEqual(podDeleted, true)
-		// Other pods (mid priority ones) should be present.
 		for i := 1; i < len(pods); i++ {
 			livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
 		}
+
+		framework.ExpectEqual(podPreempted, true)
 	})
 })

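Both tests now decide whether the low-priority victim was preempted with the same two-part check: the Get either returns NotFound, or returns a pod that already carries a deletion timestamp. A small illustrative restatement of that predicate (the wasPreempted helper and the sample pod are hypothetical, not part of the commit):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// wasPreempted mirrors the test's check: the victim counts as preempted if it
// is already gone (NotFound) or has been marked for deletion.
func wasPreempted(pod *v1.Pod, getErr error) bool {
	if getErr != nil {
		return apierrors.IsNotFound(getErr)
	}
	return pod.DeletionTimestamp != nil
}

func main() {
	// A pod that still exists but carries a deletion timestamp is being evicted.
	now := metav1.Now()
	terminating := &v1.Pod{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &now}}
	fmt.Println(wasPreempted(terminating, nil)) // true
}
```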
@@ -516,3 +562,17 @@ func waitForPreemptingWithTimeout(f *framework.Framework, pod *v1.Pod, timeout t
 	})
 	framework.ExpectNoError(err, "pod %v/%v failed to preempt other pods", pod.Namespace, pod.Name)
 }
+
+func getCurrentPodUsageOnTheNode(nodeName string, pods []v1.Pod, resource *v1.ResourceRequirements) (int64, int64) {
+	totalRequestedCPUResource := resource.Requests.Cpu().MilliValue()
+	totalRequestedMemResource := resource.Requests.Memory().Value()
+	for _, pod := range pods {
+		if pod.Spec.NodeName != nodeName || v1qos.GetPodQOS(&pod) == v1.PodQOSBestEffort {
+			continue
+		}
+		result := getNonZeroRequests(&pod)
+		totalRequestedCPUResource += result.MilliCPU
+		totalRequestedMemResource += result.Memory
+	}
+	return totalRequestedCPUResource, totalRequestedMemResource
+}
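The new helper seeds its totals with the passed-in requests, skips pods bound to other nodes and BestEffort pods, and accumulates per-pod usage via the suite's getNonZeroRequests. A self-contained approximation using only public client-go types (the requestedOnNode function and sample pod are invented for the example; unlike the real helper it simply sums the requests containers explicitly declare):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// requestedOnNode sums the declared CPU (milli) and memory (bytes) requests of
// every pod bound to the given node.
func requestedOnNode(nodeName string, pods []v1.Pod) (milliCPU, memBytes int64) {
	for _, pod := range pods {
		if pod.Spec.NodeName != nodeName {
			continue
		}
		for _, c := range pod.Spec.Containers {
			milliCPU += c.Resources.Requests.Cpu().MilliValue()
			memBytes += c.Resources.Requests.Memory().Value()
		}
	}
	return milliCPU, memBytes
}

func main() {
	pod := v1.Pod{}
	pod.Spec.NodeName = "node-1"
	pod.Spec.Containers = []v1.Container{{
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("500m"),
				v1.ResourceMemory: resource.MustParse("1Gi"),
			},
		},
	}}
	cpu, mem := requestedOnNode("node-1", []v1.Pod{pod})
	fmt.Println(cpu, mem) // 500 1073741824
}
```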
