Commit 7226a30

DRA e2e: adapt to increased ReservedFor limit
We want to be sure that the maximum number of pods per claim are actually scheduled concurrently. Previously the test only made sure that they ran eventually.

Running 256 pods at the same time only works on more than 2 nodes, so network-attached resources have to be used; this is what the increased limit is meant for anyway. Because of the tightened validation of node selectors in 1.32, the E2E test now has to use MatchExpressions, which allow listing node names.
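
For illustration only (not part of the commit message), the selector shape the test driver switches to looks roughly like the sketch below; nodeSelectorFor and its nodeNames parameter are illustrative names. A MatchFields requirement on metadata.name accepts only a single value under the tightened 1.32 validation, whereas a MatchExpressions requirement on the kubernetes.io/hostname label can list every test node:

package drasketch

import v1 "k8s.io/api/core/v1"

// nodeSelectorFor builds a selector that matches exactly the given nodes via
// their kubernetes.io/hostname label (which equals the node name on kind clusters).
func nodeSelectorFor(nodeNames []string) *v1.NodeSelector {
	return &v1.NodeSelector{
		NodeSelectorTerms: []v1.NodeSelectorTerm{{
			MatchExpressions: []v1.NodeSelectorRequirement{{
				Key:      "kubernetes.io/hostname",
				Operator: v1.NodeSelectorOpIn,
				Values:   nodeNames, // multiple node names are allowed here
			}},
		}},
	}
}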
1 parent 1cee368 commit 7226a30

3 files changed: +123, -40 lines

test/e2e/dra/deploy.go (4 additions, 2 deletions)

@@ -277,8 +277,10 @@ func (d *Driver) SetUp(nodes *Nodes, resources Resources, devicesPerNode ...map[
 				},
 				NodeSelector: &v1.NodeSelector{
 					NodeSelectorTerms: []v1.NodeSelectorTerm{{
-						MatchFields: []v1.NodeSelectorRequirement{{
-							Key:      "metadata.name",
+						// MatchExpressions allow multiple values,
+						// MatchFields don't.
+						MatchExpressions: []v1.NodeSelectorRequirement{{
+							Key:      "kubernetes.io/hostname",
 							Operator: v1.NodeSelectorOpIn,
 							Values:   nodes.NodeNames,
 						}},

test/e2e/dra/dra.go (112 additions, 38 deletions)

@@ -554,43 +554,6 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation,
 			wg.Wait()
 		})

-		f.It("supports sharing a claim sequentially", f.WithSlow(), func(ctx context.Context) {
-			var objects []klog.KMetadata
-			objects = append(objects, b.externalClaim())
-
-			// This test used to test usage of the claim by one pod
-			// at a time. After removing the "not sharable"
-			// feature, we have to create more pods than supported
-			// at the same time to get the same effect.
-			numPods := resourceapi.ResourceClaimReservedForMaxSize + 10
-			pods := make([]*v1.Pod, numPods)
-			for i := 0; i < numPods; i++ {
-				pod := b.podExternal()
-				pods[i] = pod
-				objects = append(objects, pod)
-			}
-
-			b.create(ctx, objects...)
-
-			// We don't know the order. All that matters is that all of them get scheduled eventually.
-			f.Timeouts.PodStartSlow *= time.Duration(numPods)
-			var wg sync.WaitGroup
-			wg.Add(numPods)
-			for i := 0; i < numPods; i++ {
-				pod := pods[i]
-				go func() {
-					defer ginkgo.GinkgoRecover()
-					defer wg.Done()
-					b.testPod(ctx, f.ClientSet, pod, expectedEnv...)
-					// We need to delete each running pod, otherwise the others cannot use the claim.
-					err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
-					framework.ExpectNoError(err, "delete pod")
-					framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodStartSlow))
-				}()
-			}
-			wg.Wait()
-		})
-
 		ginkgo.It("retries pod scheduling after creating device class", func(ctx context.Context) {
 			var objects []klog.KMetadata
 			pod, template := b.podInline()
@@ -667,7 +630,7 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation,
 	// The following tests only make sense when there is more than one node.
 	// They get skipped when there's only one node.
 	multiNodeTests := func() {
-		nodes := NewNodes(f, 2, 8)
+		nodes := NewNodes(f, 3, 8)

 		ginkgo.Context("with different ResourceSlices", func() {
 			firstDevice := "pre-defined-device-01"
@@ -790,6 +753,117 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation,
 				}
 			})
 		})
+
+		ginkgo.Context("with network-attached resources", func() {
+			driver := NewDriver(f, nodes, networkResources)
+			b := newBuilder(f, driver)
+
+			f.It("supports sharing a claim sequentially", f.WithSlow(), func(ctx context.Context) {
+				var objects []klog.KMetadata
+				objects = append(objects, b.externalClaim())
+
+				// This test used to test usage of the claim by one pod
+				// at a time. After removing the "not sharable"
+				// feature and bumping up the maximum number of
+				// consumers this is now a stress test which runs
+				// the maximum number of pods per claim in parallel.
+				// This only works on clusters with >= 3 nodes.
+				numMaxPods := resourceapi.ResourceClaimReservedForMaxSize
+				ginkgo.By(fmt.Sprintf("Creating %d pods sharing the same claim", numMaxPods))
+				pods := make([]*v1.Pod, numMaxPods)
+				for i := 0; i < numMaxPods; i++ {
+					pod := b.podExternal()
+					pods[i] = pod
+					objects = append(objects, pod)
+				}
+				b.create(ctx, objects...)
+
+				timeout := f.Timeouts.PodStartSlow * time.Duration(numMaxPods)
+				ensureDuration := f.Timeouts.PodStart // Don't check for too long, even if it is less precise.
+				podIsPending := gomega.HaveField("Spec.NodeName", gomega.BeEmpty())
+				waitForPodScheduled := func(pod *v1.Pod) {
+					ginkgo.GinkgoHelper()
+					gomega.Eventually(ctx, framework.GetObject(f.ClientSet.CoreV1().Pods(pod.Namespace).Get, pod.Name, metav1.GetOptions{})).
+						WithTimeout(timeout).
+						WithPolling(10*time.Second).
+						ShouldNot(podIsPending, "Pod should get scheduled.")
+				}
+				ensurePodNotScheduled := func(pod *v1.Pod) {
+					ginkgo.GinkgoHelper()
+					gomega.Consistently(ctx, framework.GetObject(f.ClientSet.CoreV1().Pods(pod.Namespace).Get, pod.Name, metav1.GetOptions{})).
+						WithTimeout(ensureDuration).
+						WithPolling(10*time.Second).
+						Should(podIsPending, "Pod should remain pending.")
+				}
+
+				// We don't know the order. All that matters is that all of them get scheduled eventually.
+				ginkgo.By(fmt.Sprintf("Waiting for %d pods to be scheduled", numMaxPods))
+				f.Timeouts.PodStartSlow *= time.Duration(numMaxPods)
+				var wg sync.WaitGroup
+				wg.Add(numMaxPods)
+				for i := 0; i < numMaxPods; i++ {
+					pod := pods[i]
+					go func() {
+						defer ginkgo.GinkgoRecover()
+						defer wg.Done()
+						waitForPodScheduled(pod)
+					}()
+				}
+				wg.Wait()
+
+				numMorePods := 10
+				ginkgo.By(fmt.Sprintf("Creating %d additional pods for the same claim", numMorePods))
+				morePods := make([]*v1.Pod, numMorePods)
+				objects = nil
+				for i := 0; i < numMorePods; i++ {
+					pod := b.podExternal()
+					morePods[i] = pod
+					objects = append(objects, pod)
+				}
+				b.create(ctx, objects...)
+
+				// None of the additional pods can run because of the ReservedFor limit.
+				ginkgo.By(fmt.Sprintf("Check for %s that the additional pods don't get scheduled", ensureDuration))
+				wg.Add(numMorePods)
+				for i := 0; i < numMorePods; i++ {
+					pod := morePods[i]
+					go func() {
+						defer ginkgo.GinkgoRecover()
+						defer wg.Done()
+						ensurePodNotScheduled(pod)
+					}()
+				}
+				wg.Wait()
+
+				// We need to delete each running pod, otherwise the new ones cannot use the claim.
+				ginkgo.By(fmt.Sprintf("Deleting the initial %d pods", numMaxPods))
+				wg.Add(numMaxPods)
+				for i := 0; i < numMaxPods; i++ {
+					pod := pods[i]
+					go func() {
+						defer ginkgo.GinkgoRecover()
+						defer wg.Done()
+						err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+						framework.ExpectNoError(err, "delete pod")
+						framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodStartSlow))
+					}()
+				}
+				wg.Wait()
+
+				// Now those should also run - eventually...
+				ginkgo.By(fmt.Sprintf("Waiting for the additional %d pods to be scheduled", numMorePods))
+				wg.Add(numMorePods)
+				for i := 0; i < numMorePods; i++ {
+					pod := morePods[i]
+					go func() {
+						defer ginkgo.GinkgoRecover()
+						defer wg.Done()
+						waitForPodScheduled(pod)
+					}()
+				}
+				wg.Wait()
+			})
+		})
 	}

 	ginkgo.Context("on single node", func() {

test/e2e/dra/kind.yaml (7 additions, 0 deletions)

@@ -40,6 +40,13 @@ nodes:
     nodeRegistration:
       kubeletExtraArgs:
         v: "5"
+- role: worker
+  kubeadmConfigPatches:
+  - |
+    kind: JoinConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        v: "5"
 # Feature gates must be the last entry in this YAML.
 # Some Prow jobs add more feature gates with
 #
