Commit 27cbe54

Merge pull request kubernetes#130163 from ffromani/e2e-node-fix-cpu-quota-test
e2e: node: cpumgr: cleanup after each test case
2 parents 566f939 + 3234106 · commit 27cbe54
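
The commit replaces a single end-of-suite cleanup loop with per-test-case deletion plus a deferred fallback sweep that uses its own generous timeout. Below is a minimal standalone sketch of that pattern; it is not the PR's code, and the names and types are illustrative stand-ins for the test's podsToClean map and helpers.

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	// Track created resources by UID, mirroring the podsToClean map in the test.
	toClean := map[string]string{} // uid -> pod name (illustrative stand-in for *v1.Pod)

	deleteOne := func(uid string) {
		// Use a separate, generous timeout so a parent deadline cannot abort cleanup,
		// as the PR does with context.WithTimeout(context.Background(), 10*time.Minute).
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		defer cancel()
		_ = ctx // a real test would delete the pod and wait for container removal here
		fmt.Printf("deleted pod %s\n", toClean[uid])
		delete(toClean, uid)
	}

	// Fallback sweep for anything a failed test case left behind,
	// analogous to the ginkgo.DeferCleanup block in the test.
	defer func() {
		for uid := range toClean {
			deleteOne(uid)
		}
	}()

	// Happy path: each test case registers its pod and deletes it as soon as it is done.
	toClean["uid-1"] = "gu-pod1"
	deleteOne("uid-1")
}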

File tree

2 files changed, +114 -77 lines changed


test/e2e_node/cpu_manager_test.go

Lines changed: 106 additions & 74 deletions
@@ -592,24 +592,36 @@ func runMultipleCPUContainersGuPod(ctx context.Context, f *framework.Framework)
     waitForContainerRemoval(ctx, pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
 }
 
-func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool) {
+func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool, cpuAlloc int64) {
     var err error
     var ctnAttrs []ctnAttribute
     var pod1, pod2, pod3 *v1.Pod
-    var cleanupPods []*v1.Pod
-    ginkgo.DeferCleanup(func() {
+    podsToClean := make(map[string]*v1.Pod) // pod.UID -> pod
+
+    framework.Logf("runCfsQuotaGuPods: disableQuota=%v, CPU Allocatable=%v", disabledCPUQuotaWithExclusiveCPUs, cpuAlloc)
+
+    deleteTestPod := func(pod *v1.Pod) {
         // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
         // 'deadline expired' message and the cleanup aborts, which we don't want.
-        ctx2 := context.TODO()
+        // So let's use a separate and more generous timeout (determined by trial and error)
+        ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+        defer cancel()
+        deletePodSyncAndWait(ctx2, f, pod.Namespace, pod.Name)
+        delete(podsToClean, string(pod.UID))
+    }
+
+    // cleanup leftovers on test failure. The happy path is covered by `deleteTestPod` calls
+    ginkgo.DeferCleanup(func() {
         ginkgo.By("by deleting the pods and waiting for container removal")
-        for _, cleanupPod := range cleanupPods {
-            framework.Logf("deleting pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-            deletePodSyncByName(ctx2, f, cleanupPod.Name)
-            waitForContainerRemoval(ctx2, cleanupPod.Spec.Containers[0].Name, cleanupPod.Name, cleanupPod.Namespace)
-            framework.Logf("deleted pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-        }
+        // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
+        // 'deadline expired' message and the cleanup aborts, which we don't want.
+        // So let's use a separate and more generous timeout (determined by trial and error)
+        ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+        defer cancel()
+        deletePodsAsync(ctx2, f, podsToClean)
     })
 
+    podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
     cfsCheckCommand := []string{"sh", "-c", "cat /sys/fs/cgroup/cpu.max && sleep 1d"}
     defaultPeriod := "100000"
 
@@ -623,7 +635,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod1 = makeCPUManagerPod("gu-pod1", ctnAttrs)
     pod1.Spec.Containers[0].Command = cfsCheckCommand
     pod1 = e2epod.NewPodClient(f).CreateSync(ctx, pod1)
-    cleanupPods = append(cleanupPods, pod1)
+    podsToClean[string(pod1.UID)] = pod1
 
     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, exclusive CPUs, unlimited)")
 
@@ -635,6 +647,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod1.Name, pod1.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod1.Spec.Containers[0].Name, pod1.Name)
+    deleteTestPod(pod1)
 
     ctnAttrs = []ctnAttribute{
         {
@@ -646,7 +659,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod2 = makeCPUManagerPod("gu-pod2", ctnAttrs)
     pod2.Spec.Containers[0].Command = cfsCheckCommand
     pod2 = e2epod.NewPodClient(f).CreateSync(ctx, pod2)
-    cleanupPods = append(cleanupPods, pod2)
+    podsToClean[string(pod2.UID)] = pod2
 
     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, limited)")
 
@@ -655,6 +668,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod2.Name, pod2.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod2.Spec.Containers[0].Name, pod2.Name)
+    deleteTestPod(pod2)
 
     ctnAttrs = []ctnAttribute{
         {
@@ -666,7 +680,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod3 = makeCPUManagerPod("non-gu-pod3", ctnAttrs)
     pod3.Spec.Containers[0].Command = cfsCheckCommand
     pod3 = e2epod.NewPodClient(f).CreateSync(ctx, pod3)
-    cleanupPods = append(cleanupPods, pod3)
+    podsToClean[string(pod3.UID)] = pod3
 
     ginkgo.By("checking if the expected cfs quota was assigned (BU pod, limited)")
 
@@ -675,73 +689,79 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod3.Name, pod3.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod3.Spec.Containers[0].Name, pod3.Name)
+    deleteTestPod(pod3)
 
-    ctnAttrs = []ctnAttribute{
-        {
-            ctnName:    "gu-container-non-int-values",
-            cpuRequest: "500m",
-            cpuLimit:   "500m",
-        },
-        {
-            ctnName:    "gu-container-int-values",
-            cpuRequest: "1",
-            cpuLimit:   "1",
-        },
-    }
-    pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
-    pod4.Spec.Containers[0].Command = cfsCheckCommand
-    pod4.Spec.Containers[1].Command = cfsCheckCommand
-    pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
-    cleanupPods = append(cleanupPods, pod4)
+    if cpuAlloc >= 2 {
+        ctnAttrs = []ctnAttribute{
+            {
+                ctnName:    "gu-container-non-int-values",
+                cpuRequest: "500m",
+                cpuLimit:   "500m",
+            },
+            {
+                ctnName:    "gu-container-int-values",
+                cpuRequest: "1",
+                cpuLimit:   "1",
+            },
+        }
+        pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
+        pod4.Spec.Containers[0].Command = cfsCheckCommand
+        pod4.Spec.Containers[1].Command = cfsCheckCommand
+        pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
+        podsToClean[string(pod4.UID)] = pod4
+
+        ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
+
+        expectedQuota = "50000"
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+            pod4.Spec.Containers[0].Name, pod4.Name)
+        expectedQuota = "100000"
+        if disabledCPUQuotaWithExclusiveCPUs {
+            expectedQuota = "max"
+        }
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+            pod4.Spec.Containers[1].Name, pod4.Name)
+        deleteTestPod(pod4)
 
-    ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
+        ctnAttrs = []ctnAttribute{
+            {
+                ctnName:    "gu-container-non-int-values",
+                cpuRequest: "500m",
+                cpuLimit:   "500m",
+            },
+            {
+                ctnName:    "gu-container-int-values",
+                cpuRequest: "1",
+                cpuLimit:   "1",
+            },
+        }
 
-    expectedQuota = "50000"
-    expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-    err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-        pod4.Spec.Containers[0].Name, pod4.Name)
-    expectedQuota = "100000"
-    if disabledCPUQuotaWithExclusiveCPUs {
-        expectedQuota = "max"
-    }
-    expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-    err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-        pod4.Spec.Containers[1].Name, pod4.Name)
+        pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
+        pod5.Spec.Containers[0].Command = podCFSCheckCommand
+        pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
+        podsToClean[string(pod5.UID)] = pod5
 
-    ctnAttrs = []ctnAttribute{
-        {
-            ctnName:    "gu-container-non-int-values",
-            cpuRequest: "500m",
-            cpuLimit:   "500m",
-        },
-        {
-            ctnName:    "gu-container-int-values",
-            cpuRequest: "1",
-            cpuLimit:   "1",
-        },
-    }
+        ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
 
-    podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
+        expectedQuota = "150000"
 
-    pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
-    pod5.Spec.Containers[0].Command = podCFSCheckCommand
-    pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
-    cleanupPods = append(cleanupPods, pod5)
-    ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
+        if disabledCPUQuotaWithExclusiveCPUs {
+            expectedQuota = "max"
+        }
 
-    expectedQuota = "150000"
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
 
-    if disabledCPUQuotaWithExclusiveCPUs {
-        expectedQuota = "max"
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
+        deleteTestPod(pod5)
+    } else {
+        ginkgo.By(fmt.Sprintf("some cases SKIPPED - requests at least %d allocatable cores, got %d", 2, cpuAlloc))
     }
 
-    expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-
-    err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
-
     ctnAttrs = []ctnAttribute{
         {
             ctnName:    "gu-container",
@@ -753,15 +773,15 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod6 := makeCPUManagerPod("gu-pod6", ctnAttrs)
     pod6.Spec.Containers[0].Command = podCFSCheckCommand
     pod6 = e2epod.NewPodClient(f).CreateSync(ctx, pod6)
-    cleanupPods = append(cleanupPods, pod6)
+    podsToClean[string(pod6.UID)] = pod6
 
     ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, limited)")
 
     expectedQuota = "10000"
     expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod6.Name, pod6.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod6.Spec.Containers[0].Name, pod6.Name)
-
+    deleteTestPod(pod6)
 }
 
 func runMultipleGuPods(ctx context.Context, f *framework.Framework) {
921941
if !IsCgroup2UnifiedMode() {
922942
e2eskipper.Skipf("Skipping since CgroupV2 not used")
923943
}
944+
_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
945+
if cpuAlloc < 1 { // save expensive kubelet restart
946+
e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc)
947+
}
924948
newCfg := configureCPUManagerInKubelet(oldCfg,
925949
&cpuManagerKubeletArguments{
926950
policyName: string(cpumanager.PolicyStatic),
@@ -929,13 +953,19 @@ func runCPUManagerTests(f *framework.Framework) {
             },
         )
         updateKubeletConfig(ctx, f, newCfg, true)
-        runCfsQuotaGuPods(ctx, f, true)
+
+        _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
+        runCfsQuotaGuPods(ctx, f, true, cpuAlloc)
     })
 
     ginkgo.It("should keep enforcing the CFS quota for containers with static CPUs assigned and feature gate disabled", func(ctx context.Context) {
         if !IsCgroup2UnifiedMode() {
             e2eskipper.Skipf("Skipping since CgroupV2 not used")
         }
+        _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
+        if cpuAlloc < 1 { // save expensive kubelet restart
+            e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc)
+        }
         newCfg := configureCPUManagerInKubelet(oldCfg,
             &cpuManagerKubeletArguments{
                 policyName: string(cpumanager.PolicyStatic),
@@ -945,7 +975,9 @@ func runCPUManagerTests(f *framework.Framework) {
         )
 
         updateKubeletConfig(ctx, f, newCfg, true)
-        runCfsQuotaGuPods(ctx, f, false)
+
+        _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
+        runCfsQuotaGuPods(ctx, f, false, cpuAlloc)
     })
 
     f.It("should not reuse CPUs of restartable init containers", feature.SidecarContainers, func(ctx context.Context) {

test/e2e_node/topology_manager_test.go

Lines changed: 8 additions & 3 deletions
@@ -463,14 +463,19 @@ func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[str
         go func(podNS, podName string) {
             defer ginkgo.GinkgoRecover()
             defer wg.Done()
-
-            deletePodSyncByName(ctx, f, podName)
-            waitForAllContainerRemoval(ctx, podName, podNS)
+            deletePodSyncAndWait(ctx, f, podNS, podName)
         }(pod.Namespace, pod.Name)
     }
     wg.Wait()
 }
 
+func deletePodSyncAndWait(ctx context.Context, f *framework.Framework, podNS, podName string) {
+    framework.Logf("deleting pod: %s/%s", podNS, podName)
+    deletePodSyncByName(ctx, f, podName)
+    waitForAllContainerRemoval(ctx, podName, podNS)
+    framework.Logf("deleted pod: %s/%s", podNS, podName)
+}
+
 func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
     podName := "gu-pod"
     framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
