Commit 27cbe54

Merge pull request kubernetes#130163 from ffromani/e2e-node-fix-cpu-quota-test
e2e: node: cpumgr: cleanup after each test case
2 parents 566f939 + 3234106 · commit 27cbe54
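
The commit replaces a single end-of-suite cleanup loop with per-test-case deletion plus a deferred fallback sweep that uses its own generous timeout. Below is a minimal standalone sketch of that pattern; it is not the PR's code, and the names and types are illustrative stand-ins for the test's podsToClean map and helpers.

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	// Track created resources by UID, mirroring the podsToClean map in the test.
	toClean := map[string]string{} // uid -> pod name (illustrative stand-in for *v1.Pod)

	deleteOne := func(uid string) {
		// Use a separate, generous timeout so a parent deadline cannot abort cleanup,
		// as the PR does with context.WithTimeout(context.Background(), 10*time.Minute).
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		defer cancel()
		_ = ctx // a real test would delete the pod and wait for container removal here
		fmt.Printf("deleted pod %s\n", toClean[uid])
		delete(toClean, uid)
	}

	// Fallback sweep for anything a failed test case left behind,
	// analogous to the ginkgo.DeferCleanup block in the test.
	defer func() {
		for uid := range toClean {
			deleteOne(uid)
		}
	}()

	// Happy path: each test case registers its pod and deletes it as soon as it is done.
	toClean["uid-1"] = "gu-pod1"
	deleteOne("uid-1")
}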

File tree

2 files changed, +114 -77 lines changed


test/e2e_node/cpu_manager_test.go

Lines changed: 106 additions & 74 deletions
@@ -592,24 +592,36 @@ func runMultipleCPUContainersGuPod(ctx context.Context, f *framework.Framework)
     waitForContainerRemoval(ctx, pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
 }
 
-func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool) {
+func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool, cpuAlloc int64) {
     var err error
     var ctnAttrs []ctnAttribute
     var pod1, pod2, pod3 *v1.Pod
-    var cleanupPods []*v1.Pod
-    ginkgo.DeferCleanup(func() {
+    podsToClean := make(map[string]*v1.Pod) // pod.UID -> pod
+
+    framework.Logf("runCfsQuotaGuPods: disableQuota=%v, CPU Allocatable=%v", disabledCPUQuotaWithExclusiveCPUs, cpuAlloc)
+
+    deleteTestPod := func(pod *v1.Pod) {
         // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
         // 'deadline expired' message and the cleanup aborts, which we don't want.
-        ctx2 := context.TODO()
+        // So let's use a separate and more generous timeout (determined by trial and error)
+        ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+        defer cancel()
+        deletePodSyncAndWait(ctx2, f, pod.Namespace, pod.Name)
+        delete(podsToClean, string(pod.UID))
+    }
+
+    // cleanup leftovers on test failure. The happy path is covered by `deleteTestPod` calls
+    ginkgo.DeferCleanup(func() {
         ginkgo.By("by deleting the pods and waiting for container removal")
-        for _, cleanupPod := range cleanupPods {
-            framework.Logf("deleting pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-            deletePodSyncByName(ctx2, f, cleanupPod.Name)
-            waitForContainerRemoval(ctx2, cleanupPod.Spec.Containers[0].Name, cleanupPod.Name, cleanupPod.Namespace)
-            framework.Logf("deleted pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-        }
+        // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
+        // 'deadline expired' message and the cleanup aborts, which we don't want.
+        // So let's use a separate and more generous timeout (determined by trial and error)
+        ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+        defer cancel()
+        deletePodsAsync(ctx2, f, podsToClean)
     })
 
+    podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
     cfsCheckCommand := []string{"sh", "-c", "cat /sys/fs/cgroup/cpu.max && sleep 1d"}
     defaultPeriod := "100000"
 
@@ -623,7 +635,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod1 = makeCPUManagerPod("gu-pod1", ctnAttrs)
     pod1.Spec.Containers[0].Command = cfsCheckCommand
     pod1 = e2epod.NewPodClient(f).CreateSync(ctx, pod1)
-    cleanupPods = append(cleanupPods, pod1)
+    podsToClean[string(pod1.UID)] = pod1
 
     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, exclusive CPUs, unlimited)")
 
@@ -635,6 +647,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod1.Name, pod1.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod1.Spec.Containers[0].Name, pod1.Name)
+    deleteTestPod(pod1)
 
     ctnAttrs = []ctnAttribute{
         {
@@ -646,7 +659,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod2 = makeCPUManagerPod("gu-pod2", ctnAttrs)
     pod2.Spec.Containers[0].Command = cfsCheckCommand
     pod2 = e2epod.NewPodClient(f).CreateSync(ctx, pod2)
-    cleanupPods = append(cleanupPods, pod2)
+    podsToClean[string(pod2.UID)] = pod2
 
     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, limited)")
 
@@ -655,6 +668,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod2.Name, pod2.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod2.Spec.Containers[0].Name, pod2.Name)
+    deleteTestPod(pod2)
 
     ctnAttrs = []ctnAttribute{
         {
@@ -666,7 +680,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod3 = makeCPUManagerPod("non-gu-pod3", ctnAttrs)
     pod3.Spec.Containers[0].Command = cfsCheckCommand
     pod3 = e2epod.NewPodClient(f).CreateSync(ctx, pod3)
-    cleanupPods = append(cleanupPods, pod3)
+    podsToClean[string(pod3.UID)] = pod3
 
     ginkgo.By("checking if the expected cfs quota was assigned (BU pod, limited)")
 
@@ -675,73 +689,79 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod3.Name, pod3.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod3.Spec.Containers[0].Name, pod3.Name)
+    deleteTestPod(pod3)
 
-    ctnAttrs = []ctnAttribute{
-        {
-            ctnName:    "gu-container-non-int-values",
-            cpuRequest: "500m",
-            cpuLimit:   "500m",
-        },
-        {
-            ctnName:    "gu-container-int-values",
-            cpuRequest: "1",
-            cpuLimit:   "1",
-        },
-    }
-    pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
-    pod4.Spec.Containers[0].Command = cfsCheckCommand
-    pod4.Spec.Containers[1].Command = cfsCheckCommand
-    pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
-    cleanupPods = append(cleanupPods, pod4)
+    if cpuAlloc >= 2 {
+        ctnAttrs = []ctnAttribute{
+            {
+                ctnName:    "gu-container-non-int-values",
+                cpuRequest: "500m",
+                cpuLimit:   "500m",
+            },
+            {
+                ctnName:    "gu-container-int-values",
+                cpuRequest: "1",
+                cpuLimit:   "1",
+            },
+        }
+        pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
+        pod4.Spec.Containers[0].Command = cfsCheckCommand
+        pod4.Spec.Containers[1].Command = cfsCheckCommand
+        pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
+        podsToClean[string(pod4.UID)] = pod4
+
+        ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
+
+        expectedQuota = "50000"
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+            pod4.Spec.Containers[0].Name, pod4.Name)
+        expectedQuota = "100000"
+        if disabledCPUQuotaWithExclusiveCPUs {
+            expectedQuota = "max"
+        }
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+            pod4.Spec.Containers[1].Name, pod4.Name)
+        deleteTestPod(pod4)
 
-    ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
+        ctnAttrs = []ctnAttribute{
+            {
+                ctnName:    "gu-container-non-int-values",
+                cpuRequest: "500m",
+                cpuLimit:   "500m",
+            },
+            {
+                ctnName:    "gu-container-int-values",
+                cpuRequest: "1",
+                cpuLimit:   "1",
+            },
+        }
 
-    expectedQuota = "50000"
-    expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-    err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-        pod4.Spec.Containers[0].Name, pod4.Name)
-    expectedQuota = "100000"
-    if disabledCPUQuotaWithExclusiveCPUs {
-        expectedQuota = "max"
-    }
-    expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-    err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-        pod4.Spec.Containers[1].Name, pod4.Name)
+        pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
+        pod5.Spec.Containers[0].Command = podCFSCheckCommand
+        pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
+        podsToClean[string(pod5.UID)] = pod5
 
-    ctnAttrs = []ctnAttribute{
-        {
-            ctnName:    "gu-container-non-int-values",
-            cpuRequest: "500m",
-            cpuLimit:   "500m",
-        },
-        {
-            ctnName:    "gu-container-int-values",
-            cpuRequest: "1",
-            cpuLimit:   "1",
-        },
-    }
+        ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
 
-    podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
+        expectedQuota = "150000"
 
-    pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
-    pod5.Spec.Containers[0].Command = podCFSCheckCommand
-    pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
-    cleanupPods = append(cleanupPods, pod5)
-    ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
+        if disabledCPUQuotaWithExclusiveCPUs {
+            expectedQuota = "max"
+        }
 
-    expectedQuota = "150000"
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
 
-    if disabledCPUQuotaWithExclusiveCPUs {
-        expectedQuota = "max"
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
+        deleteTestPod(pod5)
+    } else {
+        ginkgo.By(fmt.Sprintf("some cases SKIPPED - requests at least %d allocatable cores, got %d", 2, cpuAlloc))
     }
 
-    expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-
-    err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
-
     ctnAttrs = []ctnAttribute{
         {
             ctnName:    "gu-container",
@@ -753,15 +773,15 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod6 := makeCPUManagerPod("gu-pod6", ctnAttrs)
     pod6.Spec.Containers[0].Command = podCFSCheckCommand
     pod6 = e2epod.NewPodClient(f).CreateSync(ctx, pod6)
-    cleanupPods = append(cleanupPods, pod6)
+    podsToClean[string(pod6.UID)] = pod6
 
     ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, limited)")
 
     expectedQuota = "10000"
     expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod6.Name, pod6.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod6.Spec.Containers[0].Name, pod6.Name)
-
+    deleteTestPod(pod6)
 }
 
 func runMultipleGuPods(ctx context.Context, f *framework.Framework) {
921941
if !IsCgroup2UnifiedMode() {
922942
e2eskipper.Skipf("Skipping since CgroupV2 not used")
923943
}
944+
_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
945+
if cpuAlloc < 1 { // save expensive kubelet restart
946+
e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc)
947+
}
924948
newCfg := configureCPUManagerInKubelet(oldCfg,
925949
&cpuManagerKubeletArguments{
926950
policyName: string(cpumanager.PolicyStatic),
@@ -929,13 +953,19 @@ func runCPUManagerTests(f *framework.Framework) {
             },
         )
         updateKubeletConfig(ctx, f, newCfg, true)
-        runCfsQuotaGuPods(ctx, f, true)
+
+        _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
+        runCfsQuotaGuPods(ctx, f, true, cpuAlloc)
     })
 
     ginkgo.It("should keep enforcing the CFS quota for containers with static CPUs assigned and feature gate disabled", func(ctx context.Context) {
         if !IsCgroup2UnifiedMode() {
             e2eskipper.Skipf("Skipping since CgroupV2 not used")
         }
+        _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
+        if cpuAlloc < 1 { // save expensive kubelet restart
+            e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc)
+        }
         newCfg := configureCPUManagerInKubelet(oldCfg,
             &cpuManagerKubeletArguments{
                 policyName: string(cpumanager.PolicyStatic),
@@ -945,7 +975,9 @@ func runCPUManagerTests(f *framework.Framework) {
         )
 
         updateKubeletConfig(ctx, f, newCfg, true)
-        runCfsQuotaGuPods(ctx, f, false)
+
+        _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
+        runCfsQuotaGuPods(ctx, f, false, cpuAlloc)
     })
 
     f.It("should not reuse CPUs of restartable init containers", feature.SidecarContainers, func(ctx context.Context) {

test/e2e_node/topology_manager_test.go

Lines changed: 8 additions & 3 deletions
@@ -463,14 +463,19 @@ func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[str
         go func(podNS, podName string) {
             defer ginkgo.GinkgoRecover()
             defer wg.Done()
-
-            deletePodSyncByName(ctx, f, podName)
-            waitForAllContainerRemoval(ctx, podName, podNS)
+            deletePodSyncAndWait(ctx, f, podNS, podName)
         }(pod.Namespace, pod.Name)
     }
     wg.Wait()
 }
 
+func deletePodSyncAndWait(ctx context.Context, f *framework.Framework, podNS, podName string) {
+    framework.Logf("deleting pod: %s/%s", podNS, podName)
+    deletePodSyncByName(ctx, f, podName)
+    waitForAllContainerRemoval(ctx, podName, podNS)
+    framework.Logf("deleted pod: %s/%s", podNS, podName)
+}
+
 func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
     podName := "gu-pod"
     framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
