Commit 8e9d76f

e2e: topomgr: validate all containers in pod
Up until now, the test validated the alignment of resources only in the first container of a pod. That was just an oversight. With this patch, we validate all the containers in a given pod.

Signed-off-by: Francesco Romani <[email protected]>
1 parent ddc18ea commit 8e9d76f
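
The gist of the change, as a minimal self-contained sketch (not the actual test code: checkContainer is a hypothetical stand-in for the real CPU and PCI alignment checks shown in the diff below). Instead of inspecting only pod.Spec.Containers[0], the validation loops over every container in the pod spec:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// checkContainer is a hypothetical stand-in for the per-container NUMA
// alignment checks (CPU-to-NUMA and PCI-device-to-NUMA maps).
func checkContainer(pod *v1.Pod, cnt *v1.Container) error {
	fmt.Printf("validating container %q of pod %q\n", cnt.Name, pod.Name)
	return nil
}

// validateAllContainers mirrors the new shape of validatePodAlignment:
// every container in the pod spec is checked, not only Containers[0].
func validateAllContainers(pod *v1.Pod) error {
	for i := range pod.Spec.Containers {
		cnt := &pod.Spec.Containers[i]
		if err := checkContainer(pod, cnt); err != nil {
			return fmt.Errorf("NUMA alignment check failed for [%s] of pod [%s]: %w", cnt.Name, pod.Name, err)
		}
	}
	return nil
}

func main() {
	pod := &v1.Pod{}
	pod.Name = "gu-pod-0"
	pod.Spec.Containers = []v1.Container{{Name: "gu-container-0"}, {Name: "gu-container-1"}}
	if err := validateAllContainers(pod); err != nil {
		fmt.Println(err)
	}
}

The real implementation threads the chosen container (cnt) through checkNUMAAlignment, getCPUToNUMANodeMapFromEnv, and getPCIDeviceToNumaNodeMapFromEnv, as the hunks below show.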

2 files changed: 129 additions, 54 deletions
test/e2e_node/numa_alignment.go

Lines changed: 16 additions & 11 deletions
@@ -89,7 +89,7 @@ func getCPUsPerNUMANode(nodeNum int) ([]int, error) {
 	return cpus.ToSlice(), nil
 }
 
-func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string, numaNodes int) (map[int]int, error) {
+func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string, numaNodes int) (map[int]int, error) {
 	var cpuIDs []int
 	cpuListAllowedEnvVar := "CPULIST_ALLOWED"
 
@@ -103,12 +103,12 @@ func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map
 		}
 	}
 	if len(cpuIDs) == 0 {
-		return nil, fmt.Errorf("variable %q found in environ", cpuListAllowedEnvVar)
+		return nil, fmt.Errorf("variable %q not found in environ", cpuListAllowedEnvVar)
 	}
 
 	cpusPerNUMA := make(map[int][]int)
 	for numaNode := 0; numaNode < numaNodes; numaNode++ {
-		nodeCPUList := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
+		nodeCPUList := f.ExecCommandInContainer(pod.Name, cnt.Name,
 			"/bin/cat", fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", numaNode))
 
 		cpus, err := cpuset.Parse(nodeCPUList)
@@ -138,7 +138,7 @@ func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map
 	return CPUMap, nil
 }
 
-func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string) (map[string]int, error) {
+func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string) (map[string]int, error) {
 	pciDevPrefix := "PCIDEVICE_"
 	// at this point we don't care which plugin selected the device,
 	// we only need to know which devices were assigned to the POD.
@@ -153,14 +153,11 @@ func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, envir
 		// a single plugin can allocate more than a single device
 		pciDevs := strings.Split(value, ",")
 		for _, pciDev := range pciDevs {
-			pciDevNUMANode := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
+			pciDevNUMANode := f.ExecCommandInContainer(pod.Name, cnt.Name,
 				"/bin/cat", fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", pciDev))
 			NUMAPerDev[pciDev] = numaNodeFromSysFsEntry(pciDevNUMANode)
 		}
 	}
-	if len(NUMAPerDev) == 0 {
-		return nil, fmt.Errorf("no PCI devices found in environ")
-	}
 	return NUMAPerDev, nil
 }

@@ -180,22 +177,30 @@ func makeEnvMap(logs string) (map[string]string, error) {
 	return envMap, nil
 }
 
-func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, logs string, numaNodes int) (numaPodResources, error) {
+func containerWantsDevices(cnt *v1.Container, hwinfo testEnvHWInfo) bool {
+	_, found := cnt.Resources.Requests[v1.ResourceName(hwinfo.sriovResourceName)]
+	return found
+}
+
+func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, logs string, hwinfo testEnvHWInfo) (numaPodResources, error) {
 	podEnv, err := makeEnvMap(logs)
 	if err != nil {
 		return numaPodResources{}, err
 	}
 
-	CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, podEnv, numaNodes)
+	CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, cnt, podEnv, hwinfo.numaNodes)
 	if err != nil {
 		return numaPodResources{}, err
 	}
 
-	PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, podEnv)
+	PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, cnt, podEnv)
 	if err != nil {
 		return numaPodResources{}, err
 	}
 
+	if containerWantsDevices(cnt, hwinfo) && len(PCIDevsToNUMANode) == 0 {
+		return numaPodResources{}, fmt.Errorf("no PCI devices found in environ")
+	}
 	numaRes := numaPodResources{
 		CPUToNUMANode:     CPUToNUMANode,
 		PCIDevsToNUMANode: PCIDevsToNUMANode,

test/e2e_node/topology_manager_test.go

Lines changed: 113 additions & 43 deletions
@@ -31,10 +31,12 @@ import (
 
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
 	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+	"k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -304,16 +306,17 @@ func deletePodInNamespace(f *framework.Framework, namespace, name string) {
 	framework.ExpectNoError(err)
 }
 
-func validatePodAlignment(f *framework.Framework, pod *v1.Pod, numaNodes int) {
-	ginkgo.By("validating the Gu pod")
-	logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
-	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-		pod.Spec.Containers[0].Name, pod.Name)
+func validatePodAlignment(f *framework.Framework, pod *v1.Pod, hwinfo testEnvHWInfo) {
+	for _, cnt := range pod.Spec.Containers {
+		ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))
 
-	framework.Logf("got pod logs: %v", logs)
-	numaRes, err := checkNUMAAlignment(f, pod, logs, numaNodes)
-	framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]: %s",
-		pod.Spec.Containers[0].Name, pod.Name, numaRes.String())
+		logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
+		framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", cnt.Name, pod.Name)
+
+		framework.Logf("got pod logs: %v", logs)
+		numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, hwinfo)
+		framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]: %s", cnt.Name, pod.Name, numaRes.String())
+	}
 }
 
 func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
@@ -542,21 +545,27 @@ func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
 	waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
 }
 
-func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
-	var pods []*v1.Pod
-
-	for podID := 0; podID < numPods; podID++ {
-		ctnAttrs := []tmCtnAttribute{
-			{
-				ctnName:       "gu-container",
-				cpuRequest:    cpuAmount,
-				cpuLimit:      cpuAmount,
-				deviceName:    sriovResourceName,
-				deviceRequest: deviceAmount,
-				deviceLimit:   deviceAmount,
+func waitForAllContainerRemoval(podName, podNS string) {
+	rs, _, err := getCRIClient()
+	framework.ExpectNoError(err)
+	gomega.Eventually(func() bool {
+		containers, err := rs.ListContainers(&runtimeapi.ContainerFilter{
+			LabelSelector: map[string]string{
+				types.KubernetesPodNameLabel:      podName,
+				types.KubernetesPodNamespaceLabel: podNS,
 			},
+		})
+		if err != nil {
+			return false
 		}
+		return len(containers) == 0
+	}, 2*time.Minute, 1*time.Second).Should(gomega.BeTrue())
+}
+
+func runTopologyManagerPositiveTest(f *framework.Framework, numPods int, ctnAttrs []tmCtnAttribute, hwinfo testEnvHWInfo) {
+	var pods []*v1.Pod
 
+	for podID := 0; podID < numPods; podID++ {
 		podName := fmt.Sprintf("gu-pod-%d", podID)
 		framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
 		pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
@@ -566,30 +575,19 @@ func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods i
 	}
 
 	for podID := 0; podID < numPods; podID++ {
-		validatePodAlignment(f, pods[podID], numaNodes)
+		validatePodAlignment(f, pods[podID], hwinfo)
 	}
 
 	for podID := 0; podID < numPods; podID++ {
 		pod := pods[podID]
-		framework.Logf("deleting the pod %s/%s and waiting for container %s removal",
-			pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
+		framework.Logf("deleting the pod %s/%s and waiting for container removal",
+			pod.Namespace, pod.Name)
 		deletePods(f, []string{pod.Name})
-		waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+		waitForAllContainerRemoval(pod.Name, pod.Namespace)
 	}
 }
 
-func runTopologyManagerNegativeTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
-	ctnAttrs := []tmCtnAttribute{
-		{
-			ctnName:       "gu-container",
-			cpuRequest:    cpuAmount,
-			cpuLimit:      cpuAmount,
-			deviceName:    sriovResourceName,
-			deviceRequest: deviceAmount,
-			deviceLimit:   deviceAmount,
-		},
-	}
-
+func runTopologyManagerNegativeTest(f *framework.Framework, numPods int, ctnAttrs []tmCtnAttribute, hwinfo testEnvHWInfo) {
 	podName := "gu-pod"
 	framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
 	pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
@@ -682,47 +680,119 @@ func teardownSRIOVConfigOrFail(f *framework.Framework, dpPod *v1.Pod) {
 	waitForContainerRemoval(dpPod.Spec.Containers[0].Name, dpPod.Name, dpPod.Namespace)
 }
 
+type testEnvHWInfo struct {
+	numaNodes         int
+	sriovResourceName string
+}
+
 func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs string, numaNodes, coreCount int) {
 	threadsPerCore := 1
 	if isHTEnabled() {
 		threadsPerCore = 2
 	}
 
 	dpPod, sriovResourceName, sriovResourceAmount := setupSRIOVConfigOrFail(f, configMap)
+	hwinfo := testEnvHWInfo{
+		numaNodes:         numaNodes,
+		sriovResourceName: sriovResourceName,
+	}
 
 	// could have been a loop, we unroll it to explain the testcases
+	var ctnAttrs []tmCtnAttribute
 
 	// simplest case
 	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sriovResourceName))
-	runTopologyManagerPositiveTest(f, numaNodes, 1, "1000m", sriovResourceName, "1")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sriovResourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerPositiveTest(f, 1, ctnAttrs, hwinfo)
 
 	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sriovResourceName))
-	runTopologyManagerPositiveTest(f, numaNodes, 1, "2000m", sriovResourceName, "1")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "2000m",
+			cpuLimit:      "2000m",
+			deviceName:    sriovResourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerPositiveTest(f, 1, ctnAttrs, hwinfo)
 
 	if reservedSystemCPUs != "" {
 		// to avoid false negatives, we have put reserved CPUs in such a way there is at least a NUMA node
 		// with 1+ SRIOV devices and not reserved CPUs.
 		numCores := threadsPerCore * coreCount
+		allCoresReq := fmt.Sprintf("%dm", numCores*1000)
 		ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sriovResourceName))
-		runTopologyManagerPositiveTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
+		ctnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "gu-container",
+				cpuRequest:    allCoresReq,
+				cpuLimit:      allCoresReq,
+				deviceName:    sriovResourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+			},
+		}
+		runTopologyManagerPositiveTest(f, 1, ctnAttrs, hwinfo)
 	}
 
 	if sriovResourceAmount > 1 {
 		// no matter how busses are connected to NUMA nodes and SRIOV devices are installed, this function
 		// preconditions must ensure the following can be fulfilled
 		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sriovResourceName))
-		runTopologyManagerPositiveTest(f, numaNodes, 2, "1000m", sriovResourceName, "1")
+		ctnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "gu-container",
+				cpuRequest:    "1000m",
+				cpuLimit:      "1000m",
+				deviceName:    sriovResourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+			},
+		}
+		runTopologyManagerPositiveTest(f, 2, ctnAttrs, hwinfo)
 
 		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sriovResourceName))
-		runTopologyManagerPositiveTest(f, numaNodes, 2, "2000m", sriovResourceName, "1")
+		ctnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "gu-container",
+				cpuRequest:    "2000m",
+				cpuLimit:      "2000m",
+				deviceName:    sriovResourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+			},
+		}
+		runTopologyManagerPositiveTest(f, 2, ctnAttrs, hwinfo)
 
 		// testing more complex conditions require knowledge about the system cpu+bus topology
 	}
 
 	// overflow NUMA node capacity: cores
 	numCores := 1 + (threadsPerCore * coreCount)
+	excessCoresReq := fmt.Sprintf("%dm", numCores*1000)
 	ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pods, with %d cores, 1 %s device - and it should be rejected", numCores, sriovResourceName))
-	runTopologyManagerNegativeTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    excessCoresReq,
+			cpuLimit:      excessCoresReq,
+			deviceName:    sriovResourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerNegativeTest(f, 1, ctnAttrs, hwinfo)
 
 	teardownSRIOVConfigOrFail(f, dpPod)
 }
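
Since ctnAttrs is now built by the callers rather than hard-coded inside the test runners, a test case is free to describe pods with more than one container. A hedged sketch of such a follow-up case (illustrative only, not part of this commit; it assumes the same e2e_node test package, reusing tmCtnAttribute, runTopologyManagerPositiveTest, and the hwinfo value from the diff above):

// Hypothetical multi-container case, not part of this commit:
// one guaranteed pod with two containers, each requesting 1 CPU core
// and 1 SR-IOV device; validatePodAlignment now checks both containers.
ctnAttrs = []tmCtnAttribute{
	{
		ctnName:       "gu-container-0",
		cpuRequest:    "1000m",
		cpuLimit:      "1000m",
		deviceName:    sriovResourceName,
		deviceRequest: "1",
		deviceLimit:   "1",
	},
	{
		ctnName:       "gu-container-1",
		cpuRequest:    "1000m",
		cpuLimit:      "1000m",
		deviceName:    sriovResourceName,
		deviceRequest: "1",
		deviceLimit:   "1",
	},
}
runTopologyManagerPositiveTest(f, 1, ctnAttrs, hwinfo)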
