Commit dca3f56

Add e2e test for topology manager with restartable init containers
1 parent f6f0680 commit dca3f56
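
Context for this change: a "restartable init container" is an init container whose restartPolicy is set to Always (the sidecar pattern from the SidecarContainers feature). Unlike a regular init container it does not run to completion before the app containers start; it keeps running for the whole pod lifetime, so the topology manager must include its CPU and device allocations in NUMA alignment decisions. A minimal sketch of the pod shape under test (names and image are illustrative, not taken from this commit):

    package main

    import (
        v1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // sidecarPod sketches a pod with one restartable init container
    // (restartPolicy: Always) next to a regular app container.
    func sidecarPod() *v1.Pod {
        always := v1.ContainerRestartPolicyAlways
        return &v1.Pod{
            ObjectMeta: metav1.ObjectMeta{Name: "sidecar-demo"}, // hypothetical name
            Spec: v1.PodSpec{
                InitContainers: []v1.Container{{
                    Name:          "restartable-init",
                    Image:         "busybox", // illustrative image
                    Command:       []string{"sh", "-c", "sleep 1d"},
                    RestartPolicy: &always, // Always => restartable; nil => regular init container
                }},
                Containers: []v1.Container{{
                    Name:    "gu-container",
                    Image:   "busybox",
                    Command: []string{"sh", "-c", "sleep 1d"},
                }},
            },
        }
    }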

File tree

1 file changed: +250 -4 lines changed

test/e2e_node/topology_manager_test.go

Lines changed: 250 additions & 4 deletions
@@ -69,6 +69,7 @@ type tmCtnAttribute struct {
 	deviceName    string
 	deviceRequest string
 	deviceLimit   string
+	restartPolicy *v1.ContainerRestartPolicy
 }

 func detectNUMANodes() int {
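
The field is a pointer so that nil (field unset, i.e. a regular init container) stays distinguishable from an explicit Always. The scenarios below take the address of a containerRestartPolicyAlways helper variable; its declaration is not part of this diff, but it is presumably something like the following sketch:

    // Assumed declaration elsewhere in the test file (not shown in this diff):
    var containerRestartPolicyAlways = v1.ContainerRestartPolicyAlways

    // exampleAttr mirrors the scenarios added below: leaving restartPolicy nil
    // keeps a regular run-to-completion init container, while pointing it at
    // Always marks the init container as restartable.
    func exampleAttr() tmCtnAttribute {
        return tmCtnAttribute{
            ctnName:       "restartable-init-container",
            cpuRequest:    "1000m",
            cpuLimit:      "1000m",
            restartPolicy: &containerRestartPolicyAlways,
        }
    }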
@@ -158,7 +159,8 @@ func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Co
 				v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
 			},
 		},
-		Command: []string{"sh", "-c", ctnCmd},
+		Command:       []string{"sh", "-c", ctnCmd},
+		RestartPolicy: ctnAttr.restartPolicy,
 	}
 	if ctnAttr.deviceName != "" {
 		ctn.Resources.Requests[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceRequest)
@@ -171,8 +173,12 @@ func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Co

 func makeTopologyManagerTestPod(podName string, tmCtnAttributes, tmInitCtnAttributes []tmCtnAttribute) *v1.Pod {
 	var containers, initContainers []v1.Container
-	if len(tmInitCtnAttributes) > 0 {
-		initContainers = makeContainers(numaAlignmentCommand, tmInitCtnAttributes)
+	for _, attr := range tmInitCtnAttributes {
+		cmd := numaAlignmentCommand
+		if attr.restartPolicy != nil && *attr.restartPolicy == v1.ContainerRestartPolicyAlways {
+			cmd = numaAlignmentSleepCommand
+		}
+		initContainers = append(initContainers, makeContainers(cmd, []tmCtnAttribute{attr})...)
 	}
 	containers = makeContainers(numaAlignmentSleepCommand, tmCtnAttributes)
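
The command switch above is the crux: a regular init container runs the one-shot numaAlignmentCommand and exits, but a restartable init container would be restarted endlessly if its command terminated, so it gets the same long-running numaAlignmentSleepCommand as the app containers, which also keeps its logs available for the alignment checks below. A usage sketch mixing both kinds (attribute values are illustrative):

    pod := makeTopologyManagerTestPod("gu-pod",
        []tmCtnAttribute{
            {ctnName: "gu-container", cpuRequest: "1000m", cpuLimit: "1000m"},
        },
        []tmCtnAttribute{
            // nil restartPolicy: regular init container, gets numaAlignmentCommand
            {ctnName: "init-container", cpuRequest: "1000m", cpuLimit: "1000m"},
            // Always: restartable init container, gets numaAlignmentSleepCommand
            {ctnName: "restartable-init-container", cpuRequest: "1000m", cpuLimit: "1000m",
                restartPolicy: &containerRestartPolicyAlways},
        })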

@@ -346,6 +352,25 @@ func findSRIOVResource(node *v1.Node) (string, int64) {
 }

 func validatePodAlignment(ctx context.Context, f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) {
+	for _, cnt := range pod.Spec.InitContainers {
+		// only check restartable init containers, skip regular init containers
+		if cnt.RestartPolicy == nil || *cnt.RestartPolicy != v1.ContainerRestartPolicyAlways {
+			continue
+		}
+
+		ginkgo.By(fmt.Sprintf("validating the init container %s on Gu pod %s", cnt.Name, pod.Name))
+
+		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
+		framework.ExpectNoError(err, "expected log not found in init container [%s] of pod [%s]", cnt.Name, pod.Name)
+
+		framework.Logf("got init container logs: %v", logs)
+		numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, envInfo)
+		framework.ExpectNoError(err, "NUMA Alignment check failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+		if numaRes != nil {
+			framework.Logf("NUMA resources for init container %s/%s: %s", pod.Name, cnt.Name, numaRes.String())
+		}
+	}
+
 	for _, cnt := range pod.Spec.Containers {
 		ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))
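
Note the asymmetry with regular init containers: by the time the pod is Running they have already exited, and their exclusively assigned CPUs may have been reclaimed and handed to later containers, so only restartable init containers, which hold their allocations for the pod lifetime, can be meaningfully validated here.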

@@ -367,6 +392,23 @@ func validatePodAlignmentWithPodScope(ctx context.Context, f *framework.Framewor
 	podsNUMA := make(map[int]int)

 	ginkgo.By(fmt.Sprintf("validate pod scope alignment for %s pod", pod.Name))
+	for _, cnt := range pod.Spec.InitContainers {
+		// only check restartable init containers, skip regular init containers
+		if cnt.RestartPolicy == nil || *cnt.RestartPolicy != v1.ContainerRestartPolicyAlways {
+			continue
+		}
+
+		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
+		framework.ExpectNoError(err, "NUMA alignment failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+		envMap, err := makeEnvMap(logs)
+		framework.ExpectNoError(err, "NUMA alignment failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+		cpuToNUMA, err := getCPUToNUMANodeMapFromEnv(f, pod, &cnt, envMap, envInfo.numaNodes)
+		framework.ExpectNoError(err, "NUMA alignment failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+		for cpuID, numaID := range cpuToNUMA {
+			podsNUMA[cpuID] = numaID
+		}
+	}
+
 	for _, cnt := range pod.Spec.Containers {
 		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
 		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
@@ -440,7 +482,7 @@ func runTopologyManagerPositiveTest(ctx context.Context, f *framework.Framework,
 	}

 	// per https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/693-topology-manager/README.md#multi-numa-systems-tests
-	// we can do a menaingful validation only when using the single-numa node policy
+	// we can do a meaningful validation only when using the single-numa node policy
 	if envInfo.policy == topologymanager.PolicySingleNumaNode {
 		for _, pod := range podMap {
 			validatePodAlignment(ctx, f, pod, envInfo)
@@ -728,6 +770,94 @@ func runTMScopeResourceAlignmentTestSuite(ctx context.Context, f *framework.Fram
 	}
 	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

+	ginkgo.By(fmt.Sprintf("Admit one guaranteed pod with restartable init container, 1 core and 1 %s device", sd.resourceName))
+	initCtnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "restartable-init-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+	}
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
+
+	ginkgo.By(fmt.Sprintf("Admit one guaranteed pod with multiple restartable init containers, each container with 1 CPU core. Use 1 %s device", sd.resourceName))
+	initCtnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "restartable-init-container-1",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+		{
+			ctnName:       "restartable-init-container-2",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+	}
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
+
+	coresReq = fmt.Sprintf("%dm", (numCores/2+1)*1000)
+	ginkgo.By(fmt.Sprintf("Trying to admit guaranteed pod with two restartable init containers where sum of their CPU requests (%d cores) exceeds NUMA capacity. The request should be rejected", (numCores/2+1)*2))
+	initCtnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "restartable-init-container-1",
+			cpuRequest:    coresReq,
+			cpuLimit:      coresReq,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+		{
+			ctnName:       "restartable-init-container-2",
+			cpuRequest:    coresReq,
+			cpuLimit:      coresReq,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+	}
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+
 	teardownSRIOVConfigOrFail(ctx, f, sd)
 }
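
Because both restartable init containers keep running, their CPU requests add up under the pod-scope topology policy. A worked example of the rejection arithmetic above, assuming an illustrative NUMA node capacity of numCores = 8 (the real value is detected from the node at runtime):

    numCores := 8                  // illustrative; the test detects this at runtime
    perContainer := numCores/2 + 1 // 5 cores each, i.e. coresReq = "5000m"
    combined := perContainer * 2   // 10 cores held concurrently by the two sidecars
    // 10 > 8: the pod can never fit a single NUMA node, so admission must fail.
    fmt.Printf("%d cores requested > %d cores per NUMA node\n", combined, numCores)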

@@ -820,6 +950,30 @@ func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework
 	}
 	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

+	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with restartable init container - each with 1 core, 1 %s device", sd.resourceName))
+	initCtnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "restartable-init-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+	}
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
+
 	// testing more complex conditions require knowledge about the system cpu+bus topology
 }

@@ -884,6 +1038,39 @@ func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework
 		},
 	}
 	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
+
+	ginkgo.By(fmt.Sprintf("Successfully admit pod with multiple restartable init containers, each with 1 core, 1 %s device", sd.resourceName))
+	initCtnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "restartable-init-container-1",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+		{
+			ctnName:       "restartable-init-container-2",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+			restartPolicy: &containerRestartPolicyAlways,
+		},
+	}
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    "1000m",
+			cpuLimit:      "1000m",
+			deviceName:    sd.resourceName,
+			deviceRequest: "1",
+			deviceLimit:   "1",
+		},
+	}
+	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
 	}

 	// this is the only policy that can guarantee reliable rejects
@@ -903,6 +1090,65 @@ func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework
 		},
 	}
 	runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+
+	if sd.resourceAmount >= 3 {
+		ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pod with a restartable init container demanding %d cores, 1 %s device - and it should be rejected", numCores, sd.resourceName))
+		initCtnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "restartable-init-container",
+				cpuRequest:    excessCoresReq,
+				cpuLimit:      excessCoresReq,
+				deviceName:    sd.resourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+				restartPolicy: &containerRestartPolicyAlways,
+			},
+		}
+		ctnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "gu-container",
+				cpuRequest:    "1000m",
+				cpuLimit:      "1000m",
+				deviceName:    sd.resourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+			},
+		}
+		runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+
+		ginkgo.By("Trying to admit a guaranteed pod with two restartable init containers where the second one cannot achieve NUMA alignment - and it should be rejected")
+		initCtnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "restartable-init-container-1",
+				cpuRequest:    "1000m",
+				cpuLimit:      "1000m",
+				deviceName:    sd.resourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+				restartPolicy: &containerRestartPolicyAlways,
+			},
+			{
+				ctnName:       "restartable-init-container-2",
+				cpuRequest:    excessCoresReq,
+				cpuLimit:      excessCoresReq,
+				deviceName:    sd.resourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+				restartPolicy: &containerRestartPolicyAlways,
+			},
+		}
+		ctnAttrs = []tmCtnAttribute{
+			{
+				ctnName:       "gu-container",
+				cpuRequest:    "1000m",
+				cpuLimit:      "1000m",
+				deviceName:    sd.resourceName,
+				deviceRequest: "1",
+				deviceLimit:   "1",
+			},
+		}
+		runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+	}
 }
 }
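
These negative cases hinge on how init container requests aggregate: a regular init container's request only has to fit by itself (nothing else is running yet), while a restartable one keeps its CPUs, so it stacks with every container that starts after it. A simplified sketch of that effective-request rule (per the sidecar KEP; CPU-only, in millicores, not kubelet code):

    // effectiveCPURequest computes a pod's effective CPU request: the maximum of
    // (each init container's request plus all sidecars started before it) and
    // (all app containers plus all sidecars).
    func effectiveCPURequest(initReqs []int64, restartable []bool, appReqs []int64) int64 {
        var maxReq, sidecarSum, appSum int64
        for i, req := range initReqs {
            if v := sidecarSum + req; v > maxReq {
                maxReq = v // this init container runs after all earlier sidecars
            }
            if restartable[i] {
                sidecarSum += req // restartable: holds its CPUs for the pod lifetime
            }
        }
        for _, req := range appReqs {
            appSum += req
        }
        if v := sidecarSum + appSum; v > maxReq {
            maxReq = v
        }
        return maxReq
    }

In the second rejection above, restartable-init-container-2's excessCoresReq stacks on top of the first sidecar's core, so the combination cannot be aligned to a single NUMA node even though the first sidecar alone would fit.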
