
Commit 60fb80a

e2e: Added IRQ test to verify housekeeping CPU updates with node reboot.
Signed-off-by: Sargun Narula <[email protected]>
1 parent 629b12f commit 60fb80a

File tree

4 files changed: +238 -0 lines changed


test/e2e/performanceprofile/functests/2_performance_update/updating_profile.go

Lines changed: 200 additions & 0 deletions
@@ -12,8 +12,10 @@ import (
 
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
+    appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/errors"
+    "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/labels"
     "k8s.io/apimachinery/pkg/types"
@@ -27,11 +29,13 @@ import (
     performancev2 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/performanceprofile/v2"
     "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components"
     profilecomponent "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components/profile"
+    componenttuned "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components/tuned"
     manifestsutil "github.com/openshift/cluster-node-tuning-operator/pkg/util"
     testutils "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils"
     "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/cgroup/runtime"
     testclient "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/client"
     "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/cluster"
+    "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/deployments"
     "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/discovery"
     "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/hypershift"
     "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/label"
@@ -996,6 +1000,155 @@ var _ = Describe("[rfe_id:28761][performance] Updating parameters in performance
         })
     })
 
+    Context("Verify IRQ housekeeping updates", Ordered, Label(string(label.Tier2)), func() {
+        var targetNode *corev1.Node
+        var isolatedCPUSet cpuset.CPUSet
+
+        testutils.CustomBeforeAll(func() {
+            initialProfile = profile.DeepCopy()
+        })
+
+        It("[test_id:99999] should update housekeeping CPUs when performance profile is modified", func() {
+
+            if componenttuned.IsIRQBalancingGloballyDisabled(profile) {
+                Skip("this test needs IRQ balancing (GloballyDisableIrqLoadBalancing=false)")
+            }
+
+            ctx := context.TODO()
+
+            // Get current profile CPU configuration
+            Expect(profile.Spec.CPU.Reserved).ToNot(BeNil(), "expected reserved CPUs, found none")
+            Expect(profile.Spec.CPU.Isolated).ToNot(BeNil(), "expected isolated CPUs, found none")
+
+            reservedCPUSet, err := cpuset.Parse(string(*profile.Spec.CPU.Reserved))
+            Expect(err).ToNot(HaveOccurred(), "failed to parse reserved CPUs")
+
+            isolatedCPUSet, err = cpuset.Parse(string(*profile.Spec.CPU.Isolated))
+            Expect(err).ToNot(HaveOccurred(), "failed to parse isolated CPUs")
+
+            targetNodeIdx := nodes.PickNodeIdx(workerRTNodes)
+            targetNode = &workerRTNodes[targetNodeIdx]
+            Expect(targetNode).ToNot(BeNil(), "missing target node")
+            By(fmt.Sprintf("Using target worker node %q", targetNode.Name))
+
+            // Ensure we have enough isolated CPUs for the test;
+            // this is the minimum amount to avoid an SMT-alignment error.
+            cpuRequest := 2
+            if cpuRequest >= isolatedCPUSet.Size() {
+                Skip(fmt.Sprintf("cpus request %d is greater than the available isolated cpus %d", cpuRequest, isolatedCPUSet.Size()))
+            }
+
+            By("Creating a Deployment with guaranteed pod that has irq-load-balancing.crio.io: housekeeping annotation")
+            annotations := map[string]string{
+                "irq-load-balancing.crio.io": "housekeeping",
+            }
+            podTemplate := getTestPodWithProfileAndAnnotations(profile, annotations, cpuRequest)
+
+            dp := deployments.Make("irq-housekeeping-dp", testutils.NamespaceTesting,
+                deployments.WithPodTemplate(podTemplate),
+                deployments.WithNodeSelector(map[string]string{testutils.LabelHostname: targetNode.Name}),
+            )
+
+            err = testclient.DataPlaneClient.Create(ctx, dp)
+            Expect(err).ToNot(HaveOccurred(), "failed to create test deployment")
+            defer func() {
+                By("Cleaning up: deleting deployment")
+                testclient.DataPlaneClient.Delete(ctx, dp)
+            }()
+
+            By("Waiting for the deployment to be ready")
+            desiredStatus := appsv1.DeploymentStatus{
+                Replicas:          1,
+                AvailableReplicas: 1,
+            }
+            err = deployments.WaitForDesiredDeploymentStatus(ctx, dp, testclient.DataPlaneClient, dp.Namespace, dp.Name, desiredStatus)
+            Expect(err).ToNot(HaveOccurred(), "deployment did not reach desired status")
+
+            By("Getting the pod from the deployment")
+            podList := &corev1.PodList{}
+            listOptions := &client.ListOptions{
+                Namespace:     dp.Namespace,
+                LabelSelector: labels.SelectorFromSet(dp.Spec.Selector.MatchLabels),
+            }
+            err = testclient.DataPlaneClient.List(ctx, podList, listOptions)
+            Expect(err).ToNot(HaveOccurred(), "failed to list pods from deployment")
+            Expect(len(podList.Items)).To(Equal(1), "expected exactly one pod in deployment")
+            testpod := &podList.Items[0]
+            Expect(testpod.Status.QOSClass).To(Equal(corev1.PodQOSGuaranteed), "Test pod does not have QoS class of Guaranteed")
+
+            By("Verifying OPENSHIFT_HOUSEKEEPING_CPUS environment variable is set")
+            initialHousekeepingCPUSet, err := getHousekeepingCPUsFromEnv(testpod)
+            Expect(err).ToNot(HaveOccurred(), "failed to get OPENSHIFT_HOUSEKEEPING_CPUS from pod")
+            Expect(initialHousekeepingCPUSet.Size()).ToNot(BeZero(), "OPENSHIFT_HOUSEKEEPING_CPUS should not be empty")
+
+            By("Verifying initial IRQ affinity includes housekeeping CPUs")
+            smpAffinitySet, err := nodes.GetDefaultSmpAffinitySet(ctx, targetNode)
+            Expect(err).ToNot(HaveOccurred(), "failed to get default smp affinity")
+            onlineCPUsSet, err := nodes.GetOnlineCPUsSet(ctx, targetNode)
+            Expect(err).ToNot(HaveOccurred(), "failed to get online CPUs")
+            smpAffinitySet = smpAffinitySet.Intersection(onlineCPUsSet)
+
+            Expect(initialHousekeepingCPUSet.IsSubsetOf(smpAffinitySet)).To(BeTrue(),
+                "Housekeeping CPUs %s should be subset of IRQ affinity %s", initialHousekeepingCPUSet.String(), smpAffinitySet.String())
+
+            By("Modifying the performance profile to change reserved and isolated CPUs")
+
+            // Move one isolated CPU to reserved to trigger housekeeping CPUs update
+            cpuToMove := cpuset.New(isolatedCPUSet.List()[0])
+            newReservedSet := reservedCPUSet.Union(cpuToMove)
+            newIsolatedSet := isolatedCPUSet.Difference(cpuToMove)
+
+            profile.Spec.CPU.Reserved = ptr.To(performancev2.CPUSet(newReservedSet.String()))
+            profile.Spec.CPU.Isolated = ptr.To(performancev2.CPUSet(newIsolatedSet.String()))
+
+            By("Updating the performance profile")
+            profiles.UpdateWithRetry(profile)
+
+            By("Waiting for tuning to start updating")
+            profilesupdate.WaitForTuningUpdating(ctx, profile)
+
+            By("Waiting for tuning to complete")
+            profilesupdate.WaitForTuningUpdated(ctx, profile)
+
+            By("Waiting for the deployment to be ready again after profile update and node reboot")
+            Eventually(func() error {
+                return deployments.WaitForDesiredDeploymentStatus(ctx, dp, testclient.DataPlaneClient, dp.Namespace, dp.Name, desiredStatus)
+            }).WithTimeout(20*time.Minute).WithPolling(30*time.Second).Should(Succeed(), "deployment did not become ready after profile update")
+
+            By("Getting the updated pod from the deployment")
+            err = testclient.DataPlaneClient.List(ctx, podList, listOptions)
+            Expect(err).ToNot(HaveOccurred(), "failed to list pods from deployment after update")
+            Expect(len(podList.Items)).To(Equal(1), "expected exactly one pod in deployment after update")
+            testpod = &podList.Items[0]
+
+            By("Verifying OPENSHIFT_HOUSEKEEPING_CPUS is updated after profile modification")
+            updatedHousekeepingCPUSet, err := getHousekeepingCPUsFromEnv(testpod)
+            Expect(err).ToNot(HaveOccurred(), "failed to get updated OPENSHIFT_HOUSEKEEPING_CPUS from pod")
+            Expect(updatedHousekeepingCPUSet.Size()).ToNot(BeZero(), "updated OPENSHIFT_HOUSEKEEPING_CPUS should not be empty")
+
+            By("Verifying updated IRQ affinity includes housekeeping CPUs")
+            updatedSmpAffinitySet, err := nodes.GetDefaultSmpAffinitySet(ctx, targetNode)
+            Expect(err).ToNot(HaveOccurred(), "failed to get updated default smp affinity")
+            updatedOnlineCPUsSet, err := nodes.GetOnlineCPUsSet(ctx, targetNode)
+            Expect(err).ToNot(HaveOccurred(), "failed to get updated online CPUs")
+            updatedSmpAffinitySet = updatedSmpAffinitySet.Intersection(updatedOnlineCPUsSet)
+
+            Expect(updatedHousekeepingCPUSet.IsSubsetOf(updatedSmpAffinitySet)).To(BeTrue(),
+                "Updated housekeeping CPUs %s should be subset of IRQ affinity %s", updatedHousekeepingCPUSet.String(), updatedSmpAffinitySet.String())
+        })
+
+        AfterAll(func() {
+            By("Reverting the profile to its initial state")
+            profiles.UpdateWithRetry(initialProfile)
+
+            By(fmt.Sprintf("Applying changes in performance profile and waiting until %s will start updating", poolName))
+            profilesupdate.WaitForTuningUpdating(context.TODO(), profile)
+
+            By(fmt.Sprintf("Waiting when %s finishes updates", poolName))
+            profilesupdate.WaitForTuningUpdated(context.TODO(), profile)
+        })
+    })
+
     Context("[rfe_id:54374][rps_mask] Network Stack Pinning", Label(string(label.RPSMask), string(label.Tier1)), func() {
 
         BeforeEach(func() {
@@ -1435,3 +1588,50 @@ func copyNumaCoreSiblings(src map[int]map[int][]int) map[int]map[int][]int {
     }
     return dst
 }
+
+// getHousekeepingCPUsFromEnv extracts the OPENSHIFT_HOUSEKEEPING_CPUS environment variable from the pod and returns it as a CPUSet.
+func getHousekeepingCPUsFromEnv(pod *corev1.Pod) (cpuset.CPUSet, error) {
+    const housekeepingCpusEnv = "OPENSHIFT_HOUSEKEEPING_CPUS"
+
+    cmd := []string{"printenv", housekeepingCpusEnv}
+    output, err := pods.ExecCommandOnPod(testclient.K8sClient, pod, "", cmd)
+    if err != nil {
+        return cpuset.New(), fmt.Errorf("failed to get %s from pod %s/%s: %v", housekeepingCpusEnv, pod.Namespace, pod.Name, err)
+    }
+
+    value := strings.TrimSpace(string(output))
+    if value == "" {
+        return cpuset.New(), fmt.Errorf("%s environment variable not found or empty in pod %s/%s", housekeepingCpusEnv, pod.Namespace, pod.Name)
+    }
+
+    cpuSet, err := cpuset.Parse(value)
+    if err != nil {
+        return cpuset.New(), fmt.Errorf("failed to parse %s value %q from pod %s/%s: %v", housekeepingCpusEnv, value, pod.Namespace, pod.Name, err)
+    }
+
+    return cpuSet, nil
+}
+
+// getTestPodWithProfileAndAnnotations creates a test pod with the specified profile and annotations.
+func getTestPodWithProfileAndAnnotations(perfProf *performancev2.PerformanceProfile, annotations map[string]string, cpus int) *corev1.Pod {
+    testpod := pods.GetTestPod()
+    if len(annotations) > 0 {
+        testpod.Annotations = annotations
+    }
+    testpod.Namespace = testutils.NamespaceTesting
+
+    cpuCount := fmt.Sprintf("%d", cpus)
+    resCpu := resource.MustParse(cpuCount)
+    resMem := resource.MustParse("256Mi")
+    testpod.Spec.Containers[0].Resources = corev1.ResourceRequirements{
+        Limits: corev1.ResourceList{
+            corev1.ResourceCPU:    resCpu,
+            corev1.ResourceMemory: resMem,
+        },
+    }
+    if perfProf != nil {
+        runtimeClassName := components.GetComponentName(perfProf.Name, components.ComponentNamePrefix)
+        testpod.Spec.RuntimeClassName = &runtimeClassName
+    }
+    return testpod
+}
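The heart of the new test is a set-containment check: the CPU list the runtime publishes to the pod through OPENSHIFT_HOUSEKEEPING_CPUS must stay inside the node's effective default IRQ affinity (default_smp_affinity intersected with the online CPUs), both before and after the reserved/isolated split changes. Below is a minimal standalone sketch of that check using k8s.io/utils/cpuset; the CPU lists are made up for illustration and are not taken from this commit.

package main

import (
    "fmt"

    "k8s.io/utils/cpuset"
)

func main() {
    // Illustrative values only: in the test these come from the pod env and the node.
    housekeeping, _ := cpuset.Parse("2-3")    // e.g. OPENSHIFT_HOUSEKEEPING_CPUS
    irqAffinity, _ := cpuset.Parse("0-3,8-9") // e.g. decoded default_smp_affinity
    online, _ := cpuset.Parse("0-15")         // e.g. /sys/devices/system/cpu/online

    // Mirror the test: only CPUs that are both in the affinity mask and online count.
    effective := irqAffinity.Intersection(online)
    fmt.Println(housekeeping.IsSubsetOf(effective)) // true when IRQs may be balanced onto the housekeeping CPUs
}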

test/e2e/performanceprofile/functests/utils/deployments/deployments.go

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ func WithPodTemplate(podTemplate *corev1.Pod) func(dp *appsv1.Deployment) {
     return func(dp *appsv1.Deployment) {
         dp.Spec.Template.Spec = podTemplate.Spec
         dp.Spec.Template.Labels = podTemplate.Labels
+        dp.Spec.Template.Annotations = podTemplate.Annotations
         dp.Spec.Selector.MatchLabels = podTemplate.Labels
     }
 }
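This one-line change matters because the IRQ test relies on the irq-load-balancing.crio.io annotation reaching the pods the Deployment creates; before it, WithPodTemplate copied only the pod spec and labels. A small usage sketch of the helper after the change (the pod fields below are illustrative, not from the commit):

package main

import (
    "fmt"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/deployments"
)

func main() {
    pod := &corev1.Pod{}
    pod.Labels = map[string]string{"app": "irq-housekeeping"}
    pod.Annotations = map[string]string{"irq-load-balancing.crio.io": "housekeeping"}

    dp := &appsv1.Deployment{Spec: appsv1.DeploymentSpec{Selector: &metav1.LabelSelector{}}}
    deployments.WithPodTemplate(pod)(dp)

    // With annotations now copied, the CRI-O hint propagates to every replica.
    fmt.Println(dp.Spec.Template.Annotations["irq-load-balancing.crio.io"]) // housekeeping
}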

test/e2e/performanceprofile/functests/utils/nodes/nodes.go

Lines changed: 18 additions & 0 deletions
@@ -4,6 +4,7 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "os"
     "path"
     "sort"
     "strconv"
@@ -586,3 +587,20 @@ func GetL3SharedCPUs(node *corev1.Node) func(cpuId int) (cpuset.CPUSet, error) {
         return cpuSet, err
     }
 }
+
+// PickNodeIdx selects a node index based on environment variable E2E_PAO_TARGET_NODE.
+// If the environment variable is not set or the node is not found, returns 0.
+func PickNodeIdx(nodes []corev1.Node) int {
+    name, ok := os.LookupEnv("E2E_PAO_TARGET_NODE")
+    if !ok {
+        return 0 // "random" default
+    }
+    for idx := range nodes {
+        if nodes[idx].Name == name {
+            testlog.Infof("node %q found among candidates, picking", name)
+            return idx
+        }
+    }
+    testlog.Infof("node %q not found among candidates, fall back to random one", name)
+    return 0 // "safe" default
+}
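PickNodeIdx lets CI pin the IRQ test to a specific worker via the E2E_PAO_TARGET_NODE environment variable, falling back to index 0 otherwise. A hedged usage sketch; the node names below are illustrative and running it requires this repository's functests utils package on the module path:

package main

import (
    "fmt"
    "os"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/nodes"
)

func main() {
    // Illustrative: ask the helper to prefer a known worker by name.
    os.Setenv("E2E_PAO_TARGET_NODE", "worker-1")

    candidates := []corev1.Node{
        {ObjectMeta: metav1.ObjectMeta{Name: "worker-0"}},
        {ObjectMeta: metav1.ObjectMeta{Name: "worker-1"}},
    }

    // Prints 1; with the variable unset or unmatched the helper falls back to 0.
    fmt.Println(nodes.PickNodeIdx(candidates))
}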

test/e2e/performanceprofile/functests/utils/pods/pods.go

Lines changed: 19 additions & 0 deletions
@@ -20,6 +20,7 @@ import (
     "k8s.io/client-go/kubernetes"
     "k8s.io/client-go/kubernetes/scheme"
     "k8s.io/client-go/tools/remotecommand"
+    "k8s.io/utils/cpuset"
     "sigs.k8s.io/controller-runtime/pkg/client"
 
     testclient "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/client"
@@ -272,3 +273,21 @@ func CheckPODSchedulingFailed(c client.Client, pod *corev1.Pod) (bool, error) {
     }
     return false, nil
 }
+
+// GetPodCPUs returns the CPUs assigned to the pod by reading the container's cpuset.
+func GetPodCPUs(ctx context.Context, c *kubernetes.Clientset, pod *corev1.Pod) (cpuset.CPUSet, error) {
+    // Get the CPUs allowed for the container by reading /proc/self/status
+    cmd := []string{"/bin/bash", "-c", "grep Cpus_allowed_list /proc/self/status | awk '{print $2}'"}
+    output, err := ExecCommandOnPod(c, pod, "", cmd)
+    if err != nil {
+        return cpuset.New(), fmt.Errorf("failed to get Cpus_allowed_list from pod %s/%s: %v", pod.Namespace, pod.Name, err)
+    }
+
+    cpuList := strings.TrimSpace(string(output))
+    podCPUs, err := cpuset.Parse(cpuList)
+    if err != nil {
+        return cpuset.New(), fmt.Errorf("failed to parse CPU list %q: %v", cpuList, err)
+    }
+
+    return podCPUs, nil
+}
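GetPodCPUs is a general-purpose helper added alongside the IRQ test; it is not called in the hunks shown above. One plausible use, sketched here under the assumption of a connected *kubernetes.Clientset and a running pinned pod, is asserting that a pod's assigned cpuset stays clear of the reserved CPUs; the verifyNotOnReserved helper below is hypothetical and not part of this commit.

package example

import (
    "context"
    "fmt"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/utils/cpuset"

    "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/pods"
)

// verifyNotOnReserved is a hypothetical helper (not part of the commit) showing how
// GetPodCPUs could back an assertion that a pinned pod avoids the reserved CPUs.
func verifyNotOnReserved(ctx context.Context, c *kubernetes.Clientset, pod *corev1.Pod, reserved cpuset.CPUSet) error {
    podCPUs, err := pods.GetPodCPUs(ctx, c, pod)
    if err != nil {
        return err
    }
    if overlap := podCPUs.Intersection(reserved); overlap.Size() > 0 {
        return fmt.Errorf("pod %s/%s is running on reserved CPUs %s", pod.Namespace, pod.Name, overlap.String())
    }
    return nil
}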
