Commit c348d09

Merge pull request kubernetes#127475 from dims/resurrect-gpu-tests-that-use-jobs
Resurrect GPU tests that use Jobs
2 parents: 7c156c5 + 3ec74e0

File tree: 1 file changed (+166 −2 lines changed)

test/e2e/node/gpu.go

Lines changed: 166 additions & 2 deletions
@@ -19,6 +19,7 @@ package node
 import (
 	"context"
 	"os"
+	"regexp"
 	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
@@ -32,6 +33,7 @@ import (
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
+	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
 	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -44,9 +46,13 @@ import (
 	"github.com/onsi/gomega"
 )
 
-var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
+// NOTE: All the tests in this file are run serially because they share a limited set of GPU(s), please inspect
+// the CI job definitions to see how many GPU(s) are available in the environment
+// Currently the CI jobs have 2 nodes each with 4 Nvidia T4's across both GCE and AWS harness(es).
 
-	f := framework.NewDefaultFramework("nvidia-gpu")
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Sanity test using nvidia-smi", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu1")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
 	var podClient *e2epod.PodClient
 
@@ -72,6 +78,18 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 		gomega.Expect(log).To(gomega.ContainSubstring("Driver Version:"))
 		gomega.Expect(log).To(gomega.ContainSubstring("CUDA Version:"))
 	})
+})
+
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Pod", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu2")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+	var podClient *e2epod.PodClient
+
+	ginkgo.BeforeEach(func() {
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
+		podClient = e2epod.NewPodClient(f)
+	})
 
 	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
 		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
@@ -91,6 +109,46 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 	})
 })
 
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Job", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu2")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+
+	ginkgo.BeforeEach(func() {
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
+	})
+
+	f.It("should run gpu based jobs", func(ctx context.Context) {
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
+
+		// Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
+		completions := int32(5)
+		ginkgo.By("Starting GPU job")
+		StartJob(ctx, f, completions)
+
+		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+		framework.ExpectNoError(err)
+
+		// make sure job is running by waiting for its first pod to start running
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err)
+
+		numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+		_, err = e2enode.CheckReady(ctx, f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Waiting for gpu job to finish")
+		err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err)
+		ginkgo.By("Done with gpu job")
+
+		gomega.Expect(job.Status.Failed).To(gomega.BeZero(), "Job pods failed during node recreation: %v", job.Status.Failed)
+
+		VerifyJobNCompletions(ctx, f, completions)
+	})
+})
+
 func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
 	pod = podClient.Create(ctx, pod)
 
@@ -304,3 +362,109 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 
 	return rsgather
 }
+
+// StartJob starts a simple CUDA job that requests gpu and the specified number of completions
+func StartJob(ctx context.Context, f *framework.Framework, completions int32) {
+	var activeSeconds int64 = 3600
+	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
+	testJob.Spec.Template.Spec = v1.PodSpec{
+		RestartPolicy: v1.RestartPolicyOnFailure,
+		Containers: []v1.Container{
+			{
+				Name:  "vector-addition",
+				Image: "cupy/cupy:v13.3.0",
+				Command: []string{
+					"python3",
+					"-c",
+					`
+import cupy as cp
+import numpy as np
+import time
+
+# Set the number of elements to test
+num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
+
+for num_elements in num_elements_list:
+    # Create random input vectors on the CPU
+    h_A = np.random.rand(num_elements).astype(np.float32)
+    h_B = np.random.rand(num_elements).astype(np.float32)
+
+    # Transfer the input vectors to the GPU
+    d_A = cp.asarray(h_A)
+    d_B = cp.asarray(h_B)
+
+    # Perform vector addition on the GPU
+    start_gpu = time.time()
+    d_C = d_A + d_B
+    gpu_time = time.time() - start_gpu
+
+    # Transfer the result back to the CPU
+    h_C = cp.asnumpy(d_C)
+
+    # Compute the expected result on the CPU
+    start_cpu = time.time()
+    h_C_expected = h_A + h_B
+    cpu_time = time.time() - start_cpu
+
+    # Verify the result
+    if np.allclose(h_C_expected, h_C, atol=1e-5):
+        print(f"GPU time: {gpu_time:.6f} seconds")
+        print(f"CPU time: {cpu_time:.6f} seconds")
+        print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
+    else:
+        print(f"Test FAILED for {num_elements} elements.")
+
+    # Print the first few elements for verification
+    print("First few elements of A:", h_A[:5])
+    print("First few elements of B:", h_B[:5])
+    print("First few elements of C:", h_C[:5])
+
+print(f"Test PASSED")
+`,
+				},
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
+					},
+				},
+			},
+		},
+	}
+	ns := f.Namespace.Name
+	_, err := e2ejob.CreateJob(ctx, f.ClientSet, ns, testJob)
+	framework.ExpectNoError(err)
+	framework.Logf("Created job %v", testJob)
+}
+
+func podNames(pods []v1.Pod) []string {
+	originalPodNames := make([]string, len(pods))
+	for i, p := range pods {
+		originalPodNames[i] = p.ObjectMeta.Name
+	}
+	return originalPodNames
+}
+
+// VerifyJobNCompletions verifies that the job has completions number of successful pods
+func VerifyJobNCompletions(ctx context.Context, f *framework.Framework, completions int32) {
+	ns := f.Namespace.Name
+	pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+	framework.ExpectNoError(err)
+	createdPods := pods.Items
+	createdPodNames := podNames(createdPods)
+	framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)
+
+	successes := int32(0)
+	regex := regexp.MustCompile("PASSED")
+	for _, podName := range createdPodNames {
+		e2epod.NewPodClient(f).WaitForFinish(ctx, podName, 5*time.Minute)
+		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, ns, podName, "vector-addition")
+		framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
+		if regex.MatchString(logs) {
+			successes++
+		}
+		gomega.Expect(logs).To(gomega.Not(gomega.ContainSubstring("FAILED")))
+	}
+	if successes != completions {
+		framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
+	}
+}
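
For orientation only, here is a minimal sketch (not part of this commit) of how the helpers introduced above (StartJob, VerifyJobNCompletions) compose with the existing SetupEnvironmentAndSkipIfNeeded in another serial spec of the same package; the spec title, namespace prefix, and completion count are illustrative assumptions, not values taken from the diff:

// Sketch only: a hypothetical spec in package node reusing the helpers from this commit.
var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Example reuse of Job helpers", func() {
	// "nvidia-gpu-example" is an illustrative namespace prefix, not one used by the commit.
	f := framework.NewDefaultFramework("nvidia-gpu-example")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func() {
		e2eskipper.SkipUnlessProviderIs("aws", "gce")
	})

	f.It("should finish a single-completion CUDA job", func(ctx context.Context) {
		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)

		// One completion keeps this illustrative run short; the real test above uses five.
		completions := int32(1)
		StartJob(ctx, f, completions) // creates the "cuda-add" Job defined in StartJob

		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
		framework.ExpectNoError(err)

		framework.ExpectNoError(e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name))
		VerifyJobNCompletions(ctx, f, completions) // scans each pod's logs for "PASSED"
	})
})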
