Commit c348d09

Merge pull request kubernetes#127475 from dims/resurrect-gpu-tests-that-use-jobs
Resurrect GPU tests that use Jobs
2 parents: 7c156c5 + 3ec74e0

File tree: 1 file changed (+166 −2 lines changed)

test/e2e/node/gpu.go

Lines changed: 166 additions & 2 deletions
@@ -19,6 +19,7 @@ package node
 import (
 	"context"
 	"os"
+	"regexp"
 	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
@@ -32,6 +33,7 @@ import (
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
+	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
 	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -44,9 +46,13 @@ import (
 	"github.com/onsi/gomega"
 )
 
-var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
+// NOTE: All the tests in this file are run serially because they share a limited set of GPU(s), please inspect
+// the CI job definitions to see how many GPU(s) are available in the environment
+// Currently the CI jobs have 2 nodes each with 4 Nvidia T4's across both GCE and AWS harness(es).
 
-	f := framework.NewDefaultFramework("nvidia-gpu")
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Sanity test using nvidia-smi", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu1")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
 	var podClient *e2epod.PodClient
 
@@ -72,6 +78,18 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 		gomega.Expect(log).To(gomega.ContainSubstring("Driver Version:"))
 		gomega.Expect(log).To(gomega.ContainSubstring("CUDA Version:"))
 	})
+})
+
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Pod", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu2")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+	var podClient *e2epod.PodClient
+
+	ginkgo.BeforeEach(func() {
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
+		podClient = e2epod.NewPodClient(f)
+	})
 
 	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
 		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
@@ -91,6 +109,46 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 	})
 })
 
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Job", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu2")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+
+	ginkgo.BeforeEach(func() {
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
+	})
+
+	f.It("should run gpu based jobs", func(ctx context.Context) {
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
+
+		// Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
+		completions := int32(5)
+		ginkgo.By("Starting GPU job")
+		StartJob(ctx, f, completions)
+
+		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+		framework.ExpectNoError(err)
+
+		// make sure job is running by waiting for its first pod to start running
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err)
+
+		numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+		_, err = e2enode.CheckReady(ctx, f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Waiting for gpu job to finish")
+		err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err)
+		ginkgo.By("Done with gpu job")
+
+		gomega.Expect(job.Status.Failed).To(gomega.BeZero(), "Job pods failed during node recreation: %v", job.Status.Failed)
+
+		VerifyJobNCompletions(ctx, f, completions)
+	})
+})
+
 func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
 	pod = podClient.Create(ctx, pod)
 
@@ -304,3 +362,109 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 
 	return rsgather
 }
+
+// StartJob starts a simple CUDA job that requests gpu and the specified number of completions
+func StartJob(ctx context.Context, f *framework.Framework, completions int32) {
+	var activeSeconds int64 = 3600
+	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
+	testJob.Spec.Template.Spec = v1.PodSpec{
+		RestartPolicy: v1.RestartPolicyOnFailure,
+		Containers: []v1.Container{
+			{
+				Name:  "vector-addition",
+				Image: "cupy/cupy:v13.3.0",
+				Command: []string{
+					"python3",
+					"-c",
+					`
+import cupy as cp
+import numpy as np
+import time
+
+# Set the number of elements to test
+num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
+
+for num_elements in num_elements_list:
+    # Create random input vectors on the CPU
+    h_A = np.random.rand(num_elements).astype(np.float32)
+    h_B = np.random.rand(num_elements).astype(np.float32)
+
+    # Transfer the input vectors to the GPU
+    d_A = cp.asarray(h_A)
+    d_B = cp.asarray(h_B)
+
+    # Perform vector addition on the GPU
+    start_gpu = time.time()
+    d_C = d_A + d_B
+    gpu_time = time.time() - start_gpu
+
+    # Transfer the result back to the CPU
+    h_C = cp.asnumpy(d_C)
+
+    # Compute the expected result on the CPU
+    start_cpu = time.time()
+    h_C_expected = h_A + h_B
+    cpu_time = time.time() - start_cpu
+
+    # Verify the result
+    if np.allclose(h_C_expected, h_C, atol=1e-5):
+        print(f"GPU time: {gpu_time:.6f} seconds")
+        print(f"CPU time: {cpu_time:.6f} seconds")
+        print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
+    else:
+        print(f"Test FAILED for {num_elements} elements.")
+
+    # Print the first few elements for verification
+    print("First few elements of A:", h_A[:5])
+    print("First few elements of B:", h_B[:5])
+    print("First few elements of C:", h_C[:5])
+
+print(f"Test PASSED")
+`,
+				},
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
+					},
+				},
+			},
+		},
+	}
+	ns := f.Namespace.Name
+	_, err := e2ejob.CreateJob(ctx, f.ClientSet, ns, testJob)
+	framework.ExpectNoError(err)
+	framework.Logf("Created job %v", testJob)
+}
+
+func podNames(pods []v1.Pod) []string {
+	originalPodNames := make([]string, len(pods))
+	for i, p := range pods {
+		originalPodNames[i] = p.ObjectMeta.Name
+	}
+	return originalPodNames
+}
+
+// VerifyJobNCompletions verifies that the job has completions number of successful pods
+func VerifyJobNCompletions(ctx context.Context, f *framework.Framework, completions int32) {
+	ns := f.Namespace.Name
+	pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+	framework.ExpectNoError(err)
+	createdPods := pods.Items
+	createdPodNames := podNames(createdPods)
+	framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)
+
+	successes := int32(0)
+	regex := regexp.MustCompile("PASSED")
+	for _, podName := range createdPodNames {
+		e2epod.NewPodClient(f).WaitForFinish(ctx, podName, 5*time.Minute)
+		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, ns, podName, "vector-addition")
+		framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
+		if regex.MatchString(logs) {
+			successes++
+		}
+		gomega.Expect(logs).To(gomega.Not(gomega.ContainSubstring("FAILED")))
+	}
+	if successes != completions {
+		framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
+	}
+}
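
For orientation only, here is a minimal sketch (not part of this commit) of how the helpers introduced above (StartJob, VerifyJobNCompletions) compose with the existing SetupEnvironmentAndSkipIfNeeded in another serial spec of the same package; the spec title, namespace prefix, and completion count are illustrative assumptions, not values taken from the diff:

// Sketch only: a hypothetical spec in package node reusing the helpers from this commit.
var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Example reuse of Job helpers", func() {
	// "nvidia-gpu-example" is an illustrative namespace prefix, not one used by the commit.
	f := framework.NewDefaultFramework("nvidia-gpu-example")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func() {
		e2eskipper.SkipUnlessProviderIs("aws", "gce")
	})

	f.It("should finish a single-completion CUDA job", func(ctx context.Context) {
		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)

		// One completion keeps this illustrative run short; the real test above uses five.
		completions := int32(1)
		StartJob(ctx, f, completions) // creates the "cuda-add" Job defined in StartJob

		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
		framework.ExpectNoError(err)

		framework.ExpectNoError(e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name))
		VerifyJobNCompletions(ctx, f, completions) // scans each pod's logs for "PASSED"
	})
})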
