Commit 630abc6

Resurrect GPU tests that use Jobs

Signed-off-by: Davanum Srinivas <[email protected]>

1 parent: cdfabdc

File tree: 1 file changed (+138 −0 lines)

test/e2e/node/gpu.go (138 additions, 0 deletions)
@@ -18,7 +18,9 @@ package node
 
 import (
 	"context"
+	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
 	"os"
+	"regexp"
 	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
@@ -89,6 +91,36 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 		gomega.Expect(log).To(gomega.ContainSubstring("Matrix multiplication result:"))
 		gomega.Expect(log).To(gomega.ContainSubstring("Time taken for 5000x5000 matrix multiplication"))
 	})
+
+	f.It("should run gpu based jobs", func(ctx context.Context) {
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
+
+		// Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
+		completions := int32(5)
+		ginkgo.By("Starting GPU job")
+		StartJob(ctx, f, completions)
+
+		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+		framework.ExpectNoError(err)
+
+		// make sure job is running by waiting for its first pod to start running
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err)
+
+		numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+		_, err = e2enode.CheckReady(ctx, f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Waiting for gpu job to finish")
+		err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err)
+		ginkgo.By("Done with gpu job")
+
+		gomega.Expect(job.Status.Failed).To(gomega.BeZero(), "Job pods failed during node recreation: %v", job.Status.Failed)
+
+		VerifyJobNCompletions(ctx, f, completions)
+	})
 })
 
 func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
@@ -304,3 +336,109 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 
 	return rsgather
 }
+
+// StartJob starts a simple CUDA job that requests gpu and the specified number of completions
+func StartJob(ctx context.Context, f *framework.Framework, completions int32) {
+	var activeSeconds int64 = 3600
+	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
+	testJob.Spec.Template.Spec = v1.PodSpec{
+		RestartPolicy: v1.RestartPolicyOnFailure,
+		Containers: []v1.Container{
+			{
+				Name:  "vector-addition",
+				Image: "cupy/cupy:v13.3.0",
+				Command: []string{
+					"python3",
+					"-c",
+					`
+import cupy as cp
+import numpy as np
+import time
+
+# Set the number of elements to test
+num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
+
+for num_elements in num_elements_list:
+    # Create random input vectors on the CPU
+    h_A = np.random.rand(num_elements).astype(np.float32)
+    h_B = np.random.rand(num_elements).astype(np.float32)
+
+    # Transfer the input vectors to the GPU
+    d_A = cp.asarray(h_A)
+    d_B = cp.asarray(h_B)
+
+    # Perform vector addition on the GPU
+    start_gpu = time.time()
+    d_C = d_A + d_B
+    gpu_time = time.time() - start_gpu
+
+    # Transfer the result back to the CPU
+    h_C = cp.asnumpy(d_C)
+
+    # Compute the expected result on the CPU
+    start_cpu = time.time()
+    h_C_expected = h_A + h_B
+    cpu_time = time.time() - start_cpu
+
+    # Verify the result
+    if np.allclose(h_C_expected, h_C, atol=1e-5):
+        print(f"GPU time: {gpu_time:.6f} seconds")
+        print(f"CPU time: {cpu_time:.6f} seconds")
+        print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
+    else:
+        print(f"Test FAILED for {num_elements} elements.")
+
+    # Print the first few elements for verification
+    print("First few elements of A:", h_A[:5])
+    print("First few elements of B:", h_B[:5])
+    print("First few elements of C:", h_C[:5])
+
+print(f"Test PASSED")
+`,
+				},
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
+					},
+				},
+			},
+		},
+	}
+	ns := f.Namespace.Name
+	_, err := e2ejob.CreateJob(ctx, f.ClientSet, ns, testJob)
+	framework.ExpectNoError(err)
+	framework.Logf("Created job %v", testJob)
+}
+
+func podNames(pods []v1.Pod) []string {
+	originalPodNames := make([]string, len(pods))
+	for i, p := range pods {
+		originalPodNames[i] = p.ObjectMeta.Name
+	}
+	return originalPodNames
+}
+
+// VerifyJobNCompletions verifies that the job has completions number of successful pods
+func VerifyJobNCompletions(ctx context.Context, f *framework.Framework, completions int32) {
+	ns := f.Namespace.Name
+	pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+	framework.ExpectNoError(err)
+	createdPods := pods.Items
+	createdPodNames := podNames(createdPods)
+	framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)
+
+	successes := int32(0)
+	regex := regexp.MustCompile("PASSED")
+	for _, podName := range createdPodNames {
+		e2epod.NewPodClient(f).WaitForFinish(ctx, podName, 5*time.Minute)
+		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, ns, podName, "vector-addition")
+		framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
+		if regex.MatchString(logs) {
+			successes++
+		}
+		gomega.Expect(logs).To(gomega.Not(gomega.ContainSubstring("FAILED")))
+	}
+	if successes != completions {
+		framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
+	}
+}
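
For readers less familiar with the e2e framework helpers, the following is a minimal sketch, not part of this commit, of roughly the Job object that StartJob assembles via e2ejob.NewTestJob plus the PodSpec override above, written directly with client-go API types. The values mirror the diff (job name cuda-add, parallelism 1, the caller's completions, activeDeadlineSeconds 3600, backoffLimit 6, and a limit of one nvidia.com/gpu, which is what e2egpu.NVIDIAGPUResourceName resolves to); the embedded CuPy script is abbreviated here.

// Illustrative sketch only (not part of this commit): an approximation of the
// Job that StartJob builds, expressed directly with client-go API types.
package main

import (
	"encoding/json"
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	parallelism := int32(1)
	completions := int32(5)
	activeSeconds := int64(3600)
	backoffLimit := int32(6)

	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "cuda-add"},
		Spec: batchv1.JobSpec{
			Parallelism:           &parallelism,
			Completions:           &completions,
			ActiveDeadlineSeconds: &activeSeconds,
			BackoffLimit:          &backoffLimit,
			Template: v1.PodTemplateSpec{
				Spec: v1.PodSpec{
					RestartPolicy: v1.RestartPolicyOnFailure,
					Containers: []v1.Container{{
						Name:  "vector-addition",
						Image: "cupy/cupy:v13.3.0",
						// The real test embeds the full CuPy vector-addition script here.
						Command: []string{"python3", "-c", "print('Test PASSED')"},
						Resources: v1.ResourceRequirements{
							Limits: v1.ResourceList{
								// e2egpu.NVIDIAGPUResourceName is "nvidia.com/gpu".
								"nvidia.com/gpu": *resource.NewQuantity(1, resource.DecimalSI),
							},
						},
					}},
				},
			},
		},
	}

	// Print the assembled object so its shape is easy to inspect.
	out, err := json.MarshalIndent(job, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}

In the test itself the object is created through e2ejob.CreateJob against f.ClientSet, so the sketch only serves to make the resulting spec easier to read.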
