@@ -19,6 +19,7 @@ package node
import (
	"context"
	"os"
+	"regexp"
	"time"

	appsv1 "k8s.io/api/apps/v1"
@@ -32,6 +33,7 @@ import (
	"k8s.io/kubernetes/test/e2e/framework"
	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
+	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -44,9 +46,13 @@ import (
	"github.com/onsi/gomega"
)

-var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
+// NOTE: All the tests in this file run serially because they share a limited set of GPUs. Please inspect
+// the CI job definitions to see how many GPUs are available in the environment.
+// Currently the CI jobs have 2 nodes, each with 4 Nvidia T4s, across both the GCE and AWS harnesses.

-	f := framework.NewDefaultFramework("nvidia-gpu")
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Sanity test using nvidia-smi", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu1")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var podClient *e2epod.PodClient
@@ -72,6 +78,18 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
		gomega.Expect(log).To(gomega.ContainSubstring("Driver Version:"))
		gomega.Expect(log).To(gomega.ContainSubstring("CUDA Version:"))
	})
+})
+
+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Pod", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu2")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+	var podClient *e2epod.PodClient
+
+	ginkgo.BeforeEach(func() {
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
+		podClient = e2epod.NewPodClient(f)
+	})

	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
@@ -91,6 +109,46 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
	})
})

+var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Job", func() {
+
+	f := framework.NewDefaultFramework("nvidia-gpu2")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+
+	ginkgo.BeforeEach(func() {
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
+	})
+
+	f.It("should run gpu based jobs", func(ctx context.Context) {
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
+
+		// The job is set to 5 completions with a parallelism of 1 so that it runs long enough to experience the node recreation.
+		completions := int32(5)
+		ginkgo.By("Starting GPU job")
+		StartJob(ctx, f, completions)
+
+		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
+		framework.ExpectNoError(err)
+
+		// Make sure the job is running by waiting for its first pod to start running.
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err)
+
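+		// Before waiting on the job itself, check that every registered node is ready,
+		// so that a node recreation mid-run does not leave the job stranded.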
+		numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+		_, err = e2enode.CheckReady(ctx, f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Waiting for gpu job to finish")
+		err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err)
+		ginkgo.By("Done with gpu job")
+
+		// Re-fetch the job so the failure count below reflects its final status,
+		// not the status captured before the job started running.
+		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err)
+		gomega.Expect(job.Status.Failed).To(gomega.BeZero(), "Job pods failed during node recreation: %v", job.Status.Failed)
+
+		VerifyJobNCompletions(ctx, f, completions)
+	})
+})
+
func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
	pod = podClient.Create(ctx, pod)
@@ -304,3 +362,109 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {

	return rsgather
}
+
+// StartJob starts a simple CUDA job that requests a GPU and runs the specified number of completions.
+func StartJob(ctx context.Context, f *framework.Framework, completions int32) {
+	var activeSeconds int64 = 3600
+	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
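+	// The pod template generated by NewTestJob is fully replaced below, so the
+	// RestartPolicyAlways argument above is overridden by RestartPolicyOnFailure.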
+	testJob.Spec.Template.Spec = v1.PodSpec{
+		RestartPolicy: v1.RestartPolicyOnFailure,
+		Containers: []v1.Container{
+			{
+				Name:  "vector-addition",
+				Image: "cupy/cupy:v13.3.0",
+				Command: []string{
+					"python3",
+					"-c",
+					`
+import cupy as cp
+import numpy as np
+import time
+
+# Set the number of elements to test
+num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
+
+for num_elements in num_elements_list:
+    # Create random input vectors on the CPU
+    h_A = np.random.rand(num_elements).astype(np.float32)
+    h_B = np.random.rand(num_elements).astype(np.float32)
+
+    # Transfer the input vectors to the GPU
+    d_A = cp.asarray(h_A)
+    d_B = cp.asarray(h_B)
+
+    # Perform vector addition on the GPU
+    start_gpu = time.time()
+    d_C = d_A + d_B
+    gpu_time = time.time() - start_gpu
+
+    # Transfer the result back to the CPU
+    h_C = cp.asnumpy(d_C)
+
+    # Compute the expected result on the CPU
+    start_cpu = time.time()
+    h_C_expected = h_A + h_B
+    cpu_time = time.time() - start_cpu
+
+    # Verify the result
+    if np.allclose(h_C_expected, h_C, atol=1e-5):
+        print(f"GPU time: {gpu_time:.6f} seconds")
+        print(f"CPU time: {cpu_time:.6f} seconds")
+        print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
+    else:
+        print(f"Test FAILED for {num_elements} elements.")
+
+    # Print the first few elements for verification
+    print("First few elements of A:", h_A[:5])
+    print("First few elements of B:", h_B[:5])
+    print("First few elements of C:", h_C[:5])
+
+print(f"Test PASSED")
+`,
+				},
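+				// Request a single GPU from the NVIDIA device plugin
+				// (e2egpu.NVIDIAGPUResourceName is "nvidia.com/gpu").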
+				Resources: v1.ResourceRequirements{
+					Limits: v1.ResourceList{
+						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
+					},
+				},
+			},
+		},
+	}
+	ns := f.Namespace.Name
+	_, err := e2ejob.CreateJob(ctx, f.ClientSet, ns, testJob)
+	framework.ExpectNoError(err)
+	framework.Logf("Created job %v", testJob)
+}
+
+// podNames returns the names of the given pods.
+func podNames(pods []v1.Pod) []string {
+	originalPodNames := make([]string, len(pods))
+	for i, p := range pods {
+		originalPodNames[i] = p.ObjectMeta.Name
+	}
+	return originalPodNames
+}
+
+// VerifyJobNCompletions verifies that the cuda-add job produced the expected number of successful completions.
+func VerifyJobNCompletions(ctx context.Context, f *framework.Framework, completions int32) {
+	ns := f.Namespace.Name
+	pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, ns, "cuda-add")
+	framework.ExpectNoError(err)
+	createdPods := pods.Items
+	createdPodNames := podNames(createdPods)
+	framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)
+
+	successes := int32(0)
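+	// The CUDA script prints "Test PASSED" on success and "Test FAILED for ..." on a
+	// mismatch, so count the pods whose logs contain the success marker.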
+	regex := regexp.MustCompile("PASSED")
+	for _, podName := range createdPodNames {
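+		// Wait for the pod to terminate so its log output is complete before reading it.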
+		e2epod.NewPodClient(f).WaitForFinish(ctx, podName, 5*time.Minute)
+		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, ns, podName, "vector-addition")
+		framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
+		if regex.MatchString(logs) {
+			successes++
+		}
+		gomega.Expect(logs).To(gomega.Not(gomega.ContainSubstring("FAILED")))
+	}
+	if successes != completions {
+		framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
+	}
+}