@@ -18,7 +18,9 @@ package node
18
18
19
19
import (
20
20
"context"
21
+ e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
21
22
"os"
23
+ "regexp"
22
24
"time"
23
25
24
26
appsv1 "k8s.io/api/apps/v1"
@@ -89,6 +91,36 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
89
91
gomega .Expect (log ).To (gomega .ContainSubstring ("Matrix multiplication result:" ))
90
92
gomega .Expect (log ).To (gomega .ContainSubstring ("Time taken for 5000x5000 matrix multiplication" ))
91
93
})
94
+
95
+ f .It ("should run gpu based jobs" , func (ctx context.Context ) {
96
+ SetupEnvironmentAndSkipIfNeeded (ctx , f , f .ClientSet )
97
+
98
+ // Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
99
+ completions := int32 (5 )
100
+ ginkgo .By ("Starting GPU job" )
101
+ StartJob (ctx , f , completions )
102
+
103
+ job , err := e2ejob .GetJob (ctx , f .ClientSet , f .Namespace .Name , "cuda-add" )
104
+ framework .ExpectNoError (err )
105
+
106
+ // make sure job is running by waiting for its first pod to start running
107
+ err = e2ejob .WaitForJobPodsRunning (ctx , f .ClientSet , f .Namespace .Name , job .Name , 1 )
108
+ framework .ExpectNoError (err )
109
+
110
+ numNodes , err := e2enode .TotalRegistered (ctx , f .ClientSet )
111
+ framework .ExpectNoError (err )
112
+ _ , err = e2enode .CheckReady (ctx , f .ClientSet , numNodes , framework .NodeReadyInitialTimeout )
113
+ framework .ExpectNoError (err )
114
+
115
+ ginkgo .By ("Waiting for gpu job to finish" )
116
+ err = e2ejob .WaitForJobFinish (ctx , f .ClientSet , f .Namespace .Name , job .Name )
117
+ framework .ExpectNoError (err )
118
+ ginkgo .By ("Done with gpu job" )
119
+
120
+ gomega .Expect (job .Status .Failed ).To (gomega .BeZero (), "Job pods failed during node recreation: %v" , job .Status .Failed )
121
+
122
+ VerifyJobNCompletions (ctx , f , completions )
123
+ })
92
124
})
93
125
94
126
func createAndValidatePod (ctx context.Context , f * framework.Framework , podClient * e2epod.PodClient , pod * v1.Pod ) {
@@ -304,3 +336,109 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
304
336
305
337
return rsgather
306
338
}
339
+
340
+ // StartJob starts a simple CUDA job that requests gpu and the specified number of completions
341
+ func StartJob (ctx context.Context , f * framework.Framework , completions int32 ) {
342
+ var activeSeconds int64 = 3600
343
+ testJob := e2ejob .NewTestJob ("succeed" , "cuda-add" , v1 .RestartPolicyAlways , 1 , completions , & activeSeconds , 6 )
344
+ testJob .Spec .Template .Spec = v1.PodSpec {
345
+ RestartPolicy : v1 .RestartPolicyOnFailure ,
346
+ Containers : []v1.Container {
347
+ {
348
+ Name : "vector-addition" ,
349
+ Image : "cupy/cupy:v13.3.0" ,
350
+ Command : []string {
351
+ "python3" ,
352
+ "-c" ,
353
+ `
354
+ import cupy as cp
355
+ import numpy as np
356
+ import time
357
+
358
+ # Set the number of elements to test
359
+ num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
360
+
361
+ for num_elements in num_elements_list:
362
+ # Create random input vectors on the CPU
363
+ h_A = np.random.rand(num_elements).astype(np.float32)
364
+ h_B = np.random.rand(num_elements).astype(np.float32)
365
+
366
+ # Transfer the input vectors to the GPU
367
+ d_A = cp.asarray(h_A)
368
+ d_B = cp.asarray(h_B)
369
+
370
+ # Perform vector addition on the GPU
371
+ start_gpu = time.time()
372
+ d_C = d_A + d_B
373
+ gpu_time = time.time() - start_gpu
374
+
375
+ # Transfer the result back to the CPU
376
+ h_C = cp.asnumpy(d_C)
377
+
378
+ # Compute the expected result on the CPU
379
+ start_cpu = time.time()
380
+ h_C_expected = h_A + h_B
381
+ cpu_time = time.time() - start_cpu
382
+
383
+ # Verify the result
384
+ if np.allclose(h_C_expected, h_C, atol=1e-5):
385
+ print(f"GPU time: {gpu_time:.6f} seconds")
386
+ print(f"CPU time: {cpu_time:.6f} seconds")
387
+ print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
388
+ else:
389
+ print(f"Test FAILED for {num_elements} elements.")
390
+
391
+ # Print the first few elements for verification
392
+ print("First few elements of A:", h_A[:5])
393
+ print("First few elements of B:", h_B[:5])
394
+ print("First few elements of C:", h_C[:5])
395
+
396
+ print(f"Test PASSED")
397
+ ` ,
398
+ },
399
+ Resources : v1.ResourceRequirements {
400
+ Limits : v1.ResourceList {
401
+ e2egpu .NVIDIAGPUResourceName : * resource .NewQuantity (1 , resource .DecimalSI ),
402
+ },
403
+ },
404
+ },
405
+ },
406
+ }
407
+ ns := f .Namespace .Name
408
+ _ , err := e2ejob .CreateJob (ctx , f .ClientSet , ns , testJob )
409
+ framework .ExpectNoError (err )
410
+ framework .Logf ("Created job %v" , testJob )
411
+ }
412
+
413
+ func podNames (pods []v1.Pod ) []string {
414
+ originalPodNames := make ([]string , len (pods ))
415
+ for i , p := range pods {
416
+ originalPodNames [i ] = p .ObjectMeta .Name
417
+ }
418
+ return originalPodNames
419
+ }
420
+
421
+ // VerifyJobNCompletions verifies that the job has completions number of successful pods
422
+ func VerifyJobNCompletions (ctx context.Context , f * framework.Framework , completions int32 ) {
423
+ ns := f .Namespace .Name
424
+ pods , err := e2ejob .GetJobPods (ctx , f .ClientSet , f .Namespace .Name , "cuda-add" )
425
+ framework .ExpectNoError (err )
426
+ createdPods := pods .Items
427
+ createdPodNames := podNames (createdPods )
428
+ framework .Logf ("Got the following pods for job cuda-add: %v" , createdPodNames )
429
+
430
+ successes := int32 (0 )
431
+ regex := regexp .MustCompile ("PASSED" )
432
+ for _ , podName := range createdPodNames {
433
+ e2epod .NewPodClient (f ).WaitForFinish (ctx , podName , 5 * time .Minute )
434
+ logs , err := e2epod .GetPodLogs (ctx , f .ClientSet , ns , podName , "vector-addition" )
435
+ framework .ExpectNoError (err , "Should be able to get logs for pod %v" , podName )
436
+ if regex .MatchString (logs ) {
437
+ successes ++
438
+ }
439
+ gomega .Expect (logs ).To (gomega .Not (gomega .ContainSubstring ("FAILED" )))
440
+ }
441
+ if successes != completions {
442
+ framework .Failf ("Only got %v completions. Expected %v completions." , successes , completions )
443
+ }
444
+ }
0 commit comments