@@ -18,17 +18,26 @@ package node

import (
	"context"
+	"os"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/uuid"
	clientset "k8s.io/client-go/kubernetes"
+	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
+	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
+	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
@@ -42,12 +51,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
	var podClient *e2epod.PodClient

	ginkgo.BeforeEach(func() {
-		e2eskipper.SkipUnlessProviderIs("aws")
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
		podClient = e2epod.NewPodClient(f)
	})

	f.It("should run nvidia-smi cli", func(ctx context.Context) {
-		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
		pod := testNvidiaCLIPod()
		pod.Spec.Containers[0].Command = []string{"nvidia-smi"}

@@ -65,7 +74,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
	})

	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
-		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
		pod := testMatrixMultiplicationPod()

		ginkgo.By("Creating a pod that runs matrix multiplication")
@@ -180,7 +189,18 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
	return &pod
}

-func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.Interface) {
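+// SetupEnvironmentAndSkipIfNeeded prepares the cluster for the GPU tests (on GCE it installs the
+// Nvidia driver DaemonSet and starts resource gathering) and skips the test when no ready,
+// schedulable node reports allocatable Nvidia GPUs.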
+func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
+	if framework.ProviderIs("gce") {
+		rsgather := SetupNVIDIAGPUNode(ctx, f)
+		defer func() {
+			framework.Logf("Stopping ResourceUsageGather")
+			constraints := make(map[string]e2edebug.ResourceConstraint)
+			// For now, just gets summary. Can pass valid constraints in the future.
+			summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
+			f.TestSummaries = append(f.TestSummaries, summary)
+			framework.ExpectNoError(err, "getting resource usage summary")
+		}()
+	}
	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
	framework.ExpectNoError(err)
	capacity := 0
@@ -204,3 +224,83 @@ func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.In
		e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
	}
}
+
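+// areGPUsAvailableOnAllSchedulableNodes returns true only when every schedulable node reports a
+// non-zero Nvidia GPU capacity.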
+func areGPUsAvailableOnAllSchedulableNodes(ctx context.Context, clientSet clientset.Interface) bool {
+	framework.Logf("Getting list of Nodes from API server")
+	nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if node.Spec.Unschedulable {
+			continue
+		}
+		framework.Logf("gpuResourceName %s", e2egpu.NVIDIAGPUResourceName)
+		if val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]; !ok || val.Value() == 0 {
+			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
+			return false
+		}
+	}
+	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
+	return true
+}
+
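+// logOSImages logs the name and OS image of every node in the cluster.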
+func logOSImages(ctx context.Context, f *framework.Framework) {
+	nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		framework.Logf("Nodename: %v, OS Image: %v", node.Name, node.Status.NodeInfo.OSImage)
+	}
+}
+
+const (
+	// Nvidia driver installation can take upwards of 5 minutes.
+	driverInstallTimeout = 10 * time.Minute
+)
+
+// SetupNVIDIAGPUNode installs the Nvidia drivers and waits for Nvidia GPUs to be available on nodes.
+func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {
+	logOSImages(ctx, f)
+
+	var err error
+	var ds *appsv1.DaemonSet
+	dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
+	if dsYamlURLFromEnv != "" {
+		// Using DaemonSet from remote URL
+		framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
+		ds, err = e2emanifest.DaemonSetFromURL(ctx, dsYamlURLFromEnv)
+		framework.ExpectNoError(err, "failed to get remote nvidia-driver-installer daemonset manifest")
+	} else {
+		// Using default local DaemonSet
+		framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
+		data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
+		framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
+		ds, err = e2emanifest.DaemonSetFromData(data)
+		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
+	}
+	ds.Namespace = f.Namespace.Name
+	_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
+	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
+	framework.Logf("Successfully created daemonset to install Nvidia drivers.")
+
+	pods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet"))
+	framework.ExpectNoError(err, "failed to get pods controlled by the nvidia-driver-installer daemonset")
+
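+	// The nvidia-gpu-device-plugin addon DaemonSet may not exist on every cluster; a lookup
+	// failure is tolerated here and its pods are only gathered when the addon is found.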
+	devicepluginPods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet"))
+	if err == nil {
+		framework.Logf("Adding deviceplugin addon pod.")
+		pods.Items = append(pods.Items, devicepluginPods.Items...)
+	}
+
+	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
+	rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
+		e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
+	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
+	go rsgather.StartGatheringData(ctx)
+
+	// Wait for Nvidia GPUs to be available on nodes
+	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
+	gomega.Eventually(ctx, func(ctx context.Context) bool {
+		return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
+	}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timeout"))
+
+	return rsgather
+}