@@ -31,7 +31,6 @@ import (
 	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
-	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
 	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
 	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
@@ -131,7 +130,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 	framework.ExpectNoError(err)
 
 	// make sure job is running by waiting for its first pod to start running
-	err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+	err = e2ejob.WaitForJobPodsRunningWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1, e2ejob.JobTimeout*2)
 	framework.ExpectNoError(err)
 
 	numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
@@ -140,7 +139,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 	framework.ExpectNoError(err)
 
 	ginkgo.By("Waiting for gpu job to finish")
-	err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+	err = e2ejob.WaitForJobFinishWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout*2)
 	framework.ExpectNoError(err)
 	ginkgo.By("Done with gpu job")
 
@@ -154,7 +153,7 @@ func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient
 	pod = podClient.Create(ctx, pod)
 
 	ginkgo.By("Watching for error events or started pod")
-	ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*3)
+	ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6)
 	framework.ExpectNoError(err)
 	gomega.Expect(ev).To(gomega.BeNil())
 
@@ -263,15 +262,7 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 
 func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
 	if framework.ProviderIs("gce") {
-		rsgather := SetupNVIDIAGPUNode(ctx, f)
-		defer func() {
-			framework.Logf("Stopping ResourceUsageGather")
-			constraints := make(map[string]e2edebug.ResourceConstraint)
-			// For now, just gets summary. Can pass valid constraints in the future.
-			summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
-			f.TestSummaries = append(f.TestSummaries, summary)
-			framework.ExpectNoError(err, "getting resource usage summary")
-		}()
+		SetupNVIDIAGPUNode(ctx, f)
 	}
 	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 	framework.ExpectNoError(err)
@@ -329,7 +320,7 @@ const (
 )
 
 // SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes
-func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {
+func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) {
 	logOSImages(ctx, f)
 
 	var err error
@@ -348,6 +339,13 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 		ds, err = e2emanifest.DaemonSetFromData(data)
 		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
 	}
+
+	prev, err := f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get(ctx, ds.Name, metav1.GetOptions{})
+	if err == nil && prev != nil {
+		framework.Logf("Daemonset already installed, skipping...")
+		return
+	}
+
 	ds.Namespace = f.Namespace.Name
 	_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
 	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
@@ -362,19 +360,11 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 		pods.Items = append(pods.Items, devicepluginPods.Items...)
 	}
 
-	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
-	rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
-		e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
-	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
-	go rsgather.StartGatheringData(ctx)
-
 	// Wait for Nvidia GPUs to be available on nodes
 	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
 	gomega.Eventually(ctx, func(ctx context.Context) bool {
 		return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
 	}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timout"))
-
-	return rsgather
 }
 
 // StartJob starts a simple CUDA job that requests gpu and the specified number of completions
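For reference, the idempotency check added to SetupNVIDIAGPUNode follows a standard client-go pattern: look the DaemonSet up first and only create it when it is not already installed. The sketch below is illustrative and not part of this change; ensureDaemonSet is a hypothetical helper, and unlike the test code above it also distinguishes a NotFound error from other lookup failures.

package gpusetup

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// ensureDaemonSet (hypothetical helper, not part of the PR) creates ds in the
// given namespace only if it is not already installed, mirroring the check
// added to SetupNVIDIAGPUNode above.
func ensureDaemonSet(ctx context.Context, cs kubernetes.Interface, ns string, ds *appsv1.DaemonSet) error {
	if _, err := cs.AppsV1().DaemonSets(ns).Get(ctx, ds.Name, metav1.GetOptions{}); err == nil {
		// Already present: nothing to do.
		return nil
	} else if !apierrors.IsNotFound(err) {
		// A real lookup failure, not "does not exist yet".
		return err
	}
	_, err := cs.AppsV1().DaemonSets(ns).Create(ctx, ds, metav1.CreateOptions{})
	return err
}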