
Commit ff391ce

Merge pull request kubernetes#127547 from dims/skip-reinstallation-of-gpu-daemonset
Skip re-installation of GPU daemonset
2 parents: f187480 + 1abbb00

File tree

2 files changed: +28 -27 lines changed

test/e2e/framework/job/wait.go

Lines changed: 16 additions & 5 deletions
@@ -42,17 +42,23 @@ type JobState func(job *batchv1.Job) string
 // WaitForJobPodsRunning wait for all pods for the Job named JobName in namespace ns to become Running. Only use
 // when pods will run for a long time, or it will be racy.
 func WaitForJobPodsRunning(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error {
-	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning)
+	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning, JobTimeout)
+}
+
+// WaitForJobPodsRunningWithTimeout wait for all pods for the Job named JobName in namespace ns to become Running. Only use
+// when pods will run for a long time, or it will be racy. same as WaitForJobPodsRunning but with an additional timeout parameter
+func WaitForJobPodsRunningWithTimeout(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, timeout time.Duration) error {
+	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning, timeout)
 }
 
 // WaitForJobPodsSucceeded wait for all pods for the Job named JobName in namespace ns to become Succeeded.
 func WaitForJobPodsSucceeded(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error {
-	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodSucceeded)
+	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodSucceeded, JobTimeout)
 }
 
 // waitForJobPodsInPhase wait for all pods for the Job named JobName in namespace ns to be in a given phase.
-func waitForJobPodsInPhase(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, phase v1.PodPhase) error {
-	return wait.PollUntilContextTimeout(ctx, framework.Poll, JobTimeout, false, func(ctx context.Context) (bool, error) {
+func waitForJobPodsInPhase(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, phase v1.PodPhase, timeout time.Duration) error {
+	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, false, func(ctx context.Context) (bool, error) {
 		pods, err := GetJobPods(ctx, c, ns, jobName)
 		if err != nil {
 			return false, err
@@ -157,7 +163,12 @@ func isJobFailed(j *batchv1.Job) bool {
 
 // WaitForJobFinish uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete).
 func WaitForJobFinish(ctx context.Context, c clientset.Interface, ns, jobName string) error {
-	return wait.PollUntilContextTimeout(ctx, framework.Poll, JobTimeout, true, func(ctx context.Context) (bool, error) {
+	return WaitForJobFinishWithTimeout(ctx, c, ns, jobName, JobTimeout)
+}
+
+// WaitForJobFinishWithTimeout uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete).
+func WaitForJobFinishWithTimeout(ctx context.Context, c clientset.Interface, ns, jobName string, timeout time.Duration) error {
+	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
 		curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
 		if err != nil {
 			return false, err
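
Below is a minimal, hypothetical usage sketch (not part of this commit) showing how a caller could use the new timeout-aware helpers. The helper name waitForLongGPUJob and the doubled timeout are assumptions for illustration, mirroring how gpu.go uses these functions.

package example

import (
	"context"

	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
)

// waitForLongGPUJob is a hypothetical helper: it waits for the first pod of a
// long-running job to reach Running, then for the job to finish, allowing twice
// the framework's default JobTimeout for each step.
func waitForLongGPUJob(ctx context.Context, c clientset.Interface, ns, jobName string) {
	// Wait for one pod of the job to be Running, with an extended timeout.
	err := e2ejob.WaitForJobPodsRunningWithTimeout(ctx, c, ns, jobName, 1, 2*e2ejob.JobTimeout)
	framework.ExpectNoError(err)

	// Wait for the job itself to reach Complete or Failed, with the same extended timeout.
	err = e2ejob.WaitForJobFinishWithTimeout(ctx, c, ns, jobName, 2*e2ejob.JobTimeout)
	framework.ExpectNoError(err)
}

Existing callers are unaffected by the change, because WaitForJobPodsRunning and WaitForJobFinish now simply delegate to the new variants with the default JobTimeout.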

test/e2e/node/gpu.go

Lines changed: 12 additions & 22 deletions
@@ -31,7 +31,6 @@ import (
 	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
-	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
 	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
 	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
@@ -131,7 +130,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 	framework.ExpectNoError(err)
 
 	// make sure job is running by waiting for its first pod to start running
-	err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+	err = e2ejob.WaitForJobPodsRunningWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1, e2ejob.JobTimeout*2)
 	framework.ExpectNoError(err)
 
 	numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
@@ -140,7 +139,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 	framework.ExpectNoError(err)
 
 	ginkgo.By("Waiting for gpu job to finish")
-	err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+	err = e2ejob.WaitForJobFinishWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout*2)
 	framework.ExpectNoError(err)
 	ginkgo.By("Done with gpu job")
 
@@ -154,7 +153,7 @@ func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient
 	pod = podClient.Create(ctx, pod)
 
 	ginkgo.By("Watching for error events or started pod")
-	ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*3)
+	ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6)
 	framework.ExpectNoError(err)
 	gomega.Expect(ev).To(gomega.BeNil())
 
@@ -263,15 +262,7 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 
 func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
 	if framework.ProviderIs("gce") {
-		rsgather := SetupNVIDIAGPUNode(ctx, f)
-		defer func() {
-			framework.Logf("Stopping ResourceUsageGather")
-			constraints := make(map[string]e2edebug.ResourceConstraint)
-			// For now, just gets summary. Can pass valid constraints in the future.
-			summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
-			f.TestSummaries = append(f.TestSummaries, summary)
-			framework.ExpectNoError(err, "getting resource usage summary")
-		}()
+		SetupNVIDIAGPUNode(ctx, f)
 	}
 	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 	framework.ExpectNoError(err)
@@ -329,7 +320,7 @@ const (
 )
 
 // SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes
-func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {
+func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) {
 	logOSImages(ctx, f)
 
 	var err error
@@ -348,6 +339,13 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 		ds, err = e2emanifest.DaemonSetFromData(data)
 		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
 	}
+
+	prev, err := f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get(ctx, ds.Name, metav1.GetOptions{})
+	if err == nil && prev != nil {
+		framework.Logf("Daemonset already installed, skipping...")
+		return
+	}
+
 	ds.Namespace = f.Namespace.Name
 	_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
 	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
@@ -362,19 +360,11 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 		pods.Items = append(pods.Items, devicepluginPods.Items...)
 	}
 
-	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
-	rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
-		e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
-	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
-	go rsgather.StartGatheringData(ctx)
-
 	// Wait for Nvidia GPUs to be available on nodes
 	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
 	gomega.Eventually(ctx, func(ctx context.Context) bool {
 		return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
 	}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timout"))
-
-	return rsgather
 }
 
 // StartJob starts a simple CUDA job that requests gpu and the specified number of completions
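
For reference, a self-contained sketch of the get-before-create pattern this commit uses to skip re-installing the daemonset. The helper name ensureDaemonSet is hypothetical; unlike the commit's inline check, this sketch also surfaces errors other than "not found" instead of falling through to Create.

package example

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"
)

// ensureDaemonSet creates ds in namespace ns only if a daemonset with the same
// name does not already exist, so repeated test setup stays idempotent.
func ensureDaemonSet(ctx context.Context, c clientset.Interface, ns string, ds *appsv1.DaemonSet) (*appsv1.DaemonSet, error) {
	// Reuse a daemonset left behind by a previous setup run.
	if prev, err := c.AppsV1().DaemonSets(ns).Get(ctx, ds.Name, metav1.GetOptions{}); err == nil {
		return prev, nil
	} else if !apierrors.IsNotFound(err) {
		// Anything other than "not found" is a real error and should not be masked by a Create attempt.
		return nil, err
	}
	ds.Namespace = ns
	return c.AppsV1().DaemonSets(ns).Create(ctx, ds, metav1.CreateOptions{})
}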
