Commit bf3f651

Merge pull request kubernetes#127456 from dims/install-nvidia-daemonset-in-test-harness-for-gce
Install Nvidia Daemonset in test harness for GCE
2 parents 283ff76 + 08a8cf7

File tree

1 file changed: +104 −4 lines


test/e2e/node/gpu.go

Lines changed: 104 additions & 4 deletions
@@ -18,17 +18,26 @@ package node
 
 import (
 	"context"
+	"os"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	clientset "k8s.io/client-go/kubernetes"
+	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
+	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
+	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
 	admissionapi "k8s.io/pod-security-admission/api"
 
 	"github.com/onsi/ginkgo/v2"
@@ -42,12 +51,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 	var podClient *e2epod.PodClient
 
 	ginkgo.BeforeEach(func() {
-		e2eskipper.SkipUnlessProviderIs("aws")
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
 		podClient = e2epod.NewPodClient(f)
 	})
 
 	f.It("should run nvidia-smi cli", func(ctx context.Context) {
-		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 		pod := testNvidiaCLIPod()
		pod.Spec.Containers[0].Command = []string{"nvidia-smi"}
 
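For context on what these test pods look like: the helpers testNvidiaCLIPod and testMatrixMultiplicationPod (defined elsewhere in this file, outside the diff) build pods that claim a GPU through the extended resource the Nvidia device plugin registers. A minimal sketch of such a pod, using only imports already present in this file; the container image is an illustrative placeholder, not the one the file actually uses:

	// Sketch: a pod requesting one Nvidia GPU via the extended resource
	// name (nvidia.com/gpu, e2egpu.NVIDIAGPUResourceName in the framework).
	pod := v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "gpu-test-" + string(uuid.NewUUID())},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{{
				Name:  "cuda",
				Image: "nvidia/cuda:12.2.0-base-ubuntu22.04", // placeholder image
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			}},
		},
	}

The scheduler will only place such a pod on a node whose Capacity advertises the nvidia.com/gpu resource, which is exactly what the setup code below waits for.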
@@ -65,7 +74,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 	})
 
 	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
-		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 		pod := testMatrixMultiplicationPod()
 
 		ginkgo.By("Creating a pod that runs matrix multiplication")
@@ -180,7 +189,18 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 	return &pod
 }
 
-func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.Interface) {
+func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
+	if framework.ProviderIs("gce") {
+		rsgather := SetupNVIDIAGPUNode(ctx, f)
+		defer func() {
+			framework.Logf("Stopping ResourceUsageGather")
+			constraints := make(map[string]e2edebug.ResourceConstraint)
+			// For now, just gets summary. Can pass valid constraints in the future.
+			summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
+			f.TestSummaries = append(f.TestSummaries, summary)
+			framework.ExpectNoError(err, "getting resource usage summary")
+		}()
+	}
 	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 	framework.ExpectNoError(err)
 	capacity := 0
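The constraints map above is deliberately empty, so StopAndSummarize only records the 50th/90th/100th-percentile usage summary. If the test later wants to enforce budgets, as the inline comment anticipates, the map could be populated per container name. A hedged sketch of what that might look like; the CPU and memory fields exist on e2edebug.ResourceConstraint, but the container name and numeric limits below are made-up placeholders:

	// Hypothetical future use: fail the run if the driver-installer pods
	// exceed these placeholder budgets.
	constraints := map[string]e2edebug.ResourceConstraint{
		"nvidia-driver-installer": {
			CPUConstraint:    0.5,               // cores (placeholder)
			MemoryConstraint: 200 * 1024 * 1024, // bytes (placeholder)
		},
	}
	summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
	framework.ExpectNoError(err, "getting resource usage summary")
	f.TestSummaries = append(f.TestSummaries, summary)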
@@ -204,3 +224,83 @@ func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.In
 		e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
 	}
 }
+
+func areGPUsAvailableOnAllSchedulableNodes(ctx context.Context, clientSet clientset.Interface) bool {
+	framework.Logf("Getting list of Nodes from API server")
+	nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if node.Spec.Unschedulable {
+			continue
+		}
+		framework.Logf("gpuResourceName %s", e2egpu.NVIDIAGPUResourceName)
+		if val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]; !ok || val.Value() == 0 {
+			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
+			return false
+		}
+	}
+	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
+	return true
+}
+
+func logOSImages(ctx context.Context, f *framework.Framework) {
+	nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		framework.Logf("Nodename: %v, OS Image: %v", node.Name, node.Status.NodeInfo.OSImage)
+	}
+}
+
+const (
+	// Nvidia driver installation can take upwards of 5 minutes.
+	driverInstallTimeout = 10 * time.Minute
+)
+
+// SetupNVIDIAGPUNode installs Nvidia drivers and waits for Nvidia GPUs to be available on nodes.
+func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {
+	logOSImages(ctx, f)
+
+	var err error
+	var ds *appsv1.DaemonSet
+	dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
+	if dsYamlURLFromEnv != "" {
+		// Using DaemonSet from remote URL
+		framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
+		ds, err = e2emanifest.DaemonSetFromURL(ctx, dsYamlURLFromEnv)
+		framework.ExpectNoError(err, "failed to get remote daemonset manifest")
+	} else {
+		// Using default local DaemonSet
+		framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
+		data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
+		framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
+		ds, err = e2emanifest.DaemonSetFromData(data)
+		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
+	}
+	ds.Namespace = f.Namespace.Name
+	_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
+	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
+	framework.Logf("Successfully created daemonset to install Nvidia drivers.")
+
+	pods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet"))
+	framework.ExpectNoError(err, "failed to get pods controlled by the nvidia-driver-installer daemonset")
+
+	devicepluginPods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet"))
+	if err == nil {
+		framework.Logf("Adding deviceplugin addon pod.")
+		pods.Items = append(pods.Items, devicepluginPods.Items...)
+	}
+
+	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
+	rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
+		e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
+	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
+	go rsgather.StartGatheringData(ctx)
+
+	// Wait for Nvidia GPUs to be available on nodes
+	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
+	gomega.Eventually(ctx, func(ctx context.Context) bool {
+		return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
+	}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timeout"))
+
+	return rsgather
+}
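One design note on the new SetupNVIDIAGPUNode: the installer manifest is resolved at run time, so a CI job can swap in a different driver installer without rebuilding the test binary by exporting NVIDIA_DRIVER_INSTALLER_DAEMONSET before the suite starts. A minimal sketch of that override from harness setup code; the URL is a placeholder, not a real manifest location:

	// Sketch: point SetupNVIDIAGPUNode at an alternate installer manifest.
	// The URL below is a placeholder.
	err := os.Setenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET",
		"https://example.com/manifests/nvidia-driver-installer.yaml")
	framework.ExpectNoError(err, "setting daemonset manifest override")

With the variable unset, the test falls back to the manifest vendored at test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml.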
