Skip to content

Commit 71bdcab

Browse files
committed
Add some simple tests for nvidia GPU(s)
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent d6e5fb4 commit 71bdcab

File tree

1 file changed

+206
-0
lines changed

1 file changed

+206
-0
lines changed

test/e2e/node/gpu.go

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package node
18+
19+
import (
20+
"context"
21+
v1 "k8s.io/api/core/v1"
22+
"k8s.io/apimachinery/pkg/api/resource"
23+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
24+
"k8s.io/apimachinery/pkg/util/uuid"
25+
clientset "k8s.io/client-go/kubernetes"
26+
"k8s.io/kubernetes/test/e2e/feature"
27+
"k8s.io/kubernetes/test/e2e/framework"
28+
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
29+
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
30+
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
31+
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
32+
admissionapi "k8s.io/pod-security-admission/api"
33+
34+
"github.com/onsi/ginkgo/v2"
35+
"github.com/onsi/gomega"
36+
)
37+
38+
var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
39+
40+
f := framework.NewDefaultFramework("nvidia-gpu")
41+
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
42+
var podClient *e2epod.PodClient
43+
44+
ginkgo.BeforeEach(func() {
45+
e2eskipper.SkipUnlessProviderIs("aws")
46+
podClient = e2epod.NewPodClient(f)
47+
})
48+
49+
f.It("should run nvidia-smi cli", func(ctx context.Context) {
50+
checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
51+
pod := testNvidiaCLIPod()
52+
pod.Spec.Containers[0].Command = []string{"nvidia-smi"}
53+
54+
ginkgo.By("Creating a pod that runs nvidia-smi")
55+
createAndValidatePod(ctx, f, podClient, pod)
56+
57+
ginkgo.By("Getting logs from the pod")
58+
log, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
59+
framework.ExpectNoError(err)
60+
61+
ginkgo.By("Checking output from nvidia-smi")
62+
gomega.Expect(log).To(gomega.ContainSubstring("NVIDIA-SMI"))
63+
gomega.Expect(log).To(gomega.ContainSubstring("Driver Version:"))
64+
gomega.Expect(log).To(gomega.ContainSubstring("CUDA Version:"))
65+
})
66+
67+
f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
68+
checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
69+
pod := testMatrixMultiplicationPod()
70+
71+
ginkgo.By("Creating a pod that runs matrix multiplication")
72+
createAndValidatePod(ctx, f, podClient, pod)
73+
74+
ginkgo.By("Getting logs from the pod")
75+
log, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
76+
framework.ExpectNoError(err)
77+
78+
ginkgo.By("Checking output from nvidia-smi")
79+
gomega.Expect(log).To(gomega.ContainSubstring("TensorFlow version"))
80+
gomega.Expect(log).To(gomega.ContainSubstring("Matrix multiplication result:"))
81+
gomega.Expect(log).To(gomega.ContainSubstring("Time taken for 5000x5000 matrix multiplication"))
82+
})
83+
})
84+
85+
func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
86+
pod = podClient.Create(ctx, pod)
87+
88+
ginkgo.By("Watching for error events or started pod")
89+
ev, err := podClient.WaitForErrorEventOrSuccess(ctx, pod)
90+
framework.ExpectNoError(err)
91+
gomega.Expect(ev).To(gomega.BeNil())
92+
93+
ginkgo.By("Waiting for pod completion")
94+
err = e2epod.WaitForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
95+
framework.ExpectNoError(err)
96+
pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{})
97+
framework.ExpectNoError(err)
98+
99+
ginkgo.By("Checking that the pod succeeded")
100+
gomega.Expect(pod.Status.Phase).To(gomega.Equal(v1.PodSucceeded))
101+
}
102+
103+
func testNvidiaCLIPod() *v1.Pod {
104+
podName := "gpu-cli-" + string(uuid.NewUUID())
105+
pod := v1.Pod{
106+
ObjectMeta: metav1.ObjectMeta{
107+
Name: podName,
108+
Annotations: map[string]string{},
109+
},
110+
Spec: v1.PodSpec{
111+
Containers: []v1.Container{
112+
{
113+
Name: "nvidia-smi",
114+
Image: "nvidia/cuda:12.3.2-runtime-ubuntu22.04",
115+
Resources: v1.ResourceRequirements{
116+
Limits: v1.ResourceList{
117+
"nvidia.com/gpu": resource.MustParse("1"),
118+
},
119+
},
120+
},
121+
},
122+
RestartPolicy: v1.RestartPolicyNever,
123+
},
124+
}
125+
return &pod
126+
}
127+
128+
func testMatrixMultiplicationPod() *v1.Pod {
129+
podName := "gpu-matmul-" + string(uuid.NewUUID())
130+
pod := v1.Pod{
131+
ObjectMeta: metav1.ObjectMeta{
132+
Name: podName,
133+
Annotations: map[string]string{},
134+
},
135+
Spec: v1.PodSpec{
136+
Containers: []v1.Container{
137+
{
138+
Name: "gpu-matmul",
139+
Image: "tensorflow/tensorflow:latest-gpu",
140+
Command: []string{
141+
"python",
142+
"-c",
143+
`
144+
import tensorflow as tf
145+
import time
146+
147+
print("TensorFlow version:", tf.__version__)
148+
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
149+
150+
# Simple matrix multiplication test
151+
with tf.device('/GPU:0'):
152+
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
153+
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
154+
c = tf.matmul(a, b)
155+
156+
print("Matrix multiplication result:", c.numpy())
157+
158+
# Performance test
159+
n = 5000
160+
start_time = time.time()
161+
with tf.device('/GPU:0'):
162+
matrix1 = tf.random.normal((n, n))
163+
matrix2 = tf.random.normal((n, n))
164+
result = tf.matmul(matrix1, matrix2)
165+
end_time = time.time()
166+
167+
print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f} seconds")
168+
`,
169+
},
170+
Resources: v1.ResourceRequirements{
171+
Limits: v1.ResourceList{
172+
"nvidia.com/gpu": resource.MustParse("1"),
173+
},
174+
},
175+
},
176+
},
177+
RestartPolicy: v1.RestartPolicyNever,
178+
},
179+
}
180+
return &pod
181+
}
182+
183+
func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.Interface) {
184+
nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
185+
framework.ExpectNoError(err)
186+
capacity := 0
187+
allocatable := 0
188+
for _, node := range nodes.Items {
189+
val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]
190+
if !ok {
191+
continue
192+
}
193+
capacity += int(val.Value())
194+
val, ok = node.Status.Allocatable[e2egpu.NVIDIAGPUResourceName]
195+
if !ok {
196+
continue
197+
}
198+
allocatable += int(val.Value())
199+
}
200+
if capacity == 0 {
201+
e2eskipper.Skipf("%d ready nodes do not have any Nvidia GPU(s). Skipping...", len(nodes.Items))
202+
}
203+
if allocatable == 0 {
204+
e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
205+
}
206+
}

0 commit comments

Comments
 (0)