Skip to content

Commit bae2ea3

Browse files
zkoopmans authored and gvisor-bot committed
Implement driver version test
PiperOrigin-RevId: 762098050
1 parent 14ece0d commit bae2ea3

File tree

5 files changed

+326
-1
lines changed

5 files changed

+326
-1
lines changed

test/gpu/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,9 @@ go_test(
160160

161161
proto_library(
162162
name = "gpu_driver_versions",
163+
testonly = True,
163164
srcs = ["gpu_driver_versions.proto"],
165+
visibility = ["//test/kubernetes:__subpackages__"],
164166
)
165167

166168
go_test(

test/kubernetes/testcluster/testcluster.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,7 @@ func (t *TestCluster) applyCommonPodConfigurations(ctx context.Context, np *Node
635635

636636
// Figure out which runtime to use for this pod, either by flag override or
637637
// autodetection based on the nodepool configuration.
638-
var applyRuntime = np.runtime
638+
applyRuntime := np.runtime
639639
if np.nodePooltype == TestRuntimeNodepoolName && t.testNodepoolRuntimeOverride != "" {
640640
applyRuntime = t.testNodepoolRuntimeOverride
641641
}

test/kubernetes/tests/BUILD

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,28 @@ go_test(
2929
],
3030
deps = ["//test/kubernetes/k8sctx/kubectlctx"],
3131
)
32+
33+
go_library(
34+
name = "driver",
35+
testonly = True,
36+
srcs = ["driver_version.go"],
37+
deps = [
38+
"//test/gpu:gpu_driver_versions_go_proto",
39+
"//test/kubernetes/k8sctx",
40+
"//test/kubernetes/testcluster",
41+
"@io_k8s_api//core/v1:go_default_library",
42+
"@org_golang_google_protobuf//encoding/prototext:go_default_library",
43+
],
44+
)
45+
46+
go_test(
47+
name = "driver_test",
48+
srcs = ["driver_version_test.go"],
49+
library = ":driver",
50+
tags = [
51+
"local",
52+
"noguitar",
53+
"notap",
54+
],
55+
deps = ["//test/kubernetes/k8sctx/kubectlctx"],
56+
)
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
// Copyright 2025 The gVisor Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package driver implements tests for driver version compatibility.
16+
package driver
17+
18+
import (
19+
"bytes"
20+
"context"
21+
"encoding/json"
22+
"fmt"
23+
"io"
24+
"net/http"
25+
"strings"
26+
"testing"
27+
"time"
28+
29+
"google.golang.org/protobuf/encoding/prototext"
30+
cospb "gvisor.dev/gvisor/test/gpu/gpu_driver_versions_go_proto"
31+
"gvisor.dev/gvisor/test/kubernetes/k8sctx"
32+
"gvisor.dev/gvisor/test/kubernetes/testcluster"
33+
v13 "k8s.io/api/core/v1"
34+
)
35+
36+
// unsupportedGPUs is the set of GPU type names that RunDriverVersion skips
// when walking the COS driver version list. Only key presence is consulted;
// the values are never read.
// NOTE(review): presumably these are GPU types that are out of scope for
// runsc driver support -- confirm.
var unsupportedGPUs = map[string]any{
37+
"NVIDIA_TESLA_V100": true,
38+
"NVIDIA_TESLA_P100": true,
39+
"NVIDIA_TESLA_P4": true,
40+
"OTHERS": true,
41+
"NO_GPU": true,
42+
}
43+
44+
// RunDriverVersion tests that all driver versions the cluster version are compatible with
45+
// the runsc version.
46+
func RunDriverVersion(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContext, cluster *testcluster.TestCluster) {
47+
ns := cluster.Namespace(testcluster.NamespaceDefault)
48+
image, err := k8sCtx.ResolveImage(ctx, "alpine")
49+
if err != nil {
50+
t.Fatalf("Failed to resolve image: %v", err)
51+
}
52+
53+
runscSupportedDrivers, err := getRunscDriverInfo(ctx, ns, cluster, image)
54+
if err != nil {
55+
t.Fatalf("Failed to get runsc supported drivers: %v", err)
56+
}
57+
58+
cosDriverVersions, err := getCOSDrivers(ctx, ns, cluster, image)
59+
if err != nil {
60+
t.Fatalf("Failed to get COS driver versions: %v", err)
61+
}
62+
63+
for _, info := range cosDriverVersions.GetGpuDriverVersionInfo() {
64+
if _, ok := unsupportedGPUs[info.GetGpuDevice().GetGpuType()]; ok {
65+
continue
66+
}
67+
t.Run(info.GetGpuDevice().GetGpuType(), func(t *testing.T) {
68+
for _, driver := range info.GetSupportedDriverVersions() {
69+
switch driver.GetLabel() {
70+
case "LATEST":
71+
case "DEFAULT":
72+
default:
73+
continue
74+
}
75+
76+
if _, ok := runscSupportedDrivers[driver.GetVersion()]; !ok {
77+
t.Errorf("Driver version %v is not supported by runsc", driver)
78+
}
79+
}
80+
})
81+
}
82+
}
83+
84+
func getRunscDriverInfo(ctx context.Context, ns *testcluster.Namespace, cluster *testcluster.TestCluster, image string) (map[string]any, error) {
85+
const runtimePath = "/home/containerd/usr/local/sbin/runsc"
86+
pod := ns.NewAlpinePod(fmt.Sprintf("hello-%d", time.Now().UnixNano()), image, []string{})
87+
pod, err := cluster.ConfigurePodForRuntimeTestNodepool(ctx, pod)
88+
if err != nil {
89+
return nil, fmt.Errorf("failed to set pod on cluster %q: %v", cluster.GetName(), err)
90+
}
91+
92+
pod.Spec.RuntimeClassName = nil
93+
pod.Spec.Tolerations = append(pod.Spec.Tolerations, cluster.GetGVisorRuntimeToleration())
94+
95+
pod.Spec.Volumes = append(pod.Spec.Volumes, v13.Volume{
96+
Name: "runsc",
97+
VolumeSource: v13.VolumeSource{
98+
HostPath: &v13.HostPathVolumeSource{
99+
Path: runtimePath,
100+
Type: new(v13.HostPathType),
101+
},
102+
},
103+
})
104+
105+
container := v13.Container{
106+
Name: "runsc",
107+
Image: image,
108+
Command: []string{"./runsc", "nvproxy", "list-supported-drivers"},
109+
VolumeMounts: []v13.VolumeMount{
110+
{
111+
Name: "runsc",
112+
MountPath: "runsc",
113+
ReadOnly: false,
114+
},
115+
},
116+
}
117+
118+
pod.Spec.Containers = []v13.Container{container}
119+
pod, err = cluster.CreatePod(ctx, pod)
120+
if err != nil {
121+
return nil, fmt.Errorf("failed to create pod on cluster %q: %v", cluster.GetName(), err)
122+
}
123+
defer cluster.DeletePod(ctx, pod)
124+
if err := cluster.WaitForPodCompleted(ctx, pod); err != nil {
125+
return nil, fmt.Errorf("failed to wait for pod on cluster %q: %v", cluster.GetName(), err)
126+
}
127+
reader, err := cluster.GetLogReader(ctx, pod, v13.PodLogOptions{})
128+
if err != nil {
129+
return nil, fmt.Errorf("failed to get log reader on cluster %q: %v", cluster.GetName(), err)
130+
}
131+
defer reader.Close()
132+
133+
buf := new(bytes.Buffer)
134+
if _, err := io.Copy(buf, reader); err != nil {
135+
return nil, fmt.Errorf("failed to read log on cluster %q: %v", cluster.GetName(), err)
136+
}
137+
versions := make(map[string]any)
138+
for _, v := range strings.Split(strings.TrimSpace(buf.String()), "\n") {
139+
versions[v] = true
140+
}
141+
if len(versions) == 0 {
142+
return nil, fmt.Errorf("no driver versions found in log: %s", buf.String())
143+
}
144+
return versions, nil
145+
}
146+
147+
func getCOSDrivers(ctx context.Context, ns *testcluster.Namespace, cluster *testcluster.TestCluster, image string) (*cospb.GPUDriverVersionInfoList, error) {
148+
const cosExtensionsPath = "/etc/cos-package-info.json"
149+
const cosExtensions = "cos-extensions"
150+
pod := ns.NewAlpinePod(fmt.Sprintf("cos-%d", time.Now().UnixNano()), image, []string{})
151+
pod, err := cluster.ConfigurePodForRuntimeTestNodepool(ctx, pod)
152+
if err != nil {
153+
return nil, fmt.Errorf("failed to set pod on cluster %q: %v", cluster.GetName(), err)
154+
}
155+
pod.Spec.RuntimeClassName = nil
156+
pod.Spec.Tolerations = append(pod.Spec.Tolerations, cluster.GetGVisorRuntimeToleration())
157+
158+
file := v13.HostPathType(v13.HostPathFile)
159+
pod.Spec.Volumes = append(pod.Spec.Volumes, v13.Volume{
160+
Name: "host",
161+
VolumeSource: v13.VolumeSource{
162+
HostPath: &v13.HostPathVolumeSource{
163+
Path: cosExtensionsPath,
164+
Type: &file,
165+
},
166+
},
167+
})
168+
169+
container := v13.Container{
170+
Name: cosExtensions,
171+
Image: image,
172+
Command: []string{"cat", cosExtensionsPath},
173+
VolumeMounts: []v13.VolumeMount{
174+
{
175+
Name: "host",
176+
MountPath: cosExtensionsPath,
177+
ReadOnly: true,
178+
},
179+
},
180+
}
181+
pod.Spec.Containers = []v13.Container{container}
182+
pod, err = cluster.CreatePod(ctx, pod)
183+
if err != nil {
184+
return nil, fmt.Errorf("failed to create pod on cluster %q: %v", cluster.GetName(), err)
185+
}
186+
defer cluster.DeletePod(ctx, pod)
187+
if err := cluster.WaitForPodCompleted(ctx, pod); err != nil {
188+
return nil, fmt.Errorf("failed to wait for pod on cluster %q: %v", cluster.GetName(), err)
189+
}
190+
reader, err := cluster.GetLogReader(ctx, pod, v13.PodLogOptions{})
191+
if err != nil {
192+
return nil, fmt.Errorf("failed to get log reader on cluster %q: %v", cluster.GetName(), err)
193+
}
194+
defer reader.Close()
195+
buf := new(bytes.Buffer)
196+
if _, err := io.Copy(buf, reader); err != nil {
197+
return nil, fmt.Errorf("failed to read log on cluster %q: %v", cluster.GetName(), err)
198+
}
199+
200+
cosVersion, err := extractCosVersion(buf)
201+
if err != nil {
202+
return nil, fmt.Errorf("failed to extract COS version: %v", err)
203+
}
204+
return getCOSDriverFromReleaseVersion(cosVersion)
205+
206+
}
207+
208+
// extractCosVersion parses the JSON document held in content and returns the
// "version" string of the first entry of its top-level "installedPackages"
// array.
//
// An error (never a panic) is returned when content is nil or is not a JSON
// object, when "installedPackages" is missing, is not an array or is empty,
// when its first entry is not a JSON object, or when that entry has no string
// "version" field.
func extractCosVersion(content *bytes.Buffer) (string, error) {
	if content == nil {
		return "", fmt.Errorf("failed to unmarshal cos-extensions: no content")
	}

	cosMap := make(map[string]any)
	if err := json.Unmarshal(content.Bytes(), &cosMap); err != nil {
		return "", fmt.Errorf("failed to unmarshal cos-extensions: %w", err)
	}

	packages := cosMap["installedPackages"]
	pkgs, ok := packages.([]any)
	if !ok {
		return "", fmt.Errorf("installedPackages not found in cos-extensions: %v", packages)
	}
	// Guard the index below: an empty array would otherwise panic.
	if len(pkgs) == 0 {
		return "", fmt.Errorf("installedPackages is empty in cos-extensions")
	}

	pkg, ok := pkgs[0].(map[string]any)
	if !ok {
		return "", fmt.Errorf("first installed package is not an object in cos-extensions: %v", pkgs[0])
	}

	version, ok := pkg["version"].(string)
	if !ok {
		return "", fmt.Errorf("version not found in cos-extensions: %v", pkg)
	}

	return version, nil
}
232+
233+
func getCOSDriverFromReleaseVersion(cosVersion string) (*cospb.GPUDriverVersionInfoList, error) {
234+
// Each entry on the COS release list has a corresponding textproto file with the list of GPU
235+
// driver versions supported in that release.
236+
// See: https://cloud.google.com/container-optimized-os/docs/release-notes
237+
url := fmt.Sprintf("https://storage.googleapis.com/cos-tools/%s/lakitu/gpu_driver_versions.textproto", cosVersion)
238+
resp, err := http.Get(url)
239+
if err != nil {
240+
return nil, fmt.Errorf("failed to get driver versions for release %q: %w", cosVersion, err)
241+
}
242+
defer resp.Body.Close()
243+
244+
var content []byte
245+
246+
switch {
247+
case resp.StatusCode == 404:
248+
// When COS versions are newly released, they will often show up in projects but not the release
249+
// page. In this case, we return an empty list of driver versions.
250+
content = []byte("gpu_driver_version_info: []")
251+
default:
252+
content, err = io.ReadAll(resp.Body)
253+
if err != nil {
254+
return nil, fmt.Errorf("failed to read driver versions for release %q: %w", cosVersion, err)
255+
}
256+
}
257+
258+
list := cospb.GPUDriverVersionInfoList{}
259+
if err := prototext.Unmarshal(content, &list); err != nil {
260+
return nil, fmt.Errorf("failed to unmarshal driver versions: %v", err)
261+
}
262+
263+
return &list, nil
264+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Copyright 2025 The gVisor Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package driver
16+
17+
import (
18+
"context"
19+
"testing"
20+
21+
"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
22+
)
23+
24+
// TestDriverVersion tests that a trivial alpine container runs correctly.
25+
func TestDriverVersion(t *testing.T) {
26+
ctx := context.Background()
27+
k8sCtx, err := kubectlctx.New(ctx)
28+
if err != nil {
29+
t.Fatalf("Failed to get kubernetes context: %v", err)
30+
}
31+
cluster, releaseFn := k8sCtx.Cluster(ctx, t)
32+
defer releaseFn()
33+
RunDriverVersion(ctx, t, k8sCtx, cluster)
34+
}

0 commit comments

Comments
 (0)