|
| 1 | +// Copyright 2025 The gVisor Authors. |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
| 15 | +// Package driver implements tests for driver version compatibility. |
| 16 | +package driver |
| 17 | + |
| 18 | +import ( |
| 19 | + "bytes" |
| 20 | + "context" |
| 21 | + "encoding/json" |
| 22 | + "fmt" |
| 23 | + "io" |
| 24 | + "net/http" |
| 25 | + "strings" |
| 26 | + "testing" |
| 27 | + "time" |
| 28 | + |
| 29 | + "google.golang.org/protobuf/encoding/prototext" |
| 30 | + cospb "gvisor.dev/gvisor/test/gpu/gpu_driver_versions_go_proto" |
| 31 | + "gvisor.dev/gvisor/test/kubernetes/k8sctx" |
| 32 | + "gvisor.dev/gvisor/test/kubernetes/testcluster" |
| 33 | + v13 "k8s.io/api/core/v1" |
| 34 | +) |
| 35 | + |
// unsupportedGPUs is the set of GPU type names that RunDriverVersion skips
// when walking the COS driver list. Only key presence is consulted there; the
// stored values carry no meaning.
var unsupportedGPUs = map[string]any{
	"NO_GPU":            true,
	"NVIDIA_TESLA_P100": true,
	"NVIDIA_TESLA_P4":   true,
	"NVIDIA_TESLA_V100": true,
	"OTHERS":            true,
}
| 43 | + |
| 44 | +// RunDriverVersion tests that all driver versions the cluster version are compatible with |
| 45 | +// the runsc version. |
| 46 | +func RunDriverVersion(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContext, cluster *testcluster.TestCluster) { |
| 47 | + ns := cluster.Namespace(testcluster.NamespaceDefault) |
| 48 | + image, err := k8sCtx.ResolveImage(ctx, "alpine") |
| 49 | + if err != nil { |
| 50 | + t.Fatalf("Failed to resolve image: %v", err) |
| 51 | + } |
| 52 | + |
| 53 | + runscSupportedDrivers, err := getRunscDriverInfo(ctx, ns, cluster, image) |
| 54 | + if err != nil { |
| 55 | + t.Fatalf("Failed to get runsc supported drivers: %v", err) |
| 56 | + } |
| 57 | + |
| 58 | + cosDriverVersions, err := getCOSDrivers(ctx, ns, cluster, image) |
| 59 | + if err != nil { |
| 60 | + t.Fatalf("Failed to get COS driver versions: %v", err) |
| 61 | + } |
| 62 | + |
| 63 | + for _, info := range cosDriverVersions.GetGpuDriverVersionInfo() { |
| 64 | + if _, ok := unsupportedGPUs[info.GetGpuDevice().GetGpuType()]; ok { |
| 65 | + continue |
| 66 | + } |
| 67 | + t.Run(info.GetGpuDevice().GetGpuType(), func(t *testing.T) { |
| 68 | + for _, driver := range info.GetSupportedDriverVersions() { |
| 69 | + switch driver.GetLabel() { |
| 70 | + case "LATEST": |
| 71 | + case "DEFAULT": |
| 72 | + default: |
| 73 | + continue |
| 74 | + } |
| 75 | + |
| 76 | + if _, ok := runscSupportedDrivers[driver.GetVersion()]; !ok { |
| 77 | + t.Errorf("Driver version %v is not supported by runsc", driver) |
| 78 | + } |
| 79 | + } |
| 80 | + }) |
| 81 | + } |
| 82 | +} |
| 83 | + |
| 84 | +func getRunscDriverInfo(ctx context.Context, ns *testcluster.Namespace, cluster *testcluster.TestCluster, image string) (map[string]any, error) { |
| 85 | + const runtimePath = "/home/containerd/usr/local/sbin/runsc" |
| 86 | + pod := ns.NewAlpinePod(fmt.Sprintf("hello-%d", time.Now().UnixNano()), image, []string{}) |
| 87 | + pod, err := cluster.ConfigurePodForRuntimeTestNodepool(ctx, pod) |
| 88 | + if err != nil { |
| 89 | + return nil, fmt.Errorf("failed to set pod on cluster %q: %v", cluster.GetName(), err) |
| 90 | + } |
| 91 | + |
| 92 | + pod.Spec.RuntimeClassName = nil |
| 93 | + pod.Spec.Tolerations = append(pod.Spec.Tolerations, cluster.GetGVisorRuntimeToleration()) |
| 94 | + |
| 95 | + pod.Spec.Volumes = append(pod.Spec.Volumes, v13.Volume{ |
| 96 | + Name: "runsc", |
| 97 | + VolumeSource: v13.VolumeSource{ |
| 98 | + HostPath: &v13.HostPathVolumeSource{ |
| 99 | + Path: runtimePath, |
| 100 | + Type: new(v13.HostPathType), |
| 101 | + }, |
| 102 | + }, |
| 103 | + }) |
| 104 | + |
| 105 | + container := v13.Container{ |
| 106 | + Name: "runsc", |
| 107 | + Image: image, |
| 108 | + Command: []string{"./runsc", "nvproxy", "list-supported-drivers"}, |
| 109 | + VolumeMounts: []v13.VolumeMount{ |
| 110 | + { |
| 111 | + Name: "runsc", |
| 112 | + MountPath: "runsc", |
| 113 | + ReadOnly: false, |
| 114 | + }, |
| 115 | + }, |
| 116 | + } |
| 117 | + |
| 118 | + pod.Spec.Containers = []v13.Container{container} |
| 119 | + pod, err = cluster.CreatePod(ctx, pod) |
| 120 | + if err != nil { |
| 121 | + return nil, fmt.Errorf("failed to create pod on cluster %q: %v", cluster.GetName(), err) |
| 122 | + } |
| 123 | + defer cluster.DeletePod(ctx, pod) |
| 124 | + if err := cluster.WaitForPodCompleted(ctx, pod); err != nil { |
| 125 | + return nil, fmt.Errorf("failed to wait for pod on cluster %q: %v", cluster.GetName(), err) |
| 126 | + } |
| 127 | + reader, err := cluster.GetLogReader(ctx, pod, v13.PodLogOptions{}) |
| 128 | + if err != nil { |
| 129 | + return nil, fmt.Errorf("failed to get log reader on cluster %q: %v", cluster.GetName(), err) |
| 130 | + } |
| 131 | + defer reader.Close() |
| 132 | + |
| 133 | + buf := new(bytes.Buffer) |
| 134 | + if _, err := io.Copy(buf, reader); err != nil { |
| 135 | + return nil, fmt.Errorf("failed to read log on cluster %q: %v", cluster.GetName(), err) |
| 136 | + } |
| 137 | + versions := make(map[string]any) |
| 138 | + for _, v := range strings.Split(strings.TrimSpace(buf.String()), "\n") { |
| 139 | + versions[v] = true |
| 140 | + } |
| 141 | + if len(versions) == 0 { |
| 142 | + return nil, fmt.Errorf("no driver versions found in log: %s", buf.String()) |
| 143 | + } |
| 144 | + return versions, nil |
| 145 | +} |
| 146 | + |
| 147 | +func getCOSDrivers(ctx context.Context, ns *testcluster.Namespace, cluster *testcluster.TestCluster, image string) (*cospb.GPUDriverVersionInfoList, error) { |
| 148 | + const cosExtensionsPath = "/etc/cos-package-info.json" |
| 149 | + const cosExtensions = "cos-extensions" |
| 150 | + pod := ns.NewAlpinePod(fmt.Sprintf("cos-%d", time.Now().UnixNano()), image, []string{}) |
| 151 | + pod, err := cluster.ConfigurePodForRuntimeTestNodepool(ctx, pod) |
| 152 | + if err != nil { |
| 153 | + return nil, fmt.Errorf("failed to set pod on cluster %q: %v", cluster.GetName(), err) |
| 154 | + } |
| 155 | + pod.Spec.RuntimeClassName = nil |
| 156 | + pod.Spec.Tolerations = append(pod.Spec.Tolerations, cluster.GetGVisorRuntimeToleration()) |
| 157 | + |
| 158 | + file := v13.HostPathType(v13.HostPathFile) |
| 159 | + pod.Spec.Volumes = append(pod.Spec.Volumes, v13.Volume{ |
| 160 | + Name: "host", |
| 161 | + VolumeSource: v13.VolumeSource{ |
| 162 | + HostPath: &v13.HostPathVolumeSource{ |
| 163 | + Path: cosExtensionsPath, |
| 164 | + Type: &file, |
| 165 | + }, |
| 166 | + }, |
| 167 | + }) |
| 168 | + |
| 169 | + container := v13.Container{ |
| 170 | + Name: cosExtensions, |
| 171 | + Image: image, |
| 172 | + Command: []string{"cat", cosExtensionsPath}, |
| 173 | + VolumeMounts: []v13.VolumeMount{ |
| 174 | + { |
| 175 | + Name: "host", |
| 176 | + MountPath: cosExtensionsPath, |
| 177 | + ReadOnly: true, |
| 178 | + }, |
| 179 | + }, |
| 180 | + } |
| 181 | + pod.Spec.Containers = []v13.Container{container} |
| 182 | + pod, err = cluster.CreatePod(ctx, pod) |
| 183 | + if err != nil { |
| 184 | + return nil, fmt.Errorf("failed to create pod on cluster %q: %v", cluster.GetName(), err) |
| 185 | + } |
| 186 | + defer cluster.DeletePod(ctx, pod) |
| 187 | + if err := cluster.WaitForPodCompleted(ctx, pod); err != nil { |
| 188 | + return nil, fmt.Errorf("failed to wait for pod on cluster %q: %v", cluster.GetName(), err) |
| 189 | + } |
| 190 | + reader, err := cluster.GetLogReader(ctx, pod, v13.PodLogOptions{}) |
| 191 | + if err != nil { |
| 192 | + return nil, fmt.Errorf("failed to get log reader on cluster %q: %v", cluster.GetName(), err) |
| 193 | + } |
| 194 | + defer reader.Close() |
| 195 | + buf := new(bytes.Buffer) |
| 196 | + if _, err := io.Copy(buf, reader); err != nil { |
| 197 | + return nil, fmt.Errorf("failed to read log on cluster %q: %v", cluster.GetName(), err) |
| 198 | + } |
| 199 | + |
| 200 | + cosVersion, err := extractCosVersion(buf) |
| 201 | + if err != nil { |
| 202 | + return nil, fmt.Errorf("failed to extract COS version: %v", err) |
| 203 | + } |
| 204 | + return getCOSDriverFromReleaseVersion(cosVersion) |
| 205 | + |
| 206 | +} |
| 207 | + |
// extractCosVersion parses the JSON document held in content and returns the
// "version" string of the first element of its top-level "installedPackages"
// array.
//
// An error is returned when content is not valid JSON for an object, when
// "installedPackages" is missing, is not an array, or is empty, when its
// first element is not an object, or when that element has no string
// "version" field.
func extractCosVersion(content *bytes.Buffer) (string, error) {
	var cosMap map[string]any
	if err := json.Unmarshal(content.Bytes(), &cosMap); err != nil {
		return "", fmt.Errorf("failed to unmarshal cos-extensions: %w", err)
	}

	packages := cosMap["installedPackages"]
	pkgs, ok := packages.([]any)
	if !ok {
		return "", fmt.Errorf("installedPackages not found in cos-extensions: %v", packages)
	}
	// Indexing an empty list would panic; report it as an ordinary error.
	if len(pkgs) == 0 {
		return "", fmt.Errorf("installedPackages is empty in cos-extensions")
	}

	pkg, ok := pkgs[0].(map[string]any)
	if !ok {
		return "", fmt.Errorf("first installedPackages entry in cos-extensions is not an object: %v", pkgs[0])
	}

	version, ok := pkg["version"].(string)
	if !ok {
		return "", fmt.Errorf("version not found in cos-extensions: %v", pkg)
	}

	return version, nil
}
| 232 | + |
| 233 | +func getCOSDriverFromReleaseVersion(cosVersion string) (*cospb.GPUDriverVersionInfoList, error) { |
| 234 | + // Each entry on the COS release list has a corresponding textproto file with the list of GPU |
| 235 | + // driver versions supported in that release. |
| 236 | + // See: https://cloud.google.com/container-optimized-os/docs/release-notes |
| 237 | + url := fmt.Sprintf("https://storage.googleapis.com/cos-tools/%s/lakitu/gpu_driver_versions.textproto", cosVersion) |
| 238 | + resp, err := http.Get(url) |
| 239 | + if err != nil { |
| 240 | + return nil, fmt.Errorf("failed to get driver versions for release %q: %w", cosVersion, err) |
| 241 | + } |
| 242 | + defer resp.Body.Close() |
| 243 | + |
| 244 | + var content []byte |
| 245 | + |
| 246 | + switch { |
| 247 | + case resp.StatusCode == 404: |
| 248 | + // When COS versions are newly released, they will often show up in projects but not the release |
| 249 | + // page. In this case, we return an empty list of driver versions. |
| 250 | + content = []byte("gpu_driver_version_info: []") |
| 251 | + default: |
| 252 | + content, err = io.ReadAll(resp.Body) |
| 253 | + if err != nil { |
| 254 | + return nil, fmt.Errorf("failed to read driver versions for release %q: %w", cosVersion, err) |
| 255 | + } |
| 256 | + } |
| 257 | + |
| 258 | + list := cospb.GPUDriverVersionInfoList{} |
| 259 | + if err := prototext.Unmarshal(content, &list); err != nil { |
| 260 | + return nil, fmt.Errorf("failed to unmarshal driver versions: %v", err) |
| 261 | + } |
| 262 | + |
| 263 | + return &list, nil |
| 264 | +} |
0 commit comments