Skip to content

Commit 364e97f

Browse files
authored
Merge pull request #4728 from shiv-tyagi/detect-vendor-id
Detect vendor before crafting cdiDeviceIDs for --gpus
2 parents 2f107d5 + 4649baa commit 364e97f

File tree

6 files changed

+295
-67
lines changed

6 files changed

+295
-67
lines changed

cmd/nerdctl/container/container_run_linux_test.go

Lines changed: 167 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,16 @@ func TestRunDeviceCDI(t *testing.T) {
674674
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
675675
testutil.DockerIncompatible(t)
676676
cdiSpecDir := filepath.Join(t.TempDir(), "cdi")
677-
writeTestCDISpec(t, cdiSpecDir)
677+
const testCDIVendor1 = `
678+
cdiVersion: "0.3.0"
679+
kind: "vendor1.com/device"
680+
devices:
681+
- name: foo
682+
containerEdits:
683+
env:
684+
- FOO=injected
685+
`
686+
writeTestCDISpec(t, testCDIVendor1, "vendor1.yaml", cdiSpecDir)
678687

679688
base := testutil.NewBase(t)
680689
base.Cmd("--cdi-spec-dirs", cdiSpecDir, "run",
@@ -689,7 +698,16 @@ func TestRunDeviceCDIWithNerdctlConfig(t *testing.T) {
689698
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
690699
testutil.DockerIncompatible(t)
691700
cdiSpecDir := filepath.Join(t.TempDir(), "cdi")
692-
writeTestCDISpec(t, cdiSpecDir)
701+
const testCDIVendor1 = `
702+
cdiVersion: "0.3.0"
703+
kind: "vendor1.com/device"
704+
devices:
705+
- name: foo
706+
containerEdits:
707+
env:
708+
- FOO=injected
709+
`
710+
writeTestCDISpec(t, testCDIVendor1, "vendor1.yaml", cdiSpecDir)
693711

694712
tomlPath := filepath.Join(t.TempDir(), "nerdctl.toml")
695713
err := os.WriteFile(tomlPath, []byte(fmt.Sprintf(`
@@ -706,8 +724,128 @@ cdi_spec_dirs = ["%s"]
706724
).AssertOutContains("FOO=injected")
707725
}
708726

709-
func writeTestCDISpec(t *testing.T, cdiSpecDir string) {
710-
const testCDIVendor1 = `
727+
// TestRunGPU tests GPU injection using the --gpus flag.
728+
func TestRunGPU(t *testing.T) {
729+
t.Parallel()
730+
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
731+
testutil.DockerIncompatible(t)
732+
const nvidiaSpec = `
733+
cdiVersion: "0.5.0"
734+
kind: "nvidia.com/gpu"
735+
devices:
736+
- name: "0"
737+
containerEdits:
738+
env:
739+
- NVIDIA_GPU_0=injected
740+
- name: "1"
741+
containerEdits:
742+
env:
743+
- NVIDIA_GPU_1=injected
744+
`
745+
const amdSpec = `
746+
cdiVersion: "0.5.0"
747+
kind: "amd.com/gpu"
748+
devices:
749+
- name: "0"
750+
containerEdits:
751+
env:
752+
- AMD_GPU_0=injected
753+
- name: "1"
754+
containerEdits:
755+
env:
756+
- AMD_GPU_1=injected
757+
`
758+
const unknownSpec = `
759+
cdiVersion: "0.5.0"
760+
kind: "unknown.com/gpu"
761+
devices:
762+
- name: "0"
763+
containerEdits:
764+
env:
765+
- UNKNOWN_GPU_0=injected
766+
`
767+
768+
testCases := []struct {
769+
name string
770+
specs map[string]string
771+
gpuFlags []string
772+
expectedEnvs []string
773+
expectFail bool
774+
}{
775+
{
776+
name: "nvidia device injection",
777+
specs: map[string]string{"nvidia.yaml": nvidiaSpec},
778+
gpuFlags: []string{"--gpus", "2"},
779+
expectedEnvs: []string{"NVIDIA_GPU_0=injected", "NVIDIA_GPU_1=injected"},
780+
},
781+
{
782+
name: "amd device injection",
783+
specs: map[string]string{"amd.yaml": amdSpec},
784+
gpuFlags: []string{"--gpus", "2"},
785+
expectedEnvs: []string{"AMD_GPU_0=injected", "AMD_GPU_1=injected"},
786+
},
787+
{
788+
name: "multiple vendors",
789+
specs: map[string]string{"nvidia.yaml": nvidiaSpec, "amd.yaml": amdSpec},
790+
gpuFlags: []string{"--gpus", "1"},
791+
expectedEnvs: []string{"NVIDIA_GPU_0=injected"},
792+
},
793+
{
794+
name: "unknown vendor fails",
795+
specs: map[string]string{"unknown.yaml": unknownSpec},
796+
gpuFlags: []string{"--gpus", "1"},
797+
expectFail: true,
798+
},
799+
}
800+
801+
for _, tc := range testCases {
802+
t.Run(tc.name, func(t *testing.T) {
803+
t.Parallel()
804+
tmpDir := t.TempDir()
805+
for fileName, spec := range tc.specs {
806+
writeTestCDISpec(t, spec, fileName, tmpDir)
807+
}
808+
809+
base := testutil.NewBase(t)
810+
args := []string{"--cdi-spec-dirs", tmpDir, "run", "--rm"}
811+
args = append(args, tc.gpuFlags...)
812+
args = append(args, testutil.AlpineImage, "env")
813+
814+
if tc.expectFail {
815+
base.Cmd(args...).AssertFail()
816+
} else {
817+
base.Cmd(args...).AssertOutWithFunc(func(stdout string) error {
818+
for _, expectedEnv := range tc.expectedEnvs {
819+
if !strings.Contains(stdout, expectedEnv) {
820+
return fmt.Errorf("%s not found", expectedEnv)
821+
}
822+
}
823+
return nil
824+
})
825+
}
826+
})
827+
}
828+
}
829+
830+
// TestRunGPUWithOtherCDIDevices tests GPU CDI injection along with other CDI devices.
831+
func TestRunGPUWithOtherCDIDevices(t *testing.T) {
832+
t.Parallel()
833+
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
834+
testutil.DockerIncompatible(t)
835+
const amdSpec = `
836+
cdiVersion: "0.5.0"
837+
kind: "amd.com/gpu"
838+
devices:
839+
- name: "0"
840+
containerEdits:
841+
env:
842+
- AMD_GPU_0=injected
843+
- name: "1"
844+
containerEdits:
845+
env:
846+
- AMD_GPU_1=injected
847+
`
848+
const vendor1Spec = `
711849
cdiVersion: "0.3.0"
712850
kind: "vendor1.com/device"
713851
devices:
@@ -716,10 +854,33 @@ devices:
716854
env:
717855
- FOO=injected
718856
`
857+
tmpDir := t.TempDir()
858+
writeTestCDISpec(t, amdSpec, "amd.yaml", tmpDir)
859+
writeTestCDISpec(t, vendor1Spec, "vendor1.yaml", tmpDir)
860+
861+
base := testutil.NewBase(t)
862+
base.Cmd("--cdi-spec-dirs", tmpDir, "run", "--rm",
863+
"--gpus", "2",
864+
"--device", "vendor1.com/device=foo",
865+
testutil.AlpineImage, "env",
866+
).AssertOutWithFunc(func(stdout string) error {
867+
if !strings.Contains(stdout, "AMD_GPU_0=injected") {
868+
return errors.New("AMD_GPU_0=injected not found")
869+
}
870+
if !strings.Contains(stdout, "AMD_GPU_1=injected") {
871+
return errors.New("AMD_GPU_1=injected not found")
872+
}
873+
if !strings.Contains(stdout, "FOO=injected") {
874+
return errors.New("FOO=injected not found")
875+
}
876+
return nil
877+
})
878+
}
719879

880+
func writeTestCDISpec(t *testing.T, spec string, fileName string, cdiSpecDir string) {
720881
err := os.MkdirAll(cdiSpecDir, 0700)
721882
assert.NilError(t, err)
722-
cdiSpecPath := filepath.Join(cdiSpecDir, "vendor1.yaml")
723-
err = os.WriteFile(cdiSpecPath, []byte(testCDIVendor1), 0400)
883+
cdiSpecPath := filepath.Join(cdiSpecDir, fileName)
884+
err = os.WriteFile(cdiSpecPath, []byte(spec), 0400)
724885
assert.NilError(t, err)
725886
}

docs/gpu.md

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,32 +7,40 @@
77
> The description in this section applies to nerdctl v2.3 or later.
88
> Users of prior releases of nerdctl should refer to <https://github.com/containerd/nerdctl/blob/v2.2.0/docs/gpu.md>
99
10-
nerdctl provides docker-compatible NVIDIA GPU support.
10+
nerdctl provides docker-compatible NVIDIA and AMD GPU support.
1111

1212
## Prerequisites
1313

14-
- NVIDIA Drivers
15-
- Same requirement as when you use GPUs on Docker. For details, please refer to [the doc by NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites).
16-
- The NVIDIA Container Toolkit
17-
- containerd relies on the NVIDIA Container Toolkit to make GPUs usable inside a container. You can install the NVIDIA Container Toolkit by following the [official installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
14+
- GPU Drivers
15+
- Same requirement as when you use GPUs on Docker. For details, please refer to these docs by [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/quick-start-guide.html#step-2-install-the-amdgpu-driver).
16+
- Container Toolkit
17+
- containerd relies on vendor Container Toolkits to make GPUs available to the containers. You can install those by following the official installation instructions from [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/quick-start-guide.html).
18+
- CDI Specification
19+
- Container Device Interface (CDI) specification for the GPU devices is required for the GPU support to work. Follow the official documentation from [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/cdi-guide.html) to ensure that the required CDI specifications are present on the system.
1820

1921
## Options for `nerdctl run --gpus`
2022

2123
`nerdctl run --gpus` is compatible to [`docker run --gpus`](https://docs.docker.com/engine/reference/commandline/run/#access-an-nvidia-gpu).
2224

2325
You can specify number of GPUs to use via `--gpus` option.
24-
The following example exposes all available GPUs.
26+
The following examples expose all available GPUs to the container.
2527

2628
```
2729
nerdctl run -it --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
2830
```
2931

32+
or
33+
34+
```
35+
nerdctl run -it --rm --gpus=all rocm/rocm-terminal rocm-smi
36+
```
37+
3038
You can also pass detailed configuration to `--gpus` option as a list of key-value pairs. The following options are provided.
3139

3240
- `count`: number of GPUs to use. `all` exposes all available GPUs.
33-
- `device`: IDs of GPUs to use. UUID or numbers of GPUs can be specified.
41+
- `device`: IDs of GPUs to use. UUID or numbers of GPUs can be specified. This only works for NVIDIA GPUs.
3442

35-
The following example exposes a specific GPU to the container.
43+
The following example exposes a specific NVIDIA GPU to the container.
3644

3745
```
3846
nerdctl run -it --rm --gpus 'device=GPU-3a23c669-1f69-c64e-cf85-44e9b07e7a2a' nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
@@ -72,17 +80,17 @@ services:
7280

7381
### `nerdctl run --gpus` fails due to an unresolvable CDI device
7482

75-
If the required CDI specifications for NVIDIA devices are not available on the
83+
If the required CDI specifications for your GPU devices are not available on the
7684
system, the `nerdctl run` command will fail with an error similar to: `CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all` (the
77-
exact error message will depend on the device(s) requested).
85+
exact error message will depend on the vendor and the device(s) requested).
7886

7987
This should be the same error message that is reported when the `--device` flag
8088
is used to request a CDI device:
8189
```
8290
nerdctl run --device=nvidia.com/gpu=all
8391
```
8492

85-
Ensure that the NVIDIA Container Toolkit (>= v1.18.0 is recommended) is installed and the requested CDI devices are present in the ouptut of `nvidia-ctk cdi list`:
93+
Ensure that the NVIDIA (or AMD) Container Toolkit is installed and the requested CDI devices are present in the output of `nvidia-ctk cdi list` (or `amd-ctk cdi list` for AMD GPUs):
8694

8795
```
8896
$ nvidia-ctk cdi list
@@ -92,7 +100,9 @@ nvidia.com/gpu=GPU-3eb87630-93d5-b2b6-b8ff-9b359caf4ee2
92100
nvidia.com/gpu=all
93101
```
94102

95-
See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.
103+
For NVIDIA Container Toolkit, version >= v1.18.0 is recommended. See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.
104+
105+
For AMD Container Toolkit, version >= v1.2.0 is recommended. See the AMD Container Toolkit [CDI documentation](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/cdi-guide.html) for more information.
96106

97107

98108
### `nerdctl run --gpus` fails when using the Nvidia gpu-operator

pkg/cmd/container/create.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,14 @@ func Create(ctx context.Context, client *containerd.Client, args []string, netMa
129129
}
130130
opts = append(opts, platformOpts...)
131131

132-
opts = append(opts, withCDIDevices(options.GOptions.CDISpecDirs, options.CDIDevices...))
132+
if len(options.CDIDevices) > 0 || len(options.GPUs) > 0 {
133+
opts = append(opts, withStaticCDIRegistry(options.GOptions.CDISpecDirs))
134+
}
135+
136+
opts = append(opts,
137+
withGPUs(options.GPUs...),
138+
withCDIDevices(options.CDIDevices...),
139+
)
133140

134141
if _, err := referenceutil.Parse(args[0]); errors.Is(err, referenceutil.ErrLoadOCIArchiveRequired) {
135142
imageRef := args[0]

pkg/cmd/container/run_cdi.go

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,68 @@ import (
2424
"github.com/containerd/containerd/v2/core/containers"
2525
cdispec "github.com/containerd/containerd/v2/pkg/cdi"
2626
"github.com/containerd/containerd/v2/pkg/oci"
27+
"github.com/containerd/log"
2728
)
2829

29-
// withCDIDevices creates the OCI runtime spec options for injecting CDI devices.
30-
// Two options are returned: The first ensures that the CDI registry is initialized with
31-
// refresh disabled, and the second injects the devices into the container.
32-
func withCDIDevices(cdiSpecDirs []string, devices ...string) oci.SpecOpts {
33-
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
34-
if len(devices) == 0 {
35-
return nil
30+
// detectGPUVendorFromCDI detects the first available GPU vendor from CDI cache.
31+
// Returns empty string if no known vendor is found.
32+
func detectGPUVendorFromCDI() string {
33+
cache := cdi.GetDefaultCache()
34+
availableVendors := cache.ListVendors()
35+
knownGPUVendors := []string{"nvidia.com", "amd.com"}
36+
for _, known := range knownGPUVendors {
37+
for _, available := range availableVendors {
38+
if known == available {
39+
return known
40+
}
3641
}
42+
}
3743

38-
// We configure the CDI registry with the configured spec dirs and disable refresh.
39-
cdi.Configure(
44+
return ""
45+
}
46+
47+
// withStaticCDIRegistry inits the CDI registry with given spec dirs
48+
// and disables auto-refresh.
49+
func withStaticCDIRegistry(cdiSpecDirs []string) oci.SpecOpts {
50+
return func(ctx context.Context, _ oci.Client, _ *containers.Container, _ *oci.Spec) error {
51+
_ = cdi.Configure(
4052
cdi.WithSpecDirs(cdiSpecDirs...),
4153
cdi.WithAutoRefresh(false),
4254
)
55+
if err := cdi.Refresh(); err != nil {
56+
// We don't consider registry refresh failure a fatal error.
57+
// For instance, a dynamically generated invalid CDI Spec file for
58+
// any particular vendor shouldn't prevent injection of devices of
59+
// different vendors. CDI itself knows better and it will fail the
60+
// injection if necessary.
61+
log.L.Warnf("CDI cache refresh failed: %v", err)
62+
}
63+
return nil
64+
}
65+
}
4366

67+
// withCDIDevices creates the OCI runtime spec options for injecting CDI devices.
68+
func withCDIDevices(devices ...string) oci.SpecOpts {
69+
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
70+
if len(devices) == 0 {
71+
return nil
72+
}
4473
return cdispec.WithCDIDevices(devices...)(ctx, client, c, s)
4574
}
4675
}
76+
77+
// withGPUs creates the OCI runtime spec options for injecting GPUs via CDI.
78+
// It parses the given GPU options and converts them to CDI device IDs.
79+
// withCDIDevices is then used to perform the actual injection.
80+
func withGPUs(gpuOpts ...string) oci.SpecOpts {
81+
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
82+
if len(gpuOpts) == 0 {
83+
return nil
84+
}
85+
cdiDevices, err := parseGPUOpts(gpuOpts)
86+
if err != nil {
87+
return err
88+
}
89+
return withCDIDevices(cdiDevices...)(ctx, client, c, s)
90+
}
91+
}

0 commit comments

Comments
 (0)