Skip to content

Commit 364e97f

Browse files
authored
Merge pull request #4728 from shiv-tyagi/detect-vendor-id
Detect vendor before crafting cdiDeviceIDs for --gpus
2 parents 2f107d5 + 4649baa commit 364e97f

File tree

6 files changed

+295
-67
lines changed

6 files changed

+295
-67
lines changed

cmd/nerdctl/container/container_run_linux_test.go

Lines changed: 167 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,16 @@ func TestRunDeviceCDI(t *testing.T) {
674674
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
675675
testutil.DockerIncompatible(t)
676676
cdiSpecDir := filepath.Join(t.TempDir(), "cdi")
677-
writeTestCDISpec(t, cdiSpecDir)
677+
const testCDIVendor1 = `
678+
cdiVersion: "0.3.0"
679+
kind: "vendor1.com/device"
680+
devices:
681+
- name: foo
682+
containerEdits:
683+
env:
684+
- FOO=injected
685+
`
686+
writeTestCDISpec(t, testCDIVendor1, "vendor1.yaml", cdiSpecDir)
678687

679688
base := testutil.NewBase(t)
680689
base.Cmd("--cdi-spec-dirs", cdiSpecDir, "run",
@@ -689,7 +698,16 @@ func TestRunDeviceCDIWithNerdctlConfig(t *testing.T) {
689698
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
690699
testutil.DockerIncompatible(t)
691700
cdiSpecDir := filepath.Join(t.TempDir(), "cdi")
692-
writeTestCDISpec(t, cdiSpecDir)
701+
const testCDIVendor1 = `
702+
cdiVersion: "0.3.0"
703+
kind: "vendor1.com/device"
704+
devices:
705+
- name: foo
706+
containerEdits:
707+
env:
708+
- FOO=injected
709+
`
710+
writeTestCDISpec(t, testCDIVendor1, "vendor1.yaml", cdiSpecDir)
693711

694712
tomlPath := filepath.Join(t.TempDir(), "nerdctl.toml")
695713
err := os.WriteFile(tomlPath, []byte(fmt.Sprintf(`
@@ -706,8 +724,128 @@ cdi_spec_dirs = ["%s"]
706724
).AssertOutContains("FOO=injected")
707725
}
708726

709-
func writeTestCDISpec(t *testing.T, cdiSpecDir string) {
710-
const testCDIVendor1 = `
727+
// TestRunGPU tests GPU injection using the --gpus flag.
728+
func TestRunGPU(t *testing.T) {
729+
t.Parallel()
730+
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
731+
testutil.DockerIncompatible(t)
732+
const nvidiaSpec = `
733+
cdiVersion: "0.5.0"
734+
kind: "nvidia.com/gpu"
735+
devices:
736+
- name: "0"
737+
containerEdits:
738+
env:
739+
- NVIDIA_GPU_0=injected
740+
- name: "1"
741+
containerEdits:
742+
env:
743+
- NVIDIA_GPU_1=injected
744+
`
745+
const amdSpec = `
746+
cdiVersion: "0.5.0"
747+
kind: "amd.com/gpu"
748+
devices:
749+
- name: "0"
750+
containerEdits:
751+
env:
752+
- AMD_GPU_0=injected
753+
- name: "1"
754+
containerEdits:
755+
env:
756+
- AMD_GPU_1=injected
757+
`
758+
const unknownSpec = `
759+
cdiVersion: "0.5.0"
760+
kind: "unknown.com/gpu"
761+
devices:
762+
- name: "0"
763+
containerEdits:
764+
env:
765+
- UNKNOWN_GPU_0=injected
766+
`
767+
768+
testCases := []struct {
769+
name string
770+
specs map[string]string
771+
gpuFlags []string
772+
expectedEnvs []string
773+
expectFail bool
774+
}{
775+
{
776+
name: "nvidia device injection",
777+
specs: map[string]string{"nvidia.yaml": nvidiaSpec},
778+
gpuFlags: []string{"--gpus", "2"},
779+
expectedEnvs: []string{"NVIDIA_GPU_0=injected", "NVIDIA_GPU_1=injected"},
780+
},
781+
{
782+
name: "amd device injection",
783+
specs: map[string]string{"amd.yaml": amdSpec},
784+
gpuFlags: []string{"--gpus", "2"},
785+
expectedEnvs: []string{"AMD_GPU_0=injected", "AMD_GPU_1=injected"},
786+
},
787+
{
788+
name: "multiple vendors",
789+
specs: map[string]string{"nvidia.yaml": nvidiaSpec, "amd.yaml": amdSpec},
790+
gpuFlags: []string{"--gpus", "1"},
791+
expectedEnvs: []string{"NVIDIA_GPU_0=injected"},
792+
},
793+
{
794+
name: "unknown vendor fails",
795+
specs: map[string]string{"unknown.yaml": unknownSpec},
796+
gpuFlags: []string{"--gpus", "1"},
797+
expectFail: true,
798+
},
799+
}
800+
801+
for _, tc := range testCases {
802+
t.Run(tc.name, func(t *testing.T) {
803+
t.Parallel()
804+
tmpDir := t.TempDir()
805+
for fileName, spec := range tc.specs {
806+
writeTestCDISpec(t, spec, fileName, tmpDir)
807+
}
808+
809+
base := testutil.NewBase(t)
810+
args := []string{"--cdi-spec-dirs", tmpDir, "run", "--rm"}
811+
args = append(args, tc.gpuFlags...)
812+
args = append(args, testutil.AlpineImage, "env")
813+
814+
if tc.expectFail {
815+
base.Cmd(args...).AssertFail()
816+
} else {
817+
base.Cmd(args...).AssertOutWithFunc(func(stdout string) error {
818+
for _, expectedEnv := range tc.expectedEnvs {
819+
if !strings.Contains(stdout, expectedEnv) {
820+
return fmt.Errorf("%s not found", expectedEnv)
821+
}
822+
}
823+
return nil
824+
})
825+
}
826+
})
827+
}
828+
}
829+
830+
// TestRunGPUWithOtherCDIDevices tests GPU CDI injection along with other CDI devices.
831+
func TestRunGPUWithOtherCDIDevices(t *testing.T) {
832+
t.Parallel()
833+
// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
834+
testutil.DockerIncompatible(t)
835+
const amdSpec = `
836+
cdiVersion: "0.5.0"
837+
kind: "amd.com/gpu"
838+
devices:
839+
- name: "0"
840+
containerEdits:
841+
env:
842+
- AMD_GPU_0=injected
843+
- name: "1"
844+
containerEdits:
845+
env:
846+
- AMD_GPU_1=injected
847+
`
848+
const vendor1Spec = `
711849
cdiVersion: "0.3.0"
712850
kind: "vendor1.com/device"
713851
devices:
@@ -716,10 +854,33 @@ devices:
716854
env:
717855
- FOO=injected
718856
`
857+
tmpDir := t.TempDir()
858+
writeTestCDISpec(t, amdSpec, "amd.yaml", tmpDir)
859+
writeTestCDISpec(t, vendor1Spec, "vendor1.yaml", tmpDir)
860+
861+
base := testutil.NewBase(t)
862+
base.Cmd("--cdi-spec-dirs", tmpDir, "run", "--rm",
863+
"--gpus", "2",
864+
"--device", "vendor1.com/device=foo",
865+
testutil.AlpineImage, "env",
866+
).AssertOutWithFunc(func(stdout string) error {
867+
if !strings.Contains(stdout, "AMD_GPU_0=injected") {
868+
return errors.New("AMD_GPU_0=injected not found")
869+
}
870+
if !strings.Contains(stdout, "AMD_GPU_1=injected") {
871+
return errors.New("AMD_GPU_1=injected not found")
872+
}
873+
if !strings.Contains(stdout, "FOO=injected") {
874+
return errors.New("FOO=injected not found")
875+
}
876+
return nil
877+
})
878+
}
719879

880+
func writeTestCDISpec(t *testing.T, spec string, fileName string, cdiSpecDir string) {
720881
err := os.MkdirAll(cdiSpecDir, 0700)
721882
assert.NilError(t, err)
722-
cdiSpecPath := filepath.Join(cdiSpecDir, "vendor1.yaml")
723-
err = os.WriteFile(cdiSpecPath, []byte(testCDIVendor1), 0400)
883+
cdiSpecPath := filepath.Join(cdiSpecDir, fileName)
884+
err = os.WriteFile(cdiSpecPath, []byte(spec), 0400)
724885
assert.NilError(t, err)
725886
}

docs/gpu.md

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,32 +7,40 @@
77
> The description in this section applies to nerdctl v2.3 or later.
88
> Users of prior releases of nerdctl should refer to <https://github.com/containerd/nerdctl/blob/v2.2.0/docs/gpu.md>
99
10-
nerdctl provides docker-compatible NVIDIA GPU support.
10+
nerdctl provides docker-compatible NVIDIA and AMD GPU support.
1111

1212
## Prerequisites
1313

14-
- NVIDIA Drivers
15-
- Same requirement as when you use GPUs on Docker. For details, please refer to [the doc by NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites).
16-
- The NVIDIA Container Toolkit
17-
- containerd relies on the NVIDIA Container Toolkit to make GPUs usable inside a container. You can install the NVIDIA Container Toolkit by following the [official installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
14+
- GPU Drivers
15+
- Same requirement as when you use GPUs on Docker. For details, please refer to these docs by [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/quick-start-guide.html#step-2-install-the-amdgpu-driver).
16+
- Container Toolkit
17+
- containerd relies on vendor Container Toolkits to make GPUs available to the containers. You can install those by following the official installation instructions from [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/quick-start-guide.html).
18+
- CDI Specification
19+
- Container Device Interface (CDI) specification for the GPU devices is required for the GPU support to work. Follow the official documentation from [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/cdi-guide.html) to ensure that the required CDI specifications are present on the system.
1820

1921
## Options for `nerdctl run --gpus`
2022

2123
`nerdctl run --gpus` is compatible to [`docker run --gpus`](https://docs.docker.com/engine/reference/commandline/run/#access-an-nvidia-gpu).
2224

2325
You can specify number of GPUs to use via `--gpus` option.
24-
The following example exposes all available GPUs.
26+
The following examples expose all available GPUs to the container.
2527

2628
```
2729
nerdctl run -it --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
2830
```
2931

32+
or
33+
34+
```
35+
nerdctl run -it --rm --gpus=all rocm/rocm-terminal rocm-smi
36+
```
37+
3038
You can also pass detailed configuration to `--gpus` option as a list of key-value pairs. The following options are provided.
3139

3240
- `count`: number of GPUs to use. `all` exposes all available GPUs.
33-
- `device`: IDs of GPUs to use. UUID or numbers of GPUs can be specified.
41+
- `device`: IDs of GPUs to use. UUID or numbers of GPUs can be specified. This only works for NVIDIA GPUs.
3442

35-
The following example exposes a specific GPU to the container.
43+
The following example exposes a specific NVIDIA GPU to the container.
3644

3745
```
3846
nerdctl run -it --rm --gpus 'device=GPU-3a23c669-1f69-c64e-cf85-44e9b07e7a2a' nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
@@ -72,17 +80,17 @@ services:
7280

7381
### `nerdctl run --gpus` fails due to an unresolvable CDI device
7482

75-
If the required CDI specifications for NVIDIA devices are not available on the
83+
If the required CDI specifications for your GPU devices are not available on the
7684
system, the `nerdctl run` command will fail with an error similar to: `CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all` (the
77-
exact error message will depend on the device(s) requested).
85+
exact error message will depend on the vendor and the device(s) requested).
7886

7987
This should be the same error message that is reported when the `--device` flag
8088
is used to request a CDI device:
8189
```
8290
nerdctl run --device=nvidia.com/gpu=all
8391
```
8492

85-
Ensure that the NVIDIA Container Toolkit (>= v1.18.0 is recommended) is installed and the requested CDI devices are present in the ouptut of `nvidia-ctk cdi list`:
93+
Ensure that the NVIDIA (or AMD) Container Toolkit is installed and the requested CDI devices are present in the output of `nvidia-ctk cdi list` (or `amd-ctk cdi list` for AMD GPUs):
8694

8795
```
8896
$ nvidia-ctk cdi list
@@ -92,7 +100,9 @@ nvidia.com/gpu=GPU-3eb87630-93d5-b2b6-b8ff-9b359caf4ee2
92100
nvidia.com/gpu=all
93101
```
94102

95-
See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.
103+
For NVIDIA Container Toolkit, version >= v1.18.0 is recommended. See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.
104+
105+
For AMD Container Toolkit, version >= v1.2.0 is recommended. See the AMD Container Toolkit [CDI documentation](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/cdi-guide.html) for more information.
96106

97107

98108
### `nerdctl run --gpus` fails when using the Nvidia gpu-operator

pkg/cmd/container/create.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,14 @@ func Create(ctx context.Context, client *containerd.Client, args []string, netMa
129129
}
130130
opts = append(opts, platformOpts...)
131131

132-
opts = append(opts, withCDIDevices(options.GOptions.CDISpecDirs, options.CDIDevices...))
132+
if len(options.CDIDevices) > 0 || len(options.GPUs) > 0 {
133+
opts = append(opts, withStaticCDIRegistry(options.GOptions.CDISpecDirs))
134+
}
135+
136+
opts = append(opts,
137+
withGPUs(options.GPUs...),
138+
withCDIDevices(options.CDIDevices...),
139+
)
133140

134141
if _, err := referenceutil.Parse(args[0]); errors.Is(err, referenceutil.ErrLoadOCIArchiveRequired) {
135142
imageRef := args[0]

pkg/cmd/container/run_cdi.go

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,68 @@ import (
2424
"github.com/containerd/containerd/v2/core/containers"
2525
cdispec "github.com/containerd/containerd/v2/pkg/cdi"
2626
"github.com/containerd/containerd/v2/pkg/oci"
27+
"github.com/containerd/log"
2728
)
2829

29-
// withCDIDevices creates the OCI runtime spec options for injecting CDI devices.
30-
// Two options are returned: The first ensures that the CDI registry is initialized with
31-
// refresh disabled, and the second injects the devices into the container.
32-
func withCDIDevices(cdiSpecDirs []string, devices ...string) oci.SpecOpts {
33-
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
34-
if len(devices) == 0 {
35-
return nil
30+
// detectGPUVendorFromCDI detects the first available GPU vendor from CDI cache.
31+
// Returns empty string if no known vendor is found.
32+
func detectGPUVendorFromCDI() string {
33+
cache := cdi.GetDefaultCache()
34+
availableVendors := cache.ListVendors()
35+
knownGPUVendors := []string{"nvidia.com", "amd.com"}
36+
for _, known := range knownGPUVendors {
37+
for _, available := range availableVendors {
38+
if known == available {
39+
return known
40+
}
3641
}
42+
}
3743

38-
// We configure the CDI registry with the configured spec dirs and disable refresh.
39-
cdi.Configure(
44+
return ""
45+
}
46+
47+
// withStaticCDIRegistry inits the CDI registry with given spec dirs
48+
// and disables auto-refresh.
49+
func withStaticCDIRegistry(cdiSpecDirs []string) oci.SpecOpts {
50+
return func(ctx context.Context, _ oci.Client, _ *containers.Container, _ *oci.Spec) error {
51+
_ = cdi.Configure(
4052
cdi.WithSpecDirs(cdiSpecDirs...),
4153
cdi.WithAutoRefresh(false),
4254
)
55+
if err := cdi.Refresh(); err != nil {
56+
// We don't consider registry refresh failure a fatal error.
57+
// For instance, a dynamically generated invalid CDI Spec file for
58+
// any particular vendor shouldn't prevent injection of devices of
59+
// different vendors. CDI itself knows better and it will fail the
60+
// injection if necessary.
61+
log.L.Warnf("CDI cache refresh failed: %v", err)
62+
}
63+
return nil
64+
}
65+
}
4366

67+
// withCDIDevices creates the OCI runtime spec options for injecting CDI devices.
68+
func withCDIDevices(devices ...string) oci.SpecOpts {
69+
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
70+
if len(devices) == 0 {
71+
return nil
72+
}
4473
return cdispec.WithCDIDevices(devices...)(ctx, client, c, s)
4574
}
4675
}
76+
77+
// withGPUs creates the OCI runtime spec options for injecting GPUs via CDI.
78+
// It parses the given GPU options and converts them to CDI device IDs.
79+
// withCDIDevices is then used to perform the actual injection.
80+
func withGPUs(gpuOpts ...string) oci.SpecOpts {
81+
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
82+
if len(gpuOpts) == 0 {
83+
return nil
84+
}
85+
cdiDevices, err := parseGPUOpts(gpuOpts)
86+
if err != nil {
87+
return err
88+
}
89+
return withCDIDevices(cdiDevices...)(ctx, client, c, s)
90+
}
91+
}

0 commit comments

Comments
 (0)