Skip to content

Commit 5b81e30

Browse files
author
Evan Lezar
committed
Merge branch 'cherry-pick-for-v1.13.2' into 'release-1.13'
Cherry pick changes for 1.13.2 See merge request nvidia/container-toolkit/container-toolkit!407
2 parents f13d440 + a34b089 commit 5b81e30

File tree

240 files changed

+18212
-2858
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

240 files changed

+18212
-2858
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[submodule "third_party/libnvidia-container"]
22
path = third_party/libnvidia-container
33
url = https://gitlab.com/nvidia/container-toolkit/libnvidia-container.git
4-
branch = main
4+
branch = release-1.13
55
[submodule "third_party/nvidia-container-runtime"]
66
path = third_party/nvidia-container-runtime
77
url = https://gitlab.com/nvidia/container-toolkit/container-runtime.git

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
# NVIDIA Container Toolkit Changelog
22

3+
## v1.13.2
4+
5+
* Add `nvidia-container-runtime-hook.path` config option to specify NVIDIA Container Runtime Hook path explicitly.
6+
* Fix bug in creation of `/dev/char` symlinks by failing operation if kernel modules are not loaded.
7+
* Add option to load kernel modules when creating device nodes.
8+
* Add option to create device nodes when creating `/dev/char` symlinks.
9+
* Treat failures to open debug log files as non-fatal.
10+
* Bump CUDA base image version to 12.1.1.
11+
312
## v1.13.1
413

514
* Update `update-ldcache` hook to only update ldcache if it exists.

cmd/nvidia-container-runtime/main_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func TestDuplicateHook(t *testing.T) {
172172
// addNVIDIAHook is a basic wrapper for an addHookModifier that is used for
173173
// testing.
174174
func addNVIDIAHook(spec *specs.Spec) error {
175-
m := modifier.NewStableRuntimeModifier(logrus.StandardLogger())
175+
m := modifier.NewStableRuntimeModifier(logrus.StandardLogger(), nvidiaHook)
176176
return m.Modify(spec)
177177
}
178178

cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,23 @@ func newAllPossible(logger *logrus.Logger, driverRoot string) (nodeLister, error
4040
if err != nil {
4141
return nil, fmt.Errorf("failed reading device majors: %v", err)
4242
}
43+
44+
var requiredMajors []devices.Name
4345
migCaps, err := nvcaps.NewMigCaps()
4446
if err != nil {
4547
return nil, fmt.Errorf("failed to read MIG caps: %v", err)
4648
}
4749
if migCaps == nil {
4850
migCaps = make(nvcaps.MigCaps)
51+
} else {
52+
requiredMajors = append(requiredMajors, devices.NVIDIACaps)
53+
}
54+
55+
requiredMajors = append(requiredMajors, devices.NVIDIAGPU, devices.NVIDIAUVM)
56+
for _, name := range requiredMajors {
57+
if !deviceMajors.Exists(name) {
58+
return nil, fmt.Errorf("missing required device major %s", name)
59+
}
4960
}
5061

5162
l := allPossible{

cmd/nvidia-ctk/system/create-dev-char-symlinks/create-dev-char-symlinks.go

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"strings"
2525
"syscall"
2626

27+
"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
2728
"github.com/fsnotify/fsnotify"
2829
"github.com/sirupsen/logrus"
2930
"github.com/urfave/cli/v2"
@@ -38,11 +39,13 @@ type command struct {
3839
}
3940

4041
type config struct {
41-
devCharPath string
42-
driverRoot string
43-
dryRun bool
44-
watch bool
45-
createAll bool
42+
devCharPath string
43+
driverRoot string
44+
dryRun bool
45+
watch bool
46+
createAll bool
47+
createDeviceNodes bool
48+
loadKernelModules bool
4649
}
4750

4851
// NewCommand constructs a command sub-command with the specified logger
@@ -97,6 +100,18 @@ func (m command) build() *cli.Command {
97100
Destination: &cfg.createAll,
98101
EnvVars: []string{"CREATE_ALL"},
99102
},
103+
&cli.BoolFlag{
104+
Name: "load-kernel-modules",
105+
Usage: "Load the NVIDIA kernel modules before creating symlinks. This is only applicable when --create-all is set.",
106+
Destination: &cfg.loadKernelModules,
107+
EnvVars: []string{"LOAD_KERNEL_MODULES"},
108+
},
109+
&cli.BoolFlag{
110+
Name: "create-device-nodes",
111+
Usage: "Create the NVIDIA control device nodes in the driver root if they do not exist. This is only applicable when --create-all is set",
112+
Destination: &cfg.createDeviceNodes,
113+
EnvVars: []string{"CREATE_DEVICE_NODES"},
114+
},
100115
&cli.BoolFlag{
101116
Name: "dry-run",
102117
Usage: "If set, the command will not create any symlinks.",
@@ -114,6 +129,16 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error {
114129
return fmt.Errorf("create-all and watch are mutually exclusive")
115130
}
116131

132+
if cfg.loadKernelModules && !cfg.createAll {
133+
m.logger.Warn("load-kernel-modules is only applicable when create-all is set; ignoring")
134+
cfg.loadKernelModules = false
135+
}
136+
137+
if cfg.createDeviceNodes && !cfg.createAll {
138+
m.logger.Warn("create-device-nodes is only applicable when create-all is set; ignoring")
139+
cfg.createDeviceNodes = false
140+
}
141+
117142
return nil
118143
}
119144

@@ -137,6 +162,8 @@ func (m command) run(c *cli.Context, cfg *config) error {
137162
WithDriverRoot(cfg.driverRoot),
138163
WithDryRun(cfg.dryRun),
139164
WithCreateAll(cfg.createAll),
165+
WithLoadKernelModules(cfg.loadKernelModules),
166+
WithCreateDeviceNodes(cfg.createDeviceNodes),
140167
)
141168
if err != nil {
142169
return fmt.Errorf("failed to create symlink creator: %v", err)
@@ -186,12 +213,14 @@ create:
186213
}
187214

188215
type linkCreator struct {
189-
logger *logrus.Logger
190-
lister nodeLister
191-
driverRoot string
192-
devCharPath string
193-
dryRun bool
194-
createAll bool
216+
logger *logrus.Logger
217+
lister nodeLister
218+
driverRoot string
219+
devCharPath string
220+
dryRun bool
221+
createAll bool
222+
createDeviceNodes bool
223+
loadKernelModules bool
195224
}
196225

197226
// Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char.
@@ -218,6 +247,10 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) {
218247
c.devCharPath = defaultDevCharPath
219248
}
220249

250+
if err := c.setup(); err != nil {
251+
return nil, err
252+
}
253+
221254
if c.createAll {
222255
lister, err := newAllPossible(c.logger, c.driverRoot)
223256
if err != nil {
@@ -230,6 +263,34 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) {
230263
return c, nil
231264
}
232265

266+
// setup performs the optional preparation steps selected by the
// loadKernelModules and createDeviceNodes flags. It returns nil without doing
// anything when neither flag is set. Otherwise it builds a system helper using
// the creator's logger and dry-run setting, loads the NVIDIA kernel modules if
// requested, and then creates the NVIDIA control device nodes, passing the
// driver root, if requested. The first failing step aborts setup and its error
// is returned.
//
// NOTE(review): NewSymlinkCreator calls setup before it checks createAll, so
// these steps are not gated on createAll here; the CLI's validateFlags clears
// both flags when create-all is unset.
func (m linkCreator) setup() error {
267+
// Nothing was requested; avoid constructing the system helper at all.
if !m.loadKernelModules && !m.createDeviceNodes {
268+
return nil
269+
}
270+
271+
s, err := system.New(
272+
system.WithLogger(m.logger),
273+
system.WithDryRun(m.dryRun),
274+
)
275+
if err != nil {
276+
return err
277+
}
278+
279+
// Kernel modules are loaded before any device nodes are created.
if m.loadKernelModules {
280+
if err := s.LoadNVIDIAKernelModules(); err != nil {
281+
return fmt.Errorf("failed to load NVIDIA kernel modules: %v", err)
282+
}
283+
}
284+
285+
if m.createDeviceNodes {
286+
if err := s.CreateNVIDIAControlDeviceNodesAt(m.driverRoot); err != nil {
287+
return fmt.Errorf("failed to create NVIDIA device nodes: %v", err)
288+
}
289+
}
290+
291+
return nil
292+
}
293+
233294
// WithDriverRoot sets the driver root path.
234295
func WithDriverRoot(root string) Option {
235296
return func(c *linkCreator) {
@@ -265,6 +326,20 @@ func WithCreateAll(createAll bool) Option {
265326
}
266327
}
267328

329+
// WithLoadKernelModules sets the loadKernelModules flag for the linkCreator.
// When the flag is set, the creator's setup step calls
// LoadNVIDIAKernelModules while the creator is being constructed.
330+
func WithLoadKernelModules(loadKernelModules bool) Option {
331+
return func(lc *linkCreator) {
332+
lc.loadKernelModules = loadKernelModules
333+
}
334+
}
335+
336+
// WithCreateDeviceNodes sets the createDeviceNodes flag for the linkCreator.
// When the flag is set, the creator's setup step calls
// CreateNVIDIAControlDeviceNodesAt with the driver root while the creator is
// being constructed.
337+
func WithCreateDeviceNodes(createDeviceNodes bool) Option {
338+
return func(lc *linkCreator) {
339+
lc.createDeviceNodes = createDeviceNodes
340+
}
341+
}
342+
268343
// CreateLinks creates symlinks for all NVIDIA device nodes found in the driver root.
269344
func (m linkCreator) CreateLinks() error {
270345
deviceNodes, err := m.lister.DeviceNodes()

cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ type options struct {
3434
dryRun bool
3535

3636
control bool
37+
38+
loadKernelModules bool
3739
}
3840

3941
// NewCommand constructs a command sub-command with the specified logger
@@ -72,6 +74,11 @@ func (m command) build() *cli.Command {
7274
Usage: "create all control device nodes: nvidiactl, nvidia-modeset, nvidia-uvm, nvidia-uvm-tools",
7375
Destination: &opts.control,
7476
},
77+
&cli.BoolFlag{
78+
Name: "load-kernel-modules",
79+
Usage: "load the NVIDIA Kernel Modules before creating devices nodes",
80+
Destination: &opts.loadKernelModules,
81+
},
7582
&cli.BoolFlag{
7683
Name: "dry-run",
7784
Usage: "if set, the command will not create any symlinks.",
@@ -92,6 +99,7 @@ func (m command) run(c *cli.Context, opts *options) error {
9299
s, err := system.New(
93100
system.WithLogger(m.logger),
94101
system.WithDryRun(opts.dryRun),
102+
system.WithLoadKernelModules(opts.loadKernelModules),
95103
)
96104
if err != nil {
97105
return fmt.Errorf("failed to create library: %v", err)

go.mod

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,36 @@
11
module github.com/NVIDIA/nvidia-container-toolkit
22

3-
go 1.18
3+
go 1.20
44

55
require (
6-
github.com/BurntSushi/toml v1.0.0
6+
github.com/BurntSushi/toml v1.2.1
77
github.com/NVIDIA/go-nvml v0.12.0-0
88
github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a
99
github.com/fsnotify/fsnotify v1.5.4
10-
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb
10+
github.com/opencontainers/runtime-spec v1.1.0-rc.2
1111
github.com/pelletier/go-toml v1.9.4
1212
github.com/sirupsen/logrus v1.9.0
13-
github.com/stretchr/testify v1.7.0
13+
github.com/stretchr/testify v1.8.1
1414
github.com/urfave/cli/v2 v2.3.0
1515
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438
1616
golang.org/x/mod v0.5.0
17-
golang.org/x/sys v0.0.0-20220927170352-d9d178bc13c6
18-
sigs.k8s.io/yaml v1.3.0
17+
golang.org/x/sys v0.7.0
1918
)
2019

2120
require (
22-
github.com/cpuguy83/go-md2man/v2 v2.0.1 // indirect
21+
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
2322
github.com/davecgh/go-spew v1.1.1 // indirect
2423
github.com/hashicorp/errwrap v1.1.0 // indirect
25-
github.com/kr/text v0.2.0 // indirect
26-
github.com/opencontainers/runc v1.1.4 // indirect
24+
github.com/kr/pretty v0.3.1 // indirect
25+
github.com/opencontainers/runc v1.1.6 // indirect
2726
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
28-
github.com/opencontainers/selinux v1.10.1 // indirect
27+
github.com/opencontainers/selinux v1.11.0 // indirect
2928
github.com/pmezard/go-difflib v1.0.0 // indirect
3029
github.com/russross/blackfriday/v2 v2.1.0 // indirect
3130
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
3231
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
3332
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
3433
gopkg.in/yaml.v2 v2.4.0 // indirect
3534
gopkg.in/yaml.v3 v3.0.1 // indirect
35+
sigs.k8s.io/yaml v1.3.0 // indirect
3636
)

0 commit comments

Comments
 (0)