Skip to content

Commit 39a4394

Browse files
committed
nvidia-k8s-device-plugin: add MPS control daemon support
Add support for the NVIDIA Multi-Process Service (MPS) control daemon, including service configuration and device plugin updates. Signed-off-by: Matthew Yeazel <yeazelm@amazon.com>
1 parent cd2087c commit 39a4394

File tree

5 files changed

+121
-0
lines changed

5 files changed

+121
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
From efde4273b89c623e036109bfb949926e7fc89b50 Mon Sep 17 00:00:00 2001
2+
From: Matthew Yeazel <yeazelm@amazon.com>
3+
Date: Wed, 31 Dec 2025 19:46:25 +0000
4+
Subject: [PATCH] Update MPS roots for immutable host OS
5+
6+
The code assumes it is running in the container for the device plugin. The paths
7+
it creates for tracking state and shared memory won't work on
8+
Bottlerocket since the root filesystem is immutable. Move these paths to
9+
/run to allow the files to be created.
10+
11+
Signed-off-by: Matthew Yeazel <yeazelm@amazon.com>
12+
---
13+
cmd/mps-control-daemon/main.go | 4 ++--
14+
cmd/mps-control-daemon/mount/mount-shm.go | 2 +-
15+
cmd/mps-control-daemon/mps/root.go | 2 +-
16+
3 files changed, 4 insertions(+), 4 deletions(-)
17+
18+
diff --git a/cmd/mps-control-daemon/main.go b/cmd/mps-control-daemon/main.go
19+
index bb5502f..ec296a4 100644
20+
--- a/cmd/mps-control-daemon/main.go
21+
+++ b/cmd/mps-control-daemon/main.go
22+
@@ -214,7 +214,7 @@ func startDaemons(c *cli.Context, cfg *Config) ([]*mps.Daemon, bool, error) {
23+
return mpsDaemons, true, nil
24+
}
25+
}
26+
- readyFile, err := os.Create("/mps/.ready")
27+
+ readyFile, err := os.Create("/run/mps/.ready")
28+
if err != nil {
29+
return mpsDaemons, true, fmt.Errorf("failed to create .ready file")
30+
}
31+
@@ -224,7 +224,7 @@ func startDaemons(c *cli.Context, cfg *Config) ([]*mps.Daemon, bool, error) {
32+
}
33+
34+
func stopDaemons(mpsDaemons ...*mps.Daemon) error {
35+
- if err := os.Remove("/mps/.ready"); err != nil {
36+
+ if err := os.Remove("/run/mps/.ready"); err != nil {
37+
klog.Warningf("Failed to remove .ready file: %v", err)
38+
}
39+
klog.Info("Stopping MPS daemons.")
40+
diff --git a/cmd/mps-control-daemon/mount/mount-shm.go b/cmd/mps-control-daemon/mount/mount-shm.go
41+
index 83825e8..507a1e4 100644
42+
--- a/cmd/mps-control-daemon/mount/mount-shm.go
43+
+++ b/cmd/mps-control-daemon/mount/mount-shm.go
44+
@@ -49,7 +49,7 @@ func mountShm(c *cli.Context) error {
45+
mounter := mount.New(mountExecutable)
46+
47+
// TODO: /mps should be configurable.
48+
- shmDir := "/mps/shm"
49+
+ shmDir := "/run/mps/shm"
50+
err = mount.CleanupMountPoint(shmDir, mounter, true)
51+
if err != nil {
52+
return fmt.Errorf("error unmounting %v: %w", shmDir, err)
53+
diff --git a/cmd/mps-control-daemon/mps/root.go b/cmd/mps-control-daemon/mps/root.go
54+
index 90655d1..805e58d 100644
55+
--- a/cmd/mps-control-daemon/mps/root.go
56+
+++ b/cmd/mps-control-daemon/mps/root.go
57+
@@ -23,7 +23,7 @@ import (
58+
)
59+
60+
const (
61+
- ContainerRoot = Root("/mps")
62+
+ ContainerRoot = Root("/run/mps")
63+
)
64+
65+
// Root represents an MPS root.
66+
--
67+
2.52.0
68+

packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin-conf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ flags:
1616
{{/if}}
1717
failOnInitError: true
1818
nvidiaDriverRoot: "/"
19+
{{#if settings.kubelet-device-plugins.nvidia.device-sharing-strategy}}
20+
{{#if (eq settings.kubelet-device-plugins.nvidia.device-sharing-strategy "mps")}}
21+
mpsRoot: "/run/nvidia/mps"
22+
{{/if}}
23+
{{/if}}
1924
plugin:
2025
passDeviceSpecs: {{default true settings.kubelet-device-plugins.nvidia.pass-device-specs}}
2126
{{#if settings.kubelet-device-plugins.nvidia.device-list-strategy}}
@@ -39,4 +44,12 @@ sharing:
3944
- name: "nvidia.com/gpu"
4045
replicas: {{default 2 settings.kubelet-device-plugins.nvidia.time-slicing.replicas}}
4146
{{/if}}
47+
{{#if (eq settings.kubelet-device-plugins.nvidia.device-sharing-strategy "mps")}}
48+
sharing:
49+
mps:
50+
renameByDefault: {{default true settings.kubelet-device-plugins.nvidia.mps.rename-by-default}}
51+
resources:
52+
- name: "nvidia.com/gpu"
53+
replicas: {{default 2 settings.kubelet-device-plugins.nvidia.mps.replicas}}
54+
{{/if}}
4255
{{/if}}

packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin.spec

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ Source1: nvidia-k8s-device-plugin.service
1717
Source2: nvidia-k8s-device-plugin-conf
1818
Source3: nvidia-k8s-device-plugin-exec-start-conf
1919
Source4: nvidia-k8s-device-plugin-mig-conf
20+
Source5: nvidia-mps-control-daemon.service
21+
Source6: nvidia-mps-control-daemon-exec-start-conf
22+
23+
Patch0001: 0001-Update-MPS-roots-for-immutable-host-OS.patch
2024

2125
BuildRequires: %{_cross_os}glibc-devel
2226
Requires: %{name}(binaries)
@@ -54,34 +58,46 @@ export CGO_LDFLAGS="-Wl,-z,relro -Wl,--export-dynamic"
5458
export GOLDFLAGS="-compressdwarf=false -linkmode=external -extldflags '${CGO_LDFLAGS}'"
5559

5660
go build -ldflags="${GOLDFLAGS}" -o nvidia-device-plugin ./cmd/nvidia-device-plugin/
61+
go build -ldflags="${GOLDFLAGS}" -o mps-control-daemon ./cmd/mps-control-daemon/
5762
gofips build -ldflags="${GOLDFLAGS}" -o fips/nvidia-device-plugin ./cmd/nvidia-device-plugin/
63+
gofips build -ldflags="${GOLDFLAGS}" -o fips/mps-control-daemon ./cmd/mps-control-daemon/
5864

5965
%install
6066
install -d %{buildroot}%{_cross_bindir}
6167
install -p -m 0755 nvidia-device-plugin %{buildroot}%{_cross_bindir}
68+
install -p -m 0755 mps-control-daemon %{buildroot}%{_cross_bindir}
6269

6370
install -d %{buildroot}%{_cross_fips_bindir}
6471
install -p -m 0755 fips/nvidia-device-plugin %{buildroot}%{_cross_fips_bindir}
72+
install -p -m 0755 fips/mps-control-daemon %{buildroot}%{_cross_fips_bindir}
6573

6674
install -d %{buildroot}%{_cross_unitdir}
6775
install -p -m 0644 %{S:1} %{buildroot}%{_cross_unitdir}
76+
install -p -m 0644 %{S:5} %{buildroot}%{_cross_unitdir}
6877
install -d %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
78+
install -d %{buildroot}%{_cross_unitdir}/nvidia-mps-control-daemon.service.d
6979
install -D -m 0644 %{S:2} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-conf
7080
install -D -m 0644 %{S:3} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-exec-start-conf
7181
install -D -m 0644 %{S:4} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-mig-conf
82+
install -D -m 0644 %{S:6} %{buildroot}%{_cross_templatedir}/nvidia-mps-control-daemon-exec-start-conf
7283

7384

7485
%files
7586
%license LICENSE
7687
%{_cross_attribution_file}
7788
%{_cross_unitdir}/nvidia-k8s-device-plugin.service
89+
%{_cross_unitdir}/nvidia-mps-control-daemon.service
7890
%dir %{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
91+
%dir %{_cross_unitdir}/nvidia-mps-control-daemon.service.d
7992
%{_cross_templatedir}/nvidia-k8s-device-plugin-conf
8093
%{_cross_templatedir}/nvidia-k8s-device-plugin-exec-start-conf
8194
%{_cross_templatedir}/nvidia-k8s-device-plugin-mig-conf
95+
%{_cross_templatedir}/nvidia-mps-control-daemon-exec-start-conf
8296

8397
%files bin
8498
%{_cross_bindir}/nvidia-device-plugin
99+
%{_cross_bindir}/mps-control-daemon
85100

86101
%files fips-bin
87102
%{_cross_fips_bindir}/nvidia-device-plugin
103+
%{_cross_fips_bindir}/mps-control-daemon
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[required-extensions]
2+
kubelet-device-plugins = "v1"
3+
+++
4+
[Service]
5+
{{#if (eq settings.kubelet-device-plugins.nvidia.device-sharing-strategy "mps")}}
6+
ExecStart=
7+
ExecStart=/usr/bin/mps-control-daemon --config-file /etc/nvidia-k8s-device-plugin/settings.yaml
8+
{{else}}
9+
ExecStart=
10+
ExecStart=/usr/bin/sleep infinity
11+
{{/if}}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[Unit]
2+
Description=NVIDIA MPS Control Daemon
3+
After=nvidia-k8s-device-plugin.service
4+
Requires=nvidia-k8s-device-plugin.service
5+
6+
[Service]
7+
Type=simple
8+
ExecStart=/bin/true
9+
Restart=on-failure
10+
RestartSec=2
11+
12+
[Install]
13+
WantedBy=multi-user.target

0 commit comments

Comments
 (0)