diff --git a/packages/nvidia-k8s-device-plugin/0001-Update-MPS-roots-for-immutable-host-OS.patch b/packages/nvidia-k8s-device-plugin/0001-Update-MPS-roots-for-immutable-host-OS.patch new file mode 100644 index 000000000..2c0b98ec2 --- /dev/null +++ b/packages/nvidia-k8s-device-plugin/0001-Update-MPS-roots-for-immutable-host-OS.patch @@ -0,0 +1,68 @@ +From efde4273b89c623e036109bfb949926e7fc89b50 Mon Sep 17 00:00:00 2001 +From: Matthew Yeazel +Date: Wed, 31 Dec 2025 19:46:25 +0000 +Subject: [PATCH] Update MPS roots for immutable host OS + +The code assumes it's in the container for the device plugin. The paths +it creates for tracking state and shared memory won't work on +Bottlerocket since the root filesystem is immutable. Move these paths to +/run to allow the files to be created. + +Signed-off-by: Matthew Yeazel +--- + cmd/mps-control-daemon/main.go | 4 ++-- + cmd/mps-control-daemon/mount/mount-shm.go | 2 +- + cmd/mps-control-daemon/mps/root.go | 2 +- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/cmd/mps-control-daemon/main.go b/cmd/mps-control-daemon/main.go +index bb5502f..ec296a4 100644 +--- a/cmd/mps-control-daemon/main.go ++++ b/cmd/mps-control-daemon/main.go +@@ -214,7 +214,7 @@ func startDaemons(c *cli.Context, cfg *Config) ([]*mps.Daemon, bool, error) { + return mpsDaemons, true, nil + } + } +- readyFile, err := os.Create("/mps/.ready") ++ readyFile, err := os.Create("/run/mps/.ready") + if err != nil { + return mpsDaemons, true, fmt.Errorf("failed to create .ready file") + } +@@ -224,7 +224,7 @@ func startDaemons(c *cli.Context, cfg *Config) ([]*mps.Daemon, bool, error) { + } + + func stopDaemons(mpsDaemons ...*mps.Daemon) error { +- if err := os.Remove("/mps/.ready"); err != nil { ++ if err := os.Remove("/run/mps/.ready"); err != nil { + klog.Warningf("Failed to remove .ready file: %v", err) + } + klog.Info("Stopping MPS daemons.") +diff --git a/cmd/mps-control-daemon/mount/mount-shm.go b/cmd/mps-control-daemon/mount/mount-shm.go 
+index 83825e8..507a1e4 100644 +--- a/cmd/mps-control-daemon/mount/mount-shm.go ++++ b/cmd/mps-control-daemon/mount/mount-shm.go +@@ -49,7 +49,7 @@ func mountShm(c *cli.Context) error { + mounter := mount.New(mountExecutable) + + // TODO: /mps should be configurable. +- shmDir := "/mps/shm" ++ shmDir := "/run/mps/shm" + err = mount.CleanupMountPoint(shmDir, mounter, true) + if err != nil { + return fmt.Errorf("error unmounting %v: %w", shmDir, err) +diff --git a/cmd/mps-control-daemon/mps/root.go b/cmd/mps-control-daemon/mps/root.go +index 90655d1..805e58d 100644 +--- a/cmd/mps-control-daemon/mps/root.go ++++ b/cmd/mps-control-daemon/mps/root.go +@@ -23,7 +23,7 @@ import ( + ) + + const ( +- ContainerRoot = Root("/mps") ++ ContainerRoot = Root("/run/mps") + ) + + // Root represents an MPS root. +-- +2.52.0 + diff --git a/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin-conf b/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin-conf index 519a735a8..e5f4b9206 100644 --- a/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin-conf +++ b/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin-conf @@ -16,6 +16,11 @@ flags: {{/if}} failOnInitError: true nvidiaDriverRoot: "/" +{{#if settings.kubelet-device-plugins.nvidia.device-sharing-strategy}} +{{#if (eq settings.kubelet-device-plugins.nvidia.device-sharing-strategy "mps")}} + mpsRoot: "/run/nvidia/mps" +{{/if}} +{{/if}} plugin: passDeviceSpecs: {{default true settings.kubelet-device-plugins.nvidia.pass-device-specs}} {{#if settings.kubelet-device-plugins.nvidia.device-list-strategy}} @@ -39,4 +44,12 @@ sharing: - name: "nvidia.com/gpu" replicas: {{default 2 settings.kubelet-device-plugins.nvidia.time-slicing.replicas}} {{/if}} +{{#if (eq settings.kubelet-device-plugins.nvidia.device-sharing-strategy "mps")}} +sharing: + mps: + renameByDefault: {{default true settings.kubelet-device-plugins.nvidia.mps.rename-by-default}} + resources: + - name: "nvidia.com/gpu" + replicas: {{default 2 
settings.kubelet-device-plugins.nvidia.mps.replicas}} +{{/if}} {{/if}} diff --git a/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin.spec b/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin.spec index 9e959f3f7..bd5b39a43 100644 --- a/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin.spec +++ b/packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin.spec @@ -17,6 +17,10 @@ Source1: nvidia-k8s-device-plugin.service Source2: nvidia-k8s-device-plugin-conf Source3: nvidia-k8s-device-plugin-exec-start-conf Source4: nvidia-k8s-device-plugin-mig-conf +Source5: nvidia-mps-control-daemon.service +Source6: nvidia-mps-control-daemon-exec-start-conf + +Patch0001: 0001-Update-MPS-roots-for-immutable-host-OS.patch BuildRequires: %{_cross_os}glibc-devel Requires: %{name}(binaries) @@ -54,34 +58,46 @@ export CGO_LDFLAGS="-Wl,-z,relro -Wl,--export-dynamic" export GOLDFLAGS="-compressdwarf=false -linkmode=external -extldflags '${CGO_LDFLAGS}'" go build -ldflags="${GOLDFLAGS}" -o nvidia-device-plugin ./cmd/nvidia-device-plugin/ +go build -ldflags="${GOLDFLAGS}" -o mps-control-daemon ./cmd/mps-control-daemon/ gofips build -ldflags="${GOLDFLAGS}" -o fips/nvidia-device-plugin ./cmd/nvidia-device-plugin/ +gofips build -ldflags="${GOLDFLAGS}" -o fips/mps-control-daemon ./cmd/mps-control-daemon/ %install install -d %{buildroot}%{_cross_bindir} install -p -m 0755 nvidia-device-plugin %{buildroot}%{_cross_bindir} +install -p -m 0755 mps-control-daemon %{buildroot}%{_cross_bindir} install -d %{buildroot}%{_cross_fips_bindir} install -p -m 0755 fips/nvidia-device-plugin %{buildroot}%{_cross_fips_bindir} +install -p -m 0755 fips/mps-control-daemon %{buildroot}%{_cross_fips_bindir} install -d %{buildroot}%{_cross_unitdir} install -p -m 0644 %{S:1} %{buildroot}%{_cross_unitdir} +install -p -m 0644 %{S:5} %{buildroot}%{_cross_unitdir} install -d %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d +install -d 
%{buildroot}%{_cross_unitdir}/nvidia-mps-control-daemon.service.d install -D -m 0644 %{S:2} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-conf install -D -m 0644 %{S:3} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-exec-start-conf install -D -m 0644 %{S:4} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-mig-conf +install -D -m 0644 %{S:6} %{buildroot}%{_cross_templatedir}/nvidia-mps-control-daemon-exec-start-conf %files %license LICENSE %{_cross_attribution_file} %{_cross_unitdir}/nvidia-k8s-device-plugin.service +%{_cross_unitdir}/nvidia-mps-control-daemon.service %dir %{_cross_unitdir}/nvidia-k8s-device-plugin.service.d +%dir %{_cross_unitdir}/nvidia-mps-control-daemon.service.d %{_cross_templatedir}/nvidia-k8s-device-plugin-conf %{_cross_templatedir}/nvidia-k8s-device-plugin-exec-start-conf %{_cross_templatedir}/nvidia-k8s-device-plugin-mig-conf +%{_cross_templatedir}/nvidia-mps-control-daemon-exec-start-conf %files bin %{_cross_bindir}/nvidia-device-plugin +%{_cross_bindir}/mps-control-daemon %files fips-bin %{_cross_fips_bindir}/nvidia-device-plugin +%{_cross_fips_bindir}/mps-control-daemon diff --git a/packages/nvidia-k8s-device-plugin/nvidia-mps-control-daemon-exec-start-conf b/packages/nvidia-k8s-device-plugin/nvidia-mps-control-daemon-exec-start-conf new file mode 100644 index 000000000..378f050c9 --- /dev/null +++ b/packages/nvidia-k8s-device-plugin/nvidia-mps-control-daemon-exec-start-conf @@ -0,0 +1,20 @@ +[required-extensions] +kubelet-device-plugins = "v1" ++++ +[Service] +ExecStart= +{{#if (eq settings.kubelet-device-plugins.nvidia.device-sharing-strategy "mps")}} +{{#if (eq settings.kubelet-device-plugins.nvidia.device-partitioning-strategy "mig")}} +# MPS and MIG are not supported at the same time +Type=oneshot +ExecStart=/bin/echo "MPS and MIG are not supported at the same time" +{{else}} +Type=simple +ExecStart=/usr/bin/mps-control-daemon --config-file /etc/nvidia-k8s-device-plugin/settings.yaml 
+RemainAfterExit=no +{{/if}} +{{else}} +Type=oneshot +ExecStart=/usr/bin/true +RemainAfterExit=yes +{{/if}} diff --git a/packages/nvidia-k8s-device-plugin/nvidia-mps-control-daemon.service b/packages/nvidia-k8s-device-plugin/nvidia-mps-control-daemon.service new file mode 100644 index 000000000..0c95a7c1d --- /dev/null +++ b/packages/nvidia-k8s-device-plugin/nvidia-mps-control-daemon.service @@ -0,0 +1,12 @@ +[Unit] +Description=NVIDIA MPS Control Daemon +Before=nvidia-k8s-device-plugin.service + +[Service] +ExecStart=/usr/bin/true +Restart=on-failure +RestartSec=2 +RemainAfterExit=true + +[Install] +RequiredBy=nvidia-k8s-device-plugin.service