Skip to content

Commit 5c89203

Browse files
committed
kmod-6.12-nvidia-r580: Add grid-license-check
Add a unit that checks that the GRID license is valid. The NVIDIA k8s device plugin requires this unit, so if the license is not present, the node never offers GPU resources. This prevents a situation where a node could fail to get a license, join the cluster, and then later have workloads start to fail due to the unlicensed status.

Signed-off-by: Matthew Yeazel <yeazelm@amazon.com>
1 parent 1831f72 commit 5c89203

File tree

6 files changed

+74
-1
lines changed

6 files changed

+74
-1
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# grid-license-check.service
# One-shot probe that records whether the GRID driver currently holds a
# license. On success it creates the marker file /etc/drivers/.grid-licensed
# and stops grid-license-check.timer, which otherwise re-runs this unit.
[Unit]
2+
Description=GRID License Check
3+
RefuseManualStart=true
4+
RefuseManualStop=true
5+
After=nvidia-gridd.service
6+
Requires=nvidia-gridd.service
7+
8+
[Service]
9+
Type=oneshot
10+
# If the condition command exits 1-254 the remaining commands are skipped and
# the unit is not marked failed.
# NOTE(review): presumably ghostdog succeeds only when the GRID driver is the
# one in use - confirm against ghostdog's match-nvidia-driver implementation.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver grid
11+
# Empty the scratch file so grep below only sees this run's nvidia-smi output.
# GNU truncate refuses to run without --size/--reference; without "-s 0" this
# step always failed (silently, because of the "-" prefix) and the file grew
# on every timer tick. The "-" prefix is kept so this stays best-effort.
ExecStart=-/usr/bin/truncate -s 0 /tmp/.nvidia-gridd-license
12+
# stdout is redirected by StandardOutput= below, so the query output lands in
# the scratch file.
ExecStart=/usr/bin/nvidia-smi -q
13+
# A non-zero exit (no "Licensed" status line found) fails the unit here, so
# the marker is not created and the timer is left running.
ExecStart=/usr/bin/grep -q "License Status.*: Licensed" /tmp/.nvidia-gridd-license
14+
ExecStart=/usr/bin/touch /etc/drivers/.grid-licensed
15+
# Licensed: no further polling is needed. --no-block avoids waiting on the
# stop job from inside the unit the timer itself triggers.
ExecStart=/usr/bin/systemctl stop grid-license-check.timer --no-block
16+
# Applies to every command above; append mode keeps earlier commands' output
# within a single run.
StandardOutput=append:/tmp/.nvidia-gridd-license
17+
18+
[Install]
19+
WantedBy=nvidia-k8s-device-plugin.service
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# grid-license-check.timer
# Periodically activates grid-license-check.service.
[Unit]
2+
Description=GRID License Check Timer
3+
RefuseManualStart=true
4+
5+
[Timer]
6+
Unit=grid-license-check.service
7+
# First activation 5 seconds after boot.
OnBootSec=5s
8+
# Subsequent activations 2 seconds after the service was last activated.
OnUnitActiveSec=2s
9+
# Allow at most 1 second of scheduling slack for each activation.
AccuracySec=1s
10+
11+
[Install]
12+
WantedBy=timers.target
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Drop-in for nvidia-k8s-device-plugin.service: the pre-start check fails, and
# with it the service start, unless the marker file /etc/drivers/.grid-licensed
# exists.
[Service]
2+
ExecStartPre=/usr/bin/test -f /etc/drivers/.grid-licensed

packages/kmod-6.12-nvidia-r580/kmod-6.12-nvidia-r580.spec

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ Source206: nvidia-persistenced.service
5656
Source207: fabricmanager.env
5757
Source208: gridd.conf
5858
Source209: nvidia-gridd.service
59+
Source210: grid-license-check.service
60+
Source211: grid-license-check.timer
61+
Source212: open-gpu-license-fallback.service
62+
Source213: tesla-license-fallback.service
63+
Source214: grid-license-file-check.conf
5964

6065
# NVIDIA tesla conf files from 300 to 399
6166
Source300: nvidia-tesla-tmpfiles.conf
@@ -410,7 +415,9 @@ install kernel-open/nvidia-drm.ko %{buildroot}%{_cross_datadir}/nvidia/grid/driv
410415
# Install nvidia-gridd and related files
411416
install -m 755 nvidia-gridd %{buildroot}%{_cross_bindir}/nvidia-gridd
412417
install -m 644 %{S:208} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf
413-
install -p -m 0644 %{S:209} %{buildroot}%{_cross_unitdir}
418+
install -p -m 0644 %{S:209} %{S:210} %{S:211} %{S:212} %{S:213} %{buildroot}%{_cross_unitdir}
419+
install -d %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
420+
install -p -m 0644 %{S:214} %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
414421
popd
415422
# End GRID driver
416423
%endif
@@ -754,6 +761,11 @@ popd
754761
%{_cross_bindir}/nvidia-gridd
755762
%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf
756763
%{_cross_unitdir}/nvidia-gridd.service
764+
%{_cross_unitdir}/grid-license-check.service
765+
%{_cross_unitdir}/grid-license-check.timer
766+
%{_cross_unitdir}/open-gpu-license-fallback.service
767+
%{_cross_unitdir}/tesla-license-fallback.service
768+
%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d/grid-license-file-check.conf
757769

758770
%{_cross_datadir}/nvidia/grid/drivers/nvidia.ko
759771
%{_cross_datadir}/nvidia/grid/drivers/nvidia-uvm.ko
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# open-gpu-license-fallback.service
# Creates the /etc/drivers/.grid-licensed marker and stops the license polling
# timer without performing any license query.
# NOTE(review): presumably the open-gpu driver needs no GRID license, so the
# marker is created unconditionally - confirm.
[Unit]
2+
Description=Open GPU GRID License Check Fallback
3+
RefuseManualStart=true
4+
RefuseManualStop=true
5+
6+
[Service]
7+
Type=oneshot
8+
# If the condition command exits 1-254 the commands below are skipped and the
# unit is not marked failed.
# NOTE(review): presumably ghostdog succeeds only when the open-gpu driver is
# the one in use - confirm.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
9+
ExecStart=/usr/bin/touch /etc/drivers/.grid-licensed
10+
# --no-block: enqueue the stop job and return without waiting for it.
ExecStart=/usr/bin/systemctl stop grid-license-check.timer --no-block
11+
# Keep the unit "active" after the commands exit.
RemainAfterExit=true
12+
13+
[Install]
14+
WantedBy=nvidia-k8s-device-plugin.service
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# tesla-license-fallback.service
# Creates the /etc/drivers/.grid-licensed marker and stops the license polling
# timer without performing any license query.
# NOTE(review): presumably the tesla driver needs no GRID license, so the
# marker is created unconditionally - confirm.
[Unit]
2+
Description=Tesla GRID License Check Fallback
3+
RefuseManualStart=true
4+
RefuseManualStop=true
5+
6+
[Service]
7+
Type=oneshot
8+
# If the condition command exits 1-254 the commands below are skipped and the
# unit is not marked failed.
# NOTE(review): presumably ghostdog succeeds only when the tesla driver is the
# one in use - confirm.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
9+
ExecStart=/usr/bin/touch /etc/drivers/.grid-licensed
10+
# --no-block: enqueue the stop job and return without waiting for it.
ExecStart=/usr/bin/systemctl stop grid-license-check.timer --no-block
11+
# Keep the unit "active" after the commands exit.
RemainAfterExit=true
12+
13+
[Install]
14+
WantedBy=nvidia-k8s-device-plugin.service

0 commit comments

Comments
 (0)