Skip to content

Commit 8f80e75

Browse files
committed
kmod-6.1-nvidia-r580: Add grid-license-check
Add a unit that checks whether the GRID license is valid. The NVIDIA k8s device plugin requires this unit, so if the license is not present, the node never offers GPU resources. This prevents a situation where a node could fail to get a license, join the cluster, and then later have workloads start to fail due to the unlicensed status. Signed-off-by: Matthew Yeazel <yeazelm@amazon.com>
1 parent 5c89203 commit 8f80e75

File tree

6 files changed

+74
-1
lines changed

6 files changed

+74
-1
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Checks whether nvidia-smi reports a licensed GRID status and, if so, creates
# the marker file /etc/drivers/.grid-licensed and stops the polling timer.
[Unit]
2+
Description=GRID License Check
3+
RefuseManualStart=true
4+
RefuseManualStop=true
5+
# Order after nvidia-gridd.service and pull it in as a hard dependency.
After=nvidia-gridd.service
6+
Requires=nvidia-gridd.service
7+
8+
[Service]
9+
Type=oneshot
10+
# The ExecStart= commands below only run when this condition command succeeds.
# NOTE(review): presumably ghostdog reports whether the "grid" driver is the
# one in use - confirm against ghostdog's documentation.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver grid
11+
# Empty the capture file before each check so grep only sees fresh output.
# truncate needs an explicit size (-s); without it the command errors out, and
# the leading "-" would silently ignore that failure, leaving old output in
# place while StandardOutput=append: keeps growing the file.
ExecStart=-/usr/bin/truncate -s 0 /tmp/.nvidia-gridd-license
12+
# Output lands in the capture file through StandardOutput= below.
ExecStart=/usr/bin/nvidia-smi -q
13+
# No "-" prefix: if no licensed status line was captured, this command fails,
# the unit fails, and the remaining ExecStart= lines are not run.
ExecStart=/usr/bin/grep -q "License Status.*: Licensed" /tmp/.nvidia-gridd-license
14+
# Marker file recording that the license check passed.
ExecStart=/usr/bin/touch /etc/drivers/.grid-licensed
15+
# The check succeeded, so polling is no longer needed; --no-block avoids
# waiting for the stop job to finish.
ExecStart=/usr/bin/systemctl stop grid-license-check.timer --no-block
16+
# Applies to every command above; this is how the nvidia-smi output reaches grep.
StandardOutput=append:/tmp/.nvidia-gridd-license
17+
18+
[Install]
19+
WantedBy=nvidia-k8s-device-plugin.service
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Timer that repeatedly triggers grid-license-check.service. The license-check
# and fallback services stop this timer once they have written the marker file.
[Unit]
2+
Description=GRID License Check Timer
3+
RefuseManualStart=true
4+
5+
[Timer]
6+
Unit=grid-license-check.service
7+
# First run 5 seconds after boot.
OnBootSec=5s
8+
# Subsequent runs 2 seconds after the triggered unit was last activated.
OnUnitActiveSec=2s
9+
# Allow at most 1 second of scheduling slack for each elapse.
AccuracySec=1s
10+
11+
[Install]
12+
WantedBy=timers.target
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Service drop-in; presumably the grid-license-file-check.conf that the spec
# installs under nvidia-k8s-device-plugin.service.d - confirm against the tree.
[Service]
2+
# Fail the start attempt unless the marker file /etc/drivers/.grid-licensed
# exists as a regular file.
ExecStartPre=/usr/bin/test -f /etc/drivers/.grid-licensed

packages/kmod-6.1-nvidia-r580/kmod-6.1-nvidia-r580.spec

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@ Source206: nvidia-persistenced.service
4949
Source207: fabricmanager.env
5050
Source208: gridd.conf
5151
Source209: nvidia-gridd.service
52+
Source210: grid-license-check.service
53+
Source211: grid-license-check.timer
54+
Source212: open-gpu-license-fallback.service
55+
Source213: tesla-license-fallback.service
56+
Source214: grid-license-file-check.conf
5257

5358
# NVIDIA tesla conf files from 300 to 399
5459
Source300: nvidia-tesla-tmpfiles.conf
@@ -394,7 +399,9 @@ install kernel-open/nvidia-drm.ko %{buildroot}%{_cross_datadir}/nvidia/grid/driv
394399
# Install nvidia-gridd and related files
395400
install -m 755 nvidia-gridd %{buildroot}%{_cross_bindir}/nvidia-gridd
396401
install -m 644 %{S:208} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf
397-
install -p -m 0644 %{S:209} %{buildroot}%{_cross_unitdir}
402+
install -p -m 0644 %{S:209} %{S:210} %{S:211} %{S:212} %{S:213} %{buildroot}%{_cross_unitdir}
403+
install -d %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
404+
install -p -m 0644 %{S:214} %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
398405
popd
399406
# End GRID driver
400407
%endif
@@ -730,6 +737,11 @@ popd
730737
%{_cross_bindir}/nvidia-gridd
731738
%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf
732739
%{_cross_unitdir}/nvidia-gridd.service
740+
%{_cross_unitdir}/grid-license-check.service
741+
%{_cross_unitdir}/grid-license-check.timer
742+
%{_cross_unitdir}/open-gpu-license-fallback.service
743+
%{_cross_unitdir}/tesla-license-fallback.service
744+
%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d/grid-license-file-check.conf
733745

734746
%{_cross_datadir}/nvidia/grid/drivers/nvidia.ko
735747
%{_cross_datadir}/nvidia/grid/drivers/nvidia-uvm.ko
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Fallback for the open-gpu driver case: creates the /etc/drivers/.grid-licensed
# marker unconditionally (no license query) and stops the polling timer.
[Unit]
2+
Description=Open GPU GRID License Check Fallback
3+
RefuseManualStart=true
4+
RefuseManualStop=true
5+
6+
[Service]
7+
Type=oneshot
8+
# The ExecStart= commands below only run when this condition command succeeds.
# NOTE(review): presumably true when the open-gpu driver is the one in use -
# confirm against ghostdog's documentation.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
9+
ExecStart=/usr/bin/touch /etc/drivers/.grid-licensed
10+
# --no-block: do not wait for the timer's stop job to complete.
ExecStart=/usr/bin/systemctl stop grid-license-check.timer --no-block
11+
# Keep the unit active after the commands exit so it is not re-run.
RemainAfterExit=true
12+
13+
[Install]
14+
# NOTE(review): there is no Before= ordering against
# nvidia-k8s-device-plugin.service; presumably the plugin is expected to retry
# until the marker file exists - confirm.
WantedBy=nvidia-k8s-device-plugin.service
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Fallback for the tesla driver case: creates the /etc/drivers/.grid-licensed
# marker unconditionally (no license query) and stops the polling timer.
[Unit]
2+
Description=Tesla GRID License Check Fallback
3+
RefuseManualStart=true
4+
RefuseManualStop=true
5+
6+
[Service]
7+
Type=oneshot
8+
# The ExecStart= commands below only run when this condition command succeeds.
# NOTE(review): presumably true when the tesla driver is the one in use -
# confirm against ghostdog's documentation.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
9+
ExecStart=/usr/bin/touch /etc/drivers/.grid-licensed
10+
# --no-block: do not wait for the timer's stop job to complete.
ExecStart=/usr/bin/systemctl stop grid-license-check.timer --no-block
11+
# Keep the unit active after the commands exit so it is not re-run.
RemainAfterExit=true
12+
13+
[Install]
14+
# NOTE(review): there is no Before= ordering against
# nvidia-k8s-device-plugin.service; presumably the plugin is expected to retry
# until the marker file exists - confirm.
WantedBy=nvidia-k8s-device-plugin.service

0 commit comments

Comments
 (0)