From d42096dd3d00a86e2c2878c70eff029ceadbb106 Mon Sep 17 00:00:00 2001 From: Matthew Yeazel Date: Sat, 11 Oct 2025 00:09:00 +0000 Subject: [PATCH 1/3] kmod-6.1-nvidia-r570: Add grid-license-check Add a unit that checks for the license to be valid for GRID. Kubelet requires this unit so if the license is not present, then the node never joins the cluster. This prevents a situation where a node could fail to get a license, join the cluster, and then later have workloads start to fail due to the unlicensed status. Signed-off-by: Matthew Yeazel --- packages/kmod-6.1-nvidia-r570/.gitignore | 1 + .../grid-license-check.service | 26 +++++++++++++++++++ .../kmod-6.1-nvidia-r570.spec | 4 ++- 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 packages/kmod-6.1-nvidia-r570/grid-license-check.service diff --git a/packages/kmod-6.1-nvidia-r570/.gitignore b/packages/kmod-6.1-nvidia-r570/.gitignore index db8b415b..43486efa 100644 --- a/packages/kmod-6.1-nvidia-r570/.gitignore +++ b/packages/kmod-6.1-nvidia-r570/.gitignore @@ -1,3 +1,4 @@ NVidiaEULAforAWS.pdf COPYING *.rpm +NvidiaGridAWSUserLicenseAgreement.DOCX diff --git a/packages/kmod-6.1-nvidia-r570/grid-license-check.service b/packages/kmod-6.1-nvidia-r570/grid-license-check.service new file mode 100644 index 00000000..3318f956 --- /dev/null +++ b/packages/kmod-6.1-nvidia-r570/grid-license-check.service @@ -0,0 +1,26 @@ +[Unit] +Description=GRID License Check +RefuseManualStart=true +RefuseManualStop=true +DefaultDependencies=no +Before=kubelet.service +After=nvidia-gridd.service +Requires=nvidia-gridd.service + +[Service] +Type=oneshot +ExecCondition=/usr/bin/ghostdog match-nvidia-driver grid +# Discard output left by earlier attempts; a stale "Unlicensed" would otherwise fail every retry. +ExecStartPre=-/usr/bin/rm -f /tmp/.nvidia-gridd-license +# Query the driver; stdout (including the license status) is appended to the file named in StandardOutput. +ExecStart=/usr/bin/nvidia-smi -q +# Succeed only if the captured output does not contain the word "Unlicensed". 
+ExecStart=/usr/bin/grep -Fqvzw Unlicensed /tmp/.nvidia-gridd-license +RemainAfterExit=true +StandardOutput=append:/tmp/.nvidia-gridd-license +Restart=on-failure +RestartSec=1 +StartLimitBurst=120 + +[Install] +RequiredBy=kubelet.service diff --git a/packages/kmod-6.1-nvidia-r570/kmod-6.1-nvidia-r570.spec b/packages/kmod-6.1-nvidia-r570/kmod-6.1-nvidia-r570.spec index 08365a78..25873091 100644 --- a/packages/kmod-6.1-nvidia-r570/kmod-6.1-nvidia-r570.spec +++ b/packages/kmod-6.1-nvidia-r570/kmod-6.1-nvidia-r570.spec @@ -49,6 +49,7 @@ Source206: nvidia-persistenced.service Source207: fabricmanager.env Source208: gridd.conf Source209: nvidia-gridd.service +Source210: grid-license-check.service # NVIDIA tesla conf files from 300 to 399 Source300: nvidia-tesla-tmpfiles.conf @@ -394,7 +395,7 @@ install kernel-open/nvidia-drm.ko %{buildroot}%{_cross_datadir}/nvidia/grid/driv # Install nvidia-gridd and related files install -m 755 nvidia-gridd %{buildroot}%{_cross_bindir}/nvidia-gridd install -m 644 %{S:208} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf -install -p -m 0644 %{S:209} %{buildroot}%{_cross_unitdir} +install -p -m 0644 %{S:209} %{S:210} %{buildroot}%{_cross_unitdir} popd # End GRID driver %endif @@ -722,6 +723,7 @@ popd %{_cross_bindir}/nvidia-gridd %{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf %{_cross_unitdir}/nvidia-gridd.service +%{_cross_unitdir}/grid-license-check.service %{_cross_datadir}/nvidia/grid/drivers/nvidia.ko %{_cross_datadir}/nvidia/grid/drivers/nvidia-uvm.ko From d621a5e457a33f2c04abc48430a8caf47cea6d67 Mon Sep 17 00:00:00 2001 From: Matthew Yeazel Date: Sat, 11 Oct 2025 00:14:49 +0000 Subject: [PATCH 2/3] kmod-6.12-nvidia-r570: Add grid-license-check Add a unit that checks for the license to be valid for GRID. Kubelet requires this unit so if the license is not present, then the node never joins the cluster. 
This prevents a situation where a node could fail to get a license, join the cluster, and then later have workloads start to fail due to the unlicensed status. Signed-off-by: Matthew Yeazel --- packages/kmod-6.12-nvidia-r570/.gitignore | 1 + .../grid-license-check.service | 26 +++++++++++++++++++ .../kmod-6.12-nvidia-r570.spec | 4 ++- 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 packages/kmod-6.12-nvidia-r570/grid-license-check.service diff --git a/packages/kmod-6.12-nvidia-r570/.gitignore b/packages/kmod-6.12-nvidia-r570/.gitignore index db8b415b..43486efa 100644 --- a/packages/kmod-6.12-nvidia-r570/.gitignore +++ b/packages/kmod-6.12-nvidia-r570/.gitignore @@ -1,3 +1,4 @@ NVidiaEULAforAWS.pdf COPYING *.rpm +NvidiaGridAWSUserLicenseAgreement.DOCX diff --git a/packages/kmod-6.12-nvidia-r570/grid-license-check.service b/packages/kmod-6.12-nvidia-r570/grid-license-check.service new file mode 100644 index 00000000..3b2d1f2f --- /dev/null +++ b/packages/kmod-6.12-nvidia-r570/grid-license-check.service @@ -0,0 +1,26 @@ +[Unit] +Description=GRID License Check +RefuseManualStart=true +RefuseManualStop=true +DefaultDependencies=no +Before=kubelet.service +After=nvidia-gridd.service +Requires=nvidia-gridd.service + +[Service] +Type=oneshot +ExecCondition=/usr/bin/ghostdog match-nvidia-driver grid +# Discard output left by earlier attempts; a stale "Unlicensed" would otherwise fail every retry. +ExecStartPre=-/usr/bin/rm -f /tmp/.nvidia-gridd-license +# Query the driver; stdout (including the license status) is appended to the file named in StandardOutput. +ExecStart=/usr/bin/nvidia-smi -q +# Succeed only if the captured output does not contain the word "Unlicensed". 
+ExecStart=/usr/bin/grep -Fqvzw Unlicensed /tmp/.nvidia-gridd-license +RemainAfterExit=true +StandardOutput=append:/tmp/.nvidia-gridd-license +Restart=on-failure +RestartSec=1 +StartLimitBurst=120 + +[Install] +RequiredBy=kubelet.service diff --git a/packages/kmod-6.12-nvidia-r570/kmod-6.12-nvidia-r570.spec b/packages/kmod-6.12-nvidia-r570/kmod-6.12-nvidia-r570.spec index ff563e2f..052f5122 100644 --- a/packages/kmod-6.12-nvidia-r570/kmod-6.12-nvidia-r570.spec +++ b/packages/kmod-6.12-nvidia-r570/kmod-6.12-nvidia-r570.spec @@ -56,6 +56,7 @@ Source206: nvidia-persistenced.service Source207: fabricmanager.env Source208: gridd.conf Source209: nvidia-gridd.service +Source210: grid-license-check.service # NVIDIA tesla conf files from 300 to 399 Source300: nvidia-tesla-tmpfiles.conf @@ -410,7 +411,7 @@ install kernel-open/nvidia-drm.ko %{buildroot}%{_cross_datadir}/nvidia/grid/driv # Install nvidia-gridd and related files install -m 755 nvidia-gridd %{buildroot}%{_cross_bindir}/nvidia-gridd install -m 644 %{S:208} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf -install -p -m 0644 %{S:209} %{buildroot}%{_cross_unitdir} +install -p -m 0644 %{S:209} %{S:210} %{buildroot}%{_cross_unitdir} popd # End GRID driver %endif @@ -748,6 +749,7 @@ popd %{_cross_bindir}/nvidia-gridd %{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf %{_cross_unitdir}/nvidia-gridd.service +%{_cross_unitdir}/grid-license-check.service %{_cross_datadir}/nvidia/grid/drivers/nvidia.ko %{_cross_datadir}/nvidia/grid/drivers/nvidia-uvm.ko From 0e100c17c271418dc066af5d7ea637d8a5fbb525 Mon Sep 17 00:00:00 2001 From: Matthew Yeazel Date: Sat, 11 Oct 2025 00:15:51 +0000 Subject: [PATCH 3/3] kmod-6.12-nvidia-r580: Add grid-license-check Add a unit that checks for the license to be valid for GRID. Kubelet requires this unit so if the license is not present, then the node never joins the cluster. 
This prevents a situation where a node could fail to get a license, join the cluster, and then later have workloads start to fail due to the unlicensed status. Signed-off-by: Matthew Yeazel --- packages/kmod-6.12-nvidia-r580/.gitignore | 1 + .../grid-license-check.service | 26 +++++++++++++++++++ .../kmod-6.12-nvidia-r580.spec | 4 ++- 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 packages/kmod-6.12-nvidia-r580/grid-license-check.service diff --git a/packages/kmod-6.12-nvidia-r580/.gitignore b/packages/kmod-6.12-nvidia-r580/.gitignore index db8b415b..43486efa 100644 --- a/packages/kmod-6.12-nvidia-r580/.gitignore +++ b/packages/kmod-6.12-nvidia-r580/.gitignore @@ -1,3 +1,4 @@ NVidiaEULAforAWS.pdf COPYING *.rpm +NvidiaGridAWSUserLicenseAgreement.DOCX diff --git a/packages/kmod-6.12-nvidia-r580/grid-license-check.service b/packages/kmod-6.12-nvidia-r580/grid-license-check.service new file mode 100644 index 00000000..3318f956 --- /dev/null +++ b/packages/kmod-6.12-nvidia-r580/grid-license-check.service @@ -0,0 +1,26 @@ +[Unit] +Description=GRID License Check +RefuseManualStart=true +RefuseManualStop=true +DefaultDependencies=no +Before=kubelet.service +After=nvidia-gridd.service +Requires=nvidia-gridd.service + +[Service] +Type=oneshot +ExecCondition=/usr/bin/ghostdog match-nvidia-driver grid +# Discard output left by earlier attempts; a stale "Unlicensed" would otherwise fail every retry. +ExecStartPre=-/usr/bin/rm -f /tmp/.nvidia-gridd-license +# Query the driver; stdout (including the license status) is appended to the file named in StandardOutput. +ExecStart=/usr/bin/nvidia-smi -q +# Succeed only if the captured output does not contain the word "Unlicensed". 
+ExecStart=/usr/bin/grep -Fqvzw Unlicensed /tmp/.nvidia-gridd-license +RemainAfterExit=true +StandardOutput=append:/tmp/.nvidia-gridd-license +Restart=on-failure +RestartSec=1 +StartLimitBurst=120 + +[Install] +RequiredBy=kubelet.service diff --git a/packages/kmod-6.12-nvidia-r580/kmod-6.12-nvidia-r580.spec b/packages/kmod-6.12-nvidia-r580/kmod-6.12-nvidia-r580.spec index d9a7806c..ccafb0e0 100644 --- a/packages/kmod-6.12-nvidia-r580/kmod-6.12-nvidia-r580.spec +++ b/packages/kmod-6.12-nvidia-r580/kmod-6.12-nvidia-r580.spec @@ -56,6 +56,7 @@ Source206: nvidia-persistenced.service Source207: fabricmanager.env Source208: gridd.conf Source209: nvidia-gridd.service +Source210: grid-license-check.service # NVIDIA tesla conf files from 300 to 399 Source300: nvidia-tesla-tmpfiles.conf @@ -410,7 +411,7 @@ install kernel-open/nvidia-drm.ko %{buildroot}%{_cross_datadir}/nvidia/grid/driv # Install nvidia-gridd and related files install -m 755 nvidia-gridd %{buildroot}%{_cross_bindir}/nvidia-gridd install -m 644 %{S:208} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf -install -p -m 0644 %{S:209} %{buildroot}%{_cross_unitdir} +install -p -m 0644 %{S:209} %{S:210} %{buildroot}%{_cross_unitdir} popd # End GRID driver %endif @@ -754,6 +755,7 @@ popd %{_cross_bindir}/nvidia-gridd %{_cross_factorydir}%{_cross_sysconfdir}/nvidia/gridd.conf %{_cross_unitdir}/nvidia-gridd.service +%{_cross_unitdir}/grid-license-check.service %{_cross_datadir}/nvidia/grid/drivers/nvidia.ko %{_cross_datadir}/nvidia/grid/drivers/nvidia-uvm.ko