Commit 4dac656

backport: Update drivers and use subs for version
/not-latest

Signed-off-by: Mike McKiernan <[email protected]>
1 parent 34d2f5b commit 4dac656

17 files changed, +57 -43 lines changed

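Every hunk below makes the same change: each documented ``helm install`` command gains a ``--set version=${version}`` argument so the version value comes from a single ``version`` substitution rather than being typed out per page. The following is a minimal sketch of how a reader might run one of the updated commands, assuming ``version`` is first exported as a shell variable; the release value shown is only an illustrative placeholder and is not taken from this commit.

   # Placeholder release for illustration; use the GPU Operator version you intend to install.
   $ version=v24.6.0

   # Any of the updated commands then expands the variable at run time, for example:
   $ helm install --wait --generate-name \
        -n gpu-operator --create-namespace \
        nvidia/gpu-operator \
        --set version=${version}

Threading the same flag through every page presumably keeps the snippets identical across the backported release branches, with only the substitution value changing.
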
gpu-operator/custom-driver-params.rst

Lines changed: 1 addition & 0 deletions
@@ -38,4 +38,5 @@ containing the kernel module parameters.
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.kernelModuleConfig.name="kernel-module-params"

gpu-operator/getting-started.rst

Lines changed: 15 additions & 5 deletions
@@ -99,7 +99,8 @@ Procedure

    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
-      nvidia/gpu-operator
+      nvidia/gpu-operator \
+      --set version=${version}

 - Install the Operator and specify configuration options:


@@ -108,6 +109,7 @@ Procedure
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set <option-name>=<option-value>

 Refer to the :ref:`gpu-operator-helm-chart-options`

@@ -292,7 +294,8 @@ For example, to install the GPU Operator in the ``nvidia-gpu-operator`` namespace

    $ helm install --wait --generate-name \
       -n nvidia-gpu-operator --create-namespace \
-      nvidia/gpu-operator
+      nvidia/gpu-operator \
+      --set version=${version} \

 If you do not specify a namespace during installation, all GPU Operator components are installed in the ``default`` namespace.


@@ -330,6 +333,7 @@ In this scenario, use the NVIDIA Container Toolkit image that is built on UBI 8:
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set toolkit.version=v1.16.1-ubi8

 Replace the ``v1.16.1`` value in the preceding command with the version that is supported

@@ -350,6 +354,7 @@ In this scenario, the NVIDIA GPU driver is already installed on the worker nodes
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.enabled=false

 The preceding command prevents the Operator from installing the GPU driver on any nodes in the cluster.

@@ -378,9 +383,10 @@ Install the Operator with the following options:

    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
-      nvidia/gpu-operator \
-      --set driver.enabled=false \
-      --set toolkit.enabled=false
+      nvidia/gpu-operator \
+      --set version=${version} \
+      --set driver.enabled=false \
+      --set toolkit.enabled=false


 Pre-Installed NVIDIA Container Toolkit (but no drivers)

@@ -401,6 +407,7 @@ In this scenario, the NVIDIA Container Toolkit is already installed on the worker nodes
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set toolkit.enabled=false

 Running a Custom Driver Image

@@ -429,6 +436,7 @@ you can build a custom driver container image. Follow these steps:
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.repository=docker.io/nvidia \
       --set driver.version="465.27"


@@ -466,6 +474,7 @@ If you need to specify custom values, refer to the following sample command for

    helm install gpu-operator -n gpu-operator --create-namespace \
       nvidia/gpu-operator $HELM_OPTIONS \
+      --set version=${version} \
       --set toolkit.env[0].name=CONTAINERD_CONFIG \
       --set toolkit.env[0].value=/etc/containerd/config.toml \
       --set toolkit.env[1].name=CONTAINERD_SOCKET \

@@ -538,6 +547,7 @@ These options can be passed to GPU Operator during install time as below.

    helm install gpu-operator -n gpu-operator --create-namespace \
       nvidia/gpu-operator $HELM_OPTIONS \
+      --set version=${version} \
       --set toolkit.env[0].name=CONTAINERD_CONFIG \
       --set toolkit.env[0].value=/var/snap/microk8s/current/args/containerd-template.toml \
       --set toolkit.env[1].name=CONTAINERD_SOCKET \

gpu-operator/google-gke.rst

Lines changed: 1 addition & 0 deletions
@@ -169,6 +169,7 @@ You can create a node pool that uses a Container-Optimized OS node image or a Ubuntu node image
    $ helm install --wait --generate-name \
       -n gpu-operator \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set hostPaths.driverInstallDir=/home/kubernetes/bin/nvidia \
       --set toolkit.installDir=/home/kubernetes/bin/nvidia \
       --set cdi.enabled=true \

gpu-operator/gpu-driver-configuration.rst

Lines changed: 1 addition & 0 deletions
@@ -277,6 +277,7 @@ Perform the following steps to install the GPU Operator and use the NVIDIA driver
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version}
       --set driver.nvidiaDriverCRD.enabled=true

 By default, Helm configures a ``default`` NVIDIA driver custom resource during installation.

gpu-operator/gpu-operator-confidential-containers.rst

Lines changed: 1 addition & 0 deletions
@@ -407,6 +407,7 @@ Perform the following steps to install the Operator for use with confidential containers
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set sandboxWorkloads.enabled=true \
       --set kataManager.enabled=true \
       --set ccManager.enabled=true \

gpu-operator/gpu-operator-kata.rst

Lines changed: 1 addition & 0 deletions
@@ -269,6 +269,7 @@ Perform the following steps to install the Operator for use with Kata Containers
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set sandboxWorkloads.enabled=true \
       --set kataManager.enabled=true


gpu-operator/gpu-operator-kubevirt.rst

Lines changed: 2 additions & 0 deletions
@@ -140,6 +140,7 @@ Install the GPU Operator, enabling ``sandboxWorkloads``:
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set sandboxWorkloads.enabled=true

 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -171,6 +172,7 @@ Install the GPU Operator with ``sandboxWorkloads`` and ``vgpuManager`` enabled a
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set sandboxWorkloads.enabled=true \
       --set vgpuManager.enabled=true \
       --set vgpuManager.repository=<path to private repository> \

gpu-operator/gpu-operator-mig.rst

Lines changed: 3 additions & 0 deletions
@@ -57,6 +57,7 @@ Perform the following steps to install the Operator and configure MIG:
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set mig.strategy=single

 Set ``mig.strategy`` to ``mixed`` when MIG mode is not enabled on all GPUs on a node.

@@ -463,6 +464,7 @@ can be used to install the GPU Operator:
    $ helm install gpu-operator \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.enabled=false


@@ -513,6 +515,7 @@ Alternatively, you can create a custom config map for use by MIG Manager by performing the following steps:
    $ helm install gpu-operator \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set migManager.gpuClientsConfig.name=gpu-clients
       --set driver.enabled=false


gpu-operator/gpu-operator-rdma.rst

Lines changed: 3 additions & 0 deletions
@@ -132,6 +132,7 @@ To use DMA-BUF and network device drivers that are installed by the Network Operator
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.useOpenKernelModules=true

 To use DMA-BUF and network device drivers that are installed on the host:

@@ -141,6 +142,7 @@ To use DMA-BUF and network device drivers that are installed on the host:
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.useOpenKernelModules=true \
       --set driver.rdma.useHostMofed=true

@@ -433,6 +435,7 @@ The following sample command applies to clusters that use the Network Operator t
    $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
+      --set version=${version} \
       --set driver.useOpenKernelModules=true \
       --set gds.enabled=true


gpu-operator/gpu-sharing.rst

Lines changed: 1 addition & 0 deletions
@@ -364,6 +364,7 @@ Perform the following steps to configure time-slicing before installing the operator

    $ helm install gpu-operator nvidia/gpu-operator \
       -n gpu-operator \
+      --set version=${version} \
       --set devicePlugin.config.name=time-slicing-config

 #. Refer to either :ref:`time-slicing-cluster-wide-config` or
