diff --git a/gpu-operator/getting-started.rst b/gpu-operator/getting-started.rst index 647a84cca..66e5c7021 100644 --- a/gpu-operator/getting-started.rst +++ b/gpu-operator/getting-started.rst @@ -173,6 +173,13 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. Set this value to ``false`` when using the Operator on systems with pre-installed drivers. - ``true`` + * - ``driver.kernelModuleType`` + - Specifies the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are `auto` (default), `proprietary` and `open`. + * `auto` means that the recommended kernel module type is chosen based on the GPU devices on the host and the driver branch used. + * `proprietary` means the proprietary kernel modules. + - ``auto`` + * - ``driver.repository`` - The images are downloaded from NGC. Specify another image repository when using custom driver images. @@ -197,8 +204,11 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. runs slowly in your cluster. - ``60s`` - * - ``driver.useOpenKernelModules`` - - When set to ``true``, the driver containers install the NVIDIA Open GPU Kernel module driver. + * - ``driver.useOpenKernelModules`` (deprecated) + - Use `driver.kernelModuleType` instead. + This field is no longer honored in v25.3.0 and later. + It will be removed in a future release. + When set to ``true``, the driver containers install the NVIDIA Open GPU Kernel module driver. - ``false`` * - ``driver.usePrecompiled`` diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index d5b62915f..f73c2371e 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -55,13 +55,13 @@ The product life cycle and versioning are subject to change in the future.
* - GPU Operator Version - Status - * - 24.9.x + * - 25.3.x - Generally Available - * - 24.6.x + * - 24.9.x - Maintenance - * - 24.3.x and lower + * - 24.6.x and lower - EOL @@ -98,35 +98,35 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. | `535.216.03 `_ * - NVIDIA Driver Manager for Kubernetes - - `v0.7.0 `__ + - `v0.8.0 `__ * - NVIDIA Container Toolkit - `1.17.4 `__ * - NVIDIA Kubernetes Device Plugin - - `0.17.0 `__ + - `0.17.1 `__ * - DCGM Exporter - - `3.3.9-3.6.1 `__ + - `4.1.1-4.0.4 `__ * - Node Feature Discovery - - v0.16.6 + - `v0.17.2 `__ * - | NVIDIA GPU Feature Discovery | for Kubernetes - `0.17.0 `__ * - NVIDIA MIG Manager for Kubernetes - - `0.10.0 `__ + - `0.11.0 `__ * - DCGM - - `3.3.9-1 `__ + - `4.1.1 `__ * - Validator for NVIDIA GPU Operator - ${version} * - NVIDIA KubeVirt GPU Device Plugin - - `v1.2.10 `__ + - `v1.3.1 `__ * - NVIDIA vGPU Device Manager - `v0.2.8 `__ @@ -135,14 +135,14 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. - `2.20.5 `__ * - NVIDIA Kata Manager for Kubernetes - - `v0.2.2 `__ + - `v0.2.3 `__ * - | NVIDIA Confidential Computing | Manager for Kubernetes - v0.1.1 * - NVIDIA GDRCopy Driver - - `v2.4.1-1 `__ + - `v2.4.4 `__ .. 
_gds-open-kernel: diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index f7b941576..41bdcab59 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -462,7 +462,7 @@ Operating System Kubernetes KubeVirt OpenShift Virtual ================ =========== ============= ========= ============= =========== Ubuntu 20.04 LTS 1.23---1.29 0.36+ 0.59.1+ Ubuntu 22.04 LTS 1.23---1.29 0.36+ 0.59.1+ -Red Hat Core OS 4.12---4.17 4.13---4.17 +Red Hat Core OS 4.12---4.18 4.13---4.18 ================ =========== ============= ========= ============= =========== You can run GPU passthrough and NVIDIA vGPU in the same cluster as long as you use diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 8ee75a294..b2df2ade2 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -34,6 +34,93 @@ See the :ref:`GPU Operator Component Matrix` for a list of software components a ---- +.. _v25.3.0: + +25.3.0 +====== + +.. _v25.3.0-new-features: + +New Features +------------ + +* Added support for the following software component versions: + + - NVIDIA Container Toolkit version v1.15.0 + - NVIDIA Driver Manager for Kubernetes v0.8.0 + - NVIDIA DCGM Exporter v4.1.1-4.0.4 + - NVIDIA DCGM v4.1.1 + - Node Feature Discovery v0.17.2 + - NVIDIA MIG Manager for Kubernetes v0.11.0 + - NVIDIA KubeVirt GPU Device Plugin v1.3.1 + - NVIDIA Kata Manager for Kubernetes v0.2.3 + - NVIDIA GDRCopy Driver v2.4.4 + + +* Added a new parameter, `kernelModuleType`, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel modules to use. + The `kernelModuleType` field supports three values to determine how the kernel module is selected. Valid values for the new field are auto (default), open, and proprietary.
+ + In previous versions, the `useOpenKernelModules` field specified whether the driver containers install the NVIDIA Open GPU Kernel module driver. + This field is now deprecated and will be removed in a future release. + If you were using the `useOpenKernelModules` field, it's recommended that you update your configuration to use the `kernelModuleType` field instead. + + Note, ``auto`` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + 550 and 535 branch drivers do not yet support this mode. + If you wish to install the open kernel modules with 550 or 535 drivers, set `driver.kernelModuleType=open`. + +* Added support for Ubuntu 24.04. + +* Added support for NVIDIA HGX GB200 NVL. + +* Added support for OpenShift 4.18. + +* Added support for Containerd 2.0. + +* Added support for the following driver versions: 570.yy.zz, 550.yy.zz, 535.yy.zz + +* Added support for new MIG profiles with HGX B200. + + * Added support for the following profiles: + + * ``1g.23gb`` + * ``1g.23gb+me`` + * ``1g.45gb`` + * ``2g.45gb`` + * ``3g.90gb`` + * ``7g.180gb`` + + * Added an ``all-balanced`` profile that creates the following GPU instances: + + * ``1g.23gb`` :math:`\times` 2 + * ``2g.45gb`` :math:`\times` 1 + * ``3g.90gb`` :math:`\times` 1 + +Improvements +------------ + +* Improved security by removing unneeded permissions in the GPU Operator ClusterRole. + +* Improved GPU Operator metrics to include an `operatorMetricsNamespace` field that sets the metrics namespace to `gpu_operator`. + +.. _v25.3.0-fixed-issues: + +Fixed Issues +------------ + +* Removed the default liveness probe from the GDS and GDRCopy containers of the driver-daemonset. + Long response times of the `lsmod` commands were causing timeout errors in the probe and unnecessary restarts of the container, resulting in the DaemonSet being in a bad state. + +* Fixed an issue where the GPU Operator failed to create a valid DaemonSet name on OpenShift Container Platform when using a 64K kernel page size.
+ Refer to GitHub `issue #1207 `__ for more details. + +* Removed the deprecated ``operator.defaultRuntime`` parameter. + +.. _v25.3.0-known-limitations: + +Known Limitations +----------------- + + .. _v24.9.2: 24.9.2 diff --git a/gpu-operator/versions1.json b/gpu-operator/versions1.json index 50cafe979..5035b1c98 100644 --- a/gpu-operator/versions1.json +++ b/gpu-operator/versions1.json @@ -1,6 +1,14 @@ [ { "preferred": "true", + "url": "../25.3.0", + "version": "25.3.0" + }, + { + "url": "../24.9.2", + "version": "24.9.2" + }, + { "url": "../24.9.1", "version": "24.9.1" }, @@ -15,13 +23,5 @@ { "url": "../24.6.1", "version": "24.6.1" - }, - { - "url": "../24.6.0", - "version": "24.6.0" - }, - { - "url": "../24.3.0", - "version": "24.3.0" } ] \ No newline at end of file diff --git a/openshift/versions1.json b/openshift/versions1.json index 89826a87b..b76c62ad2 100644 --- a/openshift/versions1.json +++ b/openshift/versions1.json @@ -1,4 +1,12 @@ [ { "preferred": "true", + "url": "../25.3.0", + "version": "25.3.0" + }, + { + "url": "../24.9.2", + "version": "24.9.2" + }, + { "url": "../24.9.1", @@ -12,12 +20,4 @@ "url": "../24.6.2", "version": "24.6.2" - }, - { - "url": "../24.6.1", - "version": "24.6.1" - }, - { - "url": "../24.3.0", - "version": "24.3.0" } ] \ No newline at end of file diff --git a/repo.toml b/repo.toml index d574c70a5..86e8170b3 100644 --- a/repo.toml +++ b/repo.toml @@ -153,8 +153,8 @@ output_format = "linkcheck" docs_root = "${root}/gpu-operator" project = "gpu-operator" name = "NVIDIA GPU Operator" -version = "24.9.2" -source_substitutions = { version = "v24.9.2", recommended = "570.86.15" } +version = "25.3.0" +source_substitutions = { version = "v25.3.0", recommended = "570.86.15" } copyright_start = 2020 sphinx_exclude_patterns = [ "life-cycle-policy.rst", @@ -212,7 +212,7 @@ output_format = "linkcheck" docs_root = "${root}/openshift" project = "gpu-operator-openshift" name = "NVIDIA GPU Operator on Red Hat OpenShift
Container Platform" -version = "24.9.2" +version = "25.3.0" copyright_start = 2020 sphinx_exclude_patterns = [ "get-entitlement.rst", @@ -249,7 +249,7 @@ docs_root = "${root}/secure-services-istio-keycloak" project = "secure-services-istio-keycloak" name = "Securing NVIDIA Services with Istio and Keycloak" version = "0.1.0" -copyright_start = 2024 +copyright_start = 2025 [repo_docs.projects.secure-services-istio-keycloak.builds.linkcheck] build_by_default = false