diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index c968532b5..d369036ab 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -2,14 +2,10 @@ Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. -## Prerequisites - -Requires OFED to be installed to provide required kernel-* packages. - ## Role Variables - `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. - `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. -- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-9']`. +- `cuda_packages`: Optional. Default provides CUDA Toolkit and GPUDirect Storage (GDS). -- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. +- `cuda_package_version`: Optional. Version of the CUDA packages to install. Default is a pinned release (see `defaults/main.yml`). Use `latest` to install the latest packages if not installed; this won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. 
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 6a67b1f12..bc62ae843 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,12 +1,12 @@ cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" cuda_nvidia_driver_stream: '575-open' -cuda_package_version: '12.9.0-1' -cuda_version_short: '12.9' +cuda_nvidia_driver_pkg: "nvidia-open-3:575.57.08-1.el{{ ansible_distribution_major_version }}" +cuda_package_version: '12.9.1-1' +cuda_version_short: "{{ (cuda_package_version | split('.'))[0:2] | join('.') }}" # major.minor cuda_packages: - - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" + - "cuda-toolkit{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds - cmake - - cuda-toolkit-12-9 cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples" cuda_samples_programs: diff --git a/ansible/roles/cuda/tasks/install.yml b/ansible/roles/cuda/tasks/install.yml index 51c92a0d3..39bd20d94 100644 --- a/ansible/roles/cuda/tasks/install.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,16 +1,5 @@ -# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation - -- name: Check for OFED/DOCA - command: - cmd: dnf list --installed rdma-core - register: _dnf_rdma_core - changed_when: false - -- name: Assert OFED installed - assert: - that: "'mlnx' in _dnf_rdma_core.stdout" - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" 
+# Based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ - name: Install cuda repo get_url: @@ -29,23 +18,18 @@ when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" -- name: Check if nvidia driver module is installed - ansible.builtin.command: dnf module list --installed nvidia-driver - changed_when: false - failed_when: false - register: _cuda_driver_module_installed - - name: Install nvidia drivers - ansible.builtin.command: dnf module install -y nvidia-driver + ansible.builtin.dnf: + name: "{{ cuda_nvidia_driver_pkg }}" register: _cuda_driver_install - when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" - changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" - name: Check kernel has not been modified assert: - that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched - fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" + that: "_cuda_driver_install.results | default([]) | select('search', ': kernel-[0-9]') | list | length == 0" # dnf module registers 'results', not stdout; digit after 'kernel-' ensures we don't flag e.g. kernel-devel-matched + fail_msg: "{{ _cuda_driver_install.results | default([]) | select('search', ': kernel-[0-9]') | list }}" +# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html + - name: Install cuda packages ansible.builtin.dnf: name: "{{ cuda_packages }}" diff --git a/environments/common/inventory/group_vars/all/timestamps.yml b/environments/common/inventory/group_vars/all/timestamps.yml index 8d046437a..a7a4be269 100644 --- a/environments/common/inventory/group_vars/all/timestamps.yml +++ b/environments/common/inventory/group_vars/all/timestamps.yml @@ -39,10 +39,10 @@ appliances_pulp_repos: epel: '8': path: epel/8/Everything/x86_64 - timestamp: 20250326T000103 + timestamp: 20250609T000109 '9': path: epel/9/Everything/x86_64 - timestamp: 20250326T000103 + timestamp: 20250609T000109 extras: '8.10': path: rocky/8.10/extras/x86_64/os