Skip to content

Commit 16c8c21

Browse files
committed
match NVIDIA instructions and bump cuda to 12.9.1-1
1 parent 286805b commit 16c8c21

File tree

3 files changed

+8
-28
lines changed

3 files changed

+8
-28
lines changed

ansible/roles/cuda/README.md

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,10 @@
22

33
Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
44

5-
## Prerequisites
6-
7-
Requires OFED to be installed to provide required kernel-* packages.
8-
95
## Role Variables
106

117
- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
128
- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
13-
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-9']`.
9+
- `cuda_packages`: Optional. Default provides CUDA Toolkit and GPUDirect Storage (GDS).
1410
- `cuda_package_version`: Optional. Version of the CUDA packages to install. Default `12.9.1-1`. Use `latest` to install the latest packages if not installed, without upgrading already-installed packages. Use `'none'` to skip installing CUDA.
1511
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
22
cuda_nvidia_driver_stream: '575-open'
3-
cuda_package_version: '12.9.0-1'
4-
cuda_version_short: '12.9'
3+
cuda_package_version: '12.9.1-1'
4+
cuda_version_short: "{{ (cuda_package_version | split('.'))[0:2] | join('.') }}" # major.minor
55
cuda_packages:
6-
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
6+
- "cuda-toolkit-{{ cuda_package_version }}"
77
- nvidia-gds
88
- cmake
9-
- cuda-toolkit-12-9
109
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1110
cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples"
1211
cuda_samples_programs:

ansible/roles/cuda/tasks/install.yml

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,5 @@
11

2-
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
3-
4-
- name: Check for OFED/DOCA
5-
command:
6-
cmd: dnf list --installed rdma-core
7-
register: _dnf_rdma_core
8-
changed_when: false
9-
10-
- name: Assert OFED installed
11-
assert:
12-
that: "'mlnx' in _dnf_rdma_core.stdout"
13-
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
2+
# Based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/
143

154
- name: Install cuda repo
165
get_url:
@@ -29,14 +18,8 @@
2918
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3019
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
3120

32-
- name: Check if nvidia driver module is installed
33-
ansible.builtin.command: dnf module list --installed nvidia-driver
34-
changed_when: false
35-
failed_when: false
36-
register: _cuda_driver_module_installed
37-
3821
- name: Install nvidia drivers
39-
ansible.builtin.command: dnf module install -y nvidia-driver
22+
ansible.builtin.command: dnf install -y nvidia-open
4023
register: _cuda_driver_install
4124
when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
4225
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
@@ -46,6 +29,8 @@
4629
that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched
4730
fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}"
4831

32+
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
33+
4934
- name: Install cuda packages
5035
ansible.builtin.dnf:
5136
name: "{{ cuda_packages }}"

0 commit comments

Comments
 (0)