Skip to content

Commit ad20d20

Browse files
committed
pin nvidia driver to working version and autodetect os/arch
1 parent c4b2795 commit ad20d20

File tree

2 files changed

+11
-8
lines changed

2 files changed

+11
-8
lines changed

ansible/roles/cuda/defaults/main.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
2-
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
3-
cuda_driver_stream: default
1+
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
2+
cuda_driver_stream: '560-open'
43
cuda_package_version: 'latest'
54
cuda_packages:
65
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"

ansible/roles/cuda/tasks/install.yml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
33

4-
- name: Check for OFED
4+
- name: Check for OFED/DOCA
55
command:
66
cmd: dnf list --installed rdma-core
77
register: _dnf_rdma_core
@@ -10,12 +10,12 @@
1010
- name: Assert OFED installed
1111
assert:
1212
that: "'mlnx' in _dnf_rdma_core.stdout"
13-
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?"
13+
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
1414

1515
- name: Install cuda repo
1616
get_url:
17-
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
18-
url: "{{ cuda_repo }}"
17+
dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo"
18+
url: "{{ cuda_repo_url }}"
1919

2020
- name: Check if nvidia driver module is enabled
2121
shell:
@@ -25,7 +25,7 @@
2525
register: _cuda_driver_module_enabled
2626

2727
- name: Enable nvidia driver module
28-
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
28+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_driver_stream }}"
2929
register: _cuda_driver_module_enable
3030
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3131
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
@@ -36,6 +36,10 @@
3636
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3737
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
3838

39+
# Fail if the text 'kernel' appears anywhere in the output registered as
# _cuda_driver_install. The task registering that variable is conditional (it
# has a `when:`), and a skipped task's registered result has no `stdout`, so
# default to an empty string rather than erroring on an undefined attribute.
- name: Check kernel has not been modified
  ansible.builtin.assert:
    that: "'kernel' not in (_cuda_driver_install.stdout | default(''))"
    fail_msg: "Found 'kernel' in nvidia driver install output, the driver installation may have modified the kernel"
42+
3943
- name: Install cuda packages
4044
ansible.builtin.dnf:
4145
name: "{{ cuda_packages }}"

0 commit comments

Comments
 (0)