|
1 | 1 |
|
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
|
3 | 3 |
|
# Record the dnf listing of the installed rdma-core package in
# _dnf_rdma_core so that its origin can be inspected afterwards.
# NOTE(review): the diff view this task was recovered from hides two lines
# directly below it; confirm whether further task keys belong here.
- name: Check for OFED/DOCA
  ansible.builtin.command:
    cmd: dnf list --installed rdma-core
  register: _dnf_rdma_core
|
|
# Fail unless the string 'mlnx' appears in the rdma-core listing captured in
# _dnf_rdma_core; its absence is reported as OFED/DOCA possibly missing.
- name: Assert OFED installed
  ansible.builtin.assert:
    that: "'mlnx' in _dnf_rdma_core.stdout"
    fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
14 | 14 |
|
# Download the repo definition found at cuda_repo_url into /etc/yum.repos.d,
# naming the file after the distribution major version.
- name: Install cuda repo
  ansible.builtin.get_url:
    url: "{{ cuda_repo_url }}"
    dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo"
    # Explicit permissions so the result does not depend on the remote umask.
    mode: "0644"
19 | 19 |
|
# Query dnf for an enabled nvidia-driver module stream. The task never
# reports changed or failed; its output is only registered in
# _cuda_driver_module_enabled.
- name: Check if nvidia driver module is enabled
  ansible.builtin.command:
    cmd: dnf module list --enabled nvidia-driver
  register: _cuda_driver_module_enabled
  changed_when: false
  failed_when: false
|
26 | 25 |
|
# Enable the nvidia-driver module stream named by cuda_nvidia_driver_stream.
# Runs only when the earlier "enabled" query printed
# 'No matching Modules to list' on stderr, and reports a change unless dnf
# answered 'Nothing to do'.
- name: Enable nvidia driver module
  ansible.builtin.command:
    cmd: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}"
  register: _cuda_driver_module_enable
  when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
  changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
|
32 | 31 |
|
# Query dnf for an installed nvidia-driver module. The task never reports
# changed or failed; its output is only registered in
# _cuda_driver_module_installed.
- name: Check if nvidia driver module is installed
  register: _cuda_driver_module_installed
  ansible.builtin.command:
    cmd: dnf module list --installed nvidia-driver
  failed_when: false
  changed_when: false
| 37 | + |
# Install the nvidia-driver module. Runs only when the "installed" query
# printed 'No matching Modules to list' on stderr, and reports a change
# unless dnf answered 'Nothing to do'.
- name: Install nvidia drivers
  ansible.builtin.command:
    cmd: dnf module install -y nvidia-driver
  register: _cuda_driver_install
  when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
  changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
|
38 | 43 |
|
# Fail if the driver install output mentions a 'kernel ' package. The
# trailing space ensures we don't flag e.g. kernel-devel-matched. The
# default() filters keep the expressions valid when no install output was
# recorded (presumably because the install task was skipped).
- name: Check kernel has not been modified
  ansible.builtin.assert:
    that: "'kernel ' not in _cuda_driver_install.stdout | default('')"
    # `| list` materialises the matching lines; `select` on its own yields a
    # lazy generator rather than a printable list.
    fail_msg: >-
      {{ _cuda_driver_install.stdout_lines | default([])
      | select('search', 'kernel ') | list }}
| 48 | + |
# Install the packages listed in cuda_packages, unless cuda_package_version
# is 'none'. The module result is registered as cuda_package_install.
- name: Install cuda packages
  when: cuda_package_version != 'none'
  register: cuda_package_install
  ansible.builtin.dnf:
    name: "{{ cuda_packages }}"
|
43 | 54 |
|
# Ensure /etc/profile.d/sh.local contains a line appending a CUDA bin
# directory to PATH. The shell snippet picks the /usr/local/cuda-* directory
# that sorts last under version ordering (sort -V | tail -1). Skipped when
# cuda_package_version is 'none'.
- name: Add cuda binaries to path
  ansible.builtin.lineinfile:
    path: /etc/profile.d/sh.local
    line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'
  when: cuda_package_version != 'none'
48 | 60 |
|
49 | 61 | - name: Enable NVIDIA Persistence Daemon
|
50 | 62 | systemd:
|
|
# Wait until the host accepts connections, checking every 15 seconds. Runs
# only when the CUDA package install task reported a change.
# NOTE(review): the lines preceding this task are hidden in the diff view;
# confirm the nesting level before merging.
- name: Wait for hosts to be reachable
  ansible.builtin.wait_for_connection:
    sleep: 15
  when: cuda_package_install.changed
0 commit comments