# Patch summary (commentary only; placed before the first "diff --git" header).
#
# ansible/roles/cuda/defaults/main.yml
#   - cuda_driver_stream changes from "default" to "560-open".
# ansible/roles/cuda/tasks/main.yml
#   - "Enable nvidia driver module" now runs
#     `dnf module enable -y nvidia-driver:{{ cuda_driver_stream }}`
#     instead of the hard-coded `nvidia-driver:open-dkms` stream.
# ansible/roles/lustre/tasks/install.yml
#   - Removes the `block:` wrapper and its "Remove lustre build prerequisites"
#     task (including the _installed_pkgs/_removed_pkgs/_new_pkgs vars).
#   - "Delete lustre build dir" is re-added as a standalone task; the
#     `when: lustre_build_cleanup | bool` context line is left unchanged.
# ansible/roles/ofed/tasks/install.yml
#   - Adds "Install latest kernel packages" (kernel, kernel-core, kernel-tools,
#     kernel-tools-libs with state: latest) ahead of "Get installed kernels".
#   - Replaces the "Check current kernel is newest installed" assert with an
#     ansible.builtin.reboot task that runs when _ofed_kernel_current differs
#     from _ofed_dnf_kernels_newest.
#   - Adds "Get new running kernel", which re-registers `uname -r` into
#     _ofed_loaded_kernel after the conditional reboot.
#
# NOTE(review): line breaks and leading indentation appear to have been
# collapsed in this copy of the patch, so it presumably will not apply as-is;
# confirm against the original commit before using it.
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 33a25d9b4..193cb6091 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,6 +1,6 @@ cuda_distro: "rhel{{ ansible_distribution_major_version }}" cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" -cuda_driver_stream: default +cuda_driver_stream: 560-open cuda_package_version: 'latest' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/main.yml index 22f8e9e8e..3dbc45268 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/main.yml @@ -25,7 +25,7 @@ register: _cuda_driver_module_enabled - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_driver_stream }}" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml index e0af857cf..0ae3bf6a5 100644 --- a/ansible/roles/lustre/tasks/install.yml +++ b/ansible/roles/lustre/tasks/install.yml @@ -42,29 +42,8 @@ name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" disable_gpg_check: yes -- block: - - name: Remove lustre build prerequisites - # NB Only remove ones this role installed which weren't upgrades - ansible.builtin.dnf: - name: "{{ _new_pkgs }}" - state: absent - vars: - _installed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Installed:') | - map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1') - }} - _removed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 
'Removed:') | - map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1') - }} - _new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}" - - - name: Delete lustre build dir - file: +- name: Delete lustre build dir + file: path: "{{ lustre_build_dir }}" state: absent when: lustre_build_cleanup | bool diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index 45f341bf9..ac3d264c1 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -1,3 +1,13 @@ + +- name: Install latest kernel packages + ansible.builtin.dnf: + name: + - kernel + - kernel-core + - kernel-tools + - kernel-tools-libs + state: latest + - name: Get installed kernels command: dnf list --installed kernel register: _ofed_dnf_kernels @@ -8,16 +18,20 @@ register: _ofed_loaded_kernel changed_when: false -- name: Check current kernel is newest installed - assert: - that: _ofed_kernel_current == _ofed_dnf_kernels_newest - fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" +- name: Reboot into new kernel if not on latest + ansible.builtin.reboot: vars: _ofed_kernel_current: >- {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} _ofed_dnf_kernels_newest: >- {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + when: _ofed_kernel_current != _ofed_dnf_kernels_newest + +- name: Get new running kernel + command: uname -r + register: _ofed_loaded_kernel + changed_when: false - name: Enable epel dnf: