From c5db876ef9f3a2372a972122a26743216b3c9369 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 10 Dec 2024 17:18:40 +0000
Subject: [PATCH] WIP: move to DOCA

---
Review notes (text between the '---' line and the first 'diff --git' line is
not part of the commit):

* Cleanup DOCA build directories: the path slice was [:2]. For a find result
  such as /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-<...>.rpm, split('/') yields
  ['', 'tmp', 'DOCA.WVMchs2QWo', ...], so [:2] joined back to '/tmp' and the
  task would have removed /tmp itself. It is now [:3], i.e. the DOCA.* build
  directory only.
* Find generated doca-kernel-repo: the -name pattern is now quoted so it is
  passed to find verbatim instead of being subject to shell expansion.
* Both fixes are in-line edits of '+' lines, so the hunk line counts and the
  diffstat below are unchanged. The post-image blob hash on the install.yml
  'index' line no longer corresponds to the edited content.
* NOTE(review): the DOCA repo is configured with gpgcheck: false and the
  generated repo RPM is installed with disable_gpg_check: true - confirm
  this is intended before dropping the WIP tag.
* NOTE(review): 'dnf makecache' has no changed_when and 'openibd restart'
  now runs unconditionally, so both report changed on every run - confirm.
* NOTE(review): this patch text was rebuilt from a copy whose line breaks
  had been collapsed; runs of spaces inside lines and exact indentation are
  reconstructed from the YAML structure and may need 'git apply
  --ignore-whitespace'.

 ansible/roles/ofed/defaults/main.yml          |  32 +----
 .../roles/ofed/tasks/install-kernel-devel.yml |  24 ++++
 ansible/roles/ofed/tasks/install.yml          | 110 +++++++-----------
 3 files changed, 71 insertions(+), 95 deletions(-)
 create mode 100644 ansible/roles/ofed/tasks/install-kernel-devel.yml

diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml
index 0d040b55e..60136905b 100644
--- a/ansible/roles/ofed/defaults/main.yml
+++ b/ansible/roles/ofed/defaults/main.yml
@@ -1,29 +1,3 @@
-ofed_version: '23.10-3.2.2.0' # LTS
-ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
-ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
-ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
-ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. '8'
-ofed_arch: "{{ ansible_architecture }}"
-ofed_tmp_dir: /tmp
-ofed_update_firmware: false
-ofed_build_packages: # may require additional packages depending on ofed_package_selection
-  - autoconf
-  - automake
-  - gcc
-  - gcc-gfortran
-  - kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}
-  - kernel-rpm-macros
-  - libtool
-  - lsof
-  - patch
-  - pciutils
-  - perl
-  - rpm-build
-  - tcl
-  - tk
-ofed_build_rl8_packages:
-  - gdb-headless
-  - python36
-ofed_package_selection: # list of package selection flags for mlnxofedinstall script
-  - hpc
-  - with-nfsrdma
+doca_version: '2.9.1'
+doca_profile: doca-ofed
+doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/"
diff --git a/ansible/roles/ofed/tasks/install-kernel-devel.yml b/ansible/roles/ofed/tasks/install-kernel-devel.yml
new file mode 100644
index 000000000..6a1943a32
--- /dev/null
+++ b/ansible/roles/ofed/tasks/install-kernel-devel.yml
@@ -0,0 +1,24 @@
+- name: Get installed kernels
+  command: dnf list --installed kernel
+  register: _ofed_dnf_kernels
+  changed_when: false
+
+- name: Determine running kernel
+  command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64
+  register: _ofed_loaded_kernel
+  changed_when: false
+
+- name: Check current kernel is newest installed
+  assert:
+    that: _ofed_kernel_current == _ofed_dnf_kernels_newest
+    fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
+  vars:
+    _ofed_kernel_current: >-
+      {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
+    _ofed_dnf_kernels_newest: >-
+      {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
+    # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "
+
+- name: Install matching kernel-devel package
+  dnf:
+    name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}"
diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml
index 45f341bf9..9d297e946 100644
--- a/ansible/roles/ofed/tasks/install.yml
+++ b/ansible/roles/ofed/tasks/install.yml
@@ -1,75 +1,53 @@
-- name: Get installed kernels
-  command: dnf list --installed kernel
-  register: _ofed_dnf_kernels
+- import_tasks: install-kernel-devel.yml
+
+- name: Install DOCA repo
+  ansible.builtin.yum_repository:
+    name: doca
+    file: doca
+    description: DOCA Online Repo
+    baseurl: "{{ doca_repo_url }}"
+    enabled: true
+    gpgcheck: false
+
+- name: Install doca-extra package
+  ansible.builtin.dnf:
+    name: doca-extra
+
+- name: Build DOCA kernel modules
+  ansible.builtin.shell:
+    cmd: /opt/mellanox/doca/tools/doca-kernel-support
+  register: _doca_kernel_build
+
+
+- name: Find generated doca-kernel-repo
+  ansible.builtin.shell: "find /tmp/DOCA.* -name 'doca-kernel-repo-*'"
+  register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm
   changed_when: false
 
-- name: Determine running kernel
-  command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64
-  register: _ofed_loaded_kernel
-  changed_when: false
-
-- name: Check current kernel is newest installed
-  assert:
-    that: _ofed_kernel_current == _ofed_dnf_kernels_newest
-    fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
-  vars:
-    _ofed_kernel_current: >-
-      {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
-    _ofed_dnf_kernels_newest: >-
-      {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
-    # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "
-
-- name: Enable epel
-  dnf:
-    name: epel-release
-
-- name: Check for existing OFED installation
-  command: ofed_info
-  changed_when: false
-  failed_when:
-    - _ofed_info.rc > 0
-    - "'No such file or directory' not in _ofed_info.msg"
-  register: _ofed_info
+- name: Create dnf cache
+  ansible.builtin.command: dnf makecache
 
-- name: Install build prerequisites
-  dnf:
-    name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}"
-  when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
-  # don't want to install a load of prereqs unnecessarily
+- name: Install DOCA repository package
+  ansible.builtin.dnf:
+    name: "{{ _doca_kernel_repo.stdout }}"
+    disable_gpg_check: true
 
-- name: Download and unpack Mellanox OFED tarball
-  ansible.builtin.unarchive:
-    src: "{{ ofed_download_url }}"
-    dest: "{{ ofed_tmp_dir }}"
-    remote_src: yes
-  become: no
-  when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
+- name: Install DOCA packages
+  ansible.builtin.dnf:
+    name: "{{ doca_profile }}"
 
-# Below from https://docs.nvidia.com/networking/display/mlnxofedv24010331/user+manual
-- name: Run OFED install script
-  command:
-    cmd: >
-      ./mlnxofedinstall
-      --add-kernel-support
-      {% if not ofed_update_firmware %}--without-fw-update{% endif %}
-      --force
-      --skip-repo
-      {% for pkgsel in ofed_package_selection %}
-      --{{ pkgsel }}
-      {% endfor %}
-    chdir: "{{ ofed_tmp_dir }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}/"
-  register: _ofed_install
-  when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
-  async: "{{ 45 * 60 }}" # wait for up to 45 minutes
-  poll: 15 # check every 15 seconds
+- name: Cleanup DOCA build directories
+  ansible.builtin.file:
+    state: absent
+    path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # first 3 parts = /tmp/DOCA.<suffix>; 2 would be /tmp itself
 
 - name: Update initramfs
-  command:
-    cmd: dracut -f
-  when: '"update your initramfs" in _ofed_install.stdout | default("")'
-  failed_when: false # always shows errors due to deleted modules for inbox RDMA drivers
+  ansible.builtin.command:
+    cmd: dracut -f --tmpdir /var/tmp
+  environment:
+    TMPDIR: /var/tmp
+  register: _doca_dracut
+  failed_when: _doca_dracut.stderr != '' # appears rc is always 0
 
 - name: Load the new driver
-  command:
-    cmd: /etc/init.d/openibd restart
-  when: '"To load the new driver" in _ofed_install.stdout | default("")'
+  ansible.builtin.command: /etc/init.d/openibd restart