From 1be331f10b584df900b06d38a50ee4fc437635d2 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 14 Aug 2024 15:26:44 +0100 Subject: [PATCH 01/17] Check major version for RL8 package installs --- ansible/roles/ofed/defaults/main.yml | 1 + ansible/roles/ofed/tasks/install.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 7233809bc..62b484a75 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -2,6 +2,7 @@ ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9. ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' +ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. 
'8' ofed_arch: "{{ ansible_architecture }}" ofed_tmp_dir: /tmp ofed_update_firmware: false diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index 454ef787e..c0500c9fb 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -31,7 +31,7 @@ - name: Install build prerequisites dnf: - name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_version == '8.9' else []) }}" + name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}" when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # don't want to install a load of prereqs unnecessarily From 1a15e4c65c1b62d1ed6bf1165b4ad8fd9dbfa438 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 14 Aug 2024 15:27:00 +0100 Subject: [PATCH 02/17] Gather facts on ofed role --- ansible/bootstrap.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e8e2713a5..cb51dbea4 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -244,7 +244,7 @@ when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false)) - hosts: ofed - gather_facts: no + gather_facts: yes become: yes tags: ofed tasks: From de270bf75d1bc454484f46f91d82a8c82540277b Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 14 Aug 2024 15:29:53 +0100 Subject: [PATCH 03/17] Support kernel checks with mismatching version length 4.18.0-553.16.1.el8_9.x86_64 4.18.0-553.el8_9.x86_64 These would fail with the error: '<' not supported between instances of 'str' and 'int', as the community.general.version_sort filter was trying to compare the `el8_9` of the latter with the `16` of the former. Strip the last two chunks so we just compare numbers. 
--- ansible/roles/ofed/tasks/install.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index c0500c9fb..45f341bf9 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -10,11 +10,13 @@ - name: Check current kernel is newest installed assert: - that: _ofed_loaded_kernel.stdout == _ofed_dnf_kernels_newest + that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} _ofed_dnf_kernels_newest: >- - {{ _ofed_dnf_kernels.stdout_lines[1:] | map('regex_replace', '^\w+\.(\w+)\s+(\S+)\s+\S+\s*$', '\2.\1') | community.general.version_sort | last }} + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} # dnf line format e.g. 
"kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Enable epel From d98f2862cff68adc048961245afe6004ad1e6a21 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 14 Aug 2024 15:30:05 +0100 Subject: [PATCH 04/17] Move to LTS version now RL9.4 is supported --- ansible/roles/ofed/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 62b484a75..0d040b55e 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,4 +1,4 @@ -ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4 +ofed_version: '23.10-3.2.2.0' # LTS ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' From f594d37d1a4b26eda7374f08843471f381118c2b Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 14 Aug 2024 16:12:08 +0100 Subject: [PATCH 05/17] Fail when any inventory source cannot be parsed --- .../skeleton/{{cookiecutter.environment}}/ansible.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg b/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg index 2a12e06b6..04c1fe143 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg +++ b/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. 
+any_unparsed_is_failed = True From 809e3f86053745807d607089ef8098d2dfe0629c Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 21 Aug 2024 13:41:17 +0100 Subject: [PATCH 06/17] Always reboot after selinux and package updates --- ansible/bootstrap.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index cb51dbea4..286d18a00 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -226,16 +226,9 @@ - selinux - update tasks: - - name: Check for pending reboot from package updates - stat: - path: /var/run/reboot-required - register: update_reboot_required - - debug: - msg: "setstatus:{{ (sestatus.reboot_required | default(false)) }} packages: {{ (update_reboot_required.stat.exists | bool) }}" - - name: Reboot if required from SELinux state change or package upgrades + - name: Reboot to cover SELinux state change or package upgrades reboot: post_reboot_delay: 30 - when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stat.exists | bool) - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 From 374c42c4a1afb032f1cf6fdb671ab6df29815a42 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 21 Aug 2024 13:41:38 +0100 Subject: [PATCH 07/17] Clear facts before OFED so install will match newest kernel --- ansible/bootstrap.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 286d18a00..5bbb1a3f7 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -232,9 +232,13 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 - - name: update facts - setup: - when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false)) + +- hosts: ofed + gather_facts: false + tags: ofed + tasks: + - name: refresh facts + meta: clear_facts - hosts: ofed gather_facts: yes From afc76778f5e89b0944a62b65485986fe9a7ecdc2 Mon Sep 17 00:00:00 2001 
From: Matt Crees Date: Wed, 21 Aug 2024 13:41:38 +0100 Subject: [PATCH 08/17] Clear facts after reboot so OFED install will match newest kernel --- ansible/bootstrap.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 5bbb1a3f7..0bee023f1 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -232,13 +232,10 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 - -- hosts: ofed - gather_facts: false - tags: ofed - tasks: - - name: refresh facts + - name: Clear facts meta: clear_facts + - name: Update facts + setup: - hosts: ofed gather_facts: yes From fe42ccfc383f25c893a2995be23858c55e485623 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 09:02:58 +0000 Subject: [PATCH 09/17] fail caas and stackhpc if any inventory can't be read --- environments/.caas/ansible.cfg | 4 ++++ environments/.stackhpc/ansible.cfg | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg index 54a1c2a50..922f086aa 100644 --- a/environments/.caas/ansible.cfg +++ b/environments/.caas/ansible.cfg @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. 
+any_unparsed_is_failed = True diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index aa0ec5aaf..26587e33f 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -14,3 +14,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True From e84cc0b641dfad8f3d61af1a5034adb2bc2b6309 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 09:10:03 +0000 Subject: [PATCH 10/17] make reboot conditional on package or SELinux changes again --- ansible/bootstrap.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 0bee023f1..003c675b9 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -226,9 +226,14 @@ - selinux - update tasks: + - name: Check for pending reboot from package updates + command: + cmd: dnf needs-restarting + register: update_reboot_required - name: Reboot to cover SELinux state change or package upgrades reboot: post_reboot_delay: 30 + when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stdout != '') - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 From b4fb7899253ef13267502bc9bd8263835f4c69ec Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 09:22:29 +0000 Subject: [PATCH 11/17] include OFED in both RL8 and RL9 builds --- .github/workflows/fatimage.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 7e2fc35b1..6fd69f9e3 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -11,21 +11,16 @@ jobs: runs-on: ubuntu-22.04 
strategy: fail-fast: false # allow other matrix jobs to continue even if one fails - matrix: # build RL8, RL9+OFED, RL9+CUDA versions + matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions os_version: - RL8 - RL9 build: - - openstack.openhpc - openstack.openhpc-ofed - openstack.openhpc-cuda exclude: - - os_version: RL8 - build: openstack.openhpc-ofed - os_version: RL8 build: openstack.openhpc-cuda - - os_version: RL9 - build: openstack.openhpc env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack From 908d8decab61e0777370903b7d951af8ba91c5f1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 09:27:55 +0000 Subject: [PATCH 12/17] always run CI tests on RL8 and RL9 --- .github/workflows/stackhpc.yml | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 1813eac13..43b6a25b7 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -2,12 +2,6 @@ name: Test deployment and reimage on OpenStack on: workflow_dispatch: - inputs: - use_RL8: - required: true - description: Include RL8 tests - type: boolean - default: false push: branches: - main @@ -20,22 +14,12 @@ jobs: strategy: matrix: os_version: [RL8, RL9] - rl8_selected: - - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch - rl8_branch: - - ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge - rl8_label: - - ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created - exclude: - - os_version: RL8 - rl8_selected: false - rl8_branch: false - rl8_label: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} + TF_VAR_os_version: ${{ matrix.os_version }} steps: - uses: actions/checkout@v2 @@ -89,8 +73,6 @@ jobs: . 
environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" - env: - TF_VAR_os_version: ${{ matrix.os_version }} - name: Delete infrastructure if provisioning failed run: | @@ -99,8 +81,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - env: - TF_VAR_os_version: ${{ matrix.os_version }} - name: Configure cluster run: | @@ -199,8 +179,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} - env: - TF_VAR_os_version: ${{ matrix.os_version }} # - name: Delete images # run: | From 53baab2912925e30a6471fc418061951f0248b43 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 09:33:33 +0000 Subject: [PATCH 13/17] allow concurrent RL8/RL9 CI tests --- .github/workflows/fatimage.yml | 2 +- .github/workflows/stackhpc.yml | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 6fd69f9e3..70655120b 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -3,7 +3,7 @@ name: Build fat image 'on': workflow_dispatch: concurrency: - group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build cancel-in-progress: true jobs: openstack: diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 43b6a25b7..c33dc14b4 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -9,11 +9,15 @@ on: jobs: openstack: name: openstack-ci - concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + concurrency: + group: ${{ 
github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS + cancel-in-progress: true runs-on: ubuntu-22.04 strategy: matrix: - os_version: [RL8, RL9] + os_version: + - RL8 + - RL9 env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack From a84331166b7a95dbd11d11152c338ba0978a2df6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 10:35:12 +0000 Subject: [PATCH 14/17] mark pending reboot check as not a change --- ansible/bootstrap.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 003c675b9..b8e41ec99 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -230,6 +230,7 @@ command: cmd: dnf needs-restarting register: update_reboot_required + changed_when: false - name: Reboot to cover SELinux state change or package upgrades reboot: post_reboot_delay: 30 From 32568dbbeef0a606c0ae2be835f338c45e2dca17 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 10:39:37 +0000 Subject: [PATCH 15/17] fix workflow matrix definitions --- .github/workflows/fatimage.yml | 8 ++++---- .github/workflows/stackhpc.yml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 70655120b..59eb1b78e 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,13 +1,13 @@ name: Build fat image -'on': +on: workflow_dispatch: -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build - cancel-in-progress: true jobs: openstack: name: openstack-imagebuild + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 
c33dc14b4..711d24c21 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -14,6 +14,7 @@ jobs: cancel-in-progress: true runs-on: ubuntu-22.04 strategy: + fail-fast: false # allow other matrix jobs to continue even if one fails matrix: os_version: - RL8 From 17b2b42e2a2c18b25cd3e2e763fd81b8fde2328f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 12:39:06 +0000 Subject: [PATCH 16/17] bump CI images - now both OFED --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 96e04538b..0b34a4947 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 - RL8: "openhpc-RL8-240904-1509-1687368f" - RL9: "openhpc-ofed-RL9-240904-1509-1687368f" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 + RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" + RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" } } From 76b4eadc13a4029471d9839d14bc240840a30c13 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 6 Sep 2024 15:08:54 +0000 Subject: [PATCH 17/17] use reboot hint for checking reboot required --- ansible/bootstrap.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index b8e41ec99..c43d614db 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -228,13 +228,14 @@ tasks: - name: Check for pending reboot from package updates command: - cmd: dnf needs-restarting + cmd: dnf needs-restarting -r register: update_reboot_required + failed_when: "update_reboot_required.rc not in [0, 1]" changed_when: false - name: Reboot to cover 
SELinux state change or package upgrades reboot: post_reboot_delay: 30 - when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stdout != '') + when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1) - name: Wait for hosts to be reachable wait_for_connection: sleep: 15