diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 7e2fc35b1..59eb1b78e 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,31 +1,26 @@ name: Build fat image -'on': +on: workflow_dispatch: -concurrency: - group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build - cancel-in-progress: true jobs: openstack: name: openstack-imagebuild + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails - matrix: # build RL8, RL9+OFED, RL9+CUDA versions + matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions os_version: - RL8 - RL9 build: - - openstack.openhpc - openstack.openhpc-ofed - openstack.openhpc-cuda exclude: - - os_version: RL8 - build: openstack.openhpc-ofed - os_version: RL8 build: openstack.openhpc-cuda - - os_version: RL9 - build: openstack.openhpc env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 1813eac13..711d24c21 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -2,12 +2,6 @@ name: Test deployment and reimage on OpenStack on: workflow_dispatch: - inputs: - use_RL8: - required: true - description: Include RL8 tests - type: boolean - default: false push: branches: - main @@ -15,27 +9,22 @@ on: jobs: openstack: name: openstack-ci - concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS + cancel-in-progress: true runs-on: ubuntu-22.04 strategy: + fail-fast: false # allow other matrix jobs to continue even if one fails matrix: - os_version: [RL8, RL9] - rl8_selected: - - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch - rl8_branch: - - ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge - rl8_label: - - ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created - exclude: - - os_version: RL8 - rl8_selected: false - rl8_branch: false - rl8_label: false + os_version: + - RL8 + - RL9 env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} + TF_VAR_os_version: ${{ matrix.os_version }} steps: - uses: actions/checkout@v2 @@ -89,8 +78,6 @@ jobs: . environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" - env: - TF_VAR_os_version: ${{ matrix.os_version }} - name: Delete infrastructure if provisioning failed run: | @@ -99,8 +86,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - env: - TF_VAR_os_version: ${{ matrix.os_version }} - name: Configure cluster run: | @@ -199,8 +184,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} - env: - TF_VAR_os_version: ${{ matrix.os_version }} # - name: Delete images # run: | diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e8e2713a5..c43d614db 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -227,24 +227,25 @@ - update tasks: - name: Check for pending reboot from package updates - stat: - path: /var/run/reboot-required + command: + cmd: dnf needs-restarting -r register: update_reboot_required - - debug: - msg: "setstatus:{{ (sestatus.reboot_required | default(false)) }} packages: {{ (update_reboot_required.stat.exists | bool) }}" - - name: Reboot if required from SELinux state change or package upgrades + failed_when: "update_reboot_required.rc not in [0, 1]" + changed_when: false + - name: Reboot to cover SELinux state change or package upgrades reboot: post_reboot_delay: 30 - when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stat.exists | bool) + when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1) - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 - - name: update facts + - name: Clear facts + meta: clear_facts + - name: Update facts setup: - when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false)) - hosts: ofed - gather_facts: no + gather_facts: yes become: yes tags: ofed tasks: diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 7233809bc..0d040b55e 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,7 +1,8 @@ -ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4 +ofed_version: '23.10-3.2.2.0' # LTS ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' +ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. '8' ofed_arch: "{{ ansible_architecture }}" ofed_tmp_dir: /tmp ofed_update_firmware: false diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index 454ef787e..45f341bf9 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -10,11 +10,13 @@ - name: Check current kernel is newest installed assert: - that: _ofed_loaded_kernel.stdout == _ofed_dnf_kernels_newest + that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} _ofed_dnf_kernels_newest: >- - {{ _ofed_dnf_kernels.stdout_lines[1:] | map('regex_replace', '^\w+\.(\w+)\s+(\S+)\s+\S+\s*$', '\2.\1') | community.general.version_sort | last }} + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Enable epel @@ -31,7 +33,7 @@ - name: Install build prerequisites dnf: - name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_version == '8.9' else []) }}" + name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}" when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # don't want to install a load of prereqs unnecessarily diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg index 54a1c2a50..922f086aa 100644 --- a/environments/.caas/ansible.cfg +++ b/environments/.caas/ansible.cfg @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index aa0ec5aaf..26587e33f 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -14,3 +14,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 96e04538b..0b34a4947 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 - RL8: "openhpc-RL8-240904-1509-1687368f" - RL9: "openhpc-ofed-RL9-240904-1509-1687368f" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 + RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" + RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" } } diff --git a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg b/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg index 2a12e06b6..04c1fe143 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg +++ b/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True