diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 92666a4e6..4c7e28b2b 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -43,8 +43,18 @@ jobs: TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} + STACKHPC_TF_DIR: environments/.stackhpc/tofu steps: - - uses: actions/checkout@v2 + + - name: Find the latest release + run: | + echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" + + - name: Checkout latest release + uses: actions/checkout@v4 + with: + ref: ${{ env.LATEST_RELEASE_TAG }} + fetch-depth: 0 - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -60,9 +70,10 @@ jobs: fi done - - name: Record settings for CI cloud + - name: Record debug info run: | - echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + echo CI_CLOUD: $CI_CLOUD - name: Setup ssh run: | @@ -76,7 +87,7 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - - name: Install ansible etc + - name: Install ansible, pip and galaxy requirements run: dev/setup-env.sh - name: Install OpenTofu @@ -86,7 +97,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu + working-directory: ${{ env.STACKHPC_TF_DIR }} - name: Write clouds.yaml run: | @@ -103,23 +114,23 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Provision nodes using fat image + - name: Provision nodes using latest release image id: provision_servers run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - - name: Configure cluster + - name: Configure cluster at latest release run: | . venv/bin/activate . environments/.stackhpc/activate @@ -127,11 +138,11 @@ jobs: ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Run MPI-based tests + - name: Run MPI-based tests at latest release run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml + ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong # - name: Run EESSI tests # run: | @@ -139,6 +150,54 @@ jobs: # . environments/.stackhpc/activate # ansible-playbook -vv ansible/ci/check_eessi.yml + - name: Checkout current branch + run: git checkout ${{ github.head_ref || github.ref_name }} + + - name: Update ansible, pip and galaxy requirements + run: dev/setup-env.sh + + - name: Reimage login and control nodes to image in current branch + id: reimage_non_compute + run: | + . venv/bin/activate + . environments/.stackhpc/activate + cd $STACKHPC_TF_DIR + tofu init + tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" + + - name: Configure cluster using current branch + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible all -m wait_for_connection + ansible-playbook -v ansible/site.yml + ansible-playbook -v ansible/ci/check_slurm.yml + + - name: Reimage compute nodes to image in current branch using slurm - tests compute-init + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml + ansible-playbook -v ansible/ci/check_slurm.yml + + - name: Check sacct state survived reimage to current branch + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml + + - name: Check MPI-based tests are shown in Grafana + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/ci/check_grafana.yml + + - name: Run MPI-based tests again in current branch + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + - name: Confirm Open Ondemand is up (via SOCKS proxy) run: | . venv/bin/activate @@ -170,43 +229,10 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Test reimage of login and control nodes (via rebuild adhoc) - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml - ansible-playbook -v ansible/site.yml - ansible-playbook -v ansible/ci/check_slurm.yml - - - name: Test compute node reboot and compute-init - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml - ansible-playbook -v ansible/ci/check_slurm.yml - - - name: Check sacct state survived reimage - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml - - - name: Check MPI-based tests are shown in Grafana - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -vv ansible/ci/check_grafana.yml - - name: Delete infrastructure run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu - tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" + cd $STACKHPC_TF_DIR + tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR" if: ${{ success() || cancelled() }} - - # - name: Delete images - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible-playbook -vv ansible/ci/delete_images.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index adece9a3f..93dbd9502 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -84,5 +84,7 @@ roles/* !roles/pytools/** !roles/rebuild/ !roles/rebuild/** +!roles/slurm_tools/ +!roles/slurm_tools/** !roles/gateway/ !roles/gateway/** diff --git a/ansible/ci/check_grafana.yml b/ansible/ci/check_grafana.yml index 216cb1ed9..36fb78b72 100644 --- a/ansible/ci/check_grafana.yml +++ b/ansible/ci/check_grafana.yml @@ -23,4 +23,4 @@ delay: 5 vars: _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}" - _expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh'] + _expected_jobs: ['pingpong.sh'] diff --git a/ansible/ci/check_sacct_hpctests.yml b/ansible/ci/check_sacct_hpctests.yml index 2ed6fda19..1ebbf2171 100644 --- a/ansible/ci/check_sacct_hpctests.yml +++ b/ansible/ci/check_sacct_hpctests.yml @@ -5,10 +5,6 @@ sacct_stdout_expected: |- # based on CI running hpctests as the first job JobID,JobName,State 1,pingpong.sh,COMPLETED - 2,pingmatrix.sh,COMPLETED - 3,hpl-build-linux64.sh,COMPLETED - 4_0,hpl-solo.sh,COMPLETED - 4_1,hpl-solo.sh,COMPLETED tasks: - name: Get info for ended jobs shell: diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index e97b5918d..b09bd7f3b 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -193,7 +193,7 @@ tasks_from: nfs-clients.yml when: - enable_nfs - - nfs_enable.clients | default(item.nfs_enable.clients) | bool + - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) loop: "{{ nfs_configurations }}" - name: Manila mounts diff --git a/ansible/roles/slurm_tools/defaults/main.yml b/ansible/roles/slurm_tools/defaults/main.yml index 39070255c..2e3bd7ddb 100644 --- a/ansible/roles/slurm_tools/defaults/main.yml +++ b/ansible/roles/slurm_tools/defaults/main.yml @@ -1,4 +1,4 @@ --- pytools_editable: false -pytools_gitref: master +pytools_gitref: v2.0 pytools_user: root diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index a56dda976..7e213c00c 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250319-1045-69713f23", - "RL9": "openhpc-RL9-250319-1045-69713f23" + "RL8": "openhpc-RL8-250326-1048-3e132168", + "RL9": "openhpc-RL9-250326-1049-3e132168" } } diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 39c264576..abde7c76e 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -24,17 +24,9 @@ nfs_configurations: # NB: this is stackhpc.nfs role defaults but are set here to prevent being # accidently overriden via default options nfs_export_options: 'rw,secure,root_squash' - # prevent non-cluster IPs mounting the share: - # NB: this is set as default for all shares above but is repeated here - # in case nfs_export_clients is overriden - nfs_export_clients: "{{ _nfs_node_ips }}" - comment: Export /exports/cluster from Slurm control node nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: false nfs_export: "/exports/cluster" - # prevent non-cluster IPs mounting the share: - # NB: this is set as default for all shares above but is repeated here - # in case nfs_export_clients is overriden - nfs_export_clients: "{{ _nfs_node_ips }}"