From 111107ca1d08f6185aeb3a85dfa251e51ec782b3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 12:07:19 +0000 Subject: [PATCH 01/36] use latest release for initial CI cluster setup --- .github/workflows/stackhpc.yml | 53 ++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 871aff155..def6c420a 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -44,7 +44,10 @@ jobs: CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -76,6 +79,14 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash + - name: Find the latest release + run: | + echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + + - name: Checkout latest release + run: git checkout $LATEST_RELEASE_TAG + - name: Install ansible etc run: dev/setup-env.sh @@ -103,7 +114,7 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Provision nodes using fat image + - name: Provision nodes using latest release image id: provision_servers run: | . venv/bin/activate @@ -119,7 +130,7 @@ jobs: tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - - name: Configure cluster + - name: Configure cluster at latest release run: | . venv/bin/activate . environments/.stackhpc/activate @@ -127,7 +138,7 @@ jobs: ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Run MPI-based tests + - name: Run MPI-based tests at latest release run: | . venv/bin/activate . environments/.stackhpc/activate @@ -170,23 +181,33 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Test reimage of login and control nodes (via rebuild adhoc) + - name: Switch to current branch + run: git checkout - + + - name: Reimage login and control nodes to image in current branch + id: reimage_non_compute + run: | + . venv/bin/activate + . environments/.stackhpc/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json + + - name: Configure cluster using current branch run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Test compute node reboot and compute-init + - name: Reimage compute nodes to image in current branch using slurm - tests compute-init run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible-playbook -v ansible/ci/check_slurm.yml ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml + ansible-playbook -v ansible/ci/check_slurm.yml - - name: Check sacct state survived reimage + - name: Check sacct state survived reimage to current branch run: | . venv/bin/activate . environments/.stackhpc/activate @@ -198,6 +219,12 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/ci/check_grafana.yml + - name: Run MPI-based tests again in current branch + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + - name: Delete infrastructure run: | . venv/bin/activate @@ -205,9 +232,3 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/tofu tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} - - # - name: Delete images - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible-playbook -vv ansible/ci/delete_images.yml From 85eafcc54648e5d6fb9343fd0d89a3b9a21c5d36 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 12:20:48 +0000 Subject: [PATCH 02/36] fix changing branches --- .github/workflows/stackhpc.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index def6c420a..cc657143d 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -44,10 +44,16 @@ jobs: CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} steps: - - uses: actions/checkout@v4 + + - name: Find the latest release + run: | + echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + + - name: Checkout latest release + uses: actions/checkout@v4 with: - fetch-depth: 0 - fetch-tags: true + ref: ${{ env.LATEST_RELEASE_TAG }} - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -79,14 +85,6 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - - name: Find the latest release - run: | - echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" - echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - - - name: Checkout latest release - run: git checkout $LATEST_RELEASE_TAG - - name: Install ansible etc run: dev/setup-env.sh @@ -181,9 +179,9 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Switch to current branch - run: git checkout - - + - name: Checkout current branch + uses: actions/checkout@v4 + - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | From f0cd48f5325657e8232cf577168a18e18fd63d09 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 13:46:13 +0000 Subject: [PATCH 03/36] cope with change from stackhpc terraform-> tofu directory --- .github/workflows/stackhpc.yml | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index cc657143d..b83aef2f6 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -48,13 +48,17 @@ jobs: - name: Find the latest release run: | echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" - echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - name: Checkout latest release uses: actions/checkout@v4 with: ref: ${{ env.LATEST_RELEASE_TAG }} + - name: Find stackhpc tofu/terraform directory + # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 + run: | + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu )) >> "$GITHUB_ENV" + - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} run: | @@ -69,9 +73,11 @@ jobs: fi done - - name: Record settings for CI cloud + - name: Record debug info run: | - echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + echo STACKHPC_TF_DIR: $STACKHPC_TF_DIR + echo CI_CLOUD: $CI_CLOUD - name: Setup ssh run: | @@ -95,7 +101,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu + working-directory: ${{ env.STACKHPC_TF_DIR }} - name: Write clouds.yaml run: | @@ -117,14 +123,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -182,12 +188,17 @@ jobs: - name: Checkout current branch uses: actions/checkout@v4 + - name: Find stackhpc tofu/terraform directory + # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 + run: | + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu \) ) >> "$GITHUB_ENV" + - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json - name: Configure cluster using current branch @@ -227,6 +238,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} From 8be90875f4e858e581bf2dadef1fdc12627b159e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 13:52:31 +0000 Subject: [PATCH 04/36] try to get workflow to update --- .github/workflows/stackhpc.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b83aef2f6..97baf9fd4 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -191,7 +191,9 @@ jobs: - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu \) ) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d -name tofu -o -type d -name terraform) >> "$GITHUB_ENV" + # something about GH actions parsing eats \( \) type expressions + - name: Reimage login and control nodes to image in current branch id: reimage_non_compute From bf1ceed770e12b445106f4305eefc175bc50bc17 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 14:06:52 +0000 Subject: [PATCH 05/36] fix finding stackhpc tf directory --- .github/workflows/stackhpc.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 97baf9fd4..773433825 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -57,7 +57,7 @@ jobs: - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu )) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -187,13 +187,11 @@ jobs: - name: Checkout current branch uses: actions/checkout@v4 - + - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d -name tofu -o -type d -name terraform) >> "$GITHUB_ENV" - # something about GH actions parsing eats \( \) type expressions - + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Reimage login and control nodes to image in current branch id: reimage_non_compute From 0c3b0474234cefe0f29b8bf39baf88393646a338 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 14:50:01 +0000 Subject: [PATCH 06/36] update ansible etc when switching branches --- .github/workflows/stackhpc.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 773433825..0047d8484 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -91,7 +91,7 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - - name: Install ansible etc + - name: Install ansible, pip and galaxy requirements run: dev/setup-env.sh - name: Install OpenTofu @@ -188,6 +188,9 @@ jobs: - name: Checkout current branch uses: actions/checkout@v4 + - name: Update ansible, pip and galaxy requirements + run: dev/setup-env.sh + - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | From 4c40546a8ff979ea48083ea305516c29945cd174 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 15:21:38 +0000 Subject: [PATCH 07/36] fixup reimaging --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 0047d8484..7550409f8 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -202,7 +202,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate cd $STACKHPC_TF_DIR - tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json + tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch run: | From cfc14a3834db4b3af3ef283d65f0de1325a19034 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 16:13:03 +0000 Subject: [PATCH 08/36] fix losing tf state on new checkout --- .github/workflows/stackhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 7550409f8..1ff41ea8c 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -53,6 +53,7 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ env.LATEST_RELEASE_TAG }} + fetch-depth: 0 - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 @@ -186,7 +187,7 @@ jobs: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Checkout current branch - uses: actions/checkout@v4 + run: git checkout ${{ github.head_ref || github.ref_name }} - name: Update ansible, pip and galaxy requirements run: dev/setup-env.sh From e14094851888be6092415317fb6a602c87b911a0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 16:51:28 +0000 Subject: [PATCH 09/36] fix STACKHPC_TF_DIR being concatenated not overwritten in GITHUB_ENV --- .github/workflows/stackhpc.yml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 1ff41ea8c..7df726cfc 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -55,10 +55,10 @@ jobs: ref: ${{ env.LATEST_RELEASE_TAG }} fetch-depth: 0 - - name: Find stackhpc tofu/terraform directory + - name: Find stackhpc tofu/terraform directory at latest release # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR_RELEASE=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -77,7 +77,7 @@ jobs: - name: Record debug info run: | echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - echo STACKHPC_TF_DIR: $STACKHPC_TF_DIR + echo STACKHPC_TF_DIR_RELEASE: $STACKHPC_TF_DIR_RELEASE echo CI_CLOUD: $CI_CLOUD - name: Setup ssh @@ -102,7 +102,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ env.STACKHPC_TF_DIR }} + working-directory: ${{ env.STACKHPC_TF_DIR_RELEASE }} - name: Write clouds.yaml run: | @@ -124,14 +124,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_RELEASE tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_RELEASE tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -192,17 +192,18 @@ jobs: - name: Update ansible, pip and galaxy requirements run: dev/setup-env.sh - - name: Find stackhpc tofu/terraform directory + - name: Find stackhpc tofu/terraform directory in current branch # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR_BRANCH=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | + echo STACKHPC_TF_DIR_BRANCH: $STACKHPC_TF_DIR_BRANCH . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_BRANCH tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch @@ -242,6 +243,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_BRANCH tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} From d5043630e154c1f264dfd66df44f45d597011abe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 09:33:05 +0000 Subject: [PATCH 10/36] fix stackhpc tf directory change --- .github/workflows/stackhpc.yml | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 7df726cfc..5e3fba618 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -43,6 +43,10 @@ jobs: TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} + # Once a v1.158 is released including https://github.com/stackhpc/ansible-slurm-appliance/pull/541 + # this can be simplified: + STACKHPC_TF_DIR_OLD: environments/.stackhpc/terraform/ # v1.158 + STACKHPC_TF_DIR_NEW: environments/.stackhpc/tofu/ steps: - name: Find the latest release @@ -55,11 +59,6 @@ jobs: ref: ${{ env.LATEST_RELEASE_TAG }} fetch-depth: 0 - - name: Find stackhpc tofu/terraform directory at latest release - # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 - run: | - echo STACKHPC_TF_DIR_RELEASE=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} run: | @@ -77,7 +76,6 @@ jobs: - name: Record debug info run: | echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - echo STACKHPC_TF_DIR_RELEASE: $STACKHPC_TF_DIR_RELEASE echo CI_CLOUD: $CI_CLOUD - name: Setup ssh @@ -102,7 +100,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ env.STACKHPC_TF_DIR_RELEASE }} + working-directory: ${{ env.STACKHPC_TF_DIR_OLD }} - name: Write clouds.yaml run: | @@ -124,14 +122,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_RELEASE + cd $STACKHPC_TF_DIR_OLD tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_RELEASE + cd $STACKHPC_TF_DIR_OLD tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -192,18 +190,12 @@ jobs: - name: Update ansible, pip and galaxy requirements run: dev/setup-env.sh - - name: Find stackhpc tofu/terraform directory in current branch - # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 - run: | - echo STACKHPC_TF_DIR_BRANCH=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | - echo STACKHPC_TF_DIR_BRANCH: $STACKHPC_TF_DIR_BRANCH . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_BRANCH + cd $STACKHPC_TF_DIR_NEW tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch @@ -243,6 +235,9 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_BRANCH - tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" + cd $STACKHPC_TF_DIR_OLD + tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR_OLD" + cd - + cd $STACKHPC_TF_DIR_NEW + tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR_NEW" if: ${{ success() || cancelled() }} From 8d597f5540f4bcd04c0108ae7db5dcdc45e4a92d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 10:14:46 +0000 Subject: [PATCH 11/36] fix slurm_tools gitignore/version comment --- ansible/.gitignore | 2 ++ ansible/roles/slurm_tools/defaults/main.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/.gitignore b/ansible/.gitignore index 94c094ae6..30f944133 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -82,3 +82,5 @@ roles/* !roles/pytools/** !roles/rebuild/ !roles/rebuild/** +!roles/slurm_tools/ +!roles/slurm_tools/** diff --git a/ansible/roles/slurm_tools/defaults/main.yml b/ansible/roles/slurm_tools/defaults/main.yml index 39070255c..697583514 100644 --- a/ansible/roles/slurm_tools/defaults/main.yml +++ b/ansible/roles/slurm_tools/defaults/main.yml @@ -1,4 +1,4 @@ --- pytools_editable: false -pytools_gitref: master +pytools_gitref: master # TODO: FIXME: do a release! pytools_user: root From 16fa657ddcc78d948e590b4e68458e4aac209ae6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 11:30:44 +0000 Subject: [PATCH 12/36] fixup TF state moving --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 5e3fba618..f825d6184 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -195,6 +195,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate + cp $STACKHPC_TF_DIR_OLD/terraform.tfstate $STACKHPC_TF_DIR_NEW cd $STACKHPC_TF_DIR_NEW tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" From 1355b5fb9cceb84145c362a29176e43fc16f16af Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 12:04:42 +0000 Subject: [PATCH 13/36] re-init tofu after changing branches --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index f825d6184..c9b20318d 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -197,6 +197,7 @@ jobs: . environments/.stackhpc/activate cp $STACKHPC_TF_DIR_OLD/terraform.tfstate $STACKHPC_TF_DIR_NEW cd $STACKHPC_TF_DIR_NEW + tofu init tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch From 91484d2bd2acf97e2af62eeb575ce50f9352d69f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 12:07:19 +0000 Subject: [PATCH 14/36] use latest release for initial CI cluster setup --- .github/workflows/stackhpc.yml | 53 ++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 871aff155..def6c420a 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -44,7 +44,10 @@ jobs: CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -76,6 +79,14 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash + - name: Find the latest release + run: | + echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + + - name: Checkout latest release + run: git checkout $LATEST_RELEASE_TAG + - name: Install ansible etc run: dev/setup-env.sh @@ -103,7 +114,7 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Provision nodes using fat image + - name: Provision nodes using latest release image id: provision_servers run: | . venv/bin/activate @@ -119,7 +130,7 @@ jobs: tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - - name: Configure cluster + - name: Configure cluster at latest release run: | . venv/bin/activate . environments/.stackhpc/activate @@ -127,7 +138,7 @@ jobs: ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Run MPI-based tests + - name: Run MPI-based tests at latest release run: | . venv/bin/activate . environments/.stackhpc/activate @@ -170,23 +181,33 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Test reimage of login and control nodes (via rebuild adhoc) + - name: Switch to current branch + run: git checkout - + + - name: Reimage login and control nodes to image in current branch + id: reimage_non_compute + run: | + . venv/bin/activate + . environments/.stackhpc/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json + + - name: Configure cluster using current branch run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Test compute node reboot and compute-init + - name: Reimage compute nodes to image in current branch using slurm - tests compute-init run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible-playbook -v ansible/ci/check_slurm.yml ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml + ansible-playbook -v ansible/ci/check_slurm.yml - - name: Check sacct state survived reimage + - name: Check sacct state survived reimage to current branch run: | . venv/bin/activate . environments/.stackhpc/activate @@ -198,6 +219,12 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/ci/check_grafana.yml + - name: Run MPI-based tests again in current branch + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + - name: Delete infrastructure run: | . venv/bin/activate @@ -205,9 +232,3 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/tofu tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} - - # - name: Delete images - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible-playbook -vv ansible/ci/delete_images.yml From ca367ce63d3215761a8f5402d2149dfa086d1b2d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 12:20:48 +0000 Subject: [PATCH 15/36] fix changing branches --- .github/workflows/stackhpc.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index def6c420a..cc657143d 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -44,10 +44,16 @@ jobs: CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} steps: - - uses: actions/checkout@v4 + + - name: Find the latest release + run: | + echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + + - name: Checkout latest release + uses: actions/checkout@v4 with: - fetch-depth: 0 - fetch-tags: true + ref: ${{ env.LATEST_RELEASE_TAG }} - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -79,14 +85,6 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - - name: Find the latest release - run: | - echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" - echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - - - name: Checkout latest release - run: git checkout $LATEST_RELEASE_TAG - - name: Install ansible etc run: dev/setup-env.sh @@ -181,9 +179,9 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Switch to current branch - run: git checkout - - + - name: Checkout current branch + uses: actions/checkout@v4 + - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | From 807d5f16832df6d69661d19b79c7088776c6c61b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 13:46:13 +0000 Subject: [PATCH 16/36] cope with change from stackhpc terraform-> tofu directory --- .github/workflows/stackhpc.yml | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index cc657143d..b83aef2f6 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -48,13 +48,17 @@ jobs: - name: Find the latest release run: | echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" - echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - name: Checkout latest release uses: actions/checkout@v4 with: ref: ${{ env.LATEST_RELEASE_TAG }} + - name: Find stackhpc tofu/terraform directory + # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 + run: | + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu )) >> "$GITHUB_ENV" + - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} run: | @@ -69,9 +73,11 @@ jobs: fi done - - name: Record settings for CI cloud + - name: Record debug info run: | - echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG + echo STACKHPC_TF_DIR: $STACKHPC_TF_DIR + echo CI_CLOUD: $CI_CLOUD - name: Setup ssh run: | @@ -95,7 +101,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu + working-directory: ${{ env.STACKHPC_TF_DIR }} - name: Write clouds.yaml run: | @@ -117,14 +123,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -182,12 +188,17 @@ jobs: - name: Checkout current branch uses: actions/checkout@v4 + - name: Find stackhpc tofu/terraform directory + # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 + run: | + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu \) ) >> "$GITHUB_ENV" + - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json - name: Configure cluster using current branch @@ -227,6 +238,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/tofu + cd $STACKHPC_TF_DIR tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} From 3a3859103d10e07caa137ec75e3c1690f8cf23b2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 13:52:31 +0000 Subject: [PATCH 17/36] try to get workflow to update --- .github/workflows/stackhpc.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b83aef2f6..97baf9fd4 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -191,7 +191,9 @@ jobs: - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu \) ) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d -name tofu -o -type d -name terraform) >> "$GITHUB_ENV" + # something about GH actions parsing eats \( \) type expressions + - name: Reimage login and control nodes to image in current branch id: reimage_non_compute From e45fdf0364d7cb1891d0ae2e74f4d878035d6609 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 14:06:52 +0000 Subject: [PATCH 18/36] fix finding stackhpc tf directory --- .github/workflows/stackhpc.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 97baf9fd4..773433825 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -57,7 +57,7 @@ jobs: - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu )) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -187,13 +187,11 @@ jobs: - name: Checkout current branch uses: actions/checkout@v4 - + - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d -name tofu -o -type d -name terraform) >> "$GITHUB_ENV" - # something about GH actions parsing eats \( \) type expressions - + echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Reimage login and control nodes to image in current branch id: reimage_non_compute From 142bcb0fa163ad5d105febda3fb637777e863d69 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 14:50:01 +0000 Subject: [PATCH 19/36] update ansible etc when switching branches --- .github/workflows/stackhpc.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 773433825..0047d8484 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -91,7 +91,7 @@ jobs: run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - - name: Install ansible etc + - name: Install ansible, pip and galaxy requirements run: dev/setup-env.sh - name: Install OpenTofu @@ -188,6 +188,9 @@ jobs: - name: Checkout current branch uses: actions/checkout@v4 + - name: Update ansible, pip and galaxy requirements + run: dev/setup-env.sh + - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | From fddd413c0d1105ea959f90d46afb9e789e5f7970 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 15:21:38 +0000 Subject: [PATCH 20/36] fixup reimaging --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 0047d8484..7550409f8 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -202,7 +202,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate cd $STACKHPC_TF_DIR - tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json + tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch run: | From 032e51d89d10701d7ed02e094ad737815de0fc27 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 16:13:03 +0000 Subject: [PATCH 21/36] fix losing tf state on new checkout --- .github/workflows/stackhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 7550409f8..1ff41ea8c 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -53,6 +53,7 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ env.LATEST_RELEASE_TAG }} + fetch-depth: 0 - name: Find stackhpc tofu/terraform directory # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 @@ -186,7 +187,7 @@ jobs: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Checkout current branch - uses: actions/checkout@v4 + run: git checkout ${{ github.head_ref || github.ref_name }} - name: Update ansible, pip and galaxy requirements run: dev/setup-env.sh From 27b07e94241a3b9747087b63cac9b0b085f24184 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 13 Feb 2025 16:51:28 +0000 Subject: [PATCH 22/36] fix STACKHPC_TF_DIR being concatenated not overwritten in GITHUB_ENV --- .github/workflows/stackhpc.yml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 1ff41ea8c..7df726cfc 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -55,10 +55,10 @@ jobs: ref: ${{ env.LATEST_RELEASE_TAG }} fetch-depth: 0 - - name: Find stackhpc tofu/terraform directory + - name: Find stackhpc tofu/terraform directory at latest release # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR_RELEASE=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} @@ -77,7 +77,7 @@ jobs: - name: Record debug info run: | echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - echo STACKHPC_TF_DIR: $STACKHPC_TF_DIR + echo STACKHPC_TF_DIR_RELEASE: $STACKHPC_TF_DIR_RELEASE echo CI_CLOUD: $CI_CLOUD - name: Setup ssh @@ -102,7 +102,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ env.STACKHPC_TF_DIR }} + working-directory: ${{ env.STACKHPC_TF_DIR_RELEASE }} - name: Write clouds.yaml run: | @@ -124,14 +124,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_RELEASE tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_RELEASE tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -192,17 +192,18 @@ jobs: - name: Update ansible, pip and galaxy requirements run: dev/setup-env.sh - - name: Find stackhpc tofu/terraform directory + - name: Find stackhpc tofu/terraform directory in current branch # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 run: | - echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" + echo STACKHPC_TF_DIR_BRANCH=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | + echo STACKHPC_TF_DIR_BRANCH: $STACKHPC_TF_DIR_BRANCH . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_BRANCH tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch @@ -242,6 +243,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd $STACKHPC_TF_DIR_BRANCH tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} From 0502aa666d814af416411a10242624d714f125ea Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 09:33:05 +0000 Subject: [PATCH 23/36] fix stackhpc tf directory change --- .github/workflows/stackhpc.yml | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 7df726cfc..5e3fba618 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -43,6 +43,10 @@ jobs: TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} + # Once a v1.158 is released including https://github.com/stackhpc/ansible-slurm-appliance/pull/541 + # this can be simplified: + STACKHPC_TF_DIR_OLD: environments/.stackhpc/terraform/ # v1.158 + STACKHPC_TF_DIR_NEW: environments/.stackhpc/tofu/ steps: - name: Find the latest release @@ -55,11 +59,6 @@ jobs: ref: ${{ env.LATEST_RELEASE_TAG }} fetch-depth: 0 - - name: Find stackhpc tofu/terraform directory at latest release - # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 - run: | - echo STACKHPC_TF_DIR_RELEASE=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} run: | @@ -77,7 +76,6 @@ jobs: - name: Record debug info run: | echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - echo STACKHPC_TF_DIR_RELEASE: $STACKHPC_TF_DIR_RELEASE echo CI_CLOUD: $CI_CLOUD - name: Setup ssh @@ -102,7 +100,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ env.STACKHPC_TF_DIR_RELEASE }} + working-directory: ${{ env.STACKHPC_TF_DIR_OLD }} - name: Write clouds.yaml run: | @@ -124,14 +122,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_RELEASE + cd $STACKHPC_TF_DIR_OLD tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_RELEASE + cd $STACKHPC_TF_DIR_OLD tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -192,18 +190,12 @@ jobs: - name: Update ansible, pip and galaxy requirements run: dev/setup-env.sh - - name: Find stackhpc tofu/terraform directory in current branch - # changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541 - run: | - echo STACKHPC_TF_DIR_BRANCH=$(find environments/.stackhpc/ -type d \( -name tofu -o -name terraform \)) >> "$GITHUB_ENV" - - name: Reimage login and control nodes to image in current branch id: reimage_non_compute run: | - echo STACKHPC_TF_DIR_BRANCH: $STACKHPC_TF_DIR_BRANCH . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_BRANCH + cd $STACKHPC_TF_DIR_NEW tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch @@ -243,6 +235,9 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_BRANCH - tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" + cd $STACKHPC_TF_DIR_OLD + tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR_OLD" + cd - + cd $STACKHPC_TF_DIR_NEW + tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR_NEW" if: ${{ success() || cancelled() }} From 7362e596507bee4460a836df41dec66b38d02b8c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 10:14:46 +0000 Subject: [PATCH 24/36] fix slurm_tools gitignore/version comment --- ansible/.gitignore | 2 ++ ansible/roles/slurm_tools/defaults/main.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/.gitignore b/ansible/.gitignore index 20ff5d7b5..c53e3e2c5 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -84,3 +84,5 @@ roles/* !roles/pytools/** !roles/rebuild/ !roles/rebuild/** +!roles/slurm_tools/ +!roles/slurm_tools/** diff --git a/ansible/roles/slurm_tools/defaults/main.yml b/ansible/roles/slurm_tools/defaults/main.yml index 39070255c..697583514 100644 --- a/ansible/roles/slurm_tools/defaults/main.yml +++ b/ansible/roles/slurm_tools/defaults/main.yml @@ -1,4 +1,4 @@ --- pytools_editable: false -pytools_gitref: master +pytools_gitref: master # TODO: FIXME: do a release! pytools_user: root From 59f8ead06acb0e2e377c1e1454ab33c34ab67e53 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 11:30:44 +0000 Subject: [PATCH 25/36] fixup TF state moving --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 5e3fba618..f825d6184 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -195,6 +195,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate + cp $STACKHPC_TF_DIR_OLD/terraform.tfstate $STACKHPC_TF_DIR_NEW cd $STACKHPC_TF_DIR_NEW tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" From 0ac9de5a1ce284bf79dec9ceb1501620345e84f2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Feb 2025 12:04:42 +0000 Subject: [PATCH 26/36] re-init tofu after changing branches --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index f825d6184..c9b20318d 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -197,6 +197,7 @@ jobs: . environments/.stackhpc/activate cp $STACKHPC_TF_DIR_OLD/terraform.tfstate $STACKHPC_TF_DIR_NEW cd $STACKHPC_TF_DIR_NEW + tofu init tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Configure cluster using current branch From 9af56d10c472fd42802c5b88c0119d935b544532 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 18 Feb 2025 09:04:00 +0000 Subject: [PATCH 27/36] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 99bca2f54..ad0502848 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250211-1540-a0b4a57e", - "RL9": "openhpc-RL9-250211-1540-a0b4a57e" + "RL8": "openhpc-RL8-250214-1648-0ac9de5a", + "RL9": "openhpc-RL9-250214-1647-0ac9de5a" } } From ca610f8d516cdc8f6aab9793c8b8d219211d3850 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 20 Mar 2025 13:09:29 +0000 Subject: [PATCH 28/36] simplify TF_DIR path handling --- .github/workflows/stackhpc.yml | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index c9b20318d..fccb41c6f 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -43,10 +43,7 @@ jobs: TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} - # Once a v1.158 is released including https://github.com/stackhpc/ansible-slurm-appliance/pull/541 - # this can be simplified: - STACKHPC_TF_DIR_OLD: environments/.stackhpc/terraform/ # v1.158 - STACKHPC_TF_DIR_NEW: environments/.stackhpc/tofu/ + STACKHPC_TF_DIR: environemnts/.stackhpc/tofu steps: - name: Find the latest release @@ -100,7 +97,7 @@ jobs: - name: Initialise tofu run: tofu init - working-directory: ${{ env.STACKHPC_TF_DIR_OLD }} + working-directory: ${{ env.STACKHPC_TF_DIR }} - name: Write clouds.yaml run: | @@ -122,14 +119,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_OLD + cd $STACKHPC_TF_DIR tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_OLD + cd $STACKHPC_TF_DIR tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -195,8 +192,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cp $STACKHPC_TF_DIR_OLD/terraform.tfstate $STACKHPC_TF_DIR_NEW - cd $STACKHPC_TF_DIR_NEW + cd $STACKHPC_TF_DIR tofu init tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" @@ -237,9 +233,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR_OLD - tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR_OLD" - cd - - cd $STACKHPC_TF_DIR_NEW - tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR_NEW" + cd $STACKHPC_TF_DIR + tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR" if: ${{ success() || cancelled() }} From 5aa9eaa58fb40cd742add599228b9b1209497b05 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 20 Mar 2025 13:26:05 +0000 Subject: [PATCH 29/36] fix typo --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index fccb41c6f..7a00a8349 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -43,7 +43,7 @@ jobs: TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} - STACKHPC_TF_DIR: environemnts/.stackhpc/tofu + STACKHPC_TF_DIR: environments/.stackhpc/tofu steps: - name: Find the latest release From e760db7f9e3fd47dab1e4834d54c4558398bd135 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 21 Mar 2025 16:33:21 +0000 Subject: [PATCH 30/36] nfs_configurations workaround --- ansible/roles/compute_init/files/compute-init.yml | 2 +- environments/common/inventory/group_vars/all/nfs.yml | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index e97b5918d..4f316af2f 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -193,7 +193,7 @@ tasks_from: nfs-clients.yml when: - enable_nfs - - nfs_enable.clients | default(item.nfs_enable.clients) | bool + - nfs_enable.server | bool or ('nfs_enable' in item and item.nfs_enable.server | bool) loop: "{{ nfs_configurations }}" - name: Manila mounts diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 39c264576..abde7c76e 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -24,17 +24,9 @@ nfs_configurations: # NB: this is stackhpc.nfs role defaults but are set here to prevent being # accidently overriden via default options nfs_export_options: 'rw,secure,root_squash' - # prevent non-cluster IPs mounting the share: - # NB: this is set as default for all shares above but is repeated here - # in case nfs_export_clients is overriden - nfs_export_clients: "{{ _nfs_node_ips }}" - comment: Export /exports/cluster from Slurm control node nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: false nfs_export: "/exports/cluster" - # prevent non-cluster IPs mounting the share: - # NB: this is set as default for all shares above but is repeated here - # in case nfs_export_clients is overriden - nfs_export_clients: "{{ _nfs_node_ips }}" From 7be84d3218b4583917c57388a4cf7e86b89d40d3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 25 Mar 2025 13:31:22 +0000 Subject: [PATCH 31/36] bump images --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index a56dda976..2dd5ba198 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250319-1045-69713f23", - "RL9": "openhpc-RL9-250319-1045-69713f23" + "RL8": "openhpc-RL8-250325-1051-9e40ffb3", + "RL9": "openhpc-RL9-250325-1246-9e40ffb3" } } From 3e132168972d8953cac59455c93da4a2d87d2794 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 26 Mar 2025 10:37:52 +0000 Subject: [PATCH 32/36] fix compute-init nfs-clients --- ansible/roles/compute_init/files/compute-init.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 4f316af2f..b09bd7f3b 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -193,7 +193,7 @@ tasks_from: nfs-clients.yml when: - enable_nfs - - nfs_enable.server | bool or ('nfs_enable' in item and item.nfs_enable.server | bool) + - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) loop: "{{ nfs_configurations }}" - name: Manila mounts From 5bc03dcf72da5925cceb57be98e3aa88856db150 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 26 Mar 2025 11:19:57 +0000 Subject: [PATCH 33/36] bump images --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 2dd5ba198..7e213c00c 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250325-1051-9e40ffb3", - "RL9": "openhpc-RL9-250325-1246-9e40ffb3" + "RL8": "openhpc-RL8-250326-1048-3e132168", + "RL9": "openhpc-RL9-250326-1049-3e132168" } } From 6d3ad77614fe8da63dde2917db835427e17c7bae Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 27 Mar 2025 11:36:22 +0000 Subject: [PATCH 34/36] move OOD checks to current branch section + reduce hpctests in release image test --- .github/workflows/stackhpc.yml | 64 ++++++++++----------- ansible/roles/slurm_tools/defaults/main.yml | 2 +- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 7a00a8349..4c7e28b2b 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -142,7 +142,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml + ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong # - name: Run EESSI tests # run: | @@ -150,37 +150,6 @@ jobs: # . environments/.stackhpc/activate # ansible-playbook -vv ansible/ci/check_eessi.yml - - name: Confirm Open Ondemand is up (via SOCKS proxy) - run: | - . venv/bin/activate - . environments/.stackhpc/activate - - # load ansible variables into shell: - ansible-playbook ansible/ci/output_vars.yml \ - -e output_vars_hosts=openondemand \ - -e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ - -e output_vars_items=bastion_ip,bastion_user,openondemand_servername - source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt - - # setup ssh proxying: - sudo apt-get --yes install proxychains - echo proxychains installed - ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} - echo port 9050 forwarded - - # check OOD server returns 200: - statuscode=$(proxychains wget \ - --quiet \ - --spider \ - --server-response \ - --no-check-certificate \ - --http-user=demo_user \ - --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ - 2>&1) - (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) - env: - DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Checkout current branch run: git checkout ${{ github.head_ref || github.ref_name }} @@ -229,6 +198,37 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/adhoc/hpctests.yml + - name: Confirm Open Ondemand is up (via SOCKS proxy) + run: | + . venv/bin/activate + . environments/.stackhpc/activate + + # load ansible variables into shell: + ansible-playbook ansible/ci/output_vars.yml \ + -e output_vars_hosts=openondemand \ + -e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ + -e output_vars_items=bastion_ip,bastion_user,openondemand_servername + source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt + + # setup ssh proxying: + sudo apt-get --yes install proxychains + echo proxychains installed + ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} + echo port 9050 forwarded + + # check OOD server returns 200: + statuscode=$(proxychains wget \ + --quiet \ + --spider \ + --server-response \ + --no-check-certificate \ + --http-user=demo_user \ + --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ + 2>&1) + (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) + env: + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + - name: Delete infrastructure run: | . venv/bin/activate diff --git a/ansible/roles/slurm_tools/defaults/main.yml b/ansible/roles/slurm_tools/defaults/main.yml index 697583514..2e3bd7ddb 100644 --- a/ansible/roles/slurm_tools/defaults/main.yml +++ b/ansible/roles/slurm_tools/defaults/main.yml @@ -1,4 +1,4 @@ --- pytools_editable: false -pytools_gitref: master # TODO: FIXME: do a release! +pytools_gitref: v2.0 pytools_user: root From 98e152616f95a28f7f6f97a5e9ce10961b19e3da Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 27 Mar 2025 11:41:48 +0000 Subject: [PATCH 35/36] change check_grafana to expect just pingpong --- ansible/ci/check_grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/ci/check_grafana.yml b/ansible/ci/check_grafana.yml index 216cb1ed9..36fb78b72 100644 --- a/ansible/ci/check_grafana.yml +++ b/ansible/ci/check_grafana.yml @@ -23,4 +23,4 @@ delay: 5 vars: _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}" - _expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh'] + _expected_jobs: ['pingpong.sh'] From a75a9764db876dd2197a07881204b6cea1f9ef22 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 27 Mar 2025 13:03:49 +0000 Subject: [PATCH 36/36] check sacct for just pingpong --- ansible/ci/check_sacct_hpctests.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ansible/ci/check_sacct_hpctests.yml b/ansible/ci/check_sacct_hpctests.yml index 2ed6fda19..1ebbf2171 100644 --- a/ansible/ci/check_sacct_hpctests.yml +++ b/ansible/ci/check_sacct_hpctests.yml @@ -5,10 +5,6 @@ sacct_stdout_expected: |- # based on CI running hpctests as the first job JobID,JobName,State 1,pingpong.sh,COMPLETED - 2,pingmatrix.sh,COMPLETED - 3,hpl-build-linux64.sh,COMPLETED - 4_0,hpl-solo.sh,COMPLETED - 4_1,hpl-solo.sh,COMPLETED tasks: - name: Get info for ended jobs shell: