Skip to content
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
111107c
use latest release for initial CI cluster setup
sjpb Feb 13, 2025
85eafcc
fix changing branches
sjpb Feb 13, 2025
f0cd48f
cope with change from stackhpc terraform-> tofu directory
sjpb Feb 13, 2025
8be9087
try to get workflow to update
sjpb Feb 13, 2025
bf1ceed
fix finding stackhpc tf directory
sjpb Feb 13, 2025
0c3b047
update ansible etc when switching branches
sjpb Feb 13, 2025
4c40546
fixup reimaging
sjpb Feb 13, 2025
cfc14a3
fix losing tf state on new checkout
sjpb Feb 13, 2025
e140948
fix STACKHPC_TF_DIR being concatenated not overwritten in GITHUB_ENV
sjpb Feb 13, 2025
d504363
fix stackhpc tf directory change
sjpb Feb 14, 2025
8d597f5
fix slurm_tools gitignore/version comment
sjpb Feb 14, 2025
16fa657
fixup TF state moving
sjpb Feb 14, 2025
1355b5f
re-init tofu after changing branches
sjpb Feb 14, 2025
91484d2
use latest release for initial CI cluster setup
sjpb Feb 13, 2025
ca367ce
fix changing branches
sjpb Feb 13, 2025
807d5f1
cope with change from stackhpc terraform-> tofu directory
sjpb Feb 13, 2025
3a38591
try to get workflow to update
sjpb Feb 13, 2025
e45fdf0
fix finding stackhpc tf directory
sjpb Feb 13, 2025
142bcb0
update ansible etc when switching branches
sjpb Feb 13, 2025
fddd413
fixup reimaging
sjpb Feb 13, 2025
032e51d
fix losing tf state on new checkout
sjpb Feb 13, 2025
27b07e9
fix STACKHPC_TF_DIR being concatenated not overwritten in GITHUB_ENV
sjpb Feb 13, 2025
0502aa6
fix stackhpc tf directory change
sjpb Feb 14, 2025
7362e59
fix slurm_tools gitignore/version comment
sjpb Feb 14, 2025
59f8ead
fixup TF state moving
sjpb Feb 14, 2025
0ac9de5
re-init tofu after changing branches
sjpb Feb 14, 2025
af97440
Merge branch 'ci/test-compute-init' of github.com:stackhpc/ansible-sl…
sjpb Feb 18, 2025
9af56d1
bump CI image
sjpb Feb 18, 2025
9ac9bac
Merge branch 'main' into ci/test-compute-init
bertiethorpe Mar 19, 2025
ca610f8
simplify TF_DIR path handling
bertiethorpe Mar 20, 2025
b13a8da
Merge branch 'main' into ci/test-compute-init
bertiethorpe Mar 20, 2025
5aa9eaa
fix typo
bertiethorpe Mar 20, 2025
e760db7
nfs_configurations workaround
bertiethorpe Mar 21, 2025
bc7540f
Merge branch 'main' into ci/test-compute-init
bertiethorpe Mar 21, 2025
9e40ffb
Merge branch 'main' into ci/test-compute-init
bertiethorpe Mar 25, 2025
7be84d3
bump images
bertiethorpe Mar 25, 2025
3e13216
fix compute-init nfs-clients
bertiethorpe Mar 26, 2025
5bc03dc
bump images
bertiethorpe Mar 26, 2025
6d3ad77
move OOD checks to current branch section + reduce hpctests in releas…
bertiethorpe Mar 27, 2025
98e1526
change check_grafana to expect just pingpong
bertiethorpe Mar 27, 2025
a75a976
check sacct for just pingpong
bertiethorpe Mar 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 48 additions & 22 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,18 @@ jobs:
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
TF_VAR_os_version: ${{ matrix.os_version }}
STACKHPC_TF_DIR: environments/.stackhpc/tofu
steps:
- uses: actions/checkout@v2

- name: Find the latest release
run: |
echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"

- name: Checkout latest release
uses: actions/checkout@v4
with:
ref: ${{ env.LATEST_RELEASE_TAG }}
fetch-depth: 0

- name: Override CI_CLOUD if PR label is present
if: ${{ github.event_name == 'pull_request' }}
Expand All @@ -60,9 +70,10 @@ jobs:
fi
done

- name: Record settings for CI cloud
- name: Record debug info
run: |
echo CI_CLOUD: ${{ env.CI_CLOUD }}
echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
echo CI_CLOUD: $CI_CLOUD

- name: Setup ssh
run: |
Expand All @@ -76,7 +87,7 @@ jobs:
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash

- name: Install ansible etc
- name: Install ansible, pip and galaxy requirements
run: dev/setup-env.sh

- name: Install OpenTofu
Expand All @@ -86,7 +97,7 @@ jobs:

- name: Initialise tofu
run: tofu init
working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu
working-directory: ${{ env.STACKHPC_TF_DIR }}

- name: Write clouds.yaml
run: |
Expand All @@ -103,31 +114,31 @@ jobs:
env:
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Provision nodes using fat image
- name: Provision nodes using latest release image
id: provision_servers
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
cd $STACKHPC_TF_DIR
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"

- name: Delete infrastructure if provisioning failed
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
cd $STACKHPC_TF_DIR
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'

- name: Configure cluster
- name: Configure cluster at latest release
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml

- name: Run MPI-based tests
- name: Run MPI-based tests at latest release
run: |
. venv/bin/activate
. environments/.stackhpc/activate
Expand Down Expand Up @@ -170,22 +181,37 @@ jobs:
env:
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

- name: Test reimage of login and control nodes (via rebuild adhoc)
- name: Checkout current branch
run: git checkout ${{ github.head_ref || github.ref_name }}

- name: Update ansible, pip and galaxy requirements
run: dev/setup-env.sh

- name: Reimage login and control nodes to image in current branch
id: reimage_non_compute
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
cd $STACKHPC_TF_DIR
tofu init
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"

- name: Configure cluster using current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml

- name: Test compute node reboot and compute-init
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
ansible-playbook -v ansible/ci/check_slurm.yml

- name: Check sacct state survived reimage
- name: Check sacct state survived reimage to current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
Expand All @@ -197,16 +223,16 @@ jobs:
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml

- name: Run MPI-based tests again in current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml

- name: Delete infrastructure
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
cd $STACKHPC_TF_DIR
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
if: ${{ success() || cancelled() }}

# - name: Delete images
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/delete_images.yml
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,7 @@ roles/*
!roles/pytools/**
!roles/rebuild/
!roles/rebuild/**
!roles/slurm_tools/
!roles/slurm_tools/**
!roles/gateway/
!roles/gateway/**
2 changes: 1 addition & 1 deletion ansible/roles/compute_init/files/compute-init.yml
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@
tasks_from: nfs-clients.yml
when:
- enable_nfs
- nfs_enable.clients | default(item.nfs_enable.clients) | bool
- nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool)
loop: "{{ nfs_configurations }}"

- name: Manila mounts
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/slurm_tools/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
---
pytools_editable: false
pytools_gitref: master
pytools_gitref: master # TODO: FIXME: do a release!
pytools_user: root
4 changes: 2 additions & 2 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"cluster_image": {
"RL8": "openhpc-RL8-250319-1045-69713f23",
"RL9": "openhpc-RL9-250319-1045-69713f23"
"RL8": "openhpc-RL8-250326-1048-3e132168",
"RL9": "openhpc-RL9-250326-1049-3e132168"
}
}
8 changes: 0 additions & 8 deletions environments/common/inventory/group_vars/all/nfs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,9 @@ nfs_configurations:
# NB: this is stackhpc.nfs role defaults but are set here to prevent being
# accidentally overridden via default options
nfs_export_options: 'rw,secure,root_squash'
# prevent non-cluster IPs mounting the share:
# NB: this is set as default for all shares above but is repeated here
# in case nfs_export_clients is overridden
nfs_export_clients: "{{ _nfs_node_ips }}"

- comment: Export /exports/cluster from Slurm control node
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
clients: false
nfs_export: "/exports/cluster"
# prevent non-cluster IPs mounting the share:
# NB: this is set as default for all shares above but is repeated here
# in case nfs_export_clients is overridden
nfs_export_clients: "{{ _nfs_node_ips }}"