Skip to content

Commit 0aec76c

Browse files
authored
Merge pull request #576 from stackhpc/ci/test-compute-init
Test upgrade from latest release to current branch image in CI
2 parents a2bd816 + a75a976 commit 0aec76c

File tree

8 files changed

+79
-63
lines changed

8 files changed

+79
-63
lines changed

.github/workflows/stackhpc.yml

Lines changed: 72 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,18 @@ jobs:
4343
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
4444
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
4545
TF_VAR_os_version: ${{ matrix.os_version }}
46+
STACKHPC_TF_DIR: environments/.stackhpc/tofu
4647
steps:
47-
- uses: actions/checkout@v2
48+
49+
- name: Find the latest release
50+
run: |
51+
echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"
52+
53+
- name: Checkout latest release
54+
uses: actions/checkout@v4
55+
with:
56+
ref: ${{ env.LATEST_RELEASE_TAG }}
57+
fetch-depth: 0
4858

4959
- name: Override CI_CLOUD if PR label is present
5060
if: ${{ github.event_name == 'pull_request' }}
@@ -60,9 +70,10 @@ jobs:
6070
fi
6171
done
6272
63-
- name: Record settings for CI cloud
73+
- name: Record debug info
6474
run: |
65-
echo CI_CLOUD: ${{ env.CI_CLOUD }}
75+
echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
76+
echo CI_CLOUD: $CI_CLOUD
6677
6778
- name: Setup ssh
6879
run: |
@@ -76,7 +87,7 @@ jobs:
7687
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
7788
shell: bash
7889

79-
- name: Install ansible etc
90+
- name: Install ansible, pip and galaxy requirements
8091
run: dev/setup-env.sh
8192

8293
- name: Install OpenTofu
@@ -86,7 +97,7 @@ jobs:
8697

8798
- name: Initialise tofu
8899
run: tofu init
89-
working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu
100+
working-directory: ${{ env.STACKHPC_TF_DIR }}
90101

91102
- name: Write clouds.yaml
92103
run: |
@@ -103,42 +114,90 @@ jobs:
103114
env:
104115
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
105116

106-
- name: Provision nodes using fat image
117+
- name: Provision nodes using latest release image
107118
id: provision_servers
108119
run: |
109120
. venv/bin/activate
110121
. environments/.stackhpc/activate
111-
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
122+
cd $STACKHPC_TF_DIR
112123
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
113124
114125
- name: Delete infrastructure if provisioning failed
115126
run: |
116127
. venv/bin/activate
117128
. environments/.stackhpc/activate
118-
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
129+
cd $STACKHPC_TF_DIR
119130
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
120131
if: failure() && steps.provision_servers.outcome == 'failure'
121132

122-
- name: Configure cluster
133+
- name: Configure cluster at latest release
123134
run: |
124135
. venv/bin/activate
125136
. environments/.stackhpc/activate
126137
ansible all -m wait_for_connection
127138
ansible-playbook -v ansible/site.yml
128139
ansible-playbook -v ansible/ci/check_slurm.yml
129140
130-
- name: Run MPI-based tests
141+
- name: Run MPI-based tests at latest release
131142
run: |
132143
. venv/bin/activate
133144
. environments/.stackhpc/activate
134-
ansible-playbook -vv ansible/adhoc/hpctests.yml
145+
ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong
135146
136147
# - name: Run EESSI tests
137148
# run: |
138149
# . venv/bin/activate
139150
# . environments/.stackhpc/activate
140151
# ansible-playbook -vv ansible/ci/check_eessi.yml
141152

153+
- name: Checkout current branch
154+
run: git checkout ${{ github.head_ref || github.ref_name }}
155+
156+
- name: Update ansible, pip and galaxy requirements
157+
run: dev/setup-env.sh
158+
159+
- name: Reimage login and control nodes to image in current branch
160+
id: reimage_non_compute
161+
run: |
162+
. venv/bin/activate
163+
. environments/.stackhpc/activate
164+
cd $STACKHPC_TF_DIR
165+
tofu init
166+
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
167+
168+
- name: Configure cluster using current branch
169+
run: |
170+
. venv/bin/activate
171+
. environments/.stackhpc/activate
172+
ansible all -m wait_for_connection
173+
ansible-playbook -v ansible/site.yml
174+
ansible-playbook -v ansible/ci/check_slurm.yml
175+
176+
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init
177+
run: |
178+
. venv/bin/activate
179+
. environments/.stackhpc/activate
180+
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
181+
ansible-playbook -v ansible/ci/check_slurm.yml
182+
183+
- name: Check sacct state survived reimage to current branch
184+
run: |
185+
. venv/bin/activate
186+
. environments/.stackhpc/activate
187+
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
188+
189+
- name: Check MPI-based tests are shown in Grafana
190+
run: |
191+
. venv/bin/activate
192+
. environments/.stackhpc/activate
193+
ansible-playbook -vv ansible/ci/check_grafana.yml
194+
195+
- name: Run MPI-based tests again in current branch
196+
run: |
197+
. venv/bin/activate
198+
. environments/.stackhpc/activate
199+
ansible-playbook -vv ansible/adhoc/hpctests.yml
200+
142201
- name: Confirm Open Ondemand is up (via SOCKS proxy)
143202
run: |
144203
. venv/bin/activate
@@ -170,43 +229,10 @@ jobs:
170229
env:
171230
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
172231

173-
- name: Test reimage of login and control nodes (via rebuild adhoc)
174-
run: |
175-
. venv/bin/activate
176-
. environments/.stackhpc/activate
177-
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
178-
ansible-playbook -v ansible/site.yml
179-
ansible-playbook -v ansible/ci/check_slurm.yml
180-
181-
- name: Test compute node reboot and compute-init
182-
run: |
183-
. venv/bin/activate
184-
. environments/.stackhpc/activate
185-
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
186-
ansible-playbook -v ansible/ci/check_slurm.yml
187-
188-
- name: Check sacct state survived reimage
189-
run: |
190-
. venv/bin/activate
191-
. environments/.stackhpc/activate
192-
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
193-
194-
- name: Check MPI-based tests are shown in Grafana
195-
run: |
196-
. venv/bin/activate
197-
. environments/.stackhpc/activate
198-
ansible-playbook -vv ansible/ci/check_grafana.yml
199-
200232
- name: Delete infrastructure
201233
run: |
202234
. venv/bin/activate
203235
. environments/.stackhpc/activate
204-
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
205-
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
236+
cd $STACKHPC_TF_DIR
237+
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
206238
if: ${{ success() || cancelled() }}
207-
208-
# - name: Delete images
209-
# run: |
210-
# . venv/bin/activate
211-
# . environments/.stackhpc/activate
212-
# ansible-playbook -vv ansible/ci/delete_images.yml

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,7 @@ roles/*
8484
!roles/pytools/**
8585
!roles/rebuild/
8686
!roles/rebuild/**
87+
!roles/slurm_tools/
88+
!roles/slurm_tools/**
8789
!roles/gateway/
8890
!roles/gateway/**

ansible/ci/check_grafana.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,4 @@
2323
delay: 5
2424
vars:
2525
_found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}"
26-
_expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh']
26+
_expected_jobs: ['pingpong.sh']

ansible/ci/check_sacct_hpctests.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,6 @@
55
sacct_stdout_expected: |- # based on CI running hpctests as the first job
66
JobID,JobName,State
77
1,pingpong.sh,COMPLETED
8-
2,pingmatrix.sh,COMPLETED
9-
3,hpl-build-linux64.sh,COMPLETED
10-
4_0,hpl-solo.sh,COMPLETED
11-
4_1,hpl-solo.sh,COMPLETED
128
tasks:
139
- name: Get info for ended jobs
1410
shell:

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@
193193
tasks_from: nfs-clients.yml
194194
when:
195195
- enable_nfs
196-
- nfs_enable.clients | default(item.nfs_enable.clients) | bool
196+
- nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool)
197197
loop: "{{ nfs_configurations }}"
198198

199199
- name: Manila mounts
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
---
22
pytools_editable: false
3-
pytools_gitref: master
3+
pytools_gitref: v2.0
44
pytools_user: root
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250319-1045-69713f23",
4-
"RL9": "openhpc-RL9-250319-1045-69713f23"
3+
"RL8": "openhpc-RL8-250326-1048-3e132168",
4+
"RL9": "openhpc-RL9-250326-1049-3e132168"
55
}
66
}

environments/common/inventory/group_vars/all/nfs.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,9 @@ nfs_configurations:
2424
# NB: these are the stackhpc.nfs role defaults but are set here to prevent being
2525
# accidentally overridden via default options
2626
nfs_export_options: 'rw,secure,root_squash'
27-
# prevent non-cluster IPs mounting the share:
28-
# NB: this is set as default for all shares above but is repeated here
29-
# in case nfs_export_clients is overriden
30-
nfs_export_clients: "{{ _nfs_node_ips }}"
3127

3228
- comment: Export /exports/cluster from Slurm control node
3329
nfs_enable:
3430
server: "{{ inventory_hostname in groups['control'] }}"
3531
clients: false
3632
nfs_export: "/exports/cluster"
37-
# prevent non-cluster IPs mounting the share:
38-
# NB: this is set as default for all shares above but is repeated here
39-
# in case nfs_export_clients is overriden
40-
nfs_export_clients: "{{ _nfs_node_ips }}"

0 commit comments

Comments
 (0)