Skip to content

Commit d6b4513

Browse files
authored
Merge branch 'main' into temp/try-image-build-without-volume
2 parents 2a2bd38 + e0bdcd7 commit d6b4513

File tree

14 files changed

+249
-138
lines changed

14 files changed

+249
-138
lines changed

.github/workflows/stackhpc.yml

Lines changed: 72 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,18 @@ jobs:
4343
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
4444
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
4545
TF_VAR_os_version: ${{ matrix.os_version }}
46+
STACKHPC_TF_DIR: environments/.stackhpc/tofu
4647
steps:
47-
- uses: actions/checkout@v2
48+
49+
- name: Find the latest release
  # Resolves the tag of the newest published release and exports it as
  # LATEST_RELEASE_TAG for later steps (used as the checkout ref).
  run: |
    # The default `run` shell has no pipefail, and a failure inside $(...) passed
    # directly to echo is ignored - so fail explicitly instead of exporting "null".
    set -eo pipefail
    # -f: non-zero exit on HTTP errors (e.g. API rate limiting); -e: jq fails on null
    tag=$(curl -fsS https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -er .tag_name)
    echo "LATEST_RELEASE_TAG=${tag}" >> "$GITHUB_ENV"
52+
53+
- name: Checkout latest release
54+
uses: actions/checkout@v4
55+
with:
56+
ref: ${{ env.LATEST_RELEASE_TAG }}
57+
fetch-depth: 0
4858

4959
- name: Override CI_CLOUD if PR label is present
5060
if: ${{ github.event_name == 'pull_request' }}
@@ -60,9 +70,10 @@ jobs:
6070
fi
6171
done
6272
63-
- name: Record settings for CI cloud
73+
- name: Record debug info
6474
run: |
65-
echo CI_CLOUD: ${{ env.CI_CLOUD }}
75+
echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
76+
echo CI_CLOUD: $CI_CLOUD
6677
6778
- name: Setup ssh
6879
run: |
@@ -76,7 +87,7 @@ jobs:
7687
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
7788
shell: bash
7889

79-
- name: Install ansible etc
90+
- name: Install ansible, pip and galaxy requirements
8091
run: dev/setup-env.sh
8192

8293
- name: Install OpenTofu
@@ -86,7 +97,7 @@ jobs:
8697

8798
- name: Initialise tofu
8899
run: tofu init
89-
working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu
100+
working-directory: ${{ env.STACKHPC_TF_DIR }}
90101

91102
- name: Write clouds.yaml
92103
run: |
@@ -103,42 +114,90 @@ jobs:
103114
env:
104115
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
105116

106-
- name: Provision nodes using fat image
117+
- name: Provision nodes using latest release image
107118
id: provision_servers
108119
run: |
109120
. venv/bin/activate
110121
. environments/.stackhpc/activate
111-
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
122+
cd $STACKHPC_TF_DIR
112123
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
113124
114125
- name: Delete infrastructure if provisioning failed
115126
run: |
116127
. venv/bin/activate
117128
. environments/.stackhpc/activate
118-
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
129+
cd $STACKHPC_TF_DIR
119130
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
120131
if: failure() && steps.provision_servers.outcome == 'failure'
121132

122-
- name: Configure cluster
133+
- name: Configure cluster at latest release
123134
run: |
124135
. venv/bin/activate
125136
. environments/.stackhpc/activate
126137
ansible all -m wait_for_connection
127138
ansible-playbook -v ansible/site.yml
128139
ansible-playbook -v ansible/ci/check_slurm.yml
129140
130-
- name: Run MPI-based tests
141+
- name: Run MPI-based tests at latest release
131142
run: |
132143
. venv/bin/activate
133144
. environments/.stackhpc/activate
134-
ansible-playbook -vv ansible/adhoc/hpctests.yml
145+
ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong
135146
136147
# - name: Run EESSI tests
137148
# run: |
138149
# . venv/bin/activate
139150
# . environments/.stackhpc/activate
140151
# ansible-playbook -vv ansible/ci/check_eessi.yml
141152

153+
- name: Checkout current branch
  # Switch the working tree from the release tag to the branch under test.
  # The ref is passed through an environment variable rather than interpolated
  # into the script: on pull requests the branch name is user-controlled, and
  # direct ${{ }} expansion inside `run` would allow shell injection.
  run: git checkout "$CURRENT_REF"
  env:
    CURRENT_REF: ${{ github.head_ref || github.ref_name }}
155+
156+
- name: Update ansible, pip and galaxy requirements
157+
run: dev/setup-env.sh
158+
159+
- name: Reimage login and control nodes to image in current branch
160+
id: reimage_non_compute
161+
run: |
162+
. venv/bin/activate
163+
. environments/.stackhpc/activate
164+
cd $STACKHPC_TF_DIR
165+
tofu init
166+
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
167+
168+
- name: Configure cluster using current branch
169+
run: |
170+
. venv/bin/activate
171+
. environments/.stackhpc/activate
172+
ansible all -m wait_for_connection
173+
ansible-playbook -v ansible/site.yml
174+
ansible-playbook -v ansible/ci/check_slurm.yml
175+
176+
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init
177+
run: |
178+
. venv/bin/activate
179+
. environments/.stackhpc/activate
180+
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
181+
ansible-playbook -v ansible/ci/check_slurm.yml
182+
183+
- name: Check sacct state survived reimage to current branch
184+
run: |
185+
. venv/bin/activate
186+
. environments/.stackhpc/activate
187+
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
188+
189+
- name: Check MPI-based tests are shown in Grafana
190+
run: |
191+
. venv/bin/activate
192+
. environments/.stackhpc/activate
193+
ansible-playbook -vv ansible/ci/check_grafana.yml
194+
195+
- name: Run MPI-based tests again in current branch
196+
run: |
197+
. venv/bin/activate
198+
. environments/.stackhpc/activate
199+
ansible-playbook -vv ansible/adhoc/hpctests.yml
200+
142201
- name: Confirm Open Ondemand is up (via SOCKS proxy)
143202
run: |
144203
. venv/bin/activate
@@ -170,43 +229,10 @@ jobs:
170229
env:
171230
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
172231

173-
- name: Test reimage of login and control nodes (via rebuild adhoc)
174-
run: |
175-
. venv/bin/activate
176-
. environments/.stackhpc/activate
177-
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
178-
ansible-playbook -v ansible/site.yml
179-
ansible-playbook -v ansible/ci/check_slurm.yml
180-
181-
- name: Test compute node reboot and compute-init
182-
run: |
183-
. venv/bin/activate
184-
. environments/.stackhpc/activate
185-
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
186-
ansible-playbook -v ansible/ci/check_slurm.yml
187-
188-
- name: Check sacct state survived reimage
189-
run: |
190-
. venv/bin/activate
191-
. environments/.stackhpc/activate
192-
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
193-
194-
- name: Check MPI-based tests are shown in Grafana
195-
run: |
196-
. venv/bin/activate
197-
. environments/.stackhpc/activate
198-
ansible-playbook -vv ansible/ci/check_grafana.yml
199-
200232
- name: Delete infrastructure
201233
run: |
202234
. venv/bin/activate
203235
. environments/.stackhpc/activate
204-
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
205-
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
236+
cd $STACKHPC_TF_DIR
237+
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
206238
if: ${{ success() || cancelled() }}
207-
208-
# - name: Delete images
209-
# run: |
210-
# . venv/bin/activate
211-
# . environments/.stackhpc/activate
212-
# ansible-playbook -vv ansible/ci/delete_images.yml

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,7 @@ roles/*
8484
!roles/pytools/**
8585
!roles/rebuild/
8686
!roles/rebuild/**
87+
!roles/slurm_tools/
88+
!roles/slurm_tools/**
8789
!roles/gateway/
8890
!roles/gateway/**

ansible/ci/check_grafana.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,4 @@
2323
delay: 5
2424
vars:
2525
_found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}"
26-
_expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh']
26+
_expected_jobs: ['pingpong.sh']

ansible/ci/check_sacct_hpctests.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,6 @@
55
sacct_stdout_expected: |- # based on CI running hpctests as the first job
66
JobID,JobName,State
77
1,pingpong.sh,COMPLETED
8-
2,pingmatrix.sh,COMPLETED
9-
3,hpl-build-linux64.sh,COMPLETED
10-
4_0,hpl-solo.sh,COMPLETED
11-
4_1,hpl-solo.sh,COMPLETED
128
tasks:
139
- name: Get info for ended jobs
1410
shell:

ansible/ci/update_timestamps.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
---
# Refresh the pinned repo snapshot timestamps: ask the latest_timestamps module
# for the newest snapshot of every entry in appliances_pulp_repos, then rewrite
# the common environment's timestamps.yml with the returned dictionary.
- hosts: localhost
  tasks:
    - name: Get latest timestamps from sources
      latest_timestamps:
        repos_dict: "{{ appliances_pulp_repos }}"
        content_url: "https://ark.stackhpc.com/pulp/content"
      register: _latest

    - name: Overwrite repo timestamps with latest
      vars:
        # Wrap the module output under the variable name the inventory expects
        _timestamps_doc:
          appliances_pulp_repos: "{{ _latest.timestamps }}"
      ansible.builtin.copy:
        dest: "{{ appliances_repository_root }}/environments/common/inventory/group_vars/all/timestamps.yml"
        content: "{{ _timestamps_doc | to_nice_yaml(indent=2) }}"
        backup: true
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
__metaclass__ = type
2+
3+
DOCUMENTATION = r'''
---
module: latest_timestamps
short_description: Gets the latest set of snapshots from Pulp
version_added: "1.0.0"
description:
    - Looks up the most recent snapshot timestamp for every repo/version entry in C(repos_dict) by reading the listing at C(content_url)/<path>.
    - Returns the updated dictionary, which can be used to replace C(appliances_pulp_repos).
options:
    repos_dict:
        description:
            - Dictionary keyed by repo, then by version.
            - Each innermost entry must contain a C(path) key, relative to C(content_url).
            - The C(timestamp) key of each innermost entry is set to the latest snapshot found.
        type: dict
        required: true
    content_url:
        description: Base URL of the content server; C(/) and the entry's C(path) are appended to it.
        type: str
        required: true
requirements:
    - requests
    - beautifulsoup4
author:
    - William Tripp
    - Steve Brasier
'''
13+
14+
EXAMPLES = r'''
- name: Get latest timestamps
  latest_timestamps:
    repos_dict: "{{ appliances_pulp_repos }}"
    content_url: "https://ark.stackhpc.com/pulp/content"
  register: result

- name: Show the repo definitions with updated timestamps
  ansible.builtin.debug:
    var: result.timestamps
'''
21+
22+
RETURN = r'''
timestamps:
    description: Copy of C(repos_dict), sorted by repo name, with the C(timestamp) of every repo/version entry set to the latest snapshot found
    type: dict
    returned: always
'''
32+
33+
from ansible.module_utils.basic import AnsibleModule
34+
import requests
35+
from bs4 import BeautifulSoup
36+
37+
def run_module():
    """Find the newest snapshot timestamp for every repo/version in ``repos_dict``.

    For each ``repos_dict[repo][version]`` entry, the page at
    ``content_url + '/' + entry['path']`` is fetched and the text of the last
    element inside its ``<pre>`` listing (minus the trailing ``/``) is stored
    as that entry's ``timestamp``.

    Exits through ``module.exit_json`` with the updated dictionary, sorted by
    repo name, under the ``timestamps`` key. Exits through ``module.fail_json``
    if a listing cannot be fetched or contains no entries.
    """
    module_args = dict(
        repos_dict=dict(type='dict', required=True),
        content_url=dict(type='str', required=True)
    )

    result = dict(
        changed=False,
        original_message='',
        message=''
    )

    module = AnsibleModule(
        argument_spec=module_args,
        supports_check_mode=True
    )

    # Copy down to the per-version dicts so module.params is not mutated in place.
    timestamps = {
        repo: {version: dict(info) for version, info in versions.items()}
        for repo, versions in module.params['repos_dict'].items()
    }

    for repo, versions in timestamps.items():
        for version, info in versions.items():
            url = module.params['content_url'] + '/' + info['path']

            try:
                # Without a timeout an unresponsive server would hang the play forever.
                response = requests.get(url=url, timeout=30)
                # Don't try to parse an HTTP error page as a snapshot listing.
                response.raise_for_status()
            except requests.RequestException as exc:
                module.fail_json(
                    msg="Failed to fetch snapshot listing %s: %s" % (url, exc),
                    **result
                )

            soup = BeautifulSoup(response.text, features="html.parser")
            listing = soup.body.find('pre') if soup.body else None
            # Text of each element in the listing; elements without a single
            # string child have .string == None and are skipped.
            names = [tag.string for tag in listing.find_all() if tag.string] if listing else []
            if not names:
                module.fail_json(
                    msg="No snapshot timestamps found at %s (repo %s, version %s)" % (url, repo, version),
                    **result
                )

            # Last entry in the listing is taken as the latest; drop its trailing '/'.
            info['timestamp'] = names[-1].rstrip('/')

    result['timestamps'] = dict(sorted(timestamps.items()))

    module.exit_json(**result)
69+
70+
71+
def main():
    """Module entry point; all work is delegated to run_module()."""
    run_module()


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,13 @@
6161
owner: slurm
6262
group: root
6363
mode: u=rX,g=rwX,o=
64-
64+
65+
- name: Wait for NFS to reachable (checks host network up)
66+
ansible.builtin.wait_for:
67+
port: 2049
68+
host: '{{ server_node_ip }}'
69+
timeout: 120
70+
6571
- name: Mount /mnt/cluster
6672
mount:
6773
path: /mnt/cluster
@@ -70,8 +76,6 @@
7076
opts: ro,sync
7177
state: mounted
7278
register: _mount_mnt_cluster
73-
ignore_errors: true
74-
# TODO: add some retries here?
7579

7680
- block:
7781
- name: Report skipping initialization if cannot mount nfs
@@ -193,7 +197,7 @@
193197
tasks_from: nfs-clients.yml
194198
when:
195199
- enable_nfs
196-
- nfs_enable.clients | default(item.nfs_enable.clients) | bool
200+
- nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool)
197201
loop: "{{ nfs_configurations }}"
198202

199203
- name: Manila mounts
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
---
22
pytools_editable: false
3-
pytools_gitref: master
3+
pytools_gitref: v2.0
44
pytools_user: root

environments/.stackhpc/ansible.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ inventory = ../common/inventory,inventory
1010
collections_path = ../../ansible/collections
1111
roles_path = ../../ansible/roles
1212
filter_plugins = ../../ansible/filter_plugins
13+
library = ../../ansible/library
1314

1415
[ssh_connection]
1516
ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250319-1045-69713f23",
4-
"RL9": "openhpc-RL9-250319-1045-69713f23"
3+
"RL8": "openhpc-RL8-250401-1100-9a3cffdb",
4+
"RL9": "openhpc-RL9-250401-1100-9a3cffdb"
55
}
66
}

0 commit comments

Comments
 (0)