
Commit bf9c7f5

Merge branch 'main' into update/timestamps
2 parents: 99652a6 + 0aec76c

File tree: 30 files changed, +439 -258 lines

.github/workflows/stackhpc.yml

Lines changed: 72 additions & 46 deletions

@@ -43,8 +43,18 @@ jobs:
       TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
       CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
       TF_VAR_os_version: ${{ matrix.os_version }}
+      STACKHPC_TF_DIR: environments/.stackhpc/tofu
     steps:
-      - uses: actions/checkout@v2
+
+      - name: Find the latest release
+        run: |
+          echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"
+
+      - name: Checkout latest release
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.LATEST_RELEASE_TAG }}
+          fetch-depth: 0
 
       - name: Override CI_CLOUD if PR label is present
         if: ${{ github.event_name == 'pull_request' }}
@@ -60,9 +70,10 @@ jobs:
             fi
           done
 
-      - name: Record settings for CI cloud
+      - name: Record debug info
         run: |
-          echo CI_CLOUD: ${{ env.CI_CLOUD }}
+          echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
+          echo CI_CLOUD: $CI_CLOUD
 
       - name: Setup ssh
         run: |
@@ -76,7 +87,7 @@ jobs:
         run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
         shell: bash
 
-      - name: Install ansible etc
+      - name: Install ansible, pip and galaxy requirements
         run: dev/setup-env.sh
 
       - name: Install OpenTofu
@@ -86,7 +97,7 @@ jobs:
 
       - name: Initialise tofu
         run: tofu init
-        working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu
+        working-directory: ${{ env.STACKHPC_TF_DIR }}
 
       - name: Write clouds.yaml
         run: |
@@ -103,42 +114,90 @@ jobs:
         env:
           DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
 
-      - name: Provision nodes using fat image
+      - name: Provision nodes using latest release image
         id: provision_servers
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
+          cd $STACKHPC_TF_DIR
           tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
 
       - name: Delete infrastructure if provisioning failed
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
+          cd $STACKHPC_TF_DIR
           tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
         if: failure() && steps.provision_servers.outcome == 'failure'
 
-      - name: Configure cluster
+      - name: Configure cluster at latest release
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
           ansible all -m wait_for_connection
           ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
-      - name: Run MPI-based tests
+      - name: Run MPI-based tests at latest release
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          ansible-playbook -vv ansible/adhoc/hpctests.yml
+          ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong
 
       # - name: Run EESSI tests
       #   run: |
       #     . venv/bin/activate
      #     . environments/.stackhpc/activate
      #     ansible-playbook -vv ansible/ci/check_eessi.yml
 
+      - name: Checkout current branch
+        run: git checkout ${{ github.head_ref || github.ref_name }}
+
+      - name: Update ansible, pip and galaxy requirements
+        run: dev/setup-env.sh
+
+      - name: Reimage login and control nodes to image in current branch
+        id: reimage_non_compute
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          cd $STACKHPC_TF_DIR
+          tofu init
+          tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
+
+      - name: Configure cluster using current branch
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible all -m wait_for_connection
+          ansible-playbook -v ansible/site.yml
+          ansible-playbook -v ansible/ci/check_slurm.yml
+
+      - name: Reimage compute nodes to image in current branch using slurm - tests compute-init
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
+          ansible-playbook -v ansible/ci/check_slurm.yml
+
+      - name: Check sacct state survived reimage to current branch
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
+
+      - name: Check MPI-based tests are shown in Grafana
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible-playbook -vv ansible/ci/check_grafana.yml
+
+      - name: Run MPI-based tests again in current branch
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible-playbook -vv ansible/adhoc/hpctests.yml
+
       - name: Confirm Open Ondemand is up (via SOCKS proxy)
         run: |
           . venv/bin/activate
@@ -170,43 +229,10 @@ jobs:
         env:
           DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
 
-      - name: Test reimage of login and control nodes (via rebuild adhoc)
-        run: |
-          . venv/bin/activate
-          . environments/.stackhpc/activate
-          ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
-          ansible-playbook -v ansible/site.yml
-          ansible-playbook -v ansible/ci/check_slurm.yml
-
-      - name: Test compute node reboot and compute-init
-        run: |
-          . venv/bin/activate
-          . environments/.stackhpc/activate
-          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
-          ansible-playbook -v ansible/ci/check_slurm.yml
-
-      - name: Check sacct state survived reimage
-        run: |
-          . venv/bin/activate
-          . environments/.stackhpc/activate
-          ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
-
-      - name: Check MPI-based tests are shown in Grafana
-        run: |
-          . venv/bin/activate
-          . environments/.stackhpc/activate
-          ansible-playbook -vv ansible/ci/check_grafana.yml
-
       - name: Delete infrastructure
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
-          tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
+          cd $STACKHPC_TF_DIR
+          tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
         if: ${{ success() || cancelled() }}
-
-      # - name: Delete images
-      #   run: |
-      #     . venv/bin/activate
-      #     . environments/.stackhpc/activate
-      #     ansible-playbook -vv ansible/ci/delete_images.yml
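
The net effect is that the workflow now exercises the upgrade path: it deploys and validates the cluster at the latest release, then checks out the branch under test, reimages and reconfigures, and re-runs the checks. The release-resolution pattern added at the top can be read in isolation; a minimal sketch (assuming, as on GitHub-hosted runners, that curl and jq are available):

    steps:
      # Query the GitHub API for the newest release tag and export it to later steps
      - name: Find the latest release
        run: |
          echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"

      # Check out that tag; fetch-depth: 0 fetches full history and all branches,
      # so the later `git checkout` of the branch under test can succeed
      - name: Checkout latest release
        uses: actions/checkout@v4
        with:
          ref: ${{ env.LATEST_RELEASE_TAG }}
          fetch-depth: 0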

README.md

Lines changed: 2 additions & 1 deletion

@@ -31,7 +31,7 @@ It requires an OpenStack cloud, and an Ansible "deploy host" with access to that
 
 Before starting ensure that:
 - You have root access on the deploy host.
-- You can create instances from the [latest Slurm appliance image](https://github.com/stackhpc/ansible-slurm-appliance/releases), which already contains the required packages. This is built and tested in StackHPC's CI. Although you can use a Rocky Linux 9 GenericCloud instead, it is not recommended.
+- You can create instances from the [latest Slurm appliance image](https://github.com/stackhpc/ansible-slurm-appliance/releases), which already contains the required packages. This is built and tested in StackHPC's CI.
 - You have an SSH keypair defined in OpenStack, with the private key available on the deploy host.
 - Created instances have access to internet (note proxies can be setup through the appliance if necessary).
 - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance).
@@ -49,6 +49,7 @@ These instructions assume the deployment host is running Rocky Linux 8:
     sudo yum install -y git python38
     git clone https://github.com/stackhpc/ansible-slurm-appliance
     cd ansible-slurm-appliance
+    git checkout ${latest-release-tag}
     ./dev/setup-env.sh
 
 You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install/rpm/).

ansible/.gitignore

Lines changed: 2 additions & 0 deletions

@@ -84,5 +84,7 @@ roles/*
 !roles/pytools/**
 !roles/rebuild/
 !roles/rebuild/**
+!roles/slurm_tools/
+!roles/slurm_tools/**
 !roles/gateway/
 !roles/gateway/**

ansible/adhoc/cudatests.yml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 - hosts: cuda
   become: yes
-  gather_facts: no
+  gather_facts: yes
   tags: cuda_samples
   tasks:
     - import_role:

ansible/ci/check_grafana.yml

Lines changed: 1 addition & 1 deletion

@@ -23,4 +23,4 @@
       delay: 5
   vars:
     _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}"
-    _expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh']
+    _expected_jobs: ['pingpong.sh']

ansible/ci/check_sacct_hpctests.yml

Lines changed: 0 additions & 4 deletions

@@ -5,10 +5,6 @@
     sacct_stdout_expected: |- # based on CI running hpctests as the first job
       JobID,JobName,State
       1,pingpong.sh,COMPLETED
-      2,pingmatrix.sh,COMPLETED
-      3,hpl-build-linux64.sh,COMPLETED
-      4_0,hpl-solo.sh,COMPLETED
-      4_1,hpl-solo.sh,COMPLETED
   tasks:
     - name: Get info for ended jobs
       shell:
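
The expected output shrinks because CI now runs hpctests with `--tags pingpong` only. The rest of the playbook is not shown in this diff; as a rough sketch of what such a check amounts to (the sacct flags and the assert below are assumptions, not the playbook's actual contents):

    # Hypothetical sketch only - the real check_sacct_hpctests.yml may differ
    - hosts: login
      gather_facts: no
      vars:
        sacct_stdout_expected: |- # based on CI running hpctests as the first job
          JobID,JobName,State
          1,pingpong.sh,COMPLETED
      tasks:
        - name: Get info for ended jobs
          # --allocations suppresses per-step rows such as "1.batch"
          ansible.builtin.command:
            cmd: sacct --allusers --allocations --format=JobID,JobName,State --parsable2 --delimiter=,
          register: _sacct
          changed_when: false

        - name: Assert accounting data survived the reimage
          ansible.builtin.assert:
            that: sacct_stdout_expected in _sacct.stdout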

ansible/roles/basic_users/README.md

Lines changed: 30 additions & 22 deletions

@@ -11,44 +11,52 @@ without requiring LDAP etc. Features:
 - Login to the control node is prevented (by default).
 - When deleting users, systemd user sessions are terminated first.
 
-> [!IMPORTANT] This role assumes that `$HOME` for users managed by this role
-(e.g. not `rocky` and other system users) is on a shared filesystem. The export
-of this shared filesystem may be root squashed if its server is in the
-`basic_user` group - see configuration examples below.
+> [!IMPORTANT] The defaults for this role assume that `$HOME` for users
+managed by this role (e.g. not `rocky` and other system users) is on a shared
+filesystem. The export of this shared filesystem may be root squashed if its
+server is in the `basic_user` group - see configuration examples below.
 
 Role Variables
 --------------
 
-- `basic_users_users`: Optional, default empty list. A list of mappings defining information for each user. In general, mapping keys/values are passed through as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) and default values are as given there. However:
-  - `create_home` and `generate_ssh_key`: Normally set automatically. Can be
-    set `false` if necessary to disable home directory creation/cluster ssh
-    key creation. Should not be set `true` to avoid trying to modify home
-    directories from multiple nodes simultaneously.
+- `basic_users_homedir_server`: Optional inventory hostname in the `basic_users`
+  group defining the host to use to create home directories. If the home
+  directory export is root squashed, this host *must* be the home directory
+  server. Default is the `control` node which is appropriate for the default
+  appliance configuration. Not relevant if `create_home` is false for all users.
+- `basic_users_homedir_server_path`: Optional path prefix for home directories on
+  the `basic_users_homedir_server`, i.e. on the "server side". Default is
+  `/exports/home` which is appropriate for the default appliance configuration.
+- `basic_users_homedir_client`: Optional inventory hostname in the `basic_users`
+  group defining the host to use to create ssh keys etc. in home directories.
+  This should be a host mounting the home directories. Default is the first
+  node in the `login` group which is appropriate for the default appliance
+  configuration.
+- `basic_users_users`: Optional, default empty list. A list of mappings defining
+  information for each user. In general, mapping keys/values are passed through
+  as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html)
+  and default values are as given there, with the following differences:
+  - `generate_ssh_key`: Default is `true`, and the generated key is added to
+    the user's authorized keys.
   - `ssh_key_comment`: Default is user name.
   - `home`: Set automatically based on the user name and
-    `basic_users_homedir_host_path`. Can be overriden if required for e.g.
-    users with non-standard home directory paths.
+    `basic_users_homedir_server_path`. Can be overridden for users with
+    non-standard home directory paths.
   - `uid`: Should be set, so that the UID/GID is consistent across the cluster
     (which Slurm requires).
   - `shell`: If *not* set will be `/sbin/nologin` on the `control` node to
     prevent users logging in to this node, and the default shell on other
     nodes. Explicitly setting this defines the shell for all nodes and if the
     shared home directories are mounted on the control node will allow the
     user to log in to the control node.
-  - An additional key `public_key` may optionally be specified to define a key to log into the cluster.
-  - An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated.
-  - `ssh_key_type` defaults to `ed25519` instead of the `ansible.builtin.user` default of `rsa`.
+  - `public_key`: Optional, defines a key to log into the cluster with.
+  - `sudo`: Optional, a (possibly multiline) string defining sudo rules for the
+    user.
+  - `ssh_key_type`: Defaults to `ed25519` instead of the `ansible.builtin.user`
+    default of `rsa`.
   - Any other keys may be present for other purposes (i.e. not used by this role).
 - `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there.
 - `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run.
-- `basic_users_homedir_host`: Optional inventory hostname defining the host
-  to use to create home directories. If the home directory export is root
-  squashed, this host *must* be the home directory server. Default is the
-  `control` node which is appropriate for the default appliance configuration.
-  Not relevant if `create_home` is false for all users.
-- `basic_users_homedir_host_path`: Optional path prefix for home directories on
-  the `basic_users_homedir_host`, i.e. on the "server side". Default is
-  `/exports/home` which is appropriate for the default appliance configuration.
 
 Dependencies
 ------------

ansible/roles/basic_users/defaults/main.yml

Lines changed: 3 additions & 3 deletions

@@ -1,6 +1,6 @@
-basic_users_homedir_host: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server
-basic_users_homedir_host_path: /exports/home
-# _basic_users_manage_homedir: "{{ ansible_hostname == basic_users_homedir_host }}"
+basic_users_homedir_server: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server
+basic_users_homedir_server_path: /exports/home
+basic_users_homedir_client: "{{ groups['login'] | first }}"
 basic_users_userdefaults:
   state: present # need this here so don't have to add default() everywhere
   generate_ssh_key: true
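
To make the renamed variables concrete, a hypothetical group_vars snippet combining the defaults above with a user entry (the user name, UID and key are invented for illustration):

    # Hypothetical example - alice, her UID and key are illustrative only
    basic_users_homedir_server: "{{ groups['control'] | first }}" # role default
    basic_users_homedir_client: "{{ groups['login'] | first }}"   # role default
    basic_users_users:
      - name: alice
        uid: 1005 # fixed UID so UID/GID match across the cluster, as Slurm requires
        public_key: "ssh-ed25519 AAAA... alice@example.org" # key used to log in
        sudo: "alice ALL=(ALL) NOPASSWD:ALL" # optional sudo rules for the user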
