
Commit a21bf18

Merge branch 'main' into fix-readme-tofu
2 parents 01d9aa4 + 9a8f123 commit a21bf18

File tree

27 files changed (+315, -89 lines)


.github/workflows/stackhpc.yml

Lines changed: 2 additions & 1 deletion
@@ -178,12 +178,13 @@ jobs:
           ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
-      - name: Test reimage of compute nodes and compute-init (via rebuild adhoc)
+      - name: Test compute node reboot and compute-init
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
           ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
+          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
 
       - name: Check sacct state survived reimage
         run: |

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -80,3 +80,5 @@ roles/*
 !roles/slurm_stats/**
 !roles/pytools/
 !roles/pytools/**
+!roles/rebuild/
+!roles/rebuild/**

ansible/adhoc/reboot_via_slurm.yml

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Reboot compute nodes via slurm. Nodes will be rebuilt if `image_id` in inventory is different to the currently-provisioned image.
+# Example:
+#   ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
+
+- hosts: login
+  run_once: true
+  become: yes
+  gather_facts: no
+  tasks:
+    - name: Submit a Slurm job to reboot compute nodes
+      ansible.builtin.shell: |
+        set -e
+        srun --reboot -N 2 uptime
+      become_user: root
+      register: slurm_result
+      failed_when: slurm_result.rc != 0
+
+    - name: Fetch Slurm controller logs if reboot fails
+      ansible.builtin.shell: |
+        journalctl -u slurmctld --since "10 minutes ago" | tail -n 50
+      become_user: root
+      register: slurm_logs
+      when: slurm_result.rc != 0
+      delegate_to: "{{ groups['control'] | first }}"
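
Note that `srun --reboot` only takes effect where Slurm has a `RebootProgram` configured for the compute nodes (presumably provided here by the `rebuild` role newly un-ignored in `ansible/.gitignore` above); on other configurations the flag is silently ignored. A minimal follow-up check, not part of this commit and with illustrative task names, might look like:

```yaml
# Sketch only: fail if any compute nodes are still down or drained after the
# reboot job submitted by reboot_via_slurm.yml has returned.
- hosts: login
  run_once: true
  gather_facts: no
  tasks:
    - name: List nodes that are not back in service
      ansible.builtin.command: sinfo --noheader --states=down,drain --format=%N
      register: _unhealthy_nodes
      changed_when: false
      failed_when: _unhealthy_nodes.stdout != ''
```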

ansible/bootstrap.yml

Lines changed: 3 additions & 1 deletion
@@ -126,7 +126,9 @@
       ansible.builtin.assert:
         that: dnf_repos_password is undefined
         fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password'
-      when: appliances_mode == 'configure'
+      when:
+        - appliances_mode == 'configure'
+        - not (dnf_repos_allow_insecure_creds | default(false)) # useful for development
 
 - hosts: squid
   tags: squid
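
The new `dnf_repos_allow_insecure_creds` flag gives development environments a way to opt out of this assert. A hedged example of setting it, with an illustrative file path:

```yaml
# environments/<env>/inventory/group_vars/all/overrides.yml  (path is illustrative)
# Development only: allow dnf_repos_password to remain templated into repofiles
# during the configure phase instead of failing the assert above.
dnf_repos_allow_insecure_creds: true
```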

ansible/roles/compute_init/README.md

Lines changed: 108 additions & 24 deletions
@@ -1,11 +1,109 @@
-# EXPERIMENTAL: compute-init
-
-Experimental / in-progress functionality to allow compute nodes to rejoin the
-cluster after a reboot.
-
-To enable this add compute nodes (or a subset of them into) the `compute_init`
-group.
-
+# EXPERIMENTAL: compute_init
+
+Experimental functionality to allow compute nodes to rejoin the cluster after
+a reboot without running the `ansible/site.yml` playbook.
+
+To enable this:
+1. Add the `compute` group (or a subset) into the `compute_init` group. This is
+   the default when using cookiecutter to create an environment, via the
+   "everything" template.
+2. Build an image which includes the `compute_init` group. This is the case
+   for StackHPC-built release images.
+3. Enable the required functionalities during boot, by setting the
+   `compute_init_enable` property for a compute group in the OpenTofu `compute`
+   variable to a list which includes "compute", plus the other
+   roles/functionalities required, e.g.:
+
+   ```terraform
+   ...
+   compute = {
+       general = {
+           nodes = ["general-0", "general-1"]
+           compute_init_enable = ["compute", ... ] # see below
+       }
+   }
+   ...
+   ```
+
+## Supported appliance functionalities
+
+In the table below, if a role is marked as supported then its functionality
+can be enabled during boot by adding the role name to the `compute_init_enable`
+property described above. If a role is marked as requiring a custom image then
+it also requires an image build with the role name added to the
+[Packer inventory_groups variable](../../../docs/image-build.md).
+
+| Playbook                 | Role (or functionality)       | Support                         | Custom image reqd.? |
+|--------------------------|-------------------------------|---------------------------------|---------------------|
+| hooks/pre.yml            | ?                             | None at present                 | n/a                 |
+| validate.yml             | n/a                           | Not relevant during boot        | n/a                 |
+| bootstrap.yml            | (wait for ansible-init)       | Not relevant during boot        | n/a                 |
+| bootstrap.yml            | resolv_conf                   | Fully supported                 | No                  |
+| bootstrap.yml            | etc_hosts                     | Fully supported                 | No                  |
+| bootstrap.yml            | proxy                         | None at present                 | No                  |
+| bootstrap.yml            | (/etc permissions)            | None required - use image build | No                  |
+| bootstrap.yml            | (ssh /home fix)               | None required - use image build | No                  |
+| bootstrap.yml            | (system users)                | None required - use image build | No                  |
+| bootstrap.yml            | systemd                       | None required - use image build | No                  |
+| bootstrap.yml            | selinux                       | None required - use image build | Maybe [1]           |
+| bootstrap.yml            | sshd                          | None at present                 | No                  |
+| bootstrap.yml            | dnf_repos                     | None at present [2]             | -                   |
+| bootstrap.yml            | squid                         | Not relevant for compute nodes  | n/a                 |
+| bootstrap.yml            | tuned                         | Fully supported                 | No                  |
+| bootstrap.yml            | freeipa_server                | Not relevant for compute nodes  | n/a                 |
+| bootstrap.yml            | cockpit                       | None required - use image build | No                  |
+| bootstrap.yml            | firewalld                     | Not relevant for compute nodes  | n/a                 |
+| bootstrap.yml            | fail2ban                      | Not relevant for compute nodes  | n/a                 |
+| bootstrap.yml            | podman                        | Not relevant for compute nodes  | n/a                 |
+| bootstrap.yml            | update                        | Not relevant during boot        | n/a                 |
+| bootstrap.yml            | reboot                        | Not relevant for compute nodes  | n/a                 |
+| bootstrap.yml            | ofed                          | Not relevant during boot        | Yes                 |
+| bootstrap.yml            | ansible_init (install)        | Not relevant during boot        | n/a                 |
+| bootstrap.yml            | k3s (install)                 | Not relevant during boot        | n/a                 |
+| hooks/post-bootstrap.yml | ?                             | None at present                 | n/a                 |
+| iam.yml                  | freeipa_client                | None at present [3]             | Yes                 |
+| iam.yml                  | freeipa_server                | Not relevant for compute nodes  | n/a                 |
+| iam.yml                  | sssd                          | None at present                 | No                  |
+| filesystems.yml          | block_devices                 | None required - role deprecated | n/a                 |
+| filesystems.yml          | nfs                           | All client functionality        | No                  |
+| filesystems.yml          | manila                        | All functionality               | No [4]              |
+| filesystems.yml          | lustre                        | None at present                 | Yes                 |
+| extras.yml               | basic_users                   | All functionality [5]           | No                  |
+| extras.yml               | eessi                         | All functionality [6]           | No                  |
+| extras.yml               | cuda                          | None required - use image build | Yes [7]             |
+| extras.yml               | persist_hostkeys              | Not relevant for compute nodes  | n/a                 |
+| extras.yml               | compute_init (export)         | Not relevant for compute nodes  | n/a                 |
+| extras.yml               | k9s (install)                 | Not relevant during boot        | n/a                 |
+| extras.yml               | extra_packages                | None at present [8]             | -                   |
+| slurm.yml                | mysql                         | Not relevant for compute nodes  | n/a                 |
+| slurm.yml                | rebuild                       | Not relevant for compute nodes  | n/a                 |
+| slurm.yml                | openhpc [9]                   | All slurmd functionality        | No                  |
+| slurm.yml                | (set memory limits)           | None at present                 | -                   |
+| slurm.yml                | (block ssh)                   | None at present                 | -                   |
+| portal.yml               | (openondemand server)         | Not relevant for compute nodes  | n/a                 |
+| portal.yml               | (openondemand vnc desktop)    | None required - use image build | No                  |
+| portal.yml               | (openondemand jupyter server) | None required - use image build | No                  |
+| monitoring.yml           | node_exporter                 | None required - use image build | No                  |
+| monitoring.yml           | (other monitoring)            | Not relevant for compute nodes  | -                   |
+| disable-repos.yml        | dnf_repos                     | None at present [2]             | -                   |
+| hooks/post.yml           | ?                             | None at present                 | -                   |
+
+Notes:
+1. `selinux` is set to disabled in StackHPC images.
+2. Requirement for this functionality is TBD.
+3. FreeIPA client functionality would be better provided using a client fork
+   which uses pkinit keys rather than OTP to re-enrol nodes.
+4. Assuming default Ceph client version.
+5. Assumes home directory already exists on shared storage.
+6. Assumes `cvmfs_config` is the same on control node and all compute nodes.
+7. If the `cuda` role was run during build, nvidia-persistenced is enabled
+   and will start during boot.
+8. Would require `dnf_repos`.
+9. `openhpc` does not need to be added to `compute_init_enable`; it is
+   automatically enabled by adding `compute`.
+
+## Approach
 This works as follows:
 1. During image build, an ansible-init playbook and supporting files
    (e.g. templates, filters, etc) are installed.
@@ -31,21 +129,7 @@ The check in 4b. above is what prevents the compute-init script from trying
 to configure the node before the services on the control node are available
 (which requires running the site.yml playbook).
 
-The following roles/groups are currently fully functional:
-- `resolv_conf`: all functionality
-- `etc_hosts`: all functionality
-- `nfs`: client functionality only
-- `manila`: all functionality
-- `basic_users`: all functionality, assumes home directory already exists on
-  shared storage
-- `eessi`: all functionality, assumes `cvmfs_config` is the same on control
-  node and all compute nodes.
-- `openhpc`: all functionality
-
-The above may be enabled by setting the compute_init_enable property on the
-tofu compute variable.
-
-# Development/debugging
+## Development/debugging
 
 To develop/debug changes to the compute script without actually having to build
 a new image:
@@ -83,7 +167,7 @@ reimage the compute node(s) first as in step 2 and/or add additional metadata
 as in step 3.
 
 
-# Design notes
+## Design notes
 - Duplicating code in roles into the `compute-init` script is unfortunate, but
   does allow developing this functionality without wider changes to the
   appliance.

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 34 additions & 2 deletions
@@ -9,6 +9,7 @@
     enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}"
     enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}"
     enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}"
+    enable_tuned: "{{ os_metadata.meta.tuned | default(false) | bool }}"
     enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}"
     enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}"
     enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
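
Each `enable_*` flag above is driven by OpenStack instance metadata, which the OpenTofu `compute_init_enable` list populates. A hedged illustration of the metadata shape these lookups assume (key names inferred from the pattern above; values may arrive as strings, which `| bool` normalises):

```yaml
# Illustrative only: for a node created with
#   compute_init_enable = ["compute", "tuned", "nfs"]
# the metadata read by ansible-init would contain something like:
os_metadata:
  meta:
    compute: true
    tuned: true
    nfs: true
# so enable_compute, enable_tuned and enable_nfs evaluate to true, while the
# remaining enable_* flags fall back to their default(false).
```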
@@ -17,6 +18,12 @@
     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
     resolv_conf_nameservers: []
 
+    tuned_profile_baremetal: hpc-compute
+    tuned_profile_vm: virtual-guest
+    tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}"
+    tuned_enabled: true
+    tuned_started: true
+
     nfs_client_mnt_point: "/mnt"
     nfs_client_mnt_options:
     nfs_client_mnt_state: mounted
@@ -59,9 +66,9 @@
       file:
         path: /mnt/cluster
         state: directory
-        owner: root
+        owner: slurm
         group: root
-        mode: u=rwX,go= # is sensitive
+        mode: u=rX,g=rwX,o=
 
     - name: Mount /mnt/cluster
       mount:
@@ -125,6 +132,10 @@
         mode: 0644
       when: enable_etc_hosts
 
+    - name: Configure tuned
+      include_tasks: tasks/tuned.yml
+      when: enable_tuned
+
     # NFS client mount
     - name: If nfs-clients is present
       include_tasks: tasks/nfs-clients.yml
@@ -276,6 +287,27 @@
         enabled: true
         state: started
 
+    - name: Set locked memory limits on user-facing nodes
+      lineinfile:
+        path: /etc/security/limits.conf
+        regexp: '\* soft memlock unlimited'
+        line: "* soft memlock unlimited"
+
+    - name: Configure sshd pam module
+      blockinfile:
+        path: /etc/pam.d/sshd
+        insertafter: 'account\s+required\s+pam_nologin.so'
+        block: |
+          account    sufficient    pam_access.so
+          account    required      pam_slurm.so
+
+    - name: Configure login access control
+      blockinfile:
+        path: /etc/security/access.conf
+        block: |
+          +:adm:ALL
+          -:ALL:ALL
+
     - name: Ensure node is resumed
       # TODO: consider if this is always safe for all job states?
       command: scontrol update state=resume nodename={{ ansible_hostname }}
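
Taken together, the three added tasks appear to reproduce the `slurm.yml` "(block ssh)" behaviour on a rebooted node: `pam_access` (sufficient) admits members of `adm`, `pam_slurm` (required) then rejects users with no running job on the node, and the `access.conf` rules deny everyone else. A quick hedged check that the PAM lines landed, not part of the commit, could be:

```yaml
# Sketch only: confirm the sshd account stack now contains the two modules
# inserted by the blockinfile task above.
- name: Check pam_access and pam_slurm are present in /etc/pam.d/sshd
  ansible.builtin.command: grep -E 'pam_(access|slurm)\.so' /etc/pam.d/sshd
  changed_when: false
```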

ansible/roles/compute_init/tasks/export.yml

Lines changed: 11 additions & 5 deletions
@@ -2,9 +2,9 @@
   file:
     path: /exports/cluster
     state: directory
-    owner: root
+    owner: slurm
     group: root
-    mode: u=rwX,go=
+    mode: u=rX,g=rwX,o=
   run_once: true
   delegate_to: "{{ groups['control'] | first }}"
 
@@ -23,21 +23,27 @@
   file:
     path: /exports/cluster/hostvars/{{ inventory_hostname }}/
     state: directory
-    mode: u=rwX,go=
-    # TODO: owner,mode,etc
+    owner: slurm
+    group: root
+    mode: u=rX,g=rwX,o=
   delegate_to: "{{ groups['control'] | first }}"
 
 - name: Template out hostvars
   template:
     src: hostvars.yml.j2
     dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml
-    mode: u=rw,go=
+    owner: slurm
+    group: root
+    mode: u=r,g=rw,o=
   delegate_to: "{{ groups['control'] | first }}"
 
 - name: Copy manila share info to /exports/cluster
   copy:
     content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}"
     dest: /exports/cluster/manila_share_info.yml
+    owner: root
+    group: root
+    mode: u=rw,g=r
   run_once: true
   delegate_to: "{{ groups['control'] | first }}"
   when: os_manila_mount_share_info is defined

ansible/roles/compute_init/tasks/install.yml

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@
       dest: files/NetworkManager-dns-none.conf
     - src: ../../basic_users/filter_plugins/filter_keys.py
       dest: filter_plugins/filter_keys.py
+    - src: ../../tuned/tasks/configure.yml
+      dest: tasks/tuned.yml
     - src: ../../stackhpc.nfs/tasks/nfs-clients.yml
       dest: tasks/nfs-clients.yml
 
ansible/roles/lustre/tasks/configure.yml

Lines changed: 4 additions & 6 deletions
@@ -1,19 +1,18 @@
 - name: Gather Lustre interface info
   shell:
     cmd: |
-      ip r get {{ _lustre_mgs_ip }}
+      ip --json r get {{ _lustre_mgs_ip }}
   changed_when: false
   register: _lustre_ip_r_mgs
   vars:
     _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}"
 
 - name: Set facts for Lustre interface
   set_fact:
-    _lustre_interface: "{{ _lustre_ip_r_mgs_info[4] }}"
-    _lustre_ip: "{{ _lustre_ip_r_mgs_info[6] }}"
+    _lustre_interface: "{{ _lustre_ip_r_mgs_info.dev }}"
+    _lustre_ip: "{{ _lustre_ip_r_mgs_info.prefsrc }}"
   vars:
-    _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout_lines.0 | split }}"
-    # first line e.g. "10.167.128.1 via 10.179.0.2 dev eth0 src 10.179.3.149 uid 1000"
+    _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout | from_json | first }}"
 
 - name: Write LNet configuration file
   template:
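
Switching to `ip --json r get` makes the interface lookup robust against changes in the text layout of `ip route get`. Using the addresses from the removed example comment, a hedged sketch of what the JSON parsing yields:

```yaml
# Illustrative only: `ip --json r get 10.167.128.1` returns a JSON list with a
# single route object, so `from_json | first` gives a dict whose .dev and
# .prefsrc fields replace the old positional word-splitting.
- name: Show parsed Lustre route fields
  debug:
    msg: "interface={{ _route.dev }} ip={{ _route.prefsrc }}"
  vars:
    _route: "{{ _example_stdout | from_json | first }}"
    _example_stdout: '[{"dst":"10.167.128.1","gateway":"10.179.0.2","dev":"eth0","prefsrc":"10.179.3.149"}]'
```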
@@ -44,4 +43,3 @@
       state: "{{ (item.mount_state | default(lustre_mount_state)) }}"
       opts: "{{ item.mount_options | default(lustre_mount_options) }}"
   loop: "{{ lustre_mounts }}"
-
ansible/roles/lustre/tasks/validate.yml

Lines changed: 0 additions & 5 deletions
@@ -1,8 +1,3 @@
-- name: Assert using RockyLinux 9
-  assert:
-    that: ansible_distribution_major_version | int == 9
-    fail_msg: The 'lustre' role requires RockyLinux 9
-
 - name: Check kernel-devel package is installed
   command: "dnf list --installed kernel-devel-{{ ansible_kernel }}"
   changed_when: false
