
Commit 9ca2d7f

Merge pull request #634 from stackhpc/docs/rebuild
Document (and test) slurm controlled rebuild configuration and usage
2 parents 8ecf4d4 + 6c65f72

File tree

18 files changed (+510, -82 lines)

.github/workflows/stackhpc.yml

Lines changed: 2 additions & 2 deletions
@@ -173,11 +173,11 @@ jobs:
           ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
-      - name: Reimage compute nodes to image in current branch using slurm - tests compute-init
+      - name: Reimage compute nodes to image in current branch using slurm
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
+          ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
       - name: Check sacct state survived reimage to current branch

ansible/adhoc/reboot_via_slurm.yml

Lines changed: 0 additions & 24 deletions
This file was deleted.
ansible/adhoc/rebuild-via-slurm.yml

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Rebuild compute nodes via slurm.
+# Nodes will be rebuilt if `image_id` in inventory is different to the
+# currently-provisioned image. Otherwise they are rebooted.
+
+# Example:
+#     ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
+
+# See docs/slurm-controlled-rebuild.md.
+
+- hosts: login
+  run_once: true
+  gather_facts: no
+  tasks:
+    - name: Run slurm-controlled rebuild
+      import_role:
+        name: rebuild
+        tasks_from: rebuild.yml
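
Once this playbook has submitted the jobs, progress can be followed from a login or control node. A minimal sketch, assuming the role's default `rebuild` partition name and `rebuild-<nodename>` job names (see the role defaults later in this diff):

    # list outstanding rebuild jobs; columns: job id, name, user, state, nodelist/reason
    squeue --partition=rebuild --format="%.18i %.20j %.10u %.8T %R"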

ansible/ci/check_slurm.yml

Lines changed: 3 additions & 12 deletions
@@ -6,19 +6,10 @@
       shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name
       register: sinfo
       changed_when: false
-      until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout or "down" in sinfo.stdout)
-      retries: 10
+      until: sinfo.stdout_lines == expected_sinfo
+      retries: 200
       delay: 5
-    - name: Check nodes have expected slurm state
-      assert:
-        that: sinfo.stdout_lines == expected_sinfo
-        fail_msg: |
-          sinfo output not as expected:
-          actual:
-          {{ sinfo.stdout_lines }}
-          expected:
-          {{ expected_sinfo }}
-
   vars:
     expected_sinfo:
+      - " extra up 60-00:00:00 0 n/a" # empty partition
       - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle"

ansible/roles/compute_init/README.md

Lines changed: 5 additions & 6 deletions
@@ -1,12 +1,11 @@
 # EXPERIMENTAL: compute_init
 
-Experimental functionality to allow compute nodes to rejoin the cluster after
-a reboot without running the `ansible/site.yml` playbook.
+Allow compute nodes to rejoin the cluster after a reboot without running the
+`ansible/site.yml` playbook.
 
-**CAUTION:** The approach used here of exporting cluster secrets over NFS
-is considered to be a security risk due to the potential for cluster users to
-mount the share on a user-controlled machine by tunnelling through a login
-node. This feature should not be enabled on production clusters at this time.
+> [!NOTE]
+> This functionality is marked as experimental as it may be incomplete and the
+> required configuration may change with further development.
 
 To enable this:
 1. Add the `compute` group (or a subset) into the `compute_init` group.
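
As a rough sketch of that first step (generic Ansible YAML inventory syntax; the actual appliance inventory files and paths may differ), enabling this for all compute nodes amounts to making `compute` a child of `compute_init`:

    # hypothetical inventory fragment
    all:
      children:
        compute_init:
          children:
            compute: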

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 7 additions & 6 deletions
@@ -324,12 +324,6 @@
         enabled: true
         state: started
 
-    - name: Ensure slurmd service state
-      service:
-        name: slurmd
-        enabled: true
-        state: started
-
     - name: Set locked memory limits on user-facing nodes
       lineinfile:
         path: /etc/security/limits.conf
@@ -351,6 +345,13 @@
           +:adm:ALL
           -:ALL:ALL
 
+    - name: Ensure slurmd service state
+      service:
+        name: slurmd
+        enabled: true
+        state: started
+
+
     - name: Ensure node is resumed
       # TODO: consider if this is always safe for all job states?
       command: scontrol update state=resume nodename={{ ansible_hostname }}

ansible/roles/rebuild/README.md

Lines changed: 38 additions & 13 deletions
@@ -1,30 +1,55 @@
 rebuild
 =========
 
-Enables reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git to be run from control node.
+Enables reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git
+to be run from control node.
 
 Requirements
 ------------
 
-clouds.yaml file
+An OpenStack clouds.yaml file containing credentials for a cloud under the
+"openstack" key.
 
 Role Variables
 --------------
 
-- `openhpc_rebuild_clouds`: Directory. Path to clouds.yaml file.
+The variable below is only used by this role's `main.yml` task file, i.e. when
+running the `ansible/site.yml` or `ansible/slurm.yml` playbooks:
 
+- `rebuild_clouds_path`: Optional. Path to `clouds.yaml` file on the deploy
+  host, default `~/.config/openstack/clouds.yaml`.
 
-Example Playbook
-----------------
+The variables below are only used by this role's `rebuild.yml` task file, i.e.
+when running the `ansible/adhoc/rebuild-via-slurm.yml` playbook:
 
-    - hosts: control
-      become: yes
-      tasks:
-        - import_role:
-            name: rebuild
+- `rebuild_job_partitions`: Optional. Comma-separated list of names of rebuild
+  partitions defined in `openhpc_slurm_partitions`. Useful as an extra-var for
+  limiting rebuilds. Default `rebuild`.
 
-License
--------
+- `rebuild_job_name`: Optional. Name of rebuild jobs. Default is `rebuild-`
+  suffixed with the node name.
 
-Apache-2.0
+- `rebuild_job_command`: Optional. String giving command to run in the job
+  after the node has been rebuilt. Default is to sleep for 5 seconds. Note job
+  output is sent to `/dev/null` by default, as the root user running this has
+  no shared directory for job output.
 
+- `rebuild_job_reboot`: Optional. A bool controlling whether to add the
+  `--reboot` flag to the job to actually trigger a rebuild. Useful for e.g.
+  testing partition configurations. Default `true`.
+
+- `rebuild_job_options`: Optional. A string giving any other options to pass to
+  [sbatch](https://slurm.schedmd.com/sbatch.html). Default is an empty string.
+
+- `rebuild_job_user`: Optional. The user to run the rebuild setup and job as.
+  Default `root`.
+
+- `rebuild_job_template`: Optional. The string to use to submit the job. See
+  [defaults.yml](defaults/main.yml).
+
+- `rebuild_job_hostlist`: String with a Slurm hostlist expression to restrict
+  a rebuild to only those nodes (e.g. `tux[1-3]` or `tux1,tux2`). If set,
+  `rebuild_job_partitions` must only define a single partition and that
+  partition must contain those nodes. Not for routine use, but may be useful
+  to e.g. reattempt a rebuild if it failed on specific nodes. Default is all
+  nodes in the relevant partition.
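
Putting these variables together, typical ad-hoc invocations might look like the following (a sketch; node names are illustrative):

    # test the rebuild partition configuration without actually rebuilding nodes
    ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml \
        -e rebuild_job_partitions=rebuild -e rebuild_job_reboot=false

    # re-attempt a rebuild on specific nodes only (hypothetical node names)
    ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml \
        -e rebuild_job_partitions=rebuild -e rebuild_job_hostlist='mycluster-compute-[2-3]'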
ansible/roles/rebuild/defaults/main.yml

Lines changed: 22 additions & 1 deletion
@@ -1,2 +1,23 @@
 ---
-openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml
+
+rebuild_clouds_path: ~/.config/openstack/clouds.yaml
+
+rebuild_job_partitions: rebuild
+rebuild_job_name: "rebuild-{{ item }}" # item is nodename
+rebuild_job_command: 'sleep 5'
+rebuild_job_reboot: true
+rebuild_job_options: ''
+rebuild_job_user: root
+rebuild_job_template: >-
+  sbatch
+  --nodelist={{ item }}
+  {{ '--reboot' if rebuild_job_reboot | bool else '' }}
+  --job-name={{ rebuild_job_name }}
+  --nodes=1
+  --exclusive
+  --partition={{ _rebuild_job_current_partition }}
+  --no-requeue
+  --output=/dev/null
+  --wrap="{{ rebuild_job_command }}"
+  {{ rebuild_job_options }}
+#rebuild_job_hostlist:
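
For illustration, with these defaults and a hypothetical node `mycluster-compute-0` in a `rebuild` partition, the folded template above renders to roughly:

    sbatch --nodelist=mycluster-compute-0 --reboot --job-name=rebuild-mycluster-compute-0 \
        --nodes=1 --exclusive --partition=rebuild --no-requeue --output=/dev/null \
        --wrap="sleep 5"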

ansible/roles/rebuild/tasks/main.yml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 - name: Copy out clouds.yaml
   copy:
-    src: "{{ openhpc_rebuild_clouds }}"
+    src: "{{ rebuild_clouds_path }}"
     dest: /etc/openstack/clouds.yaml
     owner: slurm
     group: root
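
The copied file is a standard OpenStack `clouds.yaml`; a minimal sketch, assuming application-credential auth (all values are placeholders) and the `openstack` cloud name described in the role README:

    clouds:
      openstack:
        auth:
          auth_url: https://keystone.example.com:5000   # placeholder
          application_credential_id: "REDACTED"
          application_credential_secret: "REDACTED"
        auth_type: v3applicationcredential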
ansible/roles/rebuild/tasks/rebuild.yml

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+- name: Create rebuild jobs for partition
+  include_tasks:
+    file: rebuild_partition.yml
+  args:
+    apply:
+      become: yes
+      become_user: "{{ rebuild_job_user }}"
+  loop: "{{ rebuild_job_partitions | split(',') }}"
+  loop_control:
+    loop_var: _rebuild_job_current_partition
+
