
Commit af5ef65

Merge pull request #85 from stackhpc/fix/noconf-add
Fix issues with slurm daemon startup when adding nodes
2 parents: ee2fb9c + 4497d97

File tree

23 files changed: +285 -260 lines

.github/workflows/ci.yml

Lines changed: 14 additions & 0 deletions

@@ -27,11 +27,25 @@ jobs:
           - test4
           - test5
           - test6
+          - test7
+          - test8
+          - test9
+          - test10
+
         exclude:
           - image: 'centos:7'
             scenario: test5
           - image: 'centos:7'
             scenario: test6
+          - image: 'centos:7'
+            scenario: test7
+          - image: 'centos:7'
+            scenario: test8
+          - image: 'centos:7'
+            scenario: test9
+          - image: 'centos:7'
+            scenario: test10
+

     steps:
       - name: Check out the codebase.
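For orientation, the scenario list and exclude entries above sit inside the job's build matrix: each image is paired with every scenario, and each exclude entry drops one image/scenario pair, so the new configless scenarios test7-test10 never run on 'centos:7'. A minimal sketch of that structure follows; the job name and image list are assumptions, not copied from ci.yml, and only the scenario and exclude entries reflect this commit.

jobs:
  test:                       # job name is an assumption
    strategy:
      matrix:
        image:                # image list is an assumption
          - 'centos:7'
          - 'centos:8'
        scenario:
          - test6
          - test7
          - test10
        exclude:              # removes these image/scenario pairs from the matrix
          - image: 'centos:7'
            scenario: test7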

handlers/main.yml

Lines changed: 10 additions & 9 deletions

@@ -18,15 +18,8 @@
     - openhpc_slurmdbd_host in play_hosts
   run_once: true

-- name: Reload SLURM service
-  service:
-    name: "{{ openhpc_slurm_service }}"
-    state: reloaded
-  when:
-    - openhpc_slurm_service is not none
-    - openhpc_slurm_service_started | bool
-
-# NOTE: Allows you to restart slurmctld from another host
+# NOTE: we need this running before slurmd
+# Allows you to reconfigure slurmctld from another host
 - name: Restart slurmctld service
   service:
     name: "slurmctld"
@@ -36,3 +29,11 @@
   when:
     - openhpc_slurm_service_started | bool
     - openhpc_slurm_control_host in play_hosts
+
+- name: Restart slurmd service
+  service:
+    name: "slurmd"
+    state: restarted
+  when:
+    - openhpc_slurm_service_started | bool
+    - openhpc_slurm_service == 'slurmd'
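Handlers only run when a task notifies them, so the new "Restart slurmd service" handler fires on compute nodes whenever a notifying task reports a change. A minimal sketch of such a task, assuming a hypothetical template name and destination (neither is taken from this commit):

# Hypothetical example - the template src/dest are illustrative only.
- name: Template slurmd options
  template:
    src: slurmd.sysconfig.j2
    dest: /etc/sysconfig/slurmd
  notify: Restart slurmd service   # handler added in this commit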

molecule/README.md

Lines changed: 3 additions & 1 deletion

@@ -14,9 +14,11 @@ test3 | 1 | Y | -
 test4 | 1 | N | 2x compute node, accounting enabled
 test5 | 1 | N | As for #1 but configless
 test6 | 1 | N | 0x compute nodes, configless
-test7 | 1 | N | 1x compute node, no login node, configless
+test7 | 1 | N | 1x compute node, no login node, configless (checks image build should work)
 test8 | 1 | N | 2x compute node, 2x login-only nodes, configless
 test9 | 1 | N | As test8 but uses `--limit=testohpc-control,testohpc-compute-0` and checks login nodes still end up in slurm.conf
+test10 | 1 | N | As for #5 but then tries to add an additional node
+test11 | 1 | N | As for #5 but then deletes a node (actually changes the partition due to molecule/ansible limitations)

 # Local Installation & Running
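To exercise one of the new scenarios locally, the usual Molecule invocation applies; a sketch assuming a CentOS 8 image tag (the exact image tag to use is an assumption, it just needs to match what ${MOLECULE_IMAGE} expects in the scenario's molecule.yml):

MOLECULE_IMAGE=centos:8 molecule test --scenario-name test10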

molecule/test1/INSTALL.rst

Lines changed: 0 additions & 22 deletions
This file was deleted.

molecule/test10/converge.yml

Lines changed: 17 additions & 0 deletions (new file)

---
- name: Create initial cluster
  hosts: initial
  tasks:
    - name: "Include ansible-role-openhpc"
      include_role:
        name: "ansible-role-openhpc/"
      vars:
        openhpc_enable:
          control: "{{ inventory_hostname in groups['testohpc_login'] }}"
          batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
          runtime: true
        openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
        openhpc_slurm_partitions:
          - name: "compute"
        openhpc_cluster_name: testohpc
        openhpc_slurm_configless: true

molecule/test10/molecule.yml

Lines changed: 64 additions & 0 deletions (new file)

---
name: single partition, group is partition
driver:
  name: docker
platforms:
  - name: testohpc-login-0
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups:
      - testohpc_login
      - initial
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
  - name: testohpc-compute-0
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups:
      - testohpc_compute
      - initial
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
  - name: testohpc-compute-1
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups:
      - testohpc_compute
      - initial
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
  - name: testohpc-compute-2
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups: # NB this is NOT in the "testohpc_compute" group so that it isn't added to slurm.conf initially
      - new
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
provisioner:
  name: ansible
verifier:
  name: ansible
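For orientation, the inventory groups Molecule derives from the platform definitions above work out roughly as follows (an illustrative rendering, not a file in this commit); the "new" group holding testohpc-compute-2 is what verify.yml later merges into testohpc_compute.

# Illustrative only - groups implied by the platforms above
testohpc_login:   [testohpc-login-0]
testohpc_compute: [testohpc-compute-0, testohpc-compute-1]
initial:          [testohpc-login-0, testohpc-compute-0, testohpc-compute-1]
new:              [testohpc-compute-2]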

molecule/test10/verify.yml

Lines changed: 47 additions & 0 deletions (new file)

---
- name: Check initial cluster has 2x nodes
  hosts: testohpc_login
  tasks:
    - name: Get slurm partition info
      command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace
      register: sinfo
      changed_when: false
    - assert: # PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
        that: "sinfo.stdout_lines == ['compute*,up,60-00:00:00,2,idle,testohpc-compute-[0-1]']"
        fail_msg: "FAILED - actual value: {{ sinfo.stdout_lines }}"
        success_msg: "OK - 2x nodes idle"

- name: Add new host(s) to cluster
  hosts: all
  tasks:
    - name: Add new host(s) to group for slurm partition
      add_host:
        name: "{{ item }}"
        groups: testohpc_compute
      loop: "{{ groups['new'] }}"
      run_once: true
    - name: "Include ansible-role-openhpc"
      include_role:
        name: "ansible-role-openhpc/"
      vars:
        openhpc_enable:
          control: "{{ inventory_hostname in groups['testohpc_login'] }}"
          batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
          runtime: true
        openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
        openhpc_slurm_partitions:
          - name: "compute"
        openhpc_cluster_name: testohpc
        openhpc_slurm_configless: true

- name: Check modified cluster has 3x nodes
  hosts: testohpc_login
  tasks:
    - name: Get slurm partition info
      command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace
      register: sinfo
      changed_when: false
    - assert: # PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
        that: "sinfo.stdout_lines == ['compute*,up,60-00:00:00,3,idle,testohpc-compute-[0-2]']"
        fail_msg: "FAILED - actual value: {{ sinfo.stdout_lines }}"
        success_msg: "OK - 3x nodes idle"

molecule/test11/converge.yml

Lines changed: 17 additions & 0 deletions (new file)

---
- name: Converge
  hosts: all
  tasks:
    - name: "Include ansible-role-openhpc"
      include_role:
        name: "ansible-role-openhpc/"
      vars:
        openhpc_enable:
          control: "{{ inventory_hostname in groups['testohpc_login'] }}"
          batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
          runtime: true
        openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
        openhpc_slurm_partitions:
          - name: "compute_orig"
        openhpc_cluster_name: testohpc
        openhpc_slurm_configless: true

molecule/test11/molecule.yml

Lines changed: 51 additions & 0 deletions (new file)

---
name: single partition, group is partition
driver:
  name: docker
platforms:
  - name: testohpc-login-0
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups:
      - testohpc_login
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
  - name: testohpc-compute-0
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups:
      - testohpc_compute
      - testohpc_compute_orig
      - testohpc_compute_new
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
  - name: testohpc-compute-1
    image: ${MOLECULE_IMAGE}
    pre_build_image: true
    groups:
      - testohpc_compute
      - testohpc_compute_orig
    command: /sbin/init
    tmpfs:
      - /run
      - /tmp
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:ro
    networks:
      - name: net1
provisioner:
  name: ansible
verifier:
  name: ansible

molecule/test11/verify.yml

Lines changed: 44 additions & 0 deletions (new file)

---

- name: Check initial cluster has 2x nodes
  hosts: testohpc_login
  tasks:
    - name: Get slurm partition info
      command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace
      register: sinfo
      changed_when: false
    - assert: # PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
        that: "sinfo.stdout_lines == ['compute_orig*,up,60-00:00:00,2,idle,testohpc-compute-[0-1]']"
        fail_msg: "FAILED - actual value: {{ sinfo.stdout_lines }}"
        success_msg: "OK - 2x nodes idle"

- name: Rerun with smaller compute group
  hosts:
    - testohpc_login
    - testohpc_compute_new
  tasks:
    - name: "Include ansible-role-openhpc"
      include_role:
        name: "ansible-role-openhpc/"
      vars:
        openhpc_enable:
          control: "{{ inventory_hostname in groups['testohpc_login'] }}"
          batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
          runtime: true
        openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
        openhpc_slurm_partitions:
          - name: "compute_new"
        openhpc_cluster_name: testohpc
        openhpc_slurm_configless: true

- name: Check modified cluster has 1x nodes
  hosts: testohpc_login
  tasks:
    - name: Get slurm partition info
      command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace
      register: sinfo
      changed_when: false
    - assert: # PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
        that: "sinfo.stdout_lines == ['compute_new*,up,60-00:00:00,1,idle,testohpc-compute-0']"
        fail_msg: "FAILED - actual value: {{ sinfo.stdout_lines }}"
        success_msg: "OK - 1x nodes idle"
