4 files changed: +18 -4 lines changed
Original file line number Diff line number Diff line change
@@ -5,6 +5,8 @@ openhpc_slurm_control_host:
5
5
openhpc_slurm_partitions : []
6
6
openhpc_cluster_name :
7
7
openhpc_packages : []
8
+ openhpc_drain_timeout : 86400
9
+ openhpc_resume_timeout : 300
8
10
openhpc_enable :
9
11
control : false
10
12
batch : false
Original file line number Diff line number Diff line change
6
6
# - node_to_drain: compute node to drain
7
7
# - openhpc_drain_timeout: seconds to wait for node to drain, default is 86400.
8
8
9
+ - name : Get nodes in DRAINED state
10
+ command : " sinfo --noheader --Node --format='%N' --states=DRAINED"
11
+ register : drained_nodes_results
12
+ changed_when : false
13
+
9
14
- name : Drain compute node
10
- become : yes
11
15
command : " scontrol update nodename={{ inventory_hostname }} state=DRAIN reason='maintenance'"
16
+ when : inventory_hostname not in drained_nodes_results.stdout_lines
12
17
13
18
- name : Check node has drained
14
19
command : " sinfo --noheader --Node --format='%N' --states=DRAINED"
15
20
register : drained_nodes
16
21
until : " inventory_hostname in drained_nodes.stdout_lines"
17
22
delay : 10
18
- retries : " {{ (drain_timeout|default(86400) / 10)| int }}"
23
+ retries : " {{ (openhpc_drain_timeout/ 10) | int }}"
19
24
changed_when : false
Original file line number Diff line number Diff line change
10
10
11
11
- include : drain.yml
12
12
when : openhpc_enable.drain | default(false) | bool
13
+ delegate_to : " {{ openhpc_slurm_control_host }}"
13
14
14
15
- include : resume.yml
15
16
when : openhpc_enable.resume | default(false) | bool
17
+ delegate_to : " {{ openhpc_slurm_control_host }}"
16
18
...
Original file line number Diff line number Diff line change
6
6
# - nodes_to_resume: compute node to resume
7
7
# - openhpc_resume_timeout: seconds to wait for node to resume, default is 300.
8
8
9
+ - name : Get nodes in ALLOC,IDLE states
10
+ command : " sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
11
+ register : resumed_nodes_results
12
+ changed_when : false
13
+
9
14
- name : Resume compute node
10
- become : yes
11
15
command : " scontrol update nodename={{ inventory_hostname }} state=RESUME"
16
+ when : inventory_hostname not in resumed_nodes_results.stdout_lines
12
17
13
18
- name : Check node has resumed
14
19
command : " sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
15
20
register : resumed_nodes
16
21
until : " inventory_hostname in resumed_nodes.stdout_lines"
17
22
delay : 10
18
- retries : " {{ (resume_timeout|default(300) / 10)| int }}"
23
+ retries : " {{ (openhpc_resume_timeout/ 10) | int }}"
19
24
changed_when : false
You can’t perform that action at this time.
0 commit comments