Skip to content

Commit bc2f9f3

Browse files
author
Bharat Kunwar
committed
Check if a node needs to be drained/resumed before actually doing it
1 parent 03bb3b5 commit bc2f9f3

File tree

4 files changed

+18
-4
lines changed

4 files changed

+18
-4
lines changed

defaults/main.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ openhpc_slurm_control_host:
55
openhpc_slurm_partitions: []
66
openhpc_cluster_name:
77
openhpc_packages: []
8+
openhpc_drain_timeout: 86400
9+
openhpc_resume_timeout: 300
810
openhpc_enable:
911
control: false
1012
batch: false

tasks/drain.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,19 @@
66
# - node_to_drain: compute node to drain
77
# - openhpc_drain_timeout: seconds to wait for node to drain, default is 86400.
88

9+
- name: Get nodes in DRAINED state
10+
command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
11+
register: drained_nodes_results
12+
changed_when: false
13+
914
- name: Drain compute node
10-
become: yes
1115
command: "scontrol update nodename={{ inventory_hostname }} state=DRAIN reason='maintenance'"
16+
when: inventory_hostname not in drained_nodes_results.stdout_lines
1217

1318
- name: Check node has drained
1419
command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
1520
register: drained_nodes
1621
until: "inventory_hostname in drained_nodes.stdout_lines"
1722
delay: 10
18-
retries: "{{ (drain_timeout|default(86400) / 10)|int }}"
23+
retries: "{{ (openhpc_drain_timeout/10) | int }}"
1924
changed_when: false

tasks/main.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
- include: drain.yml
1212
when: openhpc_enable.drain | default(false) | bool
13+
delegate_to: "{{ openhpc_slurm_control_host }}"
1314

1415
- include: resume.yml
1516
when: openhpc_enable.resume | default(false) | bool
17+
delegate_to: "{{ openhpc_slurm_control_host }}"
1618
...

tasks/resume.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,19 @@
66
# - nodes_to_resume: compute node to resume
77
# - openhpc_resume_timeout: seconds to wait for node to resume, default is 300.
88

9+
- name: Get nodes in ALLOC,IDLE states
10+
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
11+
register: resumed_nodes_results
12+
changed_when: false
13+
914
- name: Resume compute node
10-
become: yes
1115
command: "scontrol update nodename={{ inventory_hostname }} state=RESUME"
16+
when: inventory_hostname not in resumed_nodes_results.stdout_lines
1217

1318
- name: Check node has resumed
1419
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
1520
register: resumed_nodes
1621
until: "inventory_hostname in resumed_nodes.stdout_lines"
1722
delay: 10
18-
retries: "{{ (resume_timeout|default(300) / 10)|int }}"
23+
retries: "{{ (openhpc_resume_timeout/10) | int }}"
1924
changed_when: false

0 commit comments

Comments
 (0)