Skip to content

Commit cca818a

Browse files
author
Bharat Kunwar
committed
Address review comments
- Verbose description of how to operate drain/resume in README - Set openhpc_retry_delay as a variable
1 parent 7fc59e7 commit cca818a

File tree

4 files changed

+30
-18
lines changed

4 files changed

+30
-18
lines changed

README.md

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ Role Variables
1919

2020
`openhpc_packages`: additional OpenHPC packages to install
2121

22-
`openhpc_enable`:
22+
`openhpc_enable`:
2323
* `control`: whether to enable control host
24-
* `batch`: whether to enable compute nodes
24+
* `batch`: whether to enable compute nodes
2525
* `runtime`: whether to enable OpenHPC runtime
2626
* `drain`: whether to drain compute nodes
2727
* `resume`: whether to resume compute nodes
@@ -38,6 +38,9 @@ And an Ansible inventory as this:
3838
openhpc-compute-0 ansible_host=10.60.253.31 ansible_user=centos
3939
openhpc-compute-1 ansible_host=10.60.253.32 ansible_user=centos
4040

41+
[cluster_login:children]
42+
openhpc_login
43+
4144
[cluster_control:children]
4245
openhpc_login
4346

@@ -46,7 +49,7 @@ And an Ansible inventory as this:
4649

4750
Example Playbooks
4851
----------------
49-
52+
5053
To deploy, create a playbook which looks like this:
5154

5255
---
@@ -80,24 +83,28 @@ To drain nodes, for example, before scaling down the cluster to 6 nodes:
8083
---
8184
- hosts: openstack
8285
gather_facts: false
83-
86+
vars:
87+
partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
88+
openhpc_slurm_partitions:
89+
- name: "compute"
90+
flavor: "compute-A"
91+
image: "CentOS7.5-OpenHPC"
92+
num_nodes: 6
93+
user: "centos"
94+
openhpc_cluster_name: openhpc
8495
roles:
96+
# Our stackhpc.cluster-infra role can be invoked in `query` mode which
97+
# looks up the state of the cluster by querying the Heat API.
8598
- role: stackhpc.cluster-infra
8699
cluster_name: "{{ cluster_name }}"
87100
cluster_state: query
88101
cluster_params:
89102
cluster_groups: "{{ cluster_groups }}"
90103
tasks:
104+
# Given that the original cluster that was created had 8 nodes and the
105+
# cluster we want to create has 6 nodes, the computed desired_state
106+
# variable stores the list of instances to leave untouched.
91107
- name: Count the number of compute nodes per slurm partition
92-
vars:
93-
partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
94-
openhpc_slurm_partitions:
95-
- name: "compute"
96-
flavor: "compute-A"
97-
image: "CentOS7.5-OpenHPC"
98-
num_nodes: 6
99-
user: "centos"
100-
openhpc_cluster_name: openhpc
101108
set_fact:
102109
desired_state: "{{ (( partition | first).nodes | map(attribute='name') | list )[:item.num_nodes] + desired_state | default([]) }}"
103110
when: partition | length > 0
@@ -106,9 +113,13 @@ To drain nodes, for example, before scaling down the cluster to 6 nodes:
106113

107114
- hosts: cluster_batch
108115
become: yes
116+
vars:
117+
desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
109118
roles:
119+
# Now, the stackhpc.openhpc role is invoked in drain/resume modes where
120+
# instances in desired_state are resumed if they are in a drained state,
121+
# and instances not in desired_state are drained.
110122
- role: stackhpc.openhpc
111-
desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
112123
openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
113124
openhpc_enable:
114125
drain: "{{ inventory_hostname not in desired_state }}"

defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ openhpc_cluster_name:
77
openhpc_packages: []
88
openhpc_drain_timeout: 86400
99
openhpc_resume_timeout: 300
10+
openhpc_retry_delay: 10
1011
openhpc_enable:
1112
control: false
1213
batch: false

tasks/drain.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@
1919
command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
2020
register: drained_nodes
2121
until: "inventory_hostname in drained_nodes.stdout_lines"
22-
delay: 10
23-
retries: "{{ (openhpc_drain_timeout/10) | int }}"
22+
delay: "{{ openhpc_retry_delay }}"
23+
retries: "{{ (openhpc_drain_timeout/openhpc_retry_delay) | int }}"
2424
changed_when: false

tasks/resume.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@
1919
command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
2020
register: resumed_nodes
2121
until: "inventory_hostname in resumed_nodes.stdout_lines"
22-
delay: 10
23-
retries: "{{ (openhpc_resume_timeout/10) | int }}"
22+
delay: "{{ openhpc_retry_delay }}"
23+
retries: "{{ (openhpc_resume_timeout/openhpc_retry_delay) | int }}"
2424
changed_when: false

0 commit comments

Comments
 (0)