Address review comments

Bharat Kunwar · Bharat Kunwar · commit cca818ae8950 · 2019-05-20T11:48:18.000+01:00
- Add a more verbose description of how to operate drain/resume to the README
- Replace the hard-coded retry delay with a configurable openhpc_retry_delay variable
diff --git a/README.md b/README.md
@@ -19,9 +19,9 @@ Role Variables
 
 `openhpc_packages`: additional OpenHPC packages to install
 
-`openhpc_enable`: 
+`openhpc_enable`:
 * `control`: whether to enable control host
-* `batch`: whether to enable compute nodes 
+* `batch`: whether to enable compute nodes
 * `runtime`: whether to enable OpenHPC runtime
 * `drain`: whether to drain a compute nodes
 * `resume`: whether to resume a compute nodes
@@ -38,6 +38,9 @@ And an Ansible inventory as this:
     openhpc-compute-0 ansible_host=10.60.253.31 ansible_user=centos
     openhpc-compute-1 ansible_host=10.60.253.32 ansible_user=centos
 
+    [cluster_login:children]
+    openhpc_login
+
     [cluster_control:children]
     openhpc_login
 
@@ -46,7 +49,7 @@ And an Ansible inventory as this:
 
 Example Playbooks
 ----------------
- 
+
 To deploy, create a playbook which looks like this:
 
     ---
@@ -80,24 +83,28 @@ To drain nodes, for example, before scaling down the cluster to 6 nodes:
     ---
     - hosts: openstack
       gather_facts: false
-      
+      vars:
+        partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
+        openhpc_slurm_partitions:
+          - name: "compute"
+            flavor: "compute-A"
+            image: "CentOS7.5-OpenHPC"
+            num_nodes: 6
+            user: "centos"
+        openhpc_cluster_name: openhpc
       roles:
+        # Our stackhpc.cluster-infra role can be invoked in `query` mode which
+        # looks up the state of the cluster by querying the Heat API.
         - role: stackhpc.cluster-infra
           cluster_name: "{{ cluster_name }}"
           cluster_state: query
           cluster_params:
             cluster_groups: "{{ cluster_groups }}"
       tasks:
+        # The original cluster was created with 8 nodes and is being scaled down
+        # to 6, so the computed desired_state variable holds the names of the
+        # instances to keep: the first num_nodes nodes of each partition.
         - name: Count the number of compute nodes per slurm partition
-          vars:
-            partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
-            openhpc_slurm_partitions:
-              - name: "compute"
-                flavor: "compute-A"
-                image: "CentOS7.5-OpenHPC"
-                num_nodes: 6
-                user: "centos"
-            openhpc_cluster_name: openhpc
           set_fact:
             desired_state: "{{ (( partition | first).nodes | map(attribute='name') | list )[:item.num_nodes] + desired_state | default([]) }}"
           when: partition | length > 0
@@ -106,9 +113,13 @@ To drain nodes, for example, before scaling down the cluster to 6 nodes:
 
     - hosts: cluster_batch
       become: yes
+      vars:
+        desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
       roles:
+        # Now, the stackhpc.openhpc role is invoked in drain/resume modes:
+        # hosts that are not listed in desired_state are drained, while hosts
+        # that are listed in it are resumed if they are currently drained.
         - role: stackhpc.openhpc
-          desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
           openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
           openhpc_enable:
             drain: "{{ inventory_hostname not in desired_state }}"
diff --git a/defaults/main.yml b/defaults/main.yml
@@ -7,6 +7,7 @@ openhpc_cluster_name:
 openhpc_packages: []
 openhpc_drain_timeout: 86400
 openhpc_resume_timeout: 300
+openhpc_retry_delay: 10
 openhpc_enable:
   control: false
   batch: false
diff --git a/tasks/drain.yml b/tasks/drain.yml
@@ -19,6 +19,6 @@
   command: "sinfo --noheader --Node --format='%N' --states=DRAINED"
   register: drained_nodes
   until: "inventory_hostname in drained_nodes.stdout_lines"
-  delay: 10
-  retries: "{{ (openhpc_drain_timeout/10) | int }}"
+  delay: "{{ openhpc_retry_delay }}"
+  retries: "{{ (openhpc_drain_timeout/openhpc_retry_delay) | int }}"
   changed_when: false
diff --git a/tasks/resume.yml b/tasks/resume.yml
@@ -19,6 +19,6 @@
   command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE"
   register: resumed_nodes
   until: "inventory_hostname in resumed_nodes.stdout_lines"
-  delay: 10
-  retries: "{{ (openhpc_resume_timeout/10) | int }}"
+  delay: "{{ openhpc_retry_delay }}"
+  retries: "{{ (openhpc_resume_timeout/openhpc_retry_delay) | int }}"
   changed_when: false