
 # stackhpc.openhpc

-This Ansible role installs packages and performs configuration to provide an OpenHPC Slurm cluster. It can also be used to drain and resume nodes.
+This Ansible role installs packages and performs configuration to provide an OpenHPC v2.x Slurm cluster.

 As a role it must be used from a playbook, for which a simple example is given below. This approach means it is totally modular, with no assumptions about available networks or any cluster features except for some hostname conventions. Any desired cluster filesystem or other required functionality may be freely integrated using additional Ansible roles or other approaches.

-The minimal image for nodes is a CentOS 7 or RockyLinux 8 GenericCloud image. These use OpenHPC v1 and v2 respectively. Centos8/OpenHPCv2 is generally preferred as it provides additional functionality for Slurm, compilers, MPI and transport libraries.
+The minimal image for nodes is a RockyLinux 8 GenericCloud image.

 ## Role Variables

-`openhpc_version`: Optional. OpenHPC version to install. Defaults provide `1.3` for Centos 7 and `2` for RockyLinux/CentOS 8.
-
 `openhpc_extra_repos`: Optional list. Extra Yum repository definitions to configure, following the format of the Ansible
 [yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module. Respected keys for
 each list element:
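The list of respected keys is elided by the hunk boundary above. Purely as a hedged sketch, assuming the usual `yum_repository` fields (`name`, `description`, `baseurl`, `gpgcheck`, `gpgkey`) are among those respected, an entry might look like this:

```yaml
# Hypothetical example only -- the exact set of respected keys is not shown in
# this hunk; these fields follow the Ansible yum_repository module's format.
openhpc_extra_repos:
  - name: my_extra_repo
    description: "Site-local package repository"
    baseurl: "https://repo.example.com/rocky/8/$basearch"
    gpgcheck: false
```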
@@ -39,12 +37,10 @@ each list element:
 * `database`: whether to enable slurmdbd
 * `batch`: whether to enable compute nodes
 * `runtime`: whether to enable OpenHPC runtime
-* `drain`: whether to drain compute nodes
-* `resume`: whether to resume compute nodes

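With the `drain` and `resume` flags removed, only the service-enablement flags remain. A minimal sketch of how these flags might be set in group vars follows; the inventory group names `cluster_control` and `cluster_batch` are taken from examples elsewhere in this README and may differ in your inventory:

```yaml
# Illustrative sketch only -- group names are assumptions, not prescribed by the role.
openhpc_enable:
  database: "{{ inventory_hostname in groups['cluster_control'] }}"
  batch: "{{ inventory_hostname in groups['cluster_batch'] }}"
  runtime: true
```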
 `openhpc_slurmdbd_host`: Optional. Where to deploy slurmdbd if you are using this role to deploy slurmdbd, otherwise where an existing slurmdbd is running. This should be the name of a host in your inventory. Set this to `none` to prevent the role from managing slurmdbd. Defaults to `openhpc_slurm_control_host`.

-`openhpc_slurm_configless`: Optional, default false. If true then slurm's ["configless" mode](https://slurm.schedmd.com/configless_slurm.html) is used. **NB: Requires Centos8/OpenHPC v2.**
+`openhpc_slurm_configless`: Optional, default false. If true then slurm's ["configless" mode](https://slurm.schedmd.com/configless_slurm.html) is used.

 `openhpc_munge_key`: Optional. Define a munge key to use. If not provided, one is generated, but the `openhpc_slurm_control_host` must be in the play.

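Taken together, a minimal sketch of these control-plane settings in group vars might look like the following; the values and the vault lookup are illustrative assumptions, not defaults from the role:

```yaml
# Illustrative sketch only: host/group names and the vault variable are assumptions.
openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}"   # or `none` to leave slurmdbd unmanaged
openhpc_slurm_configless: true
# If omitted, a munge key is generated, but the control host must then be in the play.
openhpc_munge_key: "{{ vault_openhpc_munge_key }}"
```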
@@ -184,54 +180,6 @@ To deploy, create a playbook which looks like this:
           openhpc_packages: []
     ...

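Most of the deploy example is elided by this hunk; only its tail (`openhpc_packages: []` and the closing `...`) is visible above. Purely as a sketch of the shape such a playbook takes, reusing the `openhpc_enable` flags sketched earlier, it might look like this (group names and flag values are assumptions, not a copy of the elided example, and any flags not shown in this hunk are omitted):

```yaml
---
# Hypothetical sketch only; real deployments will need whatever additional
# openhpc_enable flags and variables the full example (elided above) sets.
- hosts:
    - cluster_control
    - cluster_batch
  become: yes
  roles:
    - role: stackhpc.openhpc
      openhpc_enable:
        database: "{{ inventory_hostname in groups['cluster_control'] }}"
        batch: "{{ inventory_hostname in groups['cluster_batch'] }}"
        runtime: true
      openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
      openhpc_cluster_name: openhpc
      openhpc_packages: []
...
```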
-To drain nodes, for example, before scaling down the cluster to 6 nodes:
-
-    ---
-    - hosts: openstack
-      gather_facts: false
-      vars:
-        partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}"
-        openhpc_slurm_partitions:
-        - name: "compute"
-          flavor: "compute-A"
-          image: "CentOS7.5-OpenHPC"
-          num_nodes: 6
-          user: "centos"
-        openhpc_cluster_name: openhpc
-      roles:
-        # Our stackhpc.cluster-infra role can be invoked in `query` mode which
-        # looks up the state of the cluster by querying the Heat API.
-        - role: stackhpc.cluster-infra
-          cluster_name: "{{ cluster_name }}"
-          cluster_state: query
-          cluster_params:
-            cluster_groups: "{{ cluster_groups }}"
-      tasks:
-        # Given that the original cluster that was created had 8 nodes and the
-        # cluster we want to create has 6 nodes, the computed desired_state
-        # variable stores the list of instances to leave untouched.
-        - name: Count the number of compute nodes per slurm partition
-          set_fact:
-            desired_state: "{{ (( partition | first).nodes | map(attribute='name') | list )[:item.num_nodes] + desired_state | default([]) }}"
-          when: partition | length > 0
-          with_items: "{{ openhpc_slurm_partitions }}"
-        - debug: var=desired_state
-
-    - hosts: cluster_batch
-      become: yes
-      vars:
-        desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}"
-      roles:
-        # Now, the stackhpc.openhpc role is invoked in drain/resume modes where
-        # the instances in desired_state are resumed if in a drained state and
-        # drained if in a resumed state.
-        - role: stackhpc.openhpc
-          openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}"
-          openhpc_enable:
-            drain: "{{ inventory_hostname not in desired_state }}"
-            resume: "{{ inventory_hostname in desired_state }}"
-    ...
-
 ---

 <b id="slurm_ver_footnote">1</b> Slurm 20.11 removed `accounting_storage/filetxt` as an option. This version of Slurm was introduced in OpenHPC v2.1 but the OpenHPC repos are common to all OpenHPC v2.x releases. [↩](#accounting_storage)