From 30984da5e62fa8c3649f1f4ee060b0113fee44ed Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Mar 2025 16:12:25 +0000 Subject: [PATCH 01/15] slurm controlled rebuild docs & adhoc support --- ansible/adhoc/reboot_via_slurm.yml | 24 -- ansible/adhoc/rebuild-via-slurm.yml | 17 + ansible/roles/compute_init/README.md | 11 +- ansible/roles/rebuild/README.md | 51 ++- ansible/roles/rebuild/defaults/main.yml | 23 +- ansible/roles/rebuild/tasks/main.yml | 2 +- ansible/roles/rebuild/tasks/rebuild.yml | 11 + .../roles/rebuild/tasks/rebuild_partition.yml | 21 ++ docs/experimental/slurm-controlled-rebuild.md | 310 ++++++++++++++++++ 9 files changed, 425 insertions(+), 45 deletions(-) delete mode 100644 ansible/adhoc/reboot_via_slurm.yml create mode 100644 ansible/adhoc/rebuild-via-slurm.yml create mode 100644 ansible/roles/rebuild/tasks/rebuild.yml create mode 100644 ansible/roles/rebuild/tasks/rebuild_partition.yml create mode 100644 docs/experimental/slurm-controlled-rebuild.md diff --git a/ansible/adhoc/reboot_via_slurm.yml b/ansible/adhoc/reboot_via_slurm.yml deleted file mode 100644 index b5d5d0d0f..000000000 --- a/ansible/adhoc/reboot_via_slurm.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Reboot compute nodes via slurm. Nodes will be rebuilt if `image_id` in inventory is different to the currently-provisioned image. -# Example: -# ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml - -- hosts: login - run_once: true - become: yes - gather_facts: no - tasks: - - name: Submit a Slurm job to reboot compute nodes - ansible.builtin.shell: | - set -e - srun --reboot -N 2 uptime - become_user: root - register: slurm_result - failed_when: slurm_result.rc != 0 - - - name: Fetch Slurm controller logs if reboot fails - ansible.builtin.shell: | - journalctl -u slurmctld --since "10 minutes ago" | tail -n 50 - become_user: root - register: slurm_logs - when: slurm_result.rc != 0 - delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml new file mode 100644 index 000000000..4f7b5a576 --- /dev/null +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -0,0 +1,17 @@ +# Rebuild compute nodes via slurm. +# Nodes will be rebuilt if `image_id` in inventory is different to the +# currently-provisioned image. Otherwise they are rebooted. + +# Example: +# ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml + +# See docs/slurm-controlled-rebuild.md. + +- hosts: login + run_once: true + gather_facts: no + tasks: + - name: Run slurm-controlled rebuild + import_role: + name: rebuild + tasks_from: rebuild.yml diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index e64ea6ffb..81a62bade 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -1,12 +1,11 @@ # EXPERIMENTAL: compute_init -Experimental functionality to allow compute nodes to rejoin the cluster after -a reboot without running the `ansible/site.yml` playbook. +Allow compute nodes to rejoin the cluster after a reboot without running the +`ansible/site.yml` playbook. -**CAUTION:** The approach used here of exporting cluster secrets over NFS -is considered to be a security risk due to the potential for cluster users to -mount the share on a user-controlled machine by tunnelling through a login -node. This feature should not be enabled on production clusters at this time. 
+> [!NOTE]
+> This functionality is marked as experimental as it may be incomplete and the
+> required configuration may change with further development.
 
 To enable this:
 1. Add the `compute` group (or a subset) into the `compute_init` group.
diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md
index 314d7c94d..ddce17933 100644
--- a/ansible/roles/rebuild/README.md
+++ b/ansible/roles/rebuild/README.md
@@ -1,30 +1,55 @@
 rebuild
 =========
 
-Enables reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git to be run from control node.
+Enables the reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git
+to be run from the control node.
 
 Requirements
 ------------
 
-clouds.yaml file
+An OpenStack clouds.yaml file containing credentials for a cloud under the
+"openstack" key.
 
 Role Variables
 --------------
 
-- `openhpc_rebuild_clouds`: Directory. Path to clouds.yaml file.
+The below is only used by this role's `main.yml` task file, i.e. when running
+the `ansible/site.yml` or `ansible/slurm.yml` playbooks:
 
+- `rebuild_clouds_path`: Optional. Path to `clouds.yaml` file on the deploy
+  host, default `~/.config/openstack/clouds.yaml`.
 
-Example Playbook
-----------------
+The below are only used by this role's `rebuild.yml` task file, i.e. when
+running the `ansible/adhoc/rebuild-via-slurm.yml` playbook:
 
-    - hosts: control
-      become: yes
-      tasks:
-        - import_role:
-            name: rebuild
+- `rebuild_job_partitions`: Optional. Comma-separated list of names of rebuild
+  partitions defined in `openhpc_slurm_partitions`. Useful as an extra-var for
+  limiting rebuilds. Default `rebuild`.
 
-License
--------
+- `rebuild_job_name`: Optional. Name of rebuild jobs. Default is `rebuild-`
+  suffixed with the node name.
 
-Apache-2.0
+- `rebuild_job_command`: Optional. String giving command to run in job after
+  node has been rebuilt. Default is to sleep for 5 seconds. Note job output is
+  sent to `/dev/null` by default, as the root user running this has no shared
+  directory for job output.
+- `rebuild_job_reboot`: Bool, whether to add the `--reboot` flag to the job
+  to actually trigger a rebuild. Useful for e.g. testing priorities. Default
+  `true`.
+
+- `rebuild_job_options`: Optional. A string giving any other options to pass to
+  [sbatch](https://slurm.schedmd.com/sbatch.html). Default is empty string.
+
+- `rebuild_job_user`: Optional. The user to run the rebuild setup and job as.
+  Default `root`.
+
+- `rebuild_job_template`: Optional. The string to use to submit the job. See
+  [defaults/main.yml](defaults/main.yml).
+
+- `rebuild_job_hostlist`: String with a Slurm hostlist expression to restrict
+  a rebuild to only those nodes (e.g. `tux[1-3]` or `tux1,tux2`). If set,
+  `rebuild_job_partitions` must only define a single partition and that partition
+  must contain those nodes. Not for routine use, but may be useful to e.g.
+  reattempt a rebuild if this failed on specific nodes. Default is all nodes
+  in the relevant partition.
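+
+Example Usage
+-------------
+
+A minimal sketch only, assuming the default `rebuild` partition described above
+has been defined in `openhpc_slurm_partitions`: the variables above are normally
+overridden as extra-vars when running the adhoc playbook, e.g.:
+
+    # submit rebuild jobs for all nodes in the default "rebuild" partition:
+    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
+
+    # submit jobs without the --reboot flag, e.g. to check scheduling behaviour only:
+    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e rebuild_job_reboot=false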
diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml index 06b237ef2..948283633 100644 --- a/ansible/roles/rebuild/defaults/main.yml +++ b/ansible/roles/rebuild/defaults/main.yml @@ -1,2 +1,23 @@ --- -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file + +rebuild_clouds_path: ~/.config/openstack/clouds.yaml + +rebuild_job_partitions: rebuild +rebuild_job_name: "rebuild-{{ item }}" # item is nodename +rebuild_job_command: 'sleep 5' +rebuild_job_reboot: true +rebuild_job_options: '' +rebuild_job_user: root +rebuild_job_template: >- + sbatch + --nodelist={{ item }} + {{ '--reboot' if rebuild_job_reboot | bool else '' }} + --job-name={{ rebuild_job_name }} + --nodes=1 + --exclusive + --partition={{ _rebuild_job_current_partition }} + --no-requeue + --output=/dev/null + --wrap="{{ rebuild_job_command }}" + {{ rebuild_job_options }} +#rebuild_job_hostlist: \ No newline at end of file diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index c677716c7..5612ab515 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -10,7 +10,7 @@ - name: Copy out clouds.yaml copy: - src: "{{ openhpc_rebuild_clouds }}" + src: "{{ rebuild_clouds_path }}" dest: /etc/openstack/clouds.yaml owner: slurm group: root diff --git a/ansible/roles/rebuild/tasks/rebuild.yml b/ansible/roles/rebuild/tasks/rebuild.yml new file mode 100644 index 000000000..466951f63 --- /dev/null +++ b/ansible/roles/rebuild/tasks/rebuild.yml @@ -0,0 +1,11 @@ +- name: Create rebuild jobs for partition + include_tasks: + file: rebuild_partition.yml + args: + apply: + become: yes + become_user: "{{ rebuild_job_user }}" + loop: "{{ rebuild_job_partitions | split(',') }}" + loop_control: + loop_var: _rebuild_job_current_partition + diff --git a/ansible/roles/rebuild/tasks/rebuild_partition.yml b/ansible/roles/rebuild/tasks/rebuild_partition.yml new file mode 100644 index 000000000..0ce4ee88a --- /dev/null +++ b/ansible/roles/rebuild/tasks/rebuild_partition.yml @@ -0,0 +1,21 @@ +- name: Get list of nodes in partition + ansible.builtin.command: + cmd: >- + sinfo + --Node + --Format=NodeList + --noheader + --partition={{ _rebuild_job_current_partition }} + register: _sinfo_partition + when: rebuild_job_hostlist is not defined + +- name: Expand rebuild_job_hostlist to host names + ansible.builtin.command: + cmd: "scontrol show hostnames {{ rebuild_job_hostlist }}" + register: _scontrol_hostnames + when: rebuild_job_hostlist is defined + +- name: Submit rebuild jobs + ansible.builtin.command: + cmd: "{{ rebuild_job_template }}" + loop: "{{ _scontrol_hostnames.stdout_lines | default(_sinfo_partition.stdout_lines) }}" diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md new file mode 100644 index 000000000..4320071da --- /dev/null +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -0,0 +1,310 @@ +# EXPERIMENTAL: Slurm Controlled Rebuild + +This page describes how to configure the appliance to enable reimaging of +Slurm nodes via submission of Slurm jobs, and how to use that functionality. +This provides a way to upgrade nodes with less impact than the normal approach. + +> [!NOTE] +> This functionality is marked as experimental as the required configuration +> or usage may change with further development. + +In summary, the way this functionality works is as follows: +1. 
The image reference(s) are manually updated in the OpenTofu configuration
+   in the normal way.
+2. `tofu apply` is run which rebuilds the login and control nodes to the new
+   image(s). The new image reference for compute nodes is ignored, but is
+   written into the hosts inventory file (and is therefore available as an
+   Ansible hostvar).
+3. The `site.yml` playbook is run which reconfigures the cluster as normal. At
+   this point the cluster is functional, but using a new image for the login
+   and control nodes and the old image for the compute nodes. This playbook
+   also:
+   - Writes cluster configuration to the control node, using the
+     [compute_init](../../ansible/roles/compute_init/README.md) role.
+   - Configures an application credential and helper programs on the control
+     node, using the [rebuild](../../ansible/roles/rebuild/README.md) role.
+4. An admin submits Slurm jobs, one for each node, to a special "rebuild"
+   partition using an Ansible playbook. Because this partition has higher
+   priority than the partitions normal users can use, these rebuild jobs become
+   the next job in the queue for every node (although any jobs currently
+   running will complete as normal).
+5. Because these rebuild jobs have the `--reboot` flag set, before launching them
+   the Slurm control node runs a [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram)
+   which compares the current image for the node to the one in the cluster
+   configuration, and if it does not match, uses OpenStack to rebuild the
+   node to the desired (updated) image.
+   TODO: Describe the logic if they DO match
+6. After a rebuild, the compute node runs various Ansible tasks during boot,
+   controlled by the [compute_init](../../ansible/roles/compute_init/README.md)
+   role, to fully configure the node again. It retrieves the required cluster
+   configuration information from the control node via an NFS mount.
+7. Once the `slurmd` daemon starts on a compute node, the slurm controller
+   registers the node as having finished rebooting. It then launches the actual
+   job, which does not do anything.
+
+   # TODO: check that this is the LAST thing we do?
+
+
+
+TODO: note terraform parallel limits
+
+nova compute: 10 per nova controller, so either per HV or whole-cloud BM * (different in Caracel onwards). Could tune to 50-100 if properly set.
+
+
+## Prerequisites
+
+To enable a compute node to rejoin the cluster after a vrebuild, functionality
+must be built into the image. Before progressing you should check that all the
+functionality required for your cluster is currently supported by the
+`compute_init` role. Review that role's [README](../../ansible/roles/compute_init/README.md)
+against `environments/*/inventory/groups` files (and any similar files which
+define groups). Note that some functionality does not require support, e.g.
+because it does not run on compute nodes.
+
+## Configuration
+
+The configuration of this is complex and involves:
+- OpenTofu variables to stop tracking image changes on compute nodes
+- Definition of partition(s) to use for launching rebuild jobs
+- Configuration of the [rebuild](../../ansible/roles/rebuild/README.md) role
+  to enable the Slurm controller to rebuild compute nodes via OpenStack.
+- Configuration of the [compute_init](../../ansible/roles/compute_init/README.md)
+  role so that compute nodes rejoin the cluster after rebuilding - this is likely
+  to require a custom image build.
+
+1. Decide on which nodes rebuilding via Slurm should be enabled. These are
+   referred to as the "rebuildable" nodes below. Generally, this can be all
+   compute nodes.
+
+2. Configure OpenTofu not to manage image changes on rebuildable nodes: For each
+   relevant node group in the OpenTofu `compute` variable, set the
+   parameter `ignore_image_changes: true`. E.g.
+
+   ```terraform
+   # environments/$ENV/main.tf:
+   ...
+   compute = {
+     general = {
+       nodes = ["general-0", "general-1"]
+       ignore_image_changes: true
+     }
+   }
+   ...
+   ```
+
+3. Follow the [compute_init](../../ansible/roles/compute_init/README.md) README
+   to add OpenTofu and Ansible configuration for that role. The "rebootable"
+   nodes should all be in the `compute_init` group with the `compute_init_enable`
+   OpenTofu parameter set.
+
+4. If the [compute_init](../../ansible/roles/compute_init/README.md) README
+   showed that a custom image is required for any entry in the
+   `compute_init_enable` parameter, follow the usual process to build new
+   images as required.
+
+5. Update image references in the OpenTofu configuration. Normally these should
+   be in:
+   - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default
+     cluster image.
+   - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups
+     defined in the `compute` or `login` variables, to override the default
+     image for specific node groups.
+
+5. Modify `openhpc_slurm_partitions` to add a new partition covering rebuildable
+   nodes to use for rebuild jobs. If using the default OpenTofu
+   configurations, this variable is contained in an OpenTofu-templated file
+   `environments/$ENV/group_vars/all/partitions.yml` which must be overridden
+   by copying it to e.g. a `z_partitions.yml` file in the same directory.
+   However production sites will probably be overriding this file anyway to
+   customise it.
+
+   An example partition definition is:
+
+   ```yaml
+   openhpc_slurm_partitions:
+     ...
+     - name: rebuild
+       groups:
+         - name: general
+       default: NO
+       maxtime: 30
+       partition_params:
+         PriorityJobFactor: 65533
+         Hidden: YES
+         RootOnly: YES
+         DisableRootJobs: NO
+         PreemptMode: 'OFF'
+         OverSubscribe: EXCLUSIVE
+   ```
+
+   Which has parameters as follows:
+   TODO: update me!
+   - `name`: Partition name matching `rebuild` role variable `rebuild_job_partitions`,
+     default `rebuild`.
+   - `groups`: A list of node group names, matching keys in the OpenTofu `compute`
+     variable (see example configuration above). See discussion below.
+   - `default`: Must be set to `NO` so that it is not the default partition.
+   - `maxtime`: Maximum time to allow for rebuild jobs, in
+     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
+     The example here is 30 minutes, but see discussion below
+   - `partition_params`: A mapping of additional parameters, which must be set
+     as follows:
+     - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs)
+       are always scheduled before jobs in "normal" partitions on the same
+       nodes. This value is the highest which can be set. See
+       [slurm.conf docs](https://slurm.schedmd.com/slurm.conf.html#OPT_PriorityJobFactor).
+       Note this is used instead of `PriorityTier` as the latter (with the
+       default appliance configuration) allows rebuild jobs to preempt and
+       suspend running user jobs, which is probably undesirable.
+     - `Hidden`: Don't show this partition in e.g. `sinfo` for unprivileged
+       users.
+     - `RootOnly`: Only allow the root user to submit jobs to this partition.
+     - `DisableRootJobs`: Don't disable the root user, in case this parameter
+       is set globally via `openhpc_config_extra`.
+     - `PreemptMode`: Don't allow reboot jobs to be preempted/suspended.
+     - `OverSubscribe`: Ensure that jobs run in this partition require the
+       entire node. This means they do not run on nodes at the same time as
+       user jobs running in partitions allowing non-exclusive use.
+
+   Note that this partition overlaps with "normal" partitions. If it is
+   desirable to roll out changes more gradually, it is possible to create
+   multiple "rebuild" partitions, but it is necessary that:
+   - The rebuild partitions should not themselves overlap, else nodes may be
+     rebuilt more than once.
+   - Each rebuild partition should entirely cover one or more "normal"
+     partitions, to avoid the possibility of user jobs being scheduled to a
+     mix of nodes using old and new images.
+
+6. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role:
+   - Add the `control` node into the `rebuild` group.
+   - Ensure an application credential to use for rebuilding nodes is available
+     on the deploy host (default location `~/.config/openstack/clouds.yaml`).
+     If not using that location override `rebuild_clouds`.
+   - **TODO:** CONFIGURE rebuild job defaults!
+
+7. Run `tofu apply` as usual to apply the new OpenTofu configuration.
+
+   > [!NOTE]
+   > If the cluster image references were updated at step 5, this will be
+   > a disruptive operation and should be planned as part of a normal upgrade
+   > cycle.
+
+   > [!CAUTION]
+   > Due to OpenTofu/Terraform state limitations, this will plan to delete and
+   > recreate all compute nodes in node groups where `ignore_image_changes: true`
+   > was not previously set. This is a one-time issue with adding this
+   > parameter, i.e. subsequent applies will not require this.
+
+TODO: clarify whether, if the image is bumped at this point, the compute nodes
+actually get recreated on the new or the old image??
+
+8. Run the `site.yml` playbook as normal to configure the cluster.
+
+The cluster is now ready to perform slurm-controlled upgrades as described in
+the next section.
+
+## Operations with Slurm-controlled Rebuilds
+
+This section describes how to trigger and control Slurm-controlled rebuilds.
+However, in general these are likely to be done as part of a cluster
+upgrade. As described in the introduction to this page, that will involve
+rebuilding the login and control nodes to the new image then re-running the
+`site.yml` playbook to reconfigure the cluster. That process is disruptive in
+that users have no access via SSH or Open OnDemand while it is occurring.
+However there is no need to drain compute nodes and create reservations etc.
+
+Triggering rebuild jobs is done using the following playbook:
+
+    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
+
+This will create jobs to reimage every slurm-rebuildable node to the image
+currently defined in the OpenTofu configuration.
+
+Note that some of the [rebuild role variables](../../ansible/roles/rebuild/README.md)
+may also be useful as extravars, especially for testing or debugging.
+
+## Testing
+
+The below demonstrates testing this in the `.stackhpc` CI environment, using:
+- A 2-node default "standard" partition.
+- A 2-node "extra" partition (note this does not usually have any nodes by default).
+
+In one terminal launch a watch of job state:
+
+    [root@RL9-control rocky]# clear && ~/ewatch/ewatch.py -n 1 -i '\d+:\d+' 'squeue --all --Format=PARTITION,NAME:25,USERNAME:11,STATE:12,NUMNODES:8,NODELIST'
+
+This uses [ewatch](https://github.com/sjpb/ewatch) to summarise changes in
+output.
+ +In a second terminal, launch 2x normal jobs into the default ("standard") +partition: + + [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobA --wrap "sleep 20" && sbatch -N2 --job-name=JobB --wrap "sleep 10" + +In a third terminal, trigger rebuild jobs: + + .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_reboot=false rebuild_job_command="sleep 30"' - + +Back in the second terminal, submit more user jobs to either partition: + + [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobC --partition,standard,extra --wrap "sleep 10" + +The output from the first terminal should show: +- Job A runs on submission in the default "standard" partition. +- Job B pends for the default "standard" partition. +- Rebuild jobs runs on submission in the "extra" partition and pend for the "standard" partition +- Job C pends for both partitions +- Job A completes +- Rebuild jobs run on the "standard" partition, jumping ahead of JobB and JobC +- Rebuild jobs complete in the "extra" paritition +- JobC runs in the "extra" partition +- JobC completes +- Rebuild jobs complete in the "standard" partition +- Job B runs in the "standard" partition + +Example output: +``` +[2025-03-28T14:26:34.510466] +PARTITION NAME USER STATE NODES NODELIST +standard JobB demo_user PENDING 2 +standard JobA demo_user RUNNING 2 RL9-compute-[0-1] + +[2025-03-28T14:26:38.530213] +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-1 root PENDING 1 +rebuild rebuild-RL9-compute-0 root PENDING 1 +rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 +rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 +standard JobB demo_user PENDING 2 +standard JobA demo_user RUNNING 2 RL9-compute-[0-1] +standard,extra JobC demo_user PENDING 2 + +[2025-03-28T14:26:54.609651] +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 +rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 +standard JobB demo_user PENDING 2 +standard,extra JobC demo_user PENDING 2 + +[2025-03-28T14:28:39.091571] +PARTITION NAME USER STATE NODES NODELIST +extra JobC demo_user RUNNING 2 RL9-extra-[0-1] +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +standard JobB demo_user PENDING 2 + +[2025-03-28T14:28:49.139349] +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +standard JobB demo_user PENDING 2 + +[2025-03-28T14:28:55.168264] +PARTITION NAME USER STATE NODES NODELIST +standard JobB demo_user RUNNING 2 RL9-compute-[0-1] + +[2025-03-28T14:29:05.216346] +PARTITION NAME USER STATE NODES NODELIST +``` From b1639d0cd83912fe34b1ff60da32c9fb03101084 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 14:53:23 +0000 Subject: [PATCH 02/15] add (normally-empty) second partition for stackhpc --- environments/.stackhpc/tofu/main.tf | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 7c6b774fb..8d78401bf 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -81,11 +81,12 @@ module "cluster" { compute_init_enable: ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", 
"cacerts"] ignore_image_changes: true } - # Example of how to add another partition: - # extra: { - # nodes: ["compute-2", "compute-3"] - # flavor: var.other_node_flavor - # } + # Normally-empty partition for testing: + extra: { + nodes: [] + #nodes: ["extra-0", "extra-1"] + flavor: var.other_node_flavor + } } volume_backed_instances = var.volume_backed_instances From 2d287cbea674a1faa074a6276935e39809e4a27d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 14:59:44 +0000 Subject: [PATCH 03/15] define rebuild partition for stackhpc --- .../inventory/group_vars/all/z_partitions.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 environments/.stackhpc/inventory/group_vars/all/z_partitions.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml b/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml new file mode 100755 index 000000000..ea489770e --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml @@ -0,0 +1,18 @@ +# override tofu-generated file: +openhpc_slurm_partitions: + - name: extra + - name: standard + - name: rebuild + groups: + - name: extra + - name: standard + default: NO + maxtime: 30 + partition_params: + PriorityJobFactor: 65533 + Hidden: YES + RootOnly: YES + DisableRootJobs: NO + PreemptMode: 'OFF' + OverSubscribe: EXCLUSIVE + From 5c3c90eddeb9c860c847c38e334f23b8dfafaba0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:00:48 +0000 Subject: [PATCH 04/15] add leafcloud-dev tf vars file --- environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars diff --git a/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars b/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars new file mode 100644 index 000000000..82e336dd8 --- /dev/null +++ b/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars @@ -0,0 +1,10 @@ +cluster_networks = [ + { + network = "stackhpc-dev" + subnet = "stackhpc-dev" + } +] +control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment +other_node_flavor = "en1.xsmall" +state_volume_type = "unencrypted" +home_volume_type = "unencrypted" From 3b79b77a20ca6303f8d221b3ad672b70473b7201 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:07:03 +0000 Subject: [PATCH 05/15] rebuild docs tweak --- docs/experimental/slurm-controlled-rebuild.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 4320071da..53de62bc6 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -221,7 +221,12 @@ This will create jobs to reimage every slurm-rebuildable node to the image currently defined in the OpenTofu configuration. Note that some of the [rebuild role variables](../../ansible/roles/rebuild/README.md) -may also be useful as extravars, especially for testing or debugging. +may also be useful as extravars, especially for testing or debugging. 
For +example the following comand will run in a non-default partition and does not +actually reboot/rebuild nodes, which may be useful for testing interactions with +other priority or QOS settings: + + ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_partitions=test rebuild_job_reboot=false' ## Testing From af9aa52a4eb9c103a601a3dea5c2ac460c8f451f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:16:04 +0000 Subject: [PATCH 06/15] ensure user limits via slurm are in place before starting slurmd --- .../roles/compute_init/files/compute-init.yml | 13 ++++++------ ansible/slurm.yml | 20 +++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 25af01154..06d62c798 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -324,12 +324,6 @@ enabled: true state: started - - name: Ensure slurmd service state - service: - name: slurmd - enabled: true - state: started - - name: Set locked memory limits on user-facing nodes lineinfile: path: /etc/security/limits.conf @@ -351,6 +345,13 @@ +:adm:ALL -:ALL:ALL + - name: Ensure slurmd service state + service: + name: slurmd + enabled: true + state: started + + - name: Ensure node is resumed # TODO: consider if this is always safe for all job states? command: scontrol update state=resume nodename={{ ansible_hostname }} diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 80812ae7d..d1bb93a9f 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -19,16 +19,6 @@ - import_role: name: rebuild -- name: Setup slurm - hosts: openhpc - become: yes - tags: - - openhpc - tasks: - - include_role: - name: stackhpc.openhpc - tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - - name: Set locked memory limits on user-facing nodes hosts: - compute @@ -63,3 +53,13 @@ +:adm:ALL -:ALL:ALL # vagrant uses (deprecated) ansible_ssh_user + +- name: Setup slurm + hosts: openhpc + become: yes + tags: + - openhpc + tasks: + - include_role: + name: stackhpc.openhpc + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" From 25fb17d9214494a4f9e4347e1ae353e35ad3bbf7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:42:38 +0000 Subject: [PATCH 07/15] add docs re. parellelism --- ansible/roles/rebuild/README.md | 6 +-- docs/experimental/slurm-controlled-rebuild.md | 37 ++++++++++++------- docs/production.md | 19 ++++++++++ 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index ddce17933..58cb26502 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -34,9 +34,9 @@ running the `ansible/adhoc/rebuild-via-slurm.yml` playbook: send to `/dev/null` by default, as the root user running this has no shared directory for job output. -- `rebuild_job_reboot`: Bool, whether to add the `--reboot` flag to the job - to actually trigger a rebuild. Useful for e.g. testing priorities. Default - `true`. +- `rebuild_job_reboot`: Optional. A bool controlling whether to add the + `--reboot` flag to the job to actually trigger a rebuild. Useful for e.g. + testing partition configurations. Default `true`. - `rebuild_job_options`: Optional. A string giving any other options to pass to [sbatch](https://slurm.schedmd.com/sbatch.html). Default is empty string. 
diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md
index 53de62bc6..f7aae3ced 100644
--- a/docs/experimental/slurm-controlled-rebuild.md
+++ b/docs/experimental/slurm-controlled-rebuild.md
@@ -42,9 +42,6 @@ In summary, the way this functionality works is as follows:
    registers the node as having finished rebooting. It then launches the actual
    job, which does not do anything.
 
-   # TODO: check that this is the LAST thing we do?
-
-
 
 TODO: note terraform parallel limits
 
@@ -86,7 +83,13 @@ The configuration of this is complex and involves:
    ...
    compute = {
      general = {
        nodes = ["general-0", "general-1"]
-       ignore_image_changes: true
+       ignore_image_changes = true
+       ...
+     }
+     gpu = {
+       nodes = ["a100-0", "a100-1"]
+       ignore_image_changes = true
+       ...
+     }
    }
    ...
@@ -118,7 +121,8 @@ The configuration of this is complex and involves:
    However production sites will probably be overriding this file anyway to
    customise it.
 
-   An example partition definition is:
+   An example partition definition, given the two node groups "general" and
+   "gpu" shown in Step 2, is:
 
    ```yaml
    openhpc_slurm_partitions:
@@ -126,6 +130,7 @@ The configuration of this is complex and involves:
      - name: rebuild
        groups:
          - name: general
+         - name: gpu
        default: NO
        maxtime: 30
        partition_params:
@@ -138,15 +143,16 @@ The configuration of this is complex and involves:
    ```
 
    Which has parameters as follows:
-   TODO: update me!
    - `name`: Partition name matching `rebuild` role variable `rebuild_job_partitions`,
      default `rebuild`.
-   - `groups`: A list of node group names, matching keys in the OpenTofu `compute`
-     variable (see example configuration above). See discussion below.
+   - `groups`: A list of node group names, matching keys in the OpenTofu
+     `compute` variable (see example in step 2 above). Normally every compute
+     node group should be listed here, unless Slurm-controlled rebuild is not
+     required for certain node groups.
   - `default`: Must be set to `NO` so that it is not the default partition.
   - `maxtime`: Maximum time to allow for rebuild jobs, in
     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
-     The example here is 30 minutes, but see discussion below
+     The example here is 30 minutes, but see discussion below.
   - `partition_params`: A mapping of additional parameters, which must be set
     as follows:
     - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs)
@@ -166,9 +172,12 @@ The configuration of this is complex and involves:
        entire node. This means they do not run on nodes at the same time as
        user jobs running in partitions allowing non-exclusive use.
 
-   Note that this partition overlaps with "normal" partitions. If it is
-   desirable to roll out changes more gradually, it is possible to create
-   multiple "rebuild" partitions, but it is necessary that:
+   The value for `maxtime` needs to be sufficient not just for a single node
+   to be rebuilt, but also to allow for any batching in either OpenTofu or
+   in Nova - see remarks in the [production docs](../production.md).
+
+   If it is desirable to roll out changes more gradually, it is possible to
+   create multiple "rebuild" partitions, but it is necessary that:
    - The rebuild partitions should not themselves overlap, else nodes may be
      rebuilt more than once.
    - Each rebuild partition should entirely cover one or more "normal"
     partitions, to avoid the possibility of user jobs being scheduled to a
     mix of nodes using old and new images.
@@ -179,8 +188,8 @@ The configuration of this is complex and involves:
 
 6. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role:
    - Add the `control` node into the `rebuild` group.
- Ensure an application credential to use for rebuilding nodes is available on the deploy host (default location `~/.config/openstack/clouds.yaml`). - If not using that location override `rebuild_clouds`. - - **TODO:** CONFIGURE rebuild job defaults! + - If required, override `rebuild_clouds_path` or other variables in the site + environment. 7. Run `tofu apply` as usual to apply the new OpenTofu configuration. diff --git a/docs/production.md b/docs/production.md index 7876af126..c15298887 100644 --- a/docs/production.md +++ b/docs/production.md @@ -130,3 +130,22 @@ and referenced from the `site` and `production` environments, e.g.: - See the [hpctests docs](../ansible/roles/hpctests/README.md) for advice on raising `hpctests_hpl_mem_frac` during tests. + +- By default, OpenTofu (and Terraform) [limits](https://opentofu.org/docs/cli/commands/apply/#apply-options) + the number of concurrent operations to 10. This means that for example only + 10 ports or 10 instances can be deployed at once. This should be raised by + modifying `environments/$ENV/activate` to add a line like: + + export TF_CLI_ARGS_apply="-parallelism=25" + + The value chosen should be the highest value demonstrated during testing. + Note that any time spent blocked due to this parallelism limit does not count + against the (un-overridable) internal OpenTofu timeout of 30 minutes + +- By default, OpenStack Nova also [limits](https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.max_concurrent_builds) + the number of concurrent instance builds to 10. This is per Nova controller, + so 10x virtual machines per hypervisor. For baremetal nodes it is 10 per cloud + if the OpenStack version is earlier than Caracel, else this limit can be + raised using [shards](https://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/ironic-shards.html). + In general it should be possible to raise this value to 50-100 if the cloud + is properly tuned, again, demonstrated through testing. From f4a5fb31197ddf463fd4d2a360f162513b04680a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:43:14 +0000 Subject: [PATCH 08/15] update expected stackhpc sinfo for empty 'extra' partition --- ansible/ci/check_slurm.yml | 1 + docs/experimental/slurm-controlled-rebuild.md | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 4d77c6fb5..00e8d62cf 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -21,4 +21,5 @@ vars: expected_sinfo: + - " extra up 60-00:00:00 0 n/a" # empty partition - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle" diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index f7aae3ced..ca8081079 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -42,12 +42,6 @@ In summary, the way this functionality works is as follows: registers the node as having finished rebooting. It then launches the actual job, which does not do anything. - -TODO: note terraform parallel limits - -nova compute: 10 per nova controller, so either per HV or whole-cloud BM * (different in Caracel onwards). Could tune to 50-100 if properly set. 
- - ## Prerequsites To enable a compute node to rejoin the cluster after a vrebuild, functionality From 5d50649393c9eaebbe52f35e5e5f8dc99f18328a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Mar 2025 15:43:53 +0000 Subject: [PATCH 09/15] use new rebuild adhoc for CI --- .github/workflows/stackhpc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 4c7e28b2b..348edff35 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -173,11 +173,11 @@ jobs: ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Reimage compute nodes to image in current branch using slurm - tests compute-init + - name: Reimage compute nodes to image in current branch using slurm run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml + ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage to current branch From 916460bb378d9424fb005c7c3ed6cb845b52edb3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Apr 2025 09:00:29 +0000 Subject: [PATCH 10/15] fixup empty partition for CI check --- ansible/ci/check_slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 00e8d62cf..feac502ed 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -21,5 +21,5 @@ vars: expected_sinfo: - - " extra up 60-00:00:00 0 n/a" # empty partition + - " extra up 60-00:00:00 0 n/a" # empty partition - "{{ openhpc_cluster_name }}-compute-[0-1] standard* up 60-00:00:00 2 idle" From d01dc4097b1d11e66496234cbbdb5bddd6d55db1 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 2 Apr 2025 10:26:56 +0100 Subject: [PATCH 11/15] Fix typo in docs/experimental/slurm-controlled-rebuild.md Co-authored-by: Will Szumski --- docs/experimental/slurm-controlled-rebuild.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index ca8081079..18ad42042 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -44,7 +44,7 @@ In summary, the way this functionality works is as follows: ## Prerequsites -To enable a compute node to rejoin the cluster after a vrebuild, functionality +To enable a compute node to rejoin the cluster after a rebuild, functionality must be built into the image. Before progressing you should check that all the functionality required for your cluster is currently supported by the `compute_init` role. 
Review that role's [README](../../ansible/roles/compute_init/README.md) From d42c6bac996b642bd88f059eec9c81104c375fa2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 4 Apr 2025 15:45:14 +0000 Subject: [PATCH 12/15] fix long nodenames getting truncated when listing partitions --- ansible/roles/rebuild/tasks/rebuild_partition.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/rebuild/tasks/rebuild_partition.yml b/ansible/roles/rebuild/tasks/rebuild_partition.yml index 0ce4ee88a..3b319e6e2 100644 --- a/ansible/roles/rebuild/tasks/rebuild_partition.yml +++ b/ansible/roles/rebuild/tasks/rebuild_partition.yml @@ -3,7 +3,7 @@ cmd: >- sinfo --Node - --Format=NodeList + --format=%N --noheader --partition={{ _rebuild_job_current_partition }} register: _sinfo_partition From 1343125d808d53d2cd294d3ee3d581f1889370dc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Apr 2025 09:10:44 +0000 Subject: [PATCH 13/15] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index ea8d46e4a..814769ed1 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250401-1100-9a3cffdb", - "RL9": "openhpc-RL9-250401-1100-9a3cffdb" + "RL8": "openhpc-RL8-250408-0812-c3b68b9c", + "RL9": "openhpc-RL9-250408-0813-c3b68b9c" } } From 1087f19275905207b922da62f54d8a17d41edc61 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Apr 2025 09:11:20 +0000 Subject: [PATCH 14/15] cope with nodes still running reboot job in check_slurm playbook --- ansible/ci/check_slurm.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index feac502ed..8d2b08350 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,19 +6,9 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout or "down" in sinfo.stdout) + until: sinfo.stdout_lines == expected_sinfo retries: 10 delay: 5 - - name: Check nodes have expected slurm state - assert: - that: sinfo.stdout_lines == expected_sinfo - fail_msg: | - sinfo output not as expected: - actual: - {{ sinfo.stdout_lines }} - expected: - {{ expected_sinfo }} - vars: expected_sinfo: - " extra up 60-00:00:00 0 n/a" # empty partition From 6c65f72bf33a52961be405346e17a9c247b0f780 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Apr 2025 11:02:03 +0000 Subject: [PATCH 15/15] increase retries when checking slurm state in CI --- ansible/ci/check_slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 8d2b08350..ff527da06 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -7,7 +7,7 @@ register: sinfo changed_when: false until: sinfo.stdout_lines == expected_sinfo - retries: 10 + retries: 200 delay: 5 vars: expected_sinfo: