From e42282087420592c320ca285476fc46eba15f693 Mon Sep 17 00:00:00 2001 From: scrungus Date: Wed, 15 May 2024 20:26:32 +0000 Subject: [PATCH 01/78] add storage VLAN interface on all slurm nodes --- .../cluster_infra/templates/resources.tf.j2 | 89 ++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 3de64f12c..dc991a6df 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -116,6 +116,11 @@ data "openstack_networking_network_v2" "cluster_external_network" { name = "{{ cluster_external_network }}" } +# Always get the SRIOV storage network +data "openstack_networking_network_v2" "portal_storage_direct" { + tags = ["portal-storage-direct"] +} + data "openstack_networking_subnet_ids_v2" "cluster_external_subnets" { network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" } @@ -177,6 +182,11 @@ data "openstack_networking_subnet_v2" "cluster_subnet" { ##### Cluster ports ##### +### +# Login node +### + +# VXLAN resource "openstack_networking_port_v2" "login" { name = "{{ cluster_name }}-login-0" network_id = "${data.openstack_networking_network_v2.cluster_network.id}" @@ -201,6 +211,29 @@ resource "openstack_networking_port_v2" "login" { } } +# Storage VLAN +resource "openstack_networking_port_v2" "login_portal_storage" { + name = "{{ cluster_name }}-login-0-portal-storage" + network_id = data.openstack_networking_network_v2.portal_storage_direct.id + admin_state_up = "true" + + # not sure if needed here + security_group_ids = [ + "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}", + "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" + ] + + # for now, until we set up flavors + binding { + vnic_type = "normal" + } +} + +### +# Control node +### + +# VLXAN resource "openstack_networking_port_v2" "control" { name = "{{ cluster_name }}-control-0" network_id = "${data.openstack_networking_network_v2.cluster_network.id}" @@ -224,7 +257,29 @@ resource "openstack_networking_port_v2" "control" { } } +# Storage VLAN + +resource "openstack_networking_port_v2" "control_portal_storage" { + name = "{{ cluster_name }}-control-0-portal-storage" + network_id = data.openstack_networking_network_v2.portal_storage_direct.id + admin_state_up = "true" + + # not sure if needed here + security_group_ids = [ + "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" + ] + + # for now, until we set up flavors + binding { + vnic_type = "normal" + } +} + +### +# Workers +### {% for partition in openhpc_slurm_partitions %} +# VXLAN resource "openstack_networking_port_v2" "{{ partition.name }}" { count = {{ partition.count }} name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}" @@ -249,6 +304,24 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" { } } +# Storage VLAN +resource "openstack_networking_port_v2" "{{ partition.name }}_portal_storage" { + count = {{ partition.count }} + name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}-portal-storage" + network_id = data.openstack_networking_network_v2.portal_storage_direct.id + admin_state_up = "true" + + # not sure if needed here + security_group_ids = [ + "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" + ] + + # for now, until we set up flavors + binding { + vnic_type = "normal" + } +} + {% endfor %} ##### @@ -274,7 +347,11 @@ resource 
"openstack_compute_instance_v2" "login" { {% endif %} network { - port = "${openstack_networking_port_v2.login.id}" + port = openstack_networking_port_v2.login.id + } + + network { + port = openstack_networking_port_v2.login_portal_storage.id } # root device: @@ -317,7 +394,11 @@ resource "openstack_compute_instance_v2" "control" { {% endif %} network { - port = "${openstack_networking_port_v2.control.id}" + port = openstack_networking_port_v2.control.id + } + + network { + port = openstack_networking_port_v2.control_portal_storage.id } # root device: @@ -393,6 +474,10 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { port = openstack_networking_port_v2.{{ partition.name }}[count.index].id } + network { + port = openstack_networking_port_v2.{{ partition.name }}_portal_storage[count.index].id + } + # root device: block_device { uuid = "{{ cluster_image }}" From 0572a5763c9fcc29d15f4175d0dffe83d57d14c2 Mon Sep 17 00:00:00 2001 From: scrungus Date: Tue, 28 May 2024 11:35:09 +0000 Subject: [PATCH 02/78] configure storage network components based on `cluster_storage_network` ,set storage port mode based on `cluster_storage_vnic_type` --- .../cluster_infra/templates/resources.tf.j2 | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index dc991a6df..0dfc72604 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -116,10 +116,12 @@ data "openstack_networking_network_v2" "cluster_external_network" { name = "{{ cluster_external_network }}" } -# Always get the SRIOV storage network -data "openstack_networking_network_v2" "portal_storage_direct" { - tags = ["portal-storage-direct"] +# SRIOV storage network +{% if cluster_storage_network is defined %} +data "openstack_networking_network_v2" "cluster_storage_direct" { + name = "{{ cluster_storage_network }}" } +{% endif %} data "openstack_networking_subnet_ids_v2" "cluster_external_subnets" { network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" @@ -186,7 +188,7 @@ data "openstack_networking_subnet_v2" "cluster_subnet" { # Login node ### -# VXLAN +# Primary network resource "openstack_networking_port_v2" "login" { name = "{{ cluster_name }}-login-0" network_id = "${data.openstack_networking_network_v2.cluster_network.id}" @@ -212,9 +214,10 @@ resource "openstack_networking_port_v2" "login" { } # Storage VLAN -resource "openstack_networking_port_v2" "login_portal_storage" { - name = "{{ cluster_name }}-login-0-portal-storage" - network_id = data.openstack_networking_network_v2.portal_storage_direct.id +{% if cluster_storage_network is defined %} +resource "openstack_networking_port_v2" "login_storage" { + name = "{{ cluster_name }}-login-storage-0" + network_id = data.openstack_networking_network_v2.cluster_storage_direct.id admin_state_up = "true" # not sure if needed here @@ -225,15 +228,16 @@ resource "openstack_networking_port_v2" "login_portal_storage" { # for now, until we set up flavors binding { - vnic_type = "normal" + vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } } +{% endif %} ### # Control node ### -# VLXAN +# Primary network resource "openstack_networking_port_v2" "control" { name = "{{ cluster_name }}-control-0" network_id = "${data.openstack_networking_network_v2.cluster_network.id}" @@ -258,10 +262,10 @@ resource "openstack_networking_port_v2" 
"control" { } # Storage VLAN - -resource "openstack_networking_port_v2" "control_portal_storage" { - name = "{{ cluster_name }}-control-0-portal-storage" - network_id = data.openstack_networking_network_v2.portal_storage_direct.id +{% if cluster_storage_network is defined %} +resource "openstack_networking_port_v2" "control_storage" { + name = "{{ cluster_name }}-control-storage-0" + network_id = data.openstack_networking_network_v2.cluster_storage_direct.id admin_state_up = "true" # not sure if needed here @@ -271,15 +275,16 @@ resource "openstack_networking_port_v2" "control_portal_storage" { # for now, until we set up flavors binding { - vnic_type = "normal" + vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } } +{% endif %} ### # Workers ### {% for partition in openhpc_slurm_partitions %} -# VXLAN +# Primary network resource "openstack_networking_port_v2" "{{ partition.name }}" { count = {{ partition.count }} name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}" @@ -305,10 +310,11 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" { } # Storage VLAN -resource "openstack_networking_port_v2" "{{ partition.name }}_portal_storage" { +{% if cluster_storage_network is defined %} +resource "openstack_networking_port_v2" "{{ partition.name }}_storage" { count = {{ partition.count }} - name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}-portal-storage" - network_id = data.openstack_networking_network_v2.portal_storage_direct.id + name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}" + network_id = data.openstack_networking_network_v2.cluster_storage_direct.id admin_state_up = "true" # not sure if needed here @@ -318,9 +324,10 @@ resource "openstack_networking_port_v2" "{{ partition.name }}_portal_storage" { # for now, until we set up flavors binding { - vnic_type = "normal" + vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } } +{% endif %} {% endfor %} @@ -350,9 +357,11 @@ resource "openstack_compute_instance_v2" "login" { port = openstack_networking_port_v2.login.id } + {% if cluster_storage_network is defined %} network { - port = openstack_networking_port_v2.login_portal_storage.id + port = openstack_networking_port_v2.login_storage.id } + {% endif %} # root device: block_device { @@ -397,9 +406,11 @@ resource "openstack_compute_instance_v2" "control" { port = openstack_networking_port_v2.control.id } + {% if cluster_storage_network is defined %} network { - port = openstack_networking_port_v2.control_portal_storage.id + port = openstack_networking_port_v2.control_storage.id } + {% endif %} # root device: block_device { @@ -474,9 +485,11 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { port = openstack_networking_port_v2.{{ partition.name }}[count.index].id } + {% if cluster_storage_network is defined %} network { - port = openstack_networking_port_v2.{{ partition.name }}_portal_storage[count.index].id + port = openstack_networking_port_v2.{{ partition.name }}_storage[count.index].id } + {% endif %} # root device: block_device { From 40bf3e6cae3d6672bbb52c035629b23b1a3d4c6e Mon Sep 17 00:00:00 2001 From: scrungus Date: Tue, 28 May 2024 11:47:49 +0000 Subject: [PATCH 03/78] remove login sec group from storage port --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 0dfc72604..d8492663e 100644 
--- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -223,7 +223,6 @@ resource "openstack_networking_port_v2" "login_storage" { # not sure if needed here security_group_ids = [ "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}", - "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" ] # for now, until we set up flavors From 284d23bbc284b7958f2e7e015bd96b15c68062d3 Mon Sep 17 00:00:00 2001 From: scrungus Date: Tue, 28 May 2024 12:43:01 +0000 Subject: [PATCH 04/78] remove comments, storage network renamed --- .../cluster_infra/templates/resources.tf.j2 | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index d8492663e..a0b042b8f 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -116,9 +116,9 @@ data "openstack_networking_network_v2" "cluster_external_network" { name = "{{ cluster_external_network }}" } -# SRIOV storage network +# Storage network {% if cluster_storage_network is defined %} -data "openstack_networking_network_v2" "cluster_storage_direct" { +data "openstack_networking_network_v2" "cluster_storage" { name = "{{ cluster_storage_network }}" } {% endif %} @@ -217,15 +217,13 @@ resource "openstack_networking_port_v2" "login" { {% if cluster_storage_network is defined %} resource "openstack_networking_port_v2" "login_storage" { name = "{{ cluster_name }}-login-storage-0" - network_id = data.openstack_networking_network_v2.cluster_storage_direct.id + network_id = data.openstack_networking_network_v2.cluster_storage.id admin_state_up = "true" - # not sure if needed here security_group_ids = [ "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}", ] - # for now, until we set up flavors binding { vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } @@ -260,19 +258,17 @@ resource "openstack_networking_port_v2" "control" { } } -# Storage VLAN +# Storage network {% if cluster_storage_network is defined %} resource "openstack_networking_port_v2" "control_storage" { name = "{{ cluster_name }}-control-storage-0" - network_id = data.openstack_networking_network_v2.cluster_storage_direct.id + network_id = data.openstack_networking_network_v2.cluster_storage.id admin_state_up = "true" - # not sure if needed here security_group_ids = [ "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" ] - # for now, until we set up flavors binding { vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } @@ -308,20 +304,18 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" { } } -# Storage VLAN +# Storage network {% if cluster_storage_network is defined %} resource "openstack_networking_port_v2" "{{ partition.name }}_storage" { count = {{ partition.count }} name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}" - network_id = data.openstack_networking_network_v2.cluster_storage_direct.id + network_id = data.openstack_networking_network_v2.cluster_storage.id admin_state_up = "true" - # not sure if needed here security_group_ids = [ "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" ] - # for now, until we set up flavors binding { vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } From 87286e2f284074cb07984cfb4f9b616920ab42fa Mon Sep 17 00:00:00 2001 From: scrungus Date: Tue, 28 May 
2024 15:15:20 +0000 Subject: [PATCH 05/78] consistent comments --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index a0b042b8f..2fbb6da33 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -213,7 +213,7 @@ resource "openstack_networking_port_v2" "login" { } } -# Storage VLAN +# Storage network {% if cluster_storage_network is defined %} resource "openstack_networking_port_v2" "login_storage" { name = "{{ cluster_name }}-login-storage-0" From 08efaab5c136908bc216eedc42881e2f39e89055 Mon Sep 17 00:00:00 2001 From: scrungus Date: Tue, 28 May 2024 16:01:30 +0000 Subject: [PATCH 06/78] cluster_vnic_profile removed as no longer used --- .../cluster_infra/templates/resources.tf.j2 | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 2fbb6da33..03eab5afb 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -205,11 +205,6 @@ resource "openstack_networking_port_v2" "login" { binding { vnic_type = "{{ cluster_vnic_type | default('normal') }}" - {% if cluster_vnic_profile is defined %} - profile = < Date: Thu, 6 Jun 2024 11:46:33 +0100 Subject: [PATCH 07/78] Image update - OpenHPC v3.1 for RL9 (#394) * bump Packer source image to RL9.4 * downgrade OFED to LTS to get stable download url * bump OOD role, now ondemand dnf package installed will be latest * Revert Packer source image to RL9.3 to avoid hanging after post-update reboot" This reverts commit 851c494fa7b88581cfb4194f9e7f305b63f9e5c0. * bump OFED to get RL9.4-supported version * bump leafcloud packer vm to 8GB RAM * DEBUG: disable (working) OFED build * Revert "DEBUG: disable (working) OFED build" This reverts commit 45a48c3bab3f86c4f4e91534df3308fb653ad944. * DEBUG: output builder hostname * Revert "DEBUG: output builder hostname" This reverts commit 3f95f8ea3e14f2e3bbd022c0895d13cf8b5b4794. * fix build workflow concurrency * DEBUG: disable updates * Revert "DEBUG: disable updates" This reverts commit 3581a35529aa54cdaebaaba11d691f1684f22d0c. 
* bump packer build volume size for non-ofed to avoid RL8 build running out of root space * try to prevent stackhpc env image build connection drops * bump packer source image to fixed RL9.4 image * run test CI workflow on RL8 image if PR labeled with 'RL8' * bump CI images * bump openhpc role to fix munge checks on key path --- .github/workflows/fatimage.yml | 4 +++- .github/workflows/stackhpc.yml | 3 +++ ansible/roles/ofed/defaults/main.yml | 2 +- environments/.stackhpc/ARCUS.pkrvars.hcl | 3 ++- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 4 ++-- environments/.stackhpc/ansible.cfg | 2 +- environments/.stackhpc/terraform/main.tf | 6 +++--- .../common/inventory/group_vars/all/openondemand.yml | 2 -- packer/openstack.pkr.hcl | 2 +- requirements.yml | 4 ++-- 10 files changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index d3c9adaf7..9209e85ea 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -8,11 +8,13 @@ name: Build fat image description: Include RL8 image build type: boolean default: false +concurrency: + group: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + cancel-in-progress: true jobs: openstack: name: openstack-imagebuild runs-on: ubuntu-20.04 - concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS strategy: matrix: os_version: [RL8, RL9] diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index c8bb9b06f..d0f74ad1c 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -24,10 +24,13 @@ jobs: - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch rl8_branch: - ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge + rl8_label: + - ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created exclude: - os_version: RL8 rl8_selected: false rl8_branch: false + rl8_label: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 369e43a15..7233809bc 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,4 +1,4 @@ -ofed_version: 24.01-0.3.3.1 +ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4 ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 738a021c0..72e978c95 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,6 +1,7 @@ flavor = "vm.ska.cpu.general.small" use_blockstorage_volume = true -volume_size = 12 # GB. 
Compatible with SMS-lab's general.v1.tiny +volume_size = 15 # GB +volume_size_ofed = 15 # GB image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) ssh_keypair_name = "slurm-app-ci" diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 1a1bcd0ab..1f6ece01f 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -1,6 +1,6 @@ -flavor = "ec1.medium" +flavor = "ec1.large" use_blockstorage_volume = true -volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny +volume_size = 15 # GB volume_size_ofed = 15 # GB volume_type = "unencrypted" image_disk_format = "qcow2" diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index 139ffa033..aa0ec5aaf 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -12,5 +12,5 @@ roles_path = ../../ansible/roles filter_plugins = ../../ansible/filter_plugins [ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index a66aa256d..47e02f6b1 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/353 - RL8: "openhpc-RL8-240423-1002-4b09ba85" - RL9: "openhpc-ofed-RL9-240423-1059-4b09ba85" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/394 + RL8: "openhpc-RL8-240605-1205-a3002d19" + RL9: "openhpc-ofed-RL9-240605-1204-a3002d19" } } diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 18e741ce7..5e85392ca 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -13,8 +13,6 @@ # or include regex special characters. 
openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" -ondemand_package: ondemand-3.0.3 - # Add grafana to dashboard links to OOD only if grafana group is available openondemand_dashboard_links_grafana: - name: Grafana diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 0db3591f7..262b071f4 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -49,7 +49,7 @@ variable "fatimage_source_image_name" { type = map(string) default = { RL8: "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" - RL9: "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + RL9: "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } } diff --git a/requirements.yml b/requirements.yml index e00e19680..995329fbf 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.25.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/167 + version: v0.26.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/168 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc @@ -19,7 +19,7 @@ roles: # No versions available - src: https://github.com/OSC/ood-ansible.git name: osc.ood - version: v3.0.6 + version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount version: v24.2.0 # Support RockyLinux 9 From 453b1e6d4feb01089447cd8dcb124787ad68b2e5 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 6 Jun 2024 15:11:15 +0100 Subject: [PATCH 08/78] Support ceph quincy for RL9 (#397) * support ceph quincy for RL9 * bump CI image --- environments/.stackhpc/terraform/main.tf | 6 +++--- requirements.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 47e02f6b1..d4567f912 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/394 - RL8: "openhpc-RL8-240605-1205-a3002d19" - RL9: "openhpc-ofed-RL9-240605-1204-a3002d19" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/397 + RL8: "openhpc-RL8-240606-1054-5ec8558e" + RL9: "openhpc-ofed-RL9-240606-1054-5ec8558e" } } diff --git a/requirements.yml b/requirements.yml index 995329fbf..c0ded971b 100644 --- a/requirements.yml +++ b/requirements.yml @@ -22,7 +22,7 @@ roles: version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.2.0 # Support RockyLinux 9 + version: v24.5.1 # Support ceph quincy for RL9 collections: - name: containers.podman From 24c6b0563963167d4fc3bdb0f093e2bc86cc4c72 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 19 Jun 2024 14:37:34 +0100 Subject: [PATCH 09/78] Disable grafana repos by default (#399) * disable grafana repos by default * bump CI image --- environments/.stackhpc/terraform/main.tf | 6 +++--- requirements.yml | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 
d4567f912..c5ef5b699 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/397 - RL8: "openhpc-RL8-240606-1054-5ec8558e" - RL9: "openhpc-ofed-RL9-240606-1054-5ec8558e" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/399 + RL8: "openhpc-RL8-240619-0949-66c0e540" + RL9: "openhpc-ofed-RL9-240619-0949-66c0e540" } } diff --git a/requirements.yml b/requirements.yml index c0ded971b..757c851d5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -15,8 +15,7 @@ roles: version: 0.19.1 - src: https://github.com/stackhpc/ansible-grafana.git name: cloudalchemy.grafana - version: service-state - # No versions available + version: stackhpc-0.19.0 # fix grafana install - src: https://github.com/OSC/ood-ansible.git name: osc.ood version: v3.1.5 From 9d1ae1a72a61f00683c9e07bde846fe86c8565bd Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:24:18 +0100 Subject: [PATCH 10/78] Add squid role (#401) * add squid role + config * make systemd proxy config idempotent (and clearer) * fix defaults for squid/builder * bump CI image * fix squid cache dir definition error * Fix squid README nits Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * Fix squid README nits Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * Fix squid README nits Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * Fix squid README nits Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * make proxy default to squid port --------- Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> --- ansible/.gitignore | 4 +- ansible/bootstrap.yml | 24 +++++++-- ansible/roles/proxy/tasks/main.yml | 6 ++- ansible/roles/squid/README.md | 39 ++++++++++++++ ansible/roles/squid/defaults/main.yml | 24 +++++++++ ansible/roles/squid/handlers/main.yml | 5 ++ ansible/roles/squid/tasks/configure.yml | 24 +++++++++ ansible/roles/squid/tasks/install.yml | 3 ++ ansible/roles/squid/tasks/main.yml | 2 + ansible/roles/squid/templates/squid.conf.j2 | 54 +++++++++++++++++++ environments/.stackhpc/inventory/extra_groups | 4 ++ environments/.stackhpc/terraform/main.tf | 5 +- .../common/inventory/group_vars/all/proxy.yml | 2 + .../common/inventory/group_vars/all/squid.yml | 1 + .../inventory/group_vars/builder/defaults.yml | 4 ++ environments/common/inventory/groups | 4 ++ environments/common/layouts/everything | 3 ++ 17 files changed, 197 insertions(+), 11 deletions(-) create mode 100644 ansible/roles/squid/README.md create mode 100644 ansible/roles/squid/defaults/main.yml create mode 100644 ansible/roles/squid/handlers/main.yml create mode 100644 ansible/roles/squid/tasks/configure.yml create mode 100644 ansible/roles/squid/tasks/install.yml create mode 100644 ansible/roles/squid/tasks/main.yml create mode 100644 ansible/roles/squid/templates/squid.conf.j2 create mode 100644 environments/common/inventory/group_vars/all/proxy.yml create mode 100644 environments/common/inventory/group_vars/all/squid.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index bf59b57e9..6dc63b547 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -53,4 +53,6 @@ roles/* !roles/persist_hostkeys/ !roles/persist_hostkeys/** 
!roles/ofed/ -!roles/ofed/** \ No newline at end of file +!roles/ofed/** +!roles/squid/ +!roles/squid/** diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 8ea2cd54c..e9f6393e9 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -46,10 +46,6 @@ path: /etc/profile search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ state: absent - - name: Remove RHEL cockpit - dnf: - name: cockpit-ws - state: "{{ appliances_cockpit_state }}" - name: Add system user groups ansible.builtin.group: "{{ item.group }}" loop: "{{ appliances_local_users }}" @@ -91,6 +87,16 @@ policy: "{{ selinux_policy }}" register: sestatus +# --- tasks after here require access to package repos --- +- hosts: squid + tags: squid + gather_facts: yes + become: yes + tasks: + - name: Configure squid proxy + import_role: + name: squid + - hosts: freeipa_server # Done here as it might be providing DNS tags: @@ -104,7 +110,15 @@ name: freeipa tasks_from: server.yml -# --- tasks after here require access to package repos --- +- hosts: cluster + gather_facts: false + become: yes + tags: cockpit + tasks: + - name: Remove RHEL cockpit + dnf: + name: cockpit-ws + state: "{{ appliances_cockpit_state }}" - hosts: firewalld gather_facts: false diff --git a/ansible/roles/proxy/tasks/main.yml b/ansible/roles/proxy/tasks/main.yml index 3bc33cfa2..70a7eca67 100644 --- a/ansible/roles/proxy/tasks/main.yml +++ b/ansible/roles/proxy/tasks/main.yml @@ -43,8 +43,10 @@ path: /etc/systemd/system.conf.d/90-proxy.conf section: Manager option: DefaultEnvironment - value: > - "http_proxy={{ proxy_http_proxy }}" "https_proxy={{ proxy_http_proxy }}" "no_proxy={{ proxy_no_proxy }}" + value: >- + "http_proxy={{ proxy_http_proxy }}" + "https_proxy={{ proxy_http_proxy }}" + "no_proxy={{ proxy_no_proxy }}" no_extra_spaces: true owner: root group: root diff --git a/ansible/roles/squid/README.md b/ansible/roles/squid/README.md new file mode 100644 index 000000000..e514c3605 --- /dev/null +++ b/ansible/roles/squid/README.md @@ -0,0 +1,39 @@ +# squid + +Deploy a caching proxy. + +**NB:** The default configuration is aimed at providing a proxy for package installs etc. for +nodes which do not have direct internet connectivity. It assumes access to the proxy is protected +by the OpenStack security groups applied to the cluster. The generated configuration should be +reviewed if this is not case. + +## Role Variables + +Where noted these map to squid parameters of the same name without the `squid_` prefix - see [squid documentation](https://www.squid-cache.org/Doc/config) for details. + +- `squid_conf_template`: Optional str. Path (using Ansible search paths) to squid.conf template. Default is in-role template. +- `squid_started`: Optional bool. Whether to start squid service. Default `true`. +- `squid_enabled`: Optional bool. Whether squid service is enabled on boot. Default `true`. +- `squid_cache_mem`: Required str. Size of memory cache, e.g "1024 KB", "12 GB" etc. See squid parameter. +- `squid_cache_dir`: Optional. Path to cache directory. Default `/var/spool/squid`. +- `squid_cache_disk`: Required int. Size of disk cache in MB. See Mbytes under "ufs" store type for squid parameter [cache_dir](https://www.squid-cache.org/Doc/config/cache_dir/). +- `squid_maximum_object_size_in_memory`: Optional str. Upper size limit for objects in memory cache, default '64 MB'. See squid parameter. +- `squid_maximum_object_size`: Optional str. Upper size limit for objects in disk cache, default '200 MB'. See squid parameter. 
+- `squid_http_port`: Optional str. Socket addresses to listen for client requests, default '3128'. See squid parameter. +- `squid_acls`: Optional str, can be multiline. Define access lists. Default `acl anywhere src all`, i.e. rely on OpenStack security groups (or other firewall if deployed). See squid parameter `acl`. NB: The default template also defines acls for `SSL_ports` and `Safe_ports` as is common practice. +- `squid_http_access`: Optional str, can be multiline. Allow/deny access based on access lists. Default: + + # Deny requests to certain unsafe ports + http_access deny !Safe_ports + # Deny CONNECT to other than secure SSL ports + http_access deny CONNECT !SSL_ports + # Only allow cachemgr access from localhost + http_access allow localhost manager + http_access deny manager + # Rules allowing http access + http_access allow anywhere + http_access allow localhost + # Finally deny all other access to this proxy + http_access deny all + + See squid parameter. diff --git a/ansible/roles/squid/defaults/main.yml b/ansible/roles/squid/defaults/main.yml new file mode 100644 index 000000000..7457bdccf --- /dev/null +++ b/ansible/roles/squid/defaults/main.yml @@ -0,0 +1,24 @@ +squid_conf_template: squid.conf.j2 +squid_started: true +squid_enabled: true + +squid_cache_mem: "{{ undef(hint='squid_cache_mem required, e.g. \"12 GB\"') }}" +squid_cache_dir: /var/spool/squid +squid_cache_disk: "{{ undef(hint='squid_cache_disk (in MB) required, e.g. \"1024\"') }}" # always in MB +squid_maximum_object_size_in_memory: '64 MB' +squid_maximum_object_size: '200 MB' +squid_http_port: 3128 +squid_acls: acl anywhere src all # rely on openstack security groups +squid_http_access: | + # Deny requests to certain unsafe ports + http_access deny !Safe_ports + # Deny CONNECT to other than secure SSL ports + http_access deny CONNECT !SSL_ports + # Only allow cachemgr access from localhost + http_access allow localhost manager + http_access deny manager + # Rules allowing http access + http_access allow anywhere + http_access allow localhost + # Finally deny all other access to this proxy + http_access deny all diff --git a/ansible/roles/squid/handlers/main.yml b/ansible/roles/squid/handlers/main.yml new file mode 100644 index 000000000..135d98d3b --- /dev/null +++ b/ansible/roles/squid/handlers/main.yml @@ -0,0 +1,5 @@ +- name: Restart squid + service: + name: squid + state: restarted + when: squid_started | bool diff --git a/ansible/roles/squid/tasks/configure.yml b/ansible/roles/squid/tasks/configure.yml new file mode 100644 index 000000000..0d4dec681 --- /dev/null +++ b/ansible/roles/squid/tasks/configure.yml @@ -0,0 +1,24 @@ +- name: Ensure squid cache directory exists + file: + path: "{{ squid_cache_dir }}" + # based on what dnf package creates: + owner: squid + group: squid + mode: u=rwx,g=rw,o= + +- name: Template squid configuration + template: + src: "{{ squid_conf_template }}" + dest: /etc/squid/squid.conf + owner: squid + group: squid + mode: ug=rwX,go= + notify: Restart squid + +- meta: flush_handlers + +- name: Ensure squid service state + systemd: + name: squid + state: "{{ 'started' if squid_started | bool else 'stopped' }}" + enabled: "{{ true if squid_enabled else false }}" diff --git a/ansible/roles/squid/tasks/install.yml b/ansible/roles/squid/tasks/install.yml new file mode 100644 index 000000000..672186c48 --- /dev/null +++ b/ansible/roles/squid/tasks/install.yml @@ -0,0 +1,3 @@ +- name: Install squid package + dnf: + name: squid diff --git a/ansible/roles/squid/tasks/main.yml 
b/ansible/roles/squid/tasks/main.yml new file mode 100644 index 000000000..2b65e84b4 --- /dev/null +++ b/ansible/roles/squid/tasks/main.yml @@ -0,0 +1,2 @@ +- import_tasks: install.yml +- import_tasks: configure.yml diff --git a/ansible/roles/squid/templates/squid.conf.j2 b/ansible/roles/squid/templates/squid.conf.j2 new file mode 100644 index 000000000..b6d10e7dc --- /dev/null +++ b/ansible/roles/squid/templates/squid.conf.j2 @@ -0,0 +1,54 @@ +# +# Based on combination of configs from +# - https://github.com/stackhpc/docker-squid/blob/master/squid.conf +# - https://github.com/drosskopp/squid-cache/blob/main/squid.conf +# + +# Define ACLs: +{{ squid_acls }} + +acl SSL_ports port 443 +acl Safe_ports port 80 # http +acl Safe_ports port 21 # ftp +acl Safe_ports port 443 # https +acl Safe_ports port 70 # gopher +acl Safe_ports port 210 # wais +acl Safe_ports port 1025-65535 # unregistered ports +acl Safe_ports port 280 # http-mgmt +acl Safe_ports port 488 # gss-http +acl Safe_ports port 591 # filemaker +acl Safe_ports port 777 # multiling http +acl CONNECT method CONNECT + +# Rules allowing http access +{{ squid_http_access}} + +# Squid normally listens to port 3128 +http_port {{ squid_http_port }} + +# Define cache parameters: +cache_dir ufs /var/spool/squid {{ squid_cache_disk | int }} 16 256 +cache_mem {{ squid_cache_mem }} +maximum_object_size_in_memory {{ squid_maximum_object_size_in_memory }} +maximum_object_size {{ squid_maximum_object_size }} + +# Keep largest objects around longer: +cache_replacement_policy heap LFUDA + +memory_replacement_policy heap GDSF + +# Leave coredumps in the first cache dir +coredump_dir /var/spool/squid + +# Configure refresh +# cache repodata only few minutes and then query parent whether it is fresh: +refresh_pattern /XMLRPC/GET-REQ/.*/repodata/.*$ 0 1% 1440 ignore-no-cache reload-into-ims refresh-ims +# rpm will hardly ever change, force it to cache for very long time: +refresh_pattern \.rpm$ 10080 100% 525960 override-expire override-lastmod ignore-reload reload-into-ims +refresh_pattern ^ftp: 1440 20% 10080 +refresh_pattern ^gopher: 1440 0% 1440 +refresh_pattern -i (/cgi-bin/|\?) 0 0% 0 +refresh_pattern . 
0 20% 4320 + +# Disable squid doing logfile rotation as the RockyLinux dnf package configures logrotate +logfile_rotate 0 diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 62a693e19..90df4a02f 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -27,3 +27,7 @@ cluster # Allows demo; also installs manila client in fat image login compute + +[squid:children] +# Install squid into fat image +builder diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index c5ef5b699..d241fb0fc 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,8 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/399 - RL8: "openhpc-RL8-240619-0949-66c0e540" - RL9: "openhpc-ofed-RL9-240619-0949-66c0e540" + RL8: "openhpc-RL8-240619-0949-66c0e540" # https://github.com/stackhpc/ansible-slurm-appliance/pull/399 + RL9: "openhpc-ofed-RL9-240621-1308-96959324" # https://github.com/stackhpc/ansible-slurm-appliance/pull/401 } } diff --git a/environments/common/inventory/group_vars/all/proxy.yml b/environments/common/inventory/group_vars/all/proxy.yml new file mode 100644 index 000000000..ddd009c63 --- /dev/null +++ b/environments/common/inventory/group_vars/all/proxy.yml @@ -0,0 +1,2 @@ +# default proxy address to first squid api address port 3128 if squid group non-empty, else empty string to avoid breaking hostvars +proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + squid_http_port if groups['squid'] else '' }}" diff --git a/environments/common/inventory/group_vars/all/squid.yml b/environments/common/inventory/group_vars/all/squid.yml new file mode 100644 index 000000000..59557291b --- /dev/null +++ b/environments/common/inventory/group_vars/all/squid.yml @@ -0,0 +1 @@ +squid_http_port: 3128 # defined here for proxy role diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index 7c4ea5712..4629fb72d 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ b/environments/common/inventory/group_vars/builder/defaults.yml @@ -16,3 +16,7 @@ opensearch_state: stopped # avoid writing config+certs+db into image cuda_persistenced_state: stopped # probably don't have GPU in Packer build VMs firewalld_enabled: false # dnf install of firewalld enables it firewalld_state: stopped +squid_started: false +squid_enabled: false +squid_cache_disk: 0 # just needs to be defined +squid_cache_mem: 0 diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 94903d38e..ab6324817 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -32,6 +32,7 @@ openhpc opensearch filebeat mysql +squid [prometheus] # Single node to host monitoring server. @@ -126,3 +127,6 @@ freeipa_client [persist_hostkeys] # Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. 
+ +[squid] +# Hosts to run squid proxy diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 4a30485af..086084da0 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -72,3 +72,6 @@ openhpc [persist_hostkeys] # Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. + +[squid] +# Hosts to run squid proxy From 18faae443f032866bfbf35e07847a8a013c62e5c Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 2 Jul 2024 16:09:22 +0100 Subject: [PATCH 11/78] Upgrade ssh from SIG/security to fix CVE-2024-6387 (#404) * upgrade ssh from SIG/security to fix CVE-2024-6387 * refactor ssh update from sig/security to work on existing fatimage --- ansible/bootstrap.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e9f6393e9..f8f536830 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -157,6 +157,20 @@ tags: - update tasks: + - name: Install SIG/security release repo + dnf: + name: rocky-release-security + - name: Update openssh + dnf: + name: + - openssh + - openssh-askpass + - openssh-clients + - openssh-server + state: latest + update_only: true + enablerepo: + - security-common - block: - name: Update selected packages yum: From fcf4648b24e263cfed91bce084b74fbd48e4359b Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:33:12 +0100 Subject: [PATCH 12/78] fix squid port default (#405) --- environments/common/inventory/group_vars/all/proxy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/proxy.yml b/environments/common/inventory/group_vars/all/proxy.yml index ddd009c63..d606ee1d9 100644 --- a/environments/common/inventory/group_vars/all/proxy.yml +++ b/environments/common/inventory/group_vars/all/proxy.yml @@ -1,2 +1,2 @@ # default proxy address to first squid api address port 3128 if squid group non-empty, else empty string to avoid breaking hostvars -proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + squid_http_port if groups['squid'] else '' }}" +proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + (squid_http_port | string) if groups['squid'] else '' }}" From 8623b15542b66d92069422566f2307622a173b27 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:11:58 +0100 Subject: [PATCH 13/78] allow extending fat images with site-specific groups (#403) --- .github/workflows/fatimage.yml | 2 +- ansible/cleanup.yml | 3 + environments/.stackhpc/ARCUS.pkrvars.hcl | 1 - environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 1 - packer/README.md | 93 ++++++++++++++------ packer/openstack.pkr.hcl | 52 +++++++---- 6 files changed, 104 insertions(+), 48 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 9209e85ea..e6727948b 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -63,7 +63,7 @@ jobs: . environments/.stackhpc/activate cd packer/ packer init . 
- PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -except=openstack.openhpc-extra -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index fc3391a23..e0fabf5e1 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -35,3 +35,6 @@ - name: Run cloud-init cleanup command: cloud-init clean --logs --seed + +- name: Cleanup /tmp + command : rm -rf /tmp/* diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 72e978c95..c07717156 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,7 +1,6 @@ flavor = "vm.ska.cpu.general.small" use_blockstorage_volume = true volume_size = 15 # GB -volume_size_ofed = 15 # GB image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) ssh_keypair_name = "slurm-app-ci" diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 1f6ece01f..da2d96d38 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -1,7 +1,6 @@ flavor = "ec1.large" use_blockstorage_volume = true volume_size = 15 # GB -volume_size_ofed = 15 # GB volume_type = "unencrypted" image_disk_format = "qcow2" networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci diff --git a/packer/README.md b/packer/README.md index c2a754e5d..597cfd4f9 100644 --- a/packer/README.md +++ b/packer/README.md @@ -1,47 +1,86 @@ # Packer-based image build -The appliance contains code and configuration to use Packer with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. +The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -The image built is referred to as a "fat" image as it contains binaries for all nodes, but no configuration. Using a "fat" image: +The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: - Enables the image to be tested in CI before production use. - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. -A default fat image is built in StackHPC's CI workflow and made available to clients. However it is possible to build site-specific fat images if required. +By default, a fat image build starts from a RockyLinux GenericCloud image and updates all DNF packages already present. -A fat image build starts from a RockyLinux GenericCloud image and (by default) updates all dnf packages in that image. +The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +1. Build site-specific fat images from scratch. +2. Extend an existing fat image with additional software. 
-# Build Process
-- Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration).
-- Create a file `environments/<environment>/builder.pkrvars.hcl` containing at a minimum e.g.:
-
-    ```hcl
-    flavor = "general.v1.small" # VM flavor to use for builder VMs
-    networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to
-    source_image_name = "Rocky-8.9-GenericCloud" # Name of source image. This must exist in OpenStack and should be a Rocky Linux GenericCloud-based image.
-    ```
-
-    This configuration will generate and use an ephemeral SSH key for communicating with the Packer VM. If this is undesirable, set `ssh_keypair_name` to the name of an existing keypair in OpenStack. The private key must be on the host running Packer, and its path can be set using `ssh_private_key_file`.
-    The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.).
+# Usage
+
+The steps for building site-specific fat images or extending an existing fat image are the same:
+
+1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration).
+2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments/<environment>/builder.pkrvars.hcl` containing at a minimum e.g.:

-    For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`.
+    ```hcl
+    flavor = "general.v1.small" # VM flavor to use for builder VMs
+    networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to
+    ```
+
+    - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.).
+
+    - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`.

-- Activate the venv and the relevant environment.
+    - For an example of configuration for extending an existing fat image see below.

-- Build images using the relevant variable definition file:
+3. Activate the venv and the relevant environment.
+
+4. Build images using the relevant variable definition file, e.g.:
+
+    cd packer/
+    PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
+
+    Note that the `-only` flag here restricts the build to the non-OFED fat image "source" (in Packer terminology). Other
+    source options are:
+    - `-only=openstack.openhpc-ofed`: Build a fat image including Mellanox OFED
+    - `-only=openstack.openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name` must also be set in the Packer variables file.
+
+5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash.
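+
+    For example, extending an existing fat image might then look like the following sketch, assuming a variable file as in step 2 which also sets `source_image_name` (and optionally `groups` - see the notes under "Build Process" below):
+
+        cd packer/
+        PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl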
+
+# Build Process

-    cd packer
-    PACKER_LOG=1 /usr/bin/packer build -only openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
+In summary, Packer creates an OpenStack VM, runs Ansible on that, shuts it down, then creates an image from the root disk.

-    Note the build VM is added to the `builder` group to differentiate them from "real" nodes - see developer notes below.
+Many of the Packer variables defined in `openstack.pkr.hcl` control the definition of the build VM and how to SSH to it to run Ansible, which are generic OpenStack builder options. Packer variables can be set in a file at any convenient path; the above
+example shows the use of the environment variable `$PKR_VAR_environment_root` (which itself sets the Packer variable
+`environment_root`) to automatically select a variable file from the current environment, but for site-specific builds
+using a path in a "parent" environment is likely to be more appropriate (as builds should not be environment-specific, to allow testing).

-- The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash.
+What is Slurm Appliance-specific are the details of how Ansible is run:
+- The build VM is always added to the `builder` inventory group, which differentiates it from "real" nodes. This allows
+  variables to be set differently during Packer builds, e.g. to prevent services starting. The defaults for this are in `environments/common/inventory/group_vars/builder/`, which could be extended or overridden for site-specific fat image builds using `builder` groupvars for the relevant environment. It also runs some builder-specific code (e.g. to ensure Packer's SSH
+  keys are removed from the image).
+- The default fat image build also adds the build VM to the "top-level" `compute`, `control` and `login` groups. This ensures
+  the Ansible specific to all of these types of nodes runs (other inventory groups are constructed from these by the `environments/common/inventory/groups` file - this is not builder-specific).
+- Which groups the build VM is added to is controlled by the Packer `groups` variable. This can be redefined for builds using the `openhpc-extra` source to add the build VM into specific groups. E.g. with a Packer variable file:

-# Notes for developers
+    source_image_name = {
+      RL9 = "openhpc-ofed-RL9-240619-0949-66c0e540"
+    }
+    groups = {
+      openhpc-extra = ["foo"]
+    }

-Packer build VMs are added to both the `builder` group and the other top-level groups (e.g. `control`, `compute`, etc.). The former group allows `environments/common/inventory/group_vars/builder/defaults.yml` to set variables specifically for the Packer builds, e.g. for services which should not be started.
+  the build VM uses an existing "fat image" (rather than a RockyLinux GenericCloud one) and is added to the `builder` and `foo` groups. This means only code targeting `builder` and `foo` groups runs. In this way an existing image can be extended with site-specific code, without modifying the part of the image which has already been tested in the StackHPC CI.

-Note that hostnames in the Packer VMs are not the same as the equivalent "real" hosts. Therefore variables required inside a Packer VM must be defined as group vars, not hostvars.
+
+- The playbook `ansible/fatimage.yml` is run which is only a subset of `ansible/site.yml`.
This allows restricting the code
+  which runs during build for cases where setting `builder` groupvars is not sufficient (e.g. a role always attempts to configure or start services). This may eventually be removed.
+
+There are some things to be aware of when developing Ansible to run in a Packer build VM:
+  - Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and may not be able to be enabled if, when creating an instance with the resulting image, the remote service will not be immediately present.
+  - Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume.
+  - Care should be taken not to leave data on the root filesystem which is not wanted in the final image (e.g. secrets).
+  - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars.
+  - Ansible may need to proxy to real compute nodes. If Packer should not use the same proxy to connect to the
+    build VMs (e.g. build happens on a different network), proxy configuration should not be added to the `all` group.
+  - Currently two fat image "sources" are defined, with and without OFED. This simplifies CI configuration by allowing the
var.floating_ip_network security_groups = var.security_groups + volume_size = var.volume_size # Input image: - source_image = "${var.fatimage_source_image[var.os_version]}" - source_image_name = "${var.fatimage_source_image_name[var.os_version]}" # NB: must already exist in OpenStack + source_image = "${var.source_image[var.os_version]}" + source_image_name = "${var.source_image_name[var.os_version]}" # NB: must already exist in OpenStack # SSH: ssh_username = var.ssh_username @@ -174,27 +183,34 @@ source "openstack" "openhpc" { image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } -# "fat" image builds: build { - # non-OFED: + # non-OFED fat image: source "source.openstack.openhpc" { name = "openhpc" - volume_size = var.volume_size } - # OFED: + # OFED fat image: source "source.openstack.openhpc" { name = "openhpc-ofed" - volume_size = var.volume_size_ofed + } + + # Extended site-specific image, built on fat image: + source "source.openstack.openhpc" { + name = "openhpc-extra" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder", "control", "compute", "login"], [for g in split("-", "${source.name}"): g if g != "openhpc"]) + groups = concat(["builder"], var.groups[source.name]) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting - extra_arguments = ["--limit", "builder", "-i", "${var.repo_root}/packer/ansible-inventory.sh", "-vv", "-e", "@${var.repo_root}/packer/openhpc_extravars.yml"] + extra_arguments = [ + "--limit", "builder", # prevent running against real nodes, if in inventory! + "-i", "${var.repo_root}/packer/ansible-inventory.sh", + "-vv", + "-e", "@${var.repo_root}/packer/openhpc_extravars.yml", # not overridable by environments + ] } post-processor "manifest" { From 45e5173303e8e0e47d02d1ea307583ce90c3309a Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:29:32 +0100 Subject: [PATCH 14/78] remove squid nodes from podman group - is not containerised (#407) --- environments/common/inventory/groups | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ab6324817..4209f5067 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -32,7 +32,6 @@ openhpc opensearch filebeat mysql -squid [prometheus] # Single node to host monitoring server. From c410634e7ff44c59f83ed3ec0dc9b78a2fb28a05 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:29:52 +0100 Subject: [PATCH 15/78] fix README for RL9 (#408) --- README.md | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 55d152eda..f1d6f461a 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # StackHPC Slurm Appliance This repository contains playbooks and configuration to define a Slurm-based HPC environment including: -- A Rocky Linux 8 and OpenHPC v2-based Slurm cluster. +- A Rocky Linux 9 and OpenHPC v3-based Slurm cluster. - Shared fileystem(s) using NFS (with servers within or external to the cluster). - Slurm accounting using a MySQL backend. - A monitoring backend using Prometheus and ElasticSearch. 
@@ -18,7 +18,8 @@ While it is tested on OpenStack it should work on any cloud, except for node reb ## Prerequisites It is recommended to check the following before starting: - You have root access on the "ansible deploy host" which will be used to deploy the appliance. -- You can create instances using a Rocky 8 GenericCloud image (or an image based on that). +- You can create instances using a Rocky 9 GenericCloud image (or an image based on that). + - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. However the appliance will install the necessary packages if a GenericCloud image is used. - SSH keys get correctly injected into instances. - Instances have access to internet (note proxies can be setup through the appliance if necessary). - DNS works (if not this can be partially worked around but additional configuration will be required). @@ -31,14 +32,7 @@ These instructions assume the deployment host is running Rocky Linux 8: sudo yum install -y git python38 git clone https://github.com/stackhpc/ansible-slurm-appliance cd ansible-slurm-appliance - /usr/bin/python3.8 -m venv venv - . venv/bin/activate - pip install -U pip - pip install -r requirements.txt - # Install ansible dependencies ... - ansible-galaxy role install -r requirements.yml -p ansible/roles - ansible-galaxy collection install -r requirements.yml -p ansible/collections # ignore the path warning here - + ./dev/setup-env.sh ## Overview of directory structure From 7e8dab6ac9a06e0e5abe6ae2b7f7bd3430f3f3d5 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:01:44 +0100 Subject: [PATCH 16/78] add groups support to basic_users (#406) --- ansible/roles/basic_users/README.md | 15 ++++++++------- ansible/roles/basic_users/defaults/main.yml | 2 ++ ansible/roles/basic_users/tasks/main.yml | 4 ++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/ansible/roles/basic_users/README.md b/ansible/roles/basic_users/README.md index 4d6c5485c..4b75100ca 100644 --- a/ansible/roles/basic_users/README.md +++ b/ansible/roles/basic_users/README.md @@ -16,13 +16,14 @@ Requirements Role Variables -------------- -`basic_users_users`: Required. A list of mappings defining information for each user. In general, mapping keys/values are passed through as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) and default values are as given there. However: -- `create_home`, `generate_ssh_key` and `ssh_key_comment` are set automatically and should not be overriden. -- `uid` should be set, so that the UID/GID is consistent across the cluster (which Slurm requires). -- `shell` if *not* set will be `/sbin/nologin` on the `control` node and the default shell on other users. Explicitly setting this defines the shell for all nodes. -- An additional key `public_key` may optionally be specified to define a key to log into the cluster. -- An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated. -- Any other keys may present for other purposes (i.e. not used by this role). +- `basic_users_users`: Optional, default empty list. A list of mappings defining information for each user. 
In general, mapping keys/values are passed through as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) and default values are as given there. However:
+  - `create_home`, `generate_ssh_key` and `ssh_key_comment` are set automatically; this assumes home directories are on a cluster-shared filesystem.
+  - `uid` should be set, so that the UID/GID is consistent across the cluster (which Slurm requires).
+  - `shell` if *not* set will be `/sbin/nologin` on the `control` node and the default shell on other users. Explicitly setting this defines the shell for all nodes.
+  - An additional key `public_key` may optionally be specified to define a key to log into the cluster.
+  - An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated.
+  - Any other keys may be present for other purposes (i.e. not used by this role).
+- `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there.
 
 Dependencies
 ------------
diff --git a/ansible/roles/basic_users/defaults/main.yml b/ansible/roles/basic_users/defaults/main.yml
index eefb5dc25..9f34bdf4c 100644
--- a/ansible/roles/basic_users/defaults/main.yml
+++ b/ansible/roles/basic_users/defaults/main.yml
@@ -5,3 +5,5 @@ basic_users_userdefaults:
   generate_ssh_key: "{{ basic_users_manage_homedir }}"
   ssh_key_comment: "{{ item.name }}"
   shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}"
+basic_users_users: []
+basic_users_groups: []
diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml
index d2d3d0d4a..c27d024b4 100644
--- a/ansible/roles/basic_users/tasks/main.yml
+++ b/ansible/roles/basic_users/tasks/main.yml
@@ -8,6 +8,10 @@
   when:
     - "item.state | default('present') == 'absent'"
 
+- name: Create groups
+  ansible.builtin.group: "{{ item }}"
+  loop: "{{ basic_users_groups }}"
+
 - name: Create users and generate public keys
   user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}"
   loop: "{{ basic_users_users }}"
From c1cab497906832a1e748931256eacceee510cb76 Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Tue, 16 Jul 2024 10:10:44 +0100
Subject: [PATCH 17/78] Revert to base ssh repos (#410)

* revert to base ssh repos

* fix Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 /etc permissions

* bump CI image
---
 ansible/bootstrap.yml                    | 22 ++++++++--------------
 environments/.stackhpc/terraform/main.tf |  5 +++--
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index f8f536830..58b18f7bc 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -41,6 +41,14 @@
   gather_facts: false
   become: yes
   tasks:
+    - name: Fix incorrect permissions on /etc in Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
+      # breaks munge
+      file:
+        path: /etc
+        state: directory
+        owner: root
+        group: root
+        mode: u=rwx,go=rx # has g=rwx
    - name: Prevent ssh hanging if shared home is unavailable
      lineinfile:
        path: /etc/profile
@@ -157,20 +165,6 @@
     tags:
       - update
   tasks:
-    - name: Install SIG/security release repo
-      dnf:
-        name: rocky-release-security
-    - name: Update openssh
-      dnf:
-        name:
-          - openssh
-          - openssh-askpass
-          - 
openssh-clients - - openssh-server - state: latest - update_only: true - enablerepo: - - security-common - block: - name: Update selected packages yum: diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index d241fb0fc..8b2991f46 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,8 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - RL8: "openhpc-RL8-240619-0949-66c0e540" # https://github.com/stackhpc/ansible-slurm-appliance/pull/399 - RL9: "openhpc-ofed-RL9-240621-1308-96959324" # https://github.com/stackhpc/ansible-slurm-appliance/pull/401 + # https://github.com/stackhpc/ansible-slurm-appliance/pull/410 + RL8: "openhpc-RL8-240712-1426-6830f97b" + RL9: "openhpc-ofed-RL9-240712-1425-6830f97b" } } From e44d704e5ed9c0a4b6e000e6dece10503c7105f0 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:18:49 +0100 Subject: [PATCH 18/78] Add TuneD (#409) * TuneD configuration role * bump CI image * Update README.md * Update environments/common/layouts/everything Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update ansible/roles/tuned/README.md Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update ansible/roles/tuned/README.md Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update ansible/roles/tuned/README.md Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update README.md --------- Co-authored-by: Bertie Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/.gitignore | 3 +++ ansible/bootstrap.yml | 9 +++++++++ ansible/roles/tuned/README.md | 14 +++++++++++++ ansible/roles/tuned/defaults/main.yml | 7 +++++++ ansible/roles/tuned/tasks/configure.yml | 20 +++++++++++++++++++ ansible/roles/tuned/tasks/install.yml | 5 +++++ ansible/roles/tuned/tasks/main.yml | 3 +++ .../inventory/group_vars/builder/defaults.yml | 2 ++ environments/common/inventory/groups | 3 +++ environments/common/layouts/everything | 3 +++ 10 files changed, 69 insertions(+) create mode 100644 ansible/roles/tuned/README.md create mode 100644 ansible/roles/tuned/defaults/main.yml create mode 100644 ansible/roles/tuned/tasks/configure.yml create mode 100644 ansible/roles/tuned/tasks/install.yml create mode 100644 ansible/roles/tuned/tasks/main.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index 6dc63b547..2ceeb596b 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -56,3 +56,6 @@ roles/* !roles/ofed/** !roles/squid/ !roles/squid/** +!roles/tuned/ +!roles/tuned/** + diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 58b18f7bc..b53a4f29a 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -105,6 +105,15 @@ import_role: name: squid +- hosts: tuned + tags: tuned + gather_facts: yes + become: yes + tasks: + - name: Install and configure tuneD + import_role: + name: tuned + - hosts: freeipa_server # Done here as it might be providing DNS tags: diff --git a/ansible/roles/tuned/README.md b/ansible/roles/tuned/README.md new file mode 100644 index 000000000..34885af84 --- /dev/null +++ b/ansible/roles/tuned/README.md @@ -0,0 +1,14 @@ +tuned +========= + +This role configures the TuneD tool for system tuning, ensuring optimal performance based on the profile settings defined. 
+ +Role Variables +-------------- + +See the [TuneD documentation](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/monitoring_and_managing_system_status_and_performance/getting-started-with-tuned_monitoring-and-managing-system-status-and-performance) for profile details. + + +- `tuned_profile_baremetal`: Optional str. Name of default profile for non-virtualised hosts. Default `hpc-compute`. +- `tuned_profile_vm`: Optional str. Name of default profile for virtualised hosts. Default `virtual-guest`. +- `tuned_profile`: Optional str. Name of profile to apply to host. Defaults to `tuned_profile_baremetal` or `tuned_profile_vm` as appropriate. diff --git a/ansible/roles/tuned/defaults/main.yml b/ansible/roles/tuned/defaults/main.yml new file mode 100644 index 000000000..1426bbedd --- /dev/null +++ b/ansible/roles/tuned/defaults/main.yml @@ -0,0 +1,7 @@ +--- +# defaults file for tuned +tuned_profile_baremetal: hpc-compute +tuned_profile_vm: virtual-guest +tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" +tuned_enabled: true +tuned_started: true diff --git a/ansible/roles/tuned/tasks/configure.yml b/ansible/roles/tuned/tasks/configure.yml new file mode 100644 index 000000000..424063119 --- /dev/null +++ b/ansible/roles/tuned/tasks/configure.yml @@ -0,0 +1,20 @@ +--- +- name: Enable and start TuneD + ansible.builtin.systemd: + name: tuned + enabled: "{{ tuned_enabled | bool }}" + state: "{{ 'started' if tuned_started | bool else 'stopped' }}" + +- name: Check TuneD profile + ansible.builtin.command: + cmd: tuned-adm active + when: tuned_started + register: _tuned_profile_current + changed_when: false + +- name: Set tuned-adm profile + ansible.builtin.command: + cmd: "tuned-adm profile {{ tuned_profile }}" + when: + - tuned_started | bool + - tuned_profile not in _tuned_profile_current.stdout diff --git a/ansible/roles/tuned/tasks/install.yml b/ansible/roles/tuned/tasks/install.yml new file mode 100644 index 000000000..89c08a412 --- /dev/null +++ b/ansible/roles/tuned/tasks/install.yml @@ -0,0 +1,5 @@ +--- +- name: Install tuneD + ansible.builtin.dnf: + name: tuned + state: present \ No newline at end of file diff --git a/ansible/roles/tuned/tasks/main.yml b/ansible/roles/tuned/tasks/main.yml new file mode 100644 index 000000000..ef0bea2d1 --- /dev/null +++ b/ansible/roles/tuned/tasks/main.yml @@ -0,0 +1,3 @@ +--- +- import_tasks: install.yml +- import_tasks: configure.yml \ No newline at end of file diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index 4629fb72d..22042c1bf 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ b/environments/common/inventory/group_vars/builder/defaults.yml @@ -20,3 +20,5 @@ squid_started: false squid_enabled: false squid_cache_disk: 0 # just needs to be defined squid_cache_mem: 0 +tuned_started: false +tuned_enabled: false diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 4209f5067..a48e6823f 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -129,3 +129,6 @@ freeipa_client [squid] # Hosts to run squid proxy + +[tuned] +# Hosts to run TuneD configuration diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 086084da0..e9523eec9 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -75,3 
+75,6 @@ openhpc [squid] # Hosts to run squid proxy + +[tuned:children] +# Hosts to run TuneD configuration From 5504fa36aef4ddb959ce516177e7beb7313b998a Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:25:52 +0100 Subject: [PATCH 19/78] Use shorter names for CI clusters (#415) * use run_number as a shorter ID for CI * slurmci group name warning * Revert "slurmci group name warning" - underscores not valid linux hostname and stripped in host, leading to slurmdbd config failure This reverts commit 61dfad6716ff144a4b3769319d02394d61df5675. --------- Co-authored-by: Bertie --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index d0f74ad1c..401530fb4 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -34,7 +34,7 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_id }} + TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} steps: - uses: actions/checkout@v2 From 0a5f62c177503767441573d569c2f010570f49a3 Mon Sep 17 00:00:00 2001 From: Bertie Date: Fri, 19 Jul 2024 16:06:18 +0000 Subject: [PATCH 20/78] install ood apps in fatimage --- ansible/fatimage.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 0764477b3..82c1ecc07 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -62,12 +62,16 @@ tasks_from: install.yml # - import_playbook: portal.yml - - name: Open Ondemand server + - name: Open Ondemand server (packages) include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" # # FUTURE: install-apps.yml - this is git clones + - name: Open Ondemand server (apps) + include_role: + name: osc.ood + tasks_from: install-apps.yml - name: Open Ondemand remote desktop import_role: name: openondemand From 5da7f4feaae840bc6f61916541c779ee92713df5 Mon Sep 17 00:00:00 2001 From: Bertie Date: Mon, 22 Jul 2024 09:27:49 +0000 Subject: [PATCH 21/78] add ood jupyter install to fatimage --- ansible/fatimage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 82c1ecc07..25968cfe5 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -72,6 +72,7 @@ include_role: name: osc.ood tasks_from: install-apps.yml + vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" - name: Open Ondemand remote desktop import_role: name: openondemand From 2c87644e3a81b746d172fc58d750fe8da7716093 Mon Sep 17 00:00:00 2001 From: Bertie Date: Mon, 22 Jul 2024 12:40:51 +0000 Subject: [PATCH 22/78] jupyter_compute ood into fatimage --- ansible/fatimage.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 25968cfe5..35f1b10ab 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -77,6 +77,10 @@ import_role: name: openondemand tasks_from: vnc_compute.yml + - name: Open Ondemand jupyter node + import_role: + name: openondemand + tasks_from: jupyter_compute.yml # - import_playbook: monitoring.yml: - import_role: From 49182d74306a42d66b02e4a6e2574d30590610c6 Mon Sep 17 00:00:00 2001 From: Bertie Date: Tue, 23 Jul 2024 10:11:32 +0000 Subject: [PATCH 23/78] bump fatimage --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 8b2991f46..9364f08e2 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/410 - RL8: "openhpc-RL8-240712-1426-6830f97b" - RL9: "openhpc-ofed-RL9-240712-1425-6830f97b" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/414 + RL8: "openhpc-RL8-240723-0907-b560bf4c" + RL9: "openhpc-ofed-RL9-240723-0907-b560bf4c" } } From 99c52ed2235d96b6142ddaf224896d5728d8bbe1 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:30:08 +0100 Subject: [PATCH 24/78] allow items in compute mapping to have different keys e.g. only specify image_id for some compute groups (#412) --- .../{{cookiecutter.environment}}/terraform/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index ba0dbfb20..289de3fef 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -40,7 +40,7 @@ variable "cluster_image_id" { } variable "compute" { - type = map + type = any description = <<-EOF Mapping defining compute infrastructure. Keys are names of groups. Values are a mapping as follows: From df8dd0c872ecd5c4b88499a7908ac55a40ededc6 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:06:37 +0100 Subject: [PATCH 25/78] Support ansible-init for remote collections (#411) * Add ansible-init role to requirements.yml * Add ansible-init to groups and plays * Configure cluster_infra ansible-init metadata * Only run site.yml once ansible-init has completed * Wait for ansible init to finish before running bootstrap * revert to using cluster_infra metadata defaults * update image * revert sausage bastion changes * set ansible_init_wait as common var * use run_number as a shorter ID for CI * install ood apps in fatimage * add ood jupyter install to fatimage * bump image * jupyter_compute ood into fatimage * bump fatimage for jupyter_compute ood * Update stackhpc.yml * duplicate tuned inventory group name * Fix invalid group name for slurmci * Update stackhpc.yml undo groupname changes * slurmci group name warning * rm ood changes * bump fatimage * change azimuth collection in bootstrap * update azimuth image utils version * update requirements * Update bastion.yml * Use azimuth image utils collection for ansible-init * bump fatimage --------- Co-authored-by: bertie Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> --- ansible/bootstrap.yml | 23 +++++++++ ansible/fatimage.yml | 2 + ansible/roles/cluster_infra/defaults/main.yml | 2 + .../cluster_infra/templates/resources.tf.j2 | 51 +++++++++++++++++++ environments/.stackhpc/terraform/main.tf | 6 +-- .../inventory/group_vars/all/ansible_init.yml | 1 + environments/common/inventory/groups | 3 ++ environments/common/layouts/everything | 4 ++ requirements.yml | 3 ++ 9 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 
ansible/roles/cluster_infra/defaults/main.yml create mode 100644 environments/common/inventory/group_vars/all/ansible_init.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index b53a4f29a..e8e2713a5 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -1,5 +1,20 @@ --- +- hosts: cluster + gather_facts: false + become: yes + tasks: + - name: Check if ansible-init is installed + stat: + path: /etc/systemd/system/ansible-init.service + register: _stat_ansible_init_unitfile + + - name: Wait for ansible-init to finish + wait_for: + path: /var/lib/ansible-init.done + timeout: "{{ ansible_init_wait }}" # seconds + when: _stat_ansible_init_unitfile.stat.exists + - hosts: localhost gather_facts: false become: false @@ -235,3 +250,11 @@ tasks: - include_role: name: ofed + +- hosts: ansible_init + gather_facts: yes + become: yes + tags: linux_ansible_init + tasks: + - include_role: + name: azimuth_cloud.image_utils.linux_ansible_init diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 35f1b10ab..58e1d72c7 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -68,11 +68,13 @@ tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" # # FUTURE: install-apps.yml - this is git clones + - name: Open Ondemand server (apps) include_role: name: osc.ood tasks_from: install-apps.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" + - name: Open Ondemand remote desktop import_role: name: openondemand diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml new file mode 100644 index 000000000..f2f9637b9 --- /dev/null +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -0,0 +1,2 @@ +ansible_init_collections: [] +ansible_init_playbooks: [] diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 03eab5afb..4c7534d62 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -370,6 +370,23 @@ resource "openstack_compute_instance_v2" "login" { - "${openstack_compute_keypair_v2.cluster_keypair.public_key}" {%- endif %} EOF + + metadata = { + {% for playbook in ansible_init_playbooks %} + ansible_init_pb_{{ loop.index0 }}_name = "{{ playbook.name }}" + {% if playbook.stage is defined %} + ansible_init_pb_{{ loop.index0 }}_stage = "{{ playbook.stage }}" + {% endif %} + {% endfor %} + {% for collection in ansible_init_collections %} + ansible_init_coll_{{ loop.index0 }}_name = "{{ collection.name }}" + ansible_init_coll_{{ loop.index0 }}_type = "{{ collection.type }}" + ansible_init_coll_{{ loop.index0 }}_version = "{{ collection.version }}" + {% if collection.source is defined %} + ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" + {% endif %} + {% endfor %} + } } resource "openstack_compute_instance_v2" "control" { @@ -446,6 +463,23 @@ resource "openstack_compute_instance_v2" "control" { - [LABEL=home, /exports/home, auto] {% endif %} EOF + + metadata = { + {% for playbook in ansible_init_playbooks %} + ansible_init_pb_{{ loop.index0 }}_name = "{{ playbook.name }}" + {% if playbook.stage is defined %} + ansible_init_pb_{{ loop.index0 }}_stage = "{{ playbook.stage }}" + {% endif %} + {% endfor %} + {% for collection in ansible_init_collections %} + ansible_init_coll_{{ loop.index0 }}_name = "{{ collection.name }}" + ansible_init_coll_{{ loop.index0 }}_type = "{{ collection.type }}" + 
ansible_init_coll_{{ loop.index0 }}_version = "{{ collection.version }}"
+      {% if collection.source is defined %}
+      ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
+      {% endif %}
+      {% endfor %}
+  }
 }
 
 {% for partition in openhpc_slurm_partitions %}
@@ -498,6 +532,23 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
     - "${openstack_compute_keypair_v2.cluster_keypair.public_key}"
     {%- endif %}
     EOF
+
+  metadata = {
+    {% for playbook in ansible_init_playbooks %}
+    ansible_init_pb_{{ loop.index0 }}_name = "{{ playbook.name }}"
+    {% if playbook.stage is defined %}
+    ansible_init_pb_{{ loop.index0 }}_stage = "{{ playbook.stage }}"
+    {% endif %}
+    {% endfor %}
+    {% for collection in ansible_init_collections %}
+    ansible_init_coll_{{ loop.index0 }}_name = "{{ collection.name }}"
+    ansible_init_coll_{{ loop.index0 }}_type = "{{ collection.type }}"
+    ansible_init_coll_{{ loop.index0 }}_version = "{{ collection.version }}"
+    {% if collection.source is defined %}
+    ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
+    {% endif %}
+    {% endfor %}
+  }
 }
 
 {% endfor %}
diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf
index 9364f08e2..c2447bdf5 100644
--- a/environments/.stackhpc/terraform/main.tf
+++ b/environments/.stackhpc/terraform/main.tf
@@ -29,9 +29,9 @@ variable "cluster_image" {
   description = "single image for all cluster nodes, keyed by os_version - a convenience for CI"
   type = map(string)
   default = {
-    # https://github.com/stackhpc/ansible-slurm-appliance/pull/414
-    RL8: "openhpc-RL8-240723-0907-b560bf4c"
-    RL9: "openhpc-ofed-RL9-240723-0907-b560bf4c"
+    # https://github.com/stackhpc/ansible-slurm-appliance/pull/411
+    RL8: "openhpc-RL8-240725-1710-325c7b47"
+    RL9: "openhpc-ofed-RL9-240725-1710-325c7b47"
   }
 }
 
diff --git a/environments/common/inventory/group_vars/all/ansible_init.yml b/environments/common/inventory/group_vars/all/ansible_init.yml
new file mode 100644
index 000000000..be68dbe8c
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/ansible_init.yml
@@ -0,0 +1 @@
+ansible_init_wait: 1200 # seconds
\ No newline at end of file
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index a48e6823f..ea0bebebc 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -132,3 +132,6 @@ freeipa_client
 
 [tuned]
 # Hosts to run TuneD configuration
+
+[ansible_init]
+# Hosts to run linux-ansible-init
\ No newline at end of file
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index e9523eec9..85af46c06 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -78,3 +78,7 @@ openhpc
 
 [tuned:children]
 # Hosts to run TuneD configuration
+
+[ansible_init:children]
+# Hosts to run ansible-init
+cluster
\ No newline at end of file
diff --git a/requirements.yml b/requirements.yml
index 757c851d5..da6ac5d29 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -46,4 +46,7 @@ collections:
   - name: https://github.com/stackhpc/ansible-collection-terraform
     type: git
     version: 0.2.0
+  - name: https://github.com/azimuth-cloud/ansible-collection-image-utils
+    type: git
+    version: main # update on release
 ...
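
For reference, the `ansible_init_playbooks` and `ansible_init_collections` lists templated into the instance metadata above default to empty lists (see `ansible/roles/cluster_infra/defaults/main.yml`), so no metadata entries are produced unless a deployment defines them. A minimal sketch of what such group vars might look like - the file path, collection details and playbook location here are illustrative assumptions, not values taken from these patches:

    # hypothetical path: environments/site/inventory/group_vars/all/ansible_init.yml
    ansible_init_collections:
      - name: azimuth_cloud.image_utils   # illustrative collection name
        type: galaxy                      # assumed type; a git entry with a 'source' key is also templated
        version: "0.1.0"
    ansible_init_playbooks:
      - name: https://example.com/site-init.yml   # illustrative playbook location
        stage: post                               # optional; only templated when defined

Each list entry maps one-to-one onto the `ansible_init_coll_*` and `ansible_init_pb_*` metadata keys written by `resources.tf.j2`.
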
From b4a47ec4705899091517cb0190bba44eae2e2981 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:41:30 +0100 Subject: [PATCH 26/78] avoid python-openstackclient v7 due to rebuild bug (#420) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index badb1a94b..bf5a43430 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ansible==6.0.0 openstacksdk -python-openstackclient +python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild python-manilaclient jmespath passlib[bcrypt]==1.7.4 From 9c6efa1b3319e7f6f36ed5e01fd6689d7efe4107 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:27:13 +0100 Subject: [PATCH 27/78] Update hpctests to obey UCX_NET_DEVICES when RoCE devices present (#421) * Turn off higher priority MPI net devices * Update pingmatrix.sh.j2 * Update pingmatrix.sh.j2 * Update pingpong.sh.j2 * Replace j2 comments with bash * Update pingpong.sh.j2 --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 6 +++++- ansible/roles/hpctests/templates/pingpong.sh.j2 | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index d886e9ac8..990018d85 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingmatrix_modules | join(' ' ) }} mpicc -o nxnlatbw mpi_nxnlatbw.c -mpirun nxnlatbw + +# mpirun flags force using UCX TCP transports, overriding higher +# priority of OpenMPI btl/openib component, which is also using RDMA +# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 +mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index 4dc2eebd5..dad4499b1 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingpong_modules | join(' ' ) }} #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 -mpirun IMB-MPI1 pingpong + +# mpirun flags force using UCX TCP transports, overriding higher +# priority of OpenMPI btl/openib component, which is also using RDMA +# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 +mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong From 1aff0c34c20f5a4f1bc31b9ee9d1d0c5381e87ee Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:04:12 +0100 Subject: [PATCH 28/78] Update OSes available for deployment (#424) * make changes for SMS * Delete clouds.yaml * Delete install_tofu directory * Delete environments/.stackhpc/inventory/hosts.yml * Handle setup-env Python versions * Change CI Ubuntu version * FETCH_HEAD * Delete FETCH_HEAD * bump image from main * Update main.tf * Update README.md to show deploy host OSes supported * Update bastion.yml * Delete environments/.stackhpc/terraform/SMS.tfvars * Update dev/setup-env.sh Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update dev/setup-env.sh Co-authored-by: Steve Brasier 
<33413598+sjpb@users.noreply.github.com>

---------

Co-authored-by: rocky Cloud User
Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com>
---
 .github/workflows/stackhpc.yml |  2 +-
 README.md                      |  6 ++++++
 dev/setup-env.sh               | 26 +++++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index 401530fb4..1813eac13 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -16,7 +16,7 @@ jobs:
   openstack:
     name: openstack-ci
     concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     strategy:
       matrix:
         os_version: [RL8, RL9]
diff --git a/README.md b/README.md
index f1d6f461a..e40b5add2 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,12 @@ It is recommended to check the following before starting:
 
 ## Installation on deployment host
 
+Operating systems currently supported as deploy hosts:
+
+- Rocky Linux 9
+- Rocky Linux 8
+- Ubuntu 22.04
+
 These instructions assume the deployment host is running Rocky Linux 8:
 
     sudo yum install -y git python38
diff --git a/dev/setup-env.sh b/dev/setup-env.sh
index e47b3d8a9..bfa0758e6 100755
--- a/dev/setup-env.sh
+++ b/dev/setup-env.sh
@@ -2,9 +2,33 @@
 
 set -euo pipefail
 
+if [[ -f /etc/os-release ]]; then
+  . /etc/os-release
+  OS=$ID
+  OS_VERSION=$VERSION_ID
+else
+  exit 1
+fi
+
+MAJOR_VERSION=$(echo $OS_VERSION | cut -d. -f1)
+
+PYTHON_VERSION=""
+
+if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then
+  PYTHON_VERSION="/usr/bin/python3.10"
+elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then
+  PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this
+elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then
+  PYTHON_VERSION="/usr/bin/python3.9"
+else
+  echo "Unsupported OS version: $OS $MAJOR_VERSION"
+  exit 1
+fi
+
 if [[ ! -d "venv" ]]; then
-  /usr/bin/python3.8 -m venv venv # use `sudo yum install python38` on Rocky Linux 8 to install this
+  $PYTHON_VERSION -m venv venv
 fi
+
 . venv/bin/activate
 pip install -U pip
 pip install -r requirements.txt
From c2d796cc39de5e0e3eb4c7876beb4ba6b1baf6cc Mon Sep 17 00:00:00 2001
From: Matt Crees
Date: Wed, 14 Aug 2024 14:04:52 +0100
Subject: [PATCH 29/78] Correct the -only options in the Packer README (#423)

The options need to be prefixed with ``openstack.``
---
 packer/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packer/README.md b/packer/README.md
index 597cfd4f9..3bc188c7e 100644
--- a/packer/README.md
+++ b/packer/README.md
@@ -41,8 +41,8 @@ The steps for building site-specific fat images or extending an existing fat ima
    Note that the `-only` flag here restricts the build to the non-OFED fat image "source" (in Packer terminology). Other source options are:
-    - `-only=openhpc-ofed`: Build a fat image including Mellanox OFED
-    - `-only=openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name}` must also be set in the Packer variables file.
+    - `-only=openstack.openhpc-ofed`: Build a fat image including Mellanox OFED
+    - `-only=openstack.openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name` must also be set in the Packer variables file.
 
 5. 
The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. From 09bcb71044afaedc126160c616fd6dbbbe054a27 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:49:02 +0100 Subject: [PATCH 30/78] Add trivy image scanning (#413) * add trivy image scanning * bump fatimage workflow to ubuntu 22.04 * make setup script work in CI TODO: FIXME * fix libguestfs install * run only 1x build per matrix entry, & only builds required * fix packer README * fix image download * Use shorter names for CI clusters (#415) * use run_number as a shorter ID for CI * slurmci group name warning * Revert "slurmci group name warning" - underscores not valid linux hostname and stripped in host, leading to slurmdbd config failure This reverts commit 61dfad6716ff144a4b3769319d02394d61df5675. --------- Co-authored-by: Bertie * install ood apps in fatimage * add ood jupyter install to fatimage * jupyter_compute ood into fatimage * bump fatimage * allow items in compute mapping to have different keys e.g. only specify image_id for some compute groups (#412) * Support ansible-init for remote collections (#411) * Add ansible-init role to requirements.yml * Add ansible-init to groups and plays * Configure cluster_infra ansible-init metadata * Only run site.yml once ansible-init has completed * Wait for ansible init to finish before running bootstrap * revert to using cluster_infra metadata defaults * update image * revert sausage bastion changes * set ansible_init_wait as common var * use run_number as a shorter ID for CI * install ood apps in fatimage * add ood jupyter install to fatimage * bump image * jupyter_compute ood into fatimage * bump fatimage for jupyter_compute ood * Update stackhpc.yml * duplicate tuned inventory group name * Fix invalid group name for slurmci * Update stackhpc.yml undo groupname changes * slurmci group name warning * rm ood changes * bump fatimage * change azimuth collection in bootstrap * update azimuth image utils version * update requirements * Update bastion.yml * Use azimuth image utils collection for ansible-init * bump fatimage --------- Co-authored-by: bertie Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> * avoid python-openstackclient v7 due to rebuild bug (#420) * Update hpctests to obey UCX_NET_DEVICES when RoCE devices present (#421) * Turn off higher priority MPI net devices * Update pingmatrix.sh.j2 * Update pingmatrix.sh.j2 * Update pingpong.sh.j2 * Replace j2 comments with bash * Update pingpong.sh.j2 --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * delete trivy scanned vulnerabilities * update grafana * bump image * Update environments/.stackhpc/hooks/post.yml Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update setup-env.sh --------- Co-authored-by: Bertie Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Co-authored-by: bertiethorpe --- .github/workflows/fatimage.yml | 84 ++++++++++++++----- environments/.stackhpc/hooks/post.yml | 14 ++++ environments/.stackhpc/terraform/main.tf | 6 +- .../inventory/group_vars/all/grafana.yml | 2 +- 4 files changed, 82 insertions(+), 24 deletions(-) create mode 100644 environments/.stackhpc/hooks/post.yml diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index e6727948b..31fcc789a 100644 --- a/.github/workflows/fatimage.yml +++ 
b/.github/workflows/fatimage.yml @@ -2,27 +2,26 @@ name: Build fat image 'on': workflow_dispatch: - inputs: - use_RL8: - required: true - description: Include RL8 image build - type: boolean - default: false concurrency: - group: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build cancel-in-progress: true jobs: openstack: name: openstack-imagebuild - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: - os_version: [RL8, RL9] - rl8_selected: - - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch + os_version: + - RL8 + - RL9 + build: + - openstack.openhpc + - openstack.openhpc-ofed exclude: - os_version: RL8 - rl8_selected: false + build: openstack.openhpc-ofed + - os_version: RL9 + build: openstack.openhpc env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -63,7 +62,7 @@ jobs: . environments/.stackhpc/activate cd packer/ packer init . - PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -except=openstack.openhpc-extra -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} @@ -71,11 +70,56 @@ jobs: id: manifest run: | . venv/bin/activate - for IMAGE_ID in $(jq --raw-output '.builds[].artifact_id' packer/packer-manifest.json) - do - while ! openstack image show -f value -c name $IMAGE_ID; do - sleep 5 - done - IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) - echo $IMAGE_NAME + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! openstack image show -f value -c name $IMAGE_ID; do + sleep 5 done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + + - name: Download image + run: | + . 
venv/bin/activate + openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: install libguestfs + run: | + sudo apt -y update + sudo apt -y install libguestfs-tools + + - name: mkdir for mount + run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' + + - name: mount qcow2 file + run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.17.0 + with: + scan-type: fs + scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: sarif + output: "${{ steps.manifest.outputs.image-name }}.sarif" + # turn off secret scanning to speed things up + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" + category: "${{ matrix.os_version }}-${{ matrix.build }}" + + - name: Fail if scan has CRITICAL vulnerabilities + uses: aquasecurity/trivy-action@0.16.1 + with: + scan-type: fs + scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: table + exit-code: '1' + severity: 'CRITICAL' + ignore-unfixed: true diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml new file mode 100644 index 000000000..eceadcbd8 --- /dev/null +++ b/environments/.stackhpc/hooks/post.yml @@ -0,0 +1,14 @@ +- hosts: openondemand + become: yes + gather_facts: false + tasks: + - name: Delete ondemand files causing Trivy scan false-positives + # Raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw + # All declared not to be an issue by Open Ondemand as relevant packages not installed + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: + - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.7-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock + - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.7-1/gems/bootstrap_form-4.5.0/demo/yarn.lock + - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock \ No newline at end of file diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index c2447bdf5..53ed174f4 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/411 - RL8: "openhpc-RL8-240725-1710-325c7b47" - RL9: "openhpc-ofed-RL9-240725-1710-325c7b47" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 + RL8: "openhpc-RL8-240813-1317-1b370a36" + RL9: "openhpc-ofed-RL9-240813-1317-1b370a36" } } diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 8222a3cca..90ef51c59 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -2,7 +2,7 @@ # See: https://github.com/cloudalchemy/ansible-grafana # for variable definitions. 
-grafana_version: '9.0.3'
+grafana_version: '9.5.21'
 
 # need to copy some role defaults here so we can use in inventory:
 grafana_port: 3000
From ccdf03639ecf118fb2538d2b33a009c9db7722be Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Thu, 15 Aug 2024 10:49:49 +0100
Subject: [PATCH 31/78] enable 'openstack baremetal ...' commands (#425)

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index bf5a43430..6651506fb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ ansible==6.0.0
 openstacksdk
 python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild
 python-manilaclient
+python-ironicclient
 jmespath
 passlib[bcrypt]==1.7.4
 cookiecutter
From 25533b6163caece2cfc1f58b398fc6605c51cf06 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Tue, 20 Aug 2024 14:47:57 +0000
Subject: [PATCH 32/78] check for upstream changes

---
 .github/bin/create-merge-branch.sh         | 81 ++++++++++++++++++++++
 .github/workflows/upgrade-check.yml.sample | 67 ++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 .github/bin/create-merge-branch.sh
 create mode 100644 .github/workflows/upgrade-check.yml.sample

diff --git a/.github/bin/create-merge-branch.sh b/.github/bin/create-merge-branch.sh
new file mode 100644
index 000000000..d76fe45de
--- /dev/null
+++ b/.github/bin/create-merge-branch.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+#####
+# This script creates a branch that merges the latest release
+#####
+
+set -ex
+
+# Only allow running on main
+CURRENT_BRANCH="$(git branch --show-current)"
+if [ "$CURRENT_BRANCH" != "main" ]; then
+    echo "[ERROR] This script can only be run on the main branch" >&2
+    exit 1
+fi
+
+if [ -n "$(git status --short)" ]; then
+    echo "[ERROR] This script cannot run with uncommitted changes" >&2
+    exit 1
+fi
+
+UPSTREAM_REPO="${UPSTREAM_REPO:-"stackhpc/ansible-slurm-appliance"}"
+echo "[INFO] Using upstream repo - $UPSTREAM_REPO"
+
+# Fetch the tag for the latest release from the upstream repository
+RELEASE_TAG="$(curl -fsSL "https://api.github.com/repos/${UPSTREAM_REPO}/releases/latest" | jq -r '.tag_name')"
+echo "[INFO] Found latest release tag - $RELEASE_TAG"
+
+# Add the repository as an upstream
+echo "[INFO] Adding upstream remote..."
+git remote add upstream "https://github.com/${UPSTREAM_REPO}.git"
+git remote show upstream
+
+echo "[INFO] Fetching remote tags..."
+git remote update
+
+# Use a branch that is named for the release
+BRANCH_NAME="upgrade/$RELEASE_TAG"
+
+# Check if the branch already exists on the origin
+# If it does, there is nothing more to do as the branch can be rebased from the MR
+if git show-branch "remotes/origin/$BRANCH_NAME" >/dev/null 2>&1; then
+    echo "[INFO] Merge branch already created for $RELEASE_TAG"
+    exit
+fi
+
+echo "[INFO] Merging release tag - $RELEASE_TAG"
+git merge --strategy recursive -X theirs --no-commit $RELEASE_TAG
+
+# Check if the merge resulted in any changes being staged
+if [ -n "$(git status --short)" ]; then
+    echo "[INFO] Merge resulted in the following changes"
+    git status
+
+    # NOTE(scott): The GitHub create-pull-request action does
+    # the committing for us, so we only need to make branches
+    # and commits if running outside of GitHub actions.
+    if [ ! $GITHUB_ACTIONS ]; then
+        echo "[INFO] Checking out temporary branch '$BRANCH_NAME'..." 
+        git checkout -b "$BRANCH_NAME"
+
+        echo "[INFO] Committing changes"
+        git commit -m "Upgrade ansible-slurm-appliance to $RELEASE_TAG"
+
+        echo "[INFO] Pushing changes to origin"
+        git push --set-upstream origin "$BRANCH_NAME"
+
+        # Go back to the main branch at the end
+        echo "[INFO] Reverting back to main"
+        git checkout main
+
+        echo "[INFO] Removing temporary branch"
+        git branch -d "$BRANCH_NAME"
+    fi
+
+    # Write a file containing the branch name and tag
+    # for automatic PR or MR creation that follows
+    echo "BRANCH_NAME=\"$BRANCH_NAME\"" > .mergeenv
+    echo "RELEASE_TAG=\"$RELEASE_TAG\"" >> .mergeenv
+else
+    echo "[INFO] Merge resulted in no changes"
+fi
\ No newline at end of file
diff --git a/.github/workflows/upgrade-check.yml.sample b/.github/workflows/upgrade-check.yml.sample
new file mode 100644
index 000000000..c03f72cf3
--- /dev/null
+++ b/.github/workflows/upgrade-check.yml.sample
@@ -0,0 +1,67 @@
+# This workflow compares a downstream ansible-slurm-appliance repository for a specific site with the upstream
+# stackhpc/ansible-slurm-appliance repository to check whether there is a new upstream version available. If a
+# newer tag is found in the upstream repository then a pull request is created to the downstream repo
+# in order to merge in the changes from the new upstream release.
+
+# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows
+# and give it an appropriate name, e.g.
+# cp .github/workflows/upgrade-check.yml.sample .github/workflows/upgrade-check.yml
+
+name: Check for upstream updates
+on:
+  schedule:
+    - cron: "0 9 * * *"
+  workflow_dispatch:
+jobs:
+  check_for_update:
+    runs-on: ubuntu-22.04
+    # permissions:
+    #   contents: write
+    #   pull-requests: write
+    #   actions: write
+
+    steps:
+      - name: Checkout the config repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+
+      # Based on equivalent GitLab CI job
+      - name: Check for new release
+        shell: bash
+        run: |
+          set -xe
+
+          # Tell git who we are for commits
+          git config user.email "${{ github.actor }}-ci@slurmapp.ci"
+          git config user.name "${{ github.actor }} CI"
+
+          # Create the merge branch and write vars to .mergeenv file
+          .github/bin/create-merge-branch.sh
+
+      - name: Set release tag output
+        id: release_tag
+        if: ${{ hashFiles('.mergeenv') }}
+        run: source .mergeenv && echo value=$RELEASE_TAG >> $GITHUB_OUTPUT
+
+      - name: Set branch name output
+        id: branch_name
+        if: ${{ hashFiles('.mergeenv') }}
+        run: source .mergeenv && echo value=$BRANCH_NAME >> $GITHUB_OUTPUT
+
+      - name: Remove tmp file
+        run: rm .mergeenv
+        if: ${{ hashFiles('.mergeenv') }}
+
+      - name: Create Pull Request
+        if: ${{ steps.release_tag.outputs.value }}
+        uses: peter-evans/create-pull-request@v6
+        with:
+          base: main
+          branch: ${{ steps.branch_name.outputs.value }}
+          title: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}"
+          body: This PR was automatically generated by GitHub Actions. 
+ commit-message: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}" + delete-branch: true + token: ${{ secrets.WORKFLOW_TOKEN }} \ No newline at end of file From d765077eace8fe81b1c8e3b9d02bca97b1239e92 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 20 Aug 2024 15:48:08 +0000 Subject: [PATCH 33/78] update README.md --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index e40b5add2..3c7328bf4 100644 --- a/README.md +++ b/README.md @@ -144,3 +144,20 @@ Please contact us for specific advice, but in outline this generally involves: ## Monitoring and logging Please see the [monitoring-and-logging.README.md](docs/monitoring-and-logging.README.md) for details. + +## CI/CD automation + +A GitHub Actions workflow which checks for new upstream version release tags and updates the downstream repo, can be found at: + + .github/workflows/upgrade-check.yml.sample + +If activated, the workflow is scheduled by default to run every day at 9 AM UTC and can be triggered manually via the `workflow_dispatch` event. How to activate the workflow is detailed at the top of the file. + +In order for GitHub actions to fetch workflow changes in `.github/workflows`, a PAT for each deployment must be provided. + +The following repository permissions must be set for the PAT: + - `Workflows : Read and write` + - `Actions : Read and write` + - `Pull requests: Read and write` + +The PAT should then be copied into a downstream repo secret with the title `WORKFLOW_TOKEN`. From e8431190cd803830e5a0f4c7e3023ddda7466d78 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 20 Aug 2024 15:52:07 +0000 Subject: [PATCH 34/78] update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3c7328bf4..e764fb736 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,8 @@ If activated, the workflow is scheduled by default to run every day at 9 AM UTC In order for GitHub actions to fetch workflow changes in `.github/workflows`, a PAT for each deployment must be provided. The following repository permissions must be set for the PAT: - - `Workflows : Read and write` - - `Actions : Read and write` + - `Workflows: Read and write` + - `Actions: Read and write` - `Pull requests: Read and write` -The PAT should then be copied into a downstream repo secret with the title `WORKFLOW_TOKEN`. +The PAT should then be copied into an Actions repository secret in the downstream repo with the title `WORKFLOW_TOKEN`. From 30bcb4d96d7bdae9dc465cfe5ad57c2eeb3c2f9c Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 20 Aug 2024 16:10:22 +0000 Subject: [PATCH 35/78] update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e764fb736..12c7cf639 100644 --- a/README.md +++ b/README.md @@ -147,12 +147,14 @@ Please see the [monitoring-and-logging.README.md](docs/monitoring-and-logging.RE ## CI/CD automation -A GitHub Actions workflow which checks for new upstream version release tags and updates the downstream repo, can be found at: +A GitHub Actions workflow which checks for new upstream version release tags and creates a PR to update the downstream repo, can be found at: .github/workflows/upgrade-check.yml.sample If activated, the workflow is scheduled by default to run every day at 9 AM UTC and can be triggered manually via the `workflow_dispatch` event. How to activate the workflow is detailed at the top of the file. 
+Workflow uses [create-pull-request](https://github.com/peter-evans/create-pull-request) to handle the pull request action. See for action inputs. + In order for GitHub actions to fetch workflow changes in `.github/workflows`, a PAT for each deployment must be provided. The following repository permissions must be set for the PAT: From eb9b6e7508c2d197f096dd1fe6c3778f73c3353b Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 21 Aug 2024 09:42:02 +0000 Subject: [PATCH 36/78] update documentation --- .github/workflows/upgrade-check.yml.sample | 24 ++++++++++++++++------ README.md | 17 ++------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/.github/workflows/upgrade-check.yml.sample b/.github/workflows/upgrade-check.yml.sample index c03f72cf3..39efcd8fe 100644 --- a/.github/workflows/upgrade-check.yml.sample +++ b/.github/workflows/upgrade-check.yml.sample @@ -2,10 +2,26 @@ # stackhpc/ansible-slurm-appliance repository to check whether there is a new upstream version available. If a # newer tag is found in the upstream repository then a pull request is created to the downstream repo # in order to merge in the changes from the new upstream release. - +# # To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows # and give it an appropriate name, e.g. # cp .github/workflows/upgrade-check.yml.sample .github/workflows/upgrade-check.yml +# +# Workflow uses https://github.com/peter-evans/create-pull-request to handle the pull request action. +# See the docs for action inputs. +# +# In order for GitHub actions to create pull requests that make changes to workflows in `.github/workflows`, +# a token for each deployment must be provided. Both user PAT and fine-grained tokens should work, but it was tested +# with a PAT. Fine-grained repo-scoped token is preferred if possible but requires organisation admin privileges. +# +# See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +# for security considerations around tokens. TREAT YOUR ACCESS TOKENS LIKE PASSWORDS. +# +# The following repository permissions must be set for the PAT: +# - `Workflows: Read and write` +# - `Actions: Read and write` +# - `Pull requests: Read and write` +# The PAT should then be copied into an Actions repository secret in the downstream repo with the title `WORKFLOW_TOKEN`. name: Check for upstream updates on: @@ -15,10 +31,6 @@ on: jobs: check_for_update: runs-on: ubuntu-22.04 - # permissions: - # contents: write - # pull-requests: write - # actions: write steps: - name: Checkout the config repo @@ -27,7 +39,7 @@ jobs: fetch-depth: 0 fetch-tags: true - # Based on equivalent GitLab CI job + # Based on equivalent azimuth-config job - name: Check for new release shell: bash run: | diff --git a/README.md b/README.md index 12c7cf639..6c3696c99 100644 --- a/README.md +++ b/README.md @@ -147,19 +147,6 @@ Please see the [monitoring-and-logging.README.md](docs/monitoring-and-logging.RE ## CI/CD automation -A GitHub Actions workflow which checks for new upstream version release tags and creates a PR to update the downstream repo, can be found at: +The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintainence tasks. 
These include: - .github/workflows/upgrade-check.yml.sample - -If activated, the workflow is scheduled by default to run every day at 9 AM UTC and can be triggered manually via the `workflow_dispatch` event. How to activate the workflow is detailed at the top of the file. - -Workflow uses [create-pull-request](https://github.com/peter-evans/create-pull-request) to handle the pull request action. See for action inputs. - -In order for GitHub actions to fetch workflow changes in `.github/workflows`, a PAT for each deployment must be provided. - -The following repository permissions must be set for the PAT: - - `Workflows: Read and write` - - `Actions: Read and write` - - `Pull requests: Read and write` - -The PAT should then be copied into an Actions repository secret in the downstream repo with the title `WORKFLOW_TOKEN`. +- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. \ No newline at end of file From ad84245bc2554c67ab5287ea7b2ab4758ee3e28f Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 21 Aug 2024 12:49:32 +0000 Subject: [PATCH 37/78] test upload images commit --- .github/bin/get-s3-image.sh | 29 +++++++++++++++ .github/workflows/upload-release-image.yml | 41 ++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 .github/bin/get-s3-image.sh create mode 100644 .github/workflows/upload-release-image.yml diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh new file mode 100644 index 000000000..62fc277df --- /dev/null +++ b/.github/bin/get-s3-image.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -x + +# Variables (adjust these paths as necessary) +S3_BUCKET="s3://openhpc-images" +APPLIANCES_ENVIRONMENT_ROOT="$APPLIANCES_ENVIRONMENT_ROOT" +MAIN_TF="$APPLIANCES_ENVIRONMENT_ROOT/terraform/main.tf" + +for IMAGE_OS in $IMAGE_LIST; do + echo "Extracting CI image name from $MAIN_TF" + ci_image=$(grep -oP 'openhpc-[0-9a-zA-Z-]+' "$MAIN_TF" | grep $IMAGE_OS) + + echo "Checking if image $ci_image exists in OpenStack" + image_exists=$(openstack image list --name "$ci_image" -f value -c Name) + + if [ "$image_exists" == "$ci_image" ]; then + echo "Image $ci_image already exists in OpenStack." + else + echo "Image $ci_image not found in OpenStack. Getting it from S3." + + wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$ci_image + + echo "Uploading image $ci_image to OpenStack..." + openstack image create --file "$ci_image.qcow2" --disk-format qcow2 "$ci_image" + + echo "Image $ci_image has been uploaded to OpenStack." 
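+    # Aside: the explicit --disk-format above matters; `openstack image create`
+    # defaults to raw, so omitting it would register this qcow2 file with the
+    # wrong format.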
+ fi +done \ No newline at end of file diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml new file mode 100644 index 000000000..de3fbbb41 --- /dev/null +++ b/.github/workflows/upload-release-image.yml @@ -0,0 +1,41 @@ + +name: Upload release images to client sites from s3 +on: + workflow_dispatch: + push: + branches: + - main + - master + - feat/fatimage-auto-upload + +jobs: + image_upload: + runs-on: ubuntu-22.04 + concurrency: ${{ github.ref }} + env: + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} + IMAGE_LIST: "RL8 RL9" + steps: + - uses: actions/checkout@v2 + + - name: Record which cloud CI is running on + run: | + echo CI_CLOUD: ${{ vars.CI_CLOUD }} + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Upload latest image if missing + run: | + /usr/bin/python3.10 -m venv venv + . venv/bin/activate + pip install -U pip + pip install -r requirements.txt + ansible --version + . environments/.stackhpc/activate + ansible-galaxy collection install openstack.cloud + . .github/bin/get-s3-image.sh -o \ No newline at end of file From 8790e12f7cfcaf86e70aa3891f0613ffe7fc2130 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 22 Aug 2024 10:50:48 +0000 Subject: [PATCH 38/78] just use workflow_dispatch --- .github/bin/get-s3-image.sh | 25 ++++++++--------- .github/workflows/upload-release-image.yml | 31 +++++++++++----------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 62fc277df..decd4e0fa 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -4,26 +4,23 @@ set -x # Variables (adjust these paths as necessary) S3_BUCKET="s3://openhpc-images" -APPLIANCES_ENVIRONMENT_ROOT="$APPLIANCES_ENVIRONMENT_ROOT" -MAIN_TF="$APPLIANCES_ENVIRONMENT_ROOT/terraform/main.tf" for IMAGE_OS in $IMAGE_LIST; do - echo "Extracting CI image name from $MAIN_TF" - ci_image=$(grep -oP 'openhpc-[0-9a-zA-Z-]+' "$MAIN_TF" | grep $IMAGE_OS) + + image_name=$1 + echo "Checking if image $image_name exists in OpenStack" + image_exists=$(openstack image list --name "$image_name" -f value -c Name) - echo "Checking if image $ci_image exists in OpenStack" - image_exists=$(openstack image list --name "$ci_image" -f value -c Name) - - if [ "$image_exists" == "$ci_image" ]; then - echo "Image $ci_image already exists in OpenStack." + if [ "$image_exists" == "$image_name" ]; then + echo "Image $image_name already exists in OpenStack." else - echo "Image $ci_image not found in OpenStack. Getting it from S3." + echo "Image $image_name not found in OpenStack. Getting it from S3." - wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$ci_image + wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$image_name - echo "Uploading image $ci_image to OpenStack..." - openstack image create --file "$ci_image.qcow2" --disk-format qcow2 "$ci_image" + echo "Uploading image $image_name to OpenStack..." + openstack image create --file "$image_name.qcow2" --disk-format qcow2 "$image_name" - echo "Image $ci_image has been uploaded to OpenStack." + echo "Image $image_name has been uploaded to OpenStack." 
fi done \ No newline at end of file diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml index de3fbbb41..a52c0aa80 100644 --- a/.github/workflows/upload-release-image.yml +++ b/.github/workflows/upload-release-image.yml @@ -2,11 +2,17 @@ name: Upload release images to client sites from s3 on: workflow_dispatch: - push: - branches: - - main - - master - - feat/fatimage-auto-upload + inputs: + image_name: + type: string + description: Image name from https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/ + required: true + + # push: + # branches: + # - main + # - master + # - feat/fatimage-auto-upload jobs: image_upload: @@ -14,14 +20,9 @@ jobs: concurrency: ${{ github.ref }} env: OS_CLOUD: openstack - CI_CLOUD: ${{ vars.CI_CLOUD }} IMAGE_LIST: "RL8 RL9" steps: - - uses: actions/checkout@v2 - - - name: Record which cloud CI is running on - run: | - echo CI_CLOUD: ${{ vars.CI_CLOUD }} + - uses: actions/checkout@v4 - name: Write clouds.yaml run: | @@ -31,11 +32,9 @@ jobs: - name: Upload latest image if missing run: | - /usr/bin/python3.10 -m venv venv + python3 -m venv venv . venv/bin/activate pip install -U pip - pip install -r requirements.txt - ansible --version + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) . environments/.stackhpc/activate - ansible-galaxy collection install openstack.cloud - . .github/bin/get-s3-image.sh -o \ No newline at end of file + bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} \ No newline at end of file From c2b87e4d2bd1720a197c99ab3f9bc2a85008fbe3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 22 Aug 2024 11:59:45 +0000 Subject: [PATCH 39/78] fix cloud config parse --- .github/bin/get-s3-image.sh | 32 +++++++++------------- .github/workflows/upload-release-image.yml | 3 +- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index decd4e0fa..7a81a236f 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -1,26 +1,20 @@ #!/bin/bash -set -x +set -ex -# Variables (adjust these paths as necessary) -S3_BUCKET="s3://openhpc-images" +image_name=$1 +echo "Checking if image $image_name exists in OpenStack" +image_exists=$(openstack image list --name "$image_name" -f value -c Name) -for IMAGE_OS in $IMAGE_LIST; do - - image_name=$1 - echo "Checking if image $image_name exists in OpenStack" - image_exists=$(openstack image list --name "$image_name" -f value -c Name) +if [ "$image_exists" == "$image_name" ]; then + echo "Image $image_name already exists in OpenStack." +else + echo "Image $image_name not found in OpenStack. Getting it from S3." - if [ "$image_exists" == "$image_name" ]; then - echo "Image $image_name already exists in OpenStack." - else - echo "Image $image_name not found in OpenStack. Getting it from S3." + wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$image_name --progress=dot:giga - wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$image_name + echo "Uploading image $image_name to OpenStack..." + openstack image create --file "$image_name.qcow2" --disk-format qcow2 "$image_name" - echo "Uploading image $image_name to OpenStack..." - openstack image create --file "$image_name.qcow2" --disk-format qcow2 "$image_name" - - echo "Image $image_name has been uploaded to OpenStack." 
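+# Usage sketch -- at this point the dispatch workflow invokes the script with a
+# single argument, e.g. (image name illustrative, taken from this series' CI images):
+#   bash .github/bin/get-s3-image.sh openhpc-RL9-240904-1509-1687368f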
- fi -done \ No newline at end of file + echo "Image $image_name has been uploaded to OpenStack." +fi diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml index a52c0aa80..74328afd9 100644 --- a/.github/workflows/upload-release-image.yml +++ b/.github/workflows/upload-release-image.yml @@ -27,7 +27,7 @@ jobs: - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ - echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + echo "${{ secrets.OS_CLOUD_YAML }}" > ~/.config/openstack/clouds.yaml shell: bash - name: Upload latest image if missing @@ -36,5 +36,4 @@ jobs: . venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - . environments/.stackhpc/activate bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} \ No newline at end of file From 377a607cff41cda7b99127f248b5025aa7c42fd2 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 22 Aug 2024 12:56:58 +0000 Subject: [PATCH 40/78] fix image create name --- .github/bin/get-s3-image.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 7a81a236f..4b2433bc9 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -14,7 +14,7 @@ else wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$image_name --progress=dot:giga echo "Uploading image $image_name to OpenStack..." - openstack image create --file "$image_name.qcow2" --disk-format qcow2 "$image_name" + openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress echo "Image $image_name has been uploaded to OpenStack." fi From 1866fc806b95c8e6edbbed61c80b4ec9317c91c7 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 22 Aug 2024 13:59:40 +0000 Subject: [PATCH 41/78] pick bucket and handle cancellation --- .github/bin/get-s3-image.sh | 3 +- .github/workflows/upload-release-image.yml | 35 ++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 4b2433bc9..64caaaf17 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -3,6 +3,7 @@ set -ex image_name=$1 +bucket_name=$2 echo "Checking if image $image_name exists in OpenStack" image_exists=$(openstack image list --name "$image_name" -f value -c Name) @@ -11,7 +12,7 @@ if [ "$image_exists" == "$image_name" ]; then else echo "Image $image_name not found in OpenStack. Getting it from S3." - wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/$image_name --progress=dot:giga + wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga echo "Uploading image $image_name to OpenStack..." 
openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml index 74328afd9..6c6242158 100644 --- a/.github/workflows/upload-release-image.yml +++ b/.github/workflows/upload-release-image.yml @@ -5,14 +5,15 @@ on: inputs: image_name: type: string - description: Image name from https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/ + description: Image name from https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{BUCKET_NAME}/ required: true - - # push: - # branches: - # - main - # - master - # - feat/fatimage-auto-upload + bucket_name: + type: choice + required: true + description: Bucket name + options: + - openhpc-images + # - openhpc-images-prerelease jobs: image_upload: @@ -20,7 +21,6 @@ jobs: concurrency: ${{ github.ref }} env: OS_CLOUD: openstack - IMAGE_LIST: "RL8 RL9" steps: - uses: actions/checkout@v4 @@ -36,4 +36,21 @@ jobs: . venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} \ No newline at end of file + bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} ${{ inputs.bucket_name }} + + - name: Cleanup OpenStack Image (on error or cancellation) + if: cancelled() + run: | + . venv/bin/activate + image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID) + if [ -n "$image_hanging" ]; then + echo "Cleaning up OpenStack image with ID: $image_hanging" + openstack image delete $image_hanging + else + echo "No image ID found, skipping cleanup." + fi + shell: bash + + - name: Confirm Success + if: success() + run: echo "Deployment succeeded, no cleanup needed." \ No newline at end of file From 3d0dde70ea58acb0da6369a2af4a8f8cc13905eb Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 22 Aug 2024 16:17:16 +0000 Subject: [PATCH 42/78] documentation --- .github/bin/get-s3-image.sh | 5 +++++ ...ease-image.yml => upload-release-image.yml.sample} | 11 +++++++++++ README.md | 4 +++- 3 files changed, 19 insertions(+), 1 deletion(-) rename .github/workflows/{upload-release-image.yml => upload-release-image.yml.sample} (65%) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 64caaaf17..64e3e3f7f 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -1,5 +1,10 @@ #!/bin/bash +##### +# This script looks for an image in OpenStack and if not found, downloads from +# S3 bucket, and then uploads to OpenStack +##### + set -ex image_name=$1 diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml.sample similarity index 65% rename from .github/workflows/upload-release-image.yml rename to .github/workflows/upload-release-image.yml.sample index 6c6242158..e117d3680 100644 --- a/.github/workflows/upload-release-image.yml +++ b/.github/workflows/upload-release-image.yml.sample @@ -1,3 +1,14 @@ +# This workflow dispatch is to be used on a downstream ansible-slurm-appliance repository. The workflow takes two inputs: +# image name, and s3 bucket name. The image is searched for in the environment set openstack, and if not found there, +# downloads it from the ARCUS S3 bucket specified. The workflow then uploads the image to the openstack. 
+# +# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows +# and give it an appropriate name, e.g. +# cp .github/workflows/upload-s3-image.yml.sample .github/workflows/upload-s3-image.yml +# +# In order for the workflow to access the openstack, credentials in the form of a clouds.yaml file must be provided by a +# secret. This secret should have the name OS_CLOUD_YAML to be found by the workflow. +# Details on the contents of the clouds.yaml file can be found at https://docs.openstack.org/keystone/latest/user/application_credentials.html name: Upload release images to client sites from s3 on: diff --git a/README.md b/README.md index 6c3696c99..f11be22b2 100644 --- a/README.md +++ b/README.md @@ -149,4 +149,6 @@ Please see the [monitoring-and-logging.README.md](docs/monitoring-and-logging.RE The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintainence tasks. These include: -- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. \ No newline at end of file +- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. + +- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow dispatch which takes an image name, downloads it from ARCUS s3 bucket if available, and uploads to downstream environment openstack. \ No newline at end of file From b23b0cd3dfcabb216ef7fbfa94b024dc0c215ff2 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 10:56:07 +0000 Subject: [PATCH 43/78] suggested changes --- .github/bin/get-s3-image.sh | 4 ++-- .../workflows/upload-release-image.yml.sample | 23 +++++++++---------- README.md | 2 +- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 64e3e3f7f..da510fcb6 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -12,7 +12,7 @@ bucket_name=$2 echo "Checking if image $image_name exists in OpenStack" image_exists=$(openstack image list --name "$image_name" -f value -c Name) -if [ "$image_exists" == "$image_name" ]; then +if [ -n $image_exists ]; then echo "Image $image_name already exists in OpenStack." else echo "Image $image_name not found in OpenStack. Getting it from S3." @@ -20,7 +20,7 @@ else wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga echo "Uploading image $image_name to OpenStack..." - openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress + openstack image create --file $image_name --disk-format qcow2 $image_name --progress echo "Image $image_name has been uploaded to OpenStack." 
fi diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index e117d3680..1ce1509f6 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -1,13 +1,16 @@ -# This workflow dispatch is to be used on a downstream ansible-slurm-appliance repository. The workflow takes two inputs: -# image name, and s3 bucket name. The image is searched for in the environment set openstack, and if not found there, -# downloads it from the ARCUS S3 bucket specified. The workflow then uploads the image to the openstack. +# This workflow can be used to fetch images published by StackHPC and upload them to a client's OpenStack. +# The workflow takes two inputs: +# - image name +# - s3 bucket name +# and first checks to see if the image exists in the target OpenStack. If the image doesn't exist, it is downloaded +# from StackHPC's public S3 bucket and then uploaded to the target OpenStack. # # To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows # and give it an appropriate name, e.g. # cp .github/workflows/upload-s3-image.yml.sample .github/workflows/upload-s3-image.yml # -# In order for the workflow to access the openstack, credentials in the form of a clouds.yaml file must be provided by a -# secret. This secret should have the name OS_CLOUD_YAML to be found by the workflow. +# In order for the workflow to access the target OpenStack, an application credential clouds.yaml file must be +# added as a repository secret named OS_CLOUD_YAML. # Details on the contents of the clouds.yaml file can be found at https://docs.openstack.org/keystone/latest/user/application_credentials.html name: Upload release images to client sites from s3 @@ -53,15 +56,11 @@ jobs: if: cancelled() run: | . venv/bin/activate - image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID) - if [ -n "$image_hanging" ]; then + image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' queued$' | awk '{print $1}' + if [ -n $image_hanging ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" openstack image delete $image_hanging else echo "No image ID found, skipping cleanup." fi - shell: bash - - - name: Confirm Success - if: success() - run: echo "Deployment succeeded, no cleanup needed." \ No newline at end of file + shell: bash \ No newline at end of file diff --git a/README.md b/README.md index f11be22b2..d348d66d0 100644 --- a/README.md +++ b/README.md @@ -151,4 +151,4 @@ The `.github` directory contains a set of sample workflows which can be used by - An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. -- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow dispatch which takes an image name, downloads it from ARCUS s3 bucket if available, and uploads to downstream environment openstack. \ No newline at end of file +- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud. 
\ No newline at end of file From e260818285eff2070c10c6831b74f0aa230693b3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 11:37:16 +0000 Subject: [PATCH 44/78] filter out active images --- .github/workflows/upload-release-image.yml.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 1ce1509f6..30b4b9660 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -56,7 +56,7 @@ jobs: if: cancelled() run: | . venv/bin/activate - image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' queued$' | awk '{print $1}' + image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}' if [ -n $image_hanging ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" openstack image delete $image_hanging From 37e7240a05a82b851b7f699d21c7f1c3befa91cd Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 12:28:44 +0000 Subject: [PATCH 45/78] fix image exists --- .github/bin/get-s3-image.sh | 2 +- .github/workflows/upload-release-image.yml.sample | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index da510fcb6..276248513 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -12,7 +12,7 @@ bucket_name=$2 echo "Checking if image $image_name exists in OpenStack" image_exists=$(openstack image list --name "$image_name" -f value -c Name) -if [ -n $image_exists ]; then +if [ ! -n $image_exists ]; then echo "Image $image_name already exists in OpenStack." else echo "Image $image_name not found in OpenStack. Getting it from S3." diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 30b4b9660..46cfad534 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -57,7 +57,7 @@ jobs: run: | . venv/bin/activate image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}' - if [ -n $image_hanging ]; then + if [ ! -n $image_hanging ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" openstack image delete $image_hanging else From 7f0036ef799612eb66570881a86999ef0c95e1f0 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 12:52:20 +0000 Subject: [PATCH 46/78] fix image test logic --- .github/bin/get-s3-image.sh | 2 +- .github/workflows/upload-release-image.yml.sample | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 276248513..da510fcb6 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -12,7 +12,7 @@ bucket_name=$2 echo "Checking if image $image_name exists in OpenStack" image_exists=$(openstack image list --name "$image_name" -f value -c Name) -if [ ! -n $image_exists ]; then +if [ -n $image_exists ]; then echo "Image $image_name already exists in OpenStack." else echo "Image $image_name not found in OpenStack. Getting it from S3." 
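A note on the `[ -n $image_exists ]` test that the surrounding patches keep flipping: with the expansion unquoted, an empty variable collapses the command to `[ -n ]`, and a one-argument test is true whenever that argument is a non-empty string ("-n" itself here), so the "already exists" branch fires even when the image is missing. A minimal demonstration:

    image_exists=""
    [ -n $image_exists ] && echo "unquoted test passes even though empty"
    [ -n "$image_exists" ] || echo "quoted test correctly detects empty"

Quoting the expansion, as the later "add quotes to var" commit does, restores the intended check.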
diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 46cfad534..30b4b9660 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -57,7 +57,7 @@ jobs: run: | . venv/bin/activate image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}' - if [ ! -n $image_hanging ]; then + if [ -n $image_hanging ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" openstack image delete $image_hanging else From 57e1e494176bf78c273b2f34d6d5622c4cd2f898 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 13:05:10 +0000 Subject: [PATCH 47/78] add quotes to var --- .github/bin/get-s3-image.sh | 2 +- .github/workflows/upload-release-image.yml.sample | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index da510fcb6..7da721610 100644 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -12,7 +12,7 @@ bucket_name=$2 echo "Checking if image $image_name exists in OpenStack" image_exists=$(openstack image list --name "$image_name" -f value -c Name) -if [ -n $image_exists ]; then +if [ -n "$image_exists" ]; then echo "Image $image_name already exists in OpenStack." else echo "Image $image_name not found in OpenStack. Getting it from S3." diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 30b4b9660..d4b6e4c1d 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -56,7 +56,7 @@ jobs: if: cancelled() run: | . venv/bin/activate - image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}' + image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') if [ -n $image_hanging ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" openstack image delete $image_hanging From 0a8f9ed520beeef8a3d3fcc03fda1e782292530f Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 13:35:58 +0000 Subject: [PATCH 48/78] finalise for upstream --- .github/workflows/upload-release-image.yml.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index d4b6e4c1d..2d609d237 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -57,7 +57,7 @@ jobs: run: | . 
venv/bin/activate image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') - if [ -n $image_hanging ]; then + if [ -n "$image_hanging" ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" openstack image delete $image_hanging else From a45a6150cac3f6b4166c263df0d548523d4111bd Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 14:08:03 +0000 Subject: [PATCH 49/78] markdown test --- .github/workflows/upload-release-image.yml.sample | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 2d609d237..67739f7dc 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -19,7 +19,8 @@ on: inputs: image_name: type: string - description: Image name from https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{BUCKET_NAME}/ + description: Image name from [openhpc-images](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images}) or [openhpc-images-prerelease](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images-prerelease}) + required: true bucket_name: type: choice From 86fb48dd600e19e0032bd0a73a882d1065adfde3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 14:24:39 +0000 Subject: [PATCH 50/78] markdown block text --- .github/workflows/upgrade-check.yml | 79 +++++++++++++++++++ .github/workflows/upload-release-image.yml | 70 ++++++++++++++++ .../workflows/upload-release-image.yml.sample | 5 +- 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/upgrade-check.yml create mode 100644 .github/workflows/upload-release-image.yml diff --git a/.github/workflows/upgrade-check.yml b/.github/workflows/upgrade-check.yml new file mode 100644 index 000000000..39efcd8fe --- /dev/null +++ b/.github/workflows/upgrade-check.yml @@ -0,0 +1,79 @@ +# This workflow compares a downstream ansible-slurm-appliance repository for a specific site with the upstream +# stackhpc/ansible-slurm-appliance repository to check whether there is a new upstream version available. If a +# newer tag is found in the upstream repository then a pull request is created to the downstream repo +# in order to merge in the changes from the new upstream release. +# +# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows +# and give it an appropriate name, e.g. +# cp .github/workflows/upgrade-check.yml.sample .github/workflows/upgrade-check.yml +# +# Workflow uses https://github.com/peter-evans/create-pull-request to handle the pull request action. +# See the docs for action inputs. +# +# In order for GitHub actions to create pull requests that make changes to workflows in `.github/workflows`, +# a token for each deployment must be provided. Both user PAT and fine-grained tokens should work, but it was tested +# with a PAT. Fine-grained repo-scoped token is preferred if possible but requires organisation admin privileges. +# +# See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +# for security considerations around tokens. TREAT YOUR ACCESS TOKENS LIKE PASSWORDS. 
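+# The token can be stored as a repository secret from the command line; a
+# sketch assuming the GitHub CLI is installed and authenticated, with
+# placeholder repo and token values:
+#   gh secret set WORKFLOW_TOKEN --repo <org>/<site-config-repo> --body "<token>"
+# The secret name WORKFLOW_TOKEN matches the token reference used below.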
+# +# The following repository permissions must be set for the PAT: +# - `Workflows: Read and write` +# - `Actions: Read and write` +# - `Pull requests: Read and write` +# The PAT should then be copied into an Actions repository secret in the downstream repo with the title `WORKFLOW_TOKEN`. + +name: Check for upstream updates +on: + schedule: + - cron: "0 9 * * *" + workflow_dispatch: +jobs: + check_for_update: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout the config repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + + # Based on equivalent azimuth-config job + - name: Check for new release + shell: bash + run: | + set -xe + + # Tell git who we are for commits + git config user.email "${{ github.actor }}-ci@slurmapp.ci" + git config user.name "${{ github.actor }} CI" + + # Create the merge branch and write vars to .mergeenv file + .github/bin/create-merge-branch.sh + + - name: Set release tag output + id: release_tag + if: ${{ hashFiles('.mergeenv') }} + run: source .mergeenv && echo value=$RELEASE_TAG >> $GITHUB_OUTPUT + + - name: Set branch name output + id: branch_name + if: ${{ hashFiles('.mergeenv') }} + run: source .mergeenv && echo value=$BRANCH_NAME >> $GITHUB_OUTPUT + + - name: Remove tmp file + run: rm .mergeenv + if: ${{ hashFiles('.mergeenv') }} + + - name: Create Pull Request + if: ${{ steps.release_tag.outputs.value }} + uses: peter-evans/create-pull-request@v6 + with: + base: main + branch: ${{ steps.branch_name.outputs.value }} + title: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}" + body: This PR was automatically generated by GitHub Actions. + commit-message: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}" + delete-branch: true + token: ${{ secrets.WORKFLOW_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml new file mode 100644 index 000000000..3cb29fb85 --- /dev/null +++ b/.github/workflows/upload-release-image.yml @@ -0,0 +1,70 @@ +# This workflow can be used to fetch images published by StackHPC and upload them to a client's OpenStack. +# The workflow takes two inputs: +# - image name +# - s3 bucket name +# and first checks to see if the image exists in the target OpenStack. If the image doesn't exist, it is downloaded +# from StackHPC's public S3 bucket and then uploaded to the target OpenStack. +# +# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows +# and give it an appropriate name, e.g. +# cp .github/workflows/upload-s3-image.yml.sample .github/workflows/upload-s3-image.yml +# +# In order for the workflow to access the target OpenStack, an application credential clouds.yaml file must be +# added as a repository secret named OS_CLOUD_YAML. 
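+# A minimal application-credential clouds.yaml might look like the following
+# sketch, where the auth_url, id and secret are placeholders and the cloud key
+# matches the OS_CLOUD=openstack environment variable set below:
+#   clouds:
+#     openstack:
+#       auth_type: v3applicationcredential
+#       auth:
+#         auth_url: https://keystone.example.com:5000
+#         application_credential_id: "<application credential id>"
+#         application_credential_secret: "<application credential secret>"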
+# Details on the contents of the clouds.yaml file can be found at https://docs.openstack.org/keystone/latest/user/application_credentials.html + +name: Upload release images to client sites from s3 +on: + workflow_dispatch: + inputs: + image_name: + type: string + description: | + Image name from: + [openhpc-images](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images}) + or [openhpc-images-prerelease](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images-prerelease}) + + required: true + bucket_name: + type: choice + required: true + description: Bucket name + options: + - openhpc-images + # - openhpc-images-prerelease + +jobs: + image_upload: + runs-on: ubuntu-22.04 + concurrency: ${{ github.ref }} + env: + OS_CLOUD: openstack + steps: + - uses: actions/checkout@v4 + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets.OS_CLOUD_YAML }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Upload latest image if missing + run: | + python3 -m venv venv + . venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} ${{ inputs.bucket_name }} + + - name: Cleanup OpenStack Image (on error or cancellation) + if: cancelled() + run: | + . venv/bin/activate + image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') + if [ -n "$image_hanging" ]; then + echo "Cleaning up OpenStack image with ID: $image_hanging" + openstack image delete $image_hanging + else + echo "No image ID found, skipping cleanup." + fi + shell: bash \ No newline at end of file diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 67739f7dc..3cb29fb85 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -19,7 +19,10 @@ on: inputs: image_name: type: string - description: Image name from [openhpc-images](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images}) or [openhpc-images-prerelease](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images-prerelease}) + description: | + Image name from: + [openhpc-images](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images}) + or [openhpc-images-prerelease](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images-prerelease}) required: true bucket_name: From 663e6cbc772b4bbc7489977d6e666347071d5cd1 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 23 Aug 2024 14:31:08 +0000 Subject: [PATCH 51/78] finish --- .github/workflows/upgrade-check.yml | 79 ------------------- .github/workflows/upload-release-image.yml | 70 ---------------- .../workflows/upload-release-image.yml.sample | 6 +- 3 files changed, 1 insertion(+), 154 deletions(-) delete mode 100644 .github/workflows/upgrade-check.yml delete mode 100644 .github/workflows/upload-release-image.yml diff --git a/.github/workflows/upgrade-check.yml b/.github/workflows/upgrade-check.yml deleted file mode 100644 index 39efcd8fe..000000000 --- a/.github/workflows/upgrade-check.yml +++ /dev/null @@ -1,79 +0,0 @@ -# This workflow compares a downstream 
ansible-slurm-appliance repository for a specific site with the upstream -# stackhpc/ansible-slurm-appliance repository to check whether there is a new upstream version available. If a -# newer tag is found in the upstream repository then a pull request is created to the downstream repo -# in order to merge in the changes from the new upstream release. -# -# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows -# and give it an appropriate name, e.g. -# cp .github/workflows/upgrade-check.yml.sample .github/workflows/upgrade-check.yml -# -# Workflow uses https://github.com/peter-evans/create-pull-request to handle the pull request action. -# See the docs for action inputs. -# -# In order for GitHub actions to create pull requests that make changes to workflows in `.github/workflows`, -# a token for each deployment must be provided. Both user PAT and fine-grained tokens should work, but it was tested -# with a PAT. Fine-grained repo-scoped token is preferred if possible but requires organisation admin privileges. -# -# See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -# for security considerations around tokens. TREAT YOUR ACCESS TOKENS LIKE PASSWORDS. -# -# The following repository permissions must be set for the PAT: -# - `Workflows: Read and write` -# - `Actions: Read and write` -# - `Pull requests: Read and write` -# The PAT should then be copied into an Actions repository secret in the downstream repo with the title `WORKFLOW_TOKEN`. - -name: Check for upstream updates -on: - schedule: - - cron: "0 9 * * *" - workflow_dispatch: -jobs: - check_for_update: - runs-on: ubuntu-22.04 - - steps: - - name: Checkout the config repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - # Based on equivalent azimuth-config job - - name: Check for new release - shell: bash - run: | - set -xe - - # Tell git who we are for commits - git config user.email "${{ github.actor }}-ci@slurmapp.ci" - git config user.name "${{ github.actor }} CI" - - # Create the merge branch and write vars to .mergeenv file - .github/bin/create-merge-branch.sh - - - name: Set release tag output - id: release_tag - if: ${{ hashFiles('.mergeenv') }} - run: source .mergeenv && echo value=$RELEASE_TAG >> $GITHUB_OUTPUT - - - name: Set branch name output - id: branch_name - if: ${{ hashFiles('.mergeenv') }} - run: source .mergeenv && echo value=$BRANCH_NAME >> $GITHUB_OUTPUT - - - name: Remove tmp file - run: rm .mergeenv - if: ${{ hashFiles('.mergeenv') }} - - - name: Create Pull Request - if: ${{ steps.release_tag.outputs.value }} - uses: peter-evans/create-pull-request@v6 - with: - base: main - branch: ${{ steps.branch_name.outputs.value }} - title: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}" - body: This PR was automatically generated by GitHub Actions. - commit-message: "Upgrade ansible-slurm-appliance to ${{ steps.release_tag.outputs.value }}" - delete-branch: true - token: ${{ secrets.WORKFLOW_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/upload-release-image.yml b/.github/workflows/upload-release-image.yml deleted file mode 100644 index 3cb29fb85..000000000 --- a/.github/workflows/upload-release-image.yml +++ /dev/null @@ -1,70 +0,0 @@ -# This workflow can be used to fetch images published by StackHPC and upload them to a client's OpenStack. 
-# The workflow takes two inputs: -# - image name -# - s3 bucket name -# and first checks to see if the image exists in the target OpenStack. If the image doesn't exist, it is downloaded -# from StackHPC's public S3 bucket and then uploaded to the target OpenStack. -# -# To use this workflow in a downstream ansible-slurm-appliance repository simply copy it into .github/workflows -# and give it an appropriate name, e.g. -# cp .github/workflows/upload-s3-image.yml.sample .github/workflows/upload-s3-image.yml -# -# In order for the workflow to access the target OpenStack, an application credential clouds.yaml file must be -# added as a repository secret named OS_CLOUD_YAML. -# Details on the contents of the clouds.yaml file can be found at https://docs.openstack.org/keystone/latest/user/application_credentials.html - -name: Upload release images to client sites from s3 -on: - workflow_dispatch: - inputs: - image_name: - type: string - description: | - Image name from: - [openhpc-images](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images}) - or [openhpc-images-prerelease](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images-prerelease}) - - required: true - bucket_name: - type: choice - required: true - description: Bucket name - options: - - openhpc-images - # - openhpc-images-prerelease - -jobs: - image_upload: - runs-on: ubuntu-22.04 - concurrency: ${{ github.ref }} - env: - OS_CLOUD: openstack - steps: - - uses: actions/checkout@v4 - - - name: Write clouds.yaml - run: | - mkdir -p ~/.config/openstack/ - echo "${{ secrets.OS_CLOUD_YAML }}" > ~/.config/openstack/clouds.yaml - shell: bash - - - name: Upload latest image if missing - run: | - python3 -m venv venv - . venv/bin/activate - pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} ${{ inputs.bucket_name }} - - - name: Cleanup OpenStack Image (on error or cancellation) - if: cancelled() - run: | - . venv/bin/activate - image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') - if [ -n "$image_hanging" ]; then - echo "Cleaning up OpenStack image with ID: $image_hanging" - openstack image delete $image_hanging - else - echo "No image ID found, skipping cleanup." 
- fi - shell: bash \ No newline at end of file diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 3cb29fb85..264a96143 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -19,11 +19,7 @@ on: inputs: image_name: type: string - description: | - Image name from: - [openhpc-images](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images}) - or [openhpc-images-prerelease](https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{openhpc-images-prerelease}) - + description: Image name from: (https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{BUCKET_NAME}) required: true bucket_name: type: choice From 9e53ce6ce9535df4bd3c50df74e33451128d6540 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:52:44 +0100 Subject: [PATCH 52/78] Add RL9 cuda build variant (#428) * determine cuda distro automatically * fix typo in CUDA samples * make facts available for cuda * add RL9 cuda build variant * fix typo in build definitions * set packer build volume sizes depending on build variant * fix volume size definition * fix cuda verfsion to workaround issue with 12-6-0-1 * don't fail all builds if one fails * bump CUDA builder disk size (build ran out of space) * download cuda image to /mnt on gh runner * download cuda image to /mnt on gh runner * fix fatimage.yml mnt permissions * Update main.yml * switch to open nvidia drivers * bump CI images * make packer build volume-backed optional again --------- Co-authored-by: bertiethorpe Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> --- .github/workflows/fatimage.yml | 16 +++++++++----- ansible/extras.yml | 2 +- ansible/roles/cuda/defaults/main.yml | 7 +++--- ansible/roles/cuda/tasks/main.yml | 13 ++--------- environments/.stackhpc/ARCUS.pkrvars.hcl | 3 --- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 3 --- environments/.stackhpc/terraform/main.tf | 4 ++-- packer/openstack.pkr.hcl | 23 +++++++++++++++----- 8 files changed, 37 insertions(+), 34 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 31fcc789a..7e2fc35b1 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -10,16 +10,20 @@ jobs: name: openstack-imagebuild runs-on: ubuntu-22.04 strategy: - matrix: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8, RL9+OFED, RL9+CUDA versions os_version: - RL8 - RL9 build: - openstack.openhpc - openstack.openhpc-ofed + - openstack.openhpc-cuda exclude: - os_version: RL8 build: openstack.openhpc-ofed + - os_version: RL8 + build: openstack.openhpc-cuda - os_version: RL9 build: openstack.openhpc env: @@ -81,7 +85,9 @@ jobs: - name: Download image run: | . 
venv/bin/activate - openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} + sudo mkdir /mnt/images + sudo chmod 777 /mnt/images + openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -95,13 +101,13 @@ jobs: run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' - name: mount qcow2 file - run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@0.17.0 with: scan-type: fs - scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scan-ref: "${{ steps.manifest.outputs.image-name }}" scanners: "vuln" format: sarif output: "${{ steps.manifest.outputs.image-name }}.sarif" @@ -117,7 +123,7 @@ jobs: uses: aquasecurity/trivy-action@0.16.1 with: scan-type: fs - scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scan-ref: "${{ steps.manifest.outputs.image-name }}" scanners: "vuln" format: table exit-code: '1' diff --git a/ansible/extras.yml b/ansible/extras.yml index 445a0cc16..c32f51c32 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -21,7 +21,7 @@ - name: Setup CUDA hosts: cuda become: yes - gather_facts: no + gather_facts: yes tags: cuda tasks: - import_role: diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 6b377a10b..33a25d9b4 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,11 +1,12 @@ -cuda_distro: rhel8 +cuda_distro: "rhel{{ ansible_distribution_major_version }}" cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" cuda_driver_stream: default +cuda_package_version: 'latest' cuda_packages: - - cuda + - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds # _cuda_version_tuple: # discovered from installed package e.g. 
('12', '1', '0') -cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}" +cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}" cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz" cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples" cuda_samples_programs: diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/main.yml index b323cfc04..22f8e9e8e 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/main.yml @@ -24,22 +24,13 @@ failed_when: false register: _cuda_driver_module_enabled -- name: List nvidia driver dnf module stream versions - shell: - cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V - # Output of interest from command is something like (some whitespace removed): - # "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch " - changed_when: false - register: _cuda_driver_module_streams - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" - - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}" + ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" -- name: Install nvidia drivers # TODO: make removal possible? +- name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index c07717156..6fd80e7a6 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,7 +1,4 @@ flavor = "vm.ska.cpu.general.small" -use_blockstorage_volume = true -volume_size = 15 # GB -image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index da2d96d38..5adf4199c 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -1,8 +1,5 @@ flavor = "ec1.large" -use_blockstorage_volume = true -volume_size = 15 # GB volume_type = "unencrypted" -image_disk_format = "qcow2" networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 53ed174f4..a8c5d787f 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -30,8 +30,8 @@ variable "cluster_image" { type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 - RL8: "openhpc-RL8-240813-1317-1b370a36" - RL9: "openhpc-ofed-RL9-240813-1317-1b370a36" + RL8: "openhpc-RL8-240904-1509-1687368f" + RL9: "openhpc-ofed-RL9-240904-1509-1687368f" } } diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index bb6af1a38..5f66c0320 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -120,7 +120,7 @@ 
variable "manifest_output_path" { variable "use_blockstorage_volume" { type = bool - default = false + default = true } variable "volume_type" { @@ -129,13 +129,18 @@ variable "volume_type" { } variable "volume_size" { - type = number - default = null # When not specified use the size of the builder instance root disk + type = map(number) + default = { + # fat image builds, GB: + openhpc = 15 + openhpc-ofed = 15 + openhpc-cuda = 30 + } } variable "image_disk_format" { type = string - default = null # When not specified use the image default + default = "qcow2" } variable "metadata" { @@ -150,6 +155,7 @@ variable "groups" { # fat image builds: openhpc = ["control", "compute", "login"] openhpc-ofed = ["control", "compute", "login", "ofed"] + openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"] } } @@ -158,11 +164,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type + volume_size = var.volume_size[source.name] metadata = var.metadata networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups - volume_size = var.volume_size # Input image: source_image = "${var.source_image[var.os_version]}" @@ -178,7 +184,7 @@ source "openstack" "openhpc" { ssh_bastion_private_key_file = var.ssh_bastion_private_key_file # Output image: - image_disk_format = var.image_disk_format + image_disk_format = "qcow2" image_visibility = var.image_visibility image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } @@ -195,6 +201,11 @@ build { name = "openhpc-ofed" } + # CUDA fat image: + source "source.openstack.openhpc" { + name = "openhpc-cuda" + } + # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra" From 80c4cebeab42b5d12dac246db283815fbdd989a5 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Fri, 6 Sep 2024 17:45:47 +0100 Subject: [PATCH 53/78] Build RL8+OFED image in CI (#427) * Check major version for RL8 package installs * Gather facts on ofed role * Support kernel checks with mismatching version length 4.18.0-553.16.1.el8_9.x86_64 4.18.0-553.el8_9.x86_64 These would fail with the error: '<' not supported between instances of 'str' and 'int'. as the community.general.version_sort was trying to compare the `el8_9` of the latter with the `16` of the former. Strip the last two chunks so we just compare numbers. 
* Move to LTS version now RL9.4 is supported
* Fail when any inventory source cannot be parsed
* Always reboot after selinux and package updates
* Clear facts before OFED so install will match newest kernel
* Clear facts after reboot so OFED install will match newest kernel
* fail caas and stackhpc if any inventory can't be read
* make reboot conditional on package or SELinux changes again
* include OFED in both RL8 and RL9 builds
* always run CI tests on RL8 and RL9
* allow concurrent RL8/RL9 CI tests
* mark pending reboot check as not a change
* fix workflow matrix definitions
* bump CI images - now both OFED
* use reboot hint for checking reboot required

---------

Co-authored-by: Steve Brasier
---
 .github/workflows/fatimage.yml | 15 +++------
 .github/workflows/stackhpc.yml | 33 +++++--------------
 ansible/bootstrap.yml | 19 ++++++-----
 ansible/roles/ofed/defaults/main.yml | 3 +-
 ansible/roles/ofed/tasks/install.yml | 8 +++--
 environments/.caas/ansible.cfg | 4 +++
 environments/.stackhpc/ansible.cfg | 4 +++
 environments/.stackhpc/terraform/main.tf | 6 ++--
 .../{{cookiecutter.environment}}/ansible.cfg | 4 +++
 9 files changed, 45 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index 7e2fc35b1..59eb1b78e 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -1,31 +1,26 @@
 name: Build fat image
-'on':
+on:
   workflow_dispatch:
-concurrency:
-  group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build
-  cancel-in-progress: true
 jobs:
   openstack:
     name: openstack-imagebuild
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
+      cancel-in-progress: true
     runs-on: ubuntu-22.04
     strategy:
       fail-fast: false # allow other matrix jobs to continue even if one fails
-      matrix: # build RL8, RL9+OFED, RL9+CUDA versions
+      matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
         os_version:
           - RL8
          - RL9
         build:
-          - openstack.openhpc
           - openstack.openhpc-ofed
           - openstack.openhpc-cuda
         exclude:
-          - os_version: RL8
-            build: openstack.openhpc-ofed
           - os_version: RL8
             build: openstack.openhpc-cuda
-          - os_version: RL9
-            build: openstack.openhpc
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index 1813eac13..711d24c21 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -2,12 +2,6 @@ name: Test deployment and reimage on OpenStack
 on:
   workflow_dispatch:
-    inputs:
-      use_RL8:
-        required: true
-        description: Include RL8 tests
-        type: boolean
-        default: false
   push:
     branches:
       - main
@@ -15,27 +9,22 @@ on:
 jobs:
   openstack:
     name: openstack-ci
-    concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
+      cancel-in-progress: true
     runs-on: ubuntu-22.04
     strategy:
+      fail-fast: false # allow other matrix jobs to continue even if one fails
       matrix:
-        os_version: [RL8, RL9]
-        rl8_selected:
-          - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch
-        rl8_branch:
-          - ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge
-        rl8_label:
-          - ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created
-        exclude:
-          - os_version: RL8
-            rl8_selected: false
-            rl8_branch: false
-            
rl8_label: false + os_version: + - RL8 + - RL9 env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} + TF_VAR_os_version: ${{ matrix.os_version }} steps: - uses: actions/checkout@v2 @@ -89,8 +78,6 @@ jobs: . environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" - env: - TF_VAR_os_version: ${{ matrix.os_version }} - name: Delete infrastructure if provisioning failed run: | @@ -99,8 +86,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - env: - TF_VAR_os_version: ${{ matrix.os_version }} - name: Configure cluster run: | @@ -199,8 +184,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} - env: - TF_VAR_os_version: ${{ matrix.os_version }} # - name: Delete images # run: | diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e8e2713a5..c43d614db 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -227,24 +227,25 @@ - update tasks: - name: Check for pending reboot from package updates - stat: - path: /var/run/reboot-required + command: + cmd: dnf needs-restarting -r register: update_reboot_required - - debug: - msg: "setstatus:{{ (sestatus.reboot_required | default(false)) }} packages: {{ (update_reboot_required.stat.exists | bool) }}" - - name: Reboot if required from SELinux state change or package upgrades + failed_when: "update_reboot_required.rc not in [0, 1]" + changed_when: false + - name: Reboot to cover SELinux state change or package upgrades reboot: post_reboot_delay: 30 - when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stat.exists | bool) + when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1) - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 - - name: update facts + - name: Clear facts + meta: clear_facts + - name: Update facts setup: - when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false)) - hosts: ofed - gather_facts: no + gather_facts: yes become: yes tags: ofed tasks: diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 7233809bc..0d040b55e 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,7 +1,8 @@ -ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4 +ofed_version: '23.10-3.2.2.0' # LTS ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' +ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. 
'8' ofed_arch: "{{ ansible_architecture }}" ofed_tmp_dir: /tmp ofed_update_firmware: false diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index 454ef787e..45f341bf9 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -10,11 +10,13 @@ - name: Check current kernel is newest installed assert: - that: _ofed_loaded_kernel.stdout == _ofed_dnf_kernels_newest + that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} _ofed_dnf_kernels_newest: >- - {{ _ofed_dnf_kernels.stdout_lines[1:] | map('regex_replace', '^\w+\.(\w+)\s+(\S+)\s+\S+\s*$', '\2.\1') | community.general.version_sort | last }} + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Enable epel @@ -31,7 +33,7 @@ - name: Install build prerequisites dnf: - name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_version == '8.9' else []) }}" + name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}" when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # don't want to install a load of prereqs unnecessarily diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg index 54a1c2a50..922f086aa 100644 --- a/environments/.caas/ansible.cfg +++ b/environments/.caas/ansible.cfg @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index aa0ec5aaf..26587e33f 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -14,3 +14,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. 
+any_unparsed_is_failed = True diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index a8c5d787f..45cf5db1a 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 - RL8: "openhpc-RL8-240904-1509-1687368f" - RL9: "openhpc-ofed-RL9-240904-1509-1687368f" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 + RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" + RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" } } diff --git a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg b/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg index 2a12e06b6..04c1fe143 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg +++ b/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins [ssh_connection] ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True From 554f16f6df33f66590508981e050105514cbb4aa Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:32:23 +0100 Subject: [PATCH 54/78] Create extract_logs.py extract fatimage logs and process ansible timings --- dev/extract_logs.py | 72 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 dev/extract_logs.py diff --git a/dev/extract_logs.py b/dev/extract_logs.py new file mode 100644 index 000000000..f1b468cae --- /dev/null +++ b/dev/extract_logs.py @@ -0,0 +1,72 @@ +import csv +import re +import os + +def convert_time_to_seconds(time_str): + h, m, s = time_str.split(':') + return int(h) * 3600 + int(m) * 60 + float(s) + +def extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory): + data = [] + + unwanted_chars = re.compile(r'(\x1B\[[0-9;]*m)|([^\x00-\x7F])') + + with open(log_file_path, 'r') as file: + lines = file.readlines() + + previous_task = None + + for i in range(len(lines)): + if "TASK [" in lines[i]: + task_name = lines[i].strip().split('TASK [')[1].split(']')[0] + + full_task_path = lines[i + 1].strip().split('task path: ')[1] + if target_directory in full_task_path: + start_index = full_task_path.find(target_directory) + len(target_directory) + partial_task_path = full_task_path[start_index:] + else: + partial_task_path = full_task_path + + partial_task_path = unwanted_chars.sub('', partial_task_path).strip() + + time_to_complete = lines[i + 2].strip().split('(')[1].split(')')[0] + + if previous_task: + previous_task[2] = time_to_complete # Shift the time to the previous task + data.append(previous_task) + + previous_task = [task_name, partial_task_path, None] # Placeholder for the next time_to_complete + + # Ensure the last task is also included + if previous_task: + previous_task[2] = time_to_complete if time_to_complete else 'N/A' + data.append(previous_task) + + # Convert time strings to seconds for sorting + for row in data: + if row[2] != 'N/A': + row[2] = convert_time_to_seconds(row[2]) + + # Sort the data by time (now in seconds) + data.sort(key=lambda 
x: x[2], reverse=True) + + # Convert times back to original string format + for row in data: + if isinstance(row[2], float): + row[2] = f'{int(row[2] // 3600):02}:{int((row[2] % 3600) // 60):02}:{row[2] % 60:.3f}' + + # Write the sorted data to a CSV file + with open(output_csv_path, 'w', newline='') as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(['Task Name', 'Task Path', 'Time to Complete']) + csvwriter.writerows(data) + + print(f"Data extracted, sorted, and saved to {output_csv_path}") + +# File paths +log_file_path = './RL9-ofed-fatimage-177.txt' +output_csv_path = 'RL9-ofed-fatimage-177.csv' +target_directory = '/ansible/' + +# Run the function +extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory) From 756a1fa7cbea6dd40cde923ed4d7f49cd659f718 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:36:45 +0100 Subject: [PATCH 55/78] Update extract_logs.py --- dev/extract_logs.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/dev/extract_logs.py b/dev/extract_logs.py index f1b468cae..0f73c616b 100644 --- a/dev/extract_logs.py +++ b/dev/extract_logs.py @@ -37,36 +37,29 @@ def extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_dir previous_task = [task_name, partial_task_path, None] # Placeholder for the next time_to_complete - # Ensure the last task is also included if previous_task: previous_task[2] = time_to_complete if time_to_complete else 'N/A' data.append(previous_task) - # Convert time strings to seconds for sorting for row in data: if row[2] != 'N/A': row[2] = convert_time_to_seconds(row[2]) - # Sort the data by time (now in seconds) data.sort(key=lambda x: x[2], reverse=True) - # Convert times back to original string format for row in data: if isinstance(row[2], float): row[2] = f'{int(row[2] // 3600):02}:{int((row[2] % 3600) // 60):02}:{row[2] % 60:.3f}' - # Write the sorted data to a CSV file with open(output_csv_path, 'w', newline='') as csvfile: csvwriter = csv.writer(csvfile) csvwriter.writerow(['Task Name', 'Task Path', 'Time to Complete']) csvwriter.writerows(data) print(f"Data extracted, sorted, and saved to {output_csv_path}") + +log_file_path = './RL9-ofed-fatimage-177.txt' # Input workflow log name +output_csv_path = 'RL9-ofed-fatimage-177.csv' # Output CSV name +target_directory = '/ansible/' # Shared directory for task path -# File paths -log_file_path = './RL9-ofed-fatimage-177.txt' -output_csv_path = 'RL9-ofed-fatimage-177.csv' -target_directory = '/ansible/' - -# Run the function extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory) From 2932a9d5825a7a6396c7d4f171d34faaa113567b Mon Sep 17 00:00:00 2001 From: sd109 Date: Mon, 9 Sep 2024 16:11:38 +0100 Subject: [PATCH 56/78] Ignore irrelevant paths in workflow trigger --- .github/workflows/stackhpc.yml | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 711d24c21..cd4fd7d83 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -5,7 +5,17 @@ on: push: branches: - main + paths-ignore: + - dev/** + - docs/** + - README.md + - .gitignore pull_request: + paths-ignore: + - dev/** + - docs/** + - README.md + - .gitignore jobs: openstack: name: openstack-ci @@ -39,11 +49,11 @@ jobs: echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa chmod 0600 ~/.ssh/id_rsa 
shell: bash - + - name: Add bastion's ssh key to known_hosts run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - + - name: Install ansible etc run: dev/setup-env.sh @@ -51,11 +61,11 @@ jobs: uses: opentofu/setup-opentofu@v1 with: tofu_version: 1.6.2 - + - name: Initialise terraform run: terraform init working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform - + - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ @@ -111,14 +121,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - + # load ansible variables into shell: ansible-playbook ansible/ci/output_vars.yml \ -e output_vars_hosts=openondemand \ -e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ -e output_vars_items=bastion_ip,bastion_user,openondemand_servername source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt - + # setup ssh proxying: sudo apt-get --yes install proxychains echo proxychains installed @@ -155,7 +165,7 @@ jobs: # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]" # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down # ansible-playbook -v ansible/ci/check_slurm.yml - + - name: Test reimage of login and control nodes (via rebuild adhoc) run: | . venv/bin/activate @@ -164,7 +174,7 @@ jobs: ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - + - name: Check sacct state survived reimage run: | . venv/bin/activate From cfee7b602cdeeda8a9a2e7f08807823fa2802a8b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 16 Sep 2024 09:37:23 +0100 Subject: [PATCH 57/78] Update extract_logs.py --- dev/extract_logs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dev/extract_logs.py b/dev/extract_logs.py index 0f73c616b..af30f74d0 100644 --- a/dev/extract_logs.py +++ b/dev/extract_logs.py @@ -1,6 +1,10 @@ + + + import csv import re import os +import sys def convert_time_to_seconds(time_str): h, m, s = time_str.split(':') @@ -58,8 +62,11 @@ def extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_dir print(f"Data extracted, sorted, and saved to {output_csv_path}") -log_file_path = './RL9-ofed-fatimage-177.txt' # Input workflow log name -output_csv_path = 'RL9-ofed-fatimage-177.csv' # Output CSV name +if len(sys.argv) != 2: + print("Path to workflow log plain text file should be provided as the only arg to this script") + sys.exit(1) +log_file_path = sys.argv[1] # Input workflow log name +output_csv_path = log_file_path.replace('.txt.', '.csv') # Output CSV name target_directory = '/ansible/' # Shared directory for task path extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory) From 9bdc696dd373c44561748940c378f3968de0f836 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 16 Sep 2024 09:47:18 +0100 Subject: [PATCH 58/78] Update extract_logs.py --- dev/extract_logs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dev/extract_logs.py b/dev/extract_logs.py index af30f74d0..91923f1a0 100644 --- a/dev/extract_logs.py +++ b/dev/extract_logs.py @@ -1,5 +1,15 @@ +#!/usr/bin/env python +""" +Process packer build workflow logs into CSV. 
Useful for timing
+dissemination.
+
+Usage:
+    extract_logs.py logs.txt
+
+Where logs.txt is the name of the workflow log downloaded.
+It will list task name, against task directory path, against time to complete.
+"""

 import csv
 import re

From 9728489fabfeb6b15d30558ad436dd366f803433 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Mon, 16 Sep 2024 09:56:58 +0100
Subject: [PATCH 59/78] Update stackhpc.yml

---
 .github/workflows/stackhpc.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index cd4fd7d83..880b94ffc 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -7,12 +7,14 @@ on:
       - main
     paths-ignore:
       - dev/**
+      - !dev/setup-env.sh
       - docs/**
       - README.md
       - .gitignore
   pull_request:
     paths-ignore:
       - dev/**
+      - !dev/setup-env.sh
       - docs/**
       - README.md
       - .gitignore

From dd7bec38f2184db98772474c7852073592ec24f1 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:53:40 +0100
Subject: [PATCH 60/78] Update stackhpc.yml

---
 .github/workflows/stackhpc.yml | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index 880b94ffc..ac96b9571 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -5,19 +5,21 @@ on:
   push:
     branches:
       - main
-    paths-ignore:
-      - dev/**
-      - !dev/setup-env.sh
-      - docs/**
-      - README.md
-      - .gitignore
+    paths:
+      - **
+      - !dev/**
+      - dev/setup-env.sh
+      - !docs/**
+      - !README.md
+      - !.gitignore
   pull_request:
-    paths-ignore:
-      - dev/**
-      - !dev/setup-env.sh
-      - docs/**
-      - README.md
-      - .gitignore
+    paths:
+      - **
+      - !dev/**
+      - dev/setup-env.sh
+      - !docs/**
+      - !README.md
+      - !.gitignore
 jobs:
   openstack:
     name: openstack-ci

From 1c78e5ba6cc6f7c96ba1e4461cfbba284cdce8dd Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:57:18 +0100
Subject: [PATCH 61/78] Update stackhpc.yml

---
 .github/workflows/stackhpc.yml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index ac96b9571..52856a18f 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -6,20 +6,20 @@ on:
     branches:
       - main
     paths:
-      - **
-      - !dev/**
-      - dev/setup-env.sh
-      - !docs/**
-      - !README.md
-      - !.gitignore
+      - '**'
+      - '!dev/**'
+      - 'dev/setup-env.sh'
+      - '!docs/**'
+      - '!README.md'
+      - '!.gitignore'
   pull_request:
     paths:
-      - **
-      - !dev/**
-      - dev/setup-env.sh
-      - !docs/**
-      - !README.md
-      - !.gitignore
+      - '**'
+      - '!dev/**'
+      - 'dev/setup-env.sh'
+      - '!docs/**'
+      - '!README.md'
+      - '!.gitignore'
 jobs:
   openstack:
     name: openstack-ci

From cabdd99aff55faba68d1ef9dc46d87be79144719 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Tue, 17 Sep 2024 16:30:37 +0100
Subject: [PATCH 62/78] Enable SMS Labs for CI (#426)

* configure SMS deployment
* add slurm-app-ci user
* bastion config
* allow overriding CI_CLOUD for PRs using label (see the sketch after this message)
* choose cloud for fatimage workflow_dispatch
* packer build qcows
* bump fatimage
* update packer common vars
* Update fatimage.yml
* Update main.tf images
* Update fatimage.yml
* revert fatimage.yml changes
* Update fatimage.yml
* Update fatimage.yml
* hcl packer var put string in quotes

---------

Co-authored-by: Steve Brasier
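As a sketch of how the label override works (the label value below is hypothetical; the workflow itself iterates over the real labels extracted from the pull_request event with jq, as the diff shows):

    label="CI_CLOUD=SMS"                       # hypothetical PR label
    if [[ $label == CI_CLOUD=* ]]; then
        # shell prefix removal: ${label#CI_CLOUD=} leaves just "SMS"
        echo "CI_CLOUD=${label#CI_CLOUD=}" >> "$GITHUB_ENV"
    fi

Writing to $GITHUB_ENV makes the override visible to every later step in the job, taking precedence over the repository-level CI_CLOUD default.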
--- .github/workflows/fatimage.yml | 21 +++++++++++--- .github/workflows/stackhpc.yml | 28 ++++++++++++++----- environments/.stackhpc/SMS.pkrvars.hcl | 7 +++++ environments/.stackhpc/bastion_fingerprints | 5 +++- .../inventory/group_vars/all/bastion.yml | 3 ++ environments/.stackhpc/terraform/SMS.tfvars | 4 +++ 6 files changed, 56 insertions(+), 12 deletions(-) create mode 100644 environments/.stackhpc/SMS.pkrvars.hcl create mode 100644 environments/.stackhpc/terraform/SMS.tfvars diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 59eb1b78e..9bf05f62a 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -2,6 +2,15 @@ name: Build fat image on: workflow_dispatch: + inputs: + ci_cloud: + description: 'Select the CI_CLOUD' + required: true + type: choice + options: + - LEAFCLOUD + - SMS + - ARCUS jobs: openstack: name: openstack-imagebuild @@ -24,15 +33,19 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - CI_CLOUD: ${{ vars.CI_CLOUD }} + CI_CLOUD: ${{ github.event.inputs.ci_cloud }} steps: - uses: actions/checkout@v2 + - name: Record settings for CI cloud + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + - name: Setup ssh run: | set -x mkdir ~/.ssh - echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa chmod 0600 ~/.ssh/id_rsa shell: bash @@ -46,7 +59,7 @@ jobs: - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ - echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml shell: bash - name: Setup environment @@ -61,7 +74,7 @@ jobs: . environments/.stackhpc/activate cd packer/ packer init . 
- PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 52856a18f..3dce03317 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -37,20 +37,34 @@ jobs: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} - CI_CLOUD: ${{ vars.CI_CLOUD }} + CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings TF_VAR_os_version: ${{ matrix.os_version }} steps: - uses: actions/checkout@v2 + - name: Override CI_CLOUD if PR label is present + if: ${{ github.event_name == 'pull_request' }} + run: | + # Iterate over the labels + labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') + echo $labels + for label in $labels; do + if [[ $label == CI_CLOUD=* ]]; then + # Extract the value after 'CI_CLOUD=' + CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} + echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV + fi + done + - name: Record settings for CI cloud run: | - echo CI_CLOUD: ${{ vars.CI_CLOUD }} + echo CI_CLOUD: ${{ env.CI_CLOUD }} - name: Setup ssh run: | set -x mkdir ~/.ssh - echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa chmod 0600 ~/.ssh/id_rsa shell: bash @@ -73,7 +87,7 @@ jobs: - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ - echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml shell: bash - name: Setup environment-specific inventory/terraform inputs @@ -91,14 +105,14 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" + terraform apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" + terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' - name: Configure cluster @@ -196,7 +210,7 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" + terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: ${{ success() || cancelled() }} # - name: Delete images diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl new file mode 100644 index 000000000..b88106fe8 --- /dev/null +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -0,0 +1,7 @@ +flavor = "general.v1.small" +networks = ["e2b9e59f-43da-4e1c-b558-dc9da4c0d738"] # stackhpc-ipv4-geneve +ssh_keypair_name = "slurm-app-ci" +ssh_private_key_file = "~/.ssh/id_rsa" +ssh_bastion_username = "slurm-app-ci" +ssh_bastion_host = "185.45.78.150" +ssh_bastion_private_key_file = "~/.ssh/id_rsa" \ No newline at end of file diff --git a/environments/.stackhpc/bastion_fingerprints b/environments/.stackhpc/bastion_fingerprints index 8939708a1..8596c1694 100644 --- a/environments/.stackhpc/bastion_fingerprints +++ b/environments/.stackhpc/bastion_fingerprints @@ -2,4 +2,7 @@ |1|whGSPLhKW4xt/7PWOZ1treg3PtA=|F5gwV8j0JYWDzjb6DvHHaqO+sxs= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCpCG881Gt3dr+nuVIC2uGEQkeVwG6WDdS1WcCoxXC7AG+Oi5bfdqtf4IfeLpWmeuEaAaSFH48ODFr76ViygSjU= |1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ |1|u3QVAK9R2x7Z3uKNj+0vDEIekl0=|yy09Q0Kw472+J7bjFkmir28x3lE= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINNuXZkH7ppkTGNGKzmGEvAnvlLO2D+YtlJw1m3P16FV -|1|nOHeibGxhsIFnhW0flRwnirJjlg=|IJ8nJB355LGI+1U3Wpvdcgdf0ek= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGG6DieKAdgiTCqRmF2HD0dJi9DuORblPzbridniICsw \ No newline at end of file +|1|nOHeibGxhsIFnhW0flRwnirJjlg=|IJ8nJB355LGI+1U3Wpvdcgdf0ek= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGG6DieKAdgiTCqRmF2HD0dJi9DuORblPzbridniICsw +185.45.78.150 ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDkOPL7fQiLFrg+/mDbff+jr+mQkI8pAkS5aBKOaknKuzTGrxILO5XSbyTJxyEwIKzZHBCUH2w99yv3oCqiphYp7iLLdPKl98RRnAXneJ1mo7nJfaTOSj5FGFf/AeHFZFa18B8zZrfFOOTGdEXeQpcik6R2A0/o4ZGE9rUg/dEoLQpFp8z+XRhsbNWgZ4a63oWrt02p+zdXPZ+Plir56j0qyQXoOo/BjEoLHs0aah61jfEOcJAcgpTrev/vdhBqJCgEXkf6AhiKidTnQxw7G/5C/BKtJbtuBWMgWZKcDf/uCzRkXaHNEggcJi1e6jvpUkvPLUfpRnNiBWLzehw3xZL4NicMM6D2TU0TSpB+UfEOLR0jyhCGKRQQN4jnj8ll0h+JBE6a0KnyKG+B5mXrD7THYu848jXUmBnxIaeor/NUItKEnCL0hzvAygOnniBN6uvtszSJHoGe8WbChLYJcoH3mOQTUH0k9RhXSEe90gSlLfRQInU+uzf2/qc6pffcKuc= +185.45.78.150 ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCB8R1BElOz4geGfCcb/ObF5n4Par+g9AaXQW5FU1ccgnPA59uJeOEALPeXAgJijVOhwqTdIkIoWYWeGdlud9Wc= +185.45.78.150 ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINNuXZkH7ppkTGNGKzmGEvAnvlLO2D+YtlJw1m3P16FV diff --git a/environments/.stackhpc/inventory/group_vars/all/bastion.yml b/environments/.stackhpc/inventory/group_vars/all/bastion.yml index 94287827c..a1001e862 100644 --- a/environments/.stackhpc/inventory/group_vars/all/bastion.yml +++ b/environments/.stackhpc/inventory/group_vars/all/bastion.yml @@ -6,6 +6,9 @@ bastion_config: LEAFCLOUD: user: slurm-app-ci ip: 195.114.30.222 + SMS: + user: slurm-app-ci + ip: 185.45.78.150 # NB: The bastion_{user,ip} variables are used directly in the CI workflow too bastion_user: "{{ bastion_config[ci_cloud].user }}" bastion_ip: "{{ bastion_config[ci_cloud].ip }}" diff --git a/environments/.stackhpc/terraform/SMS.tfvars b/environments/.stackhpc/terraform/SMS.tfvars new file mode 100644 index 000000000..66113a68d --- /dev/null +++ 
b/environments/.stackhpc/terraform/SMS.tfvars
@@ -0,0 +1,4 @@
+cluster_net = "stackhpc-ipv4-geneve"
+cluster_subnet = "stackhpc-ipv4-geneve-subnet"
+control_node_flavor = "general.v1.small"
+other_node_flavor = "general.v1.small"
\ No newline at end of file

From db84ea8cf5d4bb22e36813fbc7f66ac514e7212d Mon Sep 17 00:00:00 2001
From: John Garbutt
Date: Tue, 1 Oct 2024 09:32:04 +0100
Subject: [PATCH 63/78] Caas updated to use openstack_networking_floatingip_associate_v2 (#445)

Before we can move to v3.0.0 we need to use the new floating ip resource, now the compute one has been removed. To unblock this move, we add a temporary pin to use v2.1.0 for a bit.
---
 ansible/roles/cluster_infra/templates/providers.tf.j2 | 4 ++++
 ansible/roles/cluster_infra/templates/resources.tf.j2 | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/cluster_infra/templates/providers.tf.j2 b/ansible/roles/cluster_infra/templates/providers.tf.j2
index 32a16f27b..35d775e7f 100644
--- a/ansible/roles/cluster_infra/templates/providers.tf.j2
+++ b/ansible/roles/cluster_infra/templates/providers.tf.j2
@@ -5,6 +5,10 @@ terraform {
   required_providers {
     openstack = {
       source = "terraform-provider-openstack/openstack"
+      # TODO we must upgrade to 3.0.0
+      # but only after we stop using the deprecated
+      # openstack_compute_floatingip_associate_v2
+      version = "~>2.1.0"
     }
   }
 }
diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2
index 4c7534d62..344137b62 100644
--- a/ansible/roles/cluster_infra/templates/resources.tf.j2
+++ b/ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -572,7 +572,7 @@ data "openstack_networking_floatingip_v2" "cluster_floating_ip" {
 {% endif %}
 }

-resource "openstack_compute_floatingip_associate_v2" "login_floatingip_assoc" {
+resource "openstack_networking_floatingip_associate_v2" "login_floatingip_assoc" {
   floating_ip = "${data.openstack_networking_floatingip_v2.cluster_floating_ip.address}"
-  instance_id = "${openstack_compute_instance_v2.login.id}"
+  port_id = "${openstack_networking_port_v2.login.id}"
 }

From db2ce093cc6f265619ef92af35f0ae84331f97d1 Mon Sep 17 00:00:00 2001
From: John Garbutt
Date: Tue, 1 Oct 2024 13:44:57 +0100
Subject: [PATCH 64/78] Fix up the outputs, after the fip fix (#446)

---
 ansible/roles/cluster_infra/templates/outputs.tf.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2
index 885e4ad79..4d894a1dd 100644
--- a/ansible/roles/cluster_infra/templates/outputs.tf.j2
+++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2
@@ -1,6 +1,6 @@
 output "cluster_gateway_ip" {
   description = "The IP address of the gateway used to contact the cluster nodes"
-  value = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip
+  value = openstack_networking_floatingip_associate_v2.login_floatingip_assoc.floating_ip
 }

 {% if cluster_ssh_private_key_file is not defined %}

From 9c3116462a8c21e727f44d155a7c65ade3f0b846 Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Fri, 4 Oct 2024 13:56:31 +0100
Subject: [PATCH 65/78] Add description of image to build (#444)

* add description of image description to build
* fix image summary dir logic
* bump CI image
---
 ansible/cleanup.yml | 25 ++++++++++++++++++++++++
 ansible/fatimage.yml | 4 ++--
 environments/.stackhpc/terraform/main.tf | 7 ++++---
 3 files changed, 31 
insertions(+), 5 deletions(-) diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index e0fabf5e1..9c1373667 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -38,3 +38,28 @@ - name: Cleanup /tmp command : rm -rf /tmp/* + +- name: Get package facts + package_facts: + +- name: Ensure image summary directory exists + file: + path: /var/lib/image/ + state: directory + owner: root + group: root + mode: u=rwX,go=rX + +- name: Write image summary + copy: + content: "{{ image_info | to_nice_json }}" + dest: /var/lib/image/image.json + vars: + image_info: + branch: "{{ lookup('pipe', 'git rev-parse --abbrev-ref HEAD') }}" + build: "{{ ansible_nodename | split('.') | first }}" # hostname is image name, which contains build info + os: "{{ ansible_distribution }} {{ ansible_distribution_version }}" + kernel: "{{ ansible_kernel }}" + ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" + slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 58e1d72c7..81c4a2043 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -177,9 +177,9 @@ - hosts: builder become: yes - gather_facts: no + gather_facts: yes + tags: finalise tasks: - # - meta: end_here - name: Cleanup image import_tasks: cleanup.yml diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 45cf5db1a..cd759c22d 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,10 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 - RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" - RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/444 + RL8: "openhpc-ofed-RL8-241002-1612-1ce702b1" + RL9: "openhpc-ofed-RL9-241003-1052-1ce702b1" + RL9-cuda: "openhpc-cuda-RL9-241002-1612-1ce702b1" } } From 760ab20dacd94c0dc8e3382636def7cd317fdd8c Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:26:40 +0100 Subject: [PATCH 66/78] Nightly Slurm CI Rocky update workflow (#440) * Update openstack.pkr.hcl * new image build workflow * dynamically set packer vars from fatimage workflow * remove openstack. 
prefix from image name
* echo image name
* make image_name var in packer config
* new changes
* fix merge changes
* temp workflow changes
* test nightly build
* change back fatimage workflow
* rename images built
* add update to builder group
* add update to fatimage build groups
* fatimage.yml fix
* move output image_name declaration into build blocks
* delete outdated nightly image
* test new fatimage build
* debug dnf remove cockpit
* --amend
* add cuda build back in
* cuda nightly build
* test cuda nightly builds
* test new fatimage build on SMS
* test image uploads across clouds
* test image uploads in separate workflow
* finish nightly build workflow
* fix image delete logic
* use azimuth-cloud trivy db mirror
* use GITHUB_TOKEN env
* test new fatimage build
* add final nightlybuilds workflow
* move trivy scan to separate workflow
* bump image and test new trivy scan
* fix artifact creation
* bump image and test trivy scan
* only run trivy scan on image bumps
* bump image to test trivy scan run condition
* bump cuda image
* bump image
* extend timeout for trivy scanning cuda image
* Run workflow on PR to main
* address PR comments
* fix source_image_name packer parse
* bump image
* additional PR comments
* bump image
---
 .github/workflows/fatimage.yml | 107 +++----
 .github/workflows/nightlybuild.yml | 265 ++++++++++++++++++
 .github/workflows/trivyscan.yml | 116 ++++++++
 ansible/bootstrap.yml | 6 +-
 ansible/fatimage.yml | 22 +-
 dev/extract_logs.py | 2 +-
 .../terraform/cluster_image.auto.tfvars.json | 7 +
 environments/.stackhpc/terraform/main.tf | 6 -
 environments/common/layouts/everything | 1 -
 packer/openstack.pkr.hcl | 47 ++--
 10 files changed, 482 insertions(+), 97 deletions(-)
 create mode 100644 .github/workflows/nightlybuild.yml
 create mode 100644 .github/workflows/trivyscan.yml
 create mode 100644 environments/.stackhpc/terraform/cluster_image.auto.tfvars.json

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index 9bf05f62a..5425eb4e3 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -1,16 +1,16 @@
-
 name: Build fat image
 on:
   workflow_dispatch:
-  inputs:
-    ci_cloud:
-      description: 'Select the CI_CLOUD'
-      required: true
-      type: choice
-      options:
-        - LEAFCLOUD
-        - SMS
-        - ARCUS
+    inputs:
+      ci_cloud:
+        description: 'Select the CI_CLOUD'
+        required: true
+        type: choice
+        options:
+          - LEAFCLOUD
+          - SMS
+          - ARCUS
+
 jobs:
   openstack:
     name: openstack-imagebuild
@@ -25,7 +25,7 @@ jobs:
           - RL8
           - RL9
         build:
-          - openstack.openhpc-ofed
+          - openstack.openhpc
           - openstack.openhpc-cuda
         exclude:
           - os_version: RL8
@@ -34,6 +34,18 @@
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
       CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
+      SOURCE_IMAGES_MAP: |
+        {
+          "RL8": {
+            "openstack.openhpc": "rocky-latest-RL8",
+            "openstack.openhpc-cuda": "rocky-latest-cuda-RL8"
+          },
+          "RL9": {
+            "openstack.openhpc": "rocky-latest-RL9",
+            "openstack.openhpc-cuda": "rocky-latest-cuda-RL9"
+          }
+        }
+
     steps:
       - uses: actions/checkout@v2
@@ -52,10 +64,10 @@
       - name: Add bastion's ssh key to known_hosts
         run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
         shell: bash
-      
+
       - name: Install ansible etc
         run: dev/setup-env.sh
-      
+
       - name: Write clouds.yaml
         run: |
           mkdir -p ~/.config/openstack/
           echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
         shell: bash

       - name: Setup environment
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-      
+
       - name: Build fat image with packer
         id: packer_build
         run: |
+          set -x
           . venv/bin/activate
           . environments/.stackhpc/activate
           cd packer/
           packer init .
           
- PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -only=${{ matrix.build }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} + SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -87,53 +107,14 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) - echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" - echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - - - name: Download image - run: | - . venv/bin/activate - sudo mkdir /mnt/images - sudo chmod 777 /mnt/images - openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: install libguestfs - run: | - sudo apt -y update - sudo apt -y install libguestfs-tools - - - name: mkdir for mount - run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' - - - name: mount qcow2 file - run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.17.0 - with: - scan-type: fs - scan-ref: "${{ steps.manifest.outputs.image-name }}" - scanners: "vuln" - format: sarif - output: "${{ steps.manifest.outputs.image-name }}.sarif" - # turn off secret scanning to speed things up - - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v3 - with: - sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" - category: "${{ matrix.os_version }}-${{ matrix.build }}" + echo $IMAGE_ID > image-id.txt + echo $IMAGE_NAME > image-name.txt - - name: Fail if scan has CRITICAL vulnerabilities - uses: aquasecurity/trivy-action@0.16.1 + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 with: - scan-type: fs - scan-ref: "${{ steps.manifest.outputs.image-name }}" - scanners: "vuln" - format: table - exit-code: '1' - severity: 'CRITICAL' - ignore-unfixed: true + name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true \ No newline at end of file diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml new file mode 100644 index 000000000..4df3f9955 --- /dev/null +++ b/.github/workflows/nightlybuild.yml @@ -0,0 +1,265 @@ +name: Build nightly image +on: + workflow_dispatch: + inputs: + ci_cloud: + description: 'Select the CI_CLOUD' + required: true + type: choice + options: + - LEAFCLOUD + - SMS + - ARCUS + schedule: + - cron: '0 0 * * *' # Run at midnight + +jobs: + openstack: + name: openstack-imagebuild + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + cancel-in-progress: true + runs-on: ubuntu-22.04 + strategy: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8, RL9, RL9+CUDA versions + os_version: + - RL8 + - RL9 + build: + - openstack.rocky-latest + - openstack.rocky-latest-cuda + exclude: + - 
os_version: RL8 + build: openstack.rocky-latest-cuda + + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} + SOURCE_IMAGES_MAP: | + { + "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", + "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" + } + + steps: + - uses: actions/checkout@v2 + + - name: Record settings for CI cloud + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + shell: bash + + - name: Add bastion's ssh key to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Setup environment + run: | + . venv/bin/activate + . environments/.stackhpc/activate + + - name: Build fat image with packer + id: packer_build + run: | + set -x + . venv/bin/activate + . environments/.stackhpc/activate + cd packer/ + packer init . + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -only=${{ matrix.build }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ env.SOURCE_IMAGE }}" + openstack.pkr.hcl + + env: + PKR_VAR_os_version: ${{ matrix.os_version }} + SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} + + - name: Get created image names from manifest + id: manifest + run: | + . venv/bin/activate + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! openstack image show -f value -c name $IMAGE_ID; do + sleep 5 + done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + + - name: Download image + run: | + . 
venv/bin/activate + sudo mkdir /mnt/images + sudo chmod 777 /mnt/images + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: install libguestfs + run: | + sudo apt -y update + sudo apt -y install libguestfs-tools + + - name: mkdir for mount + run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' + + - name: mount qcow2 file + run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.17.0 + with: + scan-type: fs + scan-ref: "${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: sarif + output: "${{ steps.manifest.outputs.image-name }}.sarif" + # turn off secret scanning to speed things up + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" + category: "${{ matrix.os_version }}-${{ matrix.build }}" + + - name: Fail if scan has CRITICAL vulnerabilities + uses: aquasecurity/trivy-action@0.16.1 + with: + scan-type: fs + scan-ref: "${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: table + exit-code: '1' + severity: 'CRITICAL' + ignore-unfixed: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Delete new image if Trivy scan fails + if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed + run: | + . venv/bin/activate + echo "Deleting new image due to critical vulnerabilities or scan failure ..." + openstack image delete "${{ steps.manifest.outputs.image-id }}" + + - name: Delete old latest image + if: success() # Runs only if Trivy scan passed + run: | + . venv/bin/activate + IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l) + if [ "$IMAGE_COUNT" -gt 1 ]; then + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ steps.manifest.outputs.image-name }}" -f value -c ID | head -n 1) + echo "Deleting old image ID: $OLD_IMAGE_ID" + openstack image delete "$OLD_IMAGE_ID" + else + echo "Only one image exists, skipping deletion." + fi + + upload: + name: upload-nightly-targets + needs: openstack + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + cancel-in-progress: true + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + target_cloud: + - LEAFCLOUD + - SMS + - ARCUS + os_version: + - RL8 + - RL9 + image: + - rocky-latest + - rocky-latest-cuda + exclude: + - os_version: RL8 + image: rocky-latest-cuda + - target_cloud: LEAFCLOUD + env: + OS_CLOUD: openstack + SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} + TARGET_CLOUD: ${{ matrix.target_cloud }} + IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" + steps: + - uses: actions/checkout@v2 + + - name: Record settings for CI cloud + run: | + echo SOURCE_CLOUD: ${{ env.SOURCE_CLOUD }} + echo TARGET_CLOUD: ${{ env.TARGET_CLOUD }} + + - name: Install openstackclient + run: | + python3 -m venv venv + . 
venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml + shell: bash + + - name: Download source image + run: | + . venv/bin/activate + export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml + openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} + shell: bash + + - name: Upload to target cloud + run: | + . venv/bin/activate + export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml + + openstack image create "${{ env.IMAGE_NAME }}" \ + --file "${{ env.IMAGE_NAME }}" \ + --disk-format qcow2 \ + shell: bash + + - name: Delete old latest image from target cloud + run: | + . venv/bin/activate + export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml + + IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + if [ "$IMAGE_COUNT" -gt 1 ]; then + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + openstack image delete "$OLD_IMAGE_ID" + else + echo "Only one image exists, skipping deletion." + fi + shell: bash diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml new file mode 100644 index 000000000..2957b22ee --- /dev/null +++ b/.github/workflows/trivyscan.yml @@ -0,0 +1,116 @@ +name: Trivy scan image for vulnerabilities +on: + workflow_dispatch: + pull_request: + branches: + - main + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + +jobs: + scan: + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + OS + build + cancel-in-progress: true + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + build: ["RL8", "RL9", "RL9-cuda"] + env: + JSON_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} + + steps: + - uses: actions/checkout@v2 + + - name: Record settings for CI cloud + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + shell: bash + + - name: Add bastion's ssh key to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + shell: bash + + - name: setup environment + run: | + python3 -m venv venv + . venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Parse image name json + id: manifest + run: | + IMAGE_NAME=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.JSON_PATH }}") + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + + - name: Download image + run: | + . 
venv/bin/activate + sudo mkdir /mnt/images + sudo chmod 777 /mnt/images + openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: install libguestfs + run: | + sudo apt -y update + sudo apt -y install libguestfs-tools + + - name: mkdir for mount + run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' + + - name: mount qcow2 file + run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.24.0 + with: + scan-type: fs + scan-ref: "${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: sarif + output: "${{ steps.manifest.outputs.image-name }}.sarif" + # turn off secret scanning to speed things up + timeout: 15m + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" + category: "${{ matrix.os_version }}-${{ matrix.build }}" + + - name: Fail if scan has CRITICAL vulnerabilities + uses: aquasecurity/trivy-action@0.24.0 + with: + scan-type: fs + scan-ref: "${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: table + exit-code: '1' + severity: 'CRITICAL' + ignore-unfixed: true + timeout: 15m + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index c43d614db..18d159996 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -148,9 +148,9 @@ tags: cockpit tasks: - name: Remove RHEL cockpit - dnf: - name: cockpit-ws - state: "{{ appliances_cockpit_state }}" + command: dnf -y remove cockpit-ws # N.B. 
using ansible dnf module is very slow + register: dnf_remove_output + ignore_errors: true # Avoid failing if a lock or other error happens - hosts: firewalld gather_facts: false diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 81c4a2043..e623c2794 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -56,10 +56,12 @@ include_role: name: mysql tasks_from: install.yml + when: "'mysql' in group_names" - name: OpenHPC import_role: name: stackhpc.openhpc tasks_from: install.yml + when: "'openhpc' in group_names" # - import_playbook: portal.yml - name: Open Ondemand server (packages) @@ -67,6 +69,7 @@ name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" + when: "'openondemand' in group_names" # # FUTURE: install-apps.yml - this is git clones - name: Open Ondemand server (apps) @@ -74,34 +77,40 @@ name: osc.ood tasks_from: install-apps.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" + when: "'openondemand' in group_names" - name: Open Ondemand remote desktop import_role: name: openondemand tasks_from: vnc_compute.yml + when: "'openondemand_desktop' in group_names" - name: Open Ondemand jupyter node import_role: name: openondemand tasks_from: jupyter_compute.yml + when: "'openondemand' in group_names" # - import_playbook: monitoring.yml: - import_role: name: opensearch tasks_from: install.yml - become: true + when: "'opensearch' in group_names" # slurm_stats - nothing to do - import_role: name: filebeat tasks_from: install.yml + when: "'filebeat' in group_names" - import_role: # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start # however starting node exporter is ok name: cloudalchemy.node_exporter + when: "'node_exporter' in group_names" - name: openondemand exporter dnf: - name: ondemand_exporter + name: ondemand_exporter + when: "'openondemand' in group_names" - name: slurm exporter import_role: @@ -109,7 +118,12 @@ tasks_from: install vars: slurm_exporter_state: stopped + when: "'slurm_exporter' in group_names" +- hosts: prometheus + become: yes + gather_facts: yes + tasks: - import_role: name: cloudalchemy.prometheus tasks_from: preflight.yml @@ -162,6 +176,10 @@ - prometheus - promtool +- hosts: grafana + become: yes + gather_facts: yes + tasks: - name: Include distribution variables for cloudalchemy.grafana include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml" - import_role: diff --git a/dev/extract_logs.py b/dev/extract_logs.py index 91923f1a0..65df0140e 100644 --- a/dev/extract_logs.py +++ b/dev/extract_logs.py @@ -76,7 +76,7 @@ def extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_dir print("Path to workflow log plain text file should be provided as the only arg to this script") sys.exit(1) log_file_path = sys.argv[1] # Input workflow log name -output_csv_path = log_file_path.replace('.txt.', '.csv') # Output CSV name +output_csv_path = log_file_path.replace('.txt', '.csv') # Output CSV name target_directory = '/ansible/' # Shared directory for task path extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json new file mode 100644 index 000000000..f62c8886e --- /dev/null +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -0,0 +1,7 @@ +{ + "cluster_image": { + 
"RL8": "openhpc-RL8-241009-1523-354b048a", + "RL9": "openhpc-RL9-241009-1523-354b048a", + "RL9-cuda": "openhpc-cuda-RL9-241009-1523-354b048a" + } +} \ No newline at end of file diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index cd759c22d..99197dece 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -28,12 +28,6 @@ variable "os_version" { variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) - default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/444 - RL8: "openhpc-ofed-RL8-241002-1612-1ce702b1" - RL9: "openhpc-ofed-RL9-241003-1052-1ce702b1" - RL9-cuda: "openhpc-cuda-RL9-241002-1612-1ce702b1" - } } variable "cluster_net" {} diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 85af46c06..205f1d334 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -28,7 +28,6 @@ slurm_stats # NB: [rebuild] not defined here as this template is used in CI [update:children] -cluster [fail2ban:children] # Hosts to install fail2ban on to protect SSH diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 5f66c0320..ae5744ff3 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -47,21 +47,14 @@ variable "os_version" { # Must supply either source_image_name or source_image_id variable "source_image_name" { - type = map(string) - description = "name of source image, keyed from var.os_version" - default = { - RL8: "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" - RL9: "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } + type = string + description = "name of source image" } variable "source_image" { - type = map(string) - default = { - RL8: null - RL9: null - } - description = "UUID of source image, keyed from var.os_version" + type = string + default = null + description = "UUID of source image" } variable "flavor" { @@ -132,8 +125,9 @@ variable "volume_size" { type = map(number) default = { # fat image builds, GB: + rocky-latest = 15 + rocky-latest-cuda = 30 openhpc = 15 - openhpc-ofed = 15 openhpc-cuda = 30 } } @@ -153,9 +147,10 @@ variable "groups" { description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" default = { # fat image builds: + rocky-latest = ["update", "ofed"] + rocky-latest-cuda = ["update", "ofed", "cuda"] openhpc = ["control", "compute", "login"] - openhpc-ofed = ["control", "compute", "login", "ofed"] - openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"] + openhpc-cuda = ["control", "compute", "login"] } } @@ -171,8 +166,8 @@ source "openstack" "openhpc" { security_groups = var.security_groups # Input image: - source_image = "${var.source_image[var.os_version]}" - source_image_name = "${var.source_image_name[var.os_version]}" # NB: must already exist in OpenStack + source_image = "${var.source_image}" + source_image_name = "${var.source_image_name}" # NB: must already exist in OpenStack # SSH: ssh_username = var.ssh_username @@ -186,29 +181,39 @@ source "openstack" "openhpc" { # Output image: image_disk_format = "qcow2" image_visibility = var.image_visibility - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + } build { - # non-OFED fat image: + # latest nightly image: source "source.openstack.openhpc" { - name = "openhpc" + name = 
"rocky-latest" + image_name = "${source.name}-${var.os_version}" + } + + # latest nightly cuda image: + source "source.openstack.openhpc" { + name = "rocky-latest-cuda" + image_name = "${source.name}-${var.os_version}" } # OFED fat image: source "source.openstack.openhpc" { - name = "openhpc-ofed" + name = "openhpc" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } # CUDA fat image: source "source.openstack.openhpc" { name = "openhpc-cuda" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } provisioner "ansible" { From 368436e10555b5e6a8e6ed6127775dc1ea1d9771 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 10 Oct 2024 10:46:53 +0000 Subject: [PATCH 67/78] test s3 image sync --- .github/workflows/fatimage.yml | 204 +++++++++++++++++++-------------- 1 file changed, 119 insertions(+), 85 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 5425eb4e3..cfa0197f5 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,72 +1,128 @@ -name: Build fat image +name: Upload CI-tested images to Arcus S3 and sync clouds on: workflow_dispatch: - inputs: - ci_cloud: - description: 'Select the CI_CLOUD' - required: true - type: choice - options: - - LEAFCLOUD - - SMS - - ARCUS + push: + branches: + - main + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' jobs: - openstack: - name: openstack-imagebuild - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build - cancel-in-progress: true + image_upload: runs-on: ubuntu-22.04 + concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} strategy: - fail-fast: false # allow other matrix jobs to continue even if one fails - matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions - os_version: + fail-fast: false + matrix: + build: - RL8 - RL9 - build: - - openstack.openhpc - - openstack.openhpc-cuda - exclude: - - os_version: RL8 - build: openstack.openhpc-cuda + - RL9-cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8", - "openstack.openhpc-cuda": "rocky-latest-cuda-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9", - "openstack.openhpc-cuda": "rocky-latest-cuda-RL9" - } - } - + CI_CLOUD: ${{ vars.CI_CLOUD }} + IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json steps: - uses: actions/checkout@v2 - - name: Record settings for CI cloud + - name: Record which cloud CI is running on run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} - - name: Setup ssh + - name: setup environment + run: | + python3 -m venv venv + . 
venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Write s3cmd configuration + run: | + echo "${{ secrets['ARCUS_S3CFG'] }}" > ~/.s3cfg + shell: bash + + - name: Install s3cmd + run: | + sudo apt-get --yes install s3cmd + + - name: Check for image in Arcus S3 bucket + id: s3_ls + run: | + + TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") + echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" + echo "target-image-${{ matrix.build }}=${TARGET_IMAGE}" >> "$GITHUB_OUTPUT" + + S3_IMAGES=$(s3cmd ls s3://openhpc-images) + + if echo "$S3_IMAGES" | grep -q "$TARGET_IMAGE"; then + echo "Image ${TARGET_IMAGE} is already present in S3." + echo "IMAGE_EXISTS=true" >> $GITHUB_ENV + else + echo "Image ${TARGET_IMAGE} is not present in S3." + echo "IMAGE_EXISTS=false" >> $GITHUB_ENV + fi + shell: bash + + - name: Download image to runner + if: env.IMAGE_EXISTS == 'false' run: | - set -x - mkdir ~/.ssh - echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa - chmod 0600 ~/.ssh/id_rsa + . venv/bin/activate + openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }} shell: bash - - name: Add bastion's ssh key to known_hosts - run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + - name: Conditionally Upload Image to S3 + if: env.IMAGE_EXISTS == 'false' + run: | + echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." + s3cmd put ${{ env.TARGET_IMAGE }}.qcow2 s3://openhpc-images shell: bash - - name: Install ansible etc - run: dev/setup-env.sh + image_sync: + needs: image_upload + runs-on: ubuntu-22.04 + concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cloud }}-${{ matrix.build }} + strategy: + fail-fast: false + matrix: + cloud: + - LEAFCLOUD + - SMS + - ARCUS + build: + - RL8 + - RL9 + - RL9-cuda + exclude: + - cloud: LEAFCLOUD + + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ matrix.cloud }} + IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + steps: + - uses: actions/checkout@v2 + + - name: Record which cloud CI is running on + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: setup environment + run: | + python3 -m venv venv + . venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash - name: Write clouds.yaml run: | @@ -74,47 +130,25 @@ jobs: echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml shell: bash - - name: Setup environment + - name: Retrieve image name run: | - . venv/bin/activate - . environments/.stackhpc/activate + TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") + echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" - - name: Build fat image with packer - id: packer_build + - name: Upload latest image if missing run: | - set -x . venv/bin/activate - . environments/.stackhpc/activate - cd packer/ - packer init . 
- - PACKER_LOG=1 packer build \ - -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ - -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ - openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - - - name: Get created image names from manifest - id: manifest + bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} openhpc-images + + - name: Cleanup OpenStack Image (on error or cancellation) + if: cancelled() run: | . venv/bin/activate - IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) - while ! openstack image show -f value -c name $IMAGE_ID; do - sleep 5 - done - IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) - echo $IMAGE_ID > image-id.txt - echo $IMAGE_NAME > image-name.txt - - - name: Upload manifest artifact - uses: actions/upload-artifact@v4 - with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} - path: | - ./image-id.txt - ./image-name.txt - overwrite: true \ No newline at end of file + image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') + if [ -n "$image_hanging" ]; then + echo "Cleaning up OpenStack image with ID: $image_hanging" + openstack image delete $image_hanging + else + echo "No image ID found, skipping cleanup." + fi + shell: bash From 0c396e80062a8807368d11b45b0ce7000aeda87c Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 10 Oct 2024 11:11:40 +0000 Subject: [PATCH 68/78] fix s3cfg creds --- .github/workflows/fatimage.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index cfa0197f5..40efe1fec 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -46,7 +46,9 @@ jobs: - name: Write s3cmd configuration run: | - echo "${{ secrets['ARCUS_S3CFG'] }}" > ~/.s3cfg + . venv/bin/activate + S3_CREDS=$(openstack ec2 credentials create) + echo "$S3_CREDS" > ~/.s3cfg shell: bash - name: Install s3cmd @@ -56,7 +58,7 @@ jobs: - name: Check for image in Arcus S3 bucket id: s3_ls run: | - + . venv/bin/activate TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" echo "target-image-${{ matrix.build }}=${TARGET_IMAGE}" >> "$GITHUB_OUTPUT" From 95f043bb5d7bd98d248e3b7857772342f4322334 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 10 Oct 2024 13:19:33 +0000 Subject: [PATCH 69/78] fix ~/.s3cfg --- .github/workflows/fatimage.yml | 37 +++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 40efe1fec..f46490956 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -44,11 +44,42 @@ jobs: echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml shell: bash - - name: Write s3cmd configuration + - name: Create EC2 credentials if not present + id: check_creds run: | . 
venv/bin/activate
+        # List existing EC2 credentials
+        existing_creds=$(openstack ec2 credentials list --format json)
+
+        # Check if the list is empty
+        if [ "$(echo "$existing_creds" | jq 'length')" -eq 0 ]; then
+          echo "No existing EC2 credentials found."
+          new_creds=$(openstack ec2 credentials create --format json)
+          access_key=$(echo "$new_creds" | jq -r '.Access')
+          secret_key=$(echo "$new_creds" | jq -r '.Secret')
+          echo "Created new EC2 credentials."
+        else
+          echo "Existing EC2 credentials found."
+          access_key=$(echo "$existing_creds" | jq -r '.[0].Access')
+          secret_key=$(echo "$existing_creds" | jq -r '.[0].Secret')
+        fi
+
+        # Save access and secret keys for the next step
+        echo "access_key=${access_key}" >> $GITHUB_ENV
+        echo "secret_key=${secret_key}" >> $GITHUB_ENV
+      shell: bash
+
+    - name: Write s3cmd configuration
+      run: |
+        cat <<EOF > ~/.s3cfg
+        [default]
+        host_base = https://object.arcus.openstack.hpc.cam.ac.uk
+        host_bucket = https://object.arcus.openstack.hpc.cam.ac.uk
+        access_key = ${{ env.access_key }}
+        secret_key = ${{ env.secret_key }}
+        use_https = True
+        signurl_use_https = True
+        EOF
       shell: bash

From e2f30d486da467a8054fba07ae4edfb346c2e4e2 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Thu, 10 Oct 2024 14:06:13 +0000
Subject: [PATCH 70/78] revert to using secret

---
 .github/workflows/fatimage.yml | 45 +++++-----------------------------
 1 file changed, 6 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index f46490956..b9727111c 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -44,42 +44,9 @@ jobs:
         echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
       shell: bash
 
-    - name: Create EC2 credentials if not present
-      id: check_creds
-      run: |
-        . venv/bin/activate
-        # List existing EC2 credentials
-        existing_creds=$(openstack ec2 credentials list --format json)
-
-        # Check if the list is empty
-        if [ "$(echo "$existing_creds" | jq 'length')" -eq 0 ]; then
-          echo "No existing EC2 credentials found."
-          new_creds=$(openstack ec2 credentials create --format json)
-          access_key=$(echo "$new_creds" | jq -r '.Access')
-          secret_key=$(echo "$new_creds" | jq -r '.Secret')
-          echo "Created new EC2 credentials."
-        else
-          echo "Existing EC2 credentials found."
-          access_key=$(echo "$existing_creds" | jq -r '.[0].Access')
-          secret_key=$(echo "$existing_creds" | jq -r '.[0].Secret')
-        fi
-
-        # Save access and secret keys for the next step
-        echo "access_key=${access_key}" >> $GITHUB_ENV
-        echo "secret_key=${secret_key}" >> $GITHUB_ENV
-      shell: bash
-
-    - name: Write s3cmd configuration
-      run: |
-        cat <<EOF > ~/.s3cfg
-        [default]
-        host_base = https://object.arcus.openstack.hpc.cam.ac.uk
-        host_bucket = https://object.arcus.openstack.hpc.cam.ac.uk
-        access_key = ${{ env.access_key }}
-        secret_key = ${{ env.secret_key }}
-        use_https = True
-        signurl_use_https = True
-        EOF
+    - name: Write s3cmd configuration
+      run: |
+        echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg
       shell: bash
 
     - name: Install s3cmd
@@ -94,8 +61,8 @@ jobs:
       id: s3_ls
       run: |
         . venv/bin/activate
         TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}")
         echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"
         echo "target-image-${{ matrix.build }}=${TARGET_IMAGE}" >> "$GITHUB_OUTPUT"
 
-        S3_IMAGES=$(s3cmd ls s3://openhpc-images)
+        S3_IMAGES=$(s3cmd ls s3://openhpc-images-prerelease)
 
        if echo "$S3_IMAGES" | grep -q "$TARGET_IMAGE"; then
          echo "Image ${TARGET_IMAGE} is already present in S3."
echo "IMAGE_EXISTS=true" >> $GITHUB_ENV @@ -116,7 +83,7 @@ jobs: if: env.IMAGE_EXISTS == 'false' run: | echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." - s3cmd put ${{ env.TARGET_IMAGE }}.qcow2 s3://openhpc-images + s3cmd put ${{ env.TARGET_IMAGE }}.qcow2 s3://openhpc-images-prerelease shell: bash image_sync: @@ -168,10 +135,10 @@ jobs: TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" - - name: Upload latest image if missing + - name: Download latest image if missing run: | . venv/bin/activate - bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} openhpc-images + bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} openhpc-images-prerelease - name: Cleanup OpenStack Image (on error or cancellation) if: cancelled() From bd4dcc82377107b025c6f083b83124afa7a0a6bf Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 10 Oct 2024 14:15:41 +0000 Subject: [PATCH 71/78] multipart chunk image upload --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index b9727111c..1bb1a5a06 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -83,7 +83,7 @@ jobs: if: env.IMAGE_EXISTS == 'false' run: | echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." - s3cmd put ${{ env.TARGET_IMAGE }}.qcow2 s3://openhpc-images-prerelease + s3cmd --multipart-chunk-size-mb=300 put ${{ env.TARGET_IMAGE }} s3://openhpc-images-prerelease shell: bash image_sync: From dec39683bb9a4158026ecd94e5098a367a172fb3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 11 Oct 2024 10:24:14 +0000 Subject: [PATCH 72/78] cleanup s3 at beginning --- .github/workflows/fatimage.yml | 50 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 1bb1a5a06..ee7c874fd 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -6,10 +6,35 @@ on: - main paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' +env: + S3_BUCKET: openhpc-images-prerelease + IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json jobs: + s3_cleanup: + runs-on: ubuntu-22.04 + concurrency: ${{ github.workflow }}-${{ github.ref }} + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v2 + + - name: Write s3cmd configuration + run: | + echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg + shell: bash + + - name: Install s3cmd + run: | + sudo apt-get --yes install s3cmd + + - name: Cleanup S3 bucket + run: | + s3cmd rm s3://${{ env.S3_BUCKET }} --recursive --force + image_upload: runs-on: ubuntu-22.04 + needs: s3_cleanup concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} strategy: fail-fast: false @@ -22,7 +47,6 @@ jobs: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ vars.CI_CLOUD }} - IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json steps: - uses: actions/checkout@v2 @@ -53,37 +77,22 @@ jobs: run: | sudo apt-get --yes install s3cmd - - name: Check for image in Arcus S3 bucket - id: s3_ls + - name: Retrieve image name run: | - . 
venv/bin/activate TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" - echo "target-image-${{ matrix.build }}=${TARGET_IMAGE}" >> "$GITHUB_OUTPUT" - - S3_IMAGES=$(s3cmd ls s3://openhpc-images-prerelease) - - if echo "$S3_IMAGES" | grep -q "$TARGET_IMAGE"; then - echo "Image ${TARGET_IMAGE} is already present in S3." - echo "IMAGE_EXISTS=true" >> $GITHUB_ENV - else - echo "Image ${TARGET_IMAGE} is not present in S3." - echo "IMAGE_EXISTS=false" >> $GITHUB_ENV - fi shell: bash - name: Download image to runner - if: env.IMAGE_EXISTS == 'false' run: | . venv/bin/activate openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }} shell: bash - - name: Conditionally Upload Image to S3 - if: env.IMAGE_EXISTS == 'false' + - name: Upload Image to S3 run: | echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." - s3cmd --multipart-chunk-size-mb=300 put ${{ env.TARGET_IMAGE }} s3://openhpc-images-prerelease + s3cmd --multipart-chunk-size-mb=150 put ${{ env.TARGET_IMAGE }} s3://${{ env.S3_BUCKET }} shell: bash image_sync: @@ -108,7 +117,6 @@ jobs: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ matrix.cloud }} - IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json steps: - uses: actions/checkout@v2 @@ -138,7 +146,7 @@ jobs: - name: Download latest image if missing run: | . venv/bin/activate - bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} openhpc-images-prerelease + bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} ${{ env.S3_BUCKET }} - name: Cleanup OpenStack Image (on error or cancellation) if: cancelled() From e4d90b722e4650135a0f941dace3490ed79168f9 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 11 Oct 2024 13:13:33 +0000 Subject: [PATCH 73/78] move s3 sync to new workflow --- .github/workflows/fatimage.yml | 212 +++++++++++----------------- .github/workflows/s3-image-sync.yml | 162 +++++++++++++++++++++ 2 files changed, 247 insertions(+), 127 deletions(-) create mode 100644 .github/workflows/s3-image-sync.yml diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index ee7c874fd..5425eb4e3 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,136 +1,72 @@ -name: Upload CI-tested images to Arcus S3 and sync clouds +name: Build fat image on: workflow_dispatch: - push: - branches: - - main - paths: - - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' -env: - S3_BUCKET: openhpc-images-prerelease - IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + inputs: + ci_cloud: + description: 'Select the CI_CLOUD' + required: true + type: choice + options: + - LEAFCLOUD + - SMS + - ARCUS jobs: - s3_cleanup: + openstack: + name: openstack-imagebuild + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + cancel-in-progress: true runs-on: ubuntu-22.04 - concurrency: ${{ github.workflow }}-${{ github.ref }} strategy: - fail-fast: false - steps: - - uses: actions/checkout@v2 - - - name: Write s3cmd configuration - run: | - echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg - shell: bash - - - name: Install s3cmd - run: | - sudo apt-get --yes install s3cmd - - - name: Cleanup S3 bucket - run: | - s3cmd rm s3://${{ env.S3_BUCKET }} --recursive --force - - image_upload: - runs-on: ubuntu-22.04 - needs: s3_cleanup - concurrency: ${{ github.workflow 
}}-${{ github.ref }}-${{ matrix.build }} - strategy: - fail-fast: false - matrix: - build: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions + os_version: - RL8 - RL9 - - RL9-cuda + build: + - openstack.openhpc + - openstack.openhpc-cuda + exclude: + - os_version: RL8 + build: openstack.openhpc-cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - CI_CLOUD: ${{ vars.CI_CLOUD }} + CI_CLOUD: ${{ github.event.inputs.ci_cloud }} + SOURCE_IMAGES_MAP: | + { + "RL8": { + "openstack.openhpc": "rocky-latest-RL8", + "openstack.openhpc-cuda": "rocky-latest-cuda-RL8" + }, + "RL9": { + "openstack.openhpc": "rocky-latest-RL9", + "openstack.openhpc-cuda": "rocky-latest-cuda-RL9" + } + } + steps: - uses: actions/checkout@v2 - - name: Record which cloud CI is running on + - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} - - name: setup environment + - name: Setup ssh run: | - python3 -m venv venv - . venv/bin/activate - pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash - - - name: Write clouds.yaml - run: | - mkdir -p ~/.config/openstack/ - echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa shell: bash - - name: Write s3cmd configuration - run: | - echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg + - name: Add bastion's ssh key to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts shell: bash - - name: Install s3cmd - run: | - sudo apt-get --yes install s3cmd - - - name: Retrieve image name - run: | - TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") - echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" - shell: bash - - - name: Download image to runner - run: | - . venv/bin/activate - openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }} - shell: bash - - - name: Upload Image to S3 - run: | - echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." - s3cmd --multipart-chunk-size-mb=150 put ${{ env.TARGET_IMAGE }} s3://${{ env.S3_BUCKET }} - shell: bash - - image_sync: - needs: image_upload - runs-on: ubuntu-22.04 - concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cloud }}-${{ matrix.build }} - strategy: - fail-fast: false - matrix: - cloud: - - LEAFCLOUD - - SMS - - ARCUS - build: - - RL8 - - RL9 - - RL9-cuda - exclude: - - cloud: LEAFCLOUD - - env: - ANSIBLE_FORCE_COLOR: True - OS_CLOUD: openstack - CI_CLOUD: ${{ matrix.cloud }} - steps: - - uses: actions/checkout@v2 - - - name: Record which cloud CI is running on - run: | - echo CI_CLOUD: ${{ env.CI_CLOUD }} - - - name: setup environment - run: | - python3 -m venv venv - . venv/bin/activate - pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash + - name: Install ansible etc + run: dev/setup-env.sh - name: Write clouds.yaml run: | @@ -138,25 +74,47 @@ jobs: echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml shell: bash - - name: Retrieve image name + - name: Setup environment run: | - TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") - echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" + . venv/bin/activate + . 
environments/.stackhpc/activate - - name: Download latest image if missing + - name: Build fat image with packer + id: packer_build run: | + set -x . venv/bin/activate - bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} ${{ env.S3_BUCKET }} - - - name: Cleanup OpenStack Image (on error or cancellation) - if: cancelled() + . environments/.stackhpc/activate + cd packer/ + packer init . + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -only=${{ matrix.build }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + openstack.pkr.hcl + env: + PKR_VAR_os_version: ${{ matrix.os_version }} + SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} + + - name: Get created image names from manifest + id: manifest run: | . venv/bin/activate - image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') - if [ -n "$image_hanging" ]; then - echo "Cleaning up OpenStack image with ID: $image_hanging" - openstack image delete $image_hanging - else - echo "No image ID found, skipping cleanup." - fi - shell: bash + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! openstack image show -f value -c name $IMAGE_ID; do + sleep 5 + done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo $IMAGE_ID > image-id.txt + echo $IMAGE_NAME > image-name.txt + + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 + with: + name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true \ No newline at end of file diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml new file mode 100644 index 000000000..ee7c874fd --- /dev/null +++ b/.github/workflows/s3-image-sync.yml @@ -0,0 +1,162 @@ +name: Upload CI-tested images to Arcus S3 and sync clouds +on: + workflow_dispatch: + push: + branches: + - main + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' +env: + S3_BUCKET: openhpc-images-prerelease + IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + +jobs: + s3_cleanup: + runs-on: ubuntu-22.04 + concurrency: ${{ github.workflow }}-${{ github.ref }} + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v2 + + - name: Write s3cmd configuration + run: | + echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg + shell: bash + + - name: Install s3cmd + run: | + sudo apt-get --yes install s3cmd + + - name: Cleanup S3 bucket + run: | + s3cmd rm s3://${{ env.S3_BUCKET }} --recursive --force + + image_upload: + runs-on: ubuntu-22.04 + needs: s3_cleanup + concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} + strategy: + fail-fast: false + matrix: + build: + - RL8 + - RL9 + - RL9-cuda + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} + steps: + - uses: actions/checkout@v2 + + - name: Record which cloud CI is running on + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: setup environment + run: | + python3 -m venv venv + . 
venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Write s3cmd configuration + run: | + echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg + shell: bash + + - name: Install s3cmd + run: | + sudo apt-get --yes install s3cmd + + - name: Retrieve image name + run: | + TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") + echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" + shell: bash + + - name: Download image to runner + run: | + . venv/bin/activate + openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }} + shell: bash + + - name: Upload Image to S3 + run: | + echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." + s3cmd --multipart-chunk-size-mb=150 put ${{ env.TARGET_IMAGE }} s3://${{ env.S3_BUCKET }} + shell: bash + + image_sync: + needs: image_upload + runs-on: ubuntu-22.04 + concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cloud }}-${{ matrix.build }} + strategy: + fail-fast: false + matrix: + cloud: + - LEAFCLOUD + - SMS + - ARCUS + build: + - RL8 + - RL9 + - RL9-cuda + exclude: + - cloud: LEAFCLOUD + + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ matrix.cloud }} + steps: + - uses: actions/checkout@v2 + + - name: Record which cloud CI is running on + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: setup environment + run: | + python3 -m venv venv + . venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Retrieve image name + run: | + TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") + echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" + + - name: Download latest image if missing + run: | + . venv/bin/activate + bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} ${{ env.S3_BUCKET }} + + - name: Cleanup OpenStack Image (on error or cancellation) + if: cancelled() + run: | + . venv/bin/activate + image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') + if [ -n "$image_hanging" ]; then + echo "Cleaning up OpenStack image with ID: $image_hanging" + openstack image delete $image_hanging + else + echo "No image ID found, skipping cleanup." + fi + shell: bash From 4f993130e7afd8f66124618b47f75583dfbb9711 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 14 Oct 2024 09:41:23 +0000 Subject: [PATCH 74/78] update packer readme --- packer/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packer/README.md b/packer/README.md index 3bc188c7e..5e1d57dc2 100644 --- a/packer/README.md +++ b/packer/README.md @@ -7,9 +7,9 @@ The Packer configuration defined here builds "fat images" which contain binaries - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). 
- Improves deployment speed by reducing the number of package downloads required.
 
-By default, a fat image build starts from a RockyLinux GenericCloud image and updates all DNF packages already present.
+By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image.
 
-The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to:
+The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However, with some additional configuration it is also possible to:
 1. Build site-specific fat images from scratch.
 2. Extend an existing fat image with additional software.
 
@@ -39,9 +39,9 @@ The steps for building site-specific fat images or extending an existing fat ima
     cd packer/
     PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
 
-    Note that the `-only` flag here restricts the build to the non-OFED fat image "source" (in Packer terminology). Other
+    Note that the `-only` flag here restricts the build to the non-CUDA fat image "source" (in Packer terminology). Other
     source options are:
-    - `-only=openstack.openhpc-ofed`: Build a fat image including Mellanox OFED
+    - `-only=openstack.openhpc-cuda`: Build a fat image including CUDA packages.
    - `-only=openstack.openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name` must also be set in the Packer variables file.
 
 5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash.
@@ -70,7 +70,7 @@ What is Slurm Appliance-specific are the details of how Ansible is run:
       openhpc-extra = ["foo"]
     }
 
-    the build VM uses an existing "fat image" (rather than a RockyLinux GenericCloud one) and is added to the `builder` and `foo` groups. This means only code targeting `builder` and `foo` groups runs. In this way an existing image can be extended with site-specific code, without modifying the part of the image which has already been tested in the StackHPC CI.
+    the build VM uses an existing "fat image" (rather than a 'latest' nightly one) and is added to the `builder` and `foo` groups. This means only code targeting `builder` and `foo` groups runs. In this way an existing image can be extended with site-specific code, without modifying the part of the image which has already been tested in the StackHPC CI.
 
 - The playbook `ansible/fatimage.yml` is run which is only a subset of `ansible/site.yml`. This allows restricting the code which runs during build for cases where setting `builder` groupvars is not sufficient (e.g. a role always attempts to configure or start services). This may eventually be removed.
 
@@ -82,5 +82,5 @@ There are some things to be aware of when developing Ansible to run in a Packer
   - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars.
   - Ansible may need to proxy to real compute nodes. If Packer should not use the same proxy to connect to the build VMs (e.g.
build happens on a different network), proxy configuration should not be added to the `all` group. - - Currently two fat image "sources" are defined, with and without OFED. This simplifies CI configuration by allowing the + - Currently two fat image "sources" are defined, with and without CUDA. This simplifies CI configuration by allowing the default source images to be defined in the `openstack.pkr.hcl` definition. From 62a590679803f279b2af1dc824df40ba85699611 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 14 Oct 2024 11:15:20 +0100 Subject: [PATCH 75/78] Apply suggestions from code review Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> --- .github/workflows/s3-image-sync.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index ee7c874fd..8845ff773 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -54,7 +54,7 @@ jobs: run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} - - name: setup environment + - name: Setup environment run: | python3 -m venv venv . venv/bin/activate @@ -124,7 +124,7 @@ jobs: run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} - - name: setup environment + - name: Setup environment run: | python3 -m venv venv . venv/bin/activate @@ -149,7 +149,7 @@ jobs: bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} ${{ env.S3_BUCKET }} - name: Cleanup OpenStack Image (on error or cancellation) - if: cancelled() + if: cancelled() || failure() run: | . venv/bin/activate image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') From bf5793995357365e970165627deb248c5d925e53 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 14 Oct 2024 10:53:54 +0000 Subject: [PATCH 76/78] set matrix exclusion dynamically --- .github/workflows/s3-image-sync.yml | 7 +++++-- .github/workflows/upload-release-image.yml.sample | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index 8845ff773..0ffaae954 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -47,12 +47,15 @@ jobs: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ vars.CI_CLOUD }} + outputs: + ci_cloud: ${{ steps.ci.outputs.CI_CLOUD }} steps: - uses: actions/checkout@v2 - name: Record which cloud CI is running on + id: ci run: | - echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo "CI_CLOUD=${{ env.CI_CLOUD }}" >> "$GITHUB_OUTPUT" - name: Setup environment run: | @@ -111,7 +114,7 @@ jobs: - RL9 - RL9-cuda exclude: - - cloud: LEAFCLOUD + - cloud: ${{ needs.image_upload.outputs.ci_cloud }} env: ANSIBLE_FORCE_COLOR: True diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 264a96143..0b123bcf4 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -53,7 +53,7 @@ jobs: bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} ${{ inputs.bucket_name }} - name: Cleanup OpenStack Image (on error or cancellation) - if: cancelled() + if: cancelled() || failure() run: | . 
venv/bin/activate image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') From 52367cc95eae4028e82b9ac2cc208f4b5672ba18 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:15:14 +0100 Subject: [PATCH 77/78] Update docs to include operations (#422) * Update README.md * OSes supported as deploy hosts * undo readme OSes supported * add operations docs * simplify main README.md to only cover default configuration * move more-specific documentation into their own files * provide site docs directory * address docs review comments * Fix a / in docs Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * address PR comments on docs * address PR comments on docs --------- Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> --- README.md | 179 +++++++----------- docs/adding-functionality.md | 9 + docs/ci.md | 8 + docs/environments.md | 30 +++ ...ng.README.md => monitoring-and-logging.md} | 0 docs/operations.md | 148 +++++++++++++++ docs/production.md | 9 + docs/site/README.md | 6 + 8 files changed, 280 insertions(+), 109 deletions(-) create mode 100644 docs/adding-functionality.md create mode 100644 docs/ci.md create mode 100644 docs/environments.md rename docs/{monitoring-and-logging.README.md => monitoring-and-logging.md} (100%) create mode 100644 docs/operations.md create mode 100644 docs/production.md create mode 100644 docs/site/README.md diff --git a/README.md b/README.md index d348d66d0..b54cd110a 100644 --- a/README.md +++ b/README.md @@ -2,36 +2,47 @@ # StackHPC Slurm Appliance -This repository contains playbooks and configuration to define a Slurm-based HPC environment including: -- A Rocky Linux 9 and OpenHPC v3-based Slurm cluster. -- Shared fileystem(s) using NFS (with servers within or external to the cluster). -- Slurm accounting using a MySQL backend. -- A monitoring backend using Prometheus and ElasticSearch. -- Grafana with dashboards for both individual nodes and Slurm jobs. -- Production-ready Slurm defaults for access and memory. -- A Packer-based build pipeline for compute and login node images. - -The repository is designed to be forked for a specific use-case/HPC site but can contain multiple environments (e.g. development, staging and production). It has been designed to be modular and extensible, so if you add features for your HPC site please feel free to submit PRs back upstream to us! - -While it is tested on OpenStack it should work on any cloud, except for node rebuild/reimaging features which are currently OpenStack-specific. - -## Prerequisites -It is recommended to check the following before starting: -- You have root access on the "ansible deploy host" which will be used to deploy the appliance. +This repository contains playbooks and configuration to define a Slurm-based HPC environment. This includes: +- [Rocky Linux](https://rockylinux.org/)-based hosts. +- [OpenTofu](https://opentofu.org/) configurations to define the cluster's infrastructure-as-code. +- Packages for Slurm and MPI software stacks from [OpenHPC](https://openhpc.community/). +- Shared fileystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [Openstack Manila](https://wiki.openstack.org/wiki/Manila). +- Slurm accounting using a MySQL database. 
+- Monitoring integrated with Slurm jobs using Prometheus, ElasticSearch and Grafana. +- A web-based portal from [OpenOndemand](https://openondemand.org/). +- Production-ready default Slurm configurations for access and memory limits. +- [Packer](https://developer.hashicorp.com/packer)-based image build configurations for node images. + +The repository is expected to be forked for a specific HPC site but can contain multiple environments for e.g. development, staging and production clusters +sharing a common configuration. It has been designed to be modular and extensible, so if you add features for your HPC site please feel free to submit PRs +back upstream to us! + +While it is tested on OpenStack it should work on any cloud with appropriate OpenTofu configuration files. + +## Demonstration Deployment + +The default configuration in this repository may be used to create a cluster to explore use of the appliance. It provides: +- Persistent state backed by an OpenStack volume. +- NFS-based shared file system backed by another OpenStack volume. + +Note that the OpenOndemand portal and its remote apps are not usable with this default configuration. + +It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud. + +Before starting ensure that: +- You have root access on the deploy host. - You can create instances using a Rocky 9 GenericCloud image (or an image based on that). - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. However the appliance will install the necessary packages if a GenericCloud image is used. -- SSH keys get correctly injected into instances. -- Instances have access to internet (note proxies can be setup through the appliance if necessary). -- DNS works (if not this can be partially worked around but additional configuration will be required). +- You have a SSH keypair defined in OpenStack, with the private key available on the deploy host. +- Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). -## Installation on deployment host +### Setup deploy host -Current Operating Systems supported to be deploy hosts: +The following operating systems are supported for the deploy host: - Rocky Linux 9 - Rocky Linux 8 -- Ubuntu 22.04 These instructions assume the deployment host is running Rocky Linux 8: @@ -40,28 +51,11 @@ These instructions assume the deployment host is running Rocky Linux 8: cd ansible-slurm-appliance ./dev/setup-env.sh -## Overview of directory structure - -- `environments/`: Contains configurations for both a "common" environment and one or more environments derived from this for your site. These define ansible inventory and may also contain provisioning automation such as Terraform or OpenStack HEAT templates. -- `ansible/`: Contains the ansible playbooks to configure the infrastruture. -- `packer/`: Contains automation to use Packer to build compute nodes for an enviromment - see the README in this directory for further information. -- `dev/`: Contains development tools. - -## Environments +You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install/rpm/). 
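+
+As a minimal sketch of one way to do this (assuming the standalone installer script documented by OpenTofu at the link above - check those docs for current options, as this is not verified here):
+
+    curl -fsSL https://get.opentofu.org/install-opentofu.sh -o install-opentofu.sh
+    chmod +x install-opentofu.sh
+    ./install-opentofu.sh --install-method rpm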
-### Overview +### Create a new environment -An environment defines the configuration for a single instantiation of this Slurm appliance. Each environment is a directory in `environments/`, containing: -- Any deployment automation required - e.g. Terraform configuration or HEAT templates. -- An ansible `inventory/` directory. -- An `activate` script which sets environment variables to point to this configuration. -- Optionally, additional playbooks in `/hooks` to run before or after the main tasks. - -All environments load the inventory from the `common` environment first, with the environment-specific inventory then overriding parts of this as required. - -### Creating a new environment - -This repo contains a `cookiecutter` template which can be used to create a new environment from scratch. Run the [installation on deployment host](#Installation-on-deployment-host) instructions above, then in the repo root run: +Use the `cookiecutter` template to create a new environment to hold your configuration. In the repository root run: . venv/bin/activate cd environments @@ -69,86 +63,53 @@ This repo contains a `cookiecutter` template which can be used to create a new e and follow the prompts to complete the environment name and description. -Alternatively, you could copy an existing environment directory. - -Now add deployment automation if required, and then complete the environment-specific inventory as described below. +**NB:** In subsequent sections this new environment is refered to as `$ENV`. -### Environment-specific inventory structure +Now generate secrets for this environment: -The ansible inventory for the environment is in `environments//inventory/`. It should generally contain: -- A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, ssh proxy arguments etc. -- A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed. This repository generally follows a convention where functionality is defined using ansible roles applied to a a group of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be enabled in a specific environment's `groups` file. Two template examples are provided in `environments/commmon/layouts/` demonstrating a minimal appliance with only the Slurm cluster itself, and an appliance with all functionality. -- Optionally, group variable files in `group_vars//overrides.yml`, where the group names match the functional groups described above. These can be used to override the default configuration for each functionality, which are defined in `environments/common/inventory/group_vars/all/.yml` (the use of `all` here is due to ansible's precedence rules). + ansible-playbook ansible/adhoc/generate-passwords.yml -Although most of the inventory uses the group convention described above there are a few special cases: -- The `control`, `login` and `compute` groups are special as they need to contain actual hosts rather than child groups, and so should generally be defined in the templated-out `hosts` file. -- The cluster name must be set on all hosts using `openhpc_cluster_name`. 
Using an `[all:vars]` section in the `hosts` file is usually convenient. -- `environments/common/inventory/group_vars/all/defaults.yml` contains some variables which are not associated with a specific role/feature. These are unlikely to need changing, but if necessary that could be done using a `environments//inventory/group_vars/all/overrides.yml` file. -- The `ansible/adhoc/generate-passwords.yml` playbook sets secrets for all hosts in `environments//inventory/group_vars/all/secrets.yml`. -- The Packer-based pipeline for building compute images creates a VM in groups `builder` and `compute`, allowing build-specific properties to be set in `environments/common/inventory/group_vars/builder/defaults.yml` or the equivalent inventory-specific path. -- Each Slurm partition must have: - - An inventory group `_` defining the hosts it contains - these must be homogenous w.r.t CPU and memory. - - An entry in the `openhpc_slurm_partitions` mapping in `environments//inventory/group_vars/openhpc/overrides.yml`. - See the [openhpc role documentation](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) for more options. -- On an OpenStack cloud, rebuilding/reimaging compute nodes from Slurm can be enabled by defining a `rebuild` group containing the relevant compute hosts (e.g. in the generated `hosts` file). +### Define infrastructure configuration -## Creating a Slurm appliance +Create an OpenTofu variables file to define the required infrastructure, e.g.: -NB: This section describes generic instructions - check for any environment-specific instructions in `environments//README.md` before starting. + # environments/$ENV/terraform/terraform.tfvars: -1. Activate the environment - this **must be done** before any other commands are run: + cluster_name = "mycluster" + cluster_net = "some_network" # * + cluster_subnet = "some_subnet" # * + key_pair = "my_key" # * + control_node_flavor = "some_flavor_name" + login_nodes = { + login-0: "login_flavor_name" + } + cluster_image_id = "rocky_linux_9_image_uuid" + compute = { + general = { + nodes: ["compute-0", "compute-1"] + flavor: "compute_flavor_name" + } + } - source environments//activate +Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables +and descriptions see `environments/$ENV/terraform/terraform.tfvars`. -2. Deploy instances - see environment-specific instructions. +### Deploy appliance -3. Generate passwords: + ansible-playbook ansible/site.yml - ansible-playbook ansible/adhoc/generate-passwords.yml +You can now log in to the cluster using: - This will output a set of passwords in `environments//inventory/group_vars/all/secrets.yml`. It is recommended that these are encrpyted and then commited to git using: + ssh rocky@$login_ip - ansible-vault encrypt inventory/group_vars/all/secrets.yml +where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml` - See the [Ansible vault documentation](https://docs.ansible.com/ansible/latest/user_guide/vault.html) for more details. -4. Deploy the appliance: - - ansible-playbook ansible/site.yml - - or if you have encrypted secrets use: - - ansible-playbook ansible/site.yml --ask-vault-password - - Tags as defined in the various sub-playbooks defined in `ansible/` may be used to only run part of the `site` tasks. - -5. 
"Utility" playbooks for managing a running appliance are contained in `ansible/adhoc` - run these by activating the environment and using: - - ansible-playbook ansible/adhoc/ - - Currently they include the following (see each playbook for links to documentation): - - `hpctests.yml`: MPI-based cluster tests for latency, bandwidth and floating point performance. - - `rebuild.yml`: Rebuild nodes with existing or new images (NB: this is intended for development not for reimaging nodes on an in-production cluster - see `ansible/roles/rebuild` for that). - - `restart-slurm.yml`: Restart all Slurm daemons in the correct order. - - `update-packages.yml`: Update specified packages on cluster nodes. - -## Adding new functionality -Please contact us for specific advice, but in outline this generally involves: -- Adding a role. -- Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`. -- Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/common/layouts/everything`. -- Adding new default group vars into `environments/common/inventory/group_vars/all//`. -- Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`. -- Updating READMEs. - -## Monitoring and logging - -Please see the [monitoring-and-logging.README.md](docs/monitoring-and-logging.README.md) for details. - -## CI/CD automation - -The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintainence tasks. These include: +## Overview of directory structure -- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. +- `environments/`: See [docs/environments.md](docs/environments.md). +- `ansible/`: Contains the ansible playbooks to configure the infrastruture. +- `packer/`: Contains automation to use Packer to build machine images for an enviromment - see the README in this directory for further information. +- `dev/`: Contains development tools. -- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud. \ No newline at end of file +For further information see the [docs](docs/) directory. diff --git a/docs/adding-functionality.md b/docs/adding-functionality.md new file mode 100644 index 000000000..69d3b3a3f --- /dev/null +++ b/docs/adding-functionality.md @@ -0,0 +1,9 @@ +# Adding new functionality + +Please contact us for specific advice, but this generally involves: +- Adding a role. +- Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`. +- Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/common/layouts/everything`. +- Adding new default group vars into `environments/common/inventory/group_vars/all//`. +- Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`. +- Updating READMEs. 
diff --git a/docs/ci.md b/docs/ci.md
new file mode 100644
index 000000000..c6fa8900d
--- /dev/null
+++ b/docs/ci.md
@@ -0,0 +1,8 @@
+# CI/CD automation
+
+The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintenance tasks. These include:
+
+- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published.
+
+- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud.
+
diff --git a/docs/environments.md b/docs/environments.md
new file mode 100644
index 000000000..d1c492312
--- /dev/null
+++ b/docs/environments.md
@@ -0,0 +1,30 @@
+# Environments
+
+## Overview
+
+An environment defines the configuration for a single instantiation of this Slurm appliance. Each environment is a directory in `environments/`, containing:
+- Any deployment automation required - e.g. OpenTofu configuration or HEAT templates.
+- An Ansible `inventory/` directory.
+- An `activate` script which sets environment variables to point to this configuration.
+- Optionally, additional playbooks in `hooks/` to run before or after the default playbooks.
+
+All environments load the inventory from the `common` environment first, with the environment-specific inventory then overriding parts of this as required.
+
+### Environment-specific inventory structure
+
+The ansible inventory for the environment is in `environments/<environment>/inventory/`. It should generally contain:
+- A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, ssh proxy arguments etc.
+- A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed. This repository generally follows a convention where functionality is defined using ansible roles applied to a group of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be enabled in a specific environment's `groups` file. Two template examples are provided in `environments/common/layouts/` demonstrating a minimal appliance with only the Slurm cluster itself, and an appliance with all functionality.
+- Optionally, group variable files in `group_vars/<group_name>/overrides.yml`, where the group names match the functional groups described above. These can be used to override the default configuration for each functionality, which is defined in `environments/common/inventory/group_vars/all/<group_name>.yml` (the use of `all` here is due to ansible's precedence rules).
+
+Although most of the inventory uses the group convention described above there are a few special cases:
+- The `control`, `login` and `compute` groups are special as they need to contain actual hosts rather than child groups, and so should generally be defined in the templated-out `hosts` file.
+- The cluster name must be set on all hosts using `openhpc_cluster_name`. Using an `[all:vars]` section in the `hosts` file is usually convenient.
+- `environments/common/inventory/group_vars/all/defaults.yml` contains some variables which are not associated with a specific role/feature. These are unlikely to need changing, but if necessary that could be done using a `environments/<environment>/inventory/group_vars/all/overrides.yml` file.
+- The `ansible/adhoc/generate-passwords.yml` playbook sets secrets for all hosts in `environments/<environment>/inventory/group_vars/all/secrets.yml`.
+- The Packer-based pipeline for building compute images creates a VM in groups `builder` and `compute`, allowing build-specific properties to be set in `environments/common/inventory/group_vars/builder/defaults.yml` or the equivalent inventory-specific path.
+- Each Slurm partition must have:
+  - An inventory group `<cluster_name>_<partition_name>` defining the hosts it contains - these must be homogeneous w.r.t. CPU and memory.
+  - An entry in the `openhpc_slurm_partitions` mapping in `environments/<environment>/inventory/group_vars/openhpc/overrides.yml`.
+  See the [openhpc role documentation](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) for more options.
+- On an OpenStack cloud, rebuilding/reimaging compute nodes from Slurm can be enabled by defining a `rebuild` group containing the relevant compute hosts (e.g. in the generated `hosts` file).
diff --git a/docs/monitoring-and-logging.README.md b/docs/monitoring-and-logging.md
similarity index 100%
rename from docs/monitoring-and-logging.README.md
rename to docs/monitoring-and-logging.md
diff --git a/docs/operations.md b/docs/operations.md
new file mode 100644
index 000000000..a20d7f10c
--- /dev/null
+++ b/docs/operations.md
@@ -0,0 +1,148 @@
+# Operations
+
+This page describes the commands required for common operations.
+
+All subsequent sections assume that:
+- Commands are run from the repository root, unless otherwise indicated by a `cd` command.
+- An Ansible vault secret is configured.
+- The correct private key is available to Ansible.
+- Appropriate OpenStack credentials are available.
+- Any non-appliance-controlled infrastructure is available (e.g. networks, volumes, etc.).
+- `$ENV` is your current, activated environment, as defined by e.g. `environments/production/`.
+- `$SITE_ENV` is the base site-specific environment, as defined by e.g. `environments/mysite/`.
+- A string `some/path/to/file.yml:myvar` defines a path relative to the repository root and an Ansible variable in that file.
+- Configuration is generally common to all environments at a site, i.e. is made in `environments/$SITE_ENV` not `environments/$ENV`.
+
+Review any [site-specific documentation](site/README.md) for more details on the above.
+
+# Deploying a Cluster
+
+This follows the same process as defined in the main [README.md](../README.md) for the default configuration.
+
+Note that tags as defined in the various sub-playbooks defined in `ansible/` may be used to only run part of the tasks in `site.yml`.
+
+# SSH to Cluster Nodes
+
+This depends on how the cluster is accessed.
+
+The script `dev/ansible-ssh` may generally be used to connect to a host specified by an `inventory_hostname` using the same connection details as Ansible. If this does not work:
+- Instance IPs are normally defined in `ansible_host` variables in an inventory file `environments/$ENV/inventory/hosts{,.yml}`.
+- The ssh user is defined by `ansible_user`; the default is `rocky`. This may be overridden in your environment.
+- If a jump host is required the user and address may be defined in the above inventory file.
+
+# Modifying general Slurm.conf parameters
+Parameters for [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) can be added to an `openhpc_config_extra` mapping in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`.
+Note that values in this mapping may be:
+- A string, which will be inserted as-is.
+- A list, which will be converted to a comma-separated string.
+
+This allows specifying `slurm.conf` contents in a YAML-format, Ansible-native way.
+
+**NB:** The appliance provides some default values in `environments/common/inventory/group_vars/all/openhpc.yml:openhpc_config_default` which is combined with the above. The `enable_configless` flag in the `SlurmCtldParameters` key this sets must not be overridden - a validation step checks this has not happened.
+
+See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes.
+
+# Modifying Slurm Partition-specific Configuration
+
+Modify the `openhpc_slurm_partitions` mapping usually in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml` as described for [stackhpc.openhpc:slurmconf](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) (note the relevant version of this role is defined in the `requirements.yml`).
+
+Note an Ansible inventory group for the partition is required. This is generally auto-defined by a template in the OpenTofu configuration.
+
+**NB:** `default:NO` must be set on all non-default partitions, otherwise the last defined partition will always be set as the default.
+
+See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes.
+
+# Adding an Additional Partition
+This is usually a two-step process:
+
+- If new nodes are required, define a new node group by adding an entry to the `compute` mapping in `environments/$ENV/tofu/main.tf` assuming the default OpenTofu configuration:
+  - The key is the partition name.
+  - The value should be a mapping, with the parameters defined in `environments/$SITE_ENV/terraform/compute/variables.tf`, but in brief will need at least `flavor` (name) and `nodes` (a list of node name suffixes).
+- Add a new partition to the partition configuration as described under [Modifying Slurm Partition-specific Configuration](#Modifying-Slurm-Partition-specific-Configuration) - a sketch is shown after the packages section below.
+
+Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster).
+
+# Adding Additional Packages
+Packages from any enabled DNF repositories (which always includes EPEL, PowerTools and OpenHPC) can be added to all nodes by defining a list `openhpc_packages_extra` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. For example:
+
+    # environments/foo-base/inventory/group_vars/all/openhpc.yml:
+    openhpc_packages_extra:
+      - somepackage
+      - anotherpackage
+
+The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, MPI libraries etc. include corresponding `lmod` modules.
+
+To add these packages to the current cluster, run the same command as for [Reconfiguring Slurm](#Reconfiguring-Slurm). TODO: describe what's required to add these to site-specific images.
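As a sketch of the partition configuration referenced above - the `general` and `gpu` partition names are purely illustrative, and a matching `<cluster_name>_gpu` inventory group is assumed to already exist:

    # environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml (sketch):
    openhpc_slurm_partitions:
      - name: general        # existing default partition
      - name: gpu
        default: NO          # required on all non-default partitions

Applying this follows [Deploying a Cluster](#Deploying-a-Cluster) if new nodes were added, or just [Reconfiguring Slurm](#Reconfiguring-Slurm) otherwise.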
+
+If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
+ - `ansible.builtin.yum_repository`: Add a repo from a URL providing a 'repodata' directory.
+ - `ansible.builtin.rpm_key`: Add a GPG key to the RPM database.
+ - `ansible.builtin.get_url`: Can be used to install a repofile directly from a URL (e.g. https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo)
+ - `ansible.builtin.dnf`: Can be used to install 'release packages' providing repos, e.g. `epel-release`, `ohpc-release`.
+
+The packages to be installed from that repo could also be defined in that play - a sketch is given below. Note using the `dnf` module with a list for its `name` parameter is more efficient and allows better dependency resolution than calling the module in a loop.
+
+Adding these repos/packages to the cluster/image would then require running:
+
+    ansible-playbook environments/$SITE_ENV/hooks/{pre,post}.yml
+
+as appropriate.
+
+TODO: improve description about adding these to extra images.
+
+# Reconfiguring Slurm
+
+At a minimum run:
+
+    ansible-playbook ansible/slurm.yml --tags openhpc
+
+**NB:** This will restart all daemons if the `slurm.conf` has any changes, even if technically only a `scontrol reconfigure` is required.
+
+# Running the MPI Test Suite
+
+See [ansible/roles/hpctests/README.md](ansible/roles/hpctests/README.md) for a description of these. They can be run using:
+
+    ansible-playbook ansible/adhoc/hpctests.yml
+
+Note that:
+- The above role provides variables to select specific partitions, nodes and interfaces which may be required. If not set in inventory, these can be passed as extravars:
+
+      ansible-playbook ansible/adhoc/hpctests.yml -e hpctests_myvar=foo
+- The HPL-based test is only reasonably optimised on Intel processors due to the libraries and default parallelisation scheme used. For AMD processors it is recommended this
+is skipped using:
+
+      ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo
+
+Review any [site-specific documentation](site/README.md) for more details.
+
+# Running CUDA Tests
+This uses the [cuda-samples](https://github.com/NVIDIA/cuda-samples/) utilities "deviceQuery" and "bandwidthTest" to test GPU functionality. It automatically runs on any
+host in the `cuda` inventory group:
+
+    ansible-playbook ansible/adhoc/cudatests.yml
+
+**NB:** This test is not launched through Slurm, so confirm nodes are free/out of service or use `--limit` appropriately.
+
+# Ad-hoc Commands and Playbooks
+
+A set of utility playbooks for managing a running appliance are provided in `ansible/adhoc` - run these by activating the environment and using:
+
+    ansible-playbook ansible/adhoc/$PLAYBOOK
+
+Currently they include the following (see each playbook for links to documentation):
+
+- `hpctests.yml`: MPI-based cluster tests for latency, bandwidth and floating point performance.
+- `rebuild.yml`: Rebuild nodes with existing or new images (NB: this is intended for development not for reimaging nodes on an in-production cluster).
+- `restart-slurm.yml`: Restart all Slurm daemons in the correct order.
+- `update-packages.yml`: Update specified packages on cluster nodes (NB: not recommended for routine use).
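As an illustration of the repository hook play described under "Adding Additional Packages" above, a minimal sketch might look like the following - the target group, repository name, URL and package names are all hypothetical and would need adjusting for a real site:

    # environments/$SITE_ENV/hooks/post.yml (sketch - all names/URLs hypothetical):
    - hosts: cluster             # assumed group covering all nodes; deliberately
      become: yes                # does not exclude the builder group
      tasks:
        - name: Add an extra DNF repository
          ansible.builtin.yum_repository:
            name: example-extra
            description: Example extra repository
            baseurl: https://repo.example.org/el9/
            gpgcheck: false
        - name: Install packages from the extra repository
          ansible.builtin.dnf:
            name:                # a single list gives better dependency resolution than a loop
              - example-package
              - another-package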
+
+The `ansible` binary [can be used](https://docs.ansible.com/ansible/latest/command_guide/intro_adhoc.html) to run arbitrary shell commands against inventory groups or hosts, for example:
+
+    ansible <group/host> [--become] -m shell -a "<command>"
+
+This can be useful for debugging and development but any modifications made this way will be lost if nodes are rebuilt/reimaged.
diff --git a/docs/production.md b/docs/production.md
new file mode 100644
index 000000000..7219ee7fc
--- /dev/null
+++ b/docs/production.md
@@ -0,0 +1,9 @@
+# Production Deployments
+
+This page contains some brief notes about differences between the default/demo configuration, as described in the main [README.md](../README.md), and production-ready deployments.
+
+- Create a site environment. Usually at least production, staging and possibly development environments are required. To avoid divergence of configuration these should all have an `inventory` path referencing a shared, site-specific base environment. Where possible hooks should also be placed in this site-specific environment.
+- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. To ensure staging environments are a good model for production this should generally be moved into the site-specific environment. It can be encrypted using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) and then committed to the repository.
+- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal instances) it may be necessary to configure or proxy `chronyd` via an environment hook.
+- Remove production volumes from OpenTofu control. In the default OpenTofu configuration, deleting the resources also deletes the volumes used for persistent state and home directories. This is usually undesirable for production, so these resources should be removed from the OpenTofu configurations and manually deployed once. However note that for development environments leaving them under OpenTofu control is usually best.
+- Configure Open OnDemand - see [specific documentation](openondemand.README.md).
diff --git a/docs/site/README.md b/docs/site/README.md
new file mode 100644
index 000000000..ee147875c
--- /dev/null
+++ b/docs/site/README.md
@@ -0,0 +1,6 @@
+# Site-specific Documentation
+
+This document is a placeholder for any site-specific documentation, e.g. environment descriptions.
+
+#TODO: list things which should commonly be specified here.
+

From 3f85f774229e5cdaa5b66f88aefd11999d8cba8b Mon Sep 17 00:00:00 2001
From: Cloud User
Date: Tue, 15 Oct 2024 14:41:59 +0000
Subject: [PATCH 78/78] Ansible playbook to configure sshd for Conch CA certs.

---
 ansible/ca-cert.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 ansible/ca-cert.yml

diff --git a/ansible/ca-cert.yml b/ansible/ca-cert.yml
new file mode 100644
index 000000000..34320115a
--- /dev/null
+++ b/ansible/ca-cert.yml
@@ -0,0 +1,27 @@
+# An Ansible playbook to configure sshd to enable CA certificate authentication for SSH.
+# Remember to export CI_CLOUD if it isn't part of your environment's variables.
+
+# NOTE: Change the `src` for `ssh_signing_key.pub` to the directory containing your signing key.
+
+- hosts: login
+  gather_facts: true
+  become: true
+  tasks:
+    - name: Copy ssh public key
+      ansible.builtin.copy:
+        src: /var/lib/rocky/conch/ssh_signing_key.pub
+        dest: /etc/ssh/ca_user_key.pub
+        owner: root
+        group: root
+        mode: '0644'
+        remote_src: true
+
+    - name: Ensure CA Certs are accepted
+      ansible.builtin.lineinfile:
+        line: 'TrustedUserCAKeys /etc/ssh/ca_user_key.pub'
+        dest: /etc/ssh/sshd_config
+
+    - name: Restart SSH service
+      ansible.builtin.systemd:
+        name: sshd
+        state: restarted
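Assuming the environment is activated and the signing key already exists at the `src` path on the login node, the playbook above could then be run in the usual way:

    ansible-playbook ansible/ca-cert.yml

With `TrustedUserCAKeys` set, users presenting certificates signed by the corresponding CA key can then ssh to the login node without their individual public keys needing to appear in `authorized_keys`.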