From b0fc55a4cebc4fa704062ce7a1f9d12167b1eebc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 17 Sep 2025 09:04:50 +0100 Subject: [PATCH 1/8] remove default build groups --- environments/.stackhpc/inventory/extra_groups | 4 ++++ environments/common/inventory/groups | 1 - environments/site/inventory/groups | 7 +++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 29d9d9378..6d676e6a2 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -54,3 +54,7 @@ compute [raid:children] # Configure fatimage for raid builder + +[gateway:children] +# Install gateway ansible-init playbook into image +builder diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 246162116..bc394e43d 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -194,7 +194,6 @@ k3s_agent [dnf_repos:children] # Hosts to replace system repos with Pulp repos # Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users -builder extra_packages [pulp_site] diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 342201f97..0a4434d06 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -136,7 +136,11 @@ cluster [extra_packages:children] # Hosts to install specified additional packages on -builder + +[dnf_repos:children] +# Hosts to replace system repos with Pulp repos +# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users +extra_packages [cacerts] # Hosts to configure CA certificates and trusts on @@ -146,7 +150,6 @@ builder [gateway:children] # Add builder to this group to install gateway ansible-init playbook into image -builder 
[nhc:children] # Hosts to configure for node health checks From e3a1d4f08060885b8c734af0ad254a222a7bf91d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 17 Sep 2025 12:06:35 +0100 Subject: [PATCH 2/8] fixup doca/cuda inventory groups --- environments/common/inventory/groups | 5 +++++ environments/site/inventory/groups | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index bc394e43d..48934a39c 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -129,6 +129,9 @@ prometheus freeipa_server freeipa_client +[doca] +# Add `builder` to install NVIDIA DOCA during image build + [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md @@ -195,6 +198,8 @@ k3s_agent # Hosts to replace system repos with Pulp repos # Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users extra_packages +cuda +doca [pulp_site] # Add builder to this group to automatically sync pulp during image build diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 0a4434d06..ff877958d 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -74,6 +74,9 @@ cluster [freeipa_client] # Hosts to be a FreeIPA client. 
See ansible/roles/freeipa/README.md +[doca] +# Add `builder` to install NVIDIA DOCA during image build + [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md @@ -137,10 +140,9 @@ cluster [extra_packages:children] # Hosts to install specified additional packages on -[dnf_repos:children] +[dnf_repos] # Hosts to replace system repos with Pulp repos # Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users -extra_packages [cacerts] # Hosts to configure CA certificates and trusts on From 57d352beb0d66a90dccc8c98806a53cf389d6f0b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 17 Sep 2025 15:10:33 +0100 Subject: [PATCH 3/8] add fatimage inventory group --- .github/workflows/fatimage.yml | 4 ++-- environments/.stackhpc/inventory/extra_groups | 17 +---------------- environments/common/inventory/groups | 7 ++++++- environments/site/inventory/groups | 10 +++++++++- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 51ea29a60..c7d111208 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -29,10 +29,10 @@ jobs: build: - image_name: openhpc-RL8 source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.raw - inventory_groups: control,compute,login,update + inventory_groups: fatimage - image_name: openhpc-RL9 source_image_name: Rocky-9-GenericCloud-Base-9.6-20250531.0.x86_64.qcow2 - inventory_groups: control,compute,login,update + inventory_groups: fatimage env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 6d676e6a2..525965c1b 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -28,20 +28,9 @@ compute cluster [tuned:children] -# Install tuned into fat image -# NB: builder has 
tuned_enabled and tuned_started false so does not configure it -builder -# Also test tuned during site playbook +# Test tuned during site playbook cluster -[squid:children] -# Install squid into fat image -builder - -[sssd:children] -# Install sssd into fat image -builder - [rebuild:children] control @@ -51,10 +40,6 @@ cluster [compute_init:children] compute -[raid:children] -# Configure fatimage for raid -builder - [gateway:children] # Install gateway ansible-init playbook into image builder diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 48934a39c..9057f9a77 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -196,7 +196,6 @@ k3s_agent [dnf_repos:children] # Hosts to replace system repos with Pulp repos -# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users extra_packages cuda doca @@ -224,3 +223,9 @@ doca [raid] # Add `builder` to configure image for software raid + +[fatimage:children] +# Minimal configuration for fat image build +control +login +compute diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index ff877958d..f16a28ad3 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -142,7 +142,6 @@ cluster [dnf_repos] # Hosts to replace system repos with Pulp repos -# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users [cacerts] # Hosts to configure CA certificates and trusts on @@ -174,3 +173,12 @@ compute [raid] # Add `builder` to configure image for software raid + +[fatimage:children] +# Standard additional functionality for fat image build, as used by StackHPC CI images: +gateway +raid +squid +sssd +tuned +update From 5c8de5a0cfb2fe3279febb661a3e55af95efcf46 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 17 Sep 2025 15:11:32 +0100 
Subject: [PATCH 4/8] update docs for image build --- docs/image-build.md | 102 ++++++++++++++++++++++++++++++++++---------- docs/operations.md | 27 +++++++----- 2 files changed, 96 insertions(+), 33 deletions(-) diff --git a/docs/image-build.md b/docs/image-build.md index dc968ebfd..4c371189b 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -1,47 +1,103 @@ # Packer-based image build -The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. +The appliance contains configuration to use [Packer](https://developer.hashicorp.com/packer) +with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) +to build images. Using images: +- Enables the image to be tested in a `staging` environment before deployment + to the `production` environment. +- Ensures re-deployment of the cluster or deployment of additional nodes is + repeatable. +- Improves deployment speed by reducing the number of package installations. + +The Packer configuration here can be used to build two types of images: +1. "Fat images" which contain packages, binaries and container images but no + cluster-specific configuration. These start from a RockyLinux GenericCloud + (or compatible) image. The fat images StackHPC builds and tests in CI are + available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). + However site-specific fat images can also be built from a different source + image e.g. if a different partition layout is required. +2. "Extra-build" images which extend a StackHPC fat image to create a site-specific + image with with additional packages or functionality. For example the NVIDIA + `cuda` packages cannot be redistributed hence require an "extra" build. 
-The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: -- Enables the image to be tested in CI before production use. -- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). -- Improves deployment speed by reducing the number of package downloads to improve deployment speed. +# Usage -The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: -1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional functionality. +For either a site-specific fat-image build or an extra-build: +1. Ensure the current OpenStack credentials have sufficient authorisation to + upload images (this may or may not require the `member` role for an + application credential, depending on your OpenStack configuration). +2. If package installs are required, add the provided dev credentials for + StackHPC's "Ark" Pulp server to the `site` environment: -# Usage + ```yaml + # environments/site/inventory/group_vars/all/dnf_repos.yml: + dnf_repos_username: your-ark-username + dnf_repos_password: "{{ vault_dnf_repos_password }}" + ``` + ```yaml + # environments/site/inventory/group_vars/all/dnf_repos.yml: + dnf_repos_password: 'your-ark-password' + ``` + > [!IMPORTANT] + > The latter file should be vault-encrypted. -To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: + Alternatively, configure a [local Pulp mirror](experimental/pulp.md). -1. 
Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you wish instead wish to use a local Pulp server. -3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum: +3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) containing at a minimum e.g.: ```hcl + # environments/site/builder.pkrvars.hcl: flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image - inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to + inventory_groups = "cuda" # Additional inventory groups to add build VM to ``` Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - The flavor used must have sufficent memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficent. 
By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. - - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. - - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. All possible groups are listed in `environments/common/groups` but common options for this variable will be: - - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. - - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. + - Normally the network must provide outbound internet access. However it + does not need to provide access to resources used by the actual cluster + nodes (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficient memory for the build tasks (usually + 8GB), but otherwise does not need to match the actual cluster node + flavor(s). + - By default, the build VM is volume-backed to allow control of the root + disk size (and hence final image size), so the flavor's disk size does not + matter. 
The default volume size is not sufficient if enabling `cuda` and/or + `doca` and should be increased: + ```terraform + volume_size = 35 # GB + ``` + - The source image should be either: + - For a site-specific fatimage build: A RockyLinux GenericCloud or + compatible image. + - For an extra-build image: The appropriate StackHPC fat image, as defined + in `environments/.stackhpc/tofu/cluster_image.auto.tfvars.json`. See the + [GitHub release page](https://github.com/stackhpc/ansible-slurm-appliance/releases) + for download links. + - The `inventory_groups` variable takes a comma-separated list of Ansible + inventory groups to add the build VM to (in addition to the `builder` + group which is it always in). This controls which Ansible roles and + functionality run during build, and hence what gets added to the image. + All possible groups are listed in `environments/common/inventory/groups` but common + options for this variable will be: + - For a fatimage build: `fatimage`: This is defined in `environments/{common,site}/inventory/groups` + and results in an update of all packages in the source image, plus + installation of packages for default control, login and compute nodes. + - For an extra-build image, one or more specific groups e.g. `cuda` or + `doca,lustre`. This extends the source image with just this additional + functionality. + + See the top of [packer/openstack.pkr.hcl](../packer/openstack.pkr.hcl) + for all possible variables which can be set. 4. Activate the venv and the relevant environment. 5. 
Build images using the relevant variable definition file, e.g.: cd packer/ - PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=../environments/site/builder.pkrvars.hcl openstack.pkr.hcl **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: @@ -53,7 +109,9 @@ To build either a site-specific fat image from scratch, or to extend an existing then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). -6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. +6. The built image will be automatically uploaded to OpenStack. By default it + will have a name prefixed `openhpc` and including a timestamp and a shortened + git hash. # Build Process diff --git a/docs/operations.md b/docs/operations.md index 4c5c640c5..a1ab2018c 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -95,22 +95,29 @@ By default, the following utility packages are installed during the StackHPC ima - s-nail Additional packages can be added during image builds by: -- adding the `extra_packages` group to the build `inventory_groups` (see -[docs/image-build.md](./image-build.md)) -- defining a list of packages in `appliances_extra_packages_other` in e.g. -`environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: + +1. Configuring an [docs/image-build.md](./image-build.md) to enable the + `extra_packages` group: + + + ```terraform + # environments/site/builder.pkrvars.hcl: + ... + inventory_groups = "extra_packages" + ... + ``` + +2. 
Defining a list of packages in `appliances_extra_packages_other`, for example: ```yaml - # environments/foo-base/inventory/group_vars/all/defaults.yml: + # environments/site/inventory/group_vars/all/defaults.yml: appliances_extra_packages_other: - somepackage - anotherpackage ``` -For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server -(including rocky, EPEL and OpenHPC repositories), this will require either [Ark -credentials](./image-build.md)) or a [local Pulp mirror](./experimental/pulp.md) -to be configured. This includes rocky, EPEL and OpenHPC repos. +3. Either adding [Ark credentials](./image-build.md) or a [local Pulp mirror](./experimental/pulp.md) + to provide access to the required [repository snapshots](../environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml). The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the @@ -138,8 +145,6 @@ Adding these repos/packages to the cluster/image would then require running: as appropriate. -TODO: improve description about adding these to extra images. - # Reconfiguring Slurm From 60d89d65ffb73ef3db6f5073e229853406a9e63d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 17 Sep 2025 15:23:51 +0100 Subject: [PATCH 5/8] minor docs tweaks --- docs/image-build.md | 16 +++++++++------- docs/operations.md | 8 ++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/image-build.md b/docs/image-build.md index 4c371189b..4aab27abc 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -16,7 +16,7 @@ The Packer configuration here can be used to build two types of images: available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However site-specific fat images can also be built from a different source image e.g. if a different partition layout is required. -2. "Extra-build" images which extend a StackHPC fat image to create a site-specific +2. 
"Extra-build" images which extend a fat image to create a site-specific image with with additional packages or functionality. For example the NVIDIA `cuda` packages cannot be redistributed hence require an "extra" build. @@ -39,8 +39,8 @@ For either a site-specific fat-image build or an extra-build: # environments/site/inventory/group_vars/all/dnf_repos.yml: dnf_repos_password: 'your-ark-password' ``` - > [!IMPORTANT] - > The latter file should be vault-encrypted. + > [!IMPORTANT] + > The latter file should be vault-encrypted. Alternatively, configure a [local Pulp mirror](experimental/pulp.md). @@ -72,10 +72,12 @@ For either a site-specific fat-image build or an extra-build: - The source image should be either: - For a site-specific fatimage build: A RockyLinux GenericCloud or compatible image. - - For an extra-build image: The appropriate StackHPC fat image, as defined - in `environments/.stackhpc/tofu/cluster_image.auto.tfvars.json`. See the - [GitHub release page](https://github.com/stackhpc/ansible-slurm-appliance/releases) - for download links. + - For an extra-build image: Usually the appropriate StackHPC fat image, + as defined in `environments/.stackhpc/tofu/cluster_image.auto.tfvars.json` at the + checkout's current commit. See the [GitHub release page](https://github.com/stackhpc/ansible-slurm-appliance/releases) + for download links. In some cases extra builds may be chained, e.g. + one extra build adds a Lustre client, and the resulting image is used + as the source image for an extra build adding GPU support. - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to (in addition to the `builder` group which is it always in). 
This controls which Ansible roles and diff --git a/docs/operations.md b/docs/operations.md index a1ab2018c..9584d0035 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -96,7 +96,7 @@ By default, the following utility packages are installed during the StackHPC ima Additional packages can be added during image builds by: -1. Configuring an [docs/image-build.md](./image-build.md) to enable the +1. Configuring an [image build](./image-build.md) to enable the `extra_packages` group: @@ -126,9 +126,9 @@ the OpenHPC installation guide (linked from the corresponding `lmod` modules. Packages *may* also be installed during the site.yml, by adding the `cluster` -group into the `extra_packages` group. An error will occur if Ark credentials -are defined in this case, as they are readable by unprivileged users in the -`.repo` files and a local Pulp mirror must be used instead. +group as a child of the `extra_packages` group. An error will occur if Ark +credentials are defined in this case, as they are readable by unprivileged users +in the `.repo` files and a local Pulp mirror must be used instead. If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this: - `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory. 
From 6ab7255c80e113f1dec9baa90d2ad9b5072f20ca Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 17 Sep 2025 15:59:00 +0100 Subject: [PATCH 6/8] fixup fatimage group definition --- environments/common/inventory/groups | 7 ++---- environments/site/inventory/groups | 33 ++++++++++++++++++---------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 9057f9a77..2145a4297 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -224,8 +224,5 @@ doca [raid] # Add `builder` to configure image for software raid -[fatimage:children] -# Minimal configuration for fat image build -control -login -compute +[fatimage] +# Add build VM into this group to enable all features with this as child diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index f16a28ad3..83e2e1c71 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -1,3 +1,15 @@ +[login:children] +# All Slurm login nodes. Combined control/login nodes are not supported. +fatimage + +[control:children] +# A single Slurm control node. Multiple (high availability) control nodes are not supported. +fatimage + +[compute:children] +# All Slurm compute nodes (in all partitions). 
+fatimage + [nfs:children] openhpc @@ -31,6 +43,7 @@ slurm_stats # NB: [rebuild] not defined here as likely to need features not currently supported [update:children] +fatimage [fail2ban:children] # Hosts to install fail2ban on to protect SSH @@ -102,18 +115,21 @@ openhpc login openondemand -[squid] +[squid:children] # Hosts to run squid proxy +fatimage [tuned:children] # Hosts to run TuneD configuration +fatimage [ansible_init:children] # Hosts to run linux-ansible-init cluster -[sssd] +[sssd:children] # Hosts to configure sssd on +fatimage [sshd] # Hosts where the OpenSSH server daemon should be configured @@ -151,6 +167,7 @@ cluster [gateway:children] # Add builder to this group to install gateway ansible-init playbook into image +fatimage [nhc:children] # Hosts to configure for node health checks @@ -171,14 +188,6 @@ compute # pulp_host ansible_host= # Note inventory host name cannot conflict with group names i.e can't be called `pulp` or `pulp_server`. -[raid] +[raid:children] # Add `builder` to configure image for software raid - -[fatimage:children] -# Standard additional functionality for fat image build, as used by StackHPC CI images: -gateway -raid -squid -sssd -tuned -update +fatimage From 8741411329d5ac90a5f575c33ffc3b92b04753b4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Sep 2025 09:42:22 +0000 Subject: [PATCH 7/8] fix build groups --- environments/.stackhpc/inventory/extra_groups | 9 +++------ environments/common/inventory/groups | 13 ++++++++++++- environments/site/inventory/groups | 5 ++++- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 525965c1b..0d7fb53b5 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -1,3 +1,5 @@ +# Unless noted otherwise features enabled here are tested by CI site.yml playbook + [basic_users:children] cluster @@ -20,7 +22,7 @@ cluster 
# --- end of FreeIPA example --- [manila:children] -# Allows demo; also installs manila client in fat image +# Not actually tested but allows demo using this environment login compute @@ -28,7 +30,6 @@ compute cluster [tuned:children] -# Test tuned during site playbook cluster [rebuild:children] @@ -39,7 +40,3 @@ cluster [compute_init:children] compute - -[gateway:children] -# Install gateway ansible-init playbook into image -builder diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 2145a4297..53fcf098e 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -1,3 +1,12 @@ +# This file +# 1. Ensures all groups in the appliance are always defined - even if empty +# 2. Defines dependencies between groups - child groups require & enable parent +# +# IMPORTANT +# --------- +# All groups and child groups here MUST be empty, as other environments cannot +# remove hosts/groups. + [login] # All Slurm login nodes. Combined control/login nodes are not supported. 
@@ -196,10 +205,12 @@ k3s_agent [dnf_repos:children] # Hosts to replace system repos with Pulp repos +# Roles/groups listed here *always* do installs: extra_packages -cuda doca +# what we want to say is if cuda and build, add dnf_repos + [pulp_site] # Add builder to this group to automatically sync pulp during image build diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 83e2e1c71..674d43286 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -156,8 +156,11 @@ fatimage [extra_packages:children] # Hosts to install specified additional packages on -[dnf_repos] +[dnf_repos:children] # Hosts to replace system repos with Pulp repos +# Some roles do installs when in install mode/on build VM only: +fatimage +builder [cacerts] # Hosts to configure CA certificates and trusts on From c50d73f1ed7556c1afad545f8f01cf9a9b4c1f0a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Sep 2025 10:33:30 +0000 Subject: [PATCH 8/8] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 6b294d157..a02205aad 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250820-0800-767addd8", - "RL9": "openhpc-RL9-250908-2047-d90ebd0e" + "RL8": "openhpc-RL8-250918-0944-87414113", + "RL9": "openhpc-RL9-250918-0944-87414113" } }