From a49304865dc48fe7955ee3f4d737ab944784d728 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Aug 2025 11:35:54 +0100 Subject: [PATCH 1/7] Moved cookiecutter tofu to site environment --- .../cookiecutter.json | 0 .../{{cookiecutter.environment}}/README.md | 0 .../{{cookiecutter.environment}}/activate | 0 .../{{cookiecutter.environment}}/ansible.cfg | 19 ++++++++++++++++ .../hooks/.gitkeep | 0 .../inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/basic_users.yml | 0 .../inventory/group_vars/all/hpctests.yml | 0 .../{{cookiecutter.environment}}/tofu/main.tf | 12 ++++++++++ environments/.stackhpc/inventory/everything | 1 - environments/.stackhpc/inventory/groups | 1 + environments/common/layouts/README.md | 6 ----- environments/common/layouts/minimal | 8 ------- environments/site/activate | 22 +++++++++++++++++++ .../ansible.cfg | 0 environments/site/hooks/.gitkeep | 0 .../inventory/group_vars/all/alertmanager.yml | 0 .../inventory/group_vars/all/grafana.yml | 0 .../group_vars/all/vault_alertmanager.yml | 0 .../everything => site/inventory/groups} | 0 .../tofu/additional.tf | 0 .../tofu/baremetal-node-list.py | 0 .../tofu/compute.tf | 0 .../tofu/control.tf | 0 .../tofu/data.tf | 0 .../tofu/inventory.tf | 0 .../tofu/inventory.tpl | 0 .../tofu/login.tf | 0 .../tofu/main.tf | 0 .../tofu/network.tf | 0 .../tofu/node_group/main.tf | 0 .../tofu/node_group/network.tf | 0 .../tofu/node_group/nodes.tf | 0 .../tofu/node_group/variables.tf | 0 .../tofu/read-inventory-secrets.py | 0 .../tofu/variables.tf | 0 .../tofu/volumes.tf | 0 .../inventory/groups | 1 - 38 files changed, 54 insertions(+), 16 deletions(-) rename {environments/skeleton => cookiecutter}/cookiecutter.json (100%) rename {environments/skeleton => cookiecutter}/{{cookiecutter.environment}}/README.md (100%) rename {environments/skeleton => cookiecutter}/{{cookiecutter.environment}}/activate (100%) create mode 100644 cookiecutter/{{cookiecutter.environment}}/ansible.cfg rename 
{environments/skeleton => cookiecutter}/{{cookiecutter.environment}}/hooks/.gitkeep (100%) rename {environments/skeleton => cookiecutter}/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep (100%) rename {environments/skeleton => cookiecutter}/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml (100%) rename {environments/skeleton => cookiecutter}/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml (100%) create mode 100644 cookiecutter/{{cookiecutter.environment}}/tofu/main.tf delete mode 120000 environments/.stackhpc/inventory/everything create mode 120000 environments/.stackhpc/inventory/groups delete mode 100644 environments/common/layouts/README.md delete mode 100644 environments/common/layouts/minimal create mode 100644 environments/site/activate rename environments/{skeleton/{{cookiecutter.environment}} => site}/ansible.cfg (100%) create mode 100644 environments/site/hooks/.gitkeep rename environments/{skeleton/{{cookiecutter.environment}} => site}/inventory/group_vars/all/alertmanager.yml (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/inventory/group_vars/all/grafana.yml (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/inventory/group_vars/all/vault_alertmanager.yml (100%) rename environments/{common/layouts/everything => site/inventory/groups} (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/additional.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/baremetal-node-list.py (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/compute.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/control.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/data.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/inventory.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => 
site}/tofu/inventory.tpl (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/login.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/main.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/network.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/node_group/main.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/node_group/network.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/node_group/nodes.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/node_group/variables.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/read-inventory-secrets.py (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/variables.tf (100%) rename environments/{skeleton/{{cookiecutter.environment}} => site}/tofu/volumes.tf (100%) delete mode 120000 environments/skeleton/{{cookiecutter.environment}}/inventory/groups diff --git a/environments/skeleton/cookiecutter.json b/cookiecutter/cookiecutter.json similarity index 100% rename from environments/skeleton/cookiecutter.json rename to cookiecutter/cookiecutter.json diff --git a/environments/skeleton/{{cookiecutter.environment}}/README.md b/cookiecutter/{{cookiecutter.environment}}/README.md similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/README.md rename to cookiecutter/{{cookiecutter.environment}}/README.md diff --git a/environments/skeleton/{{cookiecutter.environment}}/activate b/cookiecutter/{{cookiecutter.environment}}/activate similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/activate rename to cookiecutter/{{cookiecutter.environment}}/activate diff --git a/cookiecutter/{{cookiecutter.environment}}/ansible.cfg b/cookiecutter/{{cookiecutter.environment}}/ansible.cfg new file mode 100644 index 
000000000..be0fa1aef --- /dev/null +++ b/cookiecutter/{{cookiecutter.environment}}/ansible.cfg @@ -0,0 +1,19 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,../site/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles +filter_plugins = ../../ansible/filter_plugins + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True diff --git a/environments/skeleton/{{cookiecutter.environment}}/hooks/.gitkeep b/cookiecutter/{{cookiecutter.environment}}/hooks/.gitkeep similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/hooks/.gitkeep rename to cookiecutter/{{cookiecutter.environment}}/hooks/.gitkeep diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep rename to cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/.gitkeep diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml rename to cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml 
b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml rename to cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml diff --git a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf new file mode 100644 index 000000000..fdf38c624 --- /dev/null +++ b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf @@ -0,0 +1,12 @@ +variable "environment_root" { + type = string + description = "Path to environment root, automatically set by activate script" +} + +module "cluster" { + source = "../../site/tofu/" + environment_root = var.environment_root + + # Environment specific variables + # cluster_name = "foo" +} diff --git a/environments/.stackhpc/inventory/everything b/environments/.stackhpc/inventory/everything deleted file mode 120000 index dc66b9576..000000000 --- a/environments/.stackhpc/inventory/everything +++ /dev/null @@ -1 +0,0 @@ -../../../environments/common/layouts/everything \ No newline at end of file diff --git a/environments/.stackhpc/inventory/groups b/environments/.stackhpc/inventory/groups new file mode 120000 index 000000000..3101becc4 --- /dev/null +++ b/environments/.stackhpc/inventory/groups @@ -0,0 +1 @@ +../../site/inventory/groups \ No newline at end of file diff --git a/environments/common/layouts/README.md b/environments/common/layouts/README.md deleted file mode 100644 index e87ad93ef..000000000 --- a/environments/common/layouts/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Layouts - -This folder contains some predefined group mappings. You can copy them into -an environment folder if you wish to modify them or just reference them directly -in ansible.cfg as another inventory file. If you are referencing them in the -inventory file, it is advisable to put them just after the common environment. 
\ No newline at end of file diff --git a/environments/common/layouts/minimal b/environments/common/layouts/minimal deleted file mode 100644 index 76db23675..000000000 --- a/environments/common/layouts/minimal +++ /dev/null @@ -1,8 +0,0 @@ -[nfs:children] -cluster - -[openhpc:children] -cluster - -[mysql:children] -control diff --git a/environments/site/activate b/environments/site/activate new file mode 100644 index 000000000..2a58b40e4 --- /dev/null +++ b/environments/site/activate @@ -0,0 +1,22 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/skeleton/{{cookiecutter.environment}}/ansible.cfg b/environments/site/ansible.cfg similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/ansible.cfg rename to environments/site/ansible.cfg diff --git a/environments/site/hooks/.gitkeep b/environments/site/hooks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml b/environments/site/inventory/group_vars/all/alertmanager.yml similarity index 100% rename 
from environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml rename to environments/site/inventory/group_vars/all/alertmanager.yml diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml b/environments/site/inventory/group_vars/all/grafana.yml similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml rename to environments/site/inventory/group_vars/all/grafana.yml diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml b/environments/site/inventory/group_vars/all/vault_alertmanager.yml similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml rename to environments/site/inventory/group_vars/all/vault_alertmanager.yml diff --git a/environments/common/layouts/everything b/environments/site/inventory/groups similarity index 100% rename from environments/common/layouts/everything rename to environments/site/inventory/groups diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/additional.tf b/environments/site/tofu/additional.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/additional.tf rename to environments/site/tofu/additional.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py b/environments/site/tofu/baremetal-node-list.py similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py rename to environments/site/tofu/baremetal-node-list.py diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/site/tofu/compute.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf rename to environments/site/tofu/compute.tf diff --git 
a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/site/tofu/control.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf rename to environments/site/tofu/control.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf b/environments/site/tofu/data.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf rename to environments/site/tofu/data.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf b/environments/site/tofu/inventory.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf rename to environments/site/tofu/inventory.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl b/environments/site/tofu/inventory.tpl similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl rename to environments/site/tofu/inventory.tpl diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf b/environments/site/tofu/login.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf rename to environments/site/tofu/login.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf b/environments/site/tofu/main.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf rename to environments/site/tofu/main.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf b/environments/site/tofu/network.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf rename to environments/site/tofu/network.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf b/environments/site/tofu/node_group/main.tf similarity index 100% rename from 
environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf rename to environments/site/tofu/node_group/main.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf b/environments/site/tofu/node_group/network.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf rename to environments/site/tofu/node_group/network.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/site/tofu/node_group/nodes.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf rename to environments/site/tofu/node_group/nodes.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/site/tofu/node_group/variables.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf rename to environments/site/tofu/node_group/variables.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py b/environments/site/tofu/read-inventory-secrets.py similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py rename to environments/site/tofu/read-inventory-secrets.py diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/site/tofu/variables.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf rename to environments/site/tofu/variables.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/volumes.tf b/environments/site/tofu/volumes.tf similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/tofu/volumes.tf rename to environments/site/tofu/volumes.tf diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/groups 
b/environments/skeleton/{{cookiecutter.environment}}/inventory/groups deleted file mode 120000 index 33a1036f5..000000000 --- a/environments/skeleton/{{cookiecutter.environment}}/inventory/groups +++ /dev/null @@ -1 +0,0 @@ -../../../common/layouts/everything \ No newline at end of file From 4dc24213dc06fc1d44d543bdc7a4b701ae745cd1 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Aug 2025 11:44:55 +0100 Subject: [PATCH 2/7] updated CI environment --- environments/.stackhpc/ansible.cfg | 2 +- environments/.stackhpc/inventory/group_vars/all/grafana.yml | 1 - environments/.stackhpc/inventory/groups | 1 - environments/.stackhpc/tofu/main.tf | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 environments/.stackhpc/inventory/group_vars/all/grafana.yml delete mode 120000 environments/.stackhpc/inventory/groups diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index 6261f3149..470c753ea 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -6,7 +6,7 @@ callbacks_enabled = ansible.posix.profile_tasks gathering = smart forks = 30 host_key_checking = False -inventory = ../common/inventory,inventory +inventory = ../common/inventory,../site/inventory,inventory collections_path = ../../ansible/collections roles_path = ../../ansible/roles filter_plugins = ../../ansible/filter_plugins diff --git a/environments/.stackhpc/inventory/group_vars/all/grafana.yml b/environments/.stackhpc/inventory/group_vars/all/grafana.yml deleted file mode 100644 index 14fefa945..000000000 --- a/environments/.stackhpc/inventory/group_vars/all/grafana.yml +++ /dev/null @@ -1 +0,0 @@ -grafana_auth_anonymous: true diff --git a/environments/.stackhpc/inventory/groups b/environments/.stackhpc/inventory/groups deleted file mode 120000 index 3101becc4..000000000 --- a/environments/.stackhpc/inventory/groups +++ /dev/null @@ -1 +0,0 @@ -../../site/inventory/groups \ No newline at end of file diff --git 
a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index c58fb3fc5..efe918c9c 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -59,7 +59,7 @@ data "openstack_images_image_v2" "cluster" { } module "cluster" { - source = "../../skeleton/{{cookiecutter.environment}}/tofu/" + source = "../../site/tofu/" cluster_name = var.cluster_name cluster_networks = var.cluster_networks From 4c0e0849280a68316cfac4d2ed7adb89d70a6a73 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 6 Aug 2025 13:01:00 +0100 Subject: [PATCH 3/7] Updated docs for new environment structure --- README.md | 2 +- ansible/roles/alertmanager/README.md | 5 +- ansible/roles/block_devices/README.md | 2 +- ansible/roles/freeipa/README.md | 4 +- docs/adding-functionality.md | 2 +- docs/alerting.md | 3 +- docs/experimental/isolated-clusters.md | 3 +- docs/monitoring-and-logging.md | 2 +- docs/persistent-state.md | 4 +- docs/production.md | 46 +++---------------- docs/upgrades.md | 7 ++- .../.caas/inventory/group_vars/all/nfs.yml | 2 +- environments/.stackhpc/tofu/main.tf | 2 +- environments/README.md | 13 ++++-- .../inventory/group_vars/all/firewalld.yml | 2 +- .../common/inventory/group_vars/all/nfs.yml | 2 +- 16 files changed, 33 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index a47afd4e4..f8503a434 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ Run the following from the repository root to activate the venv: Use the `cookiecutter` template to create a new environment to hold your configuration: cd environments - cookiecutter skeleton + cookiecutter ../cookiecutter and follow the prompts to complete the environment name and description. 
diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index 612761731..f5bc23bc2 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -11,12 +11,9 @@ Note that: - No Grafana dashboard for alerts is currently provided. Alertmanager is enabled by default on the `control` node in the -[everything](../../../environments/common/layouts/everything) template which -`cookiecutter` uses for a new environment's `inventory/groups` file. +`site` environment's `inventory/groups` file. In general usage may only require: -- Adding the `control` node into the `alertmanager` group in `environments/site/groups` - if upgrading an existing environment. - Enabling the Slack integration (see section below). - Possibly setting `alertmanager_web_external_url`. diff --git a/ansible/roles/block_devices/README.md b/ansible/roles/block_devices/README.md index d3dad63bf..0d326d3a2 100644 --- a/ansible/roles/block_devices/README.md +++ b/ansible/roles/block_devices/README.md @@ -11,7 +11,7 @@ This is a convenience wrapper around the ansible modules: To avoid issues with device names changing after e.g. reboots, devices are identified by serial number and mounted by filesystem UUID. -**NB:** This role is ignored[^1] during Packer builds as block devices will not be attached to the Packer build VMs. This role is therefore deprecated and it is suggested that `cloud-init` is used instead. See e.g. `environments/skeleton/{{cookiecutter.environment}}/tofu/control.userdata.tpl`. +**NB:** This role is ignored[^1] during Packer builds as block devices will not be attached to the Packer build VMs. This role is therefore deprecated and it is suggested that `cloud-init` is used instead. See e.g. `environments/site/tofu/control.userdata.tpl`. 
[^1]: See `environments/common/inventory/group_vars/builder/defaults.yml` diff --git a/ansible/roles/freeipa/README.md b/ansible/roles/freeipa/README.md index 70270356a..0fd9c360d 100644 --- a/ansible/roles/freeipa/README.md +++ b/ansible/roles/freeipa/README.md @@ -7,7 +7,7 @@ Support FreeIPA in the appliance. In production use it is expected the FreeIPA s ## Usage - Add hosts to the `freeipa_client` group and run (at a minimum) the `ansible/iam.yml` playbook. -- Host names must match the domain name. By default (using the skeleton OpenTofu) hostnames are of the form `nodename.cluster_name.cluster_domain_suffix` where `cluster_name` and `cluster_domain_suffix` are OpenTofu variables. +- Host names must match the domain name. By default (using the site OpenTofu) hostnames are of the form `nodename.cluster_name.cluster_domain_suffix` where `cluster_name` and `cluster_domain_suffix` are OpenTofu variables. - Hosts discover the FreeIPA server FQDN (and their own domain) from DNS records. If DNS servers are not set this is not set from DHCP, then use the `resolv_conf` role to configure this. For example when using the in-appliance FreeIPA development server: ```ini @@ -28,7 +28,7 @@ Support FreeIPA in the appliance. In production use it is expected the FreeIPA s - For production use with an external FreeIPA server, a random one-time password (OTP) must be generated when adding hosts to FreeIPA (e.g. using `ipa host-add --random ...`). This password should be set as a hostvar `freeipa_host_password`. Initial host enrolment will use this OTP to enrol the host. After this it becomes irrelevant so it does not need to be committed to git. This approach means the appliance does not require the FreeIPA administrator password. - For development use with the in-appliance FreeIPA server, `freeipa_host_password` will be automatically generated in memory. - The `control` host must define `appliances_state_dir` (on persistent storage). 
This is used to back-up keytabs to allow FreeIPA clients to automatically re-enrol after e.g. reimaging. Note that: - - This is implemented when using the skeleton OpenTofu; on the control node `appliances_state_dir` defaults to `/var/lib/state` which is mounted from a volume. + - This is implemented when using the site OpenTofu; on the control node `appliances_state_dir` defaults to `/var/lib/state` which is mounted from a volume. - Nodes are not re-enroled by a [Slurm-driven reimage](../../collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md) (as that does not run this role). - If both a backed-up keytab and `freeipa_host_password` exist, the former is used. diff --git a/docs/adding-functionality.md b/docs/adding-functionality.md index 69d3b3a3f..05bcbb5a8 100644 --- a/docs/adding-functionality.md +++ b/docs/adding-functionality.md @@ -3,7 +3,7 @@ Please contact us for specific advice, but this generally involves: - Adding a role. - Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`. -- Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/common/layouts/everything`. +- Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/site/inventory/groups`. - Adding new default group vars into `environments/common/inventory/group_vars/all//`. - Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`. - Updating READMEs. diff --git a/docs/alerting.md b/docs/alerting.md index b53c0fa40..e030d23de 100644 --- a/docs/alerting.md +++ b/docs/alerting.md @@ -21,8 +21,7 @@ must be configured to generate notifications. ## Enabling alertmanager 1. 
Ensure both the `prometheus` and `alertmanager` servers are deployed on the -control node - for new environments the `cookiecutter` tool will have done -this: +control node - these are deployed by default in the site environment's groups: ```ini # environments/site/groups: diff --git a/docs/experimental/isolated-clusters.md b/docs/experimental/isolated-clusters.md index a570465ea..c136e99ea 100644 --- a/docs/experimental/isolated-clusters.md +++ b/docs/experimental/isolated-clusters.md @@ -6,8 +6,7 @@ access from all nodes, possibly via a [proxy](../../ansible/roles/proxy/). However many features (as defined by Ansible inventory groups/roles) will work if the cluster network(s) provide no outbound access. Currently this includes all "default" features, i.e. roles/groups which are enabled either in the -`common` environment or in the `environments/$ENV/inventory/groups` file -created by cookiecutter for a new environment. +`common` or `site` environments. The full list of features and whether they are functional on such an "isolated" network is shown in the table below. Note that: diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 6913c285f..46b405a3e 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -227,7 +227,7 @@ The `prometheus` group determines the placement of the prometheus service. Load ### Access -Prometheus is exposed on port `9090` on all hosts in the prometheus group. Currently, the configuration assumes a single host. Following the reference layout in `environments/common/layouts/everything`, this will be set to the slurm `control` node, prometheus would then be accessible from: +Prometheus is exposed on port `9090` on all hosts in the prometheus group. Currently, the configuration assumes a single host. 
Following the reference layout in `environments/site/inventory/groups`, this will be set to the slurm `control` node, prometheus would then be accessible from: > http://:9090 diff --git a/docs/persistent-state.md b/docs/persistent-state.md index a895f2e44..f5d4852fa 100644 --- a/docs/persistent-state.md +++ b/docs/persistent-state.md @@ -9,11 +9,11 @@ At present this will affect the following: - Grafana data - OpenDistro/elasticsearch data -If using the `environments/common/layout/everything` Ansible groups template (which is the default for a new cookiecutter-produced environment) then these services will all be on the `control` node and hence only this node requires persistent storage. +If using the upstream defaults in the `site` environment's `inventory/groups` file then these services will all be on the `control` node and hence only this node requires persistent storage. Note that if `appliances_state_dir` is defined, the path it gives must exist and should be owned by root. Directories will be created within this with appropriate permissions for each item of state defined above. Additionally, the systemd units for the services listed above will be modified to require `appliances_state_dir` to be mounted before service start (via the `systemd` role). -A new cookiecutter-produced environment supports persistent state in the default OpenTofu (see `environments/skeleton/{{cookiecutter.environment}}/tofu/`) by: +The `site` environment supports persistent state in the default OpenTofu (see `environments/site/tofu/`) by: - Defining a volume with a default size of 150GB - this can be controlled by the OpenTofu variable `state_volume_size`. - Attaching it to the control node. diff --git a/docs/production.md b/docs/production.md index aef041040..d1ea3e90b 100644 --- a/docs/production.md +++ b/docs/production.md @@ -7,25 +7,15 @@ production-ready deployments. - Get it agreed up front what the cluster names will be. 
Changing this later requires instance deletion/recreation. -- At least three environments should be created: - - `site`: site-specific base environment +- At least two environments should be created on top of the `site` base environment: - `production`: production environment - `staging`: staging environment A `dev` environment should also be created if considered required, or this can be left until later. - These can all be produced using the cookicutter instructions, but the - `production` and `staging` environments will need their - `environments/$ENV/ansible.cfg` file modifying so that they point to the - `site` environment: - - ```ini - inventory = ../common/inventory,../site/inventory,inventory - ``` - - In general only the `site` environment will need an `inventory/groups` file - - this is templated out by cookiecutter and should be modified as required to + In general only the `inventory/groups` file in the `site` environment is needed - + it can be modified as required to enable features for all environments at the site. - To avoid divergence of configuration all possible overrides for group/role @@ -42,34 +32,10 @@ and referenced from the `site` and `production` environments, e.g.: import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" ``` -- OpenTofu configurations should be defined in the `site` environment and used - as a module from the other environments. This can be done with the - cookie-cutter generated configurations: - - Delete the *contents* of the cookie-cutter generated `tofu/` directories - from the `production` and `staging` environments. - - Create a `main.tf` in those directories which uses `site/tofu/` as a - [module](https://opentofu.org/docs/language/modules/), e.g. : - - ``` - ... 
- variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" - } - - module "cluster" { - source = "../../site/tofu/" - environment_root = var.environment_root - - cluster_name = "foo" - ... - } - ``` - - Note that: +- When setting OpenTofu configurations: - Environment-specific variables (`cluster_name`) should be hardcoded - into the cluster module block. + as arguments into the cluster module block at `environments/$ENV/tofu/main.tf`. - Environment-independent variables (e.g. maybe `cluster_net` if the same is used for staging and production) should be set as *defaults* in `environments/site/tofu/variables.tf`, and then don't need to @@ -87,7 +53,7 @@ and referenced from the `site` and `production` environments, e.g.: instances) it may be necessary to configure or proxy `chronyd` via an environment hook. -- By default, the cookiecutter-provided OpenTofu configuration provisions two +- By default, the site OpenTofu configuration provisions two volumes and attaches them to the control node: - "$cluster_name-home" for NFS-shared home directories - "$cluster_name-state" for monitoring and Slurm data diff --git a/docs/upgrades.md b/docs/upgrades.md index 07f5f524c..9580daaee 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -50,10 +50,9 @@ All other commands should be run on the Ansible deploy host. site-specific configuration. In general changes to existing functionality will aim to be backward compatible. Alteration of site-specific configuration will usually only be necessary to use new functionality or where functionality has been upstreamed as above. - Note that the `environments/common/layouts/everything` file contains all possible - groups which can be used to enable features; diff this against your e.g. - `environments/site/inventory/groups` file to see new features which you may - wish to enable in the latter file. 
+ Note that the upstream `environments/site/inventory/groups` file contains all possible + groups which can be used to enable features, check for new groups which have been added in the + latest release and remove any which are unnescessary from the `groups` file in your fork. Make changes as necessary. diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index f42422601..22225fdeb 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -5,7 +5,7 @@ caas_nfs_home: nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: "{{ inventory_hostname in groups['cluster'] }}" - nfs_export: "/exports/home" # assumes skeleton TF is being used + nfs_export: "/exports/home" # assumes upstream site TF is being used nfs_client_mnt_point: "/home" nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index efe918c9c..82c963ca0 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -1,4 +1,4 @@ -# This terraform configuration uses the "skeleton" terraform, so that is checked by CI. +# This terraform configuration uses the site terraform, so that is checked by CI. terraform { required_version = ">= 0.14" diff --git a/environments/README.md b/environments/README.md index 722c358ba..308a20227 100644 --- a/environments/README.md +++ b/environments/README.md @@ -33,17 +33,22 @@ for usage instructions for that component. Shared configuration for all environments. This is not intended to be used as a standalone environment, hence the README does *not* detail -how to provision the infrastructure. +how to provision the infrastructure. This environment generally should not be edited. 
-### skeleton +## site + +Provides the base configuration for all subsequent `cookiecutter` created environments, +including configuration for provisioning infrastructure. Site specific configuration should generally +be edited here, unless it is specific to a particular cookiecutter environment, in which case it should be set in +that environment. -Skeleton directory that is used as a template to create a new environemnt. ## Defining an environment To define an environment using cookiecutter: - cookiecutter skeleton + cd environments + cookiecutter ../cookiecutter This will present you with a series of questions which you must answer. Once you have answered all questions, a new environment directory will diff --git a/environments/common/inventory/group_vars/all/firewalld.yml b/environments/common/inventory/group_vars/all/firewalld.yml index 3548045ed..168559aa2 100644 --- a/environments/common/inventory/group_vars/all/firewalld.yml +++ b/environments/common/inventory/group_vars/all/firewalld.yml @@ -3,7 +3,7 @@ firewalld_configs_default: # A list of dicts defining firewalld rules. - # Using the "everything" template firewalld is deployed on the login node to enable fail2ban. + # Using the upstream site groups firewalld is deployed on the login node to enable fail2ban. # However by default we rely on openstack security groups so make firewalld permissive. 
# Each dict contains: # name: An arbitrary name or description diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index aec2213f1..ce5215fe9 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -18,7 +18,7 @@ nfs_configuration_home_volume: # volume-backed home directories # Don't mount share on control node: clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" nfs_server: "{{ nfs_server_default }}" - nfs_export: "/exports/home" # assumes skeleton TF is being used + nfs_export: "/exports/home" # assumes upstream site TF is being used nfs_client_mnt_point: "/home" # prevent tunnelling and setuid binaries: # NB: this is stackhpc.nfs role defaults but are set here to prevent being From d77e14f344c6959b08b5cde97aa03cee4cf07006 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:13:57 +0100 Subject: [PATCH 4/7] review comments Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/production.md | 2 +- environments/README.md | 4 ++-- environments/common/inventory/group_vars/all/firewalld.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/production.md b/docs/production.md index d1ea3e90b..8929c4cd8 100644 --- a/docs/production.md +++ b/docs/production.md @@ -7,7 +7,7 @@ production-ready deployments. - Get it agreed up front what the cluster names will be. Changing this later requires instance deletion/recreation. 
-- At least two environments should be created on top of the `site` base environment: +- At least two environments should be created using cookiecutter, which will derive from the `site` base environment: - `production`: production environment - `staging`: staging environment diff --git a/environments/README.md b/environments/README.md index 308a20227..b53e5340c 100644 --- a/environments/README.md +++ b/environments/README.md @@ -33,12 +33,12 @@ for usage instructions for that component. Shared configuration for all environments. This is not intended to be used as a standalone environment, hence the README does *not* detail -how to provision the infrastructure. This environment generally should not be edited. +how to provision the infrastructure. This environment should not be edited, except as part of upstreaming new features or bug fixes. ## site Provides the base configuration for all subsequent `cookiecutter` created environments, -including configuration for provisioning infrastructure. Site specific configuration should generally +including OpenTofu configurations for infrastructure. In general, most local customisations should be made by adding to this environment. be edited here, unless it is specific to a particular cookiecutter environment, in which case it should be set in that environment. diff --git a/environments/common/inventory/group_vars/all/firewalld.yml b/environments/common/inventory/group_vars/all/firewalld.yml index 168559aa2..569428e07 100644 --- a/environments/common/inventory/group_vars/all/firewalld.yml +++ b/environments/common/inventory/group_vars/all/firewalld.yml @@ -3,7 +3,7 @@ firewalld_configs_default: # A list of dicts defining firewalld rules. - # Using the upstream site groups firewalld is deployed on the login node to enable fail2ban. + # Using the default site `groups` file, firewalld is deployed on the login node to enable fail2ban. # However by default we rely on openstack security groups so make firewalld permissive. 
# Each dict contains: # name: An arbitrary name or description From be67aab748ec5b700e912573c305384e1afb5bf5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 7 Aug 2025 09:44:11 +0100 Subject: [PATCH 5/7] docs updates --- cookiecutter/{{cookiecutter.environment}}/ansible.cfg | 2 +- .../{{cookiecutter.environment}}/tofu/main.tf | 11 ++++++++++- docs/production.md | 4 +++- docs/upgrades.md | 11 +++++++++-- environments/.caas/inventory/group_vars/all/nfs.yml | 2 +- environments/README.md | 2 -- environments/common/inventory/group_vars/all/nfs.yml | 2 +- 7 files changed, 25 insertions(+), 9 deletions(-) diff --git a/cookiecutter/{{cookiecutter.environment}}/ansible.cfg b/cookiecutter/{{cookiecutter.environment}}/ansible.cfg index be0fa1aef..6809f396a 100644 --- a/cookiecutter/{{cookiecutter.environment}}/ansible.cfg +++ b/cookiecutter/{{cookiecutter.environment}}/ansible.cfg @@ -11,7 +11,7 @@ roles_path = ../../ansible/roles filter_plugins = ../../ansible/filter_plugins [ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True [inventory] diff --git a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf index fdf38c624..9aa447595 100644 --- a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf +++ b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf @@ -8,5 +8,14 @@ module "cluster" { environment_root = var.environment_root # Environment specific variables - # cluster_name = "foo" + # Note that some of the variables below may need to be moved to the site environment + # defaults e.g cluster_networks should be in site if your staging and prod + # environments use the same networks + 
cluster_name = + cluster_image_id = + control_node_flavor = + cluster_networks = + key_pair = + login = + compute = } diff --git a/docs/production.md b/docs/production.md index 8929c4cd8..c0979d2a3 100644 --- a/docs/production.md +++ b/docs/production.md @@ -109,7 +109,9 @@ and referenced from the `site` and `production` environments, e.g.: - Configure Open OnDemand - see [specific documentation](openondemand.md) which notes specific variables required. -- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml` +- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml`. + Replace the `hpctests_user` in `environments/$ENV/inventory/group_vars/all/hpctests.yml` with + an appropriately configured user. - Consider whether having (read-only) access to Grafana without login is OK. If not, remove `grafana_auth_anonymous` in `environments/$ENV/inventory/group_vars/all/grafana.yml` diff --git a/docs/upgrades.md b/docs/upgrades.md index 9580daaee..7a1099fb3 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -41,6 +41,12 @@ All other commands should be run on the Ansible deploy host. prompts. Generally merge conflicts should only exist where functionality which was added for your site (not in a hook) has subsequently been merged upstream. + Note that if upgrading from a release prior to v2.3, you will likely have merge conflicts + with existing site OpenTofu configurations in `environments/site/tofu`. Generally: + - Changes to `default` values in `environments/site/tofu/variables.tf` should be rejected. + - All other changes to the OpenTofu configuration should be accepted, unless they overwrite + site-specific additional resources. + 1. Push this branch and create a PR: git push @@ -51,8 +57,9 @@ All other commands should be run on the Ansible deploy host. backward compatible.
Alteration of site-specific configuration will usually only be necessary to use new functionality or where functionality has been upstreamed as above. Note that the upstream `environments/site/inventory/groups` file contains all possible - groups which can be used to enable features, check for new groups which have been added in the - latest release and remove any which are unnescessary from the `groups` file in your fork. + groups which can be used to enable features. This will be updated when pulling changes + from the StackHPC repo, and any new groups should be enabled/disable as required for + your site. Make changes as necessary. diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index 22225fdeb..74a42cdd3 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -5,7 +5,7 @@ caas_nfs_home: nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: "{{ inventory_hostname in groups['cluster'] }}" - nfs_export: "/exports/home" # assumes upstream site TF is being used + nfs_export: "/exports/home" # assumes default site TF is being used nfs_client_mnt_point: "/home" nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" diff --git a/environments/README.md b/environments/README.md index b53e5340c..b6e2cf968 100644 --- a/environments/README.md +++ b/environments/README.md @@ -39,8 +39,6 @@ how to provision the infrastructure. This environment should not be edited, exce Provides the base configuration for all subsequent `cookiecutter` created environments, including OpenTofu configurations for infrastructure. In general, most local customisations should be made by adding to this environment. -be edited here, unless it is specific to a particular cookiecutter environment, in which case it should be set in -that environment. 
## Defining an environment diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index ce5215fe9..398bde78a 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -18,7 +18,7 @@ nfs_configuration_home_volume: # volume-backed home directories # Don't mount share on control node: clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" nfs_server: "{{ nfs_server_default }}" - nfs_export: "/exports/home" # assumes upstream site TF is being used + nfs_export: "/exports/home" # assumes default site TF is being used nfs_client_mnt_point: "/home" # prevent tunnelling and setuid binaries: # NB: this is stackhpc.nfs role defaults but are set here to prevent being From ca6d85848f49a8ead9eb4fd7a0edb26547f26574 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:59:45 +0100 Subject: [PATCH 6/7] typo Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/upgrades.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/upgrades.md b/docs/upgrades.md index 7a1099fb3..b62720ac2 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -58,7 +58,7 @@ All other commands should be run on the Ansible deploy host. necessary to use new functionality or where functionality has been upstreamed as above. Note that the upstream `environments/site/inventory/groups` file contains all possible groups which can be used to enable features. This will be updated when pulling changes - from the StackHPC repo, and any new groups should be enabled/disable as required for + from the StackHPC repo, and any new groups should be enabled/disabled as required for your site. Make changes as necessary. 
From 064450263dc91a0ba22a55bbe150fbd6381f1b92 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 7 Aug 2025 12:02:31 +0100 Subject: [PATCH 7/7] removed topology from default groups + added docs --- docs/production.md | 7 +++++++ environments/site/inventory/groups | 3 +-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/production.md b/docs/production.md index c0979d2a3..bcf4925e7 100644 --- a/docs/production.md +++ b/docs/production.md @@ -118,6 +118,13 @@ and referenced from the `site` and `production` environments, e.g.: - If floating IPs are required for login nodes, create these in OpenStack and add the IPs into the OpenTofu `login` definition. +- Consider enabling topology-aware scheduling. This is currently only supported if your cluster does not include any baremetal nodes. This can be enabled by: + 1. Creating Availability Zones in your OpenStack project for each physical rack + 2. Setting the `availability_zone` fields of compute groups in your OpenTofu configuration + 3. Adding the `compute` group as a child of `topology` in `environments/$ENV/inventory/groups` + 4. (Optional) If you are aware of the physical topology of switches above the rack-level, override `topology_above_rack_topology` in your group vars + (see [topology docs](../ansible/roles/topology/README.md) for more detail) + - Consider whether mapping of baremetal nodes to ironic nodes is required. See [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 0f4253758..da0562060 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -140,9 +140,8 @@ builder # Hosts to configure for node health checks compute -[topology:children] +[topology] # Compute nodes to be included in the Slurm topology plugin's topology tree.
See ansible/roles/topology # Should be set to `compute` if enabled # Note that this feature currently assumes all compute nodes are VMs, enabling # when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour -compute