From ca44f898f124829de5fb798e3238cae35bc4af1e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 13 May 2025 10:31:29 +0000 Subject: [PATCH 01/11] bump OpenHPC snapshots to v3.1.1 (slurm 24.11.5) and v2.9.1 (slurm 23.11.11) for CVE-2025-43904 --- environments/common/inventory/group_vars/all/timestamps.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/common/inventory/group_vars/all/timestamps.yml b/environments/common/inventory/group_vars/all/timestamps.yml index 2f31ee1a3..8d046437a 100644 --- a/environments/common/inventory/group_vars/all/timestamps.yml +++ b/environments/common/inventory/group_vars/all/timestamps.yml @@ -63,10 +63,10 @@ appliances_pulp_repos: openhpc_updates: '8': path: OpenHPC/2/updates/EL_8 - timestamp: 20241218T154614 + timestamp: 20250512T003315 '9': path: OpenHPC/3/updates/EL_9 - timestamp: 20241218T154614 + timestamp: 20250510T003301 grafana: '8': path: grafana/oss/rpm From 929dde27778c0e8d70f39c196c6ca2111e031538 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 13 May 2025 11:13:59 +0000 Subject: [PATCH 02/11] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index f9117a26a..614c0adb9 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250506-1259-abb6394b", - "RL9": "openhpc-RL9-250506-1259-abb6394b" + "RL8": "openhpc-RL8-250513-1045-ca44f898", + "RL9": "openhpc-RL9-250513-1046-ca44f898" } } From a77ef88e9163713d554f2c97217499ce452be067 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 13 May 2025 12:13:15 +0000 Subject: [PATCH 03/11] extend timeout for slurmdbd startup to cope with major version upgrade on startup --- .../common/inventory/group_vars/all/openhpc.yml | 5 +++++ .../common/inventory/group_vars/all/systemd.yml | 16 +++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index bcda89b56..89f0a859c 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -57,3 +57,8 @@ ohpc_openhpc_repos: ohpc_default_extra_repos: "9": [] "8": [] + +# systemd.service.unit.TimeoutStartSec to wait for slurmdbd startup +# Set long enought to avoid problems with a major version upgrade +# Currently implemented in environments/common/inventory/group_vars/all/systemd.yml +openhpc_slurmdbd_timeout_start_sec: '45 minutes' diff --git a/environments/common/inventory/group_vars/all/systemd.yml b/environments/common/inventory/group_vars/all/systemd.yml index 2c5e03e35..4c7538aa6 100644 --- a/environments/common/inventory/group_vars/all/systemd.yml +++ b/environments/common/inventory/group_vars/all/systemd.yml @@ -1,9 +1,11 @@ _systemd_requiresmount_statedir: | + {% if appliances_state_dir is defined %} [Unit] RequiresMountsFor={{ appliances_state_dir | default('') }} + {% endif %} -_systemd_dropins_statedir: - # mysql not included as role handles state dir correctly +systemd_dropins: + # NB: mysql does not need _systemd_requiresmount_statedir as role handles state dir correctly opensearch: group: opensearch content: "{{ _systemd_requiresmount_statedir }}" @@ -12,12 +14,16 @@ _systemd_dropins_statedir: content: "{{ _systemd_requiresmount_statedir }}" slurmdbd: group: openhpc - content: "{{ _systemd_requiresmount_statedir }}" + content: | + {{ _systemd_requiresmount_statedir }} + + [Service] + # Allow slurmdbd to complete major version upgrades + TimeoutStartSec={{ openhpc_slurmdbd_timeout_start_sec }} + slurmctld: group: openhpc content: "{{ _systemd_requiresmount_statedir }}" prometheus: group: prometheus content: "{{ _systemd_requiresmount_statedir }}" - -systemd_dropins: "{{ _systemd_dropins_statedir if appliances_state_dir is defined else {} }}" From 4bce66cb25f21e86a00779c2ccad06bdfbbbbeec Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 14 May 2025 08:42:08 +0000 Subject: [PATCH 04/11] configure openhpc for slurmdbd backup/update --- .../common/inventory/group_vars/all/openhpc.yml | 13 +++++++++---- .../common/inventory/group_vars/all/systemd.yml | 8 +------- requirements.yml | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 89f0a859c..abfece409 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -58,7 +58,12 @@ ohpc_default_extra_repos: "9": [] "8": [] -# systemd.service.unit.TimeoutStartSec to wait for slurmdbd startup -# Set long enought to avoid problems with a major version upgrade -# Currently implemented in environments/common/inventory/group_vars/all/systemd.yml -openhpc_slurmdbd_timeout_start_sec: '45 minutes' +# configure slurm database pre-upgrade backups: +openhpc_slurm_accounting_storage_service: mysql +openhpc_slurm_accounting_storage_backup_cmd: >- + openstack volume snapshot create + --volume {{ openhpc_cluster_name }}-state + --force + {{ openhpc_cluster_name }}-state-{{ ansible_date_time.iso8601_basic_short }} +openhpc_slurm_accounting_storage_backup_host: localhost +openhpc_slurm_accounting_storage_backup_become: false diff --git a/environments/common/inventory/group_vars/all/systemd.yml b/environments/common/inventory/group_vars/all/systemd.yml index 4c7538aa6..ae72a7882 100644 --- a/environments/common/inventory/group_vars/all/systemd.yml +++ b/environments/common/inventory/group_vars/all/systemd.yml @@ -14,13 +14,7 @@ systemd_dropins: content: "{{ _systemd_requiresmount_statedir }}" slurmdbd: group: openhpc - content: | - {{ _systemd_requiresmount_statedir }} - - [Service] - # Allow slurmdbd to complete major version upgrades - TimeoutStartSec={{ openhpc_slurmdbd_timeout_start_sec }} - + content: "{{ _systemd_requiresmount_statedir }}" slurmctld: group: openhpc content: "{{ _systemd_requiresmount_statedir }}" diff --git a/requirements.yml b/requirements.yml index 87b2a6263..d4f0ec1d5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.28.0 + version: 'feat/upgrade-db' # local - TODO: bump on release name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 92a37f26ea46263504c9139e5a599245377eeba3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 14 May 2025 12:37:38 +0000 Subject: [PATCH 05/11] support mysql tasks in openhpc role --- environments/common/inventory/group_vars/all/openhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index abfece409..2d6d8c78d 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -20,6 +20,7 @@ openhpc_slurm_partitions: openhpc_packages_default: # system packages - podman + - mysql # OpenHPC packages - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu12-openmpi4-perf-tools # for hpctests From 860cb73e0802fd4d2b70ac68c7c00b413799e12d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 14 May 2025 14:17:05 +0000 Subject: [PATCH 06/11] remove slurmdbd startup timeout increase - got borked during merge from main --- environments/common/inventory/group_vars/all/systemd.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/environments/common/inventory/group_vars/all/systemd.yml b/environments/common/inventory/group_vars/all/systemd.yml index 4c7538aa6..ae72a7882 100644 --- a/environments/common/inventory/group_vars/all/systemd.yml +++ b/environments/common/inventory/group_vars/all/systemd.yml @@ -14,13 +14,7 @@ systemd_dropins: content: "{{ _systemd_requiresmount_statedir }}" slurmdbd: group: openhpc - content: | - {{ _systemd_requiresmount_statedir }} - - [Service] - # Allow slurmdbd to complete major version upgrades - TimeoutStartSec={{ openhpc_slurmdbd_timeout_start_sec }} - + content: "{{ _systemd_requiresmount_statedir }}" slurmctld: group: openhpc content: "{{ _systemd_requiresmount_statedir }}" From 5a923b2c468751b25bb4d6b9c6e528c04db134c6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 14 May 2025 14:42:08 +0000 Subject: [PATCH 07/11] mysql package now installed separately in role from openhpc_packages --- environments/common/inventory/group_vars/all/openhpc.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 2d6d8c78d..abfece409 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -20,7 +20,6 @@ openhpc_slurm_partitions: openhpc_packages_default: # system packages - podman - - mysql # OpenHPC packages - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu12-openmpi4-perf-tools # for hpctests From 8d7371f246c0f4dffeb9847c292f1b92c3b8bbdb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 14 May 2025 15:36:25 +0000 Subject: [PATCH 08/11] bump CI image to get mysql client installed --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 614c0adb9..763165f62 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250513-1045-ca44f898", - "RL9": "openhpc-RL9-250513-1046-ca44f898" + "RL8": "openhpc-RL8-250514-1502-5a923b2c", + "RL9": "openhpc-RL9-250514-1502-5a923b2c" } } From fb5e7cd810a1ffd59ac15f029c2bd098f2ed5e3a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 15 May 2025 10:19:25 +0000 Subject: [PATCH 09/11] delete snapshot when cleaning up in CI --- .github/workflows/stackhpc.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 0d8846501..173b4e797 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -230,6 +230,16 @@ jobs: env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + - name: Delete possible volume snapshot from slurm upgrade + run: | + . venv/bin/activate + . environments/.stackhpc/activate + if [ -n "$SNAPSHOT" ] + then + echo Deleting $SNAPSHOT + openstack volume snapshot delete $SNAPSHOT + fi + - name: Delete infrastructure run: | . venv/bin/activate From 9422cdf4619c8c0d467870f341a10d611210d0eb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 15 May 2025 11:29:18 +0000 Subject: [PATCH 10/11] bump openhpc role to commit --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index d4f0ec1d5..1ef961026 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: 'feat/upgrade-db' # local - TODO: bump on release + version: 362e3fc # feat/upgrade-db TODO: bump on release name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From e50f964f6c9b7d4d9191c0b766b6f6175846ca78 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 15 May 2025 12:35:13 +0000 Subject: [PATCH 11/11] bump openhpc role to release --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 1ef961026..21c69c39a 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: 362e3fc # feat/upgrade-db TODO: bump on release + version: v0.30.0 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc