From e0cc721a3fad2c3475de35921b11fe29d2a14b63 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Tue, 15 Jul 2025 10:54:44 -0400 Subject: [PATCH 1/3] Delete cluster name state file whenever slurm accounting is configured or updated --- .../recipes/config/config_slurm_accounting.rb | 9 +++++++++ .../recipes/update/clear_slurm_accounting.rb | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb index 57c304b531..c9d8a97491 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb @@ -88,6 +88,15 @@ retry_delay 10 end unless kitchen_test? || (node['cluster']['node_type'] == "ExternalSlurmDbd") + bash "Remove existing cluster name state file" do + user 'root' + group 'root' + code <<-CLUSTERSTATE + rm /var/spool/slurm.state/clustername + CLUSTERSTATE + only_if { ::File.exist?('/var/spool/slurm.state/clustername') } + end + bash "bootstrap slurm database" do user 'root' group 'root' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb index 9aca50d359..4d64a3677e 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb @@ -23,3 +23,12 @@ supports restart: false action %i(disable stop) end + +bash "Remove existing cluster name state file" do + user 'root' + group 'root' + code <<-CLUSTERSTATE + rm /var/spool/slurm.state/clustername + CLUSTERSTATE + only_if { ::File.exist?('/var/spool/slurm.state/clustername') } +end From baae6e74696dd2b449489e4ada25068c080f8278 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Fri, 18 Jul 2025 13:58:47 -0400 Subject: [PATCH 2/3] Add spec test to cover the deletion of the cluster id state --- .../recipes/config/config_slurm_accounting.rb | 10 ++----- .../recipes/update/clear_slurm_accounting.rb | 9 ++---- .../recipes/clear_slurm_accounting_spec.rb | 28 +++++++++++++++++++ .../recipes/config_slurm_accounting_spec.rb | 4 +++ 4 files changed, 37 insertions(+), 14 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb index c9d8a97491..6a7ed2fb66 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb @@ -88,13 +88,9 @@ retry_delay 10 end unless kitchen_test? || (node['cluster']['node_type'] == "ExternalSlurmDbd") - bash "Remove existing cluster name state file" do - user 'root' - group 'root' - code <<-CLUSTERSTATE - rm /var/spool/slurm.state/clustername - CLUSTERSTATE - only_if { ::File.exist?('/var/spool/slurm.state/clustername') } + + file '/var/spool/slurm.state/clustername' do + action :delete end bash "bootstrap slurm database" do diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb index 4d64a3677e..ece9c76919 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb @@ -24,11 +24,6 @@ action %i(disable stop) end -bash "Remove existing cluster name state file" do - user 'root' - group 'root' - code <<-CLUSTERSTATE - rm /var/spool/slurm.state/clustername - CLUSTERSTATE - only_if { ::File.exist?('/var/spool/slurm.state/clustername') } +file '/var/spool/slurm.state/clustername' do + action :delete end diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb new file mode 100644 index 0000000000..39dff104dd --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/clear_slurm_accounting_spec.rb @@ -0,0 +1,28 @@ +require 'spec_helper' + +describe 'aws-parallelcluster-slurm::clear_slurm_accounting' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + mock_file_exists("/var/spool/slurm.state/clustername", true) + node.override['cluster']['slurmdbd_service_enabled'] = true + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'stops the slurm database daemon' do + is_expected.to disable_service("slurmdbd") + end + + it 'deletes the Slurm database password update script' do + is_expected.to delete_file("#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh") + end + + it 'Removes existing cluster name state file' do + is_expected.to delete_file('/var/spool/slurm.state/clustername') + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb index 822d692d2b..f2025932a4 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb @@ -10,6 +10,7 @@ allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false) allow_any_instance_of(Object).to receive(:dig).and_return(true) RSpec::Mocks.configuration.allow_message_expectations_on_nil = true + mock_file_exists("/var/spool/slurm.state/clustername", true) node.override['cluster']['slurmdbd_service_enabled'] = enable_service end runner.converge(described_recipe) @@ -70,6 +71,9 @@ ) end if enable_service == "true" + it 'Removes existing cluster name state file' do + is_expected.to delete_file('/var/spool/slurm.state/clustername') + end it 'starts the slurm database daemon' do is_expected.to enable_service("slurmdbd") is_expected.to start_service("slurmdbd") From 2b67150cd59c07ac6c809bfc08f2ffc4c67713fa Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Mon, 21 Jul 2025 11:29:55 -0400 Subject: [PATCH 3/3] Update changelog --- CHANGELOG.md | 1 + .../recipes/config/config_slurm_accounting.rb | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d85df10d5..1b7168188c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Ubuntu 20.04 is no longer supported. - Upgrade Slurm to version 24.11.5. +- Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting. - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb index 6a7ed2fb66..7c97318fe5 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb @@ -76,6 +76,10 @@ action action end unless on_docker? +file "/var/spool/slurm.state/clustername" do + action "delete" +end + if node['cluster']['slurmdbd_service_enabled'] == "true" # After starting slurmdbd the database may not be fully responsive yet and # its bootstrapping may fail. We need to wait for sacctmgr to successfully @@ -88,11 +92,6 @@ retry_delay 10 end unless kitchen_test? || (node['cluster']['node_type'] == "ExternalSlurmDbd") - - file '/var/spool/slurm.state/clustername' do - action :delete - end - bash "bootstrap slurm database" do user 'root' group 'root'