Skip to content

Commit 59cce6b

Browse files
authored
Delete cluster name state file whenever slurm accounting is configured or updated (aws#2994)
* Delete cluster name state file whenever slurm accounting is configured or updated
* Add spec test to cover the deletion of the cluster id state
* Update changelog
1 parent 01181cb commit 59cce6b

File tree

5 files changed

+41
-0
lines changed

5 files changed

+41
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1212
**CHANGES**
1313
- Ubuntu 20.04 is no longer supported.
1414
- Upgrade Slurm to version 24.11.5.
15+
- Address the known cluster ID mismatch issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting.
1516
- Upgrade DCV to version 2024.0-19030.
1617
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
1718

cookbooks/aws-parallelcluster-slurm/recipes/config/config_slurm_accounting.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@
7676
action action
7777
end unless on_docker?
7878

79+
file "/var/spool/slurm.state/clustername" do
80+
action :delete # avoids the cluster id mismatch issue when Slurm accounting is (re)configured
81+
end
82+
7983
if node['cluster']['slurmdbd_service_enabled'] == "true"
8084
# After starting slurmdbd the database may not be fully responsive yet and
8185
# its bootstrapping may fail. We need to wait for sacctmgr to successfully

cookbooks/aws-parallelcluster-slurm/recipes/update/clear_slurm_accounting.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@
2323
supports restart: false
2424
action %i(disable stop)
2525
end
26+
27+
file '/var/spool/slurm.state/clustername' do
28+
action :delete
29+
end
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
require 'spec_helper'
2+
3+
describe 'aws-parallelcluster-slurm::clear_slurm_accounting' do
4+
for_all_oses do |platform, version|
5+
context "on #{platform}#{version}" do
6+
cached(:chef_run) do
7+
runner = runner(platform: platform, version: version) do |node|
8+
mock_file_exists("/var/spool/slurm.state/clustername", true)
9+
node.override['cluster']['slurmdbd_service_enabled'] = 'true' # string, as the recipes compare this attribute against "true"
10+
end
11+
runner.converge(described_recipe)
12+
end
13+
cached(:node) { chef_run.node }
14+
15+
it 'stops the slurm database daemon' do
16+
is_expected.to disable_service("slurmdbd")
17+
end
18+
19+
it 'deletes the Slurm database password update script' do
20+
is_expected.to delete_file("#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh")
21+
end
22+
23+
it 'Removes existing cluster name state file' do
24+
is_expected.to delete_file('/var/spool/slurm.state/clustername')
25+
end
26+
end
27+
end
28+
end

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/config_slurm_accounting_spec.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false)
1111
allow_any_instance_of(Object).to receive(:dig).and_return(true)
1212
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
13+
mock_file_exists("/var/spool/slurm.state/clustername", true)
1314
node.override['cluster']['slurmdbd_service_enabled'] = enable_service
1415
end
1516
runner.converge(described_recipe)
@@ -70,6 +71,9 @@
7071
)
7172
end
7273
if enable_service == "true"
74+
it 'Removes existing cluster name state file' do
75+
is_expected.to delete_file('/var/spool/slurm.state/clustername')
76+
end
7377
it 'starts the slurm database daemon' do
7478
is_expected.to enable_service("slurmdbd")
7579
is_expected.to start_service("slurmdbd")

0 commit comments

Comments
 (0)