Commit 5d9beaf

[Update] Reduce the time clustermgtd is stopped during cluster updates.
In particular, we keep it stopped only during the update actions that could cause race conditions:
1. munge key update
2. slurm accounting updates
3. slurmctld restart
4. scontrol reconfigure
1 parent 66f5649 commit 5d9beaf
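The net effect can be summarized with a minimal Chef sketch (illustrative only; the resource names and guard helpers such as are_queues_updated?, are_bulk_custom_slurm_settings_updated?, update_munge_head_node and is_slurm_database_updated? are taken from the recipe diff below, while the simplified bodies are assumptions, not the literal recipe code):

# Illustrative sketch: clustermgtd is stopped only around the racy actions, then restarted.
execute 'stop clustermgtd' do
  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
  # On updates that touch neither queues nor bulk custom Slurm settings, the stop is skipped.
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
end

update_munge_head_node                        # 1. munge key update

ruby_block 'Update Slurm Accounting' do       # 2. slurm accounting updates
  block do
    run_context.include_recipe 'aws-parallelcluster-slurm::config_slurm_accounting'
  end
  only_if { is_slurm_database_updated? }
end

service 'slurmctld' do                        # 3. slurmctld restart
  action :restart
end

execute 'reload config for running nodes' do  # 4. scontrol reconfigure (assumed command)
  command 'scontrol reconfigure'
end

execute 'start clustermgtd' do
  command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
end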

3 files changed: +58 -20 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 - Upgrade aws-cfn-bootstrap to version 2.0-38 (from 2.0-33).
 - Always start clustermgtd on cluster update failure, regardless the failure condition.
 - Keep clustermgtd running during compute-fleet status updates.
+- Keep clustermgtd stopped during cluster updates only during munge key updates, slurmctld restart and slurm reconfigure.

 **BUG FIXES**
 - Fix timestamp formats in CloudWatch log configuration to let CloudWatch parse the correct timestamps.

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 20 additions & 20 deletions
@@ -15,11 +15,6 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.

-execute 'stop clustermgtd' do
-  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
-  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
-end
-
 # Write the new config version to shared storage to signal compute nodes to update
 file node['cluster']['update']['trigger_file'] do
   content node['cluster']['cluster_config_version']
@@ -207,17 +202,6 @@ def update_nodes_in_queue(strategy, queues)
   replace_only true
 end

-ruby_block "Update Slurm Accounting" do
-  block do
-    if node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil?
-      run_context.include_recipe "aws-parallelcluster-slurm::clear_slurm_accounting"
-    else
-      run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
-    end
-  end
-  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
-end unless on_docker?
-
 # Cover the following two scenarios:
 # - a cluster without login nodes is updated to have login nodes;
 # - a cluster with login nodes is updated to use another pool name.
@@ -242,8 +226,6 @@ def update_nodes_in_queue(strategy, queues)
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
 end

-update_munge_head_node
-
 # The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
 # slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
 # is enabled we must pull the database password from Secrets Manager once again.
@@ -255,6 +237,24 @@ def update_nodes_in_queue(strategy, queues)
   only_if { !(::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated?) && !node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil? }
 end

+execute 'stop clustermgtd' do
+  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
+  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
+end
+
+update_munge_head_node
+
+ruby_block "Update Slurm Accounting" do
+  block do
+    if node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil?
+      run_context.include_recipe "aws-parallelcluster-slurm::clear_slurm_accounting"
+    else
+      run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
+    end
+  end
+  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
+end unless on_docker?
+
 service 'slurmctld' do
   action :restart
   not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
@@ -280,12 +280,12 @@ def update_nodes_in_queue(strategy, queues)

 chef_sleep '15'

-wait_cluster_ready if cluster_readiness_check_on_update_enabled?
-
 execute 'start clustermgtd' do
   command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
 end

+wait_cluster_ready if cluster_readiness_check_on_update_enabled?
+
 # The updated cfnconfig will be used by post update custom scripts
 template "#{node['cluster']['etc_dir']}/cfnconfig" do
   source 'init/cfnconfig.erb'

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb

Lines changed: 37 additions & 0 deletions
@@ -84,6 +84,43 @@
       timeout: reconfigure_timeout
     )
   end
+
+  it 'executes resources in the correct order' do
+    # NOTE: The most important aspect in the sequence is that clustermgtd is stopped while executing:
+    # 1. update_munge_key
+    # 2. restart of slurmctld
+    # 3. scontrol reconfigure
+    resource_names = chef_run.resource_collection.map(&:name)
+
+    expected_sequence = [
+      chef_run.node['cluster']['update']['trigger_file'],
+      'update_shared_storages',
+      'replace slurm queue nodes',
+      'Update or Cleanup Slurm Topology',
+      'generate_pcluster_slurm_configs',
+      'generate_pcluster_custom_slurm_settings_include_files',
+      'Override Custom Slurm Settings with remote file',
+      'generate_pcluster_fleet_config',
+      'update node replacement timeout',
+      "#{scripts_dir}/slurm/check_login_nodes_stopped.sh",
+      "#{scripts_dir}/slurm/update_munge_key.sh",
+      'update Slurm database password',
+      'stop clustermgtd',
+      'update_munge_key',
+      'Update Slurm Accounting',
+      'slurmctld',
+      '5',
+      'check slurmctld status',
+      'reload config for running nodes',
+      '15',
+      'start clustermgtd',
+      'Check cluster readiness',
+      '/etc/parallelcluster/cfnconfig',
+      'Cleanup',
+    ]
+
+    expect(resource_names).to eq(expected_sequence)
+  end
 end
 end
