Commit 5d9beaf

[Update] Reduce the time clustermgtd is stopped during cluster updates.
In particular, we keep it stopped only during the update actions that could cause race conditions:
1. munge key update
2. slurm accounting updates
3. slurmctld restart
4. scontrol reconfigure
1 parent 66f5649 commit 5d9beaf
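The net effect can be summarized with a minimal Chef sketch (illustrative only; the resource names and guard helpers such as are_queues_updated?, are_bulk_custom_slurm_settings_updated?, update_munge_head_node and is_slurm_database_updated? are taken from the recipe diff below, while the simplified bodies are assumptions, not the literal recipe code):

# Illustrative sketch: clustermgtd is stopped only around the racy actions, then restarted.
execute 'stop clustermgtd' do
  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
  # On updates that touch neither queues nor bulk custom Slurm settings, the stop is skipped.
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
end

update_munge_head_node                        # 1. munge key update

ruby_block 'Update Slurm Accounting' do       # 2. slurm accounting updates
  block do
    run_context.include_recipe 'aws-parallelcluster-slurm::config_slurm_accounting'
  end
  only_if { is_slurm_database_updated? }
end

service 'slurmctld' do                        # 3. slurmctld restart
  action :restart
end

execute 'reload config for running nodes' do  # 4. scontrol reconfigure (assumed command)
  command 'scontrol reconfigure'
end

execute 'start clustermgtd' do
  command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
end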

3 files changed: +58 -20 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 - Upgrade aws-cfn-bootstrap to version 2.0-38 (from 2.0-33).
 - Always start clustermgtd on cluster update failure, regardless the failure condition.
 - Keep clustermgtd running during compute-fleet status updates.
+- Keep clustermgtd stopped during cluster updates only during munge key updates, slurmctld restart and slurm reconfigure.

 **BUG FIXES**
 - Fix timestamp formats in CloudWatch log configuration to let CloudWatch parse the correct timestamps.

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 20 additions & 20 deletions
@@ -15,11 +15,6 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.

-execute 'stop clustermgtd' do
-  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
-  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
-end
-
 # Write the new config version to shared storage to signal compute nodes to update
 file node['cluster']['update']['trigger_file'] do
   content node['cluster']['cluster_config_version']
@@ -207,17 +202,6 @@ def update_nodes_in_queue(strategy, queues)
   replace_only true
 end

-ruby_block "Update Slurm Accounting" do
-  block do
-    if node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil?
-      run_context.include_recipe "aws-parallelcluster-slurm::clear_slurm_accounting"
-    else
-      run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
-    end
-  end
-  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
-end unless on_docker?
-
 # Cover the following two scenarios:
 # - a cluster without login nodes is updated to have login nodes;
 # - a cluster with login nodes is updated to use another pool name.
@@ -242,8 +226,6 @@ def update_nodes_in_queue(strategy, queues)
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
 end

-update_munge_head_node
-
 # The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
 # slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
 # is enabled we must pull the database password from Secrets Manager once again.
@@ -255,6 +237,24 @@ def update_nodes_in_queue(strategy, queues)
   only_if { !(::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated?) && !node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil? }
 end

+execute 'stop clustermgtd' do
+  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
+  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
+end
+
+update_munge_head_node
+
+ruby_block "Update Slurm Accounting" do
+  block do
+    if node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil?
+      run_context.include_recipe "aws-parallelcluster-slurm::clear_slurm_accounting"
+    else
+      run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
+    end
+  end
+  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
+end unless on_docker?
+
 service 'slurmctld' do
   action :restart
   not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
@@ -280,12 +280,12 @@ def update_nodes_in_queue(strategy, queues)

 chef_sleep '15'

-wait_cluster_ready if cluster_readiness_check_on_update_enabled?
-
 execute 'start clustermgtd' do
   command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
 end

+wait_cluster_ready if cluster_readiness_check_on_update_enabled?
+
 # The updated cfnconfig will be used by post update custom scripts
 template "#{node['cluster']['etc_dir']}/cfnconfig" do
   source 'init/cfnconfig.erb'

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb

Lines changed: 37 additions & 0 deletions
@@ -84,6 +84,43 @@
       timeout: reconfigure_timeout
     )
   end
+
+  it 'executes resources in the correct order' do
+    # NOTE: The most important aspect in the sequence is that clustermgtd is stopped while executing:
+    # 1. update_munge_key
+    # 2. restart of slurmctld
+    # 3. scontrol reconfigure
+    resource_names = chef_run.resource_collection.map(&:name)
+
+    expected_sequence = [
+      chef_run.node['cluster']['update']['trigger_file'],
+      'update_shared_storages',
+      'replace slurm queue nodes',
+      'Update or Cleanup Slurm Topology',
+      'generate_pcluster_slurm_configs',
+      'generate_pcluster_custom_slurm_settings_include_files',
+      'Override Custom Slurm Settings with remote file',
+      'generate_pcluster_fleet_config',
+      'update node replacement timeout',
+      "#{scripts_dir}/slurm/check_login_nodes_stopped.sh",
+      "#{scripts_dir}/slurm/update_munge_key.sh",
+      'update Slurm database password',
+      'stop clustermgtd',
+      'update_munge_key',
+      'Update Slurm Accounting',
+      'slurmctld',
+      '5',
+      'check slurmctld status',
+      'reload config for running nodes',
+      '15',
+      'start clustermgtd',
+      'Check cluster readiness',
+      '/etc/parallelcluster/cfnconfig',
+      'Cleanup',
+    ]
+
+    expect(resource_names).to eq(expected_sequence)
+  end
 end
 end
