Skip to content

Commit b82c8d9

Browse files
committed
[DFSM] Fix major gaps in the update workflow.
In particular: 1. If the change-set does not require a live update on compute and login nodes, they save a DDB record with status SKIPPED_LIVE_UPDATE and exists from the update without performing and storage change. 2. If the change-set requires a live update on compute and login nodes, they perform the storage change and save a DDB record with status DEPLOYED. 3. The cluster readiness check performed by the head node considers successful all nodes having either status DEPLOYED or SKIPPED_LIVE_UPDATE. 4. Cluster readiness check now emits a more readable log about retrieved DDB items. Signed-off-by: Giacomo Marciani <[email protected]>
1 parent 48144e2 commit b82c8d9

File tree

12 files changed

+152
-72
lines changed

12 files changed

+152
-72
lines changed

cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def check_deployed_config_version(cluster_name: str, table_name: str, expected_c
108108
logger.info("Found batch of %s cluster node(s): %s", n_instance_ids, instance_ids)
109109

110110
items = get_cluster_config_records(table_name, instance_ids, region)
111-
logger.info("Retrieved %s DDB item(s): %s", len(items), items)
111+
logger.info("Retrieved %s DDB item(s):\n\t%s", len(items), "\n\t".join([str(i) for i in items]))
112112

113113
missing, incomplete, wrong = _check_cluster_config_items(instance_ids, items, expected_config_version)
114114

cookbooks/aws-parallelcluster-slurm/libraries/dynamo.rb

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,24 @@
1414
# Saves on the DynamoDB table 'parallelcluster-$CLUSTERNAME' the correspondence between
1515
# the instance id and the cluster config version deployed on that instance.
1616
# This is done to understand what is the deployed cluster config in each node, for example, during a CFN Stack update.
17-
def save_instance_config_version_to_dynamodb
17+
18+
19+
DDB_CONFIG_STATUS = {
20+
DEPLOYED: "DEPLOYED",
21+
SKIPPED_LIVE_UPDATE: "SKIPPED_LIVE_UPDATE"
22+
}
23+
24+
def save_instance_config_version_to_dynamodb(status)
1825
unless on_docker? || kitchen_test? && !node['interact_with_ddb']
1926
# TODO: the table name should be read from a dedicated node attribute,
2027
# but it is currently missing in the dna.json for compute nodes.
2128
table_name = "parallelcluster-#{node['cluster']['cluster_name']}"
2229
item_id = "CLUSTER_CONFIG.#{node['ec2']['instance_id']}"
23-
item_data = "{\"cluster_config_version\": {\"S\": \"#{node['cluster']['cluster_config_version']}\"}, \"lastUpdateTime\": {\"S\": \"#{Time.now.utc}\"}}"
30+
node_type = node['cluster']['node_type']
31+
config_version = node['cluster']['cluster_config_version']
32+
last_update_time = Time.now.utc
33+
34+
item_data = "{\"cluster_config_version\": {\"S\": \"#{config_version}\"}, \"status\": {\"S\": \"#{status}\"}, \"node_type\": {\"S\": \"#{node_type}\"}, \"lastUpdateTime\": {\"S\": \"#{last_update_time}\"}}"
2435
item = "{\"Id\": {\"S\": \"#{item_id}\"}, \"Data\": {\"M\": #{item_data}}}"
2536

2637
execute "Save cluster config version to DynamoDB" do

cookbooks/aws-parallelcluster-slurm/libraries/storage.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def storage_change_supports_live_update?(changes)
104104

105105
if storage_changes.empty?
106106
Chef::Log.info("No shared storage change found: assuming live update is supported")
107-
return true
107+
return false
108108
end
109109

110110
storage_changes.each do |change|

cookbooks/aws-parallelcluster-slurm/libraries/update.rb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,26 @@ def is_login_nodes_removed?
9595
previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path']))
9696
previous_config.dig("LoginNodes") and !config.dig("LoginNodes")
9797
end
98+
99+
def is_live_update_required?
100+
require 'json'
101+
102+
change_set_path = node['cluster']['change_set_path']
103+
104+
return false unless File.exist?(change_set_path)
105+
106+
change_set = JSON.load_file("#{node['cluster']['change_set_path']}")
107+
changes = change_set["changeSet"]
108+
109+
case node["cluster"]["node_type"]
110+
when 'HeadNode'
111+
# The head node supports live updates regardless the content of the changeset.
112+
return true
113+
when 'ComputeFleet','LoginNode'
114+
# The compute and login nodes support live updates only in specific cases:
115+
# * changeset contains only shared storage changes that support live updates
116+
return storage_change_supports_live_update?(changes)
117+
else
118+
raise "node_type must be HeadNode, LoginNode or ComputeFleet"
119+
end
120+
end

cookbooks/aws-parallelcluster-slurm/recipes/finalize/finalize_compute.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,4 @@
6060
only_if { is_static_node?(node.run_state['slurm_compute_nodename']) }
6161
end
6262

63-
save_instance_config_version_to_dynamodb
63+
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:DEPLOYED])

cookbooks/aws-parallelcluster-slurm/recipes/finalize/finalize_login_node.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
save_instance_config_version_to_dynamodb
18+
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:DEPLOYED])

cookbooks/aws-parallelcluster-slurm/recipes/update/update_compute.rb

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,18 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18+
unless lambda { is_live_update_required? }.call
19+
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:SKIPPED_LIVE_UPDATE])
20+
return
21+
end
22+
1823
# TODO: Move the only_if decision to the update_shared_storage recipe for better definition of responsibilities
19-
# and to facilitate unit testing.
24+
# and to facilitate unit testing.
2025
ruby_block "update_shared_storages" do
2126
block do
2227
run_context.include_recipe 'aws-parallelcluster-environment::update_shared_storages'
2328
end
2429
only_if { are_mount_or_unmount_required? }
2530
end
2631

27-
save_instance_config_version_to_dynamodb
32+
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:DEPLOYED])

cookbooks/aws-parallelcluster-slurm/recipes/update/update_login_node.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18+
unless lambda { is_live_update_required? }.call
19+
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:SKIPPED_LIVE_UPDATE])
20+
return
21+
end
22+
1823
# TODO: Move the only_if decision to the update_shared_storage recipe for better definition of responsibilities
1924
# and to facilitate unit testing.
2025
ruby_block "update_shared_storages" do
@@ -24,4 +29,4 @@
2429
only_if { are_mount_or_unmount_required? }
2530
end
2631

27-
save_instance_config_version_to_dynamodb
32+
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:DEPLOYED])

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/finalize_compute_spec.rb

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
describe 'aws-parallelcluster-slurm::finalize_compute' do
1717
for_all_oses do |platform, version|
18+
node_type = "ComputeFleet"
1819
cookbook_venv_path = "MOCK_COOKBOOK_VENV_PATH"
1920
cluster_name = "MOCK_CLUSTER_NAME"
2021
region = "MOCK_REGION"
@@ -32,8 +33,7 @@
3233
allow(Time).to receive(:now).and_return(Time.parse(time_now))
3334
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
3435

35-
node.override['cluster']['node_type'] = 'ComputeFleet'
36-
node.override['interact_with_ddb'] = true
36+
node.override['cluster']['node_type'] = node_type
3737
node.override['cluster']['cluster_name'] = cluster_name
3838
node.override['cluster']['region'] = region
3939
node.override['ec2']['instance_id'] = instance_id
@@ -42,10 +42,11 @@
4242
runner.converge(described_recipe)
4343
end
4444

45-
it 'saves the cluster config version to dynamodb' do
45+
status = "DEPLOYED"
46+
it "saves the cluster config version to dynamodb with status #{status}" do
4647
expected_command = "#{cookbook_venv_path}/bin/aws dynamodb put-item" \
4748
" --table-name parallelcluster-#{cluster_name}"\
48-
" --item '{\"Id\": {\"S\": \"CLUSTER_CONFIG.#{instance_id}\"}, \"Data\": {\"M\": {\"cluster_config_version\": {\"S\": \"#{cluster_config_version}\"}, \"lastUpdateTime\": {\"S\": \"#{time_now}\"}}}}'" \
49+
" --item '{\"Id\": {\"S\": \"CLUSTER_CONFIG.#{instance_id}\"}, \"Data\": {\"M\": {\"cluster_config_version\": {\"S\": \"#{cluster_config_version}\"}, \"status\": {\"S\": \"#{status}\"}, \"node_type\": {\"S\": \"#{node_type}\"}, \"lastUpdateTime\": {\"S\": \"#{time_now}\"}}}}'" \
4950
" --region #{region}"
5051
is_expected.to run_execute("Save cluster config version to DynamoDB").with(
5152
command: expected_command,
@@ -62,11 +63,8 @@
6263
allow_any_instance_of(Object).to receive(:is_static_node?).and_return(false)
6364
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
6465

65-
node.override['cluster']['node_type'] = 'ComputeFleet'
6666
node.override['kitchen'] = true
6767
node.override['interact_with_ddb'] = false
68-
node.override['ec2']['instance_id'] = instance_id
69-
node.override['cluster']['cluster_config_version'] = cluster_config_version
7068
end
7169
runner.converge(described_recipe)
7270
end

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/finalize_login_node_spec.rb

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
describe 'aws-parallelcluster-slurm::finalize_login_node' do
1717
for_all_oses do |platform, version|
18+
node_type = "LoginNode"
1819
cookbook_venv_path = "MOCK_COOKBOOK_VENV_PATH"
1920
cluster_name = "MOCK_CLUSTER_NAME"
2021
region = "MOCK_REGION"
@@ -32,8 +33,7 @@
3233
allow(Time).to receive(:now).and_return(Time.parse(time_now))
3334
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
3435

35-
node.override['cluster']['node_type'] = 'LoginNode'
36-
node.override['interact_with_ddb'] = true
36+
node.override['cluster']['node_type'] = node_type
3737
node.override['cluster']['cluster_name'] = cluster_name
3838
node.override['cluster']['region'] = region
3939
node.override['ec2']['instance_id'] = instance_id
@@ -42,10 +42,11 @@
4242
runner.converge(described_recipe)
4343
end
4444

45-
it 'saves the cluster config version to dynamodb' do
45+
status = "DEPLOYED"
46+
it "saves the cluster config version to dynamodb with status #{status}" do
4647
expected_command = "#{cookbook_venv_path}/bin/aws dynamodb put-item" \
4748
" --table-name parallelcluster-#{cluster_name}"\
48-
" --item '{\"Id\": {\"S\": \"CLUSTER_CONFIG.#{instance_id}\"}, \"Data\": {\"M\": {\"cluster_config_version\": {\"S\": \"#{cluster_config_version}\"}, \"lastUpdateTime\": {\"S\": \"#{time_now}\"}}}}'" \
49+
" --item '{\"Id\": {\"S\": \"CLUSTER_CONFIG.#{instance_id}\"}, \"Data\": {\"M\": {\"cluster_config_version\": {\"S\": \"#{cluster_config_version}\"}, \"status\": {\"S\": \"#{status}\"}, \"node_type\": {\"S\": \"#{node_type}\"}, \"lastUpdateTime\": {\"S\": \"#{time_now}\"}}}}'" \
4950
" --region #{region}"
5051
is_expected.to run_execute("Save cluster config version to DynamoDB").with(
5152
command: expected_command,
@@ -62,11 +63,8 @@
6263
allow_any_instance_of(Object).to receive(:is_static_node?).and_return(false)
6364
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
6465

65-
node.override['cluster']['node_type'] = 'LoginNode'
6666
node.override['kitchen'] = true
6767
node.override['interact_with_ddb'] = false
68-
node.override['ec2']['instance_id'] = instance_id
69-
node.override['cluster']['cluster_config_version'] = cluster_config_version
7068
end
7169
runner.converge(described_recipe)
7270
end

0 commit comments

Comments
 (0)