Skip to content

Commit 2ab66c4

Browse files
committed
[DFSM] Fix race conditions when checking if a live update is supported by a storage change.
In particular, we moved the check that determines if a storage change can be applied with a live update, from a lambda block to specific only_if blocks. This is necessary to let Chef evaluate it when it gets executed, rather than at the beginning of the converge phase. Also, removed unnecessary state 'SKIPPED_LIVE_UPDATE' and covered DFSM critical functionalities with spec tests. Signed-off-by: Giacomo Marciani <[email protected]>
1 parent 067f91e commit 2ab66c4

File tree

11 files changed

+324
-340
lines changed

11 files changed

+324
-340
lines changed

cookbooks/aws-parallelcluster-slurm/libraries/dynamo.rb

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,6 @@
1818
DDB_CONFIG_STATUS = {
1919
# The node successfully deployed the cluster configuration.
2020
DEPLOYED: "DEPLOYED",
21-
22-
# The node evaluated the cluster configuration and concluded that a live update should be skipped because:
23-
# 1. not supported: the changeset contains changes that cannot be applied with a live update
24-
# and may instead require a node replacement.
25-
# Example: mounting a managed EBS.
26-
# 2. not required: the changeset contains changes that could lead to an empty live update.
27-
# Example: changing the number of static nodes.
28-
SKIPPED_LIVE_UPDATE: "SKIPPED_LIVE_UPDATE",
2921
}.freeze
3022

3123
def save_instance_config_version_to_dynamodb(status)

cookbooks/aws-parallelcluster-slurm/libraries/storage.rb

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class SharedStorageChangeInfo
3232
# "EfsSettings": {
3333
# "FileSystemId": "fs-123456789"
3434
# },
35-
# "currentValue": "-"
35+
# "currentValue": nil
3636
# }
3737
def initialize(change)
3838
old_value = change["currentValue"]
@@ -74,36 +74,26 @@ def to_s
7474
end
7575
end
7676

77-
# Checks if the given changes provided by a change-set support live updates.
77+
# Checks if all the shared storage changes in the change-set (if any) support live updates.
7878
# With live updates we refer to in-place updates that do not required node replacement.
7979
# The decision on the support is demanded to SharedStorageChangeInfo.
8080
# If the changes are nil, empty or they do not contain any shared storage change,
8181
# we assume that a live update is supported.
82-
# Example of a change:
83-
# {
84-
# "parameter": "SharedStorage",
85-
# "requestedValue": {
86-
# "MountDir": "/opt/shared/efs/managed/1",
87-
# "Name": "shared-efs-managed-1",
88-
# "StorageType": "Efs",
89-
# "EfsSettings": {
90-
# "FileSystemId": "fs-123456789"
91-
# },
92-
# "currentValue": "-"
93-
# }
9482
#
95-
# @param [List[change]] changes the list fo changes provided by a change-set.
9683
# @return [Boolean] true if the change supports live updates; false, otherwise.
97-
def storage_change_supports_live_update?(changes)
84+
def storage_change_supports_live_update?
85+
change_set = JSON.load_file("#{node['cluster']['change_set_path']}")
86+
changes = change_set["changeSet"]
87+
9888
if changes.nil? || changes.empty?
99-
Chef::Log.info("No change found: live update is supported")
89+
Chef::Log.info("Changeset is empty: live update is supported")
10090
return true
10191
end
10292

10393
storage_changes = changes.select { |change| change["parameter"] == "SharedStorage" }
10494

10595
if storage_changes.empty?
106-
Chef::Log.info("No shared storage change found: live update is supported")
96+
Chef::Log.info("Changeset does not contain shared storage changes: live update is supported")
10797
return true
10898
end
10999

@@ -112,7 +102,7 @@ def storage_change_supports_live_update?(changes)
112102
change_info = SharedStorageChangeInfo.new(change)
113103
Chef::Log.info("Generated shared storage change info: #{change_info}")
114104
supported = change_info.support_live_updates?
115-
Chef::Log.info("Change #{change} #{'does not ' unless supported}support live updates")
105+
Chef::Log.info("Change #{change} #{'does not ' unless supported}support live update")
116106
return false unless supported
117107
end
118108
Chef::Log.info("All shared storage changes support live update.")

cookbooks/aws-parallelcluster-slurm/libraries/update.rb

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -95,45 +95,3 @@ def is_login_nodes_removed?
9595
previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path']))
9696
previous_config.dig("LoginNodes") and !config.dig("LoginNodes")
9797
end
98-
99-
def is_live_update_required?
100-
require 'json'
101-
102-
change_set_path = node['cluster']['change_set_path']
103-
104-
Chef::Log.info("Evaluating if a live update is required according to the changeset at #{change_set_path}")
105-
106-
unless File.exist?(change_set_path)
107-
Chef::Log.info("Changeset not found: live update is not required")
108-
return false
109-
end
110-
111-
change_set = JSON.load_file("#{node['cluster']['change_set_path']}")
112-
changes = change_set["changeSet"]
113-
114-
Chef::Log.info("Changeset found: evaluating changes #{changes}")
115-
116-
if changes.nil? || changes.empty?
117-
Chef::Log.info("No change found in changeset: live update is not required")
118-
return false
119-
end
120-
121-
outcome = case node["cluster"]["node_type"]
122-
when 'HeadNode'
123-
# The head node supports live updates regardless the content of the changeset,
124-
# because we assume here that the CLI permitted only the submission of an update with changes
125-
# that can be handled by a live update on the head node.
126-
true
127-
when 'ComputeFleet', 'LoginNode'
128-
# The compute and login nodes support live updates only in specific cases:
129-
# * changeset contains only shared storage changes that support live updates
130-
only_shared_storage_change = changes.all? { |change| change["parameter"] == "SharedStorage" }
131-
only_shared_storage_change && storage_change_supports_live_update?(changes)
132-
else
133-
raise "node_type must be HeadNode, LoginNode or ComputeFleet"
134-
end
135-
136-
Chef::Log.info("Live update required: #{outcome}")
137-
138-
outcome
139-
end

cookbooks/aws-parallelcluster-slurm/recipes/update/update_compute.rb

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,13 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
unless -> { is_live_update_required? }.call
19-
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:SKIPPED_LIVE_UPDATE])
20-
return
21-
end
22-
2318
# TODO: Move the only_if decision to the update_shared_storage recipe for better definition of responsibilities
2419
# and to facilitate unit testing.
2520
ruby_block "update_shared_storages" do
2621
block do
2722
run_context.include_recipe 'aws-parallelcluster-environment::update_shared_storages'
2823
end
29-
only_if { are_mount_or_unmount_required? }
24+
only_if { are_mount_or_unmount_required? && storage_change_supports_live_update? }
3025
end
3126

3227
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:DEPLOYED])

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,9 @@ def get_queues_with_changes(config)
100100
change_set = JSON.load_file("#{node['cluster']['shared_dir']}/change-set.json")
101101
changes = change_set["changeSet"]
102102

103-
if changes.empty?
104-
Chef::Log.info("Changeset is empty: queues do not need updates")
105-
return queues
106-
end
107-
108103
# Changes to the shared storage are applied to all queues,
109104
# but only changes not supporting live updates will be considered to update the queues.
110-
if are_mount_or_unmount_required? && !storage_change_supports_live_update?(changes)
105+
if are_mount_or_unmount_required? && !storage_change_supports_live_update?
111106
queues = get_all_queues(config)
112107
Chef::Log.info("All queues will be updated in order to update shared storages")
113108
else

cookbooks/aws-parallelcluster-slurm/recipes/update/update_login_node.rb

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,13 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
unless -> { is_live_update_required? }.call
19-
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:SKIPPED_LIVE_UPDATE])
20-
return
21-
end
22-
2318
# TODO: Move the only_if decision to the update_shared_storage recipe for better definition of responsibilities
2419
# and to facilitate unit testing.
2520
ruby_block "update_shared_storages" do
2621
block do
2722
run_context.include_recipe 'aws-parallelcluster-environment::update_shared_storages'
2823
end
29-
only_if { are_mount_or_unmount_required? }
24+
only_if { are_mount_or_unmount_required? && storage_change_supports_live_update? }
3025
end
3126

3227
save_instance_config_version_to_dynamodb(DDB_CONFIG_STATUS[:DEPLOYED])
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
require 'spec_helper'
2+
3+
describe "aws-parallelcluster-slurm:libraries:storage_change_supports_live_update" do
4+
let(:node) do
5+
{
6+
"cluster" => { "change_set_path" => "/SHARED_DIR/change-set.json" },
7+
}
8+
end
9+
10+
let(:mock_shared_storage_change_info) { instance_double(SharedStorageChangeInfo) }
11+
12+
shared_examples "the correct method" do |changeset, info_outcome, expected_result|
13+
it "returns #{expected_result}" do
14+
allow(File).to receive(:read).with("/SHARED_DIR/change-set.json").and_return(JSON.dump(changeset))
15+
allow(SharedStorageChangeInfo).to receive(:new).and_return(mock_shared_storage_change_info)
16+
allow(mock_shared_storage_change_info).to receive(:support_live_updates?).and_return(info_outcome)
17+
result = storage_change_supports_live_update?
18+
expect(result).to eq(expected_result)
19+
end
20+
end
21+
22+
context "when changeset is empty" do
23+
changeset = {
24+
"changeSet" => [],
25+
}
26+
include_examples "the correct method", changeset, nil, true
27+
end
28+
29+
context "when changeset does not contain any change for SharedStorage" do
30+
changeset = {
31+
"changeSet" => [
32+
{
33+
parameter: "NOT_SharedStorage",
34+
},
35+
],
36+
}
37+
include_examples "the correct method", changeset, nil, true
38+
end
39+
40+
context "when changeset contains a change for SharedStorage" do
41+
changeset = {
42+
"changeSet" => [
43+
{
44+
parameter: "SharedStorage",
45+
},
46+
{
47+
parameter: "NOT_SharedStorage",
48+
},
49+
],
50+
}
51+
[true, false].each do |info_outcome|
52+
context "and SharedStorageChangeInfo says it is #{'not ' unless info_outcome}supported" do
53+
expected_result = info_outcome
54+
include_examples "the correct method", changeset, info_outcome, expected_result
55+
end
56+
end
57+
end
58+
end
59+
60+
def build_storage_change(type, ownership, action)
61+
storage_identified = case type
62+
when "Ebs", "FsxOntap", "FsxOpenZfs"
63+
"VolumeId"
64+
when "FileCache"
65+
"FileCacheId"
66+
else
67+
"FileSystemId"
68+
end
69+
storage_settings = case ownership
70+
when "external"
71+
{
72+
storage_identified => "STORAGE_ID",
73+
}
74+
when "managed"
75+
{
76+
"SETTING_1" => "VALUE_1",
77+
"SETTING_2" => "VALUE_2",
78+
}
79+
end
80+
case action
81+
when "mount"
82+
current_value = nil
83+
requested_value = {
84+
"MountDir" => "/opt/shared/#{type}/#{ownership}/1",
85+
"Name" => "shared-#{type}-#{ownership}-external-1",
86+
"StorageType" => "#{type}",
87+
"#{type}Settings" => storage_settings,
88+
}
89+
when "unmount"
90+
current_value = {
91+
"MountDir" => "/opt/shared/#{type}/#{ownership}/1",
92+
"Name" => "shared-#{type}-#{ownership}-external-1",
93+
"StorageType" => "#{type}",
94+
"#{type}Settings" => storage_settings,
95+
}
96+
requested_value = nil
97+
else
98+
raise "Unrecognized action #{action}. It must be one of : mount, unmount"
99+
end
100+
101+
{
102+
"parameter" => "SharedStorage",
103+
"currentValue" => current_value,
104+
"requestedValue" => requested_value,
105+
}
106+
end
107+
108+
describe "aws-parallelcluster-slurm:libraries:SharedStorageChangeInfo" do
109+
shared_examples "behaves correctly" do |change, expected_result|
110+
it "support_live_updates? returns the correct value" do
111+
result = SharedStorageChangeInfo.new(change).support_live_updates?
112+
expect(result).to eq(expected_result)
113+
end
114+
end
115+
116+
%w(Ebs Efs FsxLustre FsxOntap FsxOpenZfs FileCache).each do |storage_type|
117+
%w(external managed).each do |storage_ownership|
118+
%w(mount unmount).each do |storage_action|
119+
context "when #{storage_action}ing #{storage_ownership} #{storage_type}" do
120+
change = build_storage_change(storage_type, storage_ownership, storage_action)
121+
expected_result = %(Efs FsxLustre FsxOntap FsxOpenZfs FileCache).include?(storage_type) && storage_ownership == "external"
122+
include_examples "behaves correctly", change, expected_result
123+
end
124+
end
125+
end
126+
end
127+
end
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
require 'spec_helper'
2+
3+
describe "aws-parallelcluster-slurm:libraries:are_mount_unmount_required" do
4+
let(:node) do
5+
{
6+
"cluster" => { "shared_dir" => "/SHARED_DIR" },
7+
}
8+
end
9+
10+
shared_examples "the correct method" do |changeset, expected_result|
11+
it "returns #{expected_result}" do
12+
allow(File).to receive(:read).with("/SHARED_DIR/change-set.json").and_return(JSON.dump(changeset))
13+
result = are_mount_or_unmount_required?
14+
expect(result).to eq(expected_result)
15+
end
16+
end
17+
18+
context "when changeset is empty" do
19+
changeset = {
20+
"changeSet" => [],
21+
}
22+
include_examples "the correct method", changeset, false
23+
end
24+
25+
context "when changeset does not contain any change with SHARED_STORAGE_UPDATE_POLICY" do
26+
changeset = {
27+
"changeSet" => [
28+
{
29+
updatePolicy: "NOT_SHARED_STORAGE_UPDATE_POLICY",
30+
},
31+
],
32+
}
33+
include_examples "the correct method", changeset, false
34+
end
35+
36+
context "when changeset contains at least a change with SHARED_STORAGE_UPDATE_POLICY" do
37+
changeset = {
38+
"changeSet" => [
39+
{
40+
updatePolicy: "SHARED_STORAGE_UPDATE_POLICY",
41+
},
42+
{
43+
updatePolicy: "NOT_SHARED_STORAGE_UPDATE_POLICY",
44+
},
45+
],
46+
}
47+
include_examples "the correct method", changeset, true
48+
end
49+
end

0 commit comments

Comments
 (0)