Commit cc9f089

Merge branch 'develop' into wip/mgiacomo/3140/fix-queue-updated-condition-0812-2

2 parents 2791251 + b2d27a2

7 files changed: +220 -85 lines changed

cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb

Lines changed: 4 additions & 0 deletions

@@ -25,3 +25,7 @@
 # Pyxis
 default['cluster']['pyxis']['version'] = '0.20.0'
 default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'
+
+# Block Topology Plugin
+default['cluster']['slurm']['block_topology']['force_configuration'] = false
+default['cluster']['p6egb200_block_sizes'] = nil

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py

Lines changed: 23 additions & 7 deletions

@@ -22,7 +22,7 @@
 
 log = logging.getLogger()
 
-
+P6E_GB200 = "p6e-gb200"
 CAPACITY_TYPE_MAP = {
     "ONDEMAND": "on-demand",
     "SPOT": "spot",
@@ -49,15 +49,25 @@ def _load_cluster_config(input_file_path):
         return yaml.load(input_file, Loader=yaml.SafeLoader)
 
 
-def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str):  # noqa: C901
+def _is_capacity_block(capacity_type):
+    return capacity_type == CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK")
+
+
+def _is_gb200(instance_type):
+    return instance_type is not None and instance_type.split(".")[0] == P6E_GB200
+
+
+def generate_topology_config_file(  # noqa: C901
+    output_file: str, input_file: str, block_sizes: str, force_configuration: bool
+):
     """
     Generate Topology configuration file.
 
     Generate topology.conf
 
     # This file is automatically generated by pcluster
-    BlockName=block1 Nodes=queue-1-st-compute-resource-0-[1-9] #### 9 nodes
-    BlockName=block2 Nodes=queue-1-st-compute-resource-0-[1-18] #### 18 nodes
+    BlockName=block1 Nodes=queue-1-st-compute-resource-0-[1-9] #### 9 nodes Capacity Block 1
+    BlockName=block2 Nodes=queue-1-st-compute-resource-0-[1-18] #### 18 nodes Capacity Block 2
     BlockSizes=9,18
     """
    if block_sizes:
@@ -74,7 +84,8 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
 
         # Retrieve capacity info from the queue_name, if there
         queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
-        if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
+        if not _is_capacity_block(queue_capacity_type) and not force_configuration:
+            # We ignore this check when force_configuration option is used.
             log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
             continue
 
@@ -88,7 +99,7 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
                 continue
 
             # Check for if reservation is for NVLink and size matches min_block_size_list
-            if compute_resource_config.get("InstanceType") == "p6e-gb200.36xlarge":
+            if _is_gb200(compute_resource_config.get("InstanceType")) or force_configuration:
                 if min_block_size_list == compute_min_count or max_block_size_list == compute_max_count:
                     block_count += 1
                     # Each Capacity Reservation ID is a Capacity Block,
@@ -149,6 +160,11 @@ def main():
         help="Yaml file containing pcluster CLI configuration file with default values",
         required=True,
     )
+    parser.add_argument(
+        "--force-configuration",
+        help="Force creation of topology.conf by ignoring the checks of Capacity Block and Instance Type. ",
+        action="store_true",
+    )
     cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
     cleanup_or_generate_exclusive_group.add_argument(
         "--cleanup",
@@ -159,7 +175,7 @@ def main():
         if args.cleanup:
             cleanup_topology_config_file(args.output_file)
         else:
-            generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
+            generate_topology_config_file(args.output_file, args.input_file, args.block_sizes, args.force_configuration)
         log.info("Completed Execution of ParallelCluster Topology Config Generator")
     except Exception as e:
         log.exception("Failed to generate Topology.conf, exception: %s", e)
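
Taken together, the refactor broadens the old hard-coded checks: _is_gb200 matches any p6e-gb200 size rather than only p6e-gb200.36xlarge, and both the capacity-type and instance-type gates can be bypassed by passing the new --force-configuration flag alongside the usual --output-file/--input-file/--block-sizes arguments. Below is a minimal stand-alone sketch of the two predicates; the "capacity-block" map value is an assumption, since the diff truncates CAPACITY_TYPE_MAP after the SPOT entry:

P6E_GB200 = "p6e-gb200"
CAPACITY_TYPE_MAP = {
    "ONDEMAND": "on-demand",
    "SPOT": "spot",
    "CAPACITY_BLOCK": "capacity-block",  # assumed value; not shown in this diff
}

def _is_capacity_block(capacity_type):
    # True only for the mapped CAPACITY_BLOCK value.
    return capacity_type == CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK")

def _is_gb200(instance_type):
    # Matches any p6e-gb200 size and tolerates a missing InstanceType.
    return instance_type is not None and instance_type.split(".")[0] == P6E_GB200

assert _is_gb200("p6e-gb200.36xlarge") and not _is_gb200("c5.xlarge") and not _is_gb200(None)
assert _is_capacity_block("capacity-block") and not _is_capacity_block("on-demand")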

cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb

Lines changed: 14 additions & 8 deletions

@@ -29,7 +29,8 @@
     command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
             " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
             " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
-            " --input-file #{node['cluster']['cluster_config_path']}"
+            " --input-file #{node['cluster']['cluster_config_path']}"\
+            "#{topology_generator_extra_args}"
     not_if { node['cluster']['p6egb200_block_sizes'].nil? }
   end
 end
@@ -48,8 +49,9 @@
     command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
             " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
             " --input-file #{node['cluster']['cluster_config_path']}"\
-            "#{topology_generator_command_args}"
-    not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
+            "#{topology_generator_command_args}"\
+            "#{topology_generator_extra_args}"
+    not_if { topology_generator_command_args.nil? }
   end
 end
 
@@ -58,13 +60,17 @@ def is_block_topology_supported?
 end
 
 def topology_generator_command_args
-  if node['cluster']['p6egb200_block_sizes'].nil? && are_queues_updated? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
+  if node['cluster']['p6egb200_block_sizes'].nil? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
     # If topology.conf exist and Capacity Block is removed, we cleanup
     " --cleanup"
-  elsif node['cluster']['p6egb200_block_sizes'].nil? && !are_queues_updated?
-    # We do nothing if p6e-gb200 is not used and queues are not updated
-    nil
-  else
+  elsif !node['cluster']['p6egb200_block_sizes'].nil?
+    # We add/update topology.conf if p6egb200_block_sizes is not null
    " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
   end
 end
+
+def topology_generator_extra_args
+  if ['true', 'yes', true].include?(node['cluster']['slurm']['block_topology']['force_configuration'])
+    " --force-configuration"
+  end
+end
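
The reworked helpers drop the are_queues_updated? dependency entirely: clean up when block sizes were removed but a topology.conf is present, regenerate whenever block sizes are set, and otherwise skip the execute resource via not_if; the force flag is appended independently of that choice. A hypothetical Python re-expression of the two Ruby helpers (the names block_sizes, topology_conf_exists, and force are illustrative, not from the commit):

from typing import Optional

def topology_generator_args(block_sizes: Optional[str], topology_conf_exists: bool, force: bool) -> Optional[str]:
    # Mirrors topology_generator_command_args plus topology_generator_extra_args.
    if block_sizes is None and topology_conf_exists:
        args = " --cleanup"  # Capacity Block removed: clean up the stale topology.conf
    elif block_sizes is not None:
        args = f" --block-sizes {block_sizes}"  # add or update topology.conf
    else:
        return None  # nothing to do; the execute resource is skipped by not_if
    return args + (" --force-configuration" if force else "")

assert topology_generator_args(None, True, False) == " --cleanup"
assert topology_generator_args("9,18", True, True) == " --block-sizes 9,18 --force-configuration"
assert topology_generator_args(None, False, True) is None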

cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb

Lines changed: 122 additions & 60 deletions

@@ -21,59 +21,14 @@ def self.update(chef_run)
   script_dir = 'SCRIPT_DIR'
   slurm_install_dir = 'SLURM_INSTALL_DIR'
   block_sizes = '9,18'
+  new_block_size = '1,2'
   cluster_config = 'CONFIG_YAML'
   cookbook_env = 'FAKE_COOKBOOK_PATH'
+  force_configuration_extra_args = ' --force-configuration'
 
   describe 'block_topology:configure' do
-    for_all_oses do |platform, version|
-      context "on #{platform}#{version}" do
-        cached(:chef_run) do
-          runner = ChefSpec::SoloRunner.new(
-            platform: platform,
-            version: version,
-            step_into: ['block_topology']
-          ) do |node|
-            node.override['cluster']['node_type'] = 'HeadNode'
-            node.override['cluster']['scripts_dir'] = script_dir
-            node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
-            node.override['cluster']['p6egb200_block_sizes'] = block_sizes
-            node.override['cluster']['cluster_config_path'] = cluster_config
-          end
-          allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
-          allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
-          ConvergeBlockTopology.configure(runner)
-          runner
-        end
-
-        if platform == 'amazon' && version == '2'
-          it 'does not configures block_topology' do
-            expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
-            expect(chef_run).not_to run_execute('generate_topology_config')
-          end
-        else
-          it 'creates the topology configuration template' do
-            expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
-              .with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
-              .with(user: 'root')
-              .with(group: 'root')
-              .with(mode: '0644')
-          end
-
-          it 'generates topology config when block sizes are present' do
-            expect(chef_run).to run_execute('generate_topology_config')
-              .with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
-                             " --output-file #{slurm_install_dir}/etc/topology.conf" \
-                             " --block-sizes #{block_sizes}" \
-                             " --input-file #{cluster_config}")
-          end
-        end
-      end
-    end
-  end
-
-  describe 'block_topology:update' do
-    for_all_oses do |platform, version|
-      ['--cleannup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
+    ['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
+      for_all_oses do |platform, version|
         context "on #{platform}#{version}" do
           cached(:chef_run) do
             runner = ChefSpec::SoloRunner.new(
@@ -86,18 +41,18 @@ def self.update(chef_run)
             node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
             node.override['cluster']['p6egb200_block_sizes'] = block_sizes
             node.override['cluster']['cluster_config_path'] = cluster_config
+            node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
           end
           allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
-          allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
           allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
-          ConvergeBlockTopology.update(runner)
+          ConvergeBlockTopology.configure(runner)
           runner
         end
 
         if platform == 'amazon' && version == '2'
           it 'does not configures block_topology' do
             expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
-            expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+            expect(chef_run).not_to run_execute('generate_topology_config')
           end
         else
           it 'creates the topology configuration template' do
@@ -107,13 +62,86 @@ def self.update(chef_run)
              .with(group: 'root')
              .with(mode: '0644')
           end
+          command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
+                    " --output-file #{slurm_install_dir}/etc/topology.conf" \
+                    " --block-sizes #{block_sizes}" \
+                    " --input-file #{cluster_config}"
+          command_to_exe = if ['true', 'yes', true].include?(force_configuration)
+                             "#{command}#{force_configuration_extra_args}"
+                           else
+                             "#{command}"
+                           end
+          it 'generates topology config when block sizes are present' do
+            expect(chef_run).to run_execute('generate_topology_config')
+              .with(command: command_to_exe)
+          end
+        end
+      end
+    end
+  end
+
+  describe 'block_topology:update' do
+    ['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
+      for_all_oses do |platform, version|
+        ['--cleannup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
+          context "on #{platform}#{version}" do
+            cached(:chef_run) do
+              runner = ChefSpec::SoloRunner.new(
+                platform: platform,
+                version: version,
+                step_into: ['block_topology']
+              ) do |node|
+                node.override['cluster']['node_type'] = 'HeadNode'
+                node.override['cluster']['scripts_dir'] = script_dir
+                node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
+                node.override['cluster']['p6egb200_block_sizes'] = block_sizes
+                node.override['cluster']['cluster_config_path'] = cluster_config
+                node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
+              end
+              allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
+              allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
+              allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
+              ConvergeBlockTopology.update(runner)
+              runner
+            end
+
+            if platform == 'amazon' && version == '2'
+              it 'does not configures block_topology' do
+                expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+                expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+              end
+            else
+              command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
+                        " --output-file #{slurm_install_dir}/etc/topology.conf" \
+                        " --input-file #{cluster_config}"\
+                        "#{topo_command_args}"
+              command_to_exe = if ['true', 'yes', true].include?(force_configuration)
+                                 "#{command}#{force_configuration_extra_args}"
+                               else
+                                 "#{command}"
+                               end
+
+              it 'creates the topology configuration template' do
+                expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+                  .with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0644')
+              end
+
+              if topo_command_args.nil?
+                it 'update or cleanup topology.conf when block sizes are present' do
+                  expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+                    .with(command: command_to_exe)
+                end
+              else
+                it 'update or cleanup topology.conf when block sizes are present' do
+                  expect(chef_run).to run_execute('update or cleanup topology.conf')
+                    .with(command: command_to_exe)
+                end
+              end
 
-            it 'update or cleanup topology.conf when block sizes are present' do
-              expect(chef_run).to run_execute('update or cleanup topology.conf')
-                .with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
-                               " --output-file #{slurm_install_dir}/etc/topology.conf" \
-                               " --input-file #{cluster_config}"\
-                               "#{topo_command_args}")
-            end
           end
         end
       end
@@ -127,24 +155,58 @@ def self.update(chef_run)
       cached(:chef_run) do
         runner(platform: platform, version: version, step_into: ['block_topology']) do |node|
           node.override['cluster']['p6egb200_block_sizes'] = nil
+          node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
         end
       end
       cached(:resource) do
         ConvergeBlockTopology.update(chef_run)
         chef_run.find_resource('block_topology', 'update')
       end
 
-      context "when queues are not updated and topolog.conf does not exists" do
+      context "when capacity block is removed and topolog.conf does exists" do
+        before do
+          allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(true)
+          chef_run.node.override['cluster']['p6egb200_block_sizes'] = nil
+        end
+
+        it 'returns cleanup' do
+          expect(resource.topology_generator_command_args).to eq(" --cleanup")
+        end
+      end
+
+      context "when capacity block is not used and topolog.conf does not exists" do
         before do
-          allow_any_instance_of(Object).to receive(:are_queues_updated?).and_return(false)
           allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(false)
+          chef_run.node.override['cluster']['p6egb200_block_sizes'] = nil
         end
 
         it 'it gives nil' do
           expect(resource.topology_generator_command_args).to eq(nil)
         end
       end
 
+      context "when capacity block is updated and topolog.conf does not exists" do
+        before do
+          allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(false)
+          chef_run.node.override['cluster']['p6egb200_block_sizes'] = block_sizes
+        end
+
+        it 'returns block-sizes argument' do
+          expect(resource.topology_generator_command_args).to eq(" --block-sizes #{block_sizes}")
+        end
+      end
+
+      context "when capacity block is updated and topolog.conf does exists" do
+        before do
+          allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(true)
+          chef_run.node.override['cluster']['p6egb200_block_sizes'] = new_block_size
+        end
+
+        it 'returns block-sizes argument' do
+          expect(resource.topology_generator_command_args).to eq(" --block-sizes #{new_block_size}")
+        end
+      end
+
       context "when block sizes is not nil" do
         before do
           chef_run.node.override['cluster']['p6egb200_block_sizes'] = block_sizes