@@ -25,3 +25,6 @@
# Pyxis
default['cluster']['pyxis']['version'] = '0.20.0'
default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'

# Block Topology Plugin
default['cluster']['slurm']['block_topology']['force_configuration'] = false
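# The new attribute defaults to false; an illustrative override (not part of
# this PR) would enable forced generation — the recipe helper further below
# treats 'true', 'yes', or boolean true as enabled:
# override['cluster']['slurm']['block_topology']['force_configuration'] = true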
@@ -22,7 +22,7 @@

log = logging.getLogger()


P6E_GB200 = "p6e-gb200"
CAPACITY_TYPE_MAP = {
"ONDEMAND": "on-demand",
"SPOT": "spot",
@@ -49,7 +49,17 @@ def _load_cluster_config(input_file_path):
return yaml.load(input_file, Loader=yaml.SafeLoader)


def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str): # noqa: C901
def _is_capacity_block(capacity_type):
return capacity_type == CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK")


def _is_gb200(instance_type):
return instance_type is not None and instance_type.split(".")[0] == P6E_GB200

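# Illustrative expected values for the two helpers above (assumed here; they
# mirror the unit tests added at the end of this PR):
#   _is_capacity_block("capacity-block")   -> True
#   _is_capacity_block("on-demand")        -> False
#   _is_gb200("p6e-gb200.36xlarge")        -> True   (the prefix before the first "." is compared)
#   _is_gb200("p6e-b200.48xlarge")         -> False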

def generate_topology_config_file( # noqa: C901
output_file: str, input_file: str, block_sizes: str, force_configuration: bool
):
"""
Generate Topology configuration file.

@@ -74,7 +84,8 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes

# Retrieve capacity info from the queue_name, if present
queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
if not _is_capacity_block(queue_capacity_type) and not force_configuration:
# We ignore this check when the force_configuration option is used.
log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
continue

@@ -88,7 +99,7 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
continue

# Check if the reservation is for NVLink and the size matches min_block_size_list
if compute_resource_config.get("InstanceType") == "p6e-gb200.36xlarge":
if _is_gb200(compute_resource_config.get("InstanceType")) or force_configuration:
if min_block_size_list == compute_min_count or max_block_size_list == compute_max_count:
block_count += 1
# Each Capacity Reservation ID is a Capacity Block,
@@ -149,6 +160,11 @@ def main():
help="Yaml file containing pcluster CLI configuration file with default values",
required=True,
)
parser.add_argument(
"--force-configuration",
help="Force creation of topology.conf by ignoring the checks of Capacity Block and Instance Type. ",
action="store_true",
)
cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
cleanup_or_generate_exclusive_group.add_argument(
"--cleanup",
@@ -159,7 +175,7 @@
if args.cleanup:
cleanup_topology_config_file(args.output_file)
else:
generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
generate_topology_config_file(args.output_file, args.input_file, args.block_sizes, args.force_configuration)
log.info("Completed Execution of ParallelCluster Topology Config Generator")
except Exception as e:
log.exception("Failed to generate Topology.conf, exception: %s", e)
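Taken together, the generator now accepts the new flag alongside the existing options; a sketch of an invocation with placeholder paths (the flags are from this PR, the paths and sizes are assumptions):

python pcluster_topology_generator.py \
    --output-file /opt/slurm/etc/topology.conf \
    --input-file cluster-config.yaml \
    --block-sizes 9,18 \
    --force-configuration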
@@ -29,7 +29,8 @@
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
" --input-file #{node['cluster']['cluster_config_path']}"
" --input-file #{node['cluster']['cluster_config_path']}"\
"#{topology_generator_extra_args}"
not_if { node['cluster']['p6egb200_block_sizes'].nil? }
end
end
@@ -48,8 +49,9 @@
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
" --input-file #{node['cluster']['cluster_config_path']}"\
"#{topology_generator_command_args}"
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
"#{topology_generator_command_args}"\
"#{topology_generator_extra_args}"
not_if { topology_generator_command_args.nil? }
Contributor: Why was the previous_cluster_config_path check removed?

Contributor (author): We don't need to check for the existence of this file, and this check is what causes the errors in our integ tests, since ['cluster']['previous_cluster_config_path'] will exist in every update.

Contributor (author): I will add a follow-up PR to solve the cluster update issue, as this change doesn't solve it.

end
end

@@ -68,3 +70,9 @@ def topology_generator_command_args
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
end
end

def topology_generator_extra_args
if ['true', 'yes', true].include?(node['cluster']['slurm']['block_topology']['force_configuration'])
" --force-configuration"
end
end
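Note that when the attribute is not truthy the helper returns nil, and Ruby interpolates nil into a string as an empty string, so the commands above simply omit the flag.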
@@ -23,57 +23,11 @@ def self.update(chef_run)
block_sizes = '9,18'
cluster_config = 'CONFIG_YAML'
cookbook_env = 'FAKE_COOKBOOK_PATH'
force_configuration_extra_args = ' --force-configuration'

describe 'block_topology:configure' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:chef_run) do
runner = ChefSpec::SoloRunner.new(
platform: platform,
version: version,
step_into: ['block_topology']
) do |node|
node.override['cluster']['node_type'] = 'HeadNode'
node.override['cluster']['scripts_dir'] = script_dir
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
node.override['cluster']['p6egb200_block_sizes'] = block_sizes
node.override['cluster']['cluster_config_path'] = cluster_config
end
allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
ConvergeBlockTopology.configure(runner)
runner
end

if platform == 'amazon' && version == '2'
it 'does not configure block_topology' do
expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
expect(chef_run).not_to run_execute('generate_topology_config')
end
else
it 'creates the topology configuration template' do
expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
.with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
end

it 'generates topology config when block sizes are present' do
expect(chef_run).to run_execute('generate_topology_config')
.with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
" --output-file #{slurm_install_dir}/etc/topology.conf" \
" --block-sizes #{block_sizes}" \
" --input-file #{cluster_config}")
end
end
end
end
end

describe 'block_topology:update' do
for_all_oses do |platform, version|
['--cleanup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:chef_run) do
runner = ChefSpec::SoloRunner.new(
@@ -86,18 +40,18 @@ def self.update(chef_run)
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
node.override['cluster']['p6egb200_block_sizes'] = block_sizes
node.override['cluster']['cluster_config_path'] = cluster_config
node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
end
allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
ConvergeBlockTopology.update(runner)
ConvergeBlockTopology.configure(runner)
runner
end

if platform == 'amazon' && version == '2'
it 'does not configure block_topology' do
expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
expect(chef_run).not_to run_execute('update or cleanup topology.conf')
expect(chef_run).not_to run_execute('generate_topology_config')
end
else
it 'creates the topology configuration template' do
@@ -107,13 +61,86 @@ def self.update(chef_run)
.with(group: 'root')
.with(mode: '0644')
end
command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
" --output-file #{slurm_install_dir}/etc/topology.conf" \
" --block-sizes #{block_sizes}" \
" --input-file #{cluster_config}"
command_to_exe = if ['true', 'yes', true].include?(force_configuration)
"#{command}#{force_configuration_extra_args}"
else
"#{command}"
end
it 'generates topology config when block sizes are present' do
expect(chef_run).to run_execute('generate_topology_config')
.with(command: command_to_exe)
end
end
end
end
end
end

describe 'block_topology:update' do
['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
for_all_oses do |platform, version|
['--cleanup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
context "on #{platform}#{version}" do
cached(:chef_run) do
runner = ChefSpec::SoloRunner.new(
platform: platform,
version: version,
step_into: ['block_topology']
) do |node|
node.override['cluster']['node_type'] = 'HeadNode'
node.override['cluster']['scripts_dir'] = script_dir
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
node.override['cluster']['p6egb200_block_sizes'] = block_sizes
node.override['cluster']['cluster_config_path'] = cluster_config
node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
end
allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
ConvergeBlockTopology.update(runner)
runner
end

if platform == 'amazon' && version == '2'
it 'does not configure block_topology' do
expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
expect(chef_run).not_to run_execute('update or cleanup topology.conf')
end
else
command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
" --output-file #{slurm_install_dir}/etc/topology.conf" \
" --input-file #{cluster_config}"\
"#{topo_command_args}"
command_to_exe = if ['true', 'yes', true].include?(force_configuration)
"#{command}#{force_configuration_extra_args}"
else
"#{command}"
end

it 'creates the topology configuration template' do
expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
.with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
end

if topo_command_args.nil?
it 'does not update or cleanup topology.conf when command args are nil' do
expect(chef_run).not_to run_execute('update or cleanup topology.conf')
.with(command: command_to_exe)
end
else
it 'updates or cleans up topology.conf when command args are present' do
expect(chef_run).to run_execute('update or cleanup topology.conf')
.with(command: command_to_exe)
end
end

it 'update or cleanup topology.conf when block sizes are present' do
expect(chef_run).to run_execute('update or cleanup topology.conf')
.with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
" --output-file #{slurm_install_dir}/etc/topology.conf" \
" --input-file #{cluster_config}"\
"#{topo_command_args}")
end
end
end
46 changes: 42 additions & 4 deletions test/unit/slurm/test_topology_generator.py
@@ -14,6 +14,8 @@
import pytest
from assertpy import assert_that
from pcluster_topology_generator import (
_is_capacity_block,
_is_gb200,
cleanup_topology_config_file,
generate_topology_config_file,
)
@@ -24,14 +26,23 @@ def _assert_files_are_equal(file, expected_file):
assert_that(f.read()).is_equal_to(exp_f.read())


@pytest.mark.parametrize("file_name_suffix", ["with_capacity_block", "no_capacity_block"])
def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix):
@pytest.mark.parametrize(
"file_name_suffix, force_configuration",
[
("with_capacity_block", False),
("no_capacity_block", False),
("with_capacity_block", True),
("no_capacity_block", True),
],
)
def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix, force_configuration):
block_sizes = "9,18" if "no" not in file_name_suffix else None
force_suffix = "_force" if force_configuration else ""
file_name = "sample_" + file_name_suffix + ".yaml"
input_file_path = str(test_datadir / file_name)
output_file_name = "topology_" + file_name_suffix + ".conf"
output_file_name = "topology_" + file_name_suffix + force_suffix + ".conf"
output_file_path = f"{tmpdir}/{output_file_name}"
generate_topology_config_file(output_file_path, input_file_path, block_sizes)
generate_topology_config_file(output_file_path, input_file_path, block_sizes, force_configuration)
if "no" in file_name_suffix:
assert_that(os.path.isfile(output_file_path)).is_equal_to(False)
else:
@@ -48,3 +59,30 @@ def test_cleanup_topology_config_file(mocker, tmpdir, file_exists):
mock_remove.assert_called_once_with(str(topology_file_path))
else:
mock_remove.assert_not_called()


@pytest.mark.parametrize(
"capacity_type, expected_result",
[
("capacity-block", True),
("on-demand", False),
("spot", False),
("any-value", False),
("bla-capacity-block-bla", False),
],
)
def test_is_capacity_block(capacity_type, expected_result):
assert_that(_is_capacity_block(capacity_type)).is_equal_to(expected_result)


@pytest.mark.parametrize(
"instance_type, expected_result",
[
("p6e-gb200.ANY_SIZE", True),
("NOTp6e-gb200.ANY_SIZE", False),
("p6e-b200.ANY_SIZE", False),
("any-value", False),
],
)
def test_is_gb200(instance_type, expected_result):
assert_that(_is_gb200(instance_type)).is_equal_to(expected_result)
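A sketch of running just the new parametrized tests (pytest's -k name filter, matching test_is_gb200 and test_is_capacity_block above):

pytest test/unit/slurm/test_topology_generator.py -k "is_gb200 or is_capacity_block"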
@@ -0,0 +1,9 @@
# This file is automatically generated by pcluster

BlockName=Block1 Nodes=multiple_spot-st-multiplespot-2-[1-9]
BlockName=Block2 Nodes=gpu-st-gpu-p3dn24xlarge-[1-9]
BlockName=Block3 Nodes=capacity-block-queue1-st-cb-gb200-1-[1-9]
BlockName=Block4 Nodes=capacity-block-queue1-st-fleet-no-res-[1-18]
BlockName=Block5 Nodes=capacity-block-queue2-st-cb-gb200-2-[1-18]
BlockName=Block6 Nodes=capacity-block-queue2-st-cb-gb200-3-[1-9]
BlockSizes=9,18
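For reference, each BlockName line in the generated file groups the static nodes of one Capacity Block into a Slurm topology block, and BlockSizes echoes the values passed via --block-sizes (here 9,18).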
@@ -18,8 +18,8 @@ Scheduling:
Enabled: false
GdrSupport: false
InstanceType: c5.2xlarge
MaxCount: 5
MinCount: 5
MaxCount: 9
MinCount: 9
Name: multiplespot-2
SpotPrice: 1.5
StaticNodePriority: 1
@@ -69,8 +69,8 @@ Scheduling:
Enabled: false
GdrSupport: false
InstanceType: p3dn.24xlarge
MaxCount: 10
MinCount: 10
MaxCount: 9
MinCount: 9
Name: gpu-p3dn24xlarge
SpotPrice: null
StaticNodePriority: 1
@@ -106,8 +106,8 @@ Scheduling:
Instances:
- InstanceType: c5n.4xlarge
- InstanceType: r5.4xlarge
MaxCount: 10
MinCount: 10
MaxCount: 18
MinCount: 18
Name: fleet-no-res
SchedulableMemory: null
SpotPrice: null