diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb
index d20fdeb0d..35aee326b 100644
--- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb
+++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb
@@ -25,3 +25,6 @@
 # Pyxis
 default['cluster']['pyxis']['version'] = '0.20.0'
 default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'
+
+# Block Topology Plugin
+default['cluster']['slurm']['block_topology']['force_configuration'] = false
diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py
index 858120eb3..ca009dc99 100644
--- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py
+++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py
@@ -22,7 +22,7 @@
 
 log = logging.getLogger()
 
-
+P6E_GB200 = "p6e-gb200"
 CAPACITY_TYPE_MAP = {
     "ONDEMAND": "on-demand",
     "SPOT": "spot",
@@ -49,7 +49,17 @@ def _load_cluster_config(input_file_path):
         return yaml.load(input_file, Loader=yaml.SafeLoader)
 
 
-def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str):  # noqa: C901
+def _is_capacity_block(capacity_type):
+    return capacity_type == CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK")
+
+
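+# Note: matching is done on the instance family (the token before the first
+# "."), so every size of the p6e-gb200 family is detected, not only
+# p6e-gb200.36xlarge as before.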
", + action="store_true", + ) cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf") cleanup_or_generate_exclusive_group.add_argument( "--cleanup", @@ -159,7 +175,7 @@ def main(): if args.cleanup: cleanup_topology_config_file(args.output_file) else: - generate_topology_config_file(args.output_file, args.input_file, args.block_sizes) + generate_topology_config_file(args.output_file, args.input_file, args.block_sizes, args.force_configuration) log.info("Completed Execution of ParallelCluster Topology Config Generator") except Exception as e: log.exception("Failed to generate Topology.conf, exception: %s", e) diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb index a3dd501aa..fe44cbacb 100644 --- a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb +++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb @@ -29,7 +29,8 @@ command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\ " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\ " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\ - " --input-file #{node['cluster']['cluster_config_path']}" + " --input-file #{node['cluster']['cluster_config_path']}"\ + "#{topology_generator_extra_args}" not_if { node['cluster']['p6egb200_block_sizes'].nil? } end end @@ -48,8 +49,9 @@ command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\ " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\ " --input-file #{node['cluster']['cluster_config_path']}"\ - "#{topology_generator_command_args}" - not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? } + "#{topology_generator_command_args}"\ + "#{topology_generator_extra_args}" + not_if { topology_generator_command_args.nil? 
+    parser.add_argument(
+        "--force-configuration",
+        help="Force creation of topology.conf, ignoring the Capacity Block and Instance Type checks.",
+        action="store_true",
+    )
     cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
     cleanup_or_generate_exclusive_group.add_argument(
         "--cleanup",
@@ -159,7 +175,7 @@ def main():
         if args.cleanup:
             cleanup_topology_config_file(args.output_file)
         else:
-            generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
+            generate_topology_config_file(args.output_file, args.input_file, args.block_sizes, args.force_configuration)
         log.info("Completed Execution of ParallelCluster Topology Config Generator")
     except Exception as e:
         log.exception("Failed to generate Topology.conf, exception: %s", e)
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb
index a3dd501aa..fe44cbacb 100644
--- a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb
+++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb
@@ -29,7 +29,8 @@
     command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
             " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
             " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
-            " --input-file #{node['cluster']['cluster_config_path']}"
+            " --input-file #{node['cluster']['cluster_config_path']}"\
+            "#{topology_generator_extra_args}"
     not_if { node['cluster']['p6egb200_block_sizes'].nil? }
   end
 end
@@ -48,8 +49,9 @@
     command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
             " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
             " --input-file #{node['cluster']['cluster_config_path']}"\
-            "#{topology_generator_command_args}"
-    not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
+            "#{topology_generator_command_args}"\
+            "#{topology_generator_extra_args}"
+    not_if { topology_generator_command_args.nil? }
   end
 end
@@ -68,3 +70,9 @@ def topology_generator_command_args
     " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
   end
 end
+
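+# Returns " --force-configuration" when the force_configuration attribute is
+# truthy ('true', 'yes', or true); otherwise returns nil, so nothing is
+# appended to the command line.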
+def topology_generator_extra_args
+  if ['true', 'yes', true].include?(node['cluster']['slurm']['block_topology']['force_configuration'])
+    " --force-configuration"
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb
index a94391895..4e0310daa 100644
--- a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb
@@ -23,57 +23,11 @@ def self.update(chef_run)
 block_sizes = '9,18'
 cluster_config = 'CONFIG_YAML'
 cookbook_env = 'FAKE_COOKBOOK_PATH'
+force_configuration_extra_args = ' --force-configuration'
 
 describe 'block_topology:configure' do
-  for_all_oses do |platform, version|
-    context "on #{platform}#{version}" do
-      cached(:chef_run) do
-        runner = ChefSpec::SoloRunner.new(
-          platform: platform,
-          version: version,
-          step_into: ['block_topology']
-        ) do |node|
-          node.override['cluster']['node_type'] = 'HeadNode'
-          node.override['cluster']['scripts_dir'] = script_dir
-          node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
-          node.override['cluster']['p6egb200_block_sizes'] = block_sizes
-          node.override['cluster']['cluster_config_path'] = cluster_config
-        end
-        allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
-        allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
-        ConvergeBlockTopology.configure(runner)
-        runner
-      end
-
-      if platform == 'amazon' && version == '2'
-        it 'does not configures block_topology' do
-          expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
-          expect(chef_run).not_to run_execute('generate_topology_config')
-        end
-      else
-        it 'creates the topology configuration template' do
-          expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
-            .with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
-            .with(user: 'root')
-            .with(group: 'root')
-            .with(mode: '0644')
-        end
-
-        it 'generates topology config when block sizes are present' do
-          expect(chef_run).to run_execute('generate_topology_config')
-            .with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
-                           " --output-file #{slurm_install_dir}/etc/topology.conf" \
-                           " --block-sizes #{block_sizes}" \
-                           " --input-file #{cluster_config}")
-        end
-      end
-    end
-  end
-end
-
-describe 'block_topology:update' do
-  for_all_oses do |platform, version|
-    ['--cleannup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
+  ['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
+    for_all_oses do |platform, version|
       context "on #{platform}#{version}" do
         cached(:chef_run) do
           runner = ChefSpec::SoloRunner.new(
@@ -86,18 +40,18 @@ def self.update(chef_run)
             node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
             node.override['cluster']['p6egb200_block_sizes'] = block_sizes
             node.override['cluster']['cluster_config_path'] = cluster_config
+            node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
           end
           allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
-          allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
           allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
-          ConvergeBlockTopology.update(runner)
+          ConvergeBlockTopology.configure(runner)
           runner
         end
 
         if platform == 'amazon' && version == '2'
           it 'does not configures block_topology' do
             expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
-            expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+            expect(chef_run).not_to run_execute('generate_topology_config')
           end
         else
           it 'creates the topology configuration template' do
@@ -107,13 +61,86 @@ def self.update(chef_run)
             .with(group: 'root')
             .with(mode: '0644')
           end
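+          # Expected command mirrors topology_generator_extra_args:
+          # --force-configuration is appended only for truthy attribute values.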
+          command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
+                    " --output-file #{slurm_install_dir}/etc/topology.conf" \
+                    " --block-sizes #{block_sizes}" \
+                    " --input-file #{cluster_config}"
+          command_to_exe = if ['true', 'yes', true].include?(force_configuration)
+                             "#{command}#{force_configuration_extra_args}"
+                           else
+                             command
+                           end
+          it 'generates topology config when block sizes are present' do
+            expect(chef_run).to run_execute('generate_topology_config')
+              .with(command: command_to_exe)
+          end
+        end
+      end
+    end
+  end
+end
+
+describe 'block_topology:update' do
+  ['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
+    for_all_oses do |platform, version|
+      ['--cleanup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
+        context "on #{platform}#{version}" do
+          cached(:chef_run) do
+            runner = ChefSpec::SoloRunner.new(
+              platform: platform,
+              version: version,
+              step_into: ['block_topology']
+            ) do |node|
+              node.override['cluster']['node_type'] = 'HeadNode'
+              node.override['cluster']['scripts_dir'] = script_dir
+              node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
+              node.override['cluster']['p6egb200_block_sizes'] = block_sizes
+              node.override['cluster']['cluster_config_path'] = cluster_config
+              node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
+            end
+            allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
+            allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
+            allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
+            ConvergeBlockTopology.update(runner)
+            runner
+          end
+
+          if platform == 'amazon' && version == '2'
+            it 'does not configure block_topology' do
+              expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+              expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+            end
+          else
+            command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
+                      " --output-file #{slurm_install_dir}/etc/topology.conf" \
+                      " --input-file #{cluster_config}"\
+                      "#{topo_command_args}"
+            command_to_exe = if ['true', 'yes', true].include?(force_configuration)
+                               "#{command}#{force_configuration_extra_args}"
+                             else
+                               command
+                             end
+
+            it 'creates the topology configuration template' do
+              expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+                .with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
+                .with(user: 'root')
+                .with(group: 'root')
+                .with(mode: '0644')
+            end
+
+            if topo_command_args.nil?
+              it 'does not update or cleanup topology.conf when command args are nil' do
+                expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+                  .with(command: command_to_exe)
+              end
+            else
+              it 'updates or cleans up topology.conf when command args are present' do
+                expect(chef_run).to run_execute('update or cleanup topology.conf')
+                  .with(command: command_to_exe)
+              end
+            end
-
-          it 'update or cleanup topology.conf when block sizes are present' do
-            expect(chef_run).to run_execute('update or cleanup topology.conf')
-              .with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
-                             " --output-file #{slurm_install_dir}/etc/topology.conf" \
-                             " --input-file #{cluster_config}"\
-                             "#{topo_command_args}")
           end
         end
       end
diff --git a/test/unit/slurm/test_topology_generator.py b/test/unit/slurm/test_topology_generator.py
index 57a12780c..8ca5272c0 100644
--- a/test/unit/slurm/test_topology_generator.py
+++ b/test/unit/slurm/test_topology_generator.py
@@ -14,6 +14,8 @@
 import pytest
 from assertpy import assert_that
 from pcluster_topology_generator import (
+    _is_capacity_block,
+    _is_gb200,
     cleanup_topology_config_file,
     generate_topology_config_file,
 )
@@ -24,14 +26,23 @@ def _assert_files_are_equal(file, expected_file):
         assert_that(f.read()).is_equal_to(exp_f.read())
 
 
-@pytest.mark.parametrize("file_name_suffix", ["with_capacity_block", "no_capacity_block"])
-def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix):
+@pytest.mark.parametrize(
+    "file_name_suffix, force_configuration",
+    [
+        ("with_capacity_block", False),
+        ("no_capacity_block", False),
+        ("with_capacity_block", True),
+        ("no_capacity_block", True),
+    ],
+)
+def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix, force_configuration):
     block_sizes = "9,18" if "no" not in file_name_suffix else None
+    force_suffix = "_force" if force_configuration else ""
     file_name = "sample_" + file_name_suffix + ".yaml"
     input_file_path = str(test_datadir / file_name)
-    output_file_name = "topology_" + file_name_suffix + ".conf"
+    output_file_name = "topology_" + file_name_suffix + force_suffix + ".conf"
     output_file_path = f"{tmpdir}/{output_file_name}"
-    generate_topology_config_file(output_file_path, input_file_path, block_sizes)
+    generate_topology_config_file(output_file_path, input_file_path, block_sizes, force_configuration)
     if "no" in file_name_suffix:
         assert_that(os.path.isfile(output_file_path)).is_equal_to(False)
     else:
@@ -48,3 +59,30 @@ def test_cleanup_topology_config_file(mocker, tmpdir, file_exists):
         mock_remove.assert_called_once_with(str(topology_file_path))
     else:
         mock_remove.assert_not_called()
+
+
+@pytest.mark.parametrize(
+    "capacity_type, expected_result",
+    [
+        ("capacity-block", True),
+        ("on-demand", False),
+        ("spot", False),
+        ("any-value", False),
+        ("bla-capacity-block-bla", False),
+    ],
+)
+def test_is_capacity_block(capacity_type, expected_result):
+    assert_that(_is_capacity_block(capacity_type)).is_equal_to(expected_result)
+
+
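+# Near-miss families ("NOTp6e-gb200", "p6e-b200") must not match: only the
+# exact "p6e-gb200" token before the first "." counts.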
+@pytest.mark.parametrize(
+    "instance_type, expected_result",
+    [
+        ("p6e-gb200.ANY_SIZE", True),
+        ("NOTp6e-gb200.ANY_SIZE", False),
+        ("p6e-b200.ANY_SIZE", False),
+        ("any-value", False),
+    ],
+)
+def test_is_gb200(instance_type, expected_result):
+    assert_that(_is_gb200(instance_type)).is_equal_to(expected_result)
diff --git a/test/unit/slurm/test_topology_generator/test_generate_topology_config/expected_outputs/topology_with_capacity_block_force.conf b/test/unit/slurm/test_topology_generator/test_generate_topology_config/expected_outputs/topology_with_capacity_block_force.conf
new file mode 100644
index 000000000..b284e8f8b
--- /dev/null
+++ b/test/unit/slurm/test_topology_generator/test_generate_topology_config/expected_outputs/topology_with_capacity_block_force.conf
@@ -0,0 +1,9 @@
+# This file is automatically generated by pcluster
+
+BlockName=Block1 Nodes=multiple_spot-st-multiplespot-2-[1-9]
+BlockName=Block2 Nodes=gpu-st-gpu-p3dn24xlarge-[1-9]
+BlockName=Block3 Nodes=capacity-block-queue1-st-cb-gb200-1-[1-9]
+BlockName=Block4 Nodes=capacity-block-queue1-st-fleet-no-res-[1-18]
+BlockName=Block5 Nodes=capacity-block-queue2-st-cb-gb200-2-[1-18]
+BlockName=Block6 Nodes=capacity-block-queue2-st-cb-gb200-3-[1-9]
+BlockSizes=9,18
diff --git a/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml b/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml
index 14ca792f3..5ffb29e28 100644
--- a/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml
+++ b/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml
@@ -18,8 +18,8 @@ Scheduling:
           Enabled: false
         GdrSupport: false
         InstanceType: c5.2xlarge
-        MaxCount: 5
-        MinCount: 5
+        MaxCount: 9
+        MinCount: 9
         Name: multiplespot-2
         SpotPrice: 1.5
         StaticNodePriority: 1
@@ -69,8 +69,8 @@ Scheduling:
           Enabled: false
         GdrSupport: false
         InstanceType: p3dn.24xlarge
-        MaxCount: 10
-        MinCount: 10
+        MaxCount: 9
+        MinCount: 9
         Name: gpu-p3dn24xlarge
         SpotPrice: null
         StaticNodePriority: 1
@@ -106,8 +106,8 @@ Scheduling:
         Instances:
         - InstanceType: c5n.4xlarge
         - InstanceType: r5.4xlarge
-        MaxCount: 10
-        MinCount: 10
+        MaxCount: 18
+        MinCount: 18
         Name: fleet-no-res
         SchedulableMemory: null
         SpotPrice: null