Skip to content

Commit 33287b8

Browse files
author
Himani Anil Deshpande
committed
[SlurmTopo] We clean up or generate the topology only if p6egb200_block_sizes exists
1 parent 22234f5 commit 33287b8

File tree

2 files changed

+35
-6
lines changed

2 files changed

+35
-6
lines changed

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
import yaml
1414
import logging
1515
import traceback
16+
import os
1617
log = logging.getLogger()
1718

1819

@@ -118,22 +119,42 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
118119
log.info("Finished.")
119120

120121

122+
def cleanup_topology_config_file(file_path):
123+
"""Cleanup topology.conf file."""
124+
try:
125+
if os.path.exists(file_path):
126+
log.info("Cleaning up %s", file_path)
127+
os.remove(file_path)
128+
except Exception as err:
129+
log.warning("Unable to delete %s due to %s", file_path, err)
130+
131+
121132
def main():
122133
try:
123134
logging.basicConfig(
124135
level=logging.INFO, format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s"
125136
)
126137
log.info("Running ParallelCluster Topology Config Generator")
127138
parser = argparse.ArgumentParser(description="Take in Topology configuration generator related parameters")
139+
cleanup_or_generate_exclusive_group = parser.add_mutually_exclusive_group(required=True)
128140
parser.add_argument("--output-file", help="The output file for generated topology.conf", required=True)
129141
parser.add_argument(
130142
"--input-file",
131143
help="Yaml file containing pcluster CLI configuration file with default values",
132144
required=True,
133145
)
134-
parser.add_argument("--block-sizes", help="Block Size of topology.conf", required=True)
146+
cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
147+
cleanup_or_generate_exclusive_group.add_argument(
148+
"--cleanup",
149+
action="store_true",
150+
help="Cleanup topology.conf",
151+
)
135152
args = parser.parse_args()
136-
generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
153+
if args.cleanup:
154+
cleanup_topology_config_file(args.output_file)
155+
else:
156+
generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
157+
log.info("Completed Execution of ParallelCluster Topology Config Generator")
137158
except Exception as e:
138159
log.exception("Failed to generate Topology.conf, exception: %s", e)
139160
raise

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -161,14 +161,22 @@ def update_nodes_in_queue(strategy, queues)
161161
mode '0644'
162162
end
163163

164+
if node['cluster']['p6egb200_block_sizes'].nil? && are_queues_updated? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
165+
# If topology.conf exists and the Capacity Block has been removed, we clean it up
166+
topology_generator_command_args = " --cleanup"
167+
elsif node['cluster']['p6egb200_block_sizes'].nil? && !are_queues_updated?
168+
# We do nothing if p6e-gb200 is not used and queues are not updated
169+
topology_generator_command_args = nil
170+
else
171+
topology_generator_command_args = " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
172+
end
164173
# Update Slurm topology.conf file
165174
execute "update or cleanup topology.conf" do
166175
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
167176
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
168-
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
169-
" --input-file #{node['cluster']['cluster_config_path']}"
170-
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && node['cluster']['p6egb200_block_sizes'].nil? }
171-
#TODO: Need to remove topology.conf if CB is removed
177+
" --input-file #{node['cluster']['cluster_config_path']}"\
178+
"#{topology_generator_command_args}"
179+
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
172180
end
173181
end
174182
not_if { platform?('amazon') && node['platform_version'] == "2" }

0 commit comments

Comments
 (0)