Skip to content

Commit 478eb39

Browse files
Himani Anil Deshpandehimani2411
authored and committed
Not running is_fleet_ready checks if static node count is 0
* Adding log for checking output * Using Ruby yaml for looping to get count of static nodes * Run static fleet checks if there are any static nodes
1 parent 0f73f4a commit 478eb39

File tree

1 file changed

+25
-4
lines changed
  • cookbooks/aws-parallelcluster-slurm/libraries

1 file changed

+25
-4
lines changed

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,21 @@ def wait_cluster_ready
178178
end
179179
end
180180

181+
# Sums the MinCount of every compute resource in every Slurm queue of the
# cluster configuration file (the total the caller logs as the count of
# cluster static nodes).
#
# The configuration is read from the YAML file located at
# node['cluster']['cluster_config_path'].
#
# @return [Integer] sum of all MinCount values; 0 when the file is empty,
#   when it has no Scheduling/SlurmQueues section, or when no compute
#   resource declares a MinCount.
def get_static_node_count
  # Lazy-loaded on purpose: only this helper needs the YAML parser.
  require 'yaml'

  cluster_config = YAML.safe_load(File.read(node['cluster']['cluster_config_path']))
  # An empty YAML document parses to nil/false rather than a Hash.
  return 0 unless cluster_config.is_a?(Hash)

  slurm_queues = cluster_config.dig('Scheduling', 'SlurmQueues') || []
  slurm_queues.sum do |queue_config|
    # A queue without ComputeResources contributes no static nodes.
    compute_resources = queue_config['ComputeResources'] || []
    # nil.to_i is 0, so a compute resource without MinCount adds nothing.
    compute_resources.sum { |compute_resource_config| compute_resource_config['MinCount'].to_i }
  end
end
195+
181196
def wait_static_fleet_running
182197
ruby_block "wait for static fleet capacity" do
183198
block do
@@ -203,15 +218,21 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
203218
fleet_status_command = Shellwords.escape(
204219
"/usr/local/bin/get-compute-fleet-status.sh"
205220
)
221+
222+
total_static_node_count = get_static_node_count
223+
Chef::Log.info("Count of cluster static nodes is #{total_static_node_count}")
224+
206225
# Example output for sinfo
207226
# sinfo -h -o '%N %t'
208227
# queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
209228
# queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
210-
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
211-
check_for_protected_mode(fleet_status_command)
229+
if total_static_node_count.to_i > 0
230+
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
231+
check_for_protected_mode(fleet_status_command)
212232

213-
Chef::Log.info("Waiting for static fleet capacity provisioning")
214-
sleep(15)
233+
Chef::Log.info("Waiting for static fleet capacity provisioning")
234+
sleep(15)
235+
end
215236
end
216237
Chef::Log.info("Static fleet capacity is ready")
217238
end

0 commit comments

Comments
 (0)