Skip to content

Commit 356004f

Browse files
author
Himani Anil Deshpande
committed
Add TOTAL_MIN_COUNT of a cluster as comment
* Run static fleet checks only if there are any static nodes, adding this as a condition
1 parent be4d2c8 commit 356004f

File tree

3 files changed

+25
-6
lines changed

3 files changed

+25
-6
lines changed
Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
#!/bin/bash
22

3-
sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$')
4-
while IFS= read -r line; do
5-
nodelist=$(echo "$line" | awk '{print $1}')
6-
<%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; }
7-
done <<< "$sinfo_output"
3+
4+
5+
# Prints the hostnames of static nodes that are not yet in a ready state.
# Argument 1 (optional): the number of static nodes in the cluster.
cluster_static_node_count=$1
6+
# No argument supplied: fall back to 1 so the check below still runs.
if [[ -z "$cluster_static_node_count" ]]; then
7+
cluster_static_node_count=1
8+
fi
9+
10+
# Only query Slurm when at least one static node is expected.
if [[ "$cluster_static_node_count" -ge "1" ]]; then
11+
# Keep only the sinfo rows whose state does not end in idle, alloc, mix or maint
# (a suffixed state such as "idle~" is therefore kept).
sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$')
12+
while IFS= read -r line; do
13+
# The first field of a '%N %t' row is the node list.
nodelist=$(echo "$line" | awk '{print $1}')
14+
# Expand the node list to individual hostnames and print those containing "-st-";
# "|| true" keeps a grep with no matches from yielding a non-zero status.
<%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; }
15+
done <<< "$sinfo_output"
16+
fi

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,20 @@ include {{ output_dir }}/pcluster/slurm_parallelcluster_{{ queue.Name }}_partiti
3636
{% endfor %}
3737

3838
{% if ns.has_static %}
39+
{%- set ns.total_min_count = 0 %}
3940
SuspendExcNodes=
4041
{%- set ns.is_first = True %}
4142
{%- for queue in queues %}
4243
{% for compute_resource in queue.ComputeResources %}
4344
{% if compute_resource.MinCount > 0 %}
4445
{{- "," if not ns.is_first else "" -}}
4546
{{ queue.Name }}-st-{{ compute_resource.Name }}-[1-{{ compute_resource.MinCount }}]
47+
{%- set ns.total_min_count = ns.total_min_count + compute_resource.MinCount %}
4648
{%- set ns.is_first = False %}
4749
{%- endif %}
4850
{% endfor %}
4951
{% endfor %}
5052

5153
{% endif %}
54+
55+
#TOTAL_MIN_COUNT={{ ns.total_min_count }}

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,11 @@ def wait_cluster_ready
178178
end
179179
end
180180

181+
# Returns the value of the "#TOTAL_MIN_COUNT=<n>" marker found in
# <install_dir>/etc/slurm_parallelcluster.conf.
#
# The previous implementation shelled out to grep with the pattern
# "\([0-9]*\)" written inside a double-quoted Ruby string; Ruby dropped the
# backslashes, grep then treated the parentheses as literal characters, and
# the marker was never matched. Parsing the file in Ruby avoids that quoting
# layer altogether.
#
# @return [String] the digits following the marker, or an empty string when
#   the file cannot be read or contains no marker (so `.to_i` yields 0).
def get_static_node_count
  conf_path = "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster.conf"

  # String#[] with a capture index returns nil when nothing matches; to_s
  # turns that into the empty string callers already expect.
  File.read(conf_path)[/#TOTAL_MIN_COUNT=(\d*)/, 1].to_s
rescue SystemCallError
  # Missing or unreadable file: keep the original best-effort contract of
  # returning an empty string instead of raising.
  ''
end
185+
181186
def wait_static_fleet_running
182187
ruby_block "wait for static fleet capacity" do
183188
block do
@@ -203,11 +208,12 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
203208
fleet_status_command = Shellwords.escape(
204209
"/usr/local/bin/get-compute-fleet-status.sh"
205210
)
211+
206212
# Example output for sinfo
207213
# sinfo -h -o '%N %t'
208214
# queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
209215
# queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
210-
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
216+
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh #{get_static_node_count.to_i}").stdout.strip.empty?
211217
check_for_protected_mode(fleet_status_command)
212218

213219
Chef::Log.info("Waiting for static fleet capacity provisioning")

0 commit comments

Comments
 (0)