File tree Expand file tree Collapse file tree 3 files changed +25
-6
lines changed
aws-parallelcluster-computefleet/templates/compute_fleet_status
aws-parallelcluster-slurm
files/default/head_node_slurm/slurm/templates Expand file tree Collapse file tree 3 files changed +25
-6
lines changed Original file line number Diff line number Diff line change 11#! /bin/bash
22
3- sinfo_output=$( < %= node[' cluster' ][' slurm' ][' install_dir' ] %> /bin/sinfo -h -o ' %N %t' | grep -v -E ' (idle|alloc|mix|maint)$' )
4- while IFS= read -r line; do
5- nodelist=$( echo " $line " | awk ' {print $1}' )
6- < %= node[' cluster' ][' slurm' ][' install_dir' ] %> /bin/scontrol show hostnames " $nodelist " | { grep -E ' ^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true ; }
7- done <<< " $sinfo_output"
3+
4+
# Number of static nodes expected in the cluster, taken from the first
# positional argument. When the argument is missing or empty it falls back
# to 1 so that the readiness check below still runs.
cluster_static_node_count=$1
if [[ -z "$cluster_static_node_count" ]]; then
  cluster_static_node_count=1
fi

# Only inspect the fleet when at least one static node is expected; with a
# count of 0 the script prints nothing.
if [[ "$cluster_static_node_count" -ge "1" ]]; then
  # Keep only the sinfo rows whose state is NOT one of idle/alloc/mix/maint.
  sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$')
  while IFS= read -r line; do
    # First column of the sinfo row is the (possibly compressed) node list.
    nodelist=$(echo "$line" | awk '{print $1}')
    # Expand the node list to individual hostnames and print only the static
    # ("-st-") ones. `|| true` stops a no-match grep from returning a failure
    # status for the pipeline.
    <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; }
  done <<< "$sinfo_output"
fi
Original file line number Diff line number Diff line change @@ -36,16 +36,20 @@ include {{ output_dir }}/pcluster/slurm_parallelcluster_{{ queue.Name }}_partiti
3636{% endfor %}
3737
3838{% if ns.has_static %}
39+ {%- set ns.total_min_count = 0 %}
3940SuspendExcNodes=
4041 {%- set ns.is_first = True %}
4142 {%- for queue in queues %}
4243 {% for compute_resource in queue.ComputeResources %}
4344 {% if compute_resource.MinCount > 0 %}
4445{{- "," if not ns.is_first else "" -}}
4546{{ queue.Name }}-st-{{ compute_resource.Name }}-[1-{{ compute_resource.MinCount }}]
47+ {%- set ns.total_min_count = ns.total_min_count + compute_resource.MinCount %}
4648 {%- set ns.is_first = False %}
4749 {%- endif %}
4850 {% endfor %}
4951 {% endfor %}
5052
5153{% endif %}
54+
55+ #TOTAL_MIN_COUNT={{ ns.total_min_count }}
Original file line number Diff line number Diff line change @@ -178,6 +178,11 @@ def wait_cluster_ready
178178 end
179179end
180180
# Returns the static node count recorded in slurm_parallelcluster.conf.
#
# The value is taken from the first `#TOTAL_MIN_COUNT=<digits>` marker found in
# <install_dir>/etc/slurm_parallelcluster.conf. The file is parsed in Ruby
# rather than through a `cat | grep | cut` shell pipeline: inside a
# double-quoted Ruby string `\(` collapses to a bare `(`, which grep's basic
# regex syntax treats as a literal parenthesis, so a shell pattern written that
# way never matches the marker.
#
# @return [String] the digits following the marker, or "" when the file is
#   unreadable or holds no marker with a numeric value (so `.to_i` gives 0)
def get_static_node_count
  conf_path = File.join(node['cluster']['slurm']['install_dir'], 'etc', 'slurm_parallelcluster.conf')
  return '' unless File.readable?(conf_path)

  # String#[] with a capture index returns nil on no match; to_s maps that to "".
  File.read(conf_path)[/#TOTAL_MIN_COUNT=(\d+)/, 1].to_s
end
185+
181186def wait_static_fleet_running
182187 ruby_block "wait for static fleet capacity" do
183188 block do
@@ -203,11 +208,12 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
203208 fleet_status_command = Shellwords . escape (
204209 "/usr/local/bin/get-compute-fleet-status.sh"
205210 )
211+
206212 # Example output for sinfo
207213 # sinfo -h -o '%N %t'
208214 # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
209215 # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
210- until shell_out! ( "/bin/bash -c /usr/local/bin/is_fleet_ready.sh" ) . stdout . strip . empty?
216+ until shell_out! ( "/bin/bash -c /usr/local/bin/is_fleet_ready.sh #{ get_static_node_count . to_i } " ) . stdout . strip . empty?
211217 check_for_protected_mode ( fleet_status_command )
212218
213219 Chef ::Log . info ( "Waiting for static fleet capacity provisioning" )
You can’t perform that action at this time.
0 commit comments