From a8fb5363ecf13bc37e6c90c1bafa5abdef99ca89 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 23 May 2025 21:33:27 -0400 Subject: [PATCH 01/10] Add TOTAL_MIN_COUNT of a cluster as comment * Run static fleet checks if there are any static nodes Adding it as condition --- .../compute_fleet_status/is_fleet_ready.erb | 19 ++++++++++++++----- .../templates/slurm_parallelcluster.conf | 4 ++++ .../libraries/helpers.rb | 8 +++++++- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb b/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb index 0cf4d4a05e..305931d092 100644 --- a/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb +++ b/cookbooks/aws-parallelcluster-computefleet/templates/compute_fleet_status/is_fleet_ready.erb @@ -1,7 +1,16 @@ #!/bin/bash -sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$') -while IFS= read -r line; do - nodelist=$(echo "$line" | awk '{print $1}') - <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; } -done <<< "$sinfo_output" + + +cluster_static_node_count=$1 +if [[ -z "$cluster_static_node_count" ]]; then + cluster_static_node_count=1 +fi + +if [[ "$cluster_static_node_count" -ge "1" ]]; then + sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$') + while IFS= read -r line; do + nodelist=$(echo "$line" | awk '{print $1}') + <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; } + done <<< "$sinfo_output" +fi \ No newline at end of file diff --git 
a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf index 07c383ff79..935c9e7890 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf @@ -36,6 +36,7 @@ include {{ output_dir }}/pcluster/slurm_parallelcluster_{{ queue.Name }}_partiti {% endfor %} {% if ns.has_static %} +{%- set ns.total_min_count = 0 %} SuspendExcNodes= {%- set ns.is_first = True %} {%- for queue in queues %} @@ -43,9 +44,12 @@ SuspendExcNodes= {% if compute_resource.MinCount > 0 %} {{- "," if not ns.is_first else "" -}} {{ queue.Name }}-st-{{ compute_resource.Name }}-[1-{{ compute_resource.MinCount }}] + {%- set ns.total_min_count = ns.total_min_count + compute_resource.MinCount %} {%- set ns.is_first = False %} {%- endif %} {% endfor %} {% endfor %} {% endif %} + +#TOTAL_MIN_COUNT={{ ns.total_min_count }} \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index dc599d91a6..96b22a1b95 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -178,6 +178,11 @@ def wait_cluster_ready end end +def get_static_node_count + cmd = Mixlib::ShellOut.new("cat #{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster.conf | grep -o '#TOTAL_MIN_COUNT=\([0-9]*\)' | cut -d'=' -f2") + cmd.run_command.stdout.strip +end + def wait_static_fleet_running ruby_block "wait for static fleet capacity" do block do @@ -203,11 +208,12 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested fleet_status_command = Shellwords.escape( 
"/usr/local/bin/get-compute-fleet-status.sh" ) + # Example output for sinfo # sinfo -h -o '%N %t' # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~ # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle - until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty? + until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh #{get_static_node_count.to_i}").stdout.strip.empty? check_for_protected_mode(fleet_status_command) Chef::Log.info("Waiting for static fleet capacity provisioning") From 81f0145e60ae17357fcd51e661fc65925611d756 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 6 Jun 2025 15:42:11 -0400 Subject: [PATCH 02/10] Update Unit tests --- .../expected_outputs/slurm_parallelcluster.conf | 2 ++ .../expected_outputs/slurm_parallelcluster_mem_sched.conf | 2 ++ .../expected_outputs/slurm_parallelcluster.conf | 2 ++ .../expected_outputs/slurm_parallelcluster.conf | 2 ++ .../expected_outputs/slurm_parallelcluster_externaldbd.conf | 2 ++ .../slurm_parallelcluster_slurm_accounting.conf | 2 ++ .../slurm_parallelcluster_slurm_accounting_dbname.conf | 2 ++ .../expected_outputs/slurm_parallelcluster.conf | 2 ++ 8 files changed, 16 insertions(+) diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf index 28bb3c9158..032c13f5e9 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf @@ -13,3 +13,5 @@ include /pcluster/slurm_parallelcluster_efa_partition.conf include 
/pcluster/slurm_parallelcluster_gpu_partition.conf SuspendExcNodes=multiple_spot-st-multiplespot-1-[1-5],multiple_spot-st-multiplespot-2-[1-5],efa-st-efa-c5n-[1-1],gpu-st-gpu-g38xlarge-[1-1] + +#TOTAL_MIN_COUNT=12 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf index 4694ad218a..babfdaec71 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf @@ -13,3 +13,5 @@ include /pcluster/slurm_parallelcluster_efa_partition.conf include /pcluster/slurm_parallelcluster_gpu_partition.conf SuspendExcNodes=multiple_spot-st-multiplespot-1-[1-5],multiple_spot-st-multiplespot-2-[1-5],efa-st-efa-c5n-[1-1],gpu-st-gpu-g38xlarge-[1-1] + +#TOTAL_MIN_COUNT=12 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf index 28bb3c9158..032c13f5e9 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf @@ -13,3 +13,5 @@ include /pcluster/slurm_parallelcluster_efa_partition.conf include /pcluster/slurm_parallelcluster_gpu_partition.conf 
SuspendExcNodes=multiple_spot-st-multiplespot-1-[1-5],multiple_spot-st-multiplespot-2-[1-5],efa-st-efa-c5n-[1-1],gpu-st-gpu-g38xlarge-[1-1] + +#TOTAL_MIN_COUNT=12 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf index 6a12f3c1a2..f3d047e3f6 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf @@ -11,3 +11,5 @@ SelectTypeParameters=CR_CPU include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] + +#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf index dba3cf14d5..081c85a2a3 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf @@ -16,3 +16,5 @@ JobAcctGatherType=jobacct_gather/cgroup include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] + +#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git 
a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf index b6c32dcd0a..17372c9502 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf @@ -16,3 +16,5 @@ JobAcctGatherType=jobacct_gather/cgroup include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] + +#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf index b6c32dcd0a..17372c9502 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf @@ -16,3 +16,5 @@ JobAcctGatherType=jobacct_gather/cgroup include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] + +#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf 
b/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf index ed42bbfd12..47e58f0c5d 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf @@ -18,3 +18,5 @@ include /pcluster/slurm_parallelcluster_queue7_partition.conf include /pcluster/slurm_parallelcluster_queue8_partition.conf SuspendExcNodes=queue1-st-cr1-[1-2],queue2-st-cr2-[1-1],queue3-st-cr1-[1-1],queue4-st-cr1-[1-2],queue5-st-cr1-[1-2],queue6-st-cr1-[1-2],queue7-st-cr1-[1-2],queue8-st-cr1-[1-1] + +#TOTAL_MIN_COUNT=13 \ No newline at end of file From 88ae38437121f2e5b795b35a04e0be9b0a293aeb Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 6 Jun 2025 16:52:25 -0400 Subject: [PATCH 03/10] Remove conversion to integer as null string will be converted to 0 --- cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index 96b22a1b95..d374b45a82 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -213,7 +213,7 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested # sinfo -h -o '%N %t' # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~ # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle - until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh #{get_static_node_count.to_i}").stdout.strip.empty? 
+ until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh #{get_static_node_count}").stdout.strip.empty? check_for_protected_mode(fleet_status_command) Chef::Log.info("Waiting for static fleet capacity provisioning") From c850c96b0f21835a9a1f4639f5919272822d2d6c Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 16 Jun 2025 18:17:34 -0400 Subject: [PATCH 04/10] Revert "Update Unit tests" This reverts commit 36958cd386e7014075b00a507d8ed40fb51261f1. --- .../expected_outputs/slurm_parallelcluster.conf | 2 -- .../expected_outputs/slurm_parallelcluster_mem_sched.conf | 2 -- .../expected_outputs/slurm_parallelcluster.conf | 2 -- .../expected_outputs/slurm_parallelcluster.conf | 2 -- .../expected_outputs/slurm_parallelcluster_externaldbd.conf | 2 -- .../slurm_parallelcluster_slurm_accounting.conf | 2 -- .../slurm_parallelcluster_slurm_accounting_dbname.conf | 2 -- .../expected_outputs/slurm_parallelcluster.conf | 2 -- 8 files changed, 16 deletions(-) diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf index 032c13f5e9..28bb3c9158 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster.conf @@ -13,5 +13,3 @@ include /pcluster/slurm_parallelcluster_efa_partition.conf include /pcluster/slurm_parallelcluster_gpu_partition.conf SuspendExcNodes=multiple_spot-st-multiplespot-1-[1-5],multiple_spot-st-multiplespot-2-[1-5],efa-st-efa-c5n-[1-1],gpu-st-gpu-g38xlarge-[1-1] - -#TOTAL_MIN_COUNT=12 \ No newline at end of file diff --git 
a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf index babfdaec71..4694ad218a 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_memory_scheduling/expected_outputs/slurm_parallelcluster_mem_sched.conf @@ -13,5 +13,3 @@ include /pcluster/slurm_parallelcluster_efa_partition.conf include /pcluster/slurm_parallelcluster_gpu_partition.conf SuspendExcNodes=multiple_spot-st-multiplespot-1-[1-5],multiple_spot-st-multiplespot-2-[1-5],efa-st-efa-c5n-[1-1],gpu-st-gpu-g38xlarge-[1-1] - -#TOTAL_MIN_COUNT=12 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf index 032c13f5e9..28bb3c9158 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_nogpu/expected_outputs/slurm_parallelcluster.conf @@ -13,5 +13,3 @@ include /pcluster/slurm_parallelcluster_efa_partition.conf include /pcluster/slurm_parallelcluster_gpu_partition.conf SuspendExcNodes=multiple_spot-st-multiplespot-1-[1-5],multiple_spot-st-multiplespot-2-[1-5],efa-st-efa-c5n-[1-1],gpu-st-gpu-g38xlarge-[1-1] - -#TOTAL_MIN_COUNT=12 \ No newline at end of file diff --git 
a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf index f3d047e3f6..6a12f3c1a2 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster.conf @@ -11,5 +11,3 @@ SelectTypeParameters=CR_CPU include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] - -#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf index 081c85a2a3..dba3cf14d5 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_externaldbd.conf @@ -16,5 +16,3 @@ JobAcctGatherType=jobacct_gather/cgroup include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] - -#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf index 
17372c9502..b6c32dcd0a 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting.conf @@ -16,5 +16,3 @@ JobAcctGatherType=jobacct_gather/cgroup include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] - -#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf index 17372c9502..b6c32dcd0a 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf +++ b/test/unit/slurm/test_slurm_config_generator/test_generate_slurm_config_files_slurm_accounting/expected_outputs/slurm_parallelcluster_slurm_accounting_dbname.conf @@ -16,5 +16,3 @@ JobAcctGatherType=jobacct_gather/cgroup include /pcluster/slurm_parallelcluster_efa_partition.conf SuspendExcNodes=efa-st-efa-c5n-[1-1] - -#TOTAL_MIN_COUNT=1 \ No newline at end of file diff --git a/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf b/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf index 47e58f0c5d..ed42bbfd12 100644 --- a/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf +++ 
b/test/unit/slurm/test_slurm_config_generator/test_generating_slurm_config_flexible_instance_types/expected_outputs/slurm_parallelcluster.conf @@ -18,5 +18,3 @@ include /pcluster/slurm_parallelcluster_queue7_partition.conf include /pcluster/slurm_parallelcluster_queue8_partition.conf SuspendExcNodes=queue1-st-cr1-[1-2],queue2-st-cr2-[1-1],queue3-st-cr1-[1-1],queue4-st-cr1-[1-2],queue5-st-cr1-[1-2],queue6-st-cr1-[1-2],queue7-st-cr1-[1-2],queue8-st-cr1-[1-1] - -#TOTAL_MIN_COUNT=13 \ No newline at end of file From 73355006b98ca079d7e37a13413e2a0276e5d50a Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 16 Jun 2025 18:19:23 -0400 Subject: [PATCH 05/10] Revert use of reading total_min_count from slurm_parallelcluster.conf --- .../slurm/templates/slurm_parallelcluster.conf | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf index 935c9e7890..116aaf9e6a 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf @@ -36,7 +36,6 @@ include {{ output_dir }}/pcluster/slurm_parallelcluster_{{ queue.Name }}_partiti {% endfor %} {% if ns.has_static %} -{%- set ns.total_min_count = 0 %} SuspendExcNodes= {%- set ns.is_first = True %} {%- for queue in queues %} @@ -44,12 +43,9 @@ SuspendExcNodes= {% if compute_resource.MinCount > 0 %} {{- "," if not ns.is_first else "" -}} {{ queue.Name }}-st-{{ compute_resource.Name }}-[1-{{ compute_resource.MinCount }}] - {%- set ns.total_min_count = ns.total_min_count + compute_resource.MinCount %} {%- set ns.is_first = False %} {%- endif %} {% endfor %} {% endfor %} -{% endif %} - -#TOTAL_MIN_COUNT={{ ns.total_min_count 
}} \ No newline at end of file +{% endif %} \ No newline at end of file From 23c827b257c8c45b37ca66649b2bf278d5d87873 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 16 Jun 2025 21:46:01 -0400 Subject: [PATCH 06/10] Adding logs and redirect python output to console --- cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index d374b45a82..c939556331 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -179,7 +179,7 @@ def wait_cluster_ready end def get_static_node_count - cmd = Mixlib::ShellOut.new("cat #{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster.conf | grep -o '#TOTAL_MIN_COUNT=\([0-9]*\)' | cut -d'=' -f2") + cmd = Mixlib::ShellOut.new("#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_fleet_config_generator.py --input-file #{node['cluster']['cluster_config_path']} --total-min-count | grep -o 'The total MinCount of cluster is =\([0-9]*\)' | cut -d'=' -f2") cmd.run_command.stdout.strip end From eff1587301106c9f9d34c6d2a53ce41b958fe306 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Mon, 16 Jun 2025 22:09:00 -0400 Subject: [PATCH 07/10] Add unit tests --- .../templates/slurm_parallelcluster.conf | 2 +- .../unit/slurm/test_fleet_config_generator.py | 116 +++++++++++++++++- 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf index 116aaf9e6a..07c383ff79 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf +++ 
b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/templates/slurm_parallelcluster.conf @@ -48,4 +48,4 @@ SuspendExcNodes= {% endfor %} {% endfor %} -{% endif %} \ No newline at end of file +{% endif %} diff --git a/test/unit/slurm/test_fleet_config_generator.py b/test/unit/slurm/test_fleet_config_generator.py index dfc79a4525..fe41668712 100644 --- a/test/unit/slurm/test_fleet_config_generator.py +++ b/test/unit/slurm/test_fleet_config_generator.py @@ -12,7 +12,7 @@ import pytest from assertpy import assert_that -from pcluster_fleet_config_generator import ConfigurationFieldNotFoundError, CriticalError, generate_fleet_config_file +from pcluster_fleet_config_generator import ConfigurationFieldNotFoundError, CriticalError, generate_fleet_config_file, get_total_min_count @pytest.mark.parametrize( @@ -266,3 +266,117 @@ def _assert_files_are_equal(file, expected_file): expected_file_content = exp_f.read() expected_file_content = expected_file_content.replace("", os.path.dirname(file)) assert_that(f.read()).is_equal_to(expected_file_content) + +@pytest.mark.parametrize( + "cluster_config, expected_exception, expected_message, expected_total_min_count", + [ + ({}, CriticalError, "Unable to find key 'Scheduling' in the configuration file", 0), + ({"Scheduling": {}}, CriticalError, "Unable to find key 'SlurmQueues' in the configuration file", 0), + ({"Scheduling": {"SlurmQueues": []}}, None, None, 0), + ( + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "ONDEMAND", + "ComputeResources": [ + {"MinCount": 0, "Instances": [{"InstanceType": "test"}]}, + ], + } + ] + } + }, + None, + None,0, + ), + ( + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "ONDEMAND", + "ComputeResources": [ + {"MinCount": 2, "Instances": [{"InstanceType": "test"}]}, + {"MinCount": 3, "InstanceType": "test"}, + ], + } + ] + } + }, + None, + None,5, + ), + ( + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + 
"CapacityType": "SPOT", + "ComputeResources": [ + { + "Name": "cr1", + "Instances": [{"InstanceType": "test"}, {"InstanceType": "test-2"}], + "MinCount": 3, + "SpotPrice": "10", + }, + {"Name": "cr2", "InstanceType": "test", "SpotPrice": "10", "MinCount": 9}, + ], + "Networking": {"SubnetIds": ["123", "456", "789"]}, + } + ] + } + }, + None, + None,12, + ), + ( + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "CAPACITY_BLOCK", + "ComputeResources": [ + { + "Name": "cr1", + "Instances": [{"InstanceType": "test"}], + "MinCount":2, + "CapacityReservationTarget": { + "CapacityReservationResourceGroupArn": "arn", + }, + }, + { + "Name": "cr2", + "MinCount":2, + "Instances": [{"InstanceType": "test"}], + "CapacityReservationTarget": { + "CapacityReservationId": "id", + }, + }, + ], + "Networking": {"SubnetIds": ["123"]}, + } + ] + } + }, + None, + None,4, + ), + ], +) +def test_get_total_min_count( + mocker, tmpdir, cluster_config, expected_exception, expected_message, expected_total_min_count +): + mocker.patch("pcluster_fleet_config_generator._load_cluster_config", return_value=cluster_config) + + if expected_message: + with pytest.raises(expected_exception, match=expected_message): + actual_min_count = get_total_min_count(input_file="fake") + assert_that(actual_min_count).is_equal_to(expected_total_min_count) + + else: + actual_min_count = get_total_min_count(input_file="fake") + assert_that(actual_min_count).is_equal_to(expected_total_min_count) From 314f60fff73826eda781877aa98c637accd114bd Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 17 Jun 2025 11:08:53 -0400 Subject: [PATCH 08/10] Using Ruby yaml for looping to get count of static nodes --- .../aws-parallelcluster-slurm/libraries/helpers.rb | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index c939556331..20be657008
100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -179,8 +179,18 @@ def wait_cluster_ready end def get_static_node_count - cmd = Mixlib::ShellOut.new("#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_fleet_config_generator.py --input-file #{node['cluster']['cluster_config_path']} --total-min-count | grep -o 'The total MinCount of cluster is =\([0-9]*\)' | cut -d'=' -f2") - cmd.run_command.stdout.strip + require 'yaml' + cluster_config = YAML.safe_load(File.read(node['cluster']['cluster_config_path'])) + total_min_count = 0 + slurm_queues_section = cluster_config.dig("Scheduling", "SlurmQueues") + if slurm_queues_section + slurm_queues_section.each do |queue_config| + queue_config.dig('ComputeResources').each do |compute_resource_config| + total_min_count += compute_resource_config.dig('MinCount').to_i + end + end + end + total_min_count end def wait_static_fleet_running From 77fdf758c34617e157a6fca0fb51983c264b57e2 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 17 Jun 2025 11:16:25 -0400 Subject: [PATCH 09/10] Revert "Add unit tests" This reverts commit eff1587301106c9f9d34c6d2a53ce41b958fe306. 
--- .../unit/slurm/test_fleet_config_generator.py | 116 +----------------- 1 file changed, 1 insertion(+), 115 deletions(-) diff --git a/test/unit/slurm/test_fleet_config_generator.py b/test/unit/slurm/test_fleet_config_generator.py index fe41668712..dfc79a4525 100644 --- a/test/unit/slurm/test_fleet_config_generator.py +++ b/test/unit/slurm/test_fleet_config_generator.py @@ -12,7 +12,7 @@ import pytest from assertpy import assert_that -from pcluster_fleet_config_generator import ConfigurationFieldNotFoundError, CriticalError, generate_fleet_config_file, get_total_min_count +from pcluster_fleet_config_generator import ConfigurationFieldNotFoundError, CriticalError, generate_fleet_config_file @pytest.mark.parametrize( @@ -266,117 +266,3 @@ def _assert_files_are_equal(file, expected_file): expected_file_content = exp_f.read() expected_file_content = expected_file_content.replace("", os.path.dirname(file)) assert_that(f.read()).is_equal_to(expected_file_content) - -@pytest.mark.parametrize( - "cluster_config, expected_exception, expected_message, expected_total_min_count", - [ - ({}, CriticalError, "Unable to find key 'Scheduling' in the configuration file", 0), - ({"Scheduling": {}}, CriticalError, "Unable to find key 'SlurmQueues' in the configuration file", 0), - ({"Scheduling": {"SlurmQueues": []}}, None, None, 0), - ( - { - "Scheduling": { - "SlurmQueues": [ - { - "Name": "q1", - "CapacityType": "ONDEMAND", - "ComputeResources": [ - {"MinCount": 0, "Instances": [{"InstanceType": "test"}]}, - ], - } - ] - } - }, - None, - None,0, - ), - ( - { - "Scheduling": { - "SlurmQueues": [ - { - "Name": "q1", - "CapacityType": "ONDEMAND", - "ComputeResources": [ - {"MinCount": 2, "Instances": [{"InstanceType": "test"}]}, - {"MinCount": 3, "InstanceType": "test"}, - ], - } - ] - } - }, - None, - None,5, - ), - ( - { - "Scheduling": { - "SlurmQueues": [ - { - "Name": "q1", - "CapacityType": "SPOT", - "ComputeResources": [ - { - "Name": "cr1", - "Instances": [{"InstanceType": 
"test"}, {"InstanceType": "test-2"}], - "MinCount": 3, - "SpotPrice": "10", - }, - {"Name": "cr2", "InstanceType": "test", "SpotPrice": "10", "MinCount": 9}, - ], - "Networking": {"SubnetIds": ["123", "456", "789"]}, - } - ] - } - }, - None, - None,12, - ), - ( - { - "Scheduling": { - "SlurmQueues": [ - { - "Name": "q1", - "CapacityType": "CAPACITY_BLOCK", - "ComputeResources": [ - { - "Name": "cr1", - "Instances": [{"InstanceType": "test"}], - "MinCount":2, - "CapacityReservationTarget": { - "CapacityReservationResourceGroupArn": "arn", - }, - }, - { - "Name": "cr2", - "MinCount":2, - "Instances": [{"InstanceType": "test"}], - "CapacityReservationTarget": { - "CapacityReservationId": "id", - }, - }, - ], - "Networking": {"SubnetIds": ["123"]}, - } - ] - } - }, - None, - None,4, - ), - ], -) -def test_get_total_min_count( - mocker, tmpdir, cluster_config, expected_exception, expected_message, expected_total_min_count -): - mocker.patch("pcluster_fleet_config_generator._load_cluster_config", return_value=cluster_config) - - if expected_message: - with pytest.raises(expected_exception, match=expected_message): - actual_min_count = get_total_min_count(input_file="fake") - assert_that(actual_min_count).is_equal_to(expected_total_min_count) - - else: - actual_min_count = get_total_min_count(input_file="fake") - assert_that(actual_min_count).is_equal_to(expected_total_min_count) From 35171c1ebfe941163f288d02473d866afb6d4c57 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 17 Jun 2025 11:23:13 -0400 Subject: [PATCH 10/10] Adding log for checking output --- cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb index 20be657008..088d1e654d 100644 --- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb +++ 
b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb @@ -185,8 +185,8 @@ def get_static_node_count slurm_queues_section = cluster_config.dig("Scheduling", "SlurmQueues") if slurm_queues_section slurm_queues_section.each do |queue_config| - queue_config.dig('ComputeResources').each do |compute_resource_config| - total_min_count += compute_resource_config.dig('MinCount').to_i + queue_config['ComputeResources'].each do |compute_resource_config| + total_min_count += compute_resource_config['MinCount'].to_i end end end @@ -219,11 +219,14 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested "/usr/local/bin/get-compute-fleet-status.sh" ) + total_static_node_count = get_static_node_count + Chef::Log.info("Count of cluster static nodes is #{total_static_node_count}") + # Example output for sinfo # sinfo -h -o '%N %t' # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~ # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle - until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh #{get_static_node_count}").stdout.strip.empty? + until shell_out!("/bin/bash /usr/local/bin/is_fleet_ready.sh #{total_static_node_count}").stdout.strip.empty? check_for_protected_mode(fleet_status_command) Chef::Log.info("Waiting for static fleet capacity provisioning")