From a794f7d708827ea9a51c5d3b4fdcf52d9eaf0f79 Mon Sep 17 00:00:00 2001 From: Hanxuan Zhang Date: Wed, 11 Jun 2025 11:04:04 -0400 Subject: [PATCH 1/5] [Subnet Prioritization] Add SingleAvailabilityZone to fleet_config.json Signed-off-by: Hanxuan Zhang --- .../slurm/pcluster_fleet_config_generator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py index 788ec6db72..642d61a46b 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py @@ -99,6 +99,7 @@ def generate_fleet_config_file(output_file: str, input_file: str): queue_capacity_reservation=queue_capacity_reservation, queue_capacity_type=queue_capacity_type, queue_subnets=queue_config["Networking"]["SubnetIds"], + queue_single_availability_zone=queue_config["Networking"]["EnableSingleAvailabilityZone"], ) fleet_config[queue_name][compute_resource_name] = config_for_fleet @@ -125,6 +126,7 @@ def _generate_compute_resource_fleet_config( queue_capacity_reservation: str, queue_capacity_type: str, queue_subnets: List, + queue_single_availability_zone: bool, ): """ Generate compute resource config to add in the fleet-config.json, overriding values from the queue. 
@@ -156,7 +158,10 @@ def _generate_compute_resource_fleet_config( { "Api": "create-fleet", "Instances": copy.deepcopy(compute_resource_config["Instances"]), - "Networking": {"SubnetIds": queue_subnets}, + "Networking": { + "SubnetIds": queue_subnets, + "SingleAvailabilityZone": queue_single_availability_zone + }, } ) allocation_strategy = compute_resource_config.get("AllocationStrategy", queue_allocation_strategy) From 116282d15c8b73498dd50b0f65b49252081c68a3 Mon Sep 17 00:00:00 2001 From: Hanxuan Zhang Date: Mon, 16 Jun 2025 12:53:16 -0400 Subject: [PATCH 2/5] [Subnet Prioritization] Update unit test of generate_fleet_config_file Signed-off-by: Hanxuan Zhang --- .../unit/slurm/test_fleet_config_generator.py | 50 ++++++++++++++++--- .../expected_outputs/fleet-config.json | 15 ++++-- .../sample_input.yaml | 4 ++ 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/test/unit/slurm/test_fleet_config_generator.py b/test/unit/slurm/test_fleet_config_generator.py index dfc79a4525..87593e17a3 100644 --- a/test/unit/slurm/test_fleet_config_generator.py +++ b/test/unit/slurm/test_fleet_config_generator.py @@ -46,6 +46,22 @@ CriticalError, "Unable to find key 'Networking' in the configuration file. Queue: q1", ), + ( + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "SPOT", + "ComputeResources": [{"Instances": []}], + "Networking": {"SubnetIds": ["123"]}, + } + ] + } + }, + CriticalError, + "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. 
Queue: q1", + ), ( { "Scheduling": { @@ -54,7 +70,7 @@ "Name": "q1", "CapacityType": "SPOT", "ComputeResources": [{"Instances": []}], - "Networking": {"SubnetIds": ["123"]}, + "Networking": {"SubnetIds": ["123"], "EnableSingleAvailabilityZone": None}, } ] } @@ -70,7 +86,7 @@ "Name": "q1", "CapacityType": "ONDEMAND", "ComputeResources": [{"Name": "cr1", "Instances": []}], - "Networking": {"SubnetIds": ["123"]}, + "Networking": {"SubnetIds": ["123"], "EnableSingleAvailabilityZone": None}, } ] } @@ -89,7 +105,7 @@ {"Name": "cr1", "Instances": [{"InstanceType": "test"}]}, {"Name": "cr2", "InstanceType": "test"}, ], - "Networking": {"SubnetIds": ["123"]}, + "Networking": {"SubnetIds": ["123"], "EnableSingleAvailabilityZone": None}, } ] } @@ -108,7 +124,7 @@ {"Name": "cr1", "Instances": [{"InstanceType": "test"}, {"InstanceType": "test-2"}]}, {"Name": "cr2", "InstanceType": "test"}, ], - "Networking": {"SubnetIds": ["123", "456", "789"]}, + "Networking": {"SubnetIds": ["123", "456", "789"], "EnableSingleAvailabilityZone": None}, } ] } @@ -131,7 +147,7 @@ }, {"Name": "cr2", "InstanceType": "test", "SpotPrice": "10"}, ], - "Networking": {"SubnetIds": ["123", "456", "789"]}, + "Networking": {"SubnetIds": ["123", "456", "789"], "EnableSingleAvailabilityZone": None}, } ] } @@ -147,7 +163,7 @@ "Name": "q1", "CapacityType": "SPOT", "ComputeResources": [{"Name": "cr1", "Instances": [{"InstanceType": "test"}]}], - "Networking": {"SubnetIds": ["123"]}, + "Networking": {"SubnetIds": ["123"], "EnableSingleAvailabilityZone": None}, } ] } @@ -165,7 +181,7 @@ "ComputeResources": [ {"Name": "cr1", "Instances": [{"InstanceType": "test"}], "SpotPrice": 10} ], - "Networking": {"SubnetIds": ["123"]}, + "Networking": {"SubnetIds": ["123"], "EnableSingleAvailabilityZone": None}, } ] } @@ -208,6 +224,24 @@ CriticalError, "Unable to find key 'SubnetIds' in the configuration file. 
Queue: q1", ), + ( + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "SPOT", + "ComputeResources": [ + {"Name": "cr1", "Instances": [{"InstanceType": "test"}], "SpotPrice": 10} + ], + "Networking": {"SubnetIds": ["123"]}, + } + ] + } + }, + CriticalError, + "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. Queue: q1", + ), ( { "Scheduling": { @@ -231,7 +265,7 @@ }, }, ], - "Networking": {"SubnetIds": ["123"]}, + "Networking": {"SubnetIds": ["123"], "EnableSingleAvailabilityZone": None}, } ] } diff --git a/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/expected_outputs/fleet-config.json b/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/expected_outputs/fleet-config.json index a7e161d57e..b5de29fd4d 100644 --- a/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/expected_outputs/fleet-config.json +++ b/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/expected_outputs/fleet-config.json @@ -27,7 +27,8 @@ "Networking": { "SubnetIds": [ "subnet-0230367ab0e5123a4" - ] + ], + "SingleAvailabilityZone": true }, "AllocationStrategy": "lowest-price" } @@ -57,7 +58,8 @@ "SubnetIds": [ "subnet-0230367ab0e5123a4", "subnet-0b903123096649662" - ] + ], + "SingleAvailabilityZone": false }, "AllocationStrategy": "lowest-price" } @@ -84,7 +86,8 @@ "Networking": { "SubnetIds": [ "subnet-0230367ab0e5123a4" - ] + ], + "SingleAvailabilityZone": null }, "AllocationStrategy": "capacity-optimized", "MaxPrice": 10 @@ -100,7 +103,8 @@ "Networking": { "SubnetIds": [ "subnet-0230367ab0e5123a4" - ] + ], + "SingleAvailabilityZone": null }, "AllocationStrategy": "capacity-optimized" } @@ -131,7 +135,8 @@ "Networking": { "SubnetIds": [ "subnet-0230367ab0e5123a4" - ] + ], + "SingleAvailabilityZone": null } } } diff --git a/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml 
b/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml index 2e2edc5f1f..3fa2b4897b 100644 --- a/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml +++ b/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml @@ -62,6 +62,7 @@ Scheduling: SecurityGroups: null SubnetIds: - subnet-0230367ab0e5123a4 + EnableSingleAvailabilityZone: true # queue ondemand without capacity reservations and with multiple subnets - AllocationStrategy: lowest-price CapacityReservationTarget: null @@ -121,6 +122,7 @@ Scheduling: SubnetIds: - subnet-0230367ab0e5123a4 - subnet-0b903123096649662 + EnableSingleAvailabilityZone: false # queue spot - AllocationStrategy: capacity-optimized CapacityReservationTarget: null @@ -191,6 +193,7 @@ Scheduling: SecurityGroups: null SubnetIds: - subnet-0230367ab0e5123a4 + EnableSingleAvailabilityZone: null # queue for capacity-block - CapacityReservationTarget: CapacityReservationId: cr-987654 @@ -250,6 +253,7 @@ Scheduling: SecurityGroups: null SubnetIds: - subnet-0230367ab0e5123a4 + EnableSingleAvailabilityZone: null SlurmSettings: Dns: DisableManagedDns: false From 1b802b99e7ace136bbefead3624226d57aaee45a Mon Sep 17 00:00:00 2001 From: Hanxuan Zhang Date: Mon, 16 Jun 2025 13:22:32 -0400 Subject: [PATCH 3/5] [Subnet Prioritization] Update CHANGELOG.md Signed-off-by: Hanxuan Zhang --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b401818f94..1a40c519c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Ubuntu 20.04 is no longer supported. 
+- Add SingleAvailabilityZone parameter to fleet-config.json. 3.13.1 ------ From 26eec74d4a6e67bbd1207bc93f7ee23d7da39ae2 Mon Sep 17 00:00:00 2001 From: Hanxuan Zhang Date: Mon, 16 Jun 2025 13:27:32 -0400 Subject: [PATCH 4/5] [Subnet Prioritization] Update format Signed-off-by: Hanxuan Zhang --- .../slurm/pcluster_fleet_config_generator.py | 2 +- .../unit/slurm/test_fleet_config_generator.py | 60 +++++++++---------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py index 642d61a46b..9d1a700d04 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py @@ -160,7 +160,7 @@ def _generate_compute_resource_fleet_config( "Instances": copy.deepcopy(compute_resource_config["Instances"]), "Networking": { "SubnetIds": queue_subnets, - "SingleAvailabilityZone": queue_single_availability_zone + "SingleAvailabilityZone": queue_single_availability_zone, }, } ) diff --git a/test/unit/slurm/test_fleet_config_generator.py b/test/unit/slurm/test_fleet_config_generator.py index 87593e17a3..093c666b0a 100644 --- a/test/unit/slurm/test_fleet_config_generator.py +++ b/test/unit/slurm/test_fleet_config_generator.py @@ -47,20 +47,20 @@ "Unable to find key 'Networking' in the configuration file. Queue: q1", ), ( - { - "Scheduling": { - "SlurmQueues": [ - { - "Name": "q1", - "CapacityType": "SPOT", - "ComputeResources": [{"Instances": []}], - "Networking": {"SubnetIds": ["123"]}, - } - ] - } - }, - CriticalError, - "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. 
Queue: q1", + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "SPOT", + "ComputeResources": [{"Instances": []}], + "Networking": {"SubnetIds": ["123"]}, + } + ] + } + }, + CriticalError, + "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. Queue: q1", ), ( { @@ -225,22 +225,22 @@ "Unable to find key 'SubnetIds' in the configuration file. Queue: q1", ), ( - { - "Scheduling": { - "SlurmQueues": [ - { - "Name": "q1", - "CapacityType": "SPOT", - "ComputeResources": [ - {"Name": "cr1", "Instances": [{"InstanceType": "test"}], "SpotPrice": 10} - ], - "Networking": {"SubnetIds": ["123"]}, - } - ] - } - }, - CriticalError, - "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. Queue: q1", + { + "Scheduling": { + "SlurmQueues": [ + { + "Name": "q1", + "CapacityType": "SPOT", + "ComputeResources": [ + {"Name": "cr1", "Instances": [{"InstanceType": "test"}], "SpotPrice": 10} + ], + "Networking": {"SubnetIds": ["123"]}, + } + ] + } + }, + CriticalError, + "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. 
Queue: q1", ), ( { From c3036573841d9d3f15bea10779f31adf12bbf988 Mon Sep 17 00:00:00 2001 From: Hanxuan Zhang Date: Fri, 20 Jun 2025 10:45:05 -0400 Subject: [PATCH 5/5] [Subnet Prioritization] Set SingleAvailabilityZone as null if EnableSingleAvailabilityZone is not provided Signed-off-by: Hanxuan Zhang --- .../slurm/pcluster_fleet_config_generator.py | 4 +++- test/unit/slurm/test_fleet_config_generator.py | 6 +++--- .../test_generate_fleet_config_file/sample_input.yaml | 1 - 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py index 9d1a700d04..e9bc26257d 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py +++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py @@ -99,7 +99,9 @@ def generate_fleet_config_file(output_file: str, input_file: str): queue_capacity_reservation=queue_capacity_reservation, queue_capacity_type=queue_capacity_type, queue_subnets=queue_config["Networking"]["SubnetIds"], - queue_single_availability_zone=queue_config["Networking"]["EnableSingleAvailabilityZone"], + queue_single_availability_zone=queue_config.get("Networking", {}).get( + "EnableSingleAvailabilityZone", None + ), ) fleet_config[queue_name][compute_resource_name] = config_for_fleet diff --git a/test/unit/slurm/test_fleet_config_generator.py b/test/unit/slurm/test_fleet_config_generator.py index 093c666b0a..7c1dd1891b 100644 --- a/test/unit/slurm/test_fleet_config_generator.py +++ b/test/unit/slurm/test_fleet_config_generator.py @@ -60,7 +60,7 @@ } }, CriticalError, - "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. Queue: q1", + "Unable to find key 'Name' in the configuration file. 
Queue: q1", ), ( { @@ -239,8 +239,8 @@ ] } }, - CriticalError, - "Unable to find key 'EnableSingleAvailabilityZone' in the configuration file. Queue: q1", + None, + None, ), ( { diff --git a/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml b/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml index 3fa2b4897b..310a38b2e1 100644 --- a/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml +++ b/test/unit/slurm/test_fleet_config_generator/test_generate_fleet_config_file/sample_input.yaml @@ -253,7 +253,6 @@ Scheduling: SecurityGroups: null SubnetIds: - subnet-0230367ab0e5123a4 - EnableSingleAvailabilityZone: null SlurmSettings: Dns: DisableManagedDns: false