diff --git a/CHANGELOG.md b/CHANGELOG.md index 9671ffe29a..63978fa573 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ CHANGELOG - Fix LoginNodes NLB not being publicly accessible when using public subnet with implicit main route table association. See https://github.com/aws/aws-parallelcluster/issues/7173 - Fix cluster creation failure without Internet access when GPU instances and DCV are used. +- Fix intermittent cluster creation failure caused by eventual consistency issues when head, compute and login nodes have the same security group. 3.14.1 ------ diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 67a573e6de..66fdf179c5 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -795,8 +795,17 @@ def _add_storage_security_group(self, storage_cfn_id, storage): "Storage": [storage_security_group.ref], } + # Deduplicate SGs to avoid creating identical rules for the same security group. + # This prevents EC2 duplicate rule errors which are susceptible to eventual consistency issues. + seen_sgs = set() for sg_type, sg_refs in target_security_groups.items(): for sg_ref_id, sg_ref in enumerate(sg_refs): + # This conversion is required because sg_ref could either be a string or a CDK token reference. + # A CDK token must be converted to a string to make it comparable within the set. + sg_key = str(sg_ref) + if sg_key in seen_sgs: + continue + seen_sgs.add(sg_key) # TODO Scope down ingress rules to allow only traffic on the strictly necessary ports. # Currently scoped down only on Login nodes to limit blast radius. ingress_protocol = "-1" diff --git a/cli/tests/pcluster/templates/test_cluster_stack.py b/cli/tests/pcluster/templates/test_cluster_stack.py index be707d6b6d..d4c7886c67 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack.py +++ b/cli/tests/pcluster/templates/test_cluster_stack.py @@ -1374,3 +1374,44 @@ def test_resource_combination_name( hash_length=hash_length, ) assert_that(combination_name).is_equal_to(expected_combination_name) + + +def test_storage_security_group_deduplication(mocker, test_datadir): + """ + Test that storage security group rules are deduplicated when head, compute, and login nodes share the same SG. + + When head node, compute nodes, and login nodes all use the same security group (sg-12345678), + only one set of ingress/egress rules should be created for that security group, not three separate sets. + The deduplicated rule uses the first occurrence's settings (Head node), which uses all protocols (-1). + """ + mock_aws_api(mocker) + mock_bucket(mocker) + mock_bucket_object_utils(mocker) + + input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml") + cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) + + generated_template, _ = CDKTemplateBuilder().build_cluster_template( + cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" + ) + + # Test both EFS and FSx storage types + for storage_type in ["EFS", "FSX"]: + storage_sg_ingress_rules = [ + (name, resource) + for name, resource in generated_template["Resources"].items() + if resource["Type"] == "AWS::EC2::SecurityGroupIngress" + and name.startswith(storage_type) + and "SecurityGroup" in name + ] + + # Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG) + assert_that(len(storage_sg_ingress_rules)).is_equal_to(2) + + # Verify each unique source security group has exactly one ingress rule + source_sgs = { + str(rule["Properties"].get("SourceSecurityGroupId")) + for _, rule in storage_sg_ingress_rules + if rule["Properties"].get("SourceSecurityGroupId") + } + assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs)) diff --git a/cli/tests/pcluster/templates/test_cluster_stack/test_storage_security_group_deduplication/config-shared-sg.yaml b/cli/tests/pcluster/templates/test_cluster_stack/test_storage_security_group_deduplication/config-shared-sg.yaml new file mode 100644 index 0000000000..a0dfe5e0d4 --- /dev/null +++ b/cli/tests/pcluster/templates/test_cluster_stack/test_storage_security_group_deduplication/config-shared-sg.yaml @@ -0,0 +1,46 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: t3.micro + Ssh: + KeyName: ec2-key-name + Networking: + SubnetId: subnet-12345678 + SecurityGroups: + - sg-12345678 +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue1 + Networking: + SubnetIds: + - subnet-12345678 + SecurityGroups: + - sg-12345678 + ComputeResources: + - Name: compute_resource1 + InstanceType: t3.micro + MinCount: 0 + MaxCount: 10 +LoginNodes: + Pools: + - Name: login + InstanceType: t3.micro + Count: 1 + Networking: + SubnetIds: + - subnet-12345678 + SecurityGroups: + - sg-12345678 +SharedStorage: + - Name: efs + StorageType: Efs + MountDir: /efs + EfsSettings: + ThroughputMode: bursting + - Name: fsx + StorageType: FsxLustre + MountDir: /fsx + FsxLustreSettings: + StorageCapacity: 1200 + DeploymentType: SCRATCH_2