Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ CHANGELOG
- Fix LoginNodes NLB not being publicly accessible when using public subnet with implicit main route table association.
See https://github.com/aws/aws-parallelcluster/issues/7173
- Fix cluster creation failure without Internet access when GPU instances and DCV are used.
- Fix intermittent cluster creation failure caused by eventual consistency issues when head, compute and login nodes have the same security group.

3.14.1
------
Expand Down
9 changes: 9 additions & 0 deletions cli/src/pcluster/templates/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,8 +795,17 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
"Storage": [storage_security_group.ref],
}

# Deduplicate SGs to avoid creating identical rules for the same security group.
# This prevents EC2 duplicate rule errors which are susceptible to eventual consistency issues.
seen_sgs = set()
for sg_type, sg_refs in target_security_groups.items():
for sg_ref_id, sg_ref in enumerate(sg_refs):
# This conversion is required because sg_ref could either be a string or a CDK token reference.
# A CDK token must be converted to a string to make it comparable within the set.
sg_key = str(sg_ref)
if sg_key in seen_sgs:
continue
seen_sgs.add(sg_key)
# TODO Scope down ingress rules to allow only traffic on the strictly necessary ports.
# Currently scoped down only on Login nodes to limit blast radius.
ingress_protocol = "-1"
Expand Down
41 changes: 41 additions & 0 deletions cli/tests/pcluster/templates/test_cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1374,3 +1374,44 @@ def test_resource_combination_name(
hash_length=hash_length,
)
assert_that(combination_name).is_equal_to(expected_combination_name)


def test_storage_security_group_deduplication(mocker, test_datadir):
"""
Test that storage security group rules are deduplicated when head, compute, and login nodes share the same SG.

When head node, compute nodes, and login nodes all use the same security group (sg-12345678),
only one set of ingress/egress rules should be created for that security group, not three separate sets.
The deduplicated rule uses the first occurrence's settings (Head node), which uses all protocols (-1).
"""
mock_aws_api(mocker)
mock_bucket(mocker)
mock_bucket_object_utils(mocker)

input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml")
cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml)

generated_template, _ = CDKTemplateBuilder().build_cluster_template(
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
)

# Test both EFS and FSx storage types
for storage_type in ["EFS", "FSX"]:
storage_sg_ingress_rules = [
(name, resource)
for name, resource in generated_template["Resources"].items()
if resource["Type"] == "AWS::EC2::SecurityGroupIngress"
and name.startswith(storage_type)
and "SecurityGroup" in name
]

# Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG)
assert_that(len(storage_sg_ingress_rules)).is_equal_to(2)

# Verify each unique source security group has exactly one ingress rule
source_sgs = {
str(rule["Properties"].get("SourceSecurityGroupId"))
for _, rule in storage_sg_ingress_rules
if rule["Properties"].get("SourceSecurityGroupId")
}
assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs))
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Image:
Os: alinux2
HeadNode:
InstanceType: t3.micro
Ssh:
KeyName: ec2-key-name
Networking:
SubnetId: subnet-12345678
SecurityGroups:
- sg-12345678
Scheduling:
Scheduler: slurm
SlurmQueues:
- Name: queue1
Networking:
SubnetIds:
- subnet-12345678
SecurityGroups:
- sg-12345678
ComputeResources:
- Name: compute_resource1
InstanceType: t3.micro
MinCount: 0
MaxCount: 10
LoginNodes:
Pools:
- Name: login
InstanceType: t3.micro
Count: 1
Networking:
SubnetIds:
- subnet-12345678
SecurityGroups:
- sg-12345678
SharedStorage:
- Name: efs
StorageType: Efs
MountDir: /efs
EfsSettings:
ThroughputMode: bursting
- Name: fsx
StorageType: FsxLustre
MountDir: /fsx
FsxLustreSettings:
StorageCapacity: 1200
DeploymentType: SCRATCH_2
Loading