Skip to content

Commit 92c5b0b

Browse files
committed
[Networking] Prevent duplicate security group rules in shared storage security group.
The deduplication is needed because when duplicate rules are requested, the CFN handler responsible for their creation goes into a path that is more susceptible to eventual consistency issue. When such issue occur, cluster creation may fail.
1 parent abc74a6 commit 92c5b0b

File tree

4 files changed

+97
-0
lines changed

4 files changed

+97
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ CHANGELOG
1414
- Fix LoginNodes NLB not being publicly accessible when using public subnet with implicit main route table association.
1515
See https://github.com/aws/aws-parallelcluster/issues/7173
1616
- Fix cluster creation failure without Internet access when GPU instances and DCV are used.
17+
- Fix intermittent cluster creation failure caused by eventual consistency issues when head, compute and login nodes have the same security group.
1718

1819
3.14.1
1920
------

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,8 +795,17 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
795795
"Storage": [storage_security_group.ref],
796796
}
797797

798+
# Deduplicate SGs to avoid creating identical rules for the same security group.
799+
# This prevents EC2 duplicate rule errors which are susceptible to eventual consistency issues.
800+
seen_sgs = set()
798801
for sg_type, sg_refs in target_security_groups.items():
799802
for sg_ref_id, sg_ref in enumerate(sg_refs):
803+
# This conversion is required because sg_ref could either be a string or a CDK token reference.
804+
# A CDK token must be converted to a string to make it comparable within the set.
805+
sg_key = str(sg_ref)
806+
if sg_key in seen_sgs:
807+
continue
808+
seen_sgs.add(sg_key)
800809
# TODO Scope down ingress rules to allow only traffic on the strictly necessary ports.
801810
# Currently scoped down only on Login nodes to limit blast radius.
802811
ingress_protocol = "-1"

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,3 +1374,44 @@ def test_resource_combination_name(
13741374
hash_length=hash_length,
13751375
)
13761376
assert_that(combination_name).is_equal_to(expected_combination_name)
1377+
1378+
1379+
def test_storage_security_group_deduplication(mocker, test_datadir):
1380+
"""
1381+
Test that storage security group rules are deduplicated when head, compute, and login nodes share the same SG.
1382+
1383+
When head node, compute nodes, and login nodes all use the same security group (sg-12345678),
1384+
only one set of ingress/egress rules should be created for that security group, not three separate sets.
1385+
The deduplicated rule uses the first occurrence's settings (Head node), which uses all protocols (-1).
1386+
"""
1387+
mock_aws_api(mocker)
1388+
mock_bucket(mocker)
1389+
mock_bucket_object_utils(mocker)
1390+
1391+
input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml")
1392+
cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml)
1393+
1394+
generated_template, _ = CDKTemplateBuilder().build_cluster_template(
1395+
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
1396+
)
1397+
1398+
# Test both EFS and FSx storage types
1399+
for storage_type in ["EFS", "FSX"]:
1400+
storage_sg_ingress_rules = [
1401+
(name, resource)
1402+
for name, resource in generated_template["Resources"].items()
1403+
if resource["Type"] == "AWS::EC2::SecurityGroupIngress"
1404+
and name.startswith(storage_type)
1405+
and "SecurityGroup" in name
1406+
]
1407+
1408+
# Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG)
1409+
assert_that(len(storage_sg_ingress_rules)).is_equal_to(2)
1410+
1411+
# Verify each unique source security group has exactly one ingress rule
1412+
source_sgs = {
1413+
str(rule["Properties"].get("SourceSecurityGroupId"))
1414+
for _, rule in storage_sg_ingress_rules
1415+
if rule["Properties"].get("SourceSecurityGroupId")
1416+
}
1417+
assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs))
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Image:
2+
Os: alinux2
3+
HeadNode:
4+
InstanceType: t3.micro
5+
Ssh:
6+
KeyName: ec2-key-name
7+
Networking:
8+
SubnetId: subnet-12345678
9+
SecurityGroups:
10+
- sg-12345678
11+
Scheduling:
12+
Scheduler: slurm
13+
SlurmQueues:
14+
- Name: queue1
15+
Networking:
16+
SubnetIds:
17+
- subnet-12345678
18+
SecurityGroups:
19+
- sg-12345678
20+
ComputeResources:
21+
- Name: compute_resource1
22+
InstanceType: t3.micro
23+
MinCount: 0
24+
MaxCount: 10
25+
LoginNodes:
26+
Pools:
27+
- Name: login
28+
InstanceType: t3.micro
29+
Count: 1
30+
Networking:
31+
SubnetIds:
32+
- subnet-12345678
33+
SecurityGroups:
34+
- sg-12345678
35+
SharedStorage:
36+
- Name: efs
37+
StorageType: Efs
38+
MountDir: /efs
39+
EfsSettings:
40+
ThroughputMode: bursting
41+
- Name: fsx
42+
StorageType: FsxLustre
43+
MountDir: /fsx
44+
FsxLustreSettings:
45+
StorageCapacity: 1200
46+
DeploymentType: SCRATCH_2

0 commit comments

Comments
 (0)