Skip to content

Commit 32f3dc0

Browse files
committed
[Networking] Prevent duplicate security group rules in shared storage security group.
The deduplication is needed because when duplicate rules are requested, the CFN handler responsible for their creation goes into a path that is more susceptible to eventual consistency issue. When such issue occur, cluster creation may fail. As part of this change, we also scoped down the SG rules that allow traffic to EFS and FSx by applying to head and compute nodes the same network access we had for login nodes. Applying the scope down made the deduplication change easier and it is also beneficial in terms of security posture.
1 parent 356daf7 commit 32f3dc0

File tree

5 files changed

+118
-13
lines changed

5 files changed

+118
-13
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,14 @@ CHANGELOG
88
- Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings.
99
- Upgrade jmespath to ~=1.0 (from ~=0.10).
1010
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
11+
- Scope down storage security group ingress rules to specific ports for all node types.
1112

1213
**BUG FIXES**
1314
- Add validation to block updates that change tag order. Blocking such change prevents update failures.
1415
- Fix LoginNodes NLB not being publicly accessible when using public subnet with implicit main route table association.
1516
See https://github.com/aws/aws-parallelcluster/issues/7173
1617
- Fix cluster creation failure without Internet access when GPU instances and DCV are used.
18+
- Fix cluster creation failure caused by eventual consistency issues when head, compute and login nodes have the same security group.
1719

1820
3.14.1
1921
------

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -795,19 +795,26 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
795795
"Storage": [storage_security_group.ref],
796796
}
797797

798+
# Deduplicate SGs to avoid creating identical rules for the same security group.
799+
# This prevents EC2 duplicate rule errors which are susceptible to eventual consistency issues.
800+
seen_sgs = set()
798801
for sg_type, sg_refs in target_security_groups.items():
799802
for sg_ref_id, sg_ref in enumerate(sg_refs):
800-
# TODO Scope down ingress rules to allow only traffic on the strictly necessary ports.
801-
# Currently scoped down only on Login nodes to limit blast radius.
803+
# This conversion is required because sg_ref could either be a string or a CDK token reference.
804+
# A CDK token must be converted to a string to make it comparable within the set.
805+
sg_key = str(sg_ref)
806+
if sg_key in seen_sgs:
807+
continue
808+
seen_sgs.add(sg_key)
809+
# Scope down ingress rules to allow only traffic on the strictly necessary ports.
802810
ingress_protocol = "-1"
803811
ingress_port = ALL_PORTS_RANGE
804-
if sg_type == "Login":
805-
if storage_type == SharedStorageType.EFS:
806-
ingress_protocol = "tcp"
807-
ingress_port = EFS_PORT
808-
elif storage_type == SharedStorageType.FSX:
809-
ingress_protocol = "tcp"
810-
ingress_port = FSX_PORTS[LUSTRE]["tcp"][0]
812+
if storage_type == SharedStorageType.EFS:
813+
ingress_protocol = "tcp"
814+
ingress_port = EFS_PORT
815+
elif storage_type == SharedStorageType.FSX:
816+
ingress_protocol = "tcp"
817+
ingress_port = FSX_PORTS[LUSTRE]["tcp"][0]
811818
ingress_rule = self._allow_all_ingress(
812819
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}",
813820
source_security_group_id=sg_ref,

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,3 +1374,51 @@ def test_resource_combination_name(
13741374
hash_length=hash_length,
13751375
)
13761376
assert_that(combination_name).is_equal_to(expected_combination_name)
1377+
1378+
1379+
def test_storage_security_group_deduplication(mocker, test_datadir):
1380+
"""
1381+
Test that storage security group rules are deduplicated when head, compute, and login nodes share the same SG.
1382+
1383+
When head node, compute nodes, and login nodes all use the same security group (sg-12345678),
1384+
only one set of ingress/egress rules should be created for that security group, not three separate sets.
1385+
Additionally, ingress rules should be scoped down to the specific ports required by the storage type.
1386+
"""
1387+
mock_aws_api(mocker)
1388+
mock_bucket(mocker)
1389+
mock_bucket_object_utils(mocker)
1390+
1391+
input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml")
1392+
cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml)
1393+
1394+
generated_template, _ = CDKTemplateBuilder().build_cluster_template(
1395+
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
1396+
)
1397+
1398+
# Test both EFS and FSx storage types
1399+
for storage_type, expected_port in [("EFS", 2049), ("FSX", 988)]:
1400+
storage_sg_ingress_rules = [
1401+
(name, resource)
1402+
for name, resource in generated_template["Resources"].items()
1403+
if resource["Type"] == "AWS::EC2::SecurityGroupIngress"
1404+
and name.startswith(storage_type)
1405+
and "SecurityGroup" in name
1406+
]
1407+
1408+
# Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG)
1409+
assert_that(len(storage_sg_ingress_rules)).is_equal_to(2)
1410+
1411+
# Verify ingress rules are scoped down to the expected protocol and port
1412+
for _name, rule in storage_sg_ingress_rules:
1413+
props = rule["Properties"]
1414+
assert_that(props["IpProtocol"]).is_equal_to("tcp")
1415+
assert_that(props["FromPort"]).is_equal_to(expected_port)
1416+
assert_that(props["ToPort"]).is_equal_to(expected_port)
1417+
1418+
# Verify each unique source security group has exactly one ingress rule
1419+
source_sgs = {
1420+
str(rule["Properties"].get("SourceSecurityGroupId"))
1421+
for _, rule in storage_sg_ingress_rules
1422+
if rule["Properties"].get("SourceSecurityGroupId")
1423+
}
1424+
assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs))
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Image:
2+
Os: alinux2
3+
HeadNode:
4+
InstanceType: t3.micro
5+
Ssh:
6+
KeyName: ec2-key-name
7+
Networking:
8+
SubnetId: subnet-12345678
9+
SecurityGroups:
10+
- sg-12345678
11+
Scheduling:
12+
Scheduler: slurm
13+
SlurmQueues:
14+
- Name: queue1
15+
Networking:
16+
SubnetIds:
17+
- subnet-12345678
18+
SecurityGroups:
19+
- sg-12345678
20+
ComputeResources:
21+
- Name: compute_resource1
22+
InstanceType: t3.micro
23+
MinCount: 0
24+
MaxCount: 10
25+
LoginNodes:
26+
Pools:
27+
- Name: login
28+
InstanceType: t3.micro
29+
Count: 1
30+
Networking:
31+
SubnetIds:
32+
- subnet-12345678
33+
SecurityGroups:
34+
- sg-12345678
35+
SharedStorage:
36+
- Name: efs
37+
StorageType: Efs
38+
MountDir: /efs
39+
EfsSettings:
40+
ThroughputMode: bursting
41+
- Name: fsx
42+
StorageType: FsxLustre
43+
MountDir: /fsx
44+
FsxLustreSettings:
45+
StorageCapacity: 1200
46+
DeploymentType: SCRATCH_2

cli/tests/pcluster/templates/test_shared_storage.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,9 @@ def test_shared_storage_efs(mocker, test_datadir, config_file_name, storage_name
107107
else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup"
108108
)
109109
for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, mount_target_sg_name]:
110-
ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1"
111-
ingress_port_range = [2049, 2049] if sg == login_nodes_sg_name else [0, 65535]
110+
# All node types now have scoped-down ingress rules (tcp/2049 for EFS)
111+
ingress_ip_protocol = "tcp"
112+
ingress_port_range = [2049, 2049]
112113
rule_deletion_policy = deletion_policy if sg == mount_target_sg_name else None
113114
assert_sg_rule(
114115
generated_template,
@@ -173,8 +174,9 @@ def test_shared_storage_fsx(mocker, test_datadir, config_file_name, storage_name
173174
else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup"
174175
)
175176
for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, file_system_sg_name]:
176-
ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1"
177-
ingress_port_range = [988, 988] if sg == login_nodes_sg_name else [0, 65535]
177+
# All node types now have scoped-down ingress rules (tcp/988 for FSx Lustre)
178+
ingress_ip_protocol = "tcp"
179+
ingress_port_range = [988, 988]
178180
rule_deletion_policy = deletion_policy if sg == file_system_sg_name else None
179181
assert_sg_rule(
180182
generated_template,

0 commit comments

Comments
 (0)