Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ CHANGELOG
- Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings.
- Upgrade jmespath to ~=1.0 (from ~=0.10).
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
- Scope down storage security group ingress rules to specific ports for all node types.

**BUG FIXES**
- Add validation to block updates that change tag order. Blocking such change prevents update failures.
Expand Down
3 changes: 1 addition & 2 deletions cli/src/pcluster/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,7 @@

FSX_PORTS = {
# Lustre Security group: https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html
# Among the Lustre ports, only 988 is mandatory to provide basic features.
LUSTRE: {"tcp": [988, (1018, 1023)]},
LUSTRE: {"tcp": [988]},
# OpenZFS Security group: https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/limit-access-security-groups.html
OPENZFS: {"tcp": [111, 2049, 20001, 20002, 20003], "udp": [111, 2049, 20001, 20002, 20003]},
# Ontap Security group: https://docs.aws.amazon.com/fsx/latest/ONTAPGuide/limit-access-security-groups.html
Expand Down
47 changes: 21 additions & 26 deletions cli/src/pcluster/templates/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,33 +806,25 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
if sg_key in seen_sgs:
continue
seen_sgs.add(sg_key)

# For Storage-to-Storage, allow all traffic.
# For Head/Compute/Login nodes, allow only the required ports.
storage_ports = {
SharedStorageType.EFS: ("tcp", [EFS_PORT]),
SharedStorageType.FSX: ("tcp", FSX_PORTS[LUSTRE]["tcp"]),
}
ingress_protocol, ingress_ports = (
("-1", [ALL_PORTS_RANGE])
if sg_type == "Storage"
else storage_ports.get(storage_type, ("-1", [ALL_PORTS_RANGE]))
# TODO Scope down ingress rules to allow only traffic on the strictly necessary ports.
# Currently scoped down only on Login nodes to limit blast radius.
ingress_protocol = "-1"
ingress_port = ALL_PORTS_RANGE
if sg_type == "Login":
if storage_type == SharedStorageType.EFS:
ingress_protocol = "tcp"
ingress_port = EFS_PORT
elif storage_type == SharedStorageType.FSX:
ingress_protocol = "tcp"
ingress_port = FSX_PORTS[LUSTRE]["tcp"][0]
ingress_rule = self._allow_all_ingress(
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}",
source_security_group_id=sg_ref,
group_id=storage_security_group.ref,
ip_protocol=ingress_protocol,
port=ingress_port,
)

for rule_id, ingress_port in enumerate(ingress_ports):
ingress_rule = self._allow_all_ingress(
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}Rule{rule_id}",
source_security_group_id=sg_ref,
group_id=storage_security_group.ref,
ip_protocol=ingress_protocol,
port=ingress_port,
)
rules.append(ingress_rule)

if sg_type == "Storage":
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
rules.append(ingress_rule)

egress_rule = self._allow_all_egress(
description=f"{storage_cfn_id}SecurityGroup{sg_type}Egress{sg_ref_id}",
Expand All @@ -842,6 +834,9 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
rules.append(egress_rule)

if sg_type == "Storage":
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
egress_rule.cfn_options.deletion_policy = egress_rule.cfn_options.update_replace_policy = (
storage_deletion_policy
)
Expand Down
8 changes: 2 additions & 6 deletions cli/src/pcluster/validators/cluster_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
DELETE_POLICY,
EFS_PORT,
FSX_PORTS,
LUSTRE,
PCLUSTER_IMAGE_BUILD_STATUS_TAG,
PCLUSTER_NAME_MAX_LENGTH,
PCLUSTER_NAME_MAX_LENGTH_SLURM_ACCOUNTING,
Expand Down Expand Up @@ -661,14 +660,11 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id
network_interfaces = [ni for ni in network_interface_responses if ni.get("VpcId") == vpc_id]

for protocol, ports in FSX_PORTS[file_storage.file_storage_type].items():
# For Lustre, only validate the first port (988), which is the only mandatory one.
# Other ports (1018-1023) are optional so we do not enforce them.
ports_to_validate = [ports[0]] if file_storage.file_storage_type == LUSTRE else ports
missing_ports = self._get_missing_ports(
security_groups_by_nodes,
subnet_ids,
network_interfaces,
ports_to_validate,
ports,
protocol,
file_storage.file_storage_type,
)
Expand All @@ -680,7 +676,7 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id
self._add_failure(
f"The current security group settings on file storage '{file_storage_id}' does not"
" satisfy mounting requirement. The file storage must be associated to a security group"
f" that allows {direction} {protocol.upper()} traffic through ports {ports_to_validate}. "
f" that allows {direction} {protocol.upper()} traffic through ports {ports}. "
f"Missing ports: {missing_ports}",
FailureLevel.ERROR,
)
Expand Down
110 changes: 16 additions & 94 deletions cli/tests/pcluster/templates/test_cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,7 @@ def test_storage_security_group_deduplication(mocker, test_datadir):

When head node, compute nodes, and login nodes all use the same security group (sg-12345678),
only one set of ingress/egress rules should be created for that security group, not three separate sets.
The deduplicated rule uses the first occurrence's settings (Head node), which uses all protocols (-1).
"""
mock_aws_api(mocker)
mock_bucket(mocker)
Expand All @@ -1394,102 +1395,23 @@ def test_storage_security_group_deduplication(mocker, test_datadir):
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
)

# The EFS storage security group must have 2 ingress rules:
# * allow traffic from cluster nodes (port 2049)
# * allow traffic from storage nodes (all traffic)
efs_sg_ingress_rules = [
(name, resource)
for name, resource in generated_template["Resources"].items()
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("EFS") and "SecurityGroup" in name
]
assert_that(len(efs_sg_ingress_rules)).is_equal_to(2)

# The FSx Lustre storage security group must have 3 rules:
# * allow traffic from cluster nodes (port 2049)
# * allow traffic from cluster nodes (ports 1018-1023)
# * allow traffic from storage nodes (all traffic)
fsx_sg_ingress_rules = [
(name, resource)
for name, resource in generated_template["Resources"].items()
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("FSX") and "SecurityGroup" in name
]
assert_that(len(fsx_sg_ingress_rules)).is_equal_to(3)
# Test both EFS and FSx storage types
for storage_type in ["EFS", "FSX"]:
storage_sg_ingress_rules = [
(name, resource)
for name, resource in generated_template["Resources"].items()
if resource["Type"] == "AWS::EC2::SecurityGroupIngress"
and name.startswith(storage_type)
and "SecurityGroup" in name
]

# Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG)
assert_that(len(storage_sg_ingress_rules)).is_equal_to(2)

# Verify each storage type has the expected unique source security groups
for _storage_type, rules in [("EFS", efs_sg_ingress_rules), ("FSX", fsx_sg_ingress_rules)]:
# Verify each unique source security group has exactly one ingress rule
source_sgs = {
str(rule["Properties"].get("SourceSecurityGroupId"))
for _, rule in rules
for _, rule in storage_sg_ingress_rules
if rule["Properties"].get("SourceSecurityGroupId")
}
# Should have 2 unique source SGs (shared SG + storage SG)
assert_that(len(source_sgs)).is_equal_to(2)


def test_storage_security_group_port_restrictions(mocker, test_datadir):
"""
Test that storage security group rules use restricted ports for head/compute/login nodes.

Security group rules should follow these principles:
1. Storage-to-Storage: Allow all traffic (protocol -1)
2. Head/Compute/Login nodes to EFS: Allow only TCP port 2049
3. Head/Compute/Login nodes to FSx Lustre: Allow only TCP ports 988 and 1018-1023
"""
mock_aws_api(mocker)
mock_bucket(mocker)
mock_bucket_object_utils(mocker)

input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml")
cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml)

generated_template, _ = CDKTemplateBuilder().build_cluster_template(
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
)

# Test EFS storage - should only allow port 2049
efs_ingress_rules = [
(name, resource)
for name, resource in generated_template["Resources"].items()
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("EFS") and "SecurityGroup" in name
]

for name, rule in efs_ingress_rules:
props = rule["Properties"]
if "Storage" in name:
# Storage-to-Storage: all traffic allowed
assert_that(props["IpProtocol"]).is_equal_to("-1")
assert_that(props["FromPort"]).is_equal_to(0)
assert_that(props["ToPort"]).is_equal_to(65535)
else:
# Head/Compute/Login to EFS: only TCP port 2049
assert_that(props["IpProtocol"]).is_equal_to("tcp")
assert_that(props["FromPort"]).is_equal_to(2049)
assert_that(props["ToPort"]).is_equal_to(2049)

# Test FSx Lustre storage - should only allow ports 988 and 1018-1023
fsx_ingress_rules = [
(name, resource)
for name, resource in generated_template["Resources"].items()
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("FSX") and "SecurityGroup" in name
]

# Collect non-storage rules to verify FSx ports
fsx_node_rules = [(name, rule) for name, rule in fsx_ingress_rules if "Storage" not in name]
fsx_storage_rules = [(name, rule) for name, rule in fsx_ingress_rules if "Storage" in name]

# Verify Storage-to-Storage rule allows all traffic
for _name, rule in fsx_storage_rules:
props = rule["Properties"]
assert_that(props["IpProtocol"]).is_equal_to("-1")
assert_that(props["FromPort"]).is_equal_to(0)
assert_that(props["ToPort"]).is_equal_to(65535)

# Verify Head/Compute/Login rules use TCP and FSx Lustre ports (988, 1018-1023)
fsx_ports_found = set()
for _name, rule in fsx_node_rules:
props = rule["Properties"]
assert_that(props["IpProtocol"]).is_equal_to("tcp")
fsx_ports_found.add((props["FromPort"], props["ToPort"]))

# Should have rules for port 988 and port range 1018-1023
assert_that(fsx_ports_found).contains((988, 988), (1018, 1023))
assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs))

This file was deleted.

44 changes: 13 additions & 31 deletions cli/tests/pcluster/templates/test_shared_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,9 @@ def test_shared_storage_efs(mocker, test_datadir, config_file_name, storage_name
else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup"
)
for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, mount_target_sg_name]:
ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1"
ingress_port_range = [2049, 2049] if sg == login_nodes_sg_name else [0, 65535]
rule_deletion_policy = deletion_policy if sg == mount_target_sg_name else None
# Storage-to-Storage allows all traffic, node-to-storage uses scoped-down EFS port 2049
if sg == mount_target_sg_name:
ingress_ip_protocol = "-1"
ingress_port_range = [0, 65535]
else:
ingress_ip_protocol = "tcp"
ingress_port_range = [2049, 2049]
assert_sg_rule(
generated_template,
mount_target_sg_name,
Expand Down Expand Up @@ -178,31 +173,18 @@ def test_shared_storage_fsx(mocker, test_datadir, config_file_name, storage_name
else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup"
)
for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, file_system_sg_name]:
ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1"
ingress_port_range = [988, 988] if sg == login_nodes_sg_name else [0, 65535]
rule_deletion_policy = deletion_policy if sg == file_system_sg_name else None
# Storage-to-Storage allows all traffic, node-to-storage uses scoped-down FSx Lustre ports
if sg == file_system_sg_name:
assert_sg_rule(
generated_template,
file_system_sg_name,
rule_type="ingress",
protocol="-1",
port_range=[0, 65535],
target_sg=sg,
deletion_policy=rule_deletion_policy,
)
else:
ingress_ip_protocol = "tcp"
ingress_port_ranges = [(988, 988), (1018, 1023)]
for ingress_port_range in ingress_port_ranges:
assert_sg_rule(
generated_template,
file_system_sg_name,
rule_type="ingress",
protocol=ingress_ip_protocol,
port_range=ingress_port_range,
target_sg=sg,
deletion_policy=rule_deletion_policy,
)
assert_sg_rule(
generated_template,
file_system_sg_name,
rule_type="ingress",
protocol=ingress_ip_protocol,
port_range=ingress_port_range,
target_sg=sg,
deletion_policy=rule_deletion_policy,
)
assert_sg_rule(
generated_template,
file_system_sg_name,
Expand Down
2 changes: 1 addition & 1 deletion tests/integration-tests/tests/storage/test_fsx_lustre.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ def _create_file_cache(file_cache_path, bucket_name):

def _create_fsx_lustre_volume_ids(num_existing_fsx_lustre, fsx_factory, import_path, export_path):
return fsx_factory(
ports=[988, 1018, 1019, 1020, 1021, 1022, 1023],
ports=[988],
ip_protocols=["tcp"],
num=num_existing_fsx_lustre,
file_system_type="LUSTRE",
Expand Down
Loading