diff --git a/CHANGELOG.md b/CHANGELOG.md index b045c2d915..63978fa573 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ CHANGELOG - Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings. - Upgrade jmespath to ~=1.0 (from ~=0.10). - Upgrade tabulate to <=0.9.0 (from <=0.8.10). -- Scope down storage security group ingress rules to specific ports for all node types. **BUG FIXES** - Add validation to block updates that change tag order. Blocking such change prevents update failures. diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index e7864ef307..6e68cdaea7 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -127,8 +127,7 @@ FSX_PORTS = { # Lustre Security group: https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html - # Among the Lustre ports, only 988 is mandatory to provide basic features. - LUSTRE: {"tcp": [988, (1018, 1023)]}, + LUSTRE: {"tcp": [988]}, # OpenZFS Security group: https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/limit-access-security-groups.html OPENZFS: {"tcp": [111, 2049, 20001, 20002, 20003], "udp": [111, 2049, 20001, 20002, 20003]}, # Ontap Security group: https://docs.aws.amazon.com/fsx/latest/ONTAPGuide/limit-access-security-groups.html diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 00ea082b6c..66fdf179c5 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -806,33 +806,25 @@ def _add_storage_security_group(self, storage_cfn_id, storage): if sg_key in seen_sgs: continue seen_sgs.add(sg_key) - - # For Storage-to-Storage, allow all traffic. - # For Head/Compute/Login nodes, allow only the required ports. - storage_ports = { - SharedStorageType.EFS: ("tcp", [EFS_PORT]), - SharedStorageType.FSX: ("tcp", FSX_PORTS[LUSTRE]["tcp"]), - } - ingress_protocol, ingress_ports = ( - ("-1", [ALL_PORTS_RANGE]) - if sg_type == "Storage" - else storage_ports.get(storage_type, ("-1", [ALL_PORTS_RANGE])) + # TODO Scope down ingress rules to allow only traffic on the strictly necessary ports. + # Currently scoped down only on Login nodes to limit blast radius. + ingress_protocol = "-1" + ingress_port = ALL_PORTS_RANGE + if sg_type == "Login": + if storage_type == SharedStorageType.EFS: + ingress_protocol = "tcp" + ingress_port = EFS_PORT + elif storage_type == SharedStorageType.FSX: + ingress_protocol = "tcp" + ingress_port = FSX_PORTS[LUSTRE]["tcp"][0] + ingress_rule = self._allow_all_ingress( + description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}", + source_security_group_id=sg_ref, + group_id=storage_security_group.ref, + ip_protocol=ingress_protocol, + port=ingress_port, ) - - for rule_id, ingress_port in enumerate(ingress_ports): - ingress_rule = self._allow_all_ingress( - description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}Rule{rule_id}", - source_security_group_id=sg_ref, - group_id=storage_security_group.ref, - ip_protocol=ingress_protocol, - port=ingress_port, - ) - rules.append(ingress_rule) - - if sg_type == "Storage": - ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = ( - storage_deletion_policy - ) + rules.append(ingress_rule) egress_rule = self._allow_all_egress( description=f"{storage_cfn_id}SecurityGroup{sg_type}Egress{sg_ref_id}", @@ -842,6 +834,9 @@ def _add_storage_security_group(self, storage_cfn_id, storage): rules.append(egress_rule) if sg_type == "Storage": + ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = ( + storage_deletion_policy + ) egress_rule.cfn_options.deletion_policy = egress_rule.cfn_options.update_replace_policy = ( storage_deletion_policy ) diff --git a/cli/src/pcluster/validators/cluster_validators.py b/cli/src/pcluster/validators/cluster_validators.py index 91181fc0e8..4e1d701bf9 100644 --- a/cli/src/pcluster/validators/cluster_validators.py +++ b/cli/src/pcluster/validators/cluster_validators.py @@ -26,7 +26,6 @@ DELETE_POLICY, EFS_PORT, FSX_PORTS, - LUSTRE, PCLUSTER_IMAGE_BUILD_STATUS_TAG, PCLUSTER_NAME_MAX_LENGTH, PCLUSTER_NAME_MAX_LENGTH_SLURM_ACCOUNTING, @@ -661,14 +660,11 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id network_interfaces = [ni for ni in network_interface_responses if ni.get("VpcId") == vpc_id] for protocol, ports in FSX_PORTS[file_storage.file_storage_type].items(): - # For Lustre, only validate the first port (988), which is the only mandatory one. - # Other ports (1018-1023) are optional so we do not enforce them. - ports_to_validate = [ports[0]] if file_storage.file_storage_type == LUSTRE else ports missing_ports = self._get_missing_ports( security_groups_by_nodes, subnet_ids, network_interfaces, - ports_to_validate, + ports, protocol, file_storage.file_storage_type, ) @@ -680,7 +676,7 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id self._add_failure( f"The current security group settings on file storage '{file_storage_id}' does not" " satisfy mounting requirement. The file storage must be associated to a security group" - f" that allows {direction} {protocol.upper()} traffic through ports {ports_to_validate}. " + f" that allows {direction} {protocol.upper()} traffic through ports {ports}. " f"Missing ports: {missing_ports}", FailureLevel.ERROR, ) diff --git a/cli/tests/pcluster/templates/test_cluster_stack.py b/cli/tests/pcluster/templates/test_cluster_stack.py index f804e7f9b9..d4c7886c67 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack.py +++ b/cli/tests/pcluster/templates/test_cluster_stack.py @@ -1382,6 +1382,7 @@ def test_storage_security_group_deduplication(mocker, test_datadir): When head node, compute nodes, and login nodes all use the same security group (sg-12345678), only one set of ingress/egress rules should be created for that security group, not three separate sets. + The deduplicated rule uses the first occurrence's settings (Head node), which uses all protocols (-1). """ mock_aws_api(mocker) mock_bucket(mocker) @@ -1394,102 +1395,23 @@ def test_storage_security_group_deduplication(mocker, test_datadir): cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" ) - # The EFS storage security group must have 2 ingress rules: - # * allow traffic from cluster nodes (port 2049) - # * allow traffic from storage nodes (all traffic) - efs_sg_ingress_rules = [ - (name, resource) - for name, resource in generated_template["Resources"].items() - if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("EFS") and "SecurityGroup" in name - ] - assert_that(len(efs_sg_ingress_rules)).is_equal_to(2) - - # The FSx Lustre storage security group must have 3 rules: - # * allow traffic from cluster nodes (port 2049) - # * allow traffic from cluster nodes (ports 1018-1023) - # * allow traffic from storage nodes (all traffic) - fsx_sg_ingress_rules = [ - (name, resource) - for name, resource in generated_template["Resources"].items() - if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("FSX") and "SecurityGroup" in name - ] - assert_that(len(fsx_sg_ingress_rules)).is_equal_to(3) + # Test both EFS and FSx storage types + for storage_type in ["EFS", "FSX"]: + storage_sg_ingress_rules = [ + (name, resource) + for name, resource in generated_template["Resources"].items() + if resource["Type"] == "AWS::EC2::SecurityGroupIngress" + and name.startswith(storage_type) + and "SecurityGroup" in name + ] + + # Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG) + assert_that(len(storage_sg_ingress_rules)).is_equal_to(2) - # Verify each storage type has the expected unique source security groups - for _storage_type, rules in [("EFS", efs_sg_ingress_rules), ("FSX", fsx_sg_ingress_rules)]: + # Verify each unique source security group has exactly one ingress rule source_sgs = { str(rule["Properties"].get("SourceSecurityGroupId")) - for _, rule in rules + for _, rule in storage_sg_ingress_rules if rule["Properties"].get("SourceSecurityGroupId") } - # Should have 2 unique source SGs (shared SG + storage SG) - assert_that(len(source_sgs)).is_equal_to(2) - - -def test_storage_security_group_port_restrictions(mocker, test_datadir): - """ - Test that storage security group rules use restricted ports for head/compute/login nodes. - - Security group rules should follow these principles: - 1. Storage-to-Storage: Allow all traffic (protocol -1) - 2. Head/Compute/Login nodes to EFS: Allow only TCP port 2049 - 3. Head/Compute/Login nodes to FSx Lustre: Allow only TCP ports 988 and 1018-1023 - """ - mock_aws_api(mocker) - mock_bucket(mocker) - mock_bucket_object_utils(mocker) - - input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml") - cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - - generated_template, _ = CDKTemplateBuilder().build_cluster_template( - cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" - ) - - # Test EFS storage - should only allow port 2049 - efs_ingress_rules = [ - (name, resource) - for name, resource in generated_template["Resources"].items() - if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("EFS") and "SecurityGroup" in name - ] - - for name, rule in efs_ingress_rules: - props = rule["Properties"] - if "Storage" in name: - # Storage-to-Storage: all traffic allowed - assert_that(props["IpProtocol"]).is_equal_to("-1") - assert_that(props["FromPort"]).is_equal_to(0) - assert_that(props["ToPort"]).is_equal_to(65535) - else: - # Head/Compute/Login to EFS: only TCP port 2049 - assert_that(props["IpProtocol"]).is_equal_to("tcp") - assert_that(props["FromPort"]).is_equal_to(2049) - assert_that(props["ToPort"]).is_equal_to(2049) - - # Test FSx Lustre storage - should only allow ports 988 and 1018-1023 - fsx_ingress_rules = [ - (name, resource) - for name, resource in generated_template["Resources"].items() - if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("FSX") and "SecurityGroup" in name - ] - - # Collect non-storage rules to verify FSx ports - fsx_node_rules = [(name, rule) for name, rule in fsx_ingress_rules if "Storage" not in name] - fsx_storage_rules = [(name, rule) for name, rule in fsx_ingress_rules if "Storage" in name] - - # Verify Storage-to-Storage rule allows all traffic - for _name, rule in fsx_storage_rules: - props = rule["Properties"] - assert_that(props["IpProtocol"]).is_equal_to("-1") - assert_that(props["FromPort"]).is_equal_to(0) - assert_that(props["ToPort"]).is_equal_to(65535) - - # Verify Head/Compute/Login rules use TCP and FSx Lustre ports (988, 1018-1023) - fsx_ports_found = set() - for _name, rule in fsx_node_rules: - props = rule["Properties"] - assert_that(props["IpProtocol"]).is_equal_to("tcp") - fsx_ports_found.add((props["FromPort"], props["ToPort"])) - - # Should have rules for port 988 and port range 1018-1023 - assert_that(fsx_ports_found).contains((988, 988), (1018, 1023)) + assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs)) diff --git a/cli/tests/pcluster/templates/test_cluster_stack/test_storage_security_group_port_restrictions/config-shared-sg.yaml b/cli/tests/pcluster/templates/test_cluster_stack/test_storage_security_group_port_restrictions/config-shared-sg.yaml deleted file mode 100644 index a0dfe5e0d4..0000000000 --- a/cli/tests/pcluster/templates/test_cluster_stack/test_storage_security_group_port_restrictions/config-shared-sg.yaml +++ /dev/null @@ -1,46 +0,0 @@ -Image: - Os: alinux2 -HeadNode: - InstanceType: t3.micro - Ssh: - KeyName: ec2-key-name - Networking: - SubnetId: subnet-12345678 - SecurityGroups: - - sg-12345678 -Scheduling: - Scheduler: slurm - SlurmQueues: - - Name: queue1 - Networking: - SubnetIds: - - subnet-12345678 - SecurityGroups: - - sg-12345678 - ComputeResources: - - Name: compute_resource1 - InstanceType: t3.micro - MinCount: 0 - MaxCount: 10 -LoginNodes: - Pools: - - Name: login - InstanceType: t3.micro - Count: 1 - Networking: - SubnetIds: - - subnet-12345678 - SecurityGroups: - - sg-12345678 -SharedStorage: - - Name: efs - StorageType: Efs - MountDir: /efs - EfsSettings: - ThroughputMode: bursting - - Name: fsx - StorageType: FsxLustre - MountDir: /fsx - FsxLustreSettings: - StorageCapacity: 1200 - DeploymentType: SCRATCH_2 diff --git a/cli/tests/pcluster/templates/test_shared_storage.py b/cli/tests/pcluster/templates/test_shared_storage.py index 71156566b9..5edfcd1ced 100644 --- a/cli/tests/pcluster/templates/test_shared_storage.py +++ b/cli/tests/pcluster/templates/test_shared_storage.py @@ -107,14 +107,9 @@ def test_shared_storage_efs(mocker, test_datadir, config_file_name, storage_name else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup" ) for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, mount_target_sg_name]: + ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1" + ingress_port_range = [2049, 2049] if sg == login_nodes_sg_name else [0, 65535] rule_deletion_policy = deletion_policy if sg == mount_target_sg_name else None - # Storage-to-Storage allows all traffic, node-to-storage uses scoped-down EFS port 2049 - if sg == mount_target_sg_name: - ingress_ip_protocol = "-1" - ingress_port_range = [0, 65535] - else: - ingress_ip_protocol = "tcp" - ingress_port_range = [2049, 2049] assert_sg_rule( generated_template, mount_target_sg_name, @@ -178,31 +173,18 @@ def test_shared_storage_fsx(mocker, test_datadir, config_file_name, storage_name else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup" ) for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, file_system_sg_name]: + ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1" + ingress_port_range = [988, 988] if sg == login_nodes_sg_name else [0, 65535] rule_deletion_policy = deletion_policy if sg == file_system_sg_name else None - # Storage-to-Storage allows all traffic, node-to-storage uses scoped-down FSx Lustre ports - if sg == file_system_sg_name: - assert_sg_rule( - generated_template, - file_system_sg_name, - rule_type="ingress", - protocol="-1", - port_range=[0, 65535], - target_sg=sg, - deletion_policy=rule_deletion_policy, - ) - else: - ingress_ip_protocol = "tcp" - ingress_port_ranges = [(988, 988), (1018, 1023)] - for ingress_port_range in ingress_port_ranges: - assert_sg_rule( - generated_template, - file_system_sg_name, - rule_type="ingress", - protocol=ingress_ip_protocol, - port_range=ingress_port_range, - target_sg=sg, - deletion_policy=rule_deletion_policy, - ) + assert_sg_rule( + generated_template, + file_system_sg_name, + rule_type="ingress", + protocol=ingress_ip_protocol, + port_range=ingress_port_range, + target_sg=sg, + deletion_policy=rule_deletion_policy, + ) assert_sg_rule( generated_template, file_system_sg_name, diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index 3c2eaf3b2c..069e88ea4a 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -521,7 +521,7 @@ def _create_file_cache(file_cache_path, bucket_name): def _create_fsx_lustre_volume_ids(num_existing_fsx_lustre, fsx_factory, import_path, export_path): return fsx_factory( - ports=[988, 1018, 1019, 1020, 1021, 1022, 1023], + ports=[988], ip_protocols=["tcp"], num=num_existing_fsx_lustre, file_system_type="LUSTRE",