Skip to content

Commit f5db84c

Browse files
authored
Merge branch 'develop' into develop
2 parents d305cb7 + 1e5018e commit f5db84c

File tree

36 files changed

+1798
-350
lines changed

36 files changed

+1798
-350
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,14 @@ CHANGELOG
88
- Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings.
99
- Upgrade jmespath to ~=1.0 (from ~=0.10).
1010
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
11+
- Scope down storage security group ingress rules to specific ports for all node types.
1112

1213
**BUG FIXES**
1314
- Add validation to block updates that change tag order. Blocking such change prevents update failures.
15+
- Fix LoginNodes NLB not being publicly accessible when using public subnet with implicit main route table association.
16+
See https://github.com/aws/aws-parallelcluster/issues/7173
17+
- Fix cluster creation failure without Internet access when GPU instances and DCV are used.
18+
- Fix intermittent cluster creation failure caused by eventual consistency issues when head, compute and login nodes have the same security group.
1419

1520
3.14.1
1621
------

cli/src/pcluster/cli/commands/ssh.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,13 @@ class SshCommand(CliCommand):
7979
"Run ssh command with the cluster username and IP address pre-populated. "
8080
"Arbitrary arguments are appended to the end of the ssh command."
8181
)
82-
epilog = textwrap.dedent(
83-
"""Example:
82+
epilog = textwrap.dedent("""Example:
8483
8584
pcluster ssh --cluster-name mycluster -i ~/.ssh/id_rsa
8685
8786
Returns an ssh command with the cluster username and IP address pre-populated:
8887
89-
ssh ec2-user@1.1.1.1 -i ~/.ssh/id_rsa"""
90-
)
88+
ssh ec2-user@1.1.1.1 -i ~/.ssh/id_rsa""")
9189

9290
def __init__(self, subparsers):
9391
super().__init__(

cli/src/pcluster/config/update_policy.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
SLURM,
2121
STORAGE_TYPES_SUPPORTING_LIVE_UPDATES,
2222
)
23+
from pcluster.utils import get_dictionary_diff, parse_json_string
2324

2425

2526
class UpdatePolicy:
@@ -510,6 +511,38 @@ def get_pool_name_from_change_paths(change):
510511
return ""
511512

512513

514+
# ExtraChefAttributes update policy helpers
515+
# Define which paths within ExtraChefAttributes JSON are updatable (dot-notation)
516+
# IMPORTANT: We assume this set to contain the path to leaf fields.
517+
EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS: set[str] = {
518+
"cluster.slurm.reconfigure_timeout",
519+
}
520+
521+
522+
def _get_non_updatable_changes(old_value: str, new_value: str) -> list[str]:
523+
"""Parse and compare two ExtraChefAttributes JSON strings, returning paths that are not updatable."""
524+
old_attrs = parse_json_string(old_value, raise_on_error=False, default={}) if old_value else {}
525+
new_attrs = parse_json_string(new_value, raise_on_error=False, default={}) if new_value else {}
526+
527+
return [p for p in get_dictionary_diff(old_attrs, new_attrs) if p not in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS]
528+
529+
530+
def condition_checker_extra_chef_attributes(change, _) -> bool:
531+
"""
532+
Check if ExtraChefAttributes changes are allowed.
533+
534+
Only changes to paths defined in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS are allowed.
535+
"""
536+
return len(_get_non_updatable_changes(change.old_value, change.new_value)) == 0
537+
538+
539+
def fail_reason_extra_chef_attributes(change, _) -> str:
540+
"""Generate fail reason for ExtraChefAttributes update."""
541+
non_updatable_changes = _get_non_updatable_changes(change.old_value, change.new_value)
542+
paths_str = ", ".join(sorted(non_updatable_changes))
543+
return f"The following ExtraChefAttributes fields cannot be updated: {paths_str}"
544+
545+
513546
# Common fail_reason messages
514547
UpdatePolicy.FAIL_REASONS = {
515548
"ebs_volume_resize": "Updating the file system after a resize operation requires commands specific to your "
@@ -526,6 +559,7 @@ def get_pool_name_from_change_paths(change):
526559
"compute_or_login_nodes_running": lambda change, patch: (
527560
"The update is not supported when compute or login nodes are running"
528561
),
562+
"extra_chef_attributes_update": fail_reason_extra_chef_attributes,
529563
}
530564

531565
# Common action_needed messages
@@ -548,6 +582,9 @@ def get_pool_name_from_change_paths(change):
548582
"Stop the login nodes by setting Count parameter to 0 "
549583
"and update the cluster with the pcluster update-cluster command"
550584
),
585+
"extra_chef_attributes_update": lambda change, patch: (
586+
"Revert the non-updatable ExtraChefAttributes fields to their original values."
587+
),
551588
}
552589

553590
# Base policies
@@ -567,6 +604,15 @@ def get_pool_name_from_change_paths(change):
567604
name="SUPPORTED", level=0, fail_reason="-", condition_checker=(lambda change, patch: True)
568605
)
569606

607+
# Update policy for ExtraChefAttributes - allows updates to specific fields only
608+
UpdatePolicy.EXTRA_CHEF_ATTRIBUTES = UpdatePolicy(
609+
name="EXTRA_CHEF_ATTRIBUTES",
610+
level=5,
611+
fail_reason=UpdatePolicy.FAIL_REASONS["extra_chef_attributes_update"],
612+
action_needed=UpdatePolicy.ACTIONS_NEEDED["extra_chef_attributes_update"],
613+
condition_checker=condition_checker_extra_chef_attributes,
614+
)
615+
570616
# Checks resize of max_vcpus in Batch Compute Environment
571617
UpdatePolicy.AWSBATCH_CE_MAX_RESIZE = UpdatePolicy(
572618
name="AWSBATCH_CE_MAX_RESIZE",

cli/src/pcluster/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@
127127

128128
FSX_PORTS = {
129129
# Lustre Security group: https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html
130-
LUSTRE: {"tcp": [988]},
130+
# Among the Lustre ports, only 988 is mandatory to provide basic features.
131+
LUSTRE: {"tcp": [988, (1018, 1023)]},
131132
# OpenZFS Security group: https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/limit-access-security-groups.html
132133
OPENZFS: {"tcp": [111, 2049, 20001, 20002, 20003], "udp": [111, 2049, 20001, 20002, 20003]},
133134
# Ontap Security group: https://docs.aws.amazon.com/fsx/latest/ONTAPGuide/limit-access-security-groups.html

cli/src/pcluster/schemas/common_schema.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,7 @@ class CookbookSchema(BaseSchema):
216216
)
217217
}
218218
)
219-
extra_chef_attributes = fields.Str(
220-
metadata={
221-
"update_policy": UpdatePolicy(
222-
UpdatePolicy.UNSUPPORTED, fail_reason=UpdatePolicy.FAIL_REASONS["cookbook_update"]
223-
)
224-
}
225-
)
219+
extra_chef_attributes = fields.Str(metadata={"update_policy": UpdatePolicy.EXTRA_CHEF_ATTRIBUTES})
226220

227221
@post_load()
228222
def make_resource(self, data, **kwargs):
@@ -275,7 +269,7 @@ def make_resource(self, data, **kwargs):
275269
class BaseDevSettingsSchema(BaseSchema):
276270
"""Represent the common schema of Dev Setting for ImageBuilder and Cluster."""
277271

278-
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
272+
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
279273
node_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
280274
aws_batch_cli_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
281275

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -795,27 +795,44 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
795795
"Storage": [storage_security_group.ref],
796796
}
797797

798+
# Deduplicate SGs to avoid creating identical rules for the same security group.
799+
# This prevents EC2 duplicate rule errors which are susceptible to eventual consistency issues.
800+
seen_sgs = set()
798801
for sg_type, sg_refs in target_security_groups.items():
799802
for sg_ref_id, sg_ref in enumerate(sg_refs):
800-
# TODO Scope down ingress rules to allow only traffic on the strictly necessary ports.
801-
# Currently scoped down only on Login nodes to limit blast radius.
802-
ingress_protocol = "-1"
803-
ingress_port = ALL_PORTS_RANGE
804-
if sg_type == "Login":
805-
if storage_type == SharedStorageType.EFS:
806-
ingress_protocol = "tcp"
807-
ingress_port = EFS_PORT
808-
elif storage_type == SharedStorageType.FSX:
809-
ingress_protocol = "tcp"
810-
ingress_port = FSX_PORTS[LUSTRE]["tcp"][0]
811-
ingress_rule = self._allow_all_ingress(
812-
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}",
813-
source_security_group_id=sg_ref,
814-
group_id=storage_security_group.ref,
815-
ip_protocol=ingress_protocol,
816-
port=ingress_port,
803+
# This conversion is required because sg_ref could either be a string or a CDK token reference.
804+
# A CDK token must be converted to a string to make it comparable within the set.
805+
sg_key = str(sg_ref)
806+
if sg_key in seen_sgs:
807+
continue
808+
seen_sgs.add(sg_key)
809+
810+
# For Storage-to-Storage, allow all traffic.
811+
# For Head/Compute/Login nodes, allow only the required ports.
812+
storage_ports = {
813+
SharedStorageType.EFS: ("tcp", [EFS_PORT]),
814+
SharedStorageType.FSX: ("tcp", FSX_PORTS[LUSTRE]["tcp"]),
815+
}
816+
ingress_protocol, ingress_ports = (
817+
("-1", [ALL_PORTS_RANGE])
818+
if sg_type == "Storage"
819+
else storage_ports.get(storage_type, ("-1", [ALL_PORTS_RANGE]))
817820
)
818-
rules.append(ingress_rule)
821+
822+
for rule_id, ingress_port in enumerate(ingress_ports):
823+
ingress_rule = self._allow_all_ingress(
824+
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}Rule{rule_id}",
825+
source_security_group_id=sg_ref,
826+
group_id=storage_security_group.ref,
827+
ip_protocol=ingress_protocol,
828+
port=ingress_port,
829+
)
830+
rules.append(ingress_rule)
831+
832+
if sg_type == "Storage":
833+
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
834+
storage_deletion_policy
835+
)
819836

820837
egress_rule = self._allow_all_egress(
821838
description=f"{storage_cfn_id}SecurityGroup{sg_type}Egress{sg_ref_id}",
@@ -825,9 +842,6 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
825842
rules.append(egress_rule)
826843

827844
if sg_type == "Storage":
828-
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
829-
storage_deletion_policy
830-
)
831845
egress_rule.cfn_options.deletion_policy = egress_rule.cfn_options.update_replace_policy = (
832846
storage_deletion_policy
833847
)

cli/src/pcluster/utils.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,3 +614,79 @@ def get_needed_ultraserver_capacity_block_statuses(statuses, capacity_reservatio
614614
):
615615
needed_capacity_block_statuses.append(status)
616616
return needed_capacity_block_statuses
617+
618+
619+
def parse_json_string(value: str, raise_on_error: bool = False, default: any = None) -> any:
620+
"""
621+
Parse a JSON string into a Python object.
622+
623+
:param value: JSON string to parse
624+
:param raise_on_error: If True, raises exception on parse failure; if False, returns default
625+
:param default: Value to return on parse failure when raise_on_error is False
626+
:return: Parsed JSON object or default value on failure
627+
"""
628+
try:
629+
return json.loads(value)
630+
except (json.JSONDecodeError, TypeError) as e:
631+
LOGGER.error("Failed to parse JSON string: %s. Error: %s", value, e)
632+
if raise_on_error:
633+
raise e
634+
return default
635+
636+
637+
def _collect_leaves(d: dict, prefix: str) -> set[str]:
638+
"""Collect all leaf paths from a dict."""
639+
leaves = set()
640+
for k, v in d.items():
641+
key_path = f"{prefix}.{k}" if prefix else str(k)
642+
if isinstance(v, dict) and v:
643+
leaves.update(_collect_leaves(v, key_path))
644+
else:
645+
leaves.add(key_path)
646+
return leaves if leaves else {prefix} if prefix else set()
647+
648+
649+
def _collect_diff_for_key(d: dict, key: str, path: str) -> set[str]:
650+
"""Collect leaf paths for a key that exists only in one dictionary."""
651+
key_path = f"{path}.{key}" if path else str(key)
652+
if isinstance(d[key], dict) and d[key]:
653+
return _collect_leaves(d[key], key_path)
654+
return {key_path}
655+
656+
657+
def get_dictionary_diff(d1: dict, d2: dict, path: str = "") -> set[str]:
658+
"""
659+
Recursively find all leaf paths where two dictionaries differ.
660+
661+
Returns a set of dot-notation paths (e.g., "cluster.settings.enabled") where
662+
the dictionaries have different values, including added or removed keys.
663+
Always returns leaf paths, never intermediate dict paths.
664+
665+
:param d1: First dictionary to compare (if None, treated as empty dict)
666+
:param d2: Second dictionary to compare (if None, treated as empty dict)
667+
:param path: Current path prefix (used for recursion)
668+
:return: Set of dot-notation paths where dictionaries differ.
669+
If no differences detected return empoty set.
670+
"""
671+
d1 = d1 or {}
672+
d2 = d2 or {}
673+
674+
diffs: set[str] = set()
675+
keys1, keys2 = set(d1.keys()), set(d2.keys())
676+
677+
# Collect paths that are added or removed
678+
for k in keys1 - keys2:
679+
diffs.update(_collect_diff_for_key(d1, k, path))
680+
681+
for k in keys2 - keys1:
682+
diffs.update(_collect_diff_for_key(d2, k, path))
683+
684+
# Compare paths
685+
for k in keys1 & keys2:
686+
key_path = f"{path}.{k}" if path else str(k)
687+
if isinstance(d1[k], dict) and isinstance(d2[k], dict):
688+
diffs.update(get_dictionary_diff(d1[k], d2[k], key_path))
689+
elif d1[k] != d2[k]:
690+
diffs.add(key_path)
691+
692+
return diffs

cli/src/pcluster/validators/cluster_validators.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
DELETE_POLICY,
2727
EFS_PORT,
2828
FSX_PORTS,
29+
LUSTRE,
2930
PCLUSTER_IMAGE_BUILD_STATUS_TAG,
3031
PCLUSTER_NAME_MAX_LENGTH,
3132
PCLUSTER_NAME_MAX_LENGTH_SLURM_ACCOUNTING,
@@ -660,11 +661,14 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id
660661
network_interfaces = [ni for ni in network_interface_responses if ni.get("VpcId") == vpc_id]
661662

662663
for protocol, ports in FSX_PORTS[file_storage.file_storage_type].items():
664+
# For Lustre, only validate the first port (988), which is the only mandatory one.
665+
# Other ports (1018-1023) are optional so we do not enforce them.
666+
ports_to_validate = [ports[0]] if file_storage.file_storage_type == LUSTRE else ports
663667
missing_ports = self._get_missing_ports(
664668
security_groups_by_nodes,
665669
subnet_ids,
666670
network_interfaces,
667-
ports,
671+
ports_to_validate,
668672
protocol,
669673
file_storage.file_storage_type,
670674
)
@@ -676,7 +680,7 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id
676680
self._add_failure(
677681
f"The current security group settings on file storage '{file_storage_id}' does not"
678682
" satisfy mounting requirement. The file storage must be associated to a security group"
679-
f" that allows {direction} {protocol.upper()} traffic through ports {ports}. "
683+
f" that allows {direction} {protocol.upper()} traffic through ports {ports_to_validate}. "
680684
f"Missing ports: {missing_ports}",
681685
FailureLevel.ERROR,
682686
)

cli/src/pcluster/validators/dev_settings_validators.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
EXTRA_CHEF_ATTRIBUTES_PATH = "DevSettings/Cookbook/ExtraChefAttributes"
1717
ATTR_IN_PLACE_UPDATE_ON_FLEET_ENABLED = "in_place_update_on_fleet_enabled"
18+
ATTR_RECONFIGURE_TIMEOUT = "cluster.slurm.reconfigure_timeout"
19+
MIN_SLURM_RECONFIGURE_TIMEOUT = 300
1820

1921

2022
class ExtraChefAttributesValidator(Validator):
@@ -29,8 +31,10 @@ def _validate(self, extra_chef_attributes: str = None):
2931
"""
3032
if not extra_chef_attributes:
3133
return
32-
else:
33-
self._validate_in_place_update_on_fleet_enabled(json.loads(extra_chef_attributes))
34+
35+
attrs = json.loads(extra_chef_attributes)
36+
self._validate_in_place_update_on_fleet_enabled(attrs)
37+
self._validate_slurm_reconfigure_timeout(attrs)
3438

3539
def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict = None):
3640
"""Validate attribute cluster.in_place_update_on_fleet_enabled.
@@ -60,3 +64,33 @@ def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict
6064
"by replacing compute and login nodes according to the selected QueueUpdateStrategy.",
6165
FailureLevel.WARNING,
6266
)
67+
68+
def _validate_slurm_reconfigure_timeout(self, extra_chef_attributes: dict = None):
69+
"""Validate attribute cluster.slurm.reconfigure-timeout.
70+
71+
Must be an integer greater than 300.
72+
73+
Args:
74+
extra_chef_attributes: Dictionary of Chef attributes to validate.
75+
"""
76+
reconfigure_timeout = dig(extra_chef_attributes, *ATTR_RECONFIGURE_TIMEOUT.split("."))
77+
78+
if reconfigure_timeout is None:
79+
return
80+
81+
# Reject booleans explicitly (bool is subclass of int in Python)
82+
if isinstance(reconfigure_timeout, bool) or not isinstance(reconfigure_timeout, int):
83+
self._add_failure(
84+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
85+
f"attribute '{ATTR_RECONFIGURE_TIMEOUT}' must be an integer.",
86+
FailureLevel.ERROR,
87+
)
88+
return
89+
90+
if reconfigure_timeout <= MIN_SLURM_RECONFIGURE_TIMEOUT:
91+
self._add_failure(
92+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
93+
f"attribute '{ATTR_RECONFIGURE_TIMEOUT}' "
94+
f"must be greater than {MIN_SLURM_RECONFIGURE_TIMEOUT}.",
95+
FailureLevel.ERROR,
96+
)

0 commit comments

Comments
 (0)