Skip to content

Commit 76813d4

Browse files
committed
[DevSettings] Add support for selective ExtraChefAttributes updates.
Accept updates for the attribute `cluster/slurm/reconfigure_timeout`.
1 parent 8617a06 commit 76813d4

File tree

9 files changed

+649
-11
lines changed

9 files changed

+649
-11
lines changed

cli/src/pcluster/config/update_policy.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
SLURM,
2121
STORAGE_TYPES_SUPPORTING_LIVE_UPDATES,
2222
)
23+
from pcluster.utils import get_dictionary_diff, parse_json_string
2324

2425

2526
class UpdatePolicy:
@@ -510,6 +511,40 @@ def get_pool_name_from_change_paths(change):
510511
return ""
511512

512513

514+
# ExtraChefAttributes update policy helpers
515+
# Define which paths within ExtraChefAttributes JSON are updatable (dot-notation)
516+
# IMPORTANT: We assume this set to contain the path to leaf fields.
517+
EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS: set[str] = {
518+
"cluster.slurm.reconfigure_timeout",
519+
}
520+
521+
522+
def _get_non_updatable_changes(old_value: str, new_value: str) -> list[str]:
523+
"""
524+
Parse and compare two ExtraChefAttributes JSON strings, returning paths that are not updatable.
525+
"""
526+
old_attrs = parse_json_string(old_value, raise_on_error=False, default={}) if old_value else {}
527+
new_attrs = parse_json_string(new_value, raise_on_error=False, default={}) if new_value else {}
528+
529+
return [p for p in get_dictionary_diff(old_attrs, new_attrs) if p not in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS]
530+
531+
532+
def condition_checker_extra_chef_attributes(change, patch) -> bool:
533+
"""
534+
Check if ExtraChefAttributes changes are allowed.
535+
536+
Only changes to paths defined in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS are allowed.
537+
"""
538+
return len(_get_non_updatable_changes(change.old_value, change.new_value)) == 0
539+
540+
541+
def fail_reason_extra_chef_attributes(change, patch):
542+
"""Generate fail reason for ExtraChefAttributes update."""
543+
non_updatable_changes = _get_non_updatable_changes(change.old_value, change.new_value)
544+
paths_str = ", ".join(sorted(non_updatable_changes))
545+
return f"The following ExtraChefAttributes fields cannot be updated: {paths_str}"
546+
547+
513548
# Common fail_reason messages
514549
UpdatePolicy.FAIL_REASONS = {
515550
"ebs_volume_resize": "Updating the file system after a resize operation requires commands specific to your "
@@ -526,6 +561,7 @@ def get_pool_name_from_change_paths(change):
526561
"compute_or_login_nodes_running": lambda change, patch: (
527562
"The update is not supported when compute or login nodes are running"
528563
),
564+
"extra_chef_attributes_update": fail_reason_extra_chef_attributes,
529565
}
530566

531567
# Common action_needed messages
@@ -548,6 +584,9 @@ def get_pool_name_from_change_paths(change):
548584
"Stop the login nodes by setting Count parameter to 0 "
549585
"and update the cluster with the pcluster update-cluster command"
550586
),
587+
"extra_chef_attributes_update": lambda change, patch: (
588+
"Revert the non-updatable ExtraChefAttributes fields to their original values."
589+
),
551590
}
552591

553592
# Base policies
@@ -567,6 +606,15 @@ def get_pool_name_from_change_paths(change):
567606
name="SUPPORTED", level=0, fail_reason="-", condition_checker=(lambda change, patch: True)
568607
)
569608

609+
# Update policy for ExtraChefAttributes - allows updates to specific fields only
610+
UpdatePolicy.EXTRA_CHEF_ATTRIBUTES = UpdatePolicy(
611+
name="EXTRA_CHEF_ATTRIBUTES",
612+
level=5,
613+
fail_reason=UpdatePolicy.FAIL_REASONS["extra_chef_attributes_update"],
614+
action_needed=UpdatePolicy.ACTIONS_NEEDED["extra_chef_attributes_update"],
615+
condition_checker=condition_checker_extra_chef_attributes,
616+
)
617+
570618
# Checks resize of max_vcpus in Batch Compute Environment
571619
UpdatePolicy.AWSBATCH_CE_MAX_RESIZE = UpdatePolicy(
572620
name="AWSBATCH_CE_MAX_RESIZE",

cli/src/pcluster/schemas/common_schema.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,7 @@ class CookbookSchema(BaseSchema):
216216
)
217217
}
218218
)
219-
extra_chef_attributes = fields.Str(
220-
metadata={
221-
"update_policy": UpdatePolicy(
222-
UpdatePolicy.UNSUPPORTED, fail_reason=UpdatePolicy.FAIL_REASONS["cookbook_update"]
223-
)
224-
}
225-
)
219+
extra_chef_attributes = fields.Str(metadata={"update_policy": UpdatePolicy.EXTRA_CHEF_ATTRIBUTES})
226220

227221
@post_load()
228222
def make_resource(self, data, **kwargs):
@@ -275,7 +269,7 @@ def make_resource(self, data, **kwargs):
275269
class BaseDevSettingsSchema(BaseSchema):
276270
"""Represent the common schema of Dev Setting for ImageBuilder and Cluster."""
277271

278-
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
272+
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
279273
node_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
280274
aws_batch_cli_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
281275

cli/src/pcluster/utils.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,3 +614,77 @@ def get_needed_ultraserver_capacity_block_statuses(statuses, capacity_reservatio
614614
):
615615
needed_capacity_block_statuses.append(status)
616616
return needed_capacity_block_statuses
617+
618+
619+
def parse_json_string(value: str, raise_on_error: bool = False, default: any = None) -> any:
620+
"""
621+
Parse a JSON string into a Python object.
622+
623+
:param value: JSON string to parse
624+
:param raise_on_error: If True, raises exception on parse failure; if False, returns default
625+
:param default: Value to return on parse failure when raise_on_error is False
626+
:return: Parsed JSON object or default value on failure
627+
"""
628+
try:
629+
return json.loads(value)
630+
except (json.JSONDecodeError, TypeError) as e:
631+
LOGGER.error("Failed to parse JSON string: %s. Error: %s", value, e)
632+
if raise_on_error:
633+
raise e
634+
return default
635+
636+
637+
def get_dictionary_diff(d1: dict, d2: dict, path: str = "") -> set[str]:
638+
"""
639+
Recursively find all leaf paths where two dictionaries differ.
640+
641+
Returns a set of dot-notation paths (e.g., "cluster.settings.enabled") where
642+
the dictionaries have different values, including added or removed keys.
643+
Always returns leaf paths, never intermediate dict paths.
644+
645+
:param d1: First dictionary to compare (if None, treated as empty dict)
646+
:param d2: Second dictionary to compare (if None, treated as empty dict)
647+
:param path: Current path prefix (used for recursion)
648+
:return: Set of dot-notation paths where dictionaries differ (never None)
649+
"""
650+
d1 = d1 or {}
651+
d2 = d2 or {}
652+
653+
diffs: set[str] = set()
654+
keys1, keys2 = set(d1.keys()), set(d2.keys())
655+
656+
def collect_leaves(d: dict, prefix: str) -> set[str]:
657+
"""Collect all leaf paths from a dict."""
658+
leaves = set()
659+
for k, v in d.items():
660+
key_path = f"{prefix}.{k}" if prefix else str(k)
661+
if isinstance(v, dict) and v:
662+
leaves.update(collect_leaves(v, key_path))
663+
else:
664+
leaves.add(key_path)
665+
return leaves if leaves else {prefix} if prefix else set()
666+
667+
# Added or removed keys - collect their leaf paths
668+
for k in keys1 - keys2:
669+
key_path = f"{path}.{k}" if path else str(k)
670+
if isinstance(d1[k], dict) and d1[k]:
671+
diffs.update(collect_leaves(d1[k], key_path))
672+
else:
673+
diffs.add(key_path)
674+
675+
for k in keys2 - keys1:
676+
key_path = f"{path}.{k}" if path else str(k)
677+
if isinstance(d2[k], dict) and d2[k]:
678+
diffs.update(collect_leaves(d2[k], key_path))
679+
else:
680+
diffs.add(key_path)
681+
682+
# Common keys - recurse or compare
683+
for k in keys1 & keys2:
684+
key_path = f"{path}.{k}" if path else str(k)
685+
if isinstance(d1[k], dict) and isinstance(d2[k], dict):
686+
diffs.update(get_dictionary_diff(d1[k], d2[k], key_path))
687+
elif d1[k] != d2[k]:
688+
diffs.add(key_path)
689+
690+
return diffs

cli/src/pcluster/validators/dev_settings_validators.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
EXTRA_CHEF_ATTRIBUTES_PATH = "DevSettings/Cookbook/ExtraChefAttributes"
1717
ATTR_IN_PLACE_UPDATE_ON_FLEET_ENABLED = "in_place_update_on_fleet_enabled"
18+
ATTR_RECONFIGURE_TIMEOUT = "cluster.slurm.reconfigure_timeout"
19+
MIN_SLURM_RECONFIGURE_TIMEOUT = 300
1820

1921

2022
class ExtraChefAttributesValidator(Validator):
@@ -29,8 +31,10 @@ def _validate(self, extra_chef_attributes: str = None):
2931
"""
3032
if not extra_chef_attributes:
3133
return
32-
else:
33-
self._validate_in_place_update_on_fleet_enabled(json.loads(extra_chef_attributes))
34+
35+
attrs = json.loads(extra_chef_attributes)
36+
self._validate_in_place_update_on_fleet_enabled(attrs)
37+
self._validate_slurm_reconfigure_timeout(attrs)
3438

3539
def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict = None):
3640
"""Validate attribute cluster.in_place_update_on_fleet_enabled.
@@ -60,3 +64,33 @@ def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict
6064
"by replacing compute and login nodes according to the selected QueueUpdateStrategy.",
6165
FailureLevel.WARNING,
6266
)
67+
68+
def _validate_slurm_reconfigure_timeout(self, extra_chef_attributes: dict = None):
69+
"""Validate attribute cluster.slurm.reconfigure-timeout.
70+
71+
Must be an integer greater than 300.
72+
73+
Args:
74+
extra_chef_attributes: Dictionary of Chef attributes to validate.
75+
"""
76+
reconfigure_timeout = dig(extra_chef_attributes, *ATTR_RECONFIGURE_TIMEOUT.split("."))
77+
78+
if reconfigure_timeout is None:
79+
return
80+
81+
# Reject booleans explicitly (bool is subclass of int in Python)
82+
if isinstance(reconfigure_timeout, bool) or not isinstance(reconfigure_timeout, int):
83+
self._add_failure(
84+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
85+
f"attribute '{ATTR_RECONFIGURE_TIMEOUT}' must be an integer.",
86+
FailureLevel.ERROR,
87+
)
88+
return
89+
90+
if reconfigure_timeout <= MIN_SLURM_RECONFIGURE_TIMEOUT:
91+
self._add_failure(
92+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
93+
f"attribute '{ATTR_RECONFIGURE_TIMEOUT}' "
94+
f"must be greater than {MIN_SLURM_RECONFIGURE_TIMEOUT}.",
95+
FailureLevel.ERROR,
96+
)

0 commit comments

Comments
 (0)