Skip to content

Commit 20cefbd

Browse files
committed
[DevSettings] Add support for selective ExtraChefAttributes updates.
Accept updates for the attribute `cluster/slurm/reconfigure_timeout`.
1 parent 2893426 commit 20cefbd

File tree

8 files changed

+609
-7
lines changed

8 files changed

+609
-7
lines changed

cli/src/pcluster/config/update_policy.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
99
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1010
# limitations under the License.
11+
import json
1112
import re
1213
from enum import Enum
1314

@@ -20,6 +21,8 @@
2021
SLURM,
2122
STORAGE_TYPES_SUPPORTING_LIVE_UPDATES,
2223
)
24+
from pcluster.utils import deep_diff_dict, parse_json_string
25+
from pcluster.validators.utils import dig
2326

2427

2528
class UpdatePolicy:
@@ -510,6 +513,40 @@ def get_pool_name_from_change_paths(change):
510513
return ""
511514

512515

516+
# ExtraChefAttributes update policy helpers
517+
# Define which paths within ExtraChefAttributes JSON are updatable (dot-notation)
518+
# IMPORTANT: We assume this set to contain the path to leaf fields.
519+
EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS: set[str] = {
520+
"cluster.slurm.reconfigure_timeout",
521+
}
522+
523+
524+
def _get_non_updatable_changes(old_value: str, new_value: str) -> list[str]:
525+
"""
526+
Parse and compare two ExtraChefAttributes JSON strings, returning paths that are not updatable.
527+
"""
528+
old_attrs = parse_json_string(old_value, raise_on_error=False, default={}) if old_value else {}
529+
new_attrs = parse_json_string(new_value, raise_on_error=False, default={}) if new_value else {}
530+
531+
return [p for p in deep_diff_dict(old_attrs, new_attrs) if p not in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS]
532+
533+
534+
def condition_checker_extra_chef_attributes(change, patch) -> bool:
535+
"""
536+
Check if ExtraChefAttributes changes are allowed.
537+
538+
Only changes to paths defined in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS are allowed.
539+
"""
540+
return len(_get_non_updatable_changes(change.old_value, change.new_value)) == 0
541+
542+
543+
def fail_reason_extra_chef_attributes(change, patch):
544+
"""Generate fail reason for ExtraChefAttributes update."""
545+
non_updatable_changes = _get_non_updatable_changes(change.old_value, change.new_value)
546+
paths_str = ", ".join(sorted(non_updatable_changes))
547+
return f"The following ExtraChefAttributes fields cannot be updated: {paths_str}"
548+
549+
513550
# Common fail_reason messages
514551
UpdatePolicy.FAIL_REASONS = {
515552
"ebs_volume_resize": "Updating the file system after a resize operation requires commands specific to your "
@@ -526,6 +563,7 @@ def get_pool_name_from_change_paths(change):
526563
"compute_or_login_nodes_running": lambda change, patch: (
527564
"The update is not supported when compute or login nodes are running"
528565
),
566+
"extra_chef_attributes_update": fail_reason_extra_chef_attributes,
529567
}
530568

531569
# Common action_needed messages
@@ -548,6 +586,9 @@ def get_pool_name_from_change_paths(change):
548586
"Stop the login nodes by setting Count parameter to 0 "
549587
"and update the cluster with the pcluster update-cluster command"
550588
),
589+
"extra_chef_attributes_update": lambda change, patch: (
590+
"Revert the non-updatable ExtraChefAttributes fields to their original values."
591+
),
551592
}
552593

553594
# Base policies
@@ -567,6 +608,15 @@ def get_pool_name_from_change_paths(change):
567608
name="SUPPORTED", level=0, fail_reason="-", condition_checker=(lambda change, patch: True)
568609
)
569610

611+
# Update policy for ExtraChefAttributes - allows updates to specific fields only
612+
UpdatePolicy.EXTRA_CHEF_ATTRIBUTES = UpdatePolicy(
613+
name="EXTRA_CHEF_ATTRIBUTES",
614+
level=5,
615+
fail_reason=UpdatePolicy.FAIL_REASONS["extra_chef_attributes_update"],
616+
action_needed=UpdatePolicy.ACTIONS_NEEDED["extra_chef_attributes_update"],
617+
condition_checker=condition_checker_extra_chef_attributes,
618+
)
619+
570620
# Checks resize of max_vcpus in Batch Compute Environment
571621
UpdatePolicy.AWSBATCH_CE_MAX_RESIZE = UpdatePolicy(
572622
name="AWSBATCH_CE_MAX_RESIZE",

cli/src/pcluster/schemas/common_schema.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -218,9 +218,7 @@ class CookbookSchema(BaseSchema):
218218
)
219219
extra_chef_attributes = fields.Str(
220220
metadata={
221-
"update_policy": UpdatePolicy(
222-
UpdatePolicy.UNSUPPORTED, fail_reason=UpdatePolicy.FAIL_REASONS["cookbook_update"]
223-
)
221+
"update_policy": UpdatePolicy.EXTRA_CHEF_ATTRIBUTES
224222
}
225223
)
226224

@@ -275,7 +273,7 @@ def make_resource(self, data, **kwargs):
275273
class BaseDevSettingsSchema(BaseSchema):
276274
"""Represent the common schema of Dev Setting for ImageBuilder and Cluster."""
277275

278-
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
276+
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
279277
node_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
280278
aws_batch_cli_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
281279

cli/src/pcluster/utils.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,3 +614,77 @@ def get_needed_ultraserver_capacity_block_statuses(statuses, capacity_reservatio
614614
):
615615
needed_capacity_block_statuses.append(status)
616616
return needed_capacity_block_statuses
617+
618+
619+
def parse_json_string(value: str, raise_on_error: bool = False, default: any = None) -> any:
620+
"""
621+
Parse a JSON string into a Python object.
622+
623+
:param value: JSON string to parse
624+
:param raise_on_error: If True, raises exception on parse failure; if False, returns default
625+
:param default: Value to return on parse failure when raise_on_error is False
626+
:return: Parsed JSON object or default value on failure
627+
"""
628+
try:
629+
return json.loads(value)
630+
except (json.JSONDecodeError, TypeError) as e:
631+
LOGGER.error("Failed to parse JSON string: %s. Error: %s", value, e)
632+
if raise_on_error:
633+
raise e
634+
return default
635+
636+
637+
def deep_diff_dict(d1: dict, d2: dict, path: str = "") -> set[str]:
638+
"""
639+
Recursively find all leaf paths where two dictionaries differ.
640+
641+
Returns a set of dot-notation paths (e.g., "cluster.settings.enabled") where
642+
the dictionaries have different values, including added or removed keys.
643+
Always returns leaf paths, never intermediate dict paths.
644+
645+
:param d1: First dictionary to compare (if None, treated as empty dict)
646+
:param d2: Second dictionary to compare (if None, treated as empty dict)
647+
:param path: Current path prefix (used for recursion)
648+
:return: Set of dot-notation paths where dictionaries differ (never None)
649+
"""
650+
d1 = d1 or {}
651+
d2 = d2 or {}
652+
653+
diffs: set[str] = set()
654+
keys1, keys2 = set(d1.keys()), set(d2.keys())
655+
656+
def collect_leaves(d: dict, prefix: str) -> set[str]:
657+
"""Collect all leaf paths from a dict."""
658+
leaves = set()
659+
for k, v in d.items():
660+
key_path = f"{prefix}.{k}" if prefix else str(k)
661+
if isinstance(v, dict) and v:
662+
leaves.update(collect_leaves(v, key_path))
663+
else:
664+
leaves.add(key_path)
665+
return leaves if leaves else {prefix} if prefix else set()
666+
667+
# Added or removed keys - collect their leaf paths
668+
for k in keys1 - keys2:
669+
key_path = f"{path}.{k}" if path else str(k)
670+
if isinstance(d1[k], dict) and d1[k]:
671+
diffs.update(collect_leaves(d1[k], key_path))
672+
else:
673+
diffs.add(key_path)
674+
675+
for k in keys2 - keys1:
676+
key_path = f"{path}.{k}" if path else str(k)
677+
if isinstance(d2[k], dict) and d2[k]:
678+
diffs.update(collect_leaves(d2[k], key_path))
679+
else:
680+
diffs.add(key_path)
681+
682+
# Common keys - recurse or compare
683+
for k in keys1 & keys2:
684+
key_path = f"{path}.{k}" if path else str(k)
685+
if isinstance(d1[k], dict) and isinstance(d2[k], dict):
686+
diffs.update(deep_diff_dict(d1[k], d2[k], key_path))
687+
elif d1[k] != d2[k]:
688+
diffs.add(key_path)
689+
690+
return diffs

cli/src/pcluster/validators/dev_settings_validators.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
EXTRA_CHEF_ATTRIBUTES_PATH = "DevSettings/Cookbook/ExtraChefAttributes"
1717
ATTR_IN_PLACE_UPDATE_ON_FLEET_ENABLED = "in_place_update_on_fleet_enabled"
18+
ATTR_RECONFIGURE_TIMEOUT = "reconfigure_timeout"
19+
MIN_SLURM_RECONFIGURE_TIMEOUT = 300
1820

1921

2022
class ExtraChefAttributesValidator(Validator):
@@ -29,8 +31,10 @@ def _validate(self, extra_chef_attributes: str = None):
2931
"""
3032
if not extra_chef_attributes:
3133
return
32-
else:
33-
self._validate_in_place_update_on_fleet_enabled(json.loads(extra_chef_attributes))
34+
35+
attrs = json.loads(extra_chef_attributes)
36+
self._validate_in_place_update_on_fleet_enabled(attrs)
37+
self._validate_slurm_reconfigure_timeout(attrs)
3438

3539
def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict = None):
3640
"""Validate attribute cluster.in_place_update_on_fleet_enabled.
@@ -60,3 +64,33 @@ def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict
6064
"by replacing compute and login nodes according to the selected QueueUpdateStrategy.",
6165
FailureLevel.WARNING,
6266
)
67+
68+
69+
def _validate_slurm_reconfigure_timeout(self, extra_chef_attributes: dict = None):
70+
"""Validate attribute cluster.slurm.reconfigure-timeout.
71+
72+
Must be an integer greater than 300.
73+
74+
Args:
75+
extra_chef_attributes: Dictionary of Chef attributes to validate.
76+
"""
77+
reconfigure_timeout = dig(extra_chef_attributes, "cluster", "slurm", ATTR_RECONFIGURE_TIMEOUT)
78+
79+
if reconfigure_timeout is None:
80+
return
81+
82+
# Reject booleans explicitly (bool is subclass of int in Python)
83+
if isinstance(reconfigure_timeout, bool) or not isinstance(reconfigure_timeout, int):
84+
self._add_failure(
85+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
86+
f"attribute 'cluster/slurm/{ATTR_RECONFIGURE_TIMEOUT}' must be an integer.",
87+
FailureLevel.ERROR,
88+
)
89+
return
90+
91+
if reconfigure_timeout <= MIN_SLURM_RECONFIGURE_TIMEOUT:
92+
self._add_failure(
93+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
94+
f"attribute 'cluster/slurm/{ATTR_RECONFIGURE_TIMEOUT}' must be greater than {MIN_SLURM_RECONFIGURE_TIMEOUT}.",
95+
FailureLevel.ERROR,
96+
)

0 commit comments

Comments
 (0)