Skip to content

Commit 7c09e6a

Browse files
mohamedzeidan2021 (Mohamed Zeidan)
authored and committed
delete cluster functionality (#247)
Co-authored-by: Mohamed Zeidan <[email protected]>
1 parent 68c28f9 commit 7c09e6a

File tree

3 files changed

+163
-53
lines changed

3 files changed

+163
-53
lines changed

src/sagemaker/hyperpod/cli/common_utils.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ def parse_comma_separated_list(value: str) -> List[str]:
7575
"""
7676
Parse a comma-separated string into a list of strings.
7777
Generic utility that can be reused across commands.
78-
78+
7979
Args:
8080
value: Comma-separated string like "item1,item2,item3"
81-
81+
8282
Returns:
8383
List of trimmed strings
8484
"""
@@ -87,35 +87,35 @@ def parse_comma_separated_list(value: str) -> List[str]:
8787
return [item.strip() for item in value.split(",") if item.strip()]
8888

8989

90-
def categorize_resources_by_type(resources: List[Dict[str, Any]],
                                 type_mappings: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Group resource logical IDs into named categories by resource-type prefix.

    Generic function to categorize resources by type. A resource belongs to
    the first category (in ``type_mappings`` iteration order) that lists a
    prefix of the resource's 'ResourceType'; resources matching no category
    are collected under "Other".

    Args:
        resources: List of resource dictionaries with 'ResourceType' and
            'LogicalResourceId'. A missing or ``None`` 'ResourceType' is
            treated as an empty string, so the resource lands in "Other"
            unless a category lists the empty prefix.
        type_mappings: Dictionary mapping category names to lists of
            resource-type prefixes.

    Returns:
        Dictionary of category -> list of logical resource IDs. Categories
        that received no resources (including "Other") are omitted.
    """
    categorized: Dict[str, List[str]] = {category: [] for category in type_mappings}
    categorized["Other"] = []

    # str.startswith accepts a tuple of prefixes; build each tuple once
    # instead of re-scanning the list for every resource.
    prefixes_by_category = [
        (category, tuple(types)) for category, types in type_mappings.items()
    ]

    for resource in resources:
        # `or ""` also covers an explicit None value, which .get()'s default
        # alone would pass through and crash on .startswith.
        resource_type = resource.get("ResourceType") or ""
        logical_id = resource.get("LogicalResourceId", "")

        for category, prefixes in prefixes_by_category:
            if resource_type.startswith(prefixes):
                categorized[category].append(logical_id)
                break
        else:
            # No category claimed this resource type.
            categorized["Other"].append(logical_id)

    # Remove empty categories
    return {name: ids for name, ids in categorized.items() if ids}

src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py

Lines changed: 115 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,116 @@ def check_status(stack_name: str, region: Optional[str] = None):
510510
>>> status = HpClusterStack.check_status("my-stack", region="us-west-2")
511511
"""
512512
return HpClusterStack._get_stack_status_helper(stack_name, region)
513+
514+
@staticmethod
def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None,
           logger: Optional[logging.Logger] = None) -> None:
    """Deletes a HyperPod cluster CloudFormation stack.

    Delegates to ``delete_stack_with_confirmation`` with a confirmation
    callback that always answers yes, so the deletion proceeds automatically
    without prompting. This operation cannot be undone.

    **Parameters:**

    .. list-table::
       :header-rows: 1
       :widths: 20 20 60

       * - Parameter
         - Type
         - Description
       * - stack_name
         - str
         - Name of the CloudFormation stack to delete
       * - region
         - str, optional
         - AWS region where the stack exists. Falls back to the region of the default boto3 session
       * - retain_resources
         - List[str], optional
         - List of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks)
       * - logger
         - logging.Logger, optional
         - Logger instance for output messages. Uses this module's logger if not provided

    **Raises:**

    ValueError: When the stack doesn't exist or the retain_resources limitation is encountered
    RuntimeError: When termination protection blocks deletion or the deletion fails for any other reason

    The original exception is attached as ``__cause__`` of the raised error.

    .. dropdown:: Usage Examples
       :open:

       .. code-block:: python

          >>> # Delete a stack (automatically proceeds without confirmation)
          >>> HpClusterStack.delete("my-stack-name")
          >>>
          >>> # Delete in specific region
          >>> HpClusterStack.delete("my-stack-name", region="us-west-2")
          >>>
          >>> # Delete with retained resources (only works on DELETE_FAILED stacks)
          >>> HpClusterStack.delete("my-stack-name", retain_resources=["S3Bucket", "EFSFileSystem"])
          >>>
          >>> # Delete with custom logger
          >>> import logging
          >>> logger = logging.getLogger(__name__)
          >>> HpClusterStack.delete("my-stack-name", logger=logger)
    """
    # Imported at call time, as in the original, rather than at module import.
    from sagemaker.hyperpod.cli.cluster_stack_utils import (
        delete_stack_with_confirmation,
        StackNotFoundError
    )

    if logger is None:
        logger = logging.getLogger(__name__)

    # Convert retain_resources list to comma-separated string for the utility function
    retain_resources_str = ",".join(retain_resources) if retain_resources else ""

    def sdk_confirm_callback(message: str) -> bool:
        """SDK-specific confirmation callback - always auto-confirms."""
        logger.info(f"Auto-confirming: {message}")
        return True

    try:
        delete_stack_with_confirmation(
            stack_name=stack_name,
            region=region or boto3.session.Session().region_name,
            retain_resources_str=retain_resources_str,
            message_callback=logger.info,
            confirm_callback=sdk_confirm_callback,
            success_callback=logger.info
        )
    except StackNotFoundError as e:
        error_msg = f"Stack '{stack_name}' not found"
        logger.error(error_msg)
        raise ValueError(error_msg) from e
    except Exception as e:
        # Boundary handler: every remaining failure is translated into a
        # ValueError/RuntimeError, chained to the original for debugging.
        error_str = str(e)

        # Handle CloudFormation retain-resources limitation with clear exception for SDK
        if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str:
            error_msg = (
                "CloudFormation limitation: retain_resources can only be used on stacks in DELETE_FAILED state. "
                "Current stack state allows normal deletion. Try deleting without retain_resources first, "
                "then retry with retain_resources if deletion fails."
            )
            logger.error(error_msg)
            raise ValueError(error_msg) from e

        # Handle termination protection
        if "TerminationProtection is enabled" in error_str:
            error_msg = (
                "Stack deletion blocked: Termination Protection is enabled. "
                "Disable termination protection first using AWS CLI or Console."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg) from e

        # Handle other errors
        logger.error(f"Failed to delete stack: {error_str}")
        raise RuntimeError(f"Stack deletion failed: {error_str}") from e
513623

514624
@staticmethod
515625
def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None,
@@ -622,8 +732,8 @@ def sdk_confirm_callback(message: str) -> bool:
622732
raise RuntimeError(f"Stack deletion failed: {error_str}")
623733

624734

625-
def _yaml_to_json_string(yaml_path) -> str:
626-
"""Convert YAML file to JSON string"""
627-
with open(yaml_path, 'r') as file:
628-
yaml_data = yaml.safe_load(file)
629-
return json.dumps(yaml_data, indent=2, ensure_ascii=False)
735+
def _yaml_to_json_string(yaml_path) -> str:
    """Convert YAML file to JSON string.

    Args:
        yaml_path: Path of the YAML file to read.

    Returns:
        The parsed YAML content serialized as JSON with a 2-space indent.
        Non-ASCII characters are emitted as-is rather than escaped.
    """
    # Decode explicitly as UTF-8: without it open() uses the platform locale
    # encoding, which can mis-decode the non-ASCII text that
    # ensure_ascii=False is meant to preserve.
    with open(yaml_path, 'r', encoding='utf-8') as file:
        yaml_data = yaml.safe_load(file)
    return json.dumps(yaml_data, indent=2, ensure_ascii=False)

0 commit comments

Comments
 (0)