Skip to content

Commit 883e534

Browse files
mohamedzeidan2021 and Mohamed Zeidan authored
Feature: Delete Cluster Command (#250)
* delete cluster stack
* delete cluster stack
* removed unnecessary file
* unit tests
* more modular code
* refactored modular code
* sdk code added and improved modularity
* cleanup
* removed silent failure for sdk
* fixed unit tests
* integ tests
* 2 integ happycase tests
* changed test to use iam role instead of s3 bucket

---------

Co-authored-by: Mohamed Zeidan <[email protected]>
1 parent 458bd63 commit 883e534

File tree

10 files changed

+1874
-9
lines changed

10 files changed

+1874
-9
lines changed

src/sagemaker/hyperpod/cli/cluster_stack_utils.py

Lines changed: 498 additions & 0 deletions
Large diffs are not rendered by default.

src/sagemaker/hyperpod/cli/commands/cluster_stack.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
from sagemaker.hyperpod.common.telemetry.constants import Feature
1919
from sagemaker.hyperpod.common.utils import setup_logging
2020
from sagemaker.hyperpod.cli.utils import convert_datetimes
21+
from sagemaker.hyperpod.cli.cluster_stack_utils import (
22+
StackNotFoundError,
23+
delete_stack_with_confirmation
24+
)
2125

2226
logger = logging.getLogger(__name__)
2327

@@ -292,8 +296,11 @@ def list_cluster_stacks(region, debug, status):
292296

293297
@click.command("cluster-stack")
294298
@click.argument("stack-name", required=True)
299+
@click.option("--retain-resources", help="Comma-separated list of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks). Resource names are shown in failed deletion output, or use AWS CLI: 'aws cloudformation list-stack-resources --stack-name STACK_NAME --region REGION'")
300+
@click.option("--region", required=True, help="AWS region (required)")
295301
@click.option("--debug", is_flag=True, help="Enable debug logging")
296-
def delete(stack_name: str, debug: bool) -> None:
302+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli")
303+
def delete_cluster_stack(stack_name: str, retain_resources: str, region: str, debug: bool) -> None:
297304
"""Delete a HyperPod cluster stack.
298305
299306
Removes the specified CloudFormation stack and all associated AWS resources.
@@ -305,12 +312,34 @@ def delete(stack_name: str, debug: bool) -> None:
305312
.. code-block:: bash
306313
307314
# Delete a cluster stack
308-
hyp delete hyp-cluster my-stack-name
315+
hyp delete cluster-stack my-stack-name --region us-west-2
316+
317+
# Delete with retained resources (only works on DELETE_FAILED stacks)
318+
hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models --region us-west-2
309319
"""
310320
logger = setup_logging(logging.getLogger(__name__), debug)
311321

312-
logger.info(f"Deleting stack: {stack_name}")
313-
logger.info("This feature is not yet implemented.")
322+
try:
323+
# Use the high-level orchestration function with CLI-specific callbacks
324+
delete_stack_with_confirmation(
325+
stack_name=stack_name,
326+
region=region,
327+
retain_resources_str=retain_resources or "",
328+
message_callback=click.echo,
329+
confirm_callback=lambda msg: click.confirm("Continue?", default=False),
330+
success_callback=lambda msg: click.echo(f"✓ {msg}")
331+
)
332+
333+
except StackNotFoundError:
334+
click.secho(f"❌ Stack '{stack_name}' not found", fg='red')
335+
except click.ClickException:
336+
# Re-raise ClickException for proper CLI error handling
337+
raise
338+
except Exception as e:
339+
logger.error(f"Failed to delete stack: {e}")
340+
if debug:
341+
logger.exception("Detailed error information:")
342+
raise click.ClickException(str(e))
314343

315344
@click.command("cluster")
316345
@click.option("--cluster-name", required=True, help="The name of the cluster to update")
@@ -376,4 +405,3 @@ def update_cluster(
376405

377406
logger.info("Cluster has been updated")
378407
click.secho(f"Cluster {cluster_name} has been updated")
379-

src/sagemaker/hyperpod/cli/common_utils.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import sys
2-
from typing import Mapping, Type
2+
from typing import Mapping, Type, List, Dict, Any
33
import click
44
import pkgutil
55
import json
@@ -68,4 +68,54 @@ def load_schema_for_version(
6868
f"Could not load schema.json for version {version} "
6969
f"(looked in package {ver_pkg})"
7070
)
71-
return json.loads(raw)
71+
return json.loads(raw)
72+
73+
74+
def parse_comma_separated_list(value: str) -> List[str]:
    """
    Split a comma-delimited string into its individual entries.

    Reusable helper for any command that accepts a list as a single
    comma-separated option value.

    Args:
        value: Text such as "item1, item2,item3". An empty (or otherwise
            falsy) value yields an empty list.

    Returns:
        The entries in their original order, each stripped of surrounding
        whitespace; entries that are blank after stripping are dropped.
    """
    if not value:
        return []

    # Strip every piece once, then discard the ones that ended up empty
    # (e.g. produced by "a,,b" or a trailing comma).
    stripped_pieces = (piece.strip() for piece in value.split(","))
    return [piece for piece in stripped_pieces if piece]
88+
89+
90+
def categorize_resources_by_type(resources: List[Dict[str, Any]],
                                 type_mappings: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Group resource logical IDs into named categories by resource-type prefix.

    A resource belongs to the first category (in ``type_mappings`` iteration
    order) that lists a prefix of its ``ResourceType``. Resources matching no
    category are collected under ``"Other"``.

    Args:
        resources: Resource dictionaries carrying 'ResourceType' and
            'LogicalResourceId'. Missing or ``None`` values are treated as
            empty strings; ``None`` for the whole list is treated as empty.
        type_mappings: Category name -> list of resource-type prefixes.

    Returns:
        Dictionary of category -> list of logical resource IDs, in input
        order. Categories that received no resources are omitted.
    """
    # str.startswith accepts a tuple of prefixes, so convert each category's
    # prefix list once instead of looping over it for every resource.
    prefixes_by_category = {
        category: tuple(types) for category, types in type_mappings.items()
    }

    categorized: Dict[str, List[str]] = {category: [] for category in prefixes_by_category}
    categorized.setdefault("Other", [])

    for resource in resources or []:
        # `.get(key, "")` only covers a missing key; `or ""` also covers a
        # key that is present but explicitly None, which would otherwise
        # raise AttributeError on `.startswith`.
        resource_type = resource.get("ResourceType") or ""
        logical_id = resource.get("LogicalResourceId") or ""

        # First matching category wins; fall back to "Other".
        category = next(
            (
                name
                for name, prefixes in prefixes_by_category.items()
                if resource_type.startswith(prefixes)
            ),
            "Other",
        )
        categorized[category].append(logical_id)

    # Remove empty categories
    return {category: ids for category, ids in categorized.items() if ids}

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
1111
get_monitoring
1212
from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
13-
list_cluster_stacks, update_cluster
13+
list_cluster_stacks, update_cluster, delete_cluster_stack
1414
from sagemaker.hyperpod.cli.commands.training import (
1515
pytorch_create,
1616
list_jobs,
@@ -190,6 +190,7 @@ def exec():
190190
delete.add_command(pytorch_delete)
191191
delete.add_command(js_delete)
192192
delete.add_command(custom_delete)
193+
delete.add_command(delete_cluster_stack)
193194

194195
list_pods.add_command(pytorch_list_pods)
195196
list_pods.add_command(js_list_pods)

src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,116 @@ def check_status(stack_name: str, region: Optional[str] = None):
537537
"""
538538
return HpClusterStack._get_stack_status_helper(stack_name, region)
539539

540+
@staticmethod
def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None,
           logger: Optional[logging.Logger] = None) -> None:
    """Deletes a HyperPod cluster CloudFormation stack.

    Removes the specified CloudFormation stack and all associated AWS resources.
    This operation cannot be undone and proceeds automatically without confirmation.

    **Parameters:**

    .. list-table::
       :header-rows: 1
       :widths: 20 20 60

       * - Parameter
         - Type
         - Description
       * - stack_name
         - str
         - Name of the CloudFormation stack to delete
       * - region
         - str, optional
         - AWS region where the stack exists. Falls back to the default boto3 session region when omitted
       * - retain_resources
         - List[str], optional
         - List of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks)
       * - logger
         - logging.Logger, optional
         - Logger instance for output messages. Uses default logger if not provided

    **Raises:**

    ValueError: When stack doesn't exist or retain_resources limitation is encountered
    RuntimeError: When termination protection blocks the deletion, or when deletion
        fails for any other reason (the original exception is attached as ``__cause__``)

    .. dropdown:: Usage Examples
       :open:

       .. code-block:: python

          >>> # Delete a stack (automatically proceeds without confirmation)
          >>> HpClusterStack.delete("my-stack-name")
          >>>
          >>> # Delete in specific region
          >>> HpClusterStack.delete("my-stack-name", region="us-west-2")
          >>>
          >>> # Delete with retained resources (only works on DELETE_FAILED stacks)
          >>> HpClusterStack.delete("my-stack-name", retain_resources=["S3Bucket", "EFSFileSystem"])
          >>>
          >>> # Delete with custom logger
          >>> import logging
          >>> logger = logging.getLogger(__name__)
          >>> HpClusterStack.delete("my-stack-name", logger=logger)
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import with the CLI package - confirm.
    from sagemaker.hyperpod.cli.cluster_stack_utils import (
        delete_stack_with_confirmation,
        StackNotFoundError
    )

    if logger is None:
        logger = logging.getLogger(__name__)

    # Convert retain_resources list to comma-separated string for the utility function
    retain_resources_str = ",".join(retain_resources) if retain_resources else ""

    def sdk_confirm_callback(message: str) -> bool:
        """SDK-specific confirmation callback - always auto-confirms."""
        logger.info(f"Auto-confirming: {message}")
        return True

    try:
        delete_stack_with_confirmation(
            stack_name=stack_name,
            region=region or boto3.session.Session().region_name,
            retain_resources_str=retain_resources_str,
            message_callback=logger.info,
            confirm_callback=sdk_confirm_callback,
            success_callback=logger.info
        )
    except StackNotFoundError as e:
        error_msg = f"Stack '{stack_name}' not found"
        logger.error(error_msg)
        # Chain explicitly so the underlying error stays visible as the cause.
        raise ValueError(error_msg) from e
    except Exception as e:
        error_str = str(e)

        # Handle CloudFormation retain-resources limitation with clear exception for SDK
        if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str:
            error_msg = (
                "CloudFormation limitation: retain_resources can only be used on stacks in DELETE_FAILED state. "
                "Current stack state allows normal deletion. Try deleting without retain_resources first, "
                "then retry with retain_resources if deletion fails."
            )
            logger.error(error_msg)
            raise ValueError(error_msg) from e

        # Handle termination protection
        if "TerminationProtection is enabled" in error_str:
            error_msg = (
                "Stack deletion blocked: Termination Protection is enabled. "
                "Disable termination protection first using AWS CLI or Console."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg) from e

        # Handle other errors
        logger.error(f"Failed to delete stack: {error_str}")
        raise RuntimeError(f"Stack deletion failed: {error_str}") from e
649+
540650

541651
def _yaml_to_json_string(yaml_path) -> str:
542652
"""Convert YAML file to JSON string"""

0 commit comments

Comments
 (0)