Skip to content

Commit fdaf009

Browse files
author
Mohamed Zeidan
committed
delete cluster stack
1 parent 5a346e8 commit fdaf009

File tree

3 files changed

+239
-4
lines changed

3 files changed

+239
-4
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Lightweight command manifest for fast CLI startup.
3+
4+
This module provides command metadata without importing heavy dependencies.
5+
It's designed to be imported quickly during CLI initialization.
6+
"""
7+
8+
# Command manifest - lightweight metadata without heavy imports
9+
# Maps each top-level CLI command name to the module that implements it and
# to the one-line help text shown for it. Only plain literals are used here,
# so reading the manifest does not import any command implementation.
COMMAND_MANIFEST = {
    # Commands implemented by the 'cluster' module
    "list-cluster": dict(
        module="cluster",
        help="List available SageMaker HyperPod clusters.",
    ),
    "set-cluster-context": dict(
        module="cluster",
        help="Configure local Kubectl environment.",
    ),
    "get-cluster-context": dict(
        module="cluster",
        help="Get current cluster context information.",
    ),
    "get-monitoring": dict(
        module="cluster",
        help="Get monitoring configuration and URLs.",
    ),

    # Commands implemented by the 'init' module
    "init": dict(
        module="init",
        help="Initialize a new HyperPod project.",
    ),
    "reset": dict(
        module="init",
        help="Reset HyperPod configuration.",
    ),
    "configure": dict(
        module="init",
        help="Configure HyperPod settings.",
    ),
    "validate": dict(
        module="init",
        help="Validate HyperPod configuration.",
    ),
}
46+
47+
# The return annotation is quoted so it is never evaluated at runtime: the
# module keeps its zero-import promise and still loads on interpreters that
# predate the ``X | Y`` union syntax.
def get_module_for_command(command_name: str) -> "str | None":
    """Get the module name for a command without importing anything.

    Args:
        command_name: Command name as a key of ``COMMAND_MANIFEST``
            (for example ``'list-cluster'``).

    Returns:
        The value stored under the entry's ``'module'`` key, or ``None`` when
        the command is not in the manifest (or its entry has no ``'module'``).
    """
    entry = COMMAND_MANIFEST.get(command_name)
    if entry is None:
        # Unknown command: report "no module" rather than raising.
        return None
    return entry.get('module')
50+
51+
def get_help_for_command(command_name: str) -> str:
    """Return the help text for a command using only the manifest.

    When the manifest has no help text for ``command_name``, a generic
    description is derived from the name itself: dashes become spaces, the
    words are title-cased and ``' operations.'`` is appended.
    """
    # Built up front, exactly as a default argument to ``dict.get`` would be.
    generic_help = f'{command_name.replace("-", " ").title()} operations.'
    manifest_entry = COMMAND_MANIFEST.get(command_name, {})
    if 'help' in manifest_entry:
        return manifest_entry['help']
    return generic_help
54+
55+
def get_top_level_commands() -> list:
    """Return the names of all manifest commands as a new list.

    The names come back in the manifest's own order, and nothing is imported
    to produce them.
    """
    # Unpacking a dict yields its keys.
    return [*COMMAND_MANIFEST]

src/sagemaker/hyperpod/cli/commands/cluster_stack.py

Lines changed: 180 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from sagemaker.hyperpod.common.telemetry.constants import Feature
1919
from sagemaker.hyperpod.common.utils import setup_logging
2020
from sagemaker.hyperpod.cli.utils import convert_datetimes
21+
from sagemaker.hyperpod import create_boto3_client
2122

2223
logger = logging.getLogger(__name__)
2324

@@ -292,8 +293,12 @@ def list_cluster_stacks(region, debug, status):
292293

293294
def _parse_retain_resources(retain_resources):
    """Split the comma-separated --retain-resources value into a list of names.

    Surrounding whitespace and empty entries are dropped; ``None`` or an empty
    string yields an empty list.
    """
    if not retain_resources:
        return []
    return [name.strip() for name in retain_resources.split(',') if name.strip()]


def _list_all_stack_resources(cf_client, stack_name):
    """Return every resource summary of the stack, following result pagination.

    A single ``list_stack_resources`` call returns only one page, so the
    paginator is used to avoid under-reporting large stacks.
    """
    paginator = cf_client.get_paginator('list_stack_resources')
    summaries = []
    for page in paginator.paginate(StackName=stack_name):
        summaries.extend(page.get('StackResourceSummaries', []))
    return summaries


def _categorize_stack_resources(resources):
    """Group resource summaries into display buckets, one formatted line per resource."""
    networking_markers = ('VPC', 'SecurityGroup', 'InternetGateway', 'Subnet', 'RouteTable')
    storage_markers = ('S3', 'EFS', 'EBS')
    categories = {
        'EC2 Instances': [],
        'Networking': [],
        'IAM': [],
        'Storage': [],
        'Other': [],
    }

    for resource in resources:
        resource_type = resource.get('ResourceType', '')
        resource_name = resource.get('LogicalResourceId', '')
        physical_id = resource.get('PhysicalResourceId', '')

        # First matching rule wins; the order of these checks is significant.
        if 'EC2::Instance' in resource_type:
            categories['EC2 Instances'].append(f" - {resource_name} ({physical_id})")
        elif any(marker in resource_type for marker in networking_markers):
            categories['Networking'].append(f" - {resource_name}")
        elif 'IAM' in resource_type:
            categories['IAM'].append(f" - {resource_name}")
        elif any(marker in resource_type for marker in storage_markers):
            categories['Storage'].append(f" - {resource_name}")
        else:
            categories['Other'].append(f" - {resource_name}")

    return categories


@click.command("cluster-stack")
@click.argument("stack-name", required=True)
@click.option("--retain-resources", help="Comma-separated list of resources to retain during deletion")
@click.option("--force-with-retain", is_flag=True, help="Force deletion with retention of failed resources")
@click.option("--region", help="AWS region")
@click.option("--debug", is_flag=True, help="Enable debug logging")
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli")
def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_retain: bool, region: str, debug: bool) -> None:
    """Delete a HyperPod cluster stack.

    Removes the specified CloudFormation stack and all associated AWS resources.
    The stack's resources are listed first and a confirmation is requested
    unless --force-with-retain is given.

    \b
    # Delete a cluster stack
    hyp delete cluster-stack my-stack-name

    \b
    # Delete with retained resources
    hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models

    \b
    # Force deletion with retention
    hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData --force-with-retain
    """
    # Engineer notes (kept out of the docstring because click renders it as --help):
    #   stack_name        -- name of the CloudFormation stack to delete.
    #   retain_resources  -- raw comma-separated value of --retain-resources, or None.
    #   force_with_retain -- when set, the confirmation prompt is skipped.
    #   region            -- region passed to the CloudFormation client, or None.
    #   debug             -- enables debug logging and a traceback on failure.
    # Raises click.ClickException for every failure that is reported to the user.
    logger = setup_logging(logging.getLogger(__name__), debug)

    try:
        retain_list = _parse_retain_resources(retain_resources)

        # Get stack resources for warning display
        cf_client = create_boto3_client('cloudformation', region_name=region)

        try:
            resources = _list_all_stack_resources(cf_client, stack_name)
        except Exception as e:
            if "does not exist" in str(e):
                click.secho(f"❌ Stack '{stack_name}' not found", fg='red')
                return
            raise

        if not resources:
            click.secho(f"❌ No resources found in stack '{stack_name}'", fg='red')
            return

        resource_categories = _categorize_stack_resources(resources)

        # Every resource lands in exactly one bucket, so this equals len(resources).
        total_resources = sum(len(items) for items in resource_categories.values())
        retained_count = len(retain_list)

        # Display warning
        click.secho(f"⚠ WARNING: This will delete the following {total_resources} resources:", fg='yellow')
        click.echo()

        for category, items in resource_categories.items():
            if items:
                click.echo(f"{category} ({len(items)}):")
                for item in items:
                    click.echo(item)
                click.echo()

        if retain_list:
            click.secho(f"The following {retained_count} resources will be RETAINED:", fg='green')
            for resource in retain_list:
                click.secho(f" ✓ {resource} (retained)", fg='green')
            click.echo()

        # Confirmation prompt (skip if force flag is used)
        if not force_with_retain:
            if not click.confirm("Continue?", default=False):
                click.echo("Operation cancelled.")
                return

        # Perform deletion
        delete_params = {'StackName': stack_name}
        if retain_list:
            delete_params['RetainResources'] = retain_list

        logger.info(f"Deleting stack: {stack_name} with params: {delete_params}")

        try:
            cf_client.delete_stack(**delete_params)
        except Exception as delete_error:
            # Handle termination protection specifically
            if "TerminationProtection is enabled" in str(delete_error):
                click.secho("❌ Stack deletion blocked: Termination Protection is enabled", fg='red')
                click.echo()
                click.secho("To delete this stack, first disable termination protection:", fg='yellow')
                # NOTE(review): when --region is omitted the suggested command falls back
                # to 'us-west-2', which may differ from the region the client actually used.
                click.secho(f"aws cloudformation update-termination-protection --no-enable-termination-protection --stack-name {stack_name} --region {region or 'us-west-2'}", fg='cyan')
                click.echo()
                click.secho("Then retry the delete command.", fg='yellow')
                raise click.ClickException("Termination protection must be disabled before deletion") from delete_error

            # Handle partial deletion failures
            click.secho("✗ Stack deletion failed", fg='red')

            # Try to get current stack resources to show what was deleted
            try:
                current_resources = _list_all_stack_resources(cf_client, stack_name)
                current_resource_names = {r['LogicalResourceId'] for r in current_resources}
                original_resource_names = {r['LogicalResourceId'] for r in resources}

                deleted_resources = original_resource_names - current_resource_names
                failed_resources = current_resource_names - set(retain_list)

                if deleted_resources:
                    click.echo()
                    click.secho(f"Successfully deleted ({len(deleted_resources)}):", fg='green')
                    for resource in deleted_resources:
                        click.secho(f" ✓ {resource}", fg='green')

                if failed_resources:
                    click.echo()
                    click.secho(f"Failed to delete ({len(failed_resources)}):", fg='red')
                    for resource in failed_resources:
                        click.secho(f" ✗ {resource} (DependencyViolation: has dependent resources)", fg='red')

                if retain_list:
                    click.echo()
                    click.secho(f"Successfully retained as requested ({len(retain_list)}):", fg='green')
                    for resource in retain_list:
                        click.secho(f" ✓ {resource} (retained)", fg='green')

                click.echo()
                click.secho("Run with --force-with-retain to complete deletion of remaining resources", fg='yellow')

            except Exception:
                # If we can't get current resources, show generic error.
                # Deliberately not a bare except: Ctrl-C must still interrupt.
                click.secho(f"Error: {delete_error}", fg='red')

            # Re-raise the original error so the outer handler classifies and
            # reports it exactly once.
            raise
        else:
            # Success output lives outside the try so that a failure while
            # printing is not mistaken for a failed delete_stack call.
            # NOTE(review): this runs as soon as delete_stack returns; the
            # "completed" wording below presumes the deletion has finished
            # by then, which should be confirmed against the API semantics.
            if force_with_retain:
                click.secho("✓ Force deletion completed", fg='green')
                click.secho(f"✓ Deleted all possible resources ({total_resources - retained_count}/{total_resources})", fg='green')

                if retain_list:
                    click.echo()
                    click.secho(f"Retained due to user request ({len(retain_list)}):", fg='green')
                    for resource in retain_list:
                        click.secho(f" ✓ {resource} (user requested)", fg='green')

                click.echo()
                click.secho(f"✓ Stack '{stack_name}' deletion completed with retentions", fg='green')
            else:
                click.secho(f"✓ Stack '{stack_name}' deletion initiated successfully", fg='green')

                if retain_list:
                    click.echo()
                    click.secho(f"Successfully retained as requested ({len(retain_list)}):", fg='green')
                    for resource in retain_list:
                        click.secho(f" ✓ {resource} (retained)", fg='green')

    except (click.ClickException, click.Abort):
        # Already user-facing (termination protection guidance, or the user
        # aborted the prompt): let click handle it without a second report.
        raise
    except Exception as e:
        logger.error("Failed to delete stack: %s", e)
        if debug:
            logger.exception("Detailed error information:")

        if "does not exist" in str(e):
            click.secho(f"❌ Stack '{stack_name}' not found", fg='red')
        elif "AccessDenied" in str(e):
            click.secho("❌ Access denied. Check AWS permissions", fg='red')
        else:
            click.secho(f"❌ Error deleting stack: {e}", fg='red')

        raise click.ClickException(str(e)) from e
314491

315492
@click.command("cluster")
316493
@click.option("--cluster-name", required=True, help="The name of the cluster to update")

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
1111
get_monitoring
1212
from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
13-
list_cluster_stacks, update_cluster
13+
list_cluster_stacks, update_cluster, delete_cluster_stack
1414
from sagemaker.hyperpod.cli.commands.training import (
1515
pytorch_create,
1616
list_jobs,
@@ -190,6 +190,7 @@ def exec():
190190
delete.add_command(pytorch_delete)
191191
delete.add_command(js_delete)
192192
delete.add_command(custom_delete)
193+
delete.add_command(delete_cluster_stack)
193194

194195
list_pods.add_command(pytorch_list_pods)
195196
list_pods.add_command(js_list_pods)

0 commit comments

Comments
 (0)