Skip to content

Commit ced09aa

Browse files
author
Mohamed Zeidan
committed
delete cluster stack
1 parent fdaf009 commit ced09aa

File tree

1 file changed

+66
-37
lines changed

1 file changed

+66
-37
lines changed

src/sagemaker/hyperpod/cli/commands/cluster_stack.py

Lines changed: 66 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -294,11 +294,10 @@ def list_cluster_stacks(region, debug, status):
294294
@click.command("cluster-stack")
295295
@click.argument("stack-name", required=True)
296296
@click.option("--retain-resources", help="Comma-separated list of resources to retain during deletion")
297-
@click.option("--force-with-retain", is_flag=True, help="Force deletion with retention of failed resources")
298-
@click.option("--region", help="AWS region")
297+
@click.option("--region", required=True, help="AWS region (required)")
299298
@click.option("--debug", is_flag=True, help="Enable debug logging")
300299
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli")
301-
def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_retain: bool, region: str, debug: bool) -> None:
300+
def delete_cluster_stack(stack_name: str, retain_resources: str, region: str, debug: bool) -> None:
302301
"""Delete a HyperPod cluster stack.
303302
304303
Removes the specified CloudFormation stack and all associated AWS resources.
@@ -310,13 +309,10 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
310309
.. code-block:: bash
311310
312311
# Delete a cluster stack
313-
hyp delete hyp-cluster my-stack-name
314-
315-
# Delete with retained resources
316-
hyp delete hyp-cluster my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models
312+
hyp delete cluster-stack my-stack-name --region us-west-2
317313
318-
# Force deletion with retention
319-
hyp delete hyp-cluster my-stack-name --retain-resources S3Bucket-TrainingData --force-with-retain
314+
# Delete with retained resources (only works on DELETE_FAILED stacks)
315+
hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models --region us-west-2
320316
"""
321317
logger = setup_logging(logging.getLogger(__name__), debug)
322318

@@ -342,7 +338,29 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
342338
click.secho(f"❌ No resources found in stack '{stack_name}'", fg='red')
343339
return
344340

345-
# Categorize resources
341+
# Validate retain resources exist in stack
342+
if retain_list:
343+
existing_resource_names = {r.get('LogicalResourceId', '') for r in resources}
344+
valid_retain_resources = []
345+
invalid_retain_resources = []
346+
347+
for resource in retain_list:
348+
if resource in existing_resource_names:
349+
valid_retain_resources.append(resource)
350+
else:
351+
invalid_retain_resources.append(resource)
352+
353+
# Show warning for non-existent resources
354+
if invalid_retain_resources:
355+
click.secho(f"⚠️ Warning: The following {len(invalid_retain_resources)} resources don't exist in the stack:", fg='yellow')
356+
for resource in invalid_retain_resources:
357+
click.secho(f" - {resource} (not found)", fg='yellow')
358+
click.echo()
359+
360+
# Update retain_list to only include valid resources
361+
retain_list = valid_retain_resources
362+
363+
# Categorize resources (excluding retained ones from deletion display)
346364
resource_categories = {
347365
'EC2 Instances': [],
348366
'Networking': [],
@@ -356,6 +374,10 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
356374
resource_name = resource.get('LogicalResourceId', '')
357375
physical_id = resource.get('PhysicalResourceId', '')
358376

377+
# Skip resources that will be retained
378+
if resource_name in retain_list:
379+
continue
380+
359381
if 'EC2::Instance' in resource_type:
360382
resource_categories['EC2 Instances'].append(f" - {resource_name} ({physical_id})")
361383
elif any(net_type in resource_type for net_type in ['VPC', 'SecurityGroup', 'InternetGateway', 'Subnet', 'RouteTable']):
@@ -367,7 +389,7 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
367389
else:
368390
resource_categories['Other'].append(f" - {resource_name}")
369391

370-
# Count total resources
392+
# Count total resources (excluding retained ones)
371393
total_resources = sum(len(category) for category in resource_categories.values())
372394
retained_count = len(retain_list)
373395

@@ -388,11 +410,10 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
388410
click.secho(f" ✓ {resource} (retained)", fg='green')
389411
click.echo()
390412

391-
# Confirmation prompt (skip if force flag is used)
392-
if not force_with_retain:
393-
if not click.confirm("Continue?", default=False):
394-
click.echo("Operation cancelled.")
395-
return
413+
# Confirmation prompt
414+
if not click.confirm("Continue?", default=False):
415+
click.echo("Operation cancelled.")
416+
return
396417

397418
# Perform deletion
398419
delete_params = {'StackName': stack_name}
@@ -404,26 +425,16 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
404425
try:
405426
cf_client.delete_stack(**delete_params)
406427

407-
if force_with_retain:
408-
click.secho("✓ Force deletion completed", fg='green')
409-
click.secho(f"✓ Deleted all possible resources ({total_resources - retained_count}/{total_resources})", fg='green')
410-
411-
if retain_list:
412-
click.echo()
413-
click.secho(f"Retained due to user request ({len(retain_list)}):", fg='green')
414-
for resource in retain_list:
415-
click.secho(f" ✓ {resource} (user requested)", fg='green')
416-
428+
click.secho(f"✓ Stack '{stack_name}' deletion initiated successfully", fg='green')
429+
430+
if retain_list:
417431
click.echo()
418-
click.secho(f"✓ Stack '{stack_name}' deletion completed with retentions", fg='green')
419-
else:
420-
click.secho(f"✓ Stack '{stack_name}' deletion initiated successfully", fg='green')
421-
422-
if retain_list:
423-
click.echo()
424-
click.secho(f"Successfully retained as requested ({len(retain_list)}):", fg='green')
425-
for resource in retain_list:
426-
click.secho(f" ✓ {resource} (retained)", fg='green')
432+
click.secho(f"Successfully retained as requested ({len(retain_list)}):", fg='green')
433+
for resource in retain_list:
434+
click.secho(f" ✓ {resource} (retained)", fg='green')
435+
click.echo()
436+
click.secho("💡 Retained resources will remain as standalone AWS resources", fg='cyan')
437+
click.secho(" You can access them directly via AWS Console/CLI using their physical resource IDs", fg='cyan')
427438

428439
except Exception as delete_error:
429440
# Handle termination protection specifically
@@ -436,6 +447,21 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
436447
click.secho("Then retry the delete command.", fg='yellow')
437448
raise click.ClickException("Termination protection must be disabled before deletion")
438449

450+
# Handle CloudFormation retain-resources limitation
451+
if retain_list and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in str(delete_error):
452+
click.secho("❌ CloudFormation limitation: --retain-resources only works on failed deletions", fg='red')
453+
click.echo()
454+
click.secho("💡 Recommended workflow:", fg='yellow')
455+
click.secho("1. First try deleting without --retain-resources:", fg='cyan')
456+
click.secho(f" hyp delete cluster-stack {stack_name} --region {region or 'us-west-2'}", fg='cyan')
457+
click.echo()
458+
click.secho("2. If deletion fails, the stack will be in DELETE_FAILED state", fg='cyan')
459+
click.secho("3. Then retry with --retain-resources to keep specific resources:", fg='cyan')
460+
click.secho(f" hyp delete cluster-stack {stack_name} --retain-resources {retain_resources} --region {region or 'us-west-2'}", fg='cyan')
461+
click.echo()
462+
click.secho("⚠️ Alternative: Delete without retention and manually preserve important data first", fg='yellow')
463+
return # Exit gracefully without raising exception
464+
439465
# Handle partial deletion failures
440466
click.secho("✗ Stack deletion failed", fg='red')
441467

@@ -467,7 +493,8 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
467493
click.secho(f" ✓ {resource} (retained)", fg='green')
468494

469495
click.echo()
470-
click.secho("Run with --force-with-retain to complete deletion of remaining resources", fg='yellow')
496+
click.secho("💡 Note: Some resources may have dependencies preventing deletion", fg='yellow')
497+
click.secho(" Check the AWS CloudFormation console for detailed dependency information", fg='cyan')
471498

472499
except:
473500
# If we can't get current resources, show generic error
@@ -487,7 +514,9 @@ def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_reta
487514
else:
488515
click.secho(f"❌ Error deleting stack: {e}", fg='red')
489516

490-
raise click.ClickException(str(e))
517+
# Only raise exception for truly unexpected errors, not user-facing ones
518+
if not any(msg in str(e) for msg in ["does not exist", "AccessDenied"]):
519+
raise click.ClickException(str(e))
491520

492521
@click.command("cluster")
493522
@click.option("--cluster-name", required=True, help="The name of the cluster to update")

0 commit comments

Comments
 (0)