1818from sagemaker .hyperpod .common .telemetry .constants import Feature
1919from sagemaker .hyperpod .common .utils import setup_logging
2020from sagemaker .hyperpod .cli .utils import convert_datetimes
21+ from sagemaker .hyperpod import create_boto3_client
2122
2223logger = logging .getLogger (__name__ )
2324
@@ -292,8 +293,12 @@ def list_cluster_stacks(region, debug, status):
292293
def _list_all_stack_resources(cf_client, stack_name: str) -> list:
    """Return every resource summary of ``stack_name``, following ``NextToken`` pages.

    A single ``list_stack_resources`` call may return only part of a stack,
    so the call is repeated until the response carries no ``NextToken``.
    """
    summaries = []
    request = {'StackName': stack_name}
    while True:
        page = cf_client.list_stack_resources(**request)
        summaries.extend(page.get('StackResourceSummaries', []))
        next_token = page.get('NextToken')
        if not next_token:
            return summaries
        request['NextToken'] = next_token


def _categorize_stack_resources(resources: list) -> dict:
    """Group resource summaries into display lines keyed by category title.

    Matching is by substring of ``ResourceType``; the first matching category
    wins, in the order EC2 instances, networking, IAM, storage, other.
    """
    networking_markers = ('VPC', 'SecurityGroup', 'InternetGateway', 'Subnet', 'RouteTable')
    storage_markers = ('S3', 'EFS', 'EBS')
    categories = {
        'EC2 Instances': [],
        'Networking': [],
        'IAM': [],
        'Storage': [],
        'Other': [],
    }

    for resource in resources:
        resource_type = resource.get('ResourceType', '')
        resource_name = resource.get('LogicalResourceId', '')

        if 'EC2::Instance' in resource_type:
            physical_id = resource.get('PhysicalResourceId', '')
            categories['EC2 Instances'].append(f" - {resource_name} ({physical_id})")
        elif any(marker in resource_type for marker in networking_markers):
            categories['Networking'].append(f" - {resource_name}")
        elif 'IAM' in resource_type:
            categories['IAM'].append(f" - {resource_name}")
        elif any(marker in resource_type for marker in storage_markers):
            categories['Storage'].append(f" - {resource_name}")
        else:
            categories['Other'].append(f" - {resource_name}")

    return categories


def _report_partial_deletion(cf_client, stack_name: str, original_resources: list, retain_list: list) -> None:
    """Print which resources are gone, which remain, and which were retained.

    Compares the stack's current resources with ``original_resources`` (the
    listing taken before the delete request). Raises whatever the listing
    call raises; the caller falls back to a generic error message.
    """
    current = {
        summary['LogicalResourceId']: summary
        for summary in _list_all_stack_resources(cf_client, stack_name)
    }
    original_ids = {summary['LogicalResourceId'] for summary in original_resources}

    # Sorted so the report is stable from run to run.
    deleted_resources = sorted(original_ids - set(current))
    failed_resources = sorted(set(current) - set(retain_list))

    if deleted_resources:
        click.echo()
        click.secho(f"Successfully deleted ({len(deleted_resources)}):", fg='green')
        for name in deleted_resources:
            click.secho(f" ✓ {name}", fg='green')

    if failed_resources:
        click.echo()
        click.secho(f"Failed to delete ({len(failed_resources)}):", fg='red')
        for name in failed_resources:
            # Show what CloudFormation reported instead of guessing a cause.
            reason = current[name].get('ResourceStatusReason') or current[name].get('ResourceStatus')
            suffix = f" ({reason})" if reason else ""
            click.secho(f" ✗ {name}{suffix}", fg='red')

    if retain_list:
        click.echo()
        click.secho(f"Successfully retained as requested ({len(retain_list)}):", fg='green')
        for name in retain_list:
            click.secho(f" ✓ {name} (retained)", fg='green')

    click.echo()
    click.secho("Run with --force-with-retain to complete deletion of remaining resources", fg='yellow')


@click.command("cluster-stack")
@click.argument("stack-name", required=True)
@click.option("--retain-resources", help="Comma-separated list of resources to retain during deletion")
@click.option("--force-with-retain", is_flag=True, help="Force deletion with retention of failed resources")
@click.option("--region", help="AWS region")
@click.option("--debug", is_flag=True, help="Enable debug logging")
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli")
def delete_cluster_stack(stack_name: str, retain_resources: str, force_with_retain: bool, region: str, debug: bool) -> None:
    """Delete a HyperPod cluster stack.

    Removes the specified CloudFormation stack and all associated AWS resources.

    The stack's resources are listed and grouped for review, then a
    confirmation prompt is shown (skipped when --force-with-retain is given)
    before the delete request is sent. Logical resource IDs passed through
    --retain-resources are forwarded to CloudFormation as RetainResources.

    Examples:

      # Delete a cluster stack
      hyp delete hyp-cluster my-stack-name

      # Delete with retained resources
      hyp delete hyp-cluster my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models

      # Force deletion with retention
      hyp delete hyp-cluster my-stack-name --retain-resources S3Bucket-TrainingData --force-with-retain

    Raises:
        click.ClickException: If the stack cannot be listed or the delete
            request is rejected (for example, termination protection is on).
    """
    logger = setup_logging(logging.getLogger(__name__), debug)

    try:
        # Parse the comma-separated retain list, dropping empty entries.
        retain_list = [name.strip() for name in (retain_resources or '').split(',') if name.strip()]

        cf_client = create_boto3_client('cloudformation', region_name=region)

        try:
            resources = _list_all_stack_resources(cf_client, stack_name)
        except Exception as e:
            if "does not exist" in str(e):
                click.secho(f"❌ Stack '{stack_name}' not found", fg='red')
                return
            raise

        if not resources:
            click.secho(f"❌ No resources found in stack '{stack_name}'", fg='red')
            return

        resource_categories = _categorize_stack_resources(resources)
        total_resources = len(resources)
        retained_count = len(retain_list)

        # Only names that really exist in the stack reduce the deleted count;
        # otherwise a typo in --retain-resources would skew (or negate) it.
        known_ids = {summary.get('LogicalResourceId') for summary in resources}
        unknown_retained = [name for name in retain_list if name not in known_ids]
        deletable_count = total_resources - len({name for name in retain_list if name in known_ids})

        click.secho(f"⚠ WARNING: This will delete the following {total_resources} resources:", fg='yellow')
        click.echo()

        for category, items in resource_categories.items():
            if items:
                click.echo(f"{category} ({len(items)}):")
                for item in items:
                    click.echo(item)
                click.echo()

        if retain_list:
            click.secho(f"The following {retained_count} resources will be RETAINED:", fg='green')
            for name in retain_list:
                click.secho(f" ✓ {name} (retained)", fg='green')
            click.echo()

        if unknown_retained:
            click.secho(
                f"⚠ Not found in stack '{stack_name}': {', '.join(unknown_retained)}",
                fg='yellow',
            )
            click.echo()

        # Confirmation prompt (skipped when the force flag is used).
        if not force_with_retain and not click.confirm("Continue?", default=False):
            click.echo("Operation cancelled.")
            return

        delete_params = {'StackName': stack_name}
        if retain_list:
            # NOTE(review): CloudFormation documents RetainResources for stacks
            # in DELETE_FAILED state - confirm behavior for other states.
            delete_params['RetainResources'] = retain_list

        logger.info("Deleting stack: %s with params: %s", stack_name, delete_params)

        try:
            cf_client.delete_stack(**delete_params)
        except Exception as delete_error:
            logger.error("Failed to delete stack: %s", delete_error)
            if debug:
                logger.exception("Detailed error information:")

            # Handle termination protection specifically.
            if "TerminationProtection is enabled" in str(delete_error):
                # Suggest the region the client actually used, not a fixed default.
                effective_region = region or cf_client.meta.region_name
                region_arg = f" --region {effective_region}" if effective_region else ""
                click.secho("❌ Stack deletion blocked: Termination Protection is enabled", fg='red')
                click.echo()
                click.secho("To delete this stack, first disable termination protection:", fg='yellow')
                click.secho(
                    "aws cloudformation update-termination-protection "
                    f"--no-enable-termination-protection --stack-name {stack_name}{region_arg}",
                    fg='cyan',
                )
                click.echo()
                click.secho("Then retry the delete command.", fg='yellow')
                raise click.ClickException(
                    "Termination protection must be disabled before deletion"
                ) from delete_error

            click.secho("✗ Stack deletion failed", fg='red')
            try:
                _report_partial_deletion(cf_client, stack_name, resources, retain_list)
            except Exception:
                # Best effort: if the stack cannot be re-listed, show the raw error.
                click.secho(f"Error: {delete_error}", fg='red')

            raise click.ClickException(str(delete_error)) from delete_error

        # NOTE(review): delete_stack returns once the request is accepted; the
        # "completed" wording below may precede actual deletion - confirm.
        if force_with_retain:
            click.secho("✓ Force deletion completed", fg='green')
            click.secho(f"✓ Deleted all possible resources ({deletable_count}/{total_resources})", fg='green')

            if retain_list:
                click.echo()
                click.secho(f"Retained due to user request ({retained_count}):", fg='green')
                for name in retain_list:
                    click.secho(f" ✓ {name} (user requested)", fg='green')

            click.echo()
            click.secho(f"✓ Stack '{stack_name}' deletion completed with retentions", fg='green')
        else:
            click.secho(f"✓ Stack '{stack_name}' deletion initiated successfully", fg='green')

            if retain_list:
                click.echo()
                click.secho(f"Successfully retained as requested ({retained_count}):", fg='green')
                for name in retain_list:
                    click.secho(f" ✓ {name} (retained)", fg='green')

    except (click.ClickException, click.Abort):
        # Already reported above (or a user abort at the prompt); re-raise
        # untouched so click renders it exactly once.
        raise
    except Exception as e:
        logger.error("Failed to delete stack: %s", e)
        if debug:
            logger.exception("Detailed error information:")

        if "does not exist" in str(e):
            click.secho(f"❌ Stack '{stack_name}' not found", fg='red')
        elif "AccessDenied" in str(e):
            click.secho("❌ Access denied. Check AWS permissions", fg='red')
        else:
            click.secho(f"❌ Error deleting stack: {e}", fg='red')

        raise click.ClickException(str(e)) from e
314491
315492@click .command ("cluster" )
316493@click .option ("--cluster-name" , required = True , help = "The name of the cluster to update" )
0 commit comments