@@ -9,14 +9,20 @@ import (
99 "fmt"
1010 "strings"
1111 "text/template"
12+ "time"
1213
14+ "github.com/go-logr/logr"
1315 "github.com/spf13/pflag"
1416 corev1 "k8s.io/api/core/v1"
17+ apierrors "k8s.io/apimachinery/pkg/api/errors"
18+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19+ "k8s.io/apimachinery/pkg/util/wait"
1520 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
1621 runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
1722 ctrl "sigs.k8s.io/controller-runtime"
1823 ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
1924
25+ caaphv1 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/external/sigs.k8s.io/cluster-api-addon-provider-helm/api/v1alpha1"
2026 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/v1alpha1"
2127 apivariables "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/variables"
2228 commonhandlers "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/common/pkg/capi/clustertopology/handlers"
6874 _ commonhandlers.Named = & DefaultK8sRegistrationAgent {}
6975 _ lifecycle.AfterControlPlaneInitialized = & DefaultK8sRegistrationAgent {}
7076 _ lifecycle.BeforeClusterUpgrade = & DefaultK8sRegistrationAgent {}
77+ _ lifecycle.BeforeClusterDelete = & DefaultK8sRegistrationAgent {}
7178)
7279
7380func New (
@@ -315,3 +322,285 @@ func templateValuesFunc(
315322 return b .String (), nil
316323 }
317324}
325+
326+ func (n * DefaultK8sRegistrationAgent ) BeforeClusterDelete (
327+ ctx context.Context ,
328+ req * runtimehooksv1.BeforeClusterDeleteRequest ,
329+ resp * runtimehooksv1.BeforeClusterDeleteResponse ,
330+ ) {
331+ cluster := & req .Cluster
332+ clusterKey := ctrlclient .ObjectKeyFromObject (cluster )
333+
334+ log := ctrl .LoggerFrom (ctx ).WithValues (
335+ "cluster" ,
336+ clusterKey ,
337+ )
338+
339+ varMap := variables .ClusterVariablesToVariablesMap (cluster .Spec .Topology .Variables )
340+ k8sAgentVar , err := variables .Get [apivariables.NutanixK8sRegistrationAgent ](
341+ varMap ,
342+ n .variableName ,
343+ n .variablePath ... )
344+ if err != nil {
345+ if variables .IsNotFoundError (err ) {
346+ log .Info (
347+ "Skipping K8s Registration Agent cleanup, addon not specified in cluster definition" ,
348+ )
349+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
350+ return
351+ }
352+ log .Error (
353+ err ,
354+ "failed to read K8s Registration Agent variable from cluster definition" ,
355+ )
356+ resp .SetStatus (runtimehooksv1 .ResponseStatusFailure )
357+ resp .SetMessage (
358+ fmt .Sprintf ("failed to read K8s Registration Agent variable from cluster definition: %v" ,
359+ err ,
360+ ),
361+ )
362+ return
363+ }
364+
365+ // Only handle HelmAddon strategy for cleanup
366+ switch k8sAgentVar .Strategy {
367+ case v1alpha1 .AddonStrategyHelmAddon :
368+ // Check if cleanup is already in progress or completed
369+ cleanupStatus , err := n .checkCleanupStatus (ctx , cluster , log )
370+ if err != nil {
371+ log .Error (err , "Failed to check cleanup status" )
372+ resp .SetStatus (runtimehooksv1 .ResponseStatusFailure )
373+ resp .SetMessage (err .Error ())
374+ return
375+ }
376+
377+ switch cleanupStatus {
378+ case "completed" :
379+ log .Info ("K8s Registration Agent cleanup already completed" )
380+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
381+ return
382+ case "in-progress" :
383+ log .Info ("K8s Registration Agent cleanup in progress, requesting retry" )
384+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
385+ resp .SetRetryAfterSeconds (10 ) // Retry after 10 seconds
386+ return
387+ case "not-started" :
388+ log .Info ("Starting K8s Registration Agent cleanup" )
389+ // Proceed with cleanup below
390+ }
391+
392+ err = n .deleteHelmChart (ctx , cluster , log )
393+ if err != nil {
394+ log .Error (err , "Failed to delete helm chart" )
395+ resp .SetStatus (runtimehooksv1 .ResponseStatusFailure )
396+ resp .SetMessage (err .Error ())
397+ return
398+ }
399+
400+ // After initiating cleanup, request a retry to monitor completion
401+ log .Info ("K8s Registration Agent cleanup initiated, will monitor progress" )
402+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
403+ resp .SetRetryAfterSeconds (5 ) // Quick retry to start monitoring
404+
405+ case v1alpha1 .AddonStrategyClusterResourceSet :
406+ log .Info ("ClusterResourceSet strategy does not require cleanup" )
407+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
408+ case "" :
409+ log .Info ("No strategy specified, skipping cleanup" )
410+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
411+ default :
412+ log .Info (
413+ "Unknown K8s Registration Agent strategy, skipping cleanup" ,
414+ "strategy" , k8sAgentVar .Strategy ,
415+ )
416+ resp .SetStatus (runtimehooksv1 .ResponseStatusSuccess )
417+ }
418+ }
419+
420+ func (n * DefaultK8sRegistrationAgent ) deleteHelmChart (
421+ ctx context.Context ,
422+ cluster * clusterv1.Cluster ,
423+ log logr.Logger ,
424+ ) error {
425+ clusterUUID , ok := cluster .Annotations [v1alpha1 .ClusterUUIDAnnotationKey ]
426+ if ! ok {
427+ return fmt .Errorf (
428+ "cluster UUID not found in cluster annotations - missing key %s" ,
429+ v1alpha1 .ClusterUUIDAnnotationKey ,
430+ )
431+ }
432+
433+ // Create HelmChartProxy with the same naming pattern used during creation
434+ hcp := & caaphv1.HelmChartProxy {
435+ ObjectMeta : metav1.ObjectMeta {
436+ Name : fmt .Sprintf ("%s-%s" , defaultHelmReleaseName , clusterUUID ),
437+ Namespace : cluster .Namespace ,
438+ },
439+ }
440+
441+ // First, try to gracefully trigger helm uninstall while cluster is still accessible
442+ log .Info ("Initiating graceful deletion of K8s Registration Agent" , "name" , hcp .Name , "namespace" , hcp .Namespace )
443+
444+ // Get the current HCP to check if it exists and get its current state
445+ currentHCP := & caaphv1.HelmChartProxy {}
446+ err := n .client .Get (ctx , ctrlclient .ObjectKeyFromObject (hcp ), currentHCP )
447+ if err != nil {
448+ if ctrlclient .IgnoreNotFound (err ) == nil {
449+ log .Info ("K8s Registration Agent HelmChartProxy already deleted" , "name" , hcp .Name )
450+ return nil
451+ }
452+ return fmt .Errorf ("failed to get HelmChartProxy %q: %w" , ctrlclient .ObjectKeyFromObject (hcp ), err )
453+ }
454+
455+ // Add a deletion timestamp annotation to help CAAPH prioritize this deletion
456+ // and set a shorter timeout to fail fast if cluster becomes unreachable
457+ if currentHCP .Annotations == nil {
458+ currentHCP .Annotations = make (map [string ]string )
459+ }
460+ currentHCP .Annotations ["cluster.x-k8s.io/delete-priority" ] = "high"
461+ currentHCP .Annotations ["cluster.x-k8s.io/delete-timeout" ] = "60s"
462+
463+ // Update the HCP with priority annotations before deletion
464+ if err := n .client .Update (ctx , currentHCP ); err != nil {
465+ log .Info ("Failed to update HCP annotations, proceeding with deletion" , "error" , err )
466+ }
467+
468+ // Now delete the HelmChartProxy - CAAPH will handle the helm uninstall
469+ log .Info ("Deleting K8s Registration Agent HelmChartProxy" , "name" , hcp .Name , "namespace" , hcp .Namespace )
470+ if err := n .client .Delete (ctx , currentHCP ); err != nil {
471+ if ctrlclient .IgnoreNotFound (err ) == nil {
472+ log .Info ("K8s Registration Agent HelmChartProxy already deleted" , "name" , hcp .Name )
473+ return nil
474+ }
475+ return fmt .Errorf (
476+ "failed to delete K8s Registration Agent HelmChartProxy %q: %w" ,
477+ ctrlclient .ObjectKeyFromObject (hcp ),
478+ err ,
479+ )
480+ }
481+
482+ // Wait for CAAPH to complete the helm uninstall before allowing cluster deletion to proceed
483+ // This ensures graceful deletion order - helm uninstall completes before infrastructure teardown
484+ log .Info ("Waiting for helm uninstall to complete before proceeding with cluster deletion" , "name" , hcp .Name )
485+
486+ if err := n .waitForHelmUninstallCompletion (ctx , hcp , log ); err != nil {
487+ log .Error (err , "Helm uninstall did not complete gracefully, proceeding with cluster deletion" , "name" , hcp .Name )
488+ // Don't return error here - we want cluster deletion to proceed even if helm uninstall times out
489+ // The important thing is we gave it a reasonable chance to complete
490+ } else {
491+ log .Info ("Helm uninstall completed successfully" , "name" , hcp .Name )
492+ }
493+
494+ return nil
495+ }
496+
497+ // checkCleanupStatus checks the current status of K8s Registration Agent cleanup
498+ // Returns: "completed", "in-progress", or "not-started"
499+ func (n * DefaultK8sRegistrationAgent ) checkCleanupStatus (
500+ ctx context.Context ,
501+ cluster * clusterv1.Cluster ,
502+ log logr.Logger ,
503+ ) (string , error ) {
504+ clusterUUID , ok := cluster .Annotations [v1alpha1 .ClusterUUIDAnnotationKey ]
505+ if ! ok {
506+ return "completed" , nil // If no UUID, assume no agent was installed
507+ }
508+
509+ // Check if HelmChartProxy exists
510+ hcp := & caaphv1.HelmChartProxy {
511+ ObjectMeta : metav1.ObjectMeta {
512+ Name : fmt .Sprintf ("%s-%s" , defaultHelmReleaseName , clusterUUID ),
513+ Namespace : cluster .Namespace ,
514+ },
515+ }
516+
517+ err := n .client .Get (ctx , ctrlclient .ObjectKeyFromObject (hcp ), hcp )
518+ if err != nil {
519+ if apierrors .IsNotFound (err ) {
520+ log .Info ("HelmChartProxy not found, cleanup completed" , "name" , hcp .Name )
521+ return "completed" , nil
522+ }
523+ return "" , fmt .Errorf ("failed to get HelmChartProxy %q: %w" , ctrlclient .ObjectKeyFromObject (hcp ), err )
524+ }
525+
526+ // HCP exists - check if it's being deleted
527+ if hcp .DeletionTimestamp != nil {
528+ log .Info ("HelmChartProxy is being deleted, cleanup in progress" , "name" , hcp .Name )
529+ return "in-progress" , nil
530+ }
531+
532+ // HCP exists and is not being deleted
533+ log .Info ("HelmChartProxy exists, cleanup not started" , "name" , hcp .Name )
534+ return "not-started" , nil
535+ }
536+
537+ // waitForHelmUninstallCompletion waits for CAAPH to complete the helm uninstall process
538+ // before allowing cluster deletion to proceed. This ensures graceful deletion order.
539+ func (n * DefaultK8sRegistrationAgent ) waitForHelmUninstallCompletion (
540+ ctx context.Context ,
541+ hcp * caaphv1.HelmChartProxy ,
542+ log logr.Logger ,
543+ ) error {
544+ // Create a context with timeout to avoid blocking cluster deletion indefinitely
545+ // 90 seconds should be enough for most helm uninstalls while still being reasonable
546+ waitCtx , cancel := context .WithTimeout (ctx , 90 * time .Second )
547+ defer cancel ()
548+
549+ log .Info ("Monitoring HelmChartProxy deletion progress" , "name" , hcp .Name )
550+
551+ // First wait for the HelmChartProxy to be fully processed for deletion
552+ // This indicates CAAPH has acknowledged the deletion request
553+ err := wait .PollUntilContextTimeout (
554+ waitCtx ,
555+ 2 * time .Second ,
556+ 30 * time .Second ,
557+ true ,
558+ func (pollCtx context.Context ) (bool , error ) {
559+ currentHCP := & caaphv1.HelmChartProxy {}
560+ err := n .client .Get (pollCtx , ctrlclient .ObjectKeyFromObject (hcp ), currentHCP )
561+ if err != nil {
562+ if apierrors .IsNotFound (err ) {
563+ log .Info ("HelmChartProxy has been deleted" , "name" , hcp .Name )
564+ return true , nil
565+ }
566+ // If we can't reach the API server, the cluster might be shutting down
567+ // In this case, we should not block cluster deletion
568+ log .Info ("Error checking HelmChartProxy status, cluster may be shutting down" , "error" , err )
569+ return true , nil
570+ }
571+
572+ // Check if the HCP is in deletion phase
573+ if currentHCP .DeletionTimestamp != nil {
574+ log .Info ("HelmChartProxy is being deleted, waiting for completion" , "name" , hcp .Name )
575+ return false , nil
576+ }
577+
578+ // If HCP still exists without deletion timestamp, something might be wrong
579+ log .Info ("HelmChartProxy still exists, waiting for deletion to start" , "name" , hcp .Name )
580+ return false , nil
581+ },
582+ )
583+ if err != nil {
584+ if wait .Interrupted (err ) {
585+ return fmt .Errorf ("timeout waiting for HelmChartProxy deletion to complete" )
586+ }
587+ return fmt .Errorf ("error waiting for HelmChartProxy deletion: %w" , err )
588+ }
589+
590+ // Additional wait to give CAAPH more time to complete the helm uninstall
591+ // even after the HCP is deleted. This accounts for any cleanup operations.
592+ log .Info ("HelmChartProxy deleted, allowing additional time for helm uninstall completion" )
593+
594+ // Use a shorter additional wait to not delay cluster deletion too much
595+ additionalWaitCtx , additionalCancel := context .WithTimeout (ctx , 30 * time .Second )
596+ defer additionalCancel ()
597+
598+ select {
599+ case <- additionalWaitCtx .Done ():
600+ log .Info ("Additional wait period completed, proceeding with cluster deletion" )
601+ case <- time .After (10 * time .Second ):
602+ log .Info ("Reasonable wait time elapsed, proceeding with cluster deletion" )
603+ }
604+
605+ return nil
606+ }
0 commit comments