Skip to content

Commit e3a3c16

Browse files
feat: add before-cluster-delete hook
1 parent d8f0531 commit e3a3c16

File tree

1 file changed

+289
-0
lines changed
  • pkg/handlers/lifecycle/k8sregistrationagent

1 file changed

+289
-0
lines changed

pkg/handlers/lifecycle/k8sregistrationagent/handler.go

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,20 @@ import (
99
"fmt"
1010
"strings"
1111
"text/template"
12+
"time"
1213

14+
"github.com/go-logr/logr"
1315
"github.com/spf13/pflag"
1416
corev1 "k8s.io/api/core/v1"
17+
apierrors "k8s.io/apimachinery/pkg/api/errors"
18+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19+
"k8s.io/apimachinery/pkg/util/wait"
1520
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
1621
runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
1722
ctrl "sigs.k8s.io/controller-runtime"
1823
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
1924

25+
caaphv1 "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/external/sigs.k8s.io/cluster-api-addon-provider-helm/api/v1alpha1"
2026
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/v1alpha1"
2127
apivariables "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/api/variables"
2228
commonhandlers "github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/common/pkg/capi/clustertopology/handlers"
@@ -68,6 +74,7 @@ var (
6874
_ commonhandlers.Named = &DefaultK8sRegistrationAgent{}
6975
_ lifecycle.AfterControlPlaneInitialized = &DefaultK8sRegistrationAgent{}
7076
_ lifecycle.BeforeClusterUpgrade = &DefaultK8sRegistrationAgent{}
77+
_ lifecycle.BeforeClusterDelete = &DefaultK8sRegistrationAgent{}
7178
)
7279

7380
func New(
@@ -315,3 +322,285 @@ func templateValuesFunc(
315322
return b.String(), nil
316323
}
317324
}
325+
326+
// BeforeClusterDelete implements the BeforeClusterDelete lifecycle hook. It
// cleans up the K8s Registration Agent helm release (HelmAddon strategy only)
// before the cluster's infrastructure is torn down, blocking deletion via
// SetRetryAfterSeconds until cleanup completes. All non-fatal outcomes
// (addon absent, unknown strategy, cleanup already done) report success so
// cluster deletion is never wedged by this hook.
func (n *DefaultK8sRegistrationAgent) BeforeClusterDelete(
	ctx context.Context,
	req *runtimehooksv1.BeforeClusterDeleteRequest,
	resp *runtimehooksv1.BeforeClusterDeleteResponse,
) {
	cluster := &req.Cluster
	clusterKey := ctrlclient.ObjectKeyFromObject(cluster)

	log := ctrl.LoggerFrom(ctx).WithValues(
		"cluster",
		clusterKey,
	)

	// Look up the agent's addon variable from the cluster topology; its
	// absence means the addon was never requested for this cluster.
	varMap := variables.ClusterVariablesToVariablesMap(cluster.Spec.Topology.Variables)
	k8sAgentVar, err := variables.Get[apivariables.NutanixK8sRegistrationAgent](
		varMap,
		n.variableName,
		n.variablePath...)
	if err != nil {
		if variables.IsNotFoundError(err) {
			log.Info(
				"Skipping K8s Registration Agent cleanup, addon not specified in cluster definition",
			)
			resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
			return
		}
		// Any other variable error is a real failure: fail the hook so the
		// operator sees why deletion is blocked.
		log.Error(
			err,
			"failed to read K8s Registration Agent variable from cluster definition",
		)
		resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
		resp.SetMessage(
			fmt.Sprintf("failed to read K8s Registration Agent variable from cluster definition: %v",
				err,
			),
		)
		return
	}

	// Only handle HelmAddon strategy for cleanup
	switch k8sAgentVar.Strategy {
	case v1alpha1.AddonStrategyHelmAddon:
		// Check if cleanup is already in progress or completed
		// NOTE(review): the status values below are bare strings shared with
		// checkCleanupStatus — consider promoting them to named constants.
		cleanupStatus, err := n.checkCleanupStatus(ctx, cluster, log)
		if err != nil {
			log.Error(err, "Failed to check cleanup status")
			resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
			resp.SetMessage(err.Error())
			return
		}

		switch cleanupStatus {
		case "completed":
			log.Info("K8s Registration Agent cleanup already completed")
			resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
			return
		case "in-progress":
			// Success + retryAfterSeconds asks CAPI to re-invoke this hook,
			// effectively pausing cluster deletion while cleanup runs.
			log.Info("K8s Registration Agent cleanup in progress, requesting retry")
			resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
			resp.SetRetryAfterSeconds(10) // Retry after 10 seconds
			return
		case "not-started":
			log.Info("Starting K8s Registration Agent cleanup")
			// Proceed with cleanup below
		}

		err = n.deleteHelmChart(ctx, cluster, log)
		if err != nil {
			log.Error(err, "Failed to delete helm chart")
			resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
			resp.SetMessage(err.Error())
			return
		}

		// After initiating cleanup, request a retry to monitor completion
		log.Info("K8s Registration Agent cleanup initiated, will monitor progress")
		resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
		resp.SetRetryAfterSeconds(5) // Quick retry to start monitoring

	case v1alpha1.AddonStrategyClusterResourceSet:
		// ClusterResourceSet-applied resources live in the workload cluster,
		// which is going away anyway — nothing to uninstall here.
		log.Info("ClusterResourceSet strategy does not require cleanup")
		resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
	case "":
		log.Info("No strategy specified, skipping cleanup")
		resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
	default:
		// Unknown strategies are tolerated rather than failed so that new
		// strategy values never block cluster deletion.
		log.Info(
			"Unknown K8s Registration Agent strategy, skipping cleanup",
			"strategy", k8sAgentVar.Strategy,
		)
		resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
	}
}
419+
420+
// deleteHelmChart deletes the K8s Registration Agent's HelmChartProxy so that
// CAAPH performs the helm uninstall while the workload cluster is still
// reachable, then waits (bounded) for the uninstall to finish. It returns an
// error only when the deletion cannot be initiated; a timed-out wait is
// logged and swallowed so cluster deletion can proceed regardless.
func (n *DefaultK8sRegistrationAgent) deleteHelmChart(
	ctx context.Context,
	cluster *clusterv1.Cluster,
	log logr.Logger,
) error {
	// The release name embeds the cluster UUID; without the annotation we
	// cannot derive which HelmChartProxy belongs to this cluster.
	clusterUUID, ok := cluster.Annotations[v1alpha1.ClusterUUIDAnnotationKey]
	if !ok {
		return fmt.Errorf(
			"cluster UUID not found in cluster annotations - missing key %s",
			v1alpha1.ClusterUUIDAnnotationKey,
		)
	}

	// Create HelmChartProxy with the same naming pattern used during creation
	hcp := &caaphv1.HelmChartProxy{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-%s", defaultHelmReleaseName, clusterUUID),
			Namespace: cluster.Namespace,
		},
	}

	// First, try to gracefully trigger helm uninstall while cluster is still accessible
	log.Info("Initiating graceful deletion of K8s Registration Agent", "name", hcp.Name, "namespace", hcp.Namespace)

	// Get the current HCP to check if it exists and get its current state
	currentHCP := &caaphv1.HelmChartProxy{}
	err := n.client.Get(ctx, ctrlclient.ObjectKeyFromObject(hcp), currentHCP)
	if err != nil {
		// NotFound means a previous attempt (or CAAPH) already removed it.
		if ctrlclient.IgnoreNotFound(err) == nil {
			log.Info("K8s Registration Agent HelmChartProxy already deleted", "name", hcp.Name)
			return nil
		}
		return fmt.Errorf("failed to get HelmChartProxy %q: %w", ctrlclient.ObjectKeyFromObject(hcp), err)
	}

	// Add a deletion timestamp annotation to help CAAPH prioritize this deletion
	// and set a shorter timeout to fail fast if cluster becomes unreachable
	// NOTE(review): these annotation keys are not obviously part of the CAPI
	// or CAAPH API surface — confirm CAAPH actually honors them, otherwise
	// this Update is a no-op.
	if currentHCP.Annotations == nil {
		currentHCP.Annotations = make(map[string]string)
	}
	currentHCP.Annotations["cluster.x-k8s.io/delete-priority"] = "high"
	currentHCP.Annotations["cluster.x-k8s.io/delete-timeout"] = "60s"

	// Update the HCP with priority annotations before deletion
	// (best-effort: a failed update is logged and deletion continues).
	if err := n.client.Update(ctx, currentHCP); err != nil {
		log.Info("Failed to update HCP annotations, proceeding with deletion", "error", err)
	}

	// Now delete the HelmChartProxy - CAAPH will handle the helm uninstall
	log.Info("Deleting K8s Registration Agent HelmChartProxy", "name", hcp.Name, "namespace", hcp.Namespace)
	if err := n.client.Delete(ctx, currentHCP); err != nil {
		if ctrlclient.IgnoreNotFound(err) == nil {
			log.Info("K8s Registration Agent HelmChartProxy already deleted", "name", hcp.Name)
			return nil
		}
		return fmt.Errorf(
			"failed to delete K8s Registration Agent HelmChartProxy %q: %w",
			ctrlclient.ObjectKeyFromObject(hcp),
			err,
		)
	}

	// Wait for CAAPH to complete the helm uninstall before allowing cluster deletion to proceed
	// This ensures graceful deletion order - helm uninstall completes before infrastructure teardown
	// NOTE(review): this synchronous wait runs inside a lifecycle-hook
	// handler; combined with the caller's retry-based polling it may be
	// redundant, and a long wait risks exceeding the runtime hook request
	// timeout — confirm against the Runtime SDK limits.
	log.Info("Waiting for helm uninstall to complete before proceeding with cluster deletion", "name", hcp.Name)

	if err := n.waitForHelmUninstallCompletion(ctx, hcp, log); err != nil {
		log.Error(err, "Helm uninstall did not complete gracefully, proceeding with cluster deletion", "name", hcp.Name)
		// Don't return error here - we want cluster deletion to proceed even if helm uninstall times out
		// The important thing is we gave it a reasonable chance to complete
	} else {
		log.Info("Helm uninstall completed successfully", "name", hcp.Name)
	}

	return nil
}
496+
497+
// checkCleanupStatus checks the current status of K8s Registration Agent cleanup
498+
// Returns: "completed", "in-progress", or "not-started"
499+
func (n *DefaultK8sRegistrationAgent) checkCleanupStatus(
500+
ctx context.Context,
501+
cluster *clusterv1.Cluster,
502+
log logr.Logger,
503+
) (string, error) {
504+
clusterUUID, ok := cluster.Annotations[v1alpha1.ClusterUUIDAnnotationKey]
505+
if !ok {
506+
return "completed", nil // If no UUID, assume no agent was installed
507+
}
508+
509+
// Check if HelmChartProxy exists
510+
hcp := &caaphv1.HelmChartProxy{
511+
ObjectMeta: metav1.ObjectMeta{
512+
Name: fmt.Sprintf("%s-%s", defaultHelmReleaseName, clusterUUID),
513+
Namespace: cluster.Namespace,
514+
},
515+
}
516+
517+
err := n.client.Get(ctx, ctrlclient.ObjectKeyFromObject(hcp), hcp)
518+
if err != nil {
519+
if apierrors.IsNotFound(err) {
520+
log.Info("HelmChartProxy not found, cleanup completed", "name", hcp.Name)
521+
return "completed", nil
522+
}
523+
return "", fmt.Errorf("failed to get HelmChartProxy %q: %w", ctrlclient.ObjectKeyFromObject(hcp), err)
524+
}
525+
526+
// HCP exists - check if it's being deleted
527+
if hcp.DeletionTimestamp != nil {
528+
log.Info("HelmChartProxy is being deleted, cleanup in progress", "name", hcp.Name)
529+
return "in-progress", nil
530+
}
531+
532+
// HCP exists and is not being deleted
533+
log.Info("HelmChartProxy exists, cleanup not started", "name", hcp.Name)
534+
return "not-started", nil
535+
}
536+
537+
// waitForHelmUninstallCompletion waits for CAAPH to complete the helm uninstall process
538+
// before allowing cluster deletion to proceed. This ensures graceful deletion order.
539+
func (n *DefaultK8sRegistrationAgent) waitForHelmUninstallCompletion(
540+
ctx context.Context,
541+
hcp *caaphv1.HelmChartProxy,
542+
log logr.Logger,
543+
) error {
544+
// Create a context with timeout to avoid blocking cluster deletion indefinitely
545+
// 90 seconds should be enough for most helm uninstalls while still being reasonable
546+
waitCtx, cancel := context.WithTimeout(ctx, 90*time.Second)
547+
defer cancel()
548+
549+
log.Info("Monitoring HelmChartProxy deletion progress", "name", hcp.Name)
550+
551+
// First wait for the HelmChartProxy to be fully processed for deletion
552+
// This indicates CAAPH has acknowledged the deletion request
553+
err := wait.PollUntilContextTimeout(
554+
waitCtx,
555+
2*time.Second,
556+
30*time.Second,
557+
true,
558+
func(pollCtx context.Context) (bool, error) {
559+
currentHCP := &caaphv1.HelmChartProxy{}
560+
err := n.client.Get(pollCtx, ctrlclient.ObjectKeyFromObject(hcp), currentHCP)
561+
if err != nil {
562+
if apierrors.IsNotFound(err) {
563+
log.Info("HelmChartProxy has been deleted", "name", hcp.Name)
564+
return true, nil
565+
}
566+
// If we can't reach the API server, the cluster might be shutting down
567+
// In this case, we should not block cluster deletion
568+
log.Info("Error checking HelmChartProxy status, cluster may be shutting down", "error", err)
569+
return true, nil
570+
}
571+
572+
// Check if the HCP is in deletion phase
573+
if currentHCP.DeletionTimestamp != nil {
574+
log.Info("HelmChartProxy is being deleted, waiting for completion", "name", hcp.Name)
575+
return false, nil
576+
}
577+
578+
// If HCP still exists without deletion timestamp, something might be wrong
579+
log.Info("HelmChartProxy still exists, waiting for deletion to start", "name", hcp.Name)
580+
return false, nil
581+
},
582+
)
583+
if err != nil {
584+
if wait.Interrupted(err) {
585+
return fmt.Errorf("timeout waiting for HelmChartProxy deletion to complete")
586+
}
587+
return fmt.Errorf("error waiting for HelmChartProxy deletion: %w", err)
588+
}
589+
590+
// Additional wait to give CAAPH more time to complete the helm uninstall
591+
// even after the HCP is deleted. This accounts for any cleanup operations.
592+
log.Info("HelmChartProxy deleted, allowing additional time for helm uninstall completion")
593+
594+
// Use a shorter additional wait to not delay cluster deletion too much
595+
additionalWaitCtx, additionalCancel := context.WithTimeout(ctx, 30*time.Second)
596+
defer additionalCancel()
597+
598+
select {
599+
case <-additionalWaitCtx.Done():
600+
log.Info("Additional wait period completed, proceeding with cluster deletion")
601+
case <-time.After(10 * time.Second):
602+
log.Info("Reasonable wait time elapsed, proceeding with cluster deletion")
603+
}
604+
605+
return nil
606+
}

0 commit comments

Comments
 (0)