Skip to content

Commit cd590c6

Browse files
feat: fixing timeout issues during helm uninstallation
1 parent 44dcc33 commit cd590c6

File tree

1 file changed

+75
-12
lines changed

1 file changed

+75
-12
lines changed

pkg/handlers/lifecycle/konnectoragent/handler.go

Lines changed: 75 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"fmt"
1010
"strings"
1111
"text/template"
12+
"time"
1213

1314
"github.com/go-logr/logr"
1415
"github.com/spf13/pflag"
@@ -41,6 +42,11 @@ const (
4142
cleanupStatusCompleted = "completed"
4243
cleanupStatusInProgress = "in-progress"
4344
cleanupStatusNotStarted = "not-started"
45+
cleanupStatusTimedOut = "timed-out"
46+
47+
// helmUninstallTimeout is the maximum time to wait for HelmChartProxy deletion
48+
// before the cleanup is reported as timed out, which blocks cluster deletion until manual intervention
49+
helmUninstallTimeout = 5 * time.Minute
4450
)
4551

4652
type Config struct {
@@ -344,7 +350,7 @@ func (n *DefaultKonnectorAgent) BeforeClusterDelete(
344350
}
345351

346352
// Check if cleanup is already in progress or completed
347-
cleanupStatus, err := n.checkCleanupStatus(ctx, cluster, log)
353+
cleanupStatus, statusMsg, err := n.checkCleanupStatus(ctx, cluster, log)
348354
if err != nil {
349355
log.Error(err, "Failed to check cleanup status")
350356
resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
@@ -357,10 +363,32 @@ func (n *DefaultKonnectorAgent) BeforeClusterDelete(
357363
log.Info("Konnector Agent cleanup already completed")
358364
resp.SetStatus(runtimehooksv1.ResponseStatusSuccess)
359365
return
366+
case cleanupStatusTimedOut:
367+
// Log the error prominently and block cluster deletion
368+
log.Error(
369+
fmt.Errorf("konnector Agent helm uninstallation timed out"),
370+
"ERROR: Konnector Agent cleanup timed out - blocking cluster deletion",
371+
"details", statusMsg,
372+
"action", "Manual intervention required - check HelmChartProxy status and remove finalizers if needed",
373+
)
374+
resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
375+
resp.SetMessage(fmt.Sprintf(
376+
"Konnector Agent helm uninstallation timed out after %v. "+
377+
"The HelmChartProxy is stuck in deletion state. "+
378+
"Manual intervention required: Check HelmChartProxy status and remove finalizers if needed. "+
379+
"Details: %s",
380+
helmUninstallTimeout,
381+
statusMsg,
382+
))
383+
return
360384
case cleanupStatusInProgress:
361-
log.Info("Konnector Agent cleanup in progress, requesting retry")
385+
log.Info("Konnector Agent cleanup in progress, requesting retry", "details", statusMsg)
362386
resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
363387
resp.SetRetryAfterSeconds(5) // Retry after 5 seconds
388+
resp.SetMessage(fmt.Sprintf(
389+
"Konnector Agent cleanup in progress. Waiting for HelmChartProxy deletion to complete. %s",
390+
statusMsg,
391+
))
364392
return
365393
case cleanupStatusNotStarted:
366394
log.Info("Starting Konnector Agent cleanup")
@@ -369,16 +397,17 @@ func (n *DefaultKonnectorAgent) BeforeClusterDelete(
369397

370398
err = n.deleteHelmChartProxy(ctx, cluster, log)
371399
if err != nil {
372-
log.Error(err, "Failed to delete helm chart")
400+
log.Error(err, "Failed to delete HelmChartProxy")
373401
resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
374-
resp.SetMessage(err.Error())
402+
resp.SetMessage(fmt.Sprintf("Failed to delete Konnector Agent HelmChartProxy: %v", err))
375403
return
376404
}
377405

378406
// After initiating cleanup, request a retry to monitor completion
379407
log.Info("Konnector Agent cleanup initiated, will monitor progress")
380408
resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
381409
resp.SetRetryAfterSeconds(5) // Quick retry to start monitoring
410+
resp.SetMessage("Konnector Agent cleanup initiated. Waiting for HelmChartProxy deletion to start.")
382411
}
383412

384413
func (n *DefaultKonnectorAgent) deleteHelmChartProxy(
@@ -434,15 +463,15 @@ func (n *DefaultKonnectorAgent) deleteHelmChartProxy(
434463
}
435464

436465
// checkCleanupStatus checks the current status of Konnector Agent cleanup.
437-
// Returns: "completed", "in-progress", or "not-started".
466+
// Returns: status ("completed", "in-progress", "not-started", or "timed-out"), status message, and error.
438467
func (n *DefaultKonnectorAgent) checkCleanupStatus(
439468
ctx context.Context,
440469
cluster *clusterv1.Cluster,
441470
log logr.Logger,
442-
) (string, error) {
471+
) (string, string, error) {
443472
clusterUUID, ok := cluster.Annotations[v1alpha1.ClusterUUIDAnnotationKey]
444473
if !ok {
445-
return cleanupStatusCompleted, nil // If no UUID, assume no agent was installed
474+
return cleanupStatusCompleted, "No cluster UUID found, assuming no agent installed", nil
446475
}
447476

448477
// Check if HelmChartProxy exists
@@ -457,18 +486,52 @@ func (n *DefaultKonnectorAgent) checkCleanupStatus(
457486
if err != nil {
458487
if apierrors.IsNotFound(err) {
459488
log.Info("HelmChartProxy not found, cleanup completed", "name", hcp.Name)
460-
return cleanupStatusCompleted, nil
489+
return cleanupStatusCompleted, "HelmChartProxy successfully deleted", nil
461490
}
462-
return "", fmt.Errorf("failed to get HelmChartProxy %q: %w", ctrlclient.ObjectKeyFromObject(hcp), err)
491+
return "", "", fmt.Errorf("failed to get HelmChartProxy %q: %w", ctrlclient.ObjectKeyFromObject(hcp), err)
463492
}
464493

465494
// HCP exists - check if it's being deleted
466495
if hcp.DeletionTimestamp != nil {
467-
log.Info("HelmChartProxy is being deleted, cleanup in progress", "name", hcp.Name)
468-
return cleanupStatusInProgress, nil
496+
// Check if deletion has timed out
497+
deletionDuration := time.Since(hcp.DeletionTimestamp.Time)
498+
if deletionDuration > helmUninstallTimeout {
499+
statusMsg := fmt.Sprintf(
500+
"HelmChartProxy %q has been in deletion state for %v (timeout: %v). "+
501+
"Possible causes: stuck finalizers, helm uninstall failure, or workload cluster unreachable. "+
502+
"HelmChartProxy status: %+v",
503+
ctrlclient.ObjectKeyFromObject(hcp),
504+
deletionDuration,
505+
helmUninstallTimeout,
506+
hcp.Status,
507+
)
508+
log.Error(
509+
fmt.Errorf("helm uninstall timeout exceeded"),
510+
"HelmChartProxy deletion timed out",
511+
"name", hcp.Name,
512+
"deletionTimestamp", hcp.DeletionTimestamp.Time,
513+
"duration", deletionDuration,
514+
"timeout", helmUninstallTimeout,
515+
"finalizers", hcp.Finalizers,
516+
"status", hcp.Status,
517+
)
518+
return cleanupStatusTimedOut, statusMsg, nil
519+
}
520+
521+
statusMsg := fmt.Sprintf(
522+
"HelmChartProxy is being deleted (in progress for %v, timeout in %v)",
523+
deletionDuration,
524+
helmUninstallTimeout-deletionDuration,
525+
)
526+
log.Info("HelmChartProxy is being deleted, cleanup in progress",
527+
"name", hcp.Name,
528+
"deletionDuration", deletionDuration,
529+
"remainingTime", helmUninstallTimeout-deletionDuration,
530+
)
531+
return cleanupStatusInProgress, statusMsg, nil
469532
}
470533

471534
// HCP exists and is not being deleted
472535
log.Info("HelmChartProxy exists, cleanup not started", "name", hcp.Name)
473-
return cleanupStatusNotStarted, nil
536+
return cleanupStatusNotStarted, "HelmChartProxy exists and needs to be deleted", nil
474537
}

0 commit comments

Comments
 (0)