Skip to content

Commit 1255127

Browse files
committed
fix: avoid snapshot query hanging during volume deletion
Try DeleteDataset first before querying snapshots — in the common case (no dependent snapshots), this avoids the expensive zfs.snapshot.query call entirely. When snapshot cleanup is needed, use select: ["id"] to return only snapshot IDs instead of full objects, and add a 30-second timeout as a safety net. This fixes a ~2 minute hang when TrueNAS returns oversized snapshot query responses (>1MB WebSocket limit).
1 parent 524faa7 commit 1255127

File tree

9 files changed

+126
-50
lines changed

9 files changed

+126
-50
lines changed

pkg/driver/controller.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -350,31 +350,39 @@ func (s *ControllerService) lookupSnapshotByCSIName(ctx context.Context, poolDat
350350
// - The snapshot moves from the source to the promoted clone.
351351
// - The original source volume becomes a dependent of the promoted snapshot.
352352
// - Without deleting the snapshot first, neither the clone nor the source can be deleted.
353+
//
354+
// Uses a 30-second timeout as a safety net — this is best-effort cleanup, not critical path.
355+
// Uses QuerySnapshotIDs with select: ["id"] to minimize response size (avoids "message too big"
356+
// errors when datasets have many snapshots with large property sets).
353357
func (s *ControllerService) deleteDatasetSnapshots(ctx context.Context, datasetID string) {
354358
klog.V(4).Infof("Checking for snapshots on dataset %s before deletion", datasetID)
355359

356-
// Query all snapshots on this dataset
357-
snapshots, err := s.apiClient.QuerySnapshots(ctx, []interface{}{
360+
// Use a short timeout — this is best-effort cleanup, not critical path
361+
snapCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
362+
defer cancel()
363+
364+
// Query only snapshot IDs (not full objects) to minimize response size
365+
snapIDs, err := s.apiClient.QuerySnapshotIDs(snapCtx, []interface{}{
358366
[]interface{}{"dataset", "=", datasetID},
359367
})
360368
if err != nil {
361369
klog.Warningf("Failed to query snapshots for dataset %s: %v (continuing with deletion)", datasetID, err)
362370
return // Don't fail deletion if we can't query snapshots
363371
}
364372

365-
if len(snapshots) == 0 {
373+
if len(snapIDs) == 0 {
366374
klog.V(4).Infof("No snapshots found on dataset %s", datasetID)
367375
return
368376
}
369377

370-
klog.Infof("Found %d snapshots on dataset %s, deleting them first", len(snapshots), datasetID)
378+
klog.Infof("Found %d snapshots on dataset %s, deleting them first", len(snapIDs), datasetID)
371379

372-
for _, snap := range snapshots {
373-
klog.V(4).Infof("Deleting snapshot %s (defer=true to handle dependent clones)", snap.ID)
374-
if err := s.apiClient.DeleteSnapshot(ctx, snap.ID); err != nil {
380+
for _, snapID := range snapIDs {
381+
klog.V(4).Infof("Deleting snapshot %s (defer=true to handle dependent clones)", snapID)
382+
if err := s.apiClient.DeleteSnapshot(snapCtx, snapID); err != nil {
375383
// Log warning but continue - the snapshot might already be deleted or
376384
// have dependents that will be handled by defer=true
377-
klog.Warningf("Failed to delete snapshot %s: %v (continuing)", snap.ID, err)
385+
klog.Warningf("Failed to delete snapshot %s: %v (continuing)", snapID, err)
378386
}
379387
}
380388
}

pkg/driver/controller_iscsi.go

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
"github.com/container-storage-interface/spec/lib/go/csi"
1111
"github.com/fenio/tns-csi/pkg/metrics"
12+
"github.com/fenio/tns-csi/pkg/retry"
1213
"github.com/fenio/tns-csi/pkg/tnsapi"
1314
"google.golang.org/grpc/codes"
1415
"google.golang.org/grpc/status"
@@ -520,6 +521,8 @@ func (s *ControllerService) createISCSITargetExtent(ctx context.Context, targetI
520521
}
521522

522523
// deleteISCSIVolume deletes an iSCSI volume and all associated resources.
524+
//
525+
//nolint:dupl // Intentionally similar ZVOL deletion pattern as NFS/NVMe-oF
523526
func (s *ControllerService) deleteISCSIVolume(ctx context.Context, meta *VolumeMetadata) (*csi.DeleteVolumeResponse, error) {
524527
timer := metrics.NewVolumeOperationTimer(metrics.ProtocolISCSI, "delete")
525528
klog.Infof("Deleting iSCSI volume: %s (Dataset: %s, Target: %d, Extent: %d)",
@@ -576,24 +579,29 @@ func (s *ControllerService) deleteISCSIVolume(ctx context.Context, meta *VolumeM
576579
}
577580
}
578581

579-
// Step 4: Delete any snapshots on the ZVOL first (handles promoted clone cleanup)
580-
// This is necessary because after ZFS clone promotion, snapshots may have dependent
581-
// clones that prevent deletion. Deleting with defer=true allows ZFS to clean up properly.
582+
// Step 4: Delete ZVOL (try direct first, snapshot cleanup on failure)
582583
if meta.DatasetID != "" {
583-
s.deleteDatasetSnapshots(ctx, meta.DatasetID)
584-
}
584+
firstErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
585+
if firstErr != nil && !isNotFoundError(firstErr) {
586+
klog.Infof("Direct deletion failed for %s: %v — cleaning up snapshots before retry",
587+
meta.DatasetID, firstErr)
588+
s.deleteDatasetSnapshots(ctx, meta.DatasetID)
589+
590+
retryConfig := retry.DeletionConfig("delete-iscsi-zvol")
591+
err := retry.WithRetryNoResult(ctx, retryConfig, func() error {
592+
deleteErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
593+
if deleteErr != nil && isNotFoundError(deleteErr) {
594+
return nil
595+
}
596+
return deleteErr
597+
})
585598

586-
// Step 5: Delete ZVOL
587-
if meta.DatasetID != "" {
588-
if err := s.apiClient.DeleteDataset(ctx, meta.DatasetID); err != nil {
589-
if !isNotFoundError(err) {
599+
if err != nil {
590600
timer.ObserveError()
591601
return nil, status.Errorf(codes.Internal, "Failed to delete ZVOL %s: %v", meta.DatasetID, err)
592602
}
593-
klog.V(4).Infof("ZVOL already deleted: %s", meta.DatasetID)
594-
} else {
595-
klog.V(4).Infof("Deleted ZVOL: %s", meta.DatasetID)
596603
}
604+
klog.V(4).Infof("Deleted ZVOL: %s", meta.DatasetID)
597605
}
598606

599607
// Clear volume capacity metric

pkg/driver/controller_nfs.go

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,8 @@ func (s *ControllerService) createNFSVolume(ctx context.Context, req *csi.Create
523523
// deleteNFSVolume deletes an NFS volume with ownership verification.
524524
// Dataset deletion is retried for busy resource errors.
525525
// If deleteStrategy is "retain", the volume is kept but CSI returns success.
526+
//
527+
//nolint:dupl // Intentionally similar dataset deletion pattern as iSCSI
526528
func (s *ControllerService) deleteNFSVolume(ctx context.Context, meta *VolumeMetadata) (*csi.DeleteVolumeResponse, error) {
527529
timer := metrics.NewVolumeOperationTimer(metrics.ProtocolNFS, "delete")
528530
klog.V(4).Infof("Deleting NFS volume: %s (dataset: %s, share ID: %d)", meta.Name, meta.DatasetName, meta.NFSShareID)
@@ -623,34 +625,31 @@ func (s *ControllerService) deleteNFSVolume(ctx context.Context, meta *VolumeMet
623625
}
624626
}
625627

626-
// Step 2: Delete any snapshots on the dataset first (handles promoted clone cleanup)
627-
// This is necessary because after ZFS clone promotion, snapshots may have dependent
628-
// clones that prevent deletion. Deleting with defer=true allows ZFS to clean up properly.
629-
if meta.DatasetID != "" {
630-
s.deleteDatasetSnapshots(ctx, meta.DatasetID)
631-
}
632-
633-
// Step 3: Delete ZFS dataset with retry logic for busy resources
628+
// Step 2: Delete dataset (try direct first, snapshot cleanup on failure)
634629
if meta.DatasetID == "" {
635630
klog.V(4).Infof("No dataset ID provided, skipping dataset deletion")
636631
} else {
637-
klog.V(4).Infof("Deleting dataset: %s (with retry for busy resources)", meta.DatasetID)
632+
klog.V(4).Infof("Deleting dataset: %s", meta.DatasetID)
633+
634+
firstErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
635+
if firstErr != nil && !isNotFoundError(firstErr) {
636+
klog.Infof("Direct deletion failed for %s: %v — cleaning up snapshots before retry",
637+
meta.DatasetID, firstErr)
638+
s.deleteDatasetSnapshots(ctx, meta.DatasetID)
639+
640+
retryConfig := retry.DeletionConfig("delete-nfs-dataset")
641+
err := retry.WithRetryNoResult(ctx, retryConfig, func() error {
642+
deleteErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
643+
if deleteErr != nil && isNotFoundError(deleteErr) {
644+
return nil
645+
}
646+
return deleteErr
647+
})
638648

639-
retryConfig := retry.DeletionConfig("delete-nfs-dataset")
640-
err := retry.WithRetryNoResult(ctx, retryConfig, func() error {
641-
deleteErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
642-
if deleteErr != nil && isNotFoundError(deleteErr) {
643-
// Dataset already deleted - not an error (idempotency)
644-
klog.V(4).Infof("Dataset %s not found, assuming already deleted (idempotency)", meta.DatasetID)
645-
return nil
649+
if err != nil {
650+
timer.ObserveError()
651+
return nil, status.Errorf(codes.Internal, "Failed to delete dataset %s: %v", meta.DatasetID, err)
646652
}
647-
return deleteErr
648-
})
649-
650-
if err != nil {
651-
// All retries exhausted or non-retryable error
652-
timer.ObserveError()
653-
return nil, status.Errorf(codes.Internal, "Failed to delete dataset %s: %v", meta.DatasetID, err)
654653
}
655654
klog.V(4).Infof("Successfully deleted dataset %s", meta.DatasetID)
656655
}

pkg/driver/controller_nvmeof.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,9 @@ func (s *ControllerService) verifyNamespaceDeletion(ctx context.Context, meta *V
928928
}
929929

930930
// deleteZVOL deletes a ZVOL dataset with retry logic for busy resources.
931+
// Uses a try-first approach: attempts direct deletion (which handles the common case where
932+
// recursive=true succeeds), then falls back to snapshot cleanup + retry if the direct
933+
// attempt fails. This avoids the expensive snapshot query in the common case.
931934
func (s *ControllerService) deleteZVOL(ctx context.Context, meta *VolumeMetadata) error {
932935
if meta.DatasetID == "" {
933936
klog.Infof("deleteZVOL: DatasetID is empty, skipping deletion")
@@ -936,29 +939,33 @@ func (s *ControllerService) deleteZVOL(ctx context.Context, meta *VolumeMetadata
936939

937940
klog.Infof("deleteZVOL: Starting deletion of ZVOL %s for volume %s", meta.DatasetID, meta.Name)
938941

939-
// Delete any snapshots on the ZVOL first (handles promoted clone cleanup)
940-
// This is necessary because after ZFS clone promotion, snapshots may have dependent
941-
// clones that prevent deletion. Deleting with defer=true allows ZFS to clean up properly.
942+
// Try direct deletion first (common case: no dependent snapshots)
943+
firstErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
944+
if firstErr == nil || isNotFoundError(firstErr) {
945+
klog.Infof("deleteZVOL: Successfully deleted ZVOL %s", meta.DatasetID)
946+
return nil
947+
}
948+
949+
// First attempt failed — clean up snapshots and retry
950+
klog.Infof("deleteZVOL: Direct deletion failed for %s: %v — cleaning up snapshots before retry",
951+
meta.DatasetID, firstErr)
942952
s.deleteDatasetSnapshots(ctx, meta.DatasetID)
943953

944954
retryConfig := retry.DeletionConfig("delete-zvol")
945955
err := retry.WithRetryNoResult(ctx, retryConfig, func() error {
946956
deleteErr := s.apiClient.DeleteDataset(ctx, meta.DatasetID)
947957
if deleteErr != nil && isNotFoundError(deleteErr) {
948-
// ZVOL already deleted - not an error (idempotency)
949-
klog.V(4).Infof("ZVOL %s not found, assuming already deleted (idempotency)", meta.DatasetID)
950958
return nil
951959
}
952960
return deleteErr
953961
})
954962

955963
if err != nil {
956-
// All retries exhausted or non-retryable error
957964
klog.Errorf("deleteZVOL: Failed to delete ZVOL %s: %v", meta.DatasetID, err)
958965
return status.Errorf(codes.Internal, "Failed to delete ZVOL %s: %v", meta.DatasetID, err)
959966
}
960967

961-
klog.Infof("deleteZVOL: Successfully deleted ZVOL %s for volume %s", meta.DatasetID, meta.Name)
968+
klog.Infof("deleteZVOL: Successfully deleted ZVOL %s after snapshot cleanup", meta.DatasetID)
962969
return nil
963970
}
964971

pkg/driver/controller_snapshot_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ func (m *MockAPIClientForSnapshots) QuerySnapshots(ctx context.Context, filters
6666
return nil, errors.New("QuerySnapshotsFunc not implemented")
6767
}
6868

69+
func (m *MockAPIClientForSnapshots) QuerySnapshotIDs(ctx context.Context, filters []interface{}) ([]string, error) {
70+
return nil, nil
71+
}
72+
6973
func (m *MockAPIClientForSnapshots) CloneSnapshot(ctx context.Context, params tnsapi.CloneSnapshotParams) (*tnsapi.Dataset, error) {
7074
if m.CloneSnapshotFunc != nil {
7175
return m.CloneSnapshotFunc(ctx, params)

pkg/driver/controller_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ func (m *mockAPIClient) QuerySnapshots(ctx context.Context, filters []interface{
166166
return nil, nil
167167
}
168168

169+
func (m *mockAPIClient) QuerySnapshotIDs(ctx context.Context, filters []interface{}) ([]string, error) {
170+
return nil, nil
171+
}
172+
169173
func (m *mockAPIClient) CloneSnapshot(ctx context.Context, params tnsapi.CloneSnapshotParams) (*tnsapi.Dataset, error) {
170174
return nil, errNotImplemented
171175
}

pkg/tnsapi/client.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,6 +1651,32 @@ func (c *Client) QuerySnapshots(ctx context.Context, filters []interface{}) ([]S
16511651
return result, nil
16521652
}
16531653

1654+
// QuerySnapshotIDs is a lightweight version of QuerySnapshots that only returns snapshot IDs.
1655+
// It uses select: ["id"] to minimize response size, which is critical when datasets have
1656+
// many snapshots with large property sets (e.g., after migration from democratic-csi).
1657+
func (c *Client) QuerySnapshotIDs(ctx context.Context, filters []interface{}) ([]string, error) {
1658+
klog.V(4).Infof("Querying snapshot IDs with filters: %+v", filters)
1659+
1660+
queryOpts := map[string]interface{}{
1661+
"select": []string{"id"},
1662+
}
1663+
var result []struct {
1664+
ID string `json:"id"`
1665+
}
1666+
err := c.Call(ctx, "zfs.snapshot.query", []interface{}{filters, queryOpts}, &result)
1667+
if err != nil {
1668+
return nil, fmt.Errorf("failed to query snapshot IDs: %w", err)
1669+
}
1670+
1671+
ids := make([]string, len(result))
1672+
for i, s := range result {
1673+
ids[i] = s.ID
1674+
}
1675+
1676+
klog.V(4).Infof("Found %d snapshot IDs", len(ids))
1677+
return ids, nil
1678+
}
1679+
16541680
// CloneSnapshotParams represents parameters for cloning a snapshot.
16551681
type CloneSnapshotParams struct {
16561682
Snapshot string `json:"snapshot"` // Source snapshot ID (dataset@snapshot)

pkg/tnsapi/interface.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ type ClientInterface interface {
8787
CreateSnapshot(ctx context.Context, params SnapshotCreateParams) (*Snapshot, error)
8888
DeleteSnapshot(ctx context.Context, snapshotID string) error
8989
QuerySnapshots(ctx context.Context, filters []interface{}) ([]Snapshot, error)
90+
QuerySnapshotIDs(ctx context.Context, filters []interface{}) ([]string, error)
9091
CloneSnapshot(ctx context.Context, params CloneSnapshotParams) (*Dataset, error)
9192

9293
// Dataset promotion (for detached clones)

tests/sanity/mock_client.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -903,6 +903,25 @@ func (m *MockClient) QuerySnapshots(ctx context.Context, filters []any) ([]tnsap
903903
return result, nil
904904
}
905905

906+
// QuerySnapshotIDs mocks zfs.snapshot.query with select: ["id"].
907+
// Returns only snapshot IDs to minimize response size.
908+
func (m *MockClient) QuerySnapshotIDs(ctx context.Context, filters []any) ([]string, error) {
909+
m.logCall("QuerySnapshotIDs", filters)
910+
911+
m.mu.Lock()
912+
defer m.mu.Unlock()
913+
914+
var ids []string
915+
for _, snap := range m.snapshots {
916+
if !matchesSnapshotFilters(snap, filters) {
917+
continue
918+
}
919+
ids = append(ids, snap.ID)
920+
}
921+
922+
return ids, nil
923+
}
924+
906925
// matchesSnapshotFilters checks if a snapshot matches the provided filters.
907926
//
908927
//nolint:goconst // Filter field names are used locally in mock filter functions.

0 commit comments

Comments
 (0)