@@ -23,6 +23,8 @@ import (
2323
2424 crdv1 "github.com/kubernetes-csi/external-snapshotter/v2/pkg/apis/volumesnapshot/v1beta1"
2525 "github.com/kubernetes-csi/external-snapshotter/v2/pkg/utils"
26+ codes "google.golang.org/grpc/codes"
27+ "google.golang.org/grpc/status"
2628 v1 "k8s.io/api/core/v1"
2729 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2830 "k8s.io/klog"
@@ -81,6 +83,11 @@ func (ctrl *csiSnapshotSideCarController) syncContent(content *crdv1.VolumeSnaps
8183 // or ListSnapshots CSI methods over and over again for
8284 // performance reasons.
8385 if content .Status != nil && content .Status .ReadyToUse != nil && * content .Status .ReadyToUse == true {
86+ // Try to remove AnnVolumeSnapshotBeingCreated if it is not removed yet for some reason
87+ err := ctrl .removeAnnVolumeSnapshotBeingCreated (content )
88+ if err != nil {
89+ return fmt .Errorf ("failed to remove VolumeSnapshotBeingCreated annotation from the content %s: %q" , content .Name , err )
90+ }
8491 return nil
8592 }
8693 ctrl .checkandUpdateContentStatus (content )
@@ -126,10 +133,10 @@ func (ctrl *csiSnapshotSideCarController) createSnapshot(content *crdv1.VolumeSn
126133 klog .V (5 ).Infof ("createSnapshot for content [%s]: started" , content .Name )
127134 opName := fmt .Sprintf ("create-%s" , content .Name )
128135 ctrl .scheduleOperation (opName , func () error {
129- contentObj , err := ctrl .createSnapshotOperation (content )
136+ contentObj , err := ctrl .createSnapshotWrapper (content )
130137 if err != nil {
131138 ctrl .updateContentErrorStatusWithEvent (content , v1 .EventTypeWarning , "SnapshotCreationFailed" , fmt .Sprintf ("Failed to create snapshot: %v" , err ))
132- klog .Errorf ("createSnapshot [%s]: error occurred in createSnapshotOperation : %v" , opName , err )
139+ klog .Errorf ("createSnapshot [%s]: error occurred in createSnapshotWrapper : %v" , opName , err )
133140 return err
134141 }
135142
@@ -276,75 +283,80 @@ func (ctrl *csiSnapshotSideCarController) checkandUpdateContentStatusOperation(c
276283 }
277284 driverName = content .Spec .Driver
278285 snapshotID = * content .Spec .Source .SnapshotHandle
279- } else {
280- class , snapshotterCredentials , err := ctrl .getCSISnapshotInput (content )
281- if err != nil {
282- return nil , fmt .Errorf ("failed to get input parameters to create snapshot %s: %q" , content .Name , err )
286+
287+ klog .V (5 ).Infof ("checkandUpdateContentStatusOperation: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t" , driverName , snapshotID , creationTime , size , readyToUse )
288+
289+ if creationTime .IsZero () {
290+ creationTime = time .Now ()
283291 }
284292
285- driverName , snapshotID , creationTime , size , readyToUse , err = ctrl .handler . CreateSnapshot (content , class . Parameters , snapshotterCredentials )
293+ updatedContent , err : = ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , creationTime . UnixNano (), size )
286294 if err != nil {
287- klog .Errorf ("checkandUpdateContentStatusOperation: failed to call create snapshot to check whether the snapshot is ready to use %q" , err )
288295 return nil , err
289296 }
297+ return updatedContent , nil
298+ } else {
299+ return ctrl .createSnapshotWrapper (content )
290300 }
291- klog .V (5 ).Infof ("checkandUpdateContentStatusOperation: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t" , driverName , snapshotID , creationTime , size , readyToUse )
292-
293- if creationTime .IsZero () {
294- creationTime = time .Now ()
295- }
296-
297- updateContent , err := ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , creationTime .UnixNano (), size )
298- if err != nil {
299- return nil , err
300- }
301- return updateContent , nil
302301}
303302
304- // The function goes through the whole snapshot creation process.
305- // 1. Trigger the snapshot through csi storage provider.
306- // 2. Update VolumeSnapshot status with creationtimestamp information
307- // 3. Create the VolumeSnapshotContent object with the snapshot id information.
308- // 4. Bind the VolumeSnapshot and VolumeSnapshotContent object
309- func (ctrl * csiSnapshotSideCarController ) createSnapshotOperation (content * crdv1.VolumeSnapshotContent ) (* crdv1.VolumeSnapshotContent , error ) {
310- klog .Infof ("createSnapshotOperation: Creating snapshot for content %s through the plugin ..." , content .Name )
311-
312- // content.Status will be created for the first time after a snapshot
313- // is created by the CSI driver. If content.Status is not nil,
314- // we should update content status without creating snapshot again.
315- if content .Status != nil && content .Status .Error != nil && content .Status .Error .Message != nil && ! isControllerUpdateFailError (content .Status .Error ) {
316- klog .V (4 ).Infof ("error is already set in snapshot, do not retry to create: %s" , * content .Status .Error .Message )
317- return content , nil
318- }
303+ // This is a wrapper function for the snapshot creation process.
304+ func (ctrl * csiSnapshotSideCarController ) createSnapshotWrapper (content * crdv1.VolumeSnapshotContent ) (* crdv1.VolumeSnapshotContent , error ) {
305+ klog .Infof ("createSnapshotWrapper: Creating snapshot for content %s through the plugin ..." , content .Name )
319306
320307 class , snapshotterCredentials , err := ctrl .getCSISnapshotInput (content )
321308 if err != nil {
322309 return nil , fmt .Errorf ("failed to get input parameters to create snapshot for content %s: %q" , content .Name , err )
323310 }
324311
312+ // NOTE(xyang): handle create timeout
313+ // Add an annotation to indicate the snapshot creation request has been
314+ // sent to the storage system and the controller is waiting for a response.
315+ // The annotation will be removed after the storage system has responded with
316+ // success or permanent failure. If the request times out, annotation will
317+ // remain on the content to avoid potential leaking of a snapshot resource on
318+ // the storage system.
319+ err = ctrl .setAnnVolumeSnapshotBeingCreated (content )
320+ if err != nil {
321+ return nil , fmt .Errorf ("failed to add VolumeSnapshotBeingCreated annotation on the content %s: %q" , content .Name , err )
322+ }
323+
325324 driverName , snapshotID , creationTime , size , readyToUse , err := ctrl .handler .CreateSnapshot (content , class .Parameters , snapshotterCredentials )
326325 if err != nil {
326+ // NOTE(xyang): handle create timeout
327+ // If it is a final error, remove annotation to indicate
328+ // storage system has responded with an error
329+ klog .Infof ("createSnapshotWrapper: CreateSnapshot for content %s returned error: %v" , content .Name , err )
330+ if isCSIFinalError (err ) {
331+ err = ctrl .removeAnnVolumeSnapshotBeingCreated (content )
332+ if err != nil {
333+ return nil , fmt .Errorf ("failed to remove VolumeSnapshotBeingCreated annotation from the content %s: %q" , content .Name , err )
334+ }
335+ }
336+
327337 return nil , fmt .Errorf ("failed to take snapshot of the volume, %s: %q" , * content .Spec .Source .VolumeHandle , err )
328338 }
329- if driverName != class .Driver {
330- return nil , fmt .Errorf ("failed to take snapshot of the volume, %s: driver name %s returned from the driver is different from driver %s in snapshot class" , * content .Spec .Source .VolumeHandle , driverName , class .Driver )
331- }
332339
333340 klog .V (5 ).Infof ("Created snapshot: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t" , driverName , snapshotID , creationTime , size , readyToUse )
334341
335- timestamp := creationTime .UnixNano ()
336- newContent , err := ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , timestamp , size )
342+ if creationTime .IsZero () {
343+ creationTime = time .Now ()
344+ }
345+
346+ newContent , err := ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , creationTime .UnixNano (), size )
337347 if err != nil {
338- strerr := fmt . Sprintf ("error updating volume snapshot content status for snapshot %s: %v." , content .Name , err )
339- klog . Error ( strerr )
348+ klog . Errorf ("error updating status for volume snapshot content %s: %v." , content .Name , err )
349+ return nil , fmt . Errorf ( "error updating status for volume snapshot content %s: %v." , content . Name , err )
340350 } else {
341351 content = newContent
342352 }
343353
344- // Update content in the cache store
345- _ , err = ctrl .storeContentUpdate (content )
354+ // NOTE(xyang): handle create timeout
355+ // Remove annotation to indicate storage system has successfully
356+ // cut the snapshot
357+ err = ctrl .removeAnnVolumeSnapshotBeingCreated (content )
346358 if err != nil {
347- klog .Errorf ("failed to update content store %v" , err )
359+ return nil , fmt .Errorf ("failed to remove VolumeSnapshotBeingCreated annotation on the content %s: %q" , content . Name , err )
348360 }
349361
350362 return content , nil
@@ -564,9 +576,99 @@ func (ctrl *csiSnapshotSideCarController) shouldDelete(content *crdv1.VolumeSnap
564576 if content .Spec .Source .SnapshotHandle != nil && content .Spec .VolumeSnapshotRef .UID == "" {
565577 return true
566578 }
567- // 2) shouldDelete returns true if AnnVolumeSnapshotBeingDeleted annotation is set
579+
580+ // NOTE(xyang): Handle create snapshot timeout
581+ // 2) shouldDelete returns false if AnnVolumeSnapshotBeingCreated
582+ // annotation is set. This indicates a CreateSnapshot CSI RPC has
583+ // not responded with success or failure.
584+ // We need to keep waiting for a response from the CSI driver.
585+ if metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated ) {
586+ return false
587+ }
588+
589+ // 3) shouldDelete returns true if AnnVolumeSnapshotBeingDeleted annotation is set
568590 if metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingDeleted ) {
569591 return true
570592 }
571593 return false
572594}
595+
596+ // setAnnVolumeSnapshotBeingCreated sets VolumeSnapshotBeingCreated annotation
597+ // on VolumeSnapshotContent
598+ // If set, it indicates snapshot is being created
599+ func (ctrl * csiSnapshotSideCarController ) setAnnVolumeSnapshotBeingCreated (content * crdv1.VolumeSnapshotContent ) error {
600+ if metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated ) {
601+ // the annotation already exists, return directly
602+ return nil
603+ }
604+
605+ // Set AnnVolumeSnapshotBeingCreated
606+ klog .V (5 ).Infof ("setAnnVolumeSnapshotBeingCreated: set annotation [%s:yes] on content [%s]." , utils .AnnVolumeSnapshotBeingCreated , content .Name )
607+ contentClone := content .DeepCopy ()
608+ metav1 .SetMetaDataAnnotation (& contentClone .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated , "yes" )
609+
610+ updatedContent , err := ctrl .clientset .SnapshotV1beta1 ().VolumeSnapshotContents ().Update (contentClone )
611+ if err != nil {
612+ return newControllerUpdateError (content .Name , err .Error ())
613+ }
614+ // update content if update is successful
615+ content = updatedContent
616+
617+ _ , err = ctrl .storeContentUpdate (content )
618+ if err != nil {
619+ klog .V (4 ).Infof ("setAnnVolumeSnapshotBeingCreated for content [%s]: cannot update internal cache %v" , content .Name , err )
620+ }
621+ klog .V (5 ).Infof ("setAnnVolumeSnapshotBeingCreated: volume snapshot content %+v" , content )
622+
623+ return nil
624+ }
625+
626+ // removeAnnVolumeSnapshotBeingCreated removes the VolumeSnapshotBeingCreated
627+ // annotation from a content if there exists one.
628+ func (ctrl csiSnapshotSideCarController ) removeAnnVolumeSnapshotBeingCreated (content * crdv1.VolumeSnapshotContent ) error {
629+ if ! metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated ) {
630+ // the annotation does not exist, return directly
631+ return nil
632+ }
633+ contentClone := content .DeepCopy ()
634+ delete (contentClone .ObjectMeta .Annotations , utils .AnnVolumeSnapshotBeingCreated )
635+
636+ updatedContent , err := ctrl .clientset .SnapshotV1beta1 ().VolumeSnapshotContents ().Update (contentClone )
637+ if err != nil {
638+ return newControllerUpdateError (content .Name , err .Error ())
639+ }
640+ // update content if update is successful
641+ content = updatedContent
642+
643+ klog .V (5 ).Infof ("Removed VolumeSnapshotBeingCreated annotation from volume snapshot content %s" , content .Name )
644+ _ , err = ctrl .storeContentUpdate (content )
645+ if err != nil {
646+ klog .Errorf ("failed to update content store %v" , err )
647+ }
648+ return nil
649+ }
650+
651+ // This function checks if the error is final
652+ func isCSIFinalError (err error ) bool {
653+ // Sources:
654+ // https://github.com/grpc/grpc/blob/master/doc/statuscodes.md
655+ // https://github.com/container-storage-interface/spec/blob/master/spec.md
656+ st , ok := status .FromError (err )
657+ if ! ok {
658+ // This is not gRPC error. The operation must have failed before gRPC
659+ // method was called, otherwise we would get gRPC error.
660+ // We don't know if any previous CreateSnapshot is in progress, be on the safe side.
661+ return false
662+ }
663+ switch st .Code () {
664+ case codes .Canceled , // gRPC: Client Application cancelled the request
665+ codes .DeadlineExceeded , // gRPC: Timeout
666+ codes .Unavailable , // gRPC: Server shutting down, TCP connection broken - previous CreateSnapshot() may be still in progress.
667+ codes .ResourceExhausted , // gRPC: Server temporarily out of resources - previous CreateSnapshot() may be still in progress.
668+ codes .Aborted : // CSI: Operation pending for Snapshot
669+ return false
670+ }
671+ // All other errors mean that creating snapshot either did not
672+ // even start or failed. It is for sure not in progress.
673+ return true
674+ }
0 commit comments