diff --git a/cmd/main.go b/cmd/main.go index 0ebab9318..d7e9bbdfd 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -47,6 +47,8 @@ func main() { maxInflightMountCalls = flag.Int64("max-inflight-mount-calls", driver.UnsetMaxInflightMountCounts, "New NodePublishVolume operation will be blocked if maximum number of inflight calls is reached. If maxInflightMountCallsOptIn is true, it has to be set to a positive value.") volumeAttachLimitOptIn = flag.Bool("volume-attach-limit-opt-in", false, "Opt in to use volume attach limit.") volumeAttachLimit = flag.Int64("volume-attach-limit", driver.UnsetVolumeAttachLimit, "Maximum number of volumes that can be attached to a node. If volumeAttachLimitOptIn is true, it has to be set to a positive value.") + forceUnmountAfterTimeout = flag.Bool("force-unmount-after-timeout", false, "Enable force unmount if normal unmount times out during NodeUnpublishVolume.") + unmountTimeout = flag.Duration("unmount-timeout", driver.DefaultUnmountTimeout, "Timeout for unmounting a volume during NodeUnpublishVolume when forceUnmountAfterTimeout is true. If the timeout is reached, the volume will be forcibly unmounted. 
The default value is 30 seconds.") ) klog.InitFlags(nil) flag.Parse() @@ -65,7 +67,7 @@ func main() { if err != nil { klog.Fatalln(err) } - drv := driver.NewDriver(*endpoint, etcAmazonEfs, *efsUtilsStaticFilesPath, *tags, *volMetricsOptIn, *volMetricsRefreshPeriod, *volMetricsFsRateLimit, *deleteAccessPointRootDir, *adaptiveRetryMode, *maxInflightMountCallsOptIn, *maxInflightMountCalls, *volumeAttachLimitOptIn, *volumeAttachLimit) + drv := driver.NewDriver(*endpoint, etcAmazonEfs, *efsUtilsStaticFilesPath, *tags, *volMetricsOptIn, *volMetricsRefreshPeriod, *volMetricsFsRateLimit, *deleteAccessPointRootDir, *adaptiveRetryMode, *maxInflightMountCallsOptIn, *maxInflightMountCalls, *volumeAttachLimitOptIn, *volumeAttachLimit, *forceUnmountAfterTimeout, *unmountTimeout) if err := drv.Run(); err != nil { klog.Fatalln(err) } diff --git a/deploy/kubernetes/base/node-daemonset.yaml b/deploy/kubernetes/base/node-daemonset.yaml index a969fd1e6..e022122f3 100644 --- a/deploy/kubernetes/base/node-daemonset.yaml +++ b/deploy/kubernetes/base/node-daemonset.yaml @@ -61,6 +61,8 @@ spec: - --max-inflight-mount-calls=10 - --volume-attach-limit-opt-in=false - --volume-attach-limit=20 + - --force-unmount-after-timeout=false + - --unmount-timeout=30s env: - name: CSI_ENDPOINT value: unix:/csi/csi.sock diff --git a/docs/README.md b/docs/README.md index d6aa46506..26b0b5962 100644 --- a/docs/README.md +++ b/docs/README.md @@ -357,6 +357,11 @@ After deploying the driver, you can continue to these sections: | max-inflight-mount-calls | | -1 | true | New NodePublishVolume operation will be blocked if maximum number of inflight calls is reached. If maxInflightMountCallsOptIn is true, it has to be set to a positive value. | | volume-attach-limit-opt-in | | false | true | Opt in to use volume attach limit. | | volume-attach-limit | | -1 | true | Maximum number of volumes that can be attached to a node. If volumeAttachLimitOptIn is true, it has to be set to a positive value. 
| +| force-unmount-after-timeout | | false | true | Enable force unmount if normal unmount times out during NodeUnpublishVolume | +| unmount-timeout | | 30s | true | Timeout for unmounting a volume during NodeUnpublishVolume when forceUnmountAfterTimeout is true. If the timeout is reached, the volume will be forcibly unmounted. The default value is 30 seconds. | + +#### Force Unmount After Timeout +The `force-unmount-after-timeout` feature addresses issues when `NodeUnpublishVolume` is called repeatedly and hangs indefinitely due to broken NFS connections. When enabled, if a normal unmount operation exceeds the configured timeout, the driver will forcibly unmount the volume to prevent indefinite hanging and allow the operation to complete. #### Suggestion for setting max-inflight-mount-calls and volume-attach-limit diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 271cec284..3763f9394 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -36,6 +36,7 @@ const ( AgentNotReadyNodeTaintKey = "efs.csi.aws.com/agent-not-ready" UnsetMaxInflightMountCounts = -1 UnsetVolumeAttachLimit = -1 + DefaultUnmountTimeout = 30 * time.Second ) type Driver struct { @@ -57,9 +58,11 @@ type Driver struct { lockManager LockManagerMap inFlightMountTracker *InFlightMountTracker volumeAttachLimit int64 + forceUnmountAfterTimeout bool + unmountTimeout time.Duration } -func NewDriver(endpoint, efsUtilsCfgPath, efsUtilsStaticFilesPath, tags string, volMetricsOptIn bool, volMetricsRefreshPeriod float64, volMetricsFsRateLimit int, deleteAccessPointRootDir bool, adaptiveRetryMode bool, maxInflightMountCallsOptIn bool, maxInflightMountCalls int64, volumeAttachLimitOptIn bool, volumeAttachLimit int64) *Driver { +func NewDriver(endpoint, efsUtilsCfgPath, efsUtilsStaticFilesPath, tags string, volMetricsOptIn bool, volMetricsRefreshPeriod float64, volMetricsFsRateLimit int, deleteAccessPointRootDir bool, adaptiveRetryMode bool, maxInflightMountCallsOptIn bool, 
maxInflightMountCalls int64, volumeAttachLimitOptIn bool, volumeAttachLimit int64, forceUnmountAfterTimeout bool, unmountTimeout time.Duration) *Driver { cloud, err := cloud.NewCloud(adaptiveRetryMode) if err != nil { klog.Fatalln(err) @@ -85,6 +88,8 @@ func NewDriver(endpoint, efsUtilsCfgPath, efsUtilsStaticFilesPath, tags string, lockManager: NewLockManagerMap(), inFlightMountTracker: NewInFlightMountTracker(getMaxInflightMountCalls(maxInflightMountCallsOptIn, maxInflightMountCalls)), volumeAttachLimit: getVolumeAttachLimit(volumeAttachLimitOptIn, volumeAttachLimit), + forceUnmountAfterTimeout: forceUnmountAfterTimeout, + unmountTimeout: unmountTimeout, } } diff --git a/pkg/driver/mounter.go b/pkg/driver/mounter.go index 340b40c1e..3be04a190 100644 --- a/pkg/driver/mounter.go +++ b/pkg/driver/mounter.go @@ -21,7 +21,7 @@ import ( // Mounter is an interface for mount operations type Mounter interface { - mount_utils.Interface + mount_utils.MounterForceUnmounter MakeDir(pathname string) error Stat(pathname string) (os.FileInfo, error) GetDeviceName(mountPath string) (string, int, error) @@ -29,12 +29,12 @@ type Mounter interface { } type NodeMounter struct { - mount_utils.Interface + mount_utils.MounterForceUnmounter } func newNodeMounter() Mounter { return &NodeMounter{ - Interface: mount_utils.New(""), + MounterForceUnmounter: mount_utils.New("").(mount_utils.MounterForceUnmounter), } } @@ -57,7 +57,7 @@ func (m *NodeMounter) GetDeviceName(mountPath string) (string, int, error) { } func (m *NodeMounter) IsLikelyNotMountPoint(target string) (bool, error) { - notMnt, err := m.Interface.IsLikelyNotMountPoint(target) + notMnt, err := m.MounterForceUnmounter.IsLikelyNotMountPoint(target) if err != nil { if os.IsNotExist(err) { return false, nil diff --git a/pkg/driver/node.go b/pkg/driver/node.go index 4040fda84..6f5bd5475 100644 --- a/pkg/driver/node.go +++ b/pkg/driver/node.go @@ -240,28 +240,37 @@ func (d *Driver) NodeUnpublishVolume(ctx context.Context, req 
*csi.NodeUnpublish return nil, status.Error(codes.InvalidArgument, "Target path not provided") } - // Check if target directory is a mount point. GetDeviceNameFromMount - // given a mnt point, finds the device from /proc/mounts - // returns the device name, reference count, and error code - _, refCount, err := d.mounter.GetDeviceName(target) - if err != nil { - format := "failed to check if volume is mounted: %v" - return nil, status.Errorf(codes.Internal, format, err) - } + if d.forceUnmountAfterTimeout { + klog.V(5).Infof("NodeUnpublishVolume: will retry unmount %s with force after timeout %v", target, d.unmountTimeout) + err := d.mounter.UnmountWithForce(target, d.unmountTimeout) + if err != nil { + return nil, status.Errorf(codes.Internal, "Could not unmountWithForce %q: %v", target, err) + } + } else { + // Check if target directory is a mount point. GetDeviceNameFromMount + // given a mnt point, finds the device from /proc/mounts + // returns the device name, reference count, and error code + _, refCount, err := d.mounter.GetDeviceName(target) + if err != nil { + format := "failed to check if volume is mounted: %v" + return nil, status.Errorf(codes.Internal, format, err) + } - // From the spec: If the volume corresponding to the volume_id - // is not staged to the staging_target_path, the Plugin MUST - // reply 0 OK. - if refCount == 0 { - klog.V(5).Infof("NodeUnpublishVolume: %s target not mounted", target) - return &csi.NodeUnpublishVolumeResponse{}, nil - } + // From the spec: If the volume corresponding to the volume_id + // is not staged to the staging_target_path, the Plugin MUST + // reply 0 OK. 
+ if refCount == 0 { + klog.V(5).Infof("NodeUnpublishVolume: %s target not mounted", target) + return &csi.NodeUnpublishVolumeResponse{}, nil + } - klog.V(5).Infof("NodeUnpublishVolume: unmounting %s", target) - err = d.mounter.Unmount(target) - if err != nil { - return nil, status.Errorf(codes.Internal, "Could not unmount %q: %v", target, err) + klog.V(5).Infof("NodeUnpublishVolume: unmounting %s", target) + err = d.mounter.Unmount(target) + if err != nil { + return nil, status.Errorf(codes.Internal, "Could not unmount %q: %v", target, err) + } } + klog.V(5).Infof("NodeUnpublishVolume: %s unmounted", target) //TODO: If `du` is running on a volume, unmount waits for it to complete. We should stop `du` on unmount in the future for NodeUnpublish