Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ func main() {
maxInflightMountCalls = flag.Int64("max-inflight-mount-calls", driver.UnsetMaxInflightMountCounts, "New NodePublishVolume operation will be blocked if maximum number of inflight calls is reached. If maxInflightMountCallsOptIn is true, it has to be set to a positive value.")
volumeAttachLimitOptIn = flag.Bool("volume-attach-limit-opt-in", false, "Opt in to use volume attach limit.")
volumeAttachLimit = flag.Int64("volume-attach-limit", driver.UnsetVolumeAttachLimit, "Maximum number of volumes that can be attached to a node. If volumeAttachLimitOptIn is true, it has to be set to a positive value.")
forceUnmountAfterTimeout = flag.Bool("force-unmount-after-timeout", false, "Enable force unmount if normal unmount times out during NodeUnpublishVolume.")
unmountTimeout = flag.Duration("unmount-timeout", driver.DefaultUnmountTimeout, "Timeout for unmounting a volume during NodePublishVolume when forceUnmountAfterTimeout is true. If the timeout is reached, the volume will be forcibly unmounted. The default value is 30 seconds.")
)
klog.InitFlags(nil)
flag.Parse()
Expand All @@ -65,7 +67,7 @@ func main() {
if err != nil {
klog.Fatalln(err)
}
drv := driver.NewDriver(*endpoint, etcAmazonEfs, *efsUtilsStaticFilesPath, *tags, *volMetricsOptIn, *volMetricsRefreshPeriod, *volMetricsFsRateLimit, *deleteAccessPointRootDir, *adaptiveRetryMode, *maxInflightMountCallsOptIn, *maxInflightMountCalls, *volumeAttachLimitOptIn, *volumeAttachLimit)
drv := driver.NewDriver(*endpoint, etcAmazonEfs, *efsUtilsStaticFilesPath, *tags, *volMetricsOptIn, *volMetricsRefreshPeriod, *volMetricsFsRateLimit, *deleteAccessPointRootDir, *adaptiveRetryMode, *maxInflightMountCallsOptIn, *maxInflightMountCalls, *volumeAttachLimitOptIn, *volumeAttachLimit, *forceUnmountAfterTimeout, *unmountTimeout)
if err := drv.Run(); err != nil {
klog.Fatalln(err)
}
Expand Down
2 changes: 2 additions & 0 deletions deploy/kubernetes/base/node-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ spec:
- --max-inflight-mount-calls=10
- --volume-attach-limit-opt-in=false
- --volume-attach-limit=20
- --force-unmount-after-timeout=false
- --unmount-timeout=30s
env:
- name: CSI_ENDPOINT
value: unix:/csi/csi.sock
Expand Down
5 changes: 5 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,11 @@ After deploying the driver, you can continue to these sections:
| max-inflight-mount-calls | | -1 | true | New NodePublishVolume operation will be blocked if maximum number of inflight calls is reached. If maxInflightMountCallsOptIn is true, it has to be set to a positive value. |
| volume-attach-limit-opt-in | | false | true | Opt in to use volume attach limit. |
| volume-attach-limit | | -1 | true | Maximum number of volumes that can be attached to a node. If volumeAttachLimitOptIn is true, it has to be set to a positive value. |
| force-unmount-after-timeout | | false | true | Enable force unmount if normal unmount times out during NodeUnpublishVolume |
| unmount-timeout | | 30s | true | Timeout for unmounting a volume during NodePublishVolume when forceUnmountAfterTimeout is true. If the timeout is reached, the volume will be forcibly unmounted. The default value is 30 seconds. |

#### Force Unmount After Timeout
The `force-unmount-after-timeout` feature addresses issues when `NodeUnpublishVolume` gets called infinite times and hangs indefinitely due to broken NFS connections. When enabled, if a normal unmount operation exceeds the configured timeout, the driver will forcibly unmount the volume to prevent indefinite hanging and allow the operation to complete.

#### Suggestion for setting max-inflight-mount-calls and volume-attach-limit

Expand Down
7 changes: 6 additions & 1 deletion pkg/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const (
AgentNotReadyNodeTaintKey = "efs.csi.aws.com/agent-not-ready"
UnsetMaxInflightMountCounts = -1
UnsetVolumeAttachLimit = -1
DefaultUnmountTimeout = 30 * time.Second
)

type Driver struct {
Expand All @@ -57,9 +58,11 @@ type Driver struct {
lockManager LockManagerMap
inFlightMountTracker *InFlightMountTracker
volumeAttachLimit int64
forceUnmountAfterTimeout bool
unmountTimeout time.Duration
}

func NewDriver(endpoint, efsUtilsCfgPath, efsUtilsStaticFilesPath, tags string, volMetricsOptIn bool, volMetricsRefreshPeriod float64, volMetricsFsRateLimit int, deleteAccessPointRootDir bool, adaptiveRetryMode bool, maxInflightMountCallsOptIn bool, maxInflightMountCalls int64, volumeAttachLimitOptIn bool, volumeAttachLimit int64) *Driver {
func NewDriver(endpoint, efsUtilsCfgPath, efsUtilsStaticFilesPath, tags string, volMetricsOptIn bool, volMetricsRefreshPeriod float64, volMetricsFsRateLimit int, deleteAccessPointRootDir bool, adaptiveRetryMode bool, maxInflightMountCallsOptIn bool, maxInflightMountCalls int64, volumeAttachLimitOptIn bool, volumeAttachLimit int64, forceUnmountAfterTimeout bool, unmountTimeout time.Duration) *Driver {
cloud, err := cloud.NewCloud(adaptiveRetryMode)
if err != nil {
klog.Fatalln(err)
Expand All @@ -85,6 +88,8 @@ func NewDriver(endpoint, efsUtilsCfgPath, efsUtilsStaticFilesPath, tags string,
lockManager: NewLockManagerMap(),
inFlightMountTracker: NewInFlightMountTracker(getMaxInflightMountCalls(maxInflightMountCallsOptIn, maxInflightMountCalls)),
volumeAttachLimit: getVolumeAttachLimit(volumeAttachLimitOptIn, volumeAttachLimit),
forceUnmountAfterTimeout: forceUnmountAfterTimeout,
unmountTimeout: unmountTimeout,
}
}

Expand Down
8 changes: 4 additions & 4 deletions pkg/driver/mounter.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,20 @@ import (

// Mounter is an interface for mount operations
type Mounter interface {
mount_utils.Interface
mount_utils.MounterForceUnmounter
MakeDir(pathname string) error
Stat(pathname string) (os.FileInfo, error)
GetDeviceName(mountPath string) (string, int, error)
IsLikelyNotMountPoint(target string) (bool, error)
}

type NodeMounter struct {
mount_utils.Interface
mount_utils.MounterForceUnmounter
}

func newNodeMounter() Mounter {
return &NodeMounter{
Interface: mount_utils.New(""),
MounterForceUnmounter: mount_utils.New("").(mount_utils.MounterForceUnmounter),
}
}

Expand All @@ -57,7 +57,7 @@ func (m *NodeMounter) GetDeviceName(mountPath string) (string, int, error) {
}

func (m *NodeMounter) IsLikelyNotMountPoint(target string) (bool, error) {
notMnt, err := m.Interface.IsLikelyNotMountPoint(target)
notMnt, err := m.MounterForceUnmounter.IsLikelyNotMountPoint(target)
if err != nil {
if os.IsNotExist(err) {
return false, nil
Expand Down
47 changes: 28 additions & 19 deletions pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,28 +240,37 @@ func (d *Driver) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublish
return nil, status.Error(codes.InvalidArgument, "Target path not provided")
}

// Check if target directory is a mount point. GetDeviceNameFromMount
// given a mnt point, finds the device from /proc/mounts
// returns the device name, reference count, and error code
_, refCount, err := d.mounter.GetDeviceName(target)
if err != nil {
format := "failed to check if volume is mounted: %v"
return nil, status.Errorf(codes.Internal, format, err)
}
if d.forceUnmountAfterTimeout {
klog.V(5).Infof("NodeUnpublishVolume: will retry unmount %s with force after timeout %v", target, d.unmountTimeout)
err := d.mounter.UnmountWithForce(target, d.unmountTimeout)
if err != nil {
return nil, status.Errorf(codes.Internal, "Could not unmountWithForce %q: %v", target, err)
}
} else {
// Check if target directory is a mount point. GetDeviceNameFromMount
// given a mnt point, finds the device from /proc/mounts
// returns the device name, reference count, and error code
_, refCount, err := d.mounter.GetDeviceName(target)
if err != nil {
format := "failed to check if volume is mounted: %v"
return nil, status.Errorf(codes.Internal, format, err)
}

// From the spec: If the volume corresponding to the volume_id
// is not staged to the staging_target_path, the Plugin MUST
// reply 0 OK.
if refCount == 0 {
klog.V(5).Infof("NodeUnpublishVolume: %s target not mounted", target)
return &csi.NodeUnpublishVolumeResponse{}, nil
}
// From the spec: If the volume corresponding to the volume_id
// is not staged to the staging_target_path, the Plugin MUST
// reply 0 OK.
if refCount == 0 {
klog.V(5).Infof("NodeUnpublishVolume: %s target not mounted", target)
return &csi.NodeUnpublishVolumeResponse{}, nil
}

klog.V(5).Infof("NodeUnpublishVolume: unmounting %s", target)
err = d.mounter.Unmount(target)
if err != nil {
return nil, status.Errorf(codes.Internal, "Could not unmount %q: %v", target, err)
klog.V(5).Infof("NodeUnpublishVolume: unmounting %s", target)
err = d.mounter.Unmount(target)
if err != nil {
return nil, status.Errorf(codes.Internal, "Could not unmount %q: %v", target, err)
}
}

klog.V(5).Infof("NodeUnpublishVolume: %s unmounted", target)

//TODO: If `du` is running on a volume, unmount waits for it to complete. We should stop `du` on unmount in the future for NodeUnpublish
Expand Down