Skip to content

Commit 49eb208

Browse files
fenio and ashtonian committed
feat: add NVMe-oF connect concurrency limiter (#131)
Prevent kernel NVMe subsystem registration lock contention when staging many volumes simultaneously on a single node. Uses a channel-based semaphore (default: 5) configurable via --max-concurrent-nvme-connects flag and node.maxConcurrentNVMeConnects Helm value. Adds Prometheus metrics for concurrent/waiting connect operations. Co-authored-by: Ashton Kinslow <github@ashtonkinslow.com>
1 parent 11e17d2 commit 49eb208

File tree

12 files changed

+130
-49
lines changed

12 files changed

+130
-49
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ The driver is configured via command-line flags and Kubernetes secrets:
205205
- `--driver-name` - CSI driver name (default: `tns.csi.io`)
206206
- `--api-url` - TrueNAS API URL (e.g., `ws://YOUR-TRUENAS-IP/api/v2.0/websocket`)
207207
- `--api-key` - TrueNAS API key
208+
- `--max-concurrent-nvme-connects` - Maximum concurrent NVMe-oF connect operations per node (default: `5`)
208209

209210
### Storage Class Parameters
210211

charts/tns-csi-driver/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ Detached snapshots use `zfs send/receive` to create independent dataset copies t
286286
| `node.kubeletPath` | Kubelet data directory | `/var/lib/kubelet` |
287287
| `node.logLevel` | Log verbosity (0-5) | `2` |
288288
| `node.debug` | Enable debug mode | `false` |
289+
| `node.maxConcurrentNVMeConnects` | Max concurrent NVMe-oF connect operations per node | `5` |
289290
| `node.resources.limits.cpu` | CPU limit | `200m` |
290291
| `node.resources.limits.memory` | Memory limit | `200Mi` |
291292
| `node.resources.requests.cpu` | CPU request | `10m` |

charts/tns-csi-driver/templates/node.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ spec:
5858
{{- if .Values.node.enableNVMeDiscovery }}
5959
- "--enable-nvme-discovery"
6060
{{- end }}
61+
- "--max-concurrent-nvme-connects={{ .Values.node.maxConcurrentNVMeConnects | default 5 }}"
6162
env:
6263
- name: NODE_ID
6364
valueFrom:

charts/tns-csi-driver/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,11 @@ node:
150150
# Enable only if you need discovery for multi-path or dynamic target topologies.
151151
enableNVMeDiscovery: false
152152

153+
# Maximum concurrent NVMe-oF connect operations per node.
154+
# Limits contention on the kernel's NVMe subsystem registration lock.
155+
# Recommended: 3-5. Values of 0 or less do NOT disable the limit; the driver falls back to the default of 5.
156+
maxConcurrentNVMeConnects: 5
157+
153158
# Update strategy for DaemonSet
154159
updateStrategy:
155160
type: RollingUpdate

cmd/tns-csi-driver/main.go

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,17 @@ var (
2020
)
2121

2222
var (
23-
endpoint = flag.String("endpoint", "unix:///var/lib/kubelet/plugins/tns.csi.io/csi.sock", "CSI endpoint")
24-
nodeID = flag.String("node-id", "", "Node ID")
25-
driverName = flag.String("driver-name", "tns.csi.io", "Name of the driver")
26-
apiURL = flag.String("api-url", "", "Storage system API URL (e.g., ws://10.10.20.100/api/v2.0/websocket)")
27-
apiKey = flag.String("api-key", "", "Storage system API key")
28-
metricsAddr = flag.String("metrics-addr", ":8080", "Address to expose Prometheus metrics")
29-
skipTLSVerify = flag.Bool("skip-tls-verify", false, "Skip TLS certificate verification (for self-signed certificates)")
30-
showVersion = flag.Bool("show-version", false, "Show version and exit")
31-
debug = flag.Bool("debug", false, "Enable debug logging (equivalent to -v=4)")
32-
enableNVMeDiscovery = flag.Bool("enable-nvme-discovery", false, "Run nvme discover before nvme connect (default: false, all connection params are known from volume context)")
23+
endpoint = flag.String("endpoint", "unix:///var/lib/kubelet/plugins/tns.csi.io/csi.sock", "CSI endpoint")
24+
nodeID = flag.String("node-id", "", "Node ID")
25+
driverName = flag.String("driver-name", "tns.csi.io", "Name of the driver")
26+
apiURL = flag.String("api-url", "", "Storage system API URL (e.g., ws://10.10.20.100/api/v2.0/websocket)")
27+
apiKey = flag.String("api-key", "", "Storage system API key")
28+
metricsAddr = flag.String("metrics-addr", ":8080", "Address to expose Prometheus metrics")
29+
skipTLSVerify = flag.Bool("skip-tls-verify", false, "Skip TLS certificate verification (for self-signed certificates)")
30+
showVersion = flag.Bool("show-version", false, "Show version and exit")
31+
debug = flag.Bool("debug", false, "Enable debug logging (equivalent to -v=4)")
32+
enableNVMeDiscovery = flag.Bool("enable-nvme-discovery", false, "Run nvme discover before nvme connect (default: false, all connection params are known from volume context)")
33+
maxConcurrentNVMeConnects = flag.Int("max-concurrent-nvme-connects", 5, "Maximum number of concurrent NVMe-oF connect operations per node (limits kernel NVMe subsystem lock contention)")
3334
)
3435

3536
func main() {
@@ -72,15 +73,16 @@ func main() {
7273
klog.V(4).Infof("Node ID: %s", *nodeID)
7374

7475
drv, err := driver.NewDriver(driver.Config{
75-
DriverName: *driverName,
76-
Version: version,
77-
NodeID: *nodeID,
78-
Endpoint: *endpoint,
79-
APIURL: *apiURL,
80-
APIKey: *apiKey,
81-
MetricsAddr: *metricsAddr,
82-
SkipTLSVerify: *skipTLSVerify,
83-
EnableNVMeDiscovery: *enableNVMeDiscovery,
76+
DriverName: *driverName,
77+
Version: version,
78+
NodeID: *nodeID,
79+
Endpoint: *endpoint,
80+
APIURL: *apiURL,
81+
APIKey: *apiKey,
82+
MetricsAddr: *metricsAddr,
83+
SkipTLSVerify: *skipTLSVerify,
84+
EnableNVMeDiscovery: *enableNVMeDiscovery,
85+
MaxConcurrentNVMeConnects: *maxConcurrentNVMeConnects,
8486
})
8587
if err != nil {
8688
klog.Fatalf("Failed to create driver: %v", err)

docs/DEPLOYMENT.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -617,15 +617,20 @@ kubectl logs -n kube-system tns-csi-node-xxxxx -c tns-csi-plugin
617617
- Verify subsystem exists: `sudo nvme list-subsys`
618618
- Check /sys/class/nvme for device entries
619619

620-
7. **iSCSI connection failures**
620+
7. **NVMe-oF volumes timing out with many concurrent mounts**
621+
- Symptom: `signal: killed` in node plugin logs when staging many NVMe-oF volumes simultaneously
622+
- Cause: Too many concurrent `nvme connect` processes overwhelming the kernel's NVMe subsystem registration lock
623+
- Fix: The driver limits concurrency to 5 by default (`node.maxConcurrentNVMeConnects`). Lower this value if you still see timeouts, or increase it if mounts are too slow on fast hardware
624+
625+
8. **iSCSI connection failures**
621626
- Verify open-iscsi is installed: `iscsiadm --version`
622627
- Check iscsid service is running: `systemctl status iscsid`
623628
- Verify iSCSI service is enabled on TrueNAS
624629
- Check firewall allows port 3260 (default iSCSI port)
625630
- Test discovery: `sudo iscsiadm -m discovery -t sendtargets -p YOUR-TRUENAS-IP:3260`
626631
- Check node plugin logs for detailed error messages
627632

628-
8. **iSCSI device not appearing**
633+
9. **iSCSI device not appearing**
629634
- Wait a few seconds for device discovery
630635
- Check dmesg for SCSI errors: `sudo dmesg | grep -i scsi`
631636
- List active sessions: `sudo iscsiadm -m session`

docs/METRICS.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ Protocol-specific volume operations (NFS, NVMe-oF, and iSCSI):
3838
- Capacity of provisioned volumes in bytes
3939
- Labels: `volume_id`, `protocol`
4040

41+
### NVMe-oF Connect Concurrency Metrics
42+
43+
- **`tns_csi_nvme_connect_concurrent`** (gauge)
44+
- Number of NVMe-oF connect operations currently in progress
45+
46+
- **`tns_csi_nvme_connect_waiting`** (gauge)
47+
- Number of NVMe-oF connect operations waiting for the semaphore
48+
- Non-zero values indicate the concurrency limit is actively throttling connections
49+
4150
### WebSocket Connection Metrics
4251

4352
Metrics for the TrueNAS API WebSocket connection:

pkg/driver/driver.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,17 @@ import (
2020

2121
// Config contains the configuration for the driver.
2222
type Config struct {
23-
DriverName string
24-
Version string
25-
NodeID string
26-
Endpoint string
27-
APIURL string
28-
APIKey string
29-
MetricsAddr string // Address to expose Prometheus metrics (e.g., ":8080")
30-
TestMode bool // Enable test mode for sanity tests (skips actual mounts)
31-
SkipTLSVerify bool // Skip TLS certificate verification (for self-signed certs)
32-
EnableNVMeDiscovery bool // Run nvme discover before nvme connect (default: false)
23+
DriverName string
24+
Version string
25+
NodeID string
26+
Endpoint string
27+
APIURL string
28+
APIKey string
29+
MetricsAddr string // Address to expose Prometheus metrics (e.g., ":8080")
30+
TestMode bool // Enable test mode for sanity tests (skips actual mounts)
31+
SkipTLSVerify bool // Skip TLS certificate verification (for self-signed certs)
32+
EnableNVMeDiscovery bool // Run nvme discover before nvme connect (default: false)
33+
MaxConcurrentNVMeConnects int // Max concurrent NVMe-oF connect operations per node (default: 5)
3334
}
3435

3536
// Driver is the TNS CSI driver.
@@ -75,7 +76,7 @@ func NewDriverWithClient(cfg Config, client tnsapi.ClientInterface) (*Driver, er
7576
// Initialize CSI services
7677
d.identity = NewIdentityService(cfg.DriverName, cfg.Version)
7778
d.controller = NewControllerService(client, nodeRegistry)
78-
d.node = NewNodeService(cfg.NodeID, client, cfg.TestMode, nodeRegistry, cfg.EnableNVMeDiscovery)
79+
d.node = NewNodeService(cfg.NodeID, client, cfg.TestMode, nodeRegistry, cfg.EnableNVMeDiscovery, cfg.MaxConcurrentNVMeConnects)
7980

8081
return d, nil
8182
}

pkg/driver/node.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,24 @@ type NodeService struct {
3737
csi.UnimplementedNodeServer
3838
apiClient tnsapi.ClientInterface
3939
nodeRegistry *NodeRegistry
40+
nvmeConnectSem chan struct{}
4041
nodeID string
41-
testMode bool // Test mode flag to skip actual mounts
42-
enableDiscovery bool // Run nvme discover before nvme connect
42+
testMode bool
43+
enableDiscovery bool
4344
}
4445

4546
// NewNodeService creates a new node service.
46-
func NewNodeService(nodeID string, apiClient tnsapi.ClientInterface, testMode bool, nodeRegistry *NodeRegistry, enableDiscovery bool) *NodeService {
47+
func NewNodeService(nodeID string, apiClient tnsapi.ClientInterface, testMode bool, nodeRegistry *NodeRegistry, enableDiscovery bool, maxConcurrentNVMeConnects int) *NodeService {
48+
if maxConcurrentNVMeConnects <= 0 {
49+
maxConcurrentNVMeConnects = 5
50+
}
4751
return &NodeService{
4852
nodeID: nodeID,
4953
apiClient: apiClient,
5054
testMode: testMode,
5155
nodeRegistry: nodeRegistry,
5256
enableDiscovery: enableDiscovery,
57+
nvmeConnectSem: make(chan struct{}, maxConcurrentNVMeConnects),
5358
}
5459
}
5560

pkg/driver/node_nvmeof.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"time"
1111

1212
"github.com/container-storage-interface/spec/lib/go/csi"
13+
"github.com/fenio/tns-csi/pkg/metrics"
1314
"github.com/fenio/tns-csi/pkg/mount"
1415
"google.golang.org/grpc/codes"
1516
"google.golang.org/grpc/status"
@@ -78,6 +79,27 @@ func (s *NodeService) stageNVMeOFVolume(ctx context.Context, req *csi.NodeStageV
7879
return nil, status.Errorf(codes.FailedPrecondition, "nvme-cli not available: %v", checkErr)
7980
}
8081

82+
// Acquire semaphore to limit concurrent NVMe-oF connect operations.
83+
// This prevents overwhelming the kernel's NVMe subsystem registration lock
84+
// when many volumes are being staged simultaneously.
85+
klog.V(4).Infof("Waiting for NVMe-oF connect semaphore (capacity: %d) for NQN: %s", cap(s.nvmeConnectSem), params.nqn)
86+
metrics.NVMeConnectWaiting()
87+
select {
88+
case s.nvmeConnectSem <- struct{}{}:
89+
metrics.NVMeConnectDoneWaiting()
90+
metrics.NVMeConnectStart()
91+
defer func() {
92+
<-s.nvmeConnectSem
93+
metrics.NVMeConnectDone()
94+
}()
95+
case <-ctx.Done():
96+
metrics.NVMeConnectDoneWaiting()
97+
return nil, status.Errorf(codes.DeadlineExceeded,
98+
"timed out waiting for NVMe-oF connect semaphore (max concurrent: %d): %v",
99+
cap(s.nvmeConnectSem), ctx.Err())
100+
}
101+
klog.V(4).Infof("Acquired NVMe-oF connect semaphore for NQN: %s", params.nqn)
102+
81103
// Connect to NVMe-oF target and stage device
82104
return s.connectAndStageDevice(ctx, params, volumeID, stagingTargetPath, volumeCapability, isBlockVolume, volumeContext, datasetName)
83105
}

0 commit comments

Comments
 (0)