Skip to content

Commit 193e16e

Browse files
committed
Fix deadlock scenario
1 parent fa12763 commit 193e16e

File tree

1 file changed

+130
-90
lines changed

1 file changed

+130
-90
lines changed

client.go

Lines changed: 130 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,17 @@ func (c *Client) ensureConnected() error {
390390
return c.connect()
391391
}
392392

393+
// requireDriver checks if the driver is initialized and returns an error if not.
394+
// This helper centralizes nil driver checks and provides consistent error messages.
395+
//
396+
// PRECONDITION: Caller must hold c.mu (read or write lock)
397+
func (c *Client) requireDriver(operation string) error {
398+
if c.driver == nil {
399+
return fmt.Errorf("operation %s failed: driver is nil (connection closed)", operation)
400+
}
401+
return nil
402+
}
403+
393404
// Close closes the NETCONF session and cleans up resources
394405
//
395406
// This sends a close-session RPC to the server and closes the underlying
@@ -467,6 +478,11 @@ func (c *Client) Get(ctx context.Context, filter Filter, mods ...func(*Req)) (Re
467478
c.mu.RLock()
468479
defer c.mu.RUnlock()
469480

481+
// Defensive check: Verify driver still valid after acquiring lock
482+
if err := c.requireDriver("get"); err != nil {
483+
return Res{}, err
484+
}
485+
470486
// Build request
471487
req := &Req{
472488
Operation: "get",
@@ -501,6 +517,11 @@ func (c *Client) GetConfig(ctx context.Context, source string, filter Filter, mo
501517
c.mu.RLock()
502518
defer c.mu.RUnlock()
503519

520+
// Defensive check: Verify driver still valid after acquiring lock
521+
if err := c.requireDriver("get-config"); err != nil {
522+
return Res{}, err
523+
}
524+
504525
// Build request
505526
req := &Req{
506527
Operation: "get-config",
@@ -539,6 +560,11 @@ func (c *Client) EditConfig(ctx context.Context, target, config string, mods ...
539560
c.mu.Lock()
540561
defer c.mu.Unlock()
541562

563+
// Defensive check: Verify driver still valid after acquiring lock
564+
if err := c.requireDriver("edit-config"); err != nil {
565+
return Res{}, err
566+
}
567+
542568
// Build request
543569
req := &Req{
544570
Operation: "edit-config",
@@ -572,6 +598,11 @@ func (c *Client) CopyConfig(ctx context.Context, source, target string, mods ...
572598
c.mu.Lock()
573599
defer c.mu.Unlock()
574600

601+
// Defensive check: Verify driver still valid after acquiring lock
602+
if err := c.requireDriver("copy-config"); err != nil {
603+
return Res{}, err
604+
}
605+
575606
// Build request
576607
req := &Req{
577608
Operation: "copy-config",
@@ -607,6 +638,11 @@ func (c *Client) DeleteConfig(ctx context.Context, target string, mods ...func(*
607638
c.mu.Lock()
608639
defer c.mu.Unlock()
609640

641+
// Defensive check: Verify driver still valid after acquiring lock
642+
if err := c.requireDriver("delete-config"); err != nil {
643+
return Res{}, err
644+
}
645+
610646
// Additional check: only startup can be deleted per RFC 6241
611647
target = strings.TrimSpace(strings.ToLower(target))
612648
if target != "startup" {
@@ -634,14 +670,21 @@ func (c *Client) DeleteConfig(ctx context.Context, target string, mods ...func(*
634670
//
635671
// Per RFC 6241 Section 7.5, a lock prevents other NETCONF sessions from
636672
// performing configuration changes. If another session holds the lock,
637-
// Lock() will block until the lock becomes available or the operation times out.
673+
// Lock() will automatically retry with exponential backoff until the lock
674+
// becomes available or the context deadline is reached.
675+
//
676+
// Connection Guarantee: This method ensures a stable connection exists
677+
// before attempting to acquire the lock. This prevents race conditions in
678+
// concurrent environments.
638679
//
639680
// IMPORTANT: Always use defer to ensure locks are released even if errors occur.
640681
// Failure to unlock can cause deadlocks and prevent other sessions from operating.
641682
//
642683
// Example:
643684
//
644-
// ctx := context.Background()
685+
// ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
686+
// defer cancel()
687+
//
645688
// res, err := client.Lock(ctx, "candidate")
646689
// if err != nil {
647690
// log.Fatal(err)
@@ -668,18 +711,27 @@ func (c *Client) Lock(ctx context.Context, target string, mods ...func(*Req)) (R
668711
mod(req)
669712
}
670713

671-
// Send RPC without holding mutex - driver has its own synchronization
672-
// and waitForLockRelease may recursively call Lock() to test availability
714+
// Send RPC without holding c.mu - driver has its own synchronization.
715+
// This differs from other operations (Get, EditConfig, etc.) which hold
716+
// c.mu during sendRPC(). Lock/Unlock operations are NETCONF protocol
717+
// operations that don't access client state, so driver-level
718+
// synchronization is sufficient. Connection validation happens in
719+
// executeRPC() (line 1582).
673720
return c.sendRPC(ctx, req)
674721
}
675722

676723
// Unlock unlocks the specified datastore
677724
//
725+
// Connection Guarantee: Like Lock(), this method ensures a stable connection exists
726+
// before attempting to release the lock.
727+
//
678728
// See Lock() for complete lock/unlock documentation and proper usage with defer.
679729
//
680730
// Example:
681731
//
682-
// ctx := context.Background()
732+
// ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
733+
// defer cancel()
734+
//
683735
// res, err := client.Lock(ctx, "candidate")
684736
// if err != nil {
685737
// log.Fatal(err)
@@ -702,7 +754,8 @@ func (c *Client) Unlock(ctx context.Context, target string, mods ...func(*Req))
702754
mod(req)
703755
}
704756

705-
// Send RPC without holding mutex - driver has its own synchronization
757+
// Send RPC without holding c.mu - driver has its own synchronization.
758+
// See Lock() for explanation of why this differs from other operations.
706759
return c.sendRPC(ctx, req)
707760
}
708761

@@ -754,6 +807,11 @@ func (c *Client) Commit(ctx context.Context, mods ...func(*Req)) (Res, error) {
754807
c.mu.Lock()
755808
defer c.mu.Unlock()
756809

810+
// Defensive check: Verify driver still valid after acquiring lock
811+
if err := c.requireDriver("commit"); err != nil {
812+
return Res{}, err
813+
}
814+
757815
// Build request
758816
req := &Req{
759817
Operation: "commit",
@@ -785,6 +843,11 @@ func (c *Client) Discard(ctx context.Context, mods ...func(*Req)) (Res, error) {
785843
c.mu.Lock()
786844
defer c.mu.Unlock()
787845

846+
// Defensive check: Verify driver still valid after acquiring lock
847+
if err := c.requireDriver("discard"); err != nil {
848+
return Res{}, err
849+
}
850+
788851
// Build request
789852
req := &Req{
790853
Operation: "discard",
@@ -819,6 +882,11 @@ func (c *Client) Validate(ctx context.Context, source string, mods ...func(*Req)
819882
c.mu.RLock()
820883
defer c.mu.RUnlock()
821884

885+
// Defensive check: Verify driver still valid after acquiring lock
886+
if err := c.requireDriver("validate"); err != nil {
887+
return Res{}, err
888+
}
889+
822890
// Build request
823891
req := &Req{
824892
Operation: "validate",
@@ -866,6 +934,11 @@ func (c *Client) RPC(ctx context.Context, rpcXML string, mods ...func(*Req)) (Re
866934
c.mu.Lock()
867935
defer c.mu.Unlock()
868936

937+
// Defensive check: Verify driver still valid after acquiring lock
938+
if err := c.requireDriver("rpc"); err != nil {
939+
return Res{}, err
940+
}
941+
869942
// Build request
870943
req := &Req{
871944
Operation: "rpc",
@@ -1274,64 +1347,23 @@ func (c *Client) Backoff(attempt int) time.Duration {
12741347
return time.Duration(delay)
12751348
}
12761349

1277-
// waitForLockRelease waits for a datastore lock to be released
1278-
//
1279-
// This method polls the device to check if the lock is still held and waits
1280-
// for it to be released. Used when a lock-denied error is encountered.
1281-
//
1282-
// Parameters:
1283-
// - ctx: Context for cancellation
1284-
// - target: Datastore name ("running", "candidate", "startup")
1285-
//
1286-
// Returns an error if the lock is not released within LockReleaseTimeout.
1287-
func (c *Client) waitForLockRelease(ctx context.Context, target string) error {
1288-
c.logger.Info(ctx, "NETCONF waiting for lock release",
1289-
"target", target,
1290-
"timeout", c.LockReleaseTimeout.String())
1291-
1292-
// Apply lock release timeout
1293-
ctx, cancel := context.WithTimeout(ctx, c.LockReleaseTimeout)
1294-
defer cancel()
1295-
1296-
// Poll interval (1 second)
1297-
ticker := time.NewTicker(1 * time.Second)
1298-
defer ticker.Stop()
1299-
1300-
for {
1301-
select {
1302-
case <-ctx.Done():
1303-
return ErrLockReleaseTimeout
1304-
case <-ticker.C:
1305-
// Try to acquire lock
1306-
res, err := c.Lock(ctx, target)
1307-
if err == nil && res.OK {
1308-
// Lock acquired, release it immediately to verify availability
1309-
// Note: ignoring unlock errors is intentional - we proved lock availability
1310-
_, _ = c.Unlock(ctx, target) //nolint:errcheck // Intentional: verifying lock availability only
1311-
1312-
c.logger.Info(ctx, "NETCONF lock acquired",
1313-
"target", target)
1314-
1315-
return nil
1316-
}
1317-
// Lock still held, continue waiting
1318-
}
1319-
}
1320-
}
1321-
13221350
// reconnect attempts to reconnect the NETCONF session
13231351
//
13241352
// This method closes the existing connection and establishes a new one,
13251353
// re-negotiating capabilities. Used internally when transport errors are
13261354
// detected during retry logic.
13271355
//
1328-
// PRECONDITION: Caller must hold c.mu.Lock() (write lock)
1356+
// PRECONDITION: Caller must NOT hold any locks (acquires write lock internally)
13291357
//
13301358
// Returns an error if reconnection fails.
13311359
func (c *Client) reconnect() error {
13321360
c.logger.Info(context.Background(), "NETCONF reconnecting",
13331361
"host", c.Host)
13341362

1363+
// Acquire write lock for exclusive access to driver state
1364+
c.mu.Lock()
1365+
defer c.mu.Unlock()
1366+
13351367
// Close existing connection (ignore errors - connection may already be broken)
13361368
if c.driver != nil {
13371369
_ = c.driver.Close() //nolint:errcheck // Explicitly ignore error (connection likely already broken)
@@ -1342,6 +1374,49 @@ func (c *Client) reconnect() error {
13421374
return c.connect()
13431375
}
13441376

1377+
// handleTransportErrorReconnect handles transport error reconnection with proper lock management.
1378+
// This helper reduces cyclomatic complexity by extracting the lock release/reacquire logic.
1379+
//
1380+
// Parameters:
1381+
// - ctx: Context for logging
1382+
// - req: Request that failed with transport error
1383+
//
1384+
// Returns an error if reconnection fails.
1385+
func (c *Client) handleTransportErrorReconnect(ctx context.Context, req *Req) error {
1386+
c.logger.Info(ctx, "NETCONF transport error detected, reconnecting",
1387+
"operation", req.Operation)
1388+
1389+
// Determine lock type held by caller based on operation type
1390+
// Read operations: get, get-config, validate (hold RLock)
1391+
// Write operations: edit-config, copy-config, delete-config, commit, discard, rpc (hold Lock)
1392+
// Lock operations: lock, unlock (hold no lock)
1393+
isReadOp := req.Operation == opGet || req.Operation == opGetConfig || req.Operation == opValidate
1394+
isWriteOp := req.Operation == opEditConfig || req.Operation == opCopyConfig ||
1395+
req.Operation == opDeleteConfig || req.Operation == opCommit ||
1396+
req.Operation == opDiscard || req.Operation == opRPC
1397+
1398+
// Release lock before reconnect (reconnect acquires its own write lock)
1399+
if isReadOp {
1400+
c.mu.RUnlock()
1401+
} else if isWriteOp {
1402+
c.mu.Unlock()
1403+
}
1404+
// Lock/Unlock operations don't hold c.mu, so nothing to release
1405+
1406+
// Attempt to reconnect (acquires and releases its own write lock)
1407+
reconnectErr := c.reconnect()
1408+
1409+
// Reacquire original lock type
1410+
if isReadOp {
1411+
c.mu.RLock()
1412+
} else if isWriteOp {
1413+
c.mu.Lock()
1414+
}
1415+
// Lock/Unlock operations don't hold c.mu, so nothing to reacquire
1416+
1417+
return reconnectErr
1418+
}
1419+
13451420
// sendRPC sends a NETCONF RPC request via scrapligo and parses the response
13461421
//
13471422
// This method handles all NETCONF operations by dispatching to the appropriate
@@ -1444,38 +1519,6 @@ func (c *Client) sendRPC(ctx context.Context, req *Req) (Res, error) {
14441519
"error", err.Error())
14451520
}
14461521

1447-
// Handle lock-denied errors with polling before general backoff
1448-
if isTransient {
1449-
// Check if this is a lock-denied error
1450-
hasLockDenied := false
1451-
var lockTarget string
1452-
for _, rpcErr := range res.Errors {
1453-
if rpcErr.ErrorTag == "lock-denied" || rpcErr.ErrorTag == "in-use" {
1454-
hasLockDenied = true
1455-
// Extract target from request
1456-
lockTarget = req.Target
1457-
break
1458-
}
1459-
}
1460-
1461-
if hasLockDenied && lockTarget != "" {
1462-
// Use lock-specific polling instead of exponential backoff
1463-
if waitErr := c.waitForLockRelease(ctx, lockTarget); waitErr != nil {
1464-
// Timeout waiting for lock, return error
1465-
return res, &NetconfError{
1466-
Operation: req.Operation,
1467-
Errors: res.Errors,
1468-
Message: "lock wait timeout",
1469-
InternalMsg: waitErr.Error(),
1470-
Retries: attempt,
1471-
IsTransient: true,
1472-
}
1473-
}
1474-
// Lock released, retry immediately without backoff
1475-
continue
1476-
}
1477-
}
1478-
14791522
// Check for transport/connection errors that require reconnection
14801523
// This includes both NETCONF <rpc-error> transport errors and scrapligo Go errors
14811524
hasTransportError := c.hasTransportError(res.Errors, err)
@@ -1522,12 +1565,8 @@ func (c *Client) sendRPC(ctx context.Context, req *Req) (Res, error) {
15221565

15231566
// Handle transport errors with reconnection
15241567
if hasTransportError {
1525-
c.logger.Info(ctx, "NETCONF transport error detected, reconnecting",
1526-
"operation", req.Operation,
1527-
"attempt", attempt)
1528-
1529-
// Attempt to reconnect
1530-
if reconnectErr := c.reconnect(); reconnectErr != nil {
1568+
// Use helper to handle lock release/reacquire around reconnection
1569+
if reconnectErr := c.handleTransportErrorReconnect(ctx, req); reconnectErr != nil {
15311570
// Reconnection failed, return original error
15321571
return res, &NetconfError{
15331572
Operation: req.Operation,
@@ -1538,6 +1577,7 @@ func (c *Client) sendRPC(ctx context.Context, req *Req) (Res, error) {
15381577
IsTransient: true,
15391578
}
15401579
}
1580+
15411581
c.logger.Info(ctx, "NETCONF reconnection successful",
15421582
"operation", req.Operation,
15431583
"sessionID", c.sessionID)

0 commit comments

Comments
 (0)