Skip to content

Commit a8da91c

Browse files
committed
Implement lock-denied polling
1 parent 1aaec7e commit a8da91c

File tree

3 files changed

+186
-24
lines changed

3 files changed

+186
-24
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- **Lock-Denied Polling**: Implemented intelligent lock polling for `lock-denied` and `in-use` errors. Lock operations now use fixed 1-second polling intervals (instead of exponential backoff) and respect `LockReleaseTimeout` (default 120s) instead of `MaxRetries`. This provides much better lock acquisition behavior when datastores are temporarily locked by other sessions.
13+
1014
### Changed
1115

1216
- **Serialization**: All operations now use exclusive locks instead of read locks. Operations on the same client connection are fully serialized to prevent write interleaving and simplify reconnection logic.
17+
- **Lock Timeout Behavior**: Lock-denied errors now poll for up to `LockReleaseTimeout` duration with 1-second intervals, ignoring `MaxRetries` configuration. Returns `ErrLockReleaseTimeout` when timeout is exceeded.
18+
- **Transport Error Priority**: Transport errors (connection failures) are now handled before lock polling to ensure reconnection happens first when both error types occur simultaneously.
1319

1420
### Fixed
1521

1622
- **EOF Handling**: Treat `io.EOF` errors as transient and trigger automatic reconnection. This handles scenarios where network devices close idle connections (session timeouts, device restarts), improving reliability for long-running applications and idle connection scenarios.
23+
- **Lock Polling Implementation**: Fixed `LockReleaseTimeout` option which was defined but not implemented. Lock-denied errors previously used standard exponential backoff and gave up after ~7-8 seconds (3 retries). Now correctly polls for configured timeout duration (default 120s).
24+
- **Context Cancellation in Lock Polling**: Added defensive context cancellation check at the start of lock polling to respect context deadlines more precisely.
1725

1826
## [0.1.0] - 2025-11-09
1927

client.go

Lines changed: 112 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,24 @@ func (c *Client) hasTransportError(errs []ErrorModel, goErr error) bool {
13481348
return false
13491349
}
13501350

1351+
// isLockDeniedError checks if errors indicate lock-denied or in-use conditions.
1352+
//
1353+
// Lock-denied errors require special polling behavior with fixed 1-second intervals
1354+
// instead of exponential backoff, and use LockReleaseTimeout instead of MaxRetries.
1355+
//
1356+
// This matches the NETCONF operational pattern where locks are typically released
1357+
// within seconds to minutes, and polling is more efficient than exponential backoff.
1358+
//
1359+
// Returns true if any error in the list is a lock contention error.
1360+
func (c *Client) isLockDeniedError(errs []ErrorModel) bool {
1361+
for _, err := range errs {
1362+
if err.ErrorTag == "lock-denied" || err.ErrorTag == "in-use" {
1363+
return true
1364+
}
1365+
}
1366+
return false
1367+
}
1368+
13511369
// Backoff calculates the backoff delay for retry attempt using exponential backoff with jitter
13521370
//
13531371
// The formula is: delay = min(minDelay * (factor ^ attempt) + jitter, maxDelay)
@@ -1501,6 +1519,10 @@ func (c *Client) sendRPC(ctx context.Context, req *Req) (Res, error) {
15011519
defer cancel()
15021520
}
15031521

1522+
// Track lock-denied polling state
1523+
var lockDeniedStart time.Time
1524+
isLockPolling := false
1525+
15041526
// Retry loop
15051527
for attempt := 0; attempt <= c.MaxRetries; attempt++ {
15061528
// Check context before attempt
@@ -1544,20 +1566,98 @@ func (c *Client) sendRPC(ctx context.Context, req *Req) (Res, error) {
15441566
// Check if transient (checks both NETCONF rpc-errors and scrapligo Go errors)
15451567
isTransient := c.checkTransientError(res.Errors, err)
15461568

1569+
// Check if this is a lock-denied error (requires special polling behavior)
1570+
isLockDenied := c.isLockDeniedError(res.Errors)
1571+
15471572
// Log transient detection for debugging
15481573
if err != nil && isTransient {
15491574
c.logger.Debug(ctx, "NETCONF transient error detected",
15501575
"operation", req.Operation,
15511576
"attempt", attempt,
1577+
"lockDenied", isLockDenied,
15521578
"error", err.Error())
15531579
}
15541580

15551581
// Check for transport/connection errors that require reconnection
15561582
// Both NETCONF <rpc-error> transport errors and EOF errors trigger reconnection
1583+
// IMPORTANT: Handle transport errors BEFORE lock polling to ensure reconnection happens first
15571584
hasTransportError := c.hasTransportError(res.Errors, err)
15581585

1559-
// Not transient or max retries reached
1560-
if !isTransient || attempt >= c.MaxRetries {
1586+
// Handle transport errors with reconnection (PRIORITY: Handle before lock polling)
1587+
if hasTransportError {
1588+
// Use helper to handle lock release/reacquire around reconnection
1589+
if reconnectErr := c.handleTransportErrorReconnect(ctx, req); reconnectErr != nil {
1590+
// Reconnection failed, return original error
1591+
return res, &NetconfError{
1592+
Operation: req.Operation,
1593+
Errors: res.Errors,
1594+
Message: "operation failed and reconnection failed",
1595+
InternalMsg: reconnectErr.Error(),
1596+
Retries: attempt,
1597+
IsTransient: true,
1598+
}
1599+
}
1600+
1601+
c.logger.Info(ctx, "NETCONF reconnection successful",
1602+
"operation", req.Operation,
1603+
"sessionID", c.sessionID)
1604+
1605+
// Reset lock polling state after reconnection
1606+
isLockPolling = false
1607+
lockDeniedStart = time.Time{}
1608+
// Reconnection succeeded, continue to backoff and retry
1609+
}
1610+
1611+
// Handle lock-denied with special timeout-based polling (AFTER transport error handling)
1612+
if isLockDenied {
1613+
// Check context cancellation before lock timeout (defensive check)
1614+
select {
1615+
case <-ctx.Done():
1616+
return Res{}, fmt.Errorf("operation canceled during lock polling: %w", ctx.Err())
1617+
default:
1618+
}
1619+
1620+
// Start lock polling timer on first lock-denied
1621+
if !isLockPolling {
1622+
isLockPolling = true
1623+
lockDeniedStart = time.Now()
1624+
c.logger.Info(ctx, "NETCONF waiting for lock release",
1625+
"operation", req.Operation,
1626+
"target", req.Target,
1627+
"timeout", c.LockReleaseTimeout.String())
1628+
}
1629+
1630+
// Check if lock polling has exceeded timeout - return ErrLockReleaseTimeout
1631+
lockPollingElapsed := time.Since(lockDeniedStart)
1632+
if lockPollingElapsed >= c.LockReleaseTimeout {
1633+
// Lock timeout exceeded, return error
1634+
c.logger.Error(ctx, "NETCONF lock release timeout exceeded",
1635+
"operation", req.Operation,
1636+
"elapsed", lockPollingElapsed.String(),
1637+
"timeout", c.LockReleaseTimeout.String())
1638+
1639+
// Log each RPC error for context
1640+
for i, rpcErr := range res.Errors {
1641+
c.logger.Error(ctx, "NETCONF RPC error",
1642+
"index", i,
1643+
"errorType", rpcErr.ErrorType,
1644+
"errorTag", rpcErr.ErrorTag,
1645+
"errorMessage", rpcErr.ErrorMessage)
1646+
}
1647+
1648+
return res, &NetconfError{
1649+
Operation: req.Operation,
1650+
Errors: res.Errors,
1651+
Message: ErrLockReleaseTimeout.Error(),
1652+
InternalMsg: fmt.Sprintf("waited %s for lock release", lockPollingElapsed),
1653+
Retries: attempt,
1654+
IsTransient: true,
1655+
}
1656+
}
1657+
}
1658+
1659+
// Not transient or (non-lock transient and max retries reached)
1660+
if !isTransient || (!isLockDenied && attempt >= c.MaxRetries) {
15611661
// Log operation failure with error details
15621662
if err != nil {
15631663
c.logger.Error(ctx, "NETCONF operation failed",
@@ -1596,33 +1696,21 @@ func (c *Client) sendRPC(ctx context.Context, req *Req) (Res, error) {
15961696
return res, fmt.Errorf("operation %s failed after %d retries: %w", req.Operation, attempt, err)
15971697
}
15981698

1599-
// Handle transport errors with reconnection
1600-
if hasTransportError {
1601-
// Use helper to handle lock release/reacquire around reconnection
1602-
if reconnectErr := c.handleTransportErrorReconnect(ctx, req); reconnectErr != nil {
1603-
// Reconnection failed, return original error
1604-
return res, &NetconfError{
1605-
Operation: req.Operation,
1606-
Errors: res.Errors,
1607-
Message: "operation failed and reconnection failed",
1608-
InternalMsg: reconnectErr.Error(),
1609-
Retries: attempt,
1610-
IsTransient: true,
1611-
}
1612-
}
1613-
1614-
c.logger.Info(ctx, "NETCONF reconnection successful",
1615-
"operation", req.Operation,
1616-
"sessionID", c.sessionID)
1617-
// Reconnection succeeded, continue to backoff and retry
1699+
// Apply backoff before next retry
1700+
var delay time.Duration
1701+
if isLockDenied {
1702+
// Use fixed 1-second interval for lock polling
1703+
delay = 1 * time.Second
1704+
} else {
1705+
// Use exponential backoff for other transient errors
1706+
delay = c.Backoff(attempt)
16181707
}
16191708

1620-
// Apply backoff before next retry
1621-
delay := c.Backoff(attempt)
16221709
c.logger.Debug(ctx, "NETCONF retry backoff",
16231710
"operation", req.Operation,
16241711
"attempt", attempt,
1625-
"delay", delay.String())
1712+
"delay", delay.String(),
1713+
"lockPolling", isLockPolling)
16261714

16271715
select {
16281716
case <-ctx.Done():

errors_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,3 +374,69 @@ func TestErrorModelRFC6241Fields(t *testing.T) {
374374
}
375375
}
376376
}
377+
378+
// TestIsLockDeniedError tests the isLockDeniedError helper method
379+
func TestIsLockDeniedError(t *testing.T) {
380+
tests := []struct {
381+
name string
382+
errors []ErrorModel
383+
expectLockDenied bool
384+
}{
385+
{
386+
name: "lock-denied error",
387+
errors: []ErrorModel{
388+
{ErrorTag: "lock-denied"},
389+
},
390+
expectLockDenied: true,
391+
},
392+
{
393+
name: "in-use error",
394+
errors: []ErrorModel{
395+
{ErrorTag: "in-use"},
396+
},
397+
expectLockDenied: true,
398+
},
399+
{
400+
name: "lock-denied with other errors",
401+
errors: []ErrorModel{
402+
{ErrorTag: "invalid-value"},
403+
{ErrorTag: "lock-denied"},
404+
},
405+
expectLockDenied: true,
406+
},
407+
{
408+
name: "non-lock error",
409+
errors: []ErrorModel{
410+
{ErrorTag: "invalid-value"},
411+
},
412+
expectLockDenied: false,
413+
},
414+
{
415+
name: "transport error",
416+
errors: []ErrorModel{
417+
{ErrorType: "transport"},
418+
},
419+
expectLockDenied: false,
420+
},
421+
{
422+
name: "empty errors",
423+
errors: []ErrorModel{},
424+
expectLockDenied: false,
425+
},
426+
{
427+
name: "nil errors",
428+
errors: nil,
429+
expectLockDenied: false,
430+
},
431+
}
432+
433+
for _, tt := range tests {
434+
t.Run(tt.name, func(t *testing.T) {
435+
client := &Client{}
436+
result := client.isLockDeniedError(tt.errors)
437+
if result != tt.expectLockDenied {
438+
t.Errorf("expected isLockDeniedError %v, got %v", tt.expectLockDenied, result)
439+
}
440+
})
441+
}
442+
}

0 commit comments

Comments
 (0)