Skip to content

Commit ab757f7

Browse files
authored
fix #2376 - avoid deadlock scenario when completing dead connections (#2378)
* fix #2376: (1) to fix the immediate scenario, don't hold the queue lock when we abort things - only hold it when fetching the next message; (2) to avoid similar, not-yet-seen scenarios, don't blindly wait forever in GetHeadMessages; also standardise on TryPeek/TryDequeue * ExecuteSyncImpl: don't hold the lock-obj when throwing for timeout * use placeholder message when unable to query the connection queue * release notes
1 parent 51a7d90 commit ab757f7

File tree

5 files changed

+89
-33
lines changed

5 files changed

+89
-33
lines changed

docs/ReleaseNotes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Current package versions:
1111
- Fix [#2350](https://github.com/StackExchange/StackExchange.Redis/issues/2350): Properly parse Lua script parameters in all cultures ([#2351 by NickCraver](https://github.com/StackExchange/StackExchange.Redis/pull/2351))
1212
- Fix [#2362](https://github.com/StackExchange/StackExchange.Redis/issues/2362): Set `RedisConnectionException.FailureType` to `AuthenticationFailure` on all authentication scenarios for better handling ([#2367 by NickCraver](https://github.com/StackExchange/StackExchange.Redis/pull/2367))
1313
- Fix [#2368](https://github.com/StackExchange/StackExchange.Redis/issues/2368): Support `RedisValue.Length()` for all storage types ([#2370 by mgravell](https://github.com/StackExchange/StackExchange.Redis/pull/2370))
14+
- Fix [#2376](https://github.com/StackExchange/StackExchange.Redis/issues/2376): Avoid a (rare) deadlock scenario ([#2378 by mgravell](https://github.com/StackExchange/StackExchange.Redis/pull/2378))
1415

1516
## 2.6.90
1617

src/StackExchange.Redis/ConnectionMultiplexer.cs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1959,6 +1959,7 @@ internal static void ThrowFailed<T>(TaskCompletionSource<T>? source, Exception u
19591959
{
19601960
var source = SimpleResultBox<T>.Get();
19611961

1962+
bool timeout = false;
19621963
lock (source)
19631964
{
19641965
#pragma warning disable CS0618 // Type or member is obsolete
@@ -1976,11 +1977,16 @@ internal static void ThrowFailed<T>(TaskCompletionSource<T>? source, Exception u
19761977
else
19771978
{
19781979
Trace("Timeout performing " + message);
1979-
Interlocked.Increment(ref syncTimeouts);
1980-
throw ExceptionFactory.Timeout(this, null, message, server);
1981-
// Very important not to return "source" to the pool here
1980+
timeout = true;
19821981
}
19831982
}
1983+
1984+
if (timeout) // note we throw *outside* of the main lock to avoid deadlock scenarios (#2376)
1985+
{
1986+
Interlocked.Increment(ref syncTimeouts);
1987+
// Very important not to return "source" to the pool here
1988+
throw ExceptionFactory.Timeout(this, null, message, server);
1989+
}
19841990
// Snapshot these so that we can recycle the box
19851991
var val = source.GetResult(out var ex, canRecycle: true); // now that we aren't locking it...
19861992
if (ex != null) throw ex;

src/StackExchange.Redis/ExtensionMethods.Internal.cs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System.Diagnostics.CodeAnalysis;
1+
using System.Collections.Generic;
2+
using System.Diagnostics.CodeAnalysis;
23

34
namespace StackExchange.Redis
45
{
@@ -9,5 +10,28 @@ internal static bool IsNullOrEmpty([NotNullWhen(false)] this string? s) =>
910

1011
internal static bool IsNullOrWhiteSpace([NotNullWhen(false)] this string? s) =>
1112
string.IsNullOrWhiteSpace(s);
13+
14+
#if !NETCOREAPP3_1_OR_GREATER
15+
internal static bool TryDequeue<T>(this Queue<T> queue, [NotNullWhen(true)] out T? result)
16+
{
17+
if (queue.Count == 0)
18+
{
19+
result = default;
20+
return false;
21+
}
22+
result = queue.Dequeue()!;
23+
return true;
24+
}
25+
internal static bool TryPeek<T>(this Queue<T> queue, [NotNullWhen(true)] out T? result)
26+
{
27+
if (queue.Count == 0)
28+
{
29+
result = default;
30+
return false;
31+
}
32+
result = queue.Peek()!;
33+
return true;
34+
}
35+
#endif
1236
}
1337
}

src/StackExchange.Redis/Message.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1566,5 +1566,15 @@ protected override void WriteImpl(PhysicalConnection physical)
15661566
}
15671567
public override int ArgCount => 1;
15681568
}
1569+
1570+
// this is a placeholder message for use when (for example) unable to query the
1571+
// connection queue due to a lock timeout
1572+
internal sealed class UnknownMessage : Message
1573+
{
1574+
public static UnknownMessage Instance { get; } = new();
1575+
private UnknownMessage() : base(0, CommandFlags.None, RedisCommand.UNKNOWN) { }
1576+
public override int ArgCount => 0;
1577+
protected override void WriteImpl(PhysicalConnection physical) => throw new InvalidOperationException("This message cannot be written");
1578+
}
15691579
}
15701580
}

src/StackExchange.Redis/PhysicalConnection.cs

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Buffers;
55
using System.Collections.Generic;
66
using System.Diagnostics;
7+
using System.Diagnostics.CodeAnalysis;
78
using System.IO;
89
using System.IO.Pipelines;
910
using System.Linq;
@@ -16,6 +17,7 @@
1617
using System.Text;
1718
using System.Threading;
1819
using System.Threading.Tasks;
20+
using static StackExchange.Redis.Message;
1921

2022
namespace StackExchange.Redis
2123
{
@@ -396,9 +398,8 @@ public void RecordConnectionFailed(
396398
lock (_writtenAwaitingResponse)
397399
{
398400
// find oldest message awaiting a response
399-
if (_writtenAwaitingResponse.Count != 0)
401+
if (_writtenAwaitingResponse.TryPeek(out var next))
400402
{
401-
var next = _writtenAwaitingResponse.Peek();
402403
unansweredWriteTime = next.GetWriteTime();
403404
}
404405
}
@@ -478,34 +479,42 @@ void add(string lk, string sk, string? v)
478479
bridge?.OnConnectionFailed(this, failureType, outerException);
479480
}
480481
}
481-
// cleanup
482+
// clean up (note: avoid holding the lock when we complete things, even if this means taking
483+
// the lock multiple times; this is fine here - we shouldn't be fighting anyone, and we're already toast)
482484
lock (_writtenAwaitingResponse)
483485
{
484486
bridge?.Trace(_writtenAwaitingResponse.Count != 0, "Failing outstanding messages: " + _writtenAwaitingResponse.Count);
485-
while (_writtenAwaitingResponse.Count != 0)
486-
{
487-
var next = _writtenAwaitingResponse.Dequeue();
487+
}
488488

489-
if (next.Command == RedisCommand.QUIT && next.TrySetResult(true))
490-
{
491-
// fine, death of a socket is close enough
492-
next.Complete();
493-
}
494-
else
489+
while (TryDequeueLocked(_writtenAwaitingResponse, out var next))
490+
{
491+
if (next.Command == RedisCommand.QUIT && next.TrySetResult(true))
492+
{
493+
// fine, death of a socket is close enough
494+
next.Complete();
495+
}
496+
else
497+
{
498+
var ex = innerException is RedisException ? innerException : outerException;
499+
if (bridge != null)
495500
{
496-
var ex = innerException is RedisException ? innerException : outerException;
497-
if (bridge != null)
498-
{
499-
bridge.Trace("Failing: " + next);
500-
bridge.Multiplexer?.OnMessageFaulted(next, ex, origin);
501-
}
502-
next.SetExceptionAndComplete(ex!, bridge);
501+
bridge.Trace("Failing: " + next);
502+
bridge.Multiplexer?.OnMessageFaulted(next, ex, origin);
503503
}
504+
next.SetExceptionAndComplete(ex!, bridge);
504505
}
505506
}
506507

507508
// burn the socket
508509
Shutdown();
510+
511+
static bool TryDequeueLocked(Queue<Message> queue, [NotNullWhen(true)] out Message? message)
512+
{
513+
lock (queue)
514+
{
515+
return queue.TryDequeue(out message);
516+
}
517+
}
509518
}
510519

511520
internal bool IsIdle() => _writeStatus == WriteStatus.Idle;
@@ -1580,18 +1589,10 @@ private void MatchResult(in RawResult result)
15801589
_readStatus = ReadStatus.DequeueResult;
15811590
lock (_writtenAwaitingResponse)
15821591
{
1583-
#if NET5_0_OR_GREATER
15841592
if (!_writtenAwaitingResponse.TryDequeue(out msg))
15851593
{
15861594
throw new InvalidOperationException("Received response with no message waiting: " + result.ToString());
15871595
}
1588-
#else
1589-
if (_writtenAwaitingResponse.Count == 0)
1590-
{
1591-
throw new InvalidOperationException("Received response with no message waiting: " + result.ToString());
1592-
}
1593-
msg = _writtenAwaitingResponse.Dequeue();
1594-
#endif
15951596
}
15961597
_activeMessage = msg;
15971598

@@ -1632,9 +1633,23 @@ static bool TryGetPubSubPayload(in RawResult value, out RedisValue parsed, bool
16321633
internal void GetHeadMessages(out Message? now, out Message? next)
16331634
{
16341635
now = _activeMessage;
1635-
lock(_writtenAwaitingResponse)
1636+
bool haveLock = false;
1637+
try
1638+
{
1639+
// careful locking here; a: don't try too hard (this is error info only), b: avoid deadlock (see #2376)
1640+
Monitor.TryEnter(_writtenAwaitingResponse, 10, ref haveLock);
1641+
if (haveLock)
1642+
{
1643+
_writtenAwaitingResponse.TryPeek(out next);
1644+
}
1645+
else
1646+
{
1647+
next = UnknownMessage.Instance;
1648+
}
1649+
}
1650+
finally
16361651
{
1637-
next = _writtenAwaitingResponse.Count == 0 ? null : _writtenAwaitingResponse.Peek();
1652+
if (haveLock) Monitor.Exit(_writtenAwaitingResponse);
16381653
}
16391654
}
16401655

0 commit comments

Comments
 (0)