Skip to content

Commit 9a3003f

Browse files
authored
Connections: detect and handle the Linux dead socket case (#2610)
In Linux, we see 15-minute socket stalls due to OS-level TCP retries. What this means is that the PhysicalConnection detects no issues on the pipe that's retrying, but it is also not receiving any data at all, leading to long stalls in client applications. The goal here is to detect the case where we're timing out commands people have issued to the connection but we're getting _NOTHING_ back on the socket at all. In this case, we should assume the socket is dead and issue a reconnect so that we get out of the hung situation much faster. For an initial go at this, we've chosen 4x the timeout interval as a threshold, but we could make this configurable if needed.
1 parent 76f5205 commit 9a3003f

File tree

3 files changed

+24
-4
lines changed

3 files changed

+24
-4
lines changed

docs/ReleaseNotes.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ Current package versions:
88

99
## Unreleased
1010

11-
- Fix [#2593](https://github.com/StackExchange/StackExchange.Redis/pull/2593): `EXPIRETIME` and `PEXPIRETIME` miscategorized as `PrimaryOnly` commands causing them to fail when issued against a read-only replica ([#2593 by slorello89](https://github.com/StackExchange/StackExchange.Redis/pull/2593))
12-
- Fix [#2591](https://github.com/StackExchange/StackExchange.Redis/pull/2591): Add `HELLO` to Sentinel connections so they can support RESP3 ([#2601 by NickCraver](https://github.com/StackExchange/StackExchange.Redis/pull/2601))
11+
- Fix [#2593](https://github.com/StackExchange/StackExchange.Redis/issues/2593): `EXPIRETIME` and `PEXPIRETIME` miscategorized as `PrimaryOnly` commands causing them to fail when issued against a read-only replica ([#2593 by slorello89](https://github.com/StackExchange/StackExchange.Redis/pull/2593))
12+
- Fix [#2591](https://github.com/StackExchange/StackExchange.Redis/issues/2591): Add `HELLO` to Sentinel connections so they can support RESP3 ([#2601 by NickCraver](https://github.com/StackExchange/StackExchange.Redis/pull/2601))
13+
- Fix [#2595](https://github.com/StackExchange/StackExchange.Redis/issues/2595): Add detection handling for dead sockets that the OS says are okay, seen especially in Linux environments ([#2610](https://github.com/StackExchange/StackExchange.Redis/pull/2610))
1314

1415
## 2.7.4
1516

src/StackExchange.Redis/PhysicalBridge.cs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ internal void OnHeartbeat(bool ifConnectedOnly)
591591
Interlocked.Exchange(ref connectTimeoutRetryCount, 0);
592592
tmp.BridgeCouldBeNull?.ServerEndPoint?.ClearUnselectable(UnselectableFlags.DidNotRespond);
593593
}
594-
tmp.OnBridgeHeartbeat();
594+
int timedOutThisHeartbeat = tmp.OnBridgeHeartbeat();
595595
int writeEverySeconds = ServerEndPoint.WriteEverySeconds,
596596
checkConfigSeconds = ServerEndPoint.ConfigCheckSeconds;
597597

@@ -623,6 +623,17 @@ internal void OnHeartbeat(bool ifConnectedOnly)
623623
// queue, test the socket
624624
KeepAlive();
625625
}
626+
else if (timedOutThisHeartbeat > 0
627+
&& tmp.LastReadSecondsAgo * 1_000 > (tmp.BridgeCouldBeNull?.Multiplexer.AsyncTimeoutMilliseconds * 4))
628+
{
629+
// If we've received *NOTHING* on the pipe in 4 timeouts worth of time and we're timing out commands, issue a connection failure so that we reconnect
630+
// This is meant to address the scenario we see often in Linux configs where TCP retries will happen for 15 minutes.
631+
// To us as a client, we'll see the socket as green/open/fine when writing but we'll be getting nothing back.
632+
// Since we can't depend on the pipe to fail in that case, we want to error here based on the criteria above so we reconnect broken clients much faster.
633+
tmp.BridgeCouldBeNull?.Multiplexer.Logger?.LogWarning($"Dead socket detected, no reads in {tmp.LastReadSecondsAgo} seconds with {timedOutThisHeartbeat} timeouts, issuing disconnect");
634+
OnDisconnected(ConnectionFailureType.SocketFailure, tmp, out _, out State oldState);
635+
tmp.Dispose(); // Cleanup the existing connection/socket if any, otherwise it will wait reading indefinitely
636+
}
626637
}
627638
break;
628639
case (int)State.Disconnected:

src/StackExchange.Redis/PhysicalConnection.cs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ private enum ReadMode : byte
248248
private readonly WeakReference _bridge;
249249
public PhysicalBridge? BridgeCouldBeNull => (PhysicalBridge?)_bridge.Target;
250250

251+
public long LastReadSecondsAgo => unchecked(Environment.TickCount - Thread.VolatileRead(ref lastReadTickCount)) / 1000;
251252
public long LastWriteSecondsAgo => unchecked(Environment.TickCount - Thread.VolatileRead(ref lastWriteTickCount)) / 1000;
252253

253254
private bool IncludeDetailInExceptions => BridgeCouldBeNull?.Multiplexer.RawConfig.IncludeDetailInExceptions ?? false;
@@ -720,8 +721,13 @@ internal void GetStormLog(StringBuilder sb)
720721
}
721722
}
722723

723-
internal void OnBridgeHeartbeat()
724+
/// <summary>
725+
/// Runs on every heartbeat for a bridge, timing out any commands that are overdue and returning an integer of how many we timed out.
726+
/// </summary>
727+
/// <returns>How many commands were overdue and threw timeout exceptions.</returns>
728+
internal int OnBridgeHeartbeat()
724729
{
730+
var result = 0;
725731
var now = Environment.TickCount;
726732
Interlocked.Exchange(ref lastBeatTickCount, now);
727733

@@ -747,6 +753,7 @@ internal void OnBridgeHeartbeat()
747753
multiplexer.OnMessageFaulted(msg, timeoutEx);
748754
msg.SetExceptionAndComplete(timeoutEx, bridge); // tell the message that it is doomed
749755
multiplexer.OnAsyncTimeout();
756+
result++;
750757
}
751758
}
752759
else
@@ -761,6 +768,7 @@ internal void OnBridgeHeartbeat()
761768
}
762769
}
763770
}
771+
return result;
764772
}
765773

766774
internal void OnInternalError(Exception exception, [CallerMemberName] string? origin = null)

0 commit comments

Comments
 (0)