Skip to content

Commit be8e15d

Browse files
authored
Better waiter errors with NIOTS (#1775)
Motivation: When establishing a connection using NIOTS, Network.framework may enter a 'waiting' state because of a transient error. This is surfaced as a user inbound event. In many cases the connect call won't resolve until the connect timeout passes (defaults to 20 seconds). Attempts to start a call on this connection during this time will fail with a timeout at the connection pool layer (where the max wait time defaults to 5 seconds) and users will see a 'deadline exceeded' error without any more context. However, we can provide more information by passing up the transient error from Network.framework to the connection pool. This isn't currently possible as events flow up on state changes, in this case from connecting to transient failure when the connect times out. Modifications: - Catch the `WaitingForConnectivity` event and bubble up its error through the idle handler to the connection pool Result: Better errors on connect timeouts in some situations
1 parent a260bb3 commit be8e15d

File tree

4 files changed

+112
-9
lines changed

4 files changed

+112
-9
lines changed

Sources/GRPC/ConnectionManager.swift

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ internal final class ConnectionManager: @unchecked Sendable {
3333
internal struct ConnectingState {
3434
var backoffIterator: ConnectionBackoffIterator?
3535
var reconnect: Reconnect
36+
var connectError: Error?
3637

3738
var candidate: EventLoopFuture<Channel>
3839
var readyChannelMuxPromise: EventLoopPromise<HTTP2StreamMultiplexer>
@@ -656,10 +657,30 @@ internal final class ConnectionManager: @unchecked Sendable {
656657
]
657658
)
658659

659-
case .connecting:
660-
// Ignore the error, the channel promise will notify the manager of any error which occurs
661-
// while connecting.
662-
()
660+
case .connecting(var state):
661+
// Record the error, the channel promise will notify the manager of any error which occurs
662+
// while connecting. It may be overridden by this error if it contains more relevant
663+
// information
664+
if state.connectError == nil {
665+
state.connectError = error
666+
self.state = .connecting(state)
667+
668+
// The pool is only notified of connection errors when the connection transitions to the
669+
// transient failure state. However, in some cases (i.e. with NIOTS), errors can be thrown
670+
// during the connect but before the connect times out.
671+
//
672+
// This opens up a period of time where you can start a call and have it fail with
673+
// deadline exceeded (because no connection was available within the configured max
674+
// wait time for the pool) but without any diagnostic information. The information is
675+
// available but it hasn't been made available to the pool at that point in time.
676+
//
677+
// The delegate can't easily be modified (it's public API) and a new API doesn't make all
678+
// that much sense so we elect to check whether the delegate is the pool and call it
679+
// directly.
680+
if let pool = self.connectivityDelegate as? ConnectionPool {
681+
pool.sync.updateMostRecentError(error)
682+
}
683+
}
663684

664685
case var .active(state):
665686
state.error = error
@@ -935,16 +956,26 @@ extension ConnectionManager {
935956

936957
switch self.state {
937958
case let .connecting(connecting):
959+
let reportedError: Error
960+
switch error as? ChannelError {
961+
case .some(.connectTimeout):
962+
// A more relevant error may have been caught earlier. Use that in preference to the
963+
// timeout as it'll likely be more useful.
964+
reportedError = connecting.connectError ?? error
965+
default:
966+
reportedError = error
967+
}
968+
938969
// Should we reconnect?
939970
switch connecting.reconnect {
940971
// No, shutdown.
941972
case .none:
942973
self.logger.debug("shutting down connection, no reconnect configured/remaining")
943974
self.state = .shutdown(
944-
ShutdownState(closeFuture: self.eventLoop.makeSucceededFuture(()), reason: error)
975+
ShutdownState(closeFuture: self.eventLoop.makeSucceededFuture(()), reason: reportedError)
945976
)
946-
connecting.readyChannelMuxPromise.fail(error)
947-
connecting.candidateMuxPromise.fail(error)
977+
connecting.readyChannelMuxPromise.fail(reportedError)
978+
connecting.candidateMuxPromise.fail(reportedError)
948979

949980
// Yes, after a delay.
950981
case let .after(delay):
@@ -953,10 +984,10 @@ extension ConnectionManager {
953984
self.startConnecting()
954985
}
955986
self.state = .transientFailure(
956-
TransientFailureState(from: connecting, scheduled: scheduled, reason: error)
987+
TransientFailureState(from: connecting, scheduled: scheduled, reason: reportedError)
957988
)
958989
// Candidate mux users are not willing to wait.
959-
connecting.candidateMuxPromise.fail(error)
990+
connecting.candidateMuxPromise.fail(reportedError)
960991
}
961992

962993
// The application must have called shutdown while we were trying to establish a connection

Sources/GRPC/ConnectionPool/ConnectionPool.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,12 @@ extension ConnectionPool {
857857
self.pool.eventLoop.assertInEventLoop()
858858
return self.pool._connections.values.reduce(0) { $0 + $1.reservedStreams }
859859
}
860+
861+
/// Updates the most recent connection error.
862+
internal func updateMostRecentError(_ error: Error) {
863+
self.pool.eventLoop.assertInEventLoop()
864+
self.pool.updateMostRecentError(error)
865+
}
860866
}
861867

862868
internal var sync: Sync {

Sources/GRPC/GRPCIdleHandler.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import Logging
1717
import NIOCore
1818
import NIOHTTP2
1919
import NIOTLS
20+
import NIOTransportServices
2021

2122
internal final class GRPCIdleHandler: ChannelInboundHandler {
2223
typealias InboundIn = HTTP2Frame
@@ -287,6 +288,14 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
287288
)
288289
context.fireUserInboundEventTriggered(event)
289290
} else {
291+
#if canImport(Network)
292+
if #available(macOS 10.14, iOS 12.0, tvOS 12.0, watchOS 6.0, *) {
293+
if let waitsForConnectivity = event as? NIOTSNetworkEvents.WaitingForConnectivity {
294+
self.mode.connectionManager?.channelError(waitsForConnectivity.transientError)
295+
}
296+
}
297+
#endif
298+
290299
context.fireUserInboundEventTriggered(event)
291300
}
292301
}

Tests/GRPCTests/GRPCNetworkFrameworkTests.swift

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import NIOCore
2424
import NIOPosix
2525
import NIOSSL
2626
import NIOTransportServices
27+
import GRPCSampleData
2728
import Security
2829
import XCTest
2930

@@ -208,6 +209,62 @@ final class GRPCNetworkFrameworkTests: GRPCTestCase {
208209

209210
XCTAssertNoThrow(try self.doEchoGet())
210211
}
212+
213+
func testWaiterPicksUpNWError(
214+
_ configure: (inout GRPCChannelPool.Configuration) -> Void
215+
) async throws {
216+
let builder = Server.usingTLSBackedByNIOSSL(
217+
on: self.group,
218+
certificateChain: [SampleCertificate.server.certificate],
219+
privateKey: SamplePrivateKey.server
220+
)
221+
222+
let server = try await builder.bind(host: "127.0.0.1", port: 0).get()
223+
defer { try? server.close().wait() }
224+
225+
let client = try GRPCChannelPool.with(
226+
target: .hostAndPort("127.0.0.1", server.channel.localAddress!.port!),
227+
transportSecurity: .tls(.makeClientConfigurationBackedByNetworkFramework()),
228+
eventLoopGroup: self.tsGroup
229+
) {
230+
configure(&$0)
231+
}
232+
233+
let echo = Echo_EchoAsyncClient(channel: client)
234+
do {
235+
let _ = try await echo.get(.with { $0.text = "ignored" })
236+
} catch let error as GRPCConnectionPoolError {
237+
XCTAssertEqual(error.code, .deadlineExceeded)
238+
XCTAssert(error.underlyingError is NWError)
239+
} catch {
240+
XCTFail("Expected GRPCConnectionPoolError")
241+
}
242+
243+
let promise = self.group.next().makePromise(of: Void.self)
244+
client.closeGracefully(deadline: .now() + .seconds(1), promise: promise)
245+
try await promise.futureResult.get()
246+
}
247+
248+
func testErrorPickedUpBeforeConnectTimeout() async throws {
249+
try await self.testWaiterPicksUpNWError {
250+
// Configure the wait time to be less than the connect timeout, the waiter
251+
// should fail with the appropriate NWError before the connect times out.
252+
$0.connectionPool.maxWaitTime = .milliseconds(500)
253+
$0.connectionBackoff.minimumConnectionTimeout = 1.0
254+
}
255+
}
256+
257+
func testNotWaitingForConnectivity() async throws {
258+
try await self.testWaiterPicksUpNWError {
259+
// The minimum connect time is still high, but setting wait for activity to false
260+
// means it fails on entering the waiting state rather than seeing out the connect
261+
// timeout.
262+
$0.connectionPool.maxWaitTime = .milliseconds(500)
263+
$0.debugChannelInitializer = { channel in
264+
channel.setOption(NIOTSChannelOptions.waitForActivity, value: false)
265+
}
266+
}
267+
}
211268
}
212269

213270
#endif // canImport(Network)

0 commit comments

Comments
 (0)