Skip to content

Commit 770629b

Browse files
authored
Don't trap on invalid connection state transitions (#1573)
Motivation: The connection manager is quite aggresive about trapping if an invalid state is hit. It's alsmost impossible to know when states are truly unreachable so in most coses we should handle them as best as we can. If we believe them to be unreachable but cannot easily prove it then we should crash in debug mode and handle it as best as possible in release mode. Modifications: - Handle various state transitions more gently in the connection manager. Result: Gentler state handling.
1 parent a20cac0 commit 770629b

File tree

2 files changed

+193
-70
lines changed

2 files changed

+193
-70
lines changed

Sources/GRPC/ConnectionManager.swift

Lines changed: 80 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,14 @@ internal final class ConnectionManager {
7777
var scheduled: Scheduled<Void>
7878
var reason: Error
7979

80-
init(from state: ConnectingState, scheduled: Scheduled<Void>, reason: Error) {
80+
init(from state: ConnectingState, scheduled: Scheduled<Void>, reason: Error?) {
8181
self.backoffIterator = state.backoffIterator
8282
self.readyChannelMuxPromise = state.readyChannelMuxPromise
8383
self.scheduled = scheduled
84-
self.reason = reason
84+
self.reason = reason ?? GRPCStatus(
85+
code: .unavailable,
86+
message: "Unexpected connection drop"
87+
)
8588
}
8689

8790
init(from state: ConnectedState, scheduled: Scheduled<Void>) {
@@ -391,7 +394,7 @@ internal final class ConnectionManager {
391394
self.startConnecting()
392395
// We started connecting so we must transition to the `connecting` state.
393396
guard case let .connecting(connecting) = self.state else {
394-
self.invalidState()
397+
self.unreachableState()
395398
}
396399
multiplexer = connecting.readyChannelMuxPromise.futureResult
397400

@@ -432,7 +435,7 @@ internal final class ConnectionManager {
432435
self.startConnecting()
433436
// We started connecting so we must transition to the `connecting` state.
434437
guard case let .connecting(connecting) = self.state else {
435-
self.invalidState()
438+
self.unreachableState()
436439
}
437440
return connecting.candidateMuxPromise.futureResult
438441
case let .connecting(state):
@@ -674,20 +677,13 @@ internal final class ConnectionManager {
674677
case .shutdown:
675678
channel.close(mode: .all, promise: nil)
676679

677-
// These cases are purposefully separated: some crash reporting services provide stack traces
678-
// which don't include the precondition failure message (which contain the invalid state we were
679-
// in). Keeping the cases separate allows us work out the state from the line number.
680-
case .idle:
681-
self.invalidState()
682-
683-
case .active:
684-
self.invalidState()
685-
686-
case .ready:
687-
self.invalidState()
688-
689-
case .transientFailure:
690-
self.invalidState()
680+
case .idle, .transientFailure:
681+
// Received a channelActive when not connecting. Can happen if channelActive and
682+
// channelInactive are reordered. Ignore.
683+
()
684+
case .active, .ready:
685+
// Received a second 'channelActive', already active so ignore.
686+
()
691687
}
692688
}
693689

@@ -700,6 +696,43 @@ internal final class ConnectionManager {
700696
])
701697

702698
switch self.state {
699+
// We can hit inactive in connecting if we see channelInactive before channelActive; that's not
700+
// common but we should tolerate it.
701+
case let .connecting(connecting):
702+
// Should we try connecting again?
703+
switch connecting.reconnect {
704+
// No, shutdown instead.
705+
case .none:
706+
self.logger.debug("shutting down connection")
707+
708+
let error = GRPCStatus(
709+
code: .unavailable,
710+
message: "The connection was dropped and connection re-establishment is disabled"
711+
)
712+
713+
let shutdownState = ShutdownState(
714+
closeFuture: self.eventLoop.makeSucceededFuture(()),
715+
reason: error
716+
)
717+
718+
self.state = .shutdown(shutdownState)
719+
// Shutting down, so fail the outstanding promises.
720+
connecting.readyChannelMuxPromise.fail(error)
721+
connecting.candidateMuxPromise.fail(error)
722+
723+
// Yes, after some time.
724+
case let .after(delay):
725+
let error = GRPCStatus(code: .unavailable, message: "Connection closed while connecting")
726+
// Fail the candidate mux promise. KEep the 'readyChannelMuxPromise' as we'll try again.
727+
connecting.candidateMuxPromise.fail(error)
728+
729+
let scheduled = self.eventLoop.scheduleTask(in: .seconds(timeInterval: delay)) {
730+
self.startConnecting()
731+
}
732+
self.logger.debug("scheduling connection attempt", metadata: ["delay_secs": "\(delay)"])
733+
self.state = .transientFailure(.init(from: connecting, scheduled: scheduled, reason: nil))
734+
}
735+
703736
// The channel is `active` but not `ready`. Should we try again?
704737
case let .active(active):
705738
switch active.reconnect {
@@ -766,14 +799,9 @@ internal final class ConnectionManager {
766799
case .shutdown:
767800
()
768801

769-
// These cases are purposefully separated: some crash reporting services provide stack traces
770-
// which don't include the precondition failure message (which contain the invalid state we were
771-
// in). Keeping the cases separate allows us work out the state from the line number.
772-
case .connecting:
773-
self.invalidState()
774-
802+
// Received 'channelInactive' twice; fine, ignore.
775803
case .transientFailure:
776-
self.invalidState()
804+
()
777805
}
778806
}
779807

@@ -793,20 +821,20 @@ internal final class ConnectionManager {
793821
case .shutdown:
794822
()
795823

796-
// These cases are purposefully separated: some crash reporting services provide stack traces
797-
// which don't include the precondition failure message (which contain the invalid state we were
798-
// in). Keeping the cases separate allows us work out the state from the line number.
799-
case .idle:
800-
self.invalidState()
801-
802-
case .transientFailure:
803-
self.invalidState()
824+
case .idle, .transientFailure:
825+
// No connection or connection attempt exists but connection was marked as ready. This is
826+
// strange. Ignore it in release mode as there's nothing to close and nowehere to fire an
827+
// error to.
828+
assertionFailure("received initial HTTP/2 SETTINGS frame in \(self.state.label) state")
804829

805830
case .connecting:
806-
self.invalidState()
831+
// No channel exists to receive initial HTTP/2 SETTINGS frame on... weird. Ignore in release
832+
// mode.
833+
assertionFailure("received initial HTTP/2 SETTINGS frame in \(self.state.label) state")
807834

808835
case .ready:
809-
self.invalidState()
836+
// Already received initial HTTP/2 SETTINGS frame; ignore in release mode.
837+
assertionFailure("received initial HTTP/2 SETTINGS frame in \(self.state.label) state")
810838
}
811839
}
812840

@@ -834,17 +862,14 @@ internal final class ConnectionManager {
834862
// 'channelInactive()'.
835863
()
836864

837-
// These cases are purposefully separated: some crash reporting services provide stack traces
838-
// which don't include the precondition failure message (which contain the invalid state we were
839-
// in). Keeping the cases separate allows us work out the state from the line number.
840-
case .idle:
841-
self.invalidState()
865+
case .idle, .transientFailure:
866+
// There's no connection to idle; ignore.
867+
()
842868

843869
case .connecting:
844-
self.invalidState()
845-
846-
case .transientFailure:
847-
self.invalidState()
870+
// The idle watchdog is started when the connection is active, this shouldn't happen
871+
// in the connecting state. Ignore it in release mode.
872+
assertionFailure("tried to idle a connection in the \(self.state.label) state")
848873
}
849874
}
850875

@@ -908,22 +933,10 @@ extension ConnectionManager {
908933
case .shutdown:
909934
()
910935

911-
// We can't fail to connect if we aren't trying.
912-
//
913-
// These cases are purposefully separated: some crash reporting services provide stack traces
914-
// which don't include the precondition failure message (which contain the invalid state we were
915-
// in). Keeping the cases separate allows us work out the state from the line number.
916-
case .idle:
917-
self.invalidState()
918-
919-
case .active:
920-
self.invalidState()
921-
922-
case .ready:
923-
self.invalidState()
924-
925-
case .transientFailure:
926-
self.invalidState()
936+
// Connection attempt failed, but no connection attempt is in progress.
937+
case .idle, .active, .ready, .transientFailure:
938+
// Nothing we can do other than ignore in release mode.
939+
assertionFailure("connect promise failed in \(self.state.label) state")
927940
}
928941
}
929942
}
@@ -951,17 +964,14 @@ extension ConnectionManager {
951964
case .shutdown:
952965
()
953966

954-
// These cases are purposefully separated: some crash reporting services provide stack traces
955-
// which don't include the precondition failure message (which contain the invalid state we were
956-
// in). Keeping the cases separate allows us work out the state from the line number.
967+
// We only call startConnecting() if the connection does not exist and after checking what the
968+
// current state is, so none of these states should be reachable.
957969
case .connecting:
958-
self.invalidState()
959-
970+
self.unreachableState()
960971
case .active:
961-
self.invalidState()
962-
972+
self.unreachableState()
963973
case .ready:
964-
self.invalidState()
974+
self.unreachableState()
965975
}
966976
}
967977

@@ -1066,11 +1076,11 @@ extension ConnectionManager {
10661076
}
10671077

10681078
extension ConnectionManager {
1069-
private func invalidState(
1079+
private func unreachableState(
10701080
function: StaticString = #function,
10711081
file: StaticString = #fileID,
10721082
line: UInt = #line
10731083
) -> Never {
1074-
preconditionFailure("Invalid state \(self.state) for \(function)", file: file, line: line)
1084+
fatalError("Invalid state \(self.state) for \(function)", file: file, line: line)
10751085
}
10761086
}

Tests/GRPCTests/ConnectionManagerTests.swift

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,119 @@ extension ConnectionManagerTests {
239239
}
240240
}
241241

242+
func testChannelInactiveBeforeActiveWithNoReconnect() throws {
243+
let channel = EmbeddedChannel(loop: self.loop)
244+
let channelPromise = self.loop.makePromise(of: Channel.self)
245+
246+
let manager = self.makeConnectionManager { _, _ in
247+
return channelPromise.futureResult
248+
}
249+
250+
// Start the connection.
251+
self.waitForStateChange(from: .idle, to: .connecting) {
252+
// Triggers the connect.
253+
_ = manager.getHTTP2Multiplexer()
254+
self.loop.run()
255+
}
256+
257+
try channel.pipeline.syncOperations.addHandler(
258+
GRPCIdleHandler(
259+
connectionManager: manager,
260+
multiplexer: HTTP2StreamMultiplexer(
261+
mode: .client,
262+
channel: channel,
263+
inboundStreamInitializer: nil
264+
),
265+
idleTimeout: .minutes(5),
266+
keepalive: .init(),
267+
logger: self.logger
268+
)
269+
)
270+
channelPromise.succeed(channel)
271+
// Oops: wrong way around. We should tolerate this.
272+
self.waitForStateChange(from: .connecting, to: .shutdown) {
273+
channel.pipeline.fireChannelInactive()
274+
}
275+
276+
// Should be ignored.
277+
channel.pipeline.fireChannelActive()
278+
}
279+
280+
func testChannelInactiveBeforeActiveWillReconnect() throws {
281+
var channels = [EmbeddedChannel(loop: self.loop), EmbeddedChannel(loop: self.loop)]
282+
var channelPromises: [EventLoopPromise<Channel>] = [self.loop.makePromise(),
283+
self.loop.makePromise()]
284+
var channelFutures = Array(channelPromises.map { $0.futureResult })
285+
286+
var configuration = self.defaultConfiguration
287+
configuration.connectionBackoff = .oneSecondFixed
288+
289+
let manager = self.makeConnectionManager(configuration: configuration) { _, _ in
290+
return channelFutures.removeLast()
291+
}
292+
293+
// Start the connection.
294+
self.waitForStateChange(from: .idle, to: .connecting) {
295+
// Triggers the connect.
296+
_ = manager.getHTTP2Multiplexer()
297+
self.loop.run()
298+
}
299+
300+
// Setup the channel.
301+
let channel1 = channels.removeLast()
302+
let channel1Promise = channelPromises.removeLast()
303+
304+
try channel1.pipeline.syncOperations.addHandler(
305+
GRPCIdleHandler(
306+
connectionManager: manager,
307+
multiplexer: HTTP2StreamMultiplexer(
308+
mode: .client,
309+
channel: channel1,
310+
inboundStreamInitializer: nil
311+
),
312+
idleTimeout: .minutes(5),
313+
keepalive: .init(),
314+
logger: self.logger
315+
)
316+
)
317+
channel1Promise.succeed(channel1)
318+
// Oops: wrong way around. We should tolerate this.
319+
self.waitForStateChange(from: .connecting, to: .transientFailure) {
320+
channel1.pipeline.fireChannelInactive()
321+
}
322+
323+
channel1.pipeline.fireChannelActive()
324+
325+
// Start the next attempt.
326+
self.waitForStateChange(from: .transientFailure, to: .connecting) {
327+
self.loop.advanceTime(by: .seconds(1))
328+
}
329+
330+
let channel2 = channels.removeLast()
331+
let channel2Promise = channelPromises.removeLast()
332+
try channel2.pipeline.syncOperations.addHandler(
333+
GRPCIdleHandler(
334+
connectionManager: manager,
335+
multiplexer: HTTP2StreamMultiplexer(
336+
mode: .client,
337+
channel: channel1,
338+
inboundStreamInitializer: nil
339+
),
340+
idleTimeout: .minutes(5),
341+
keepalive: .init(),
342+
logger: self.logger
343+
)
344+
)
345+
346+
channel2Promise.succeed(channel2)
347+
348+
try self.waitForStateChange(from: .connecting, to: .ready) {
349+
channel2.pipeline.fireChannelActive()
350+
let frame = HTTP2Frame(streamID: .rootStream, payload: .settings(.settings([])))
351+
XCTAssertNoThrow(try channel2.writeInbound(frame))
352+
}
353+
}
354+
242355
func testIdleTimeoutWhenThereAreActiveStreams() throws {
243356
let channelPromise = self.loop.makePromise(of: Channel.self)
244357
let manager = self.makeConnectionManager { _, _ in

0 commit comments

Comments
 (0)