Skip to content

Commit 20829ca

Browse files
committed
Surface transient connection errors
1 parent f11c322 commit 20829ca

File tree

6 files changed

+57
-17
lines changed

6 files changed

+57
-17
lines changed

Sources/GRPCNIOTransportCore/Client/Connection/Connection.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ package final class Connection: Sendable {
6868
/// Closed because the remote peer initiated shutdown (i.e. sent a GOAWAY frame).
6969
case remote
7070
/// Closed because the connection encountered an unexpected error.
71-
case error(any Error, wasIdle: Bool)
71+
case error(RPCError, wasIdle: Bool)
7272
}
7373

7474
/// Inputs to the 'run' method.

Sources/GRPCNIOTransportCore/Client/Connection/ConnectivityState.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
* limitations under the License.
1515
*/
1616

17+
package import GRPCCore
18+
1719
package enum ConnectivityState: Sendable, Hashable {
1820
/// This channel isn't trying to create a connection because of a lack of new or pending RPCs.
1921
///
@@ -34,7 +36,7 @@ package enum ConnectivityState: Sendable, Hashable {
3436
/// establish a connection again. Since retries are done with exponential backoff, channels that
3537
/// fail to connect will start out spending very little time in this state but as the attempts
3638
/// fail repeatedly, the channel will spend increasingly large amounts of time in this state.
37-
case transientFailure
39+
case transientFailure(cause: RPCError)
3840

3941
/// This channel has started shutting down. Any new RPCs should fail immediately. Pending RPCs
4042
/// may continue running until the application cancels them. Channels may enter this state either

Sources/GRPCNIOTransportCore/Client/Connection/GRPCChannel.swift

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -758,7 +758,23 @@ extension GRPCChannel.StateMachine {
758758
result: .success(state.current)
759759
)
760760

761-
case .transientFailure, .shutdown: // shutdown includes shutting down
761+
case .transientFailure(let cause):
762+
// Current load-balancer failed. Remove all the 'fast-failing' continuations in the
763+
// queue; these are RPCs which set the 'wait for ready' option to false. The rest of
764+
// the entries in the queue will wait for a load-balancer to become ready.
765+
let continuations = state.queue.removeFastFailingEntries()
766+
actions.resumeContinuations = ConnectivityStateChangeActions.ResumableContinuations(
767+
continuations: continuations,
768+
result: .failure(
769+
RPCError(
770+
code: .unavailable,
771+
message: "channel isn't ready",
772+
cause: cause
773+
)
774+
)
775+
)
776+
777+
case .shutdown: // shutdown includes shutting down
762778
// Current load-balancer failed. Remove all the 'fast-failing' continuations in the
763779
// queue; these are RPCs which set the 'wait for ready' option to false. The rest of
764780
// the entries in the queue will wait for a load-balancer to become ready.

Sources/GRPCNIOTransportCore/Client/Connection/LoadBalancers/RoundRobinLoadBalancer.swift

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,7 @@ extension RoundRobinLoadBalancer {
530530
// The transition from transient failure to connecting is ignored.
531531
//
532532
// See: https://github.com/grpc/grpc/blob/master/doc/load-balancing.md
533-
if self.state == .transientFailure, newState == .connecting {
533+
if case .transientFailure = self.state, newState == .connecting {
534534
return false
535535
}
536536

@@ -750,12 +750,16 @@ extension ConnectivityState {
750750
return .idle
751751
}
752752

753-
// Otherwise, if all subchannels are in state TRANSIENT_FAILURE, the channel's state
754-
// is TRANSIENT_FAILURE.
755-
if states.allSatisfy({ $0 == .transientFailure }) {
756-
return .transientFailure
753+
// Otherwise, if all subchannels are in state TRANSIENT_FAILURE, the channel's state is TRANSIENT_FAILURE.
754+
for state in states {
755+
guard case .transientFailure = state else {
756+
return .shutdown
757+
}
757758
}
758759

759-
return .shutdown
760+
return .transientFailure(cause: RPCError(
761+
code: .internalError,
762+
message: "All subchannels are in TRANSIENT_FAILURE state."
763+
))
760764
}
761765
}

Sources/GRPCNIOTransportCore/Client/Connection/LoadBalancers/Subchannel.swift

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,14 @@ extension Subchannel {
291291

292292
case .backoff(let duration):
293293
// All addresses have been tried, backoff for some time.
294-
self.event.continuation.yield(.connectivityStateChanged(.transientFailure))
294+
self.event.continuation.yield(
295+
.connectivityStateChanged(
296+
.transientFailure(cause: RPCError(
297+
code: .unavailable,
298+
message: "All addresses have been tried: backing off."
299+
))
300+
)
301+
)
295302
group.addTask {
296303
do {
297304
try await Task.sleep(for: duration)
@@ -334,9 +341,9 @@ extension Subchannel {
334341
case .emitIdle:
335342
self.event.continuation.yield(.connectivityStateChanged(.idle))
336343

337-
case .emitTransientFailureAndReconnect:
344+
case .emitTransientFailureAndReconnect(let cause):
338345
// Unclean closes trigger a transient failure state change and a name resolution.
339-
self.event.continuation.yield(.connectivityStateChanged(.transientFailure))
346+
self.event.continuation.yield(.connectivityStateChanged(.transientFailure(cause: cause)))
340347
self.event.continuation.yield(.requiresNameResolution)
341348
// Attempt to reconnect.
342349
self.handleConnectInput(in: &group)
@@ -632,7 +639,7 @@ extension Subchannel {
632639
enum OnClosed {
633640
case nothing
634641
case emitIdle
635-
case emitTransientFailureAndReconnect
642+
case emitTransientFailureAndReconnect(cause: RPCError)
636643
case finish(emitShutdown: Bool)
637644
}
638645

@@ -646,9 +653,15 @@ extension Subchannel {
646653
self = .notConnected(NotConnected(from: state))
647654
onClosed = .emitIdle
648655

649-
case .keepaliveTimeout, .error(_, wasIdle: false):
656+
case .keepaliveTimeout:
657+
self = .notConnected(NotConnected(from: state))
658+
onClosed = .emitTransientFailureAndReconnect(
659+
cause: RPCError(code: .unavailable, message: "The keepalive timed out.")
660+
)
661+
662+
case .error(let error, wasIdle: false):
650663
self = .notConnected(NotConnected(from: state))
651-
onClosed = .emitTransientFailureAndReconnect
664+
onClosed = .emitTransientFailureAndReconnect(cause: error)
652665

653666
case .initiatedLocally:
654667
// Should be in the 'shuttingDown' state.

Tests/GRPCNIOTransportCoreTests/Client/Connection/LoadBalancers/SubchannelTests.swift

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,10 @@ final class SubchannelTests: XCTestCase {
161161
[
162162
.connectivityStateChanged(.idle),
163163
.connectivityStateChanged(.connecting),
164-
.connectivityStateChanged(.transientFailure),
164+
.connectivityStateChanged(.transientFailure(cause: RPCError(
165+
code: .unavailable,
166+
message: "All addresses have been tried: backing off."
167+
))),
165168
.connectivityStateChanged(.connecting),
166169
]
167170
)
@@ -440,7 +443,9 @@ final class SubchannelTests: XCTestCase {
440443
.connectivityStateChanged(.idle),
441444
.connectivityStateChanged(.connecting),
442445
.connectivityStateChanged(.ready),
443-
.connectivityStateChanged(.transientFailure),
446+
.connectivityStateChanged(.transientFailure(cause: RPCError(
447+
code: .unavailable, message: "The TCP connection was dropped unexpectedly."
448+
))),
444449
.requiresNameResolution,
445450
.connectivityStateChanged(.connecting),
446451
.connectivityStateChanged(.ready),

0 commit comments

Comments
 (0)