Skip to content

Commit 51b15ad

Browse files
authored
pickfirstleaf: Avoid getting stuck in IDLE on connection breakage (#8615)
Related issue: b/415354418 ## Problem On connection breakage, the pickfirst leaf balancer enters idle and returns an `Idle picker` that calls the balancer's `ExitIdle` method only the first time `Pick` is called. The following sequence of events will cause the balancer to get stuck in `Idle` state: 1. Existing connection breaks, SubConn [requests re-resolution and reports IDLE](https://github.com/grpc/grpc-go/blob/bb71072094cf533965450c44890f8f51c671c393/clientconn.go#L1388-L1393). In turn PF updates the ClientConn state to IDLE with an `Idle picker`. 1. An RPC is made, triggering `balancer.ExitIdle` through the idle picker. The balancer attempts to re-connect the failed SubConn. 1. The resolver produces a new endpoint list, removing the endpoint used by the existing SubConn. PF removes the existing SubConn. Since the balancer didn't update the ClientConn state to CONNECTING yet, pickfirst thinks that it's still in IDLE and doesn't start connecting to the new endpoints. 1. New RPC requests trigger the idle picker, but it's a no-op since it only [triggers the balancer's ExitIdle method once](https://github.com/grpc/grpc-go/blob/bb71072094cf533965450c44890f8f51c671c393/balancer/pickfirst/pickfirstleaf/pickfirstleaf.go#L663https://github.com/grpc/grpc-go/blob/bb71072094cf533965450c44890f8f51c671c393/balancer/pickfirst/pickfirstleaf/pickfirstleaf.go#L663). ## Fix This change moves the ClientConn into Connecting immediately when the `ExitIdle` method is called. This ensures that the balancer continues to re-connect when a new endpoint list is produced by the resolver. RELEASE NOTES: * balancer/pickfirst: Fix bug that can cause balancer to get stuck in `IDLE` state on connection failure.
1 parent 2780563 commit 51b15ad

File tree

2 files changed

+104
-1
lines changed

2 files changed

+104
-1
lines changed

balancer/pickfirst/pickfirstleaf/pickfirstleaf.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,13 @@ func (b *pickfirstBalancer) ExitIdle() {
351351
b.mu.Lock()
352352
defer b.mu.Unlock()
353353
if b.state == connectivity.Idle {
354+
// Move the balancer into CONNECTING state immediately. This is done to
355+
// avoid staying in IDLE if a resolver update arrives before the first
356+
// SubConn reports CONNECTING.
357+
b.updateBalancerState(balancer.State{
358+
ConnectivityState: connectivity.Connecting,
359+
Picker: &picker{err: balancer.ErrNoSubConnAvailable},
360+
})
354361
b.startFirstPassLocked()
355362
}
356363
}
@@ -604,7 +611,7 @@ func (b *pickfirstBalancer) updateSubConnState(sd *scData, newState balancer.Sub
604611
if !b.addressList.seekTo(sd.addr) {
605612
// This should not fail as we should have only one SubConn after
606613
// entering READY. The SubConn should be present in the addressList.
607-
b.logger.Errorf("Address %q not found address list in %v", sd.addr, b.addressList.addresses)
614+
b.logger.Errorf("Address %q not found address list in %v", sd.addr, b.addressList.addresses)
608615
return
609616
}
610617
if !b.healthCheckingEnabled {

balancer/pickfirst/pickfirstleaf/pickfirstleaf_ext_test.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1504,6 +1504,102 @@ func (s) TestPickFirstLeaf_AddressUpdateWithMetadata(t *testing.T) {
15041504
}
15051505
}
15061506

1507+
// Tests the scenario where a connection is established and then breaks, leading
1508+
// to a reconnection attempt. While the reconnection is in progress, a resolver
1509+
// update with a new address is received. The test verifies that the balancer
1510+
// creates a new SubConn for the new address and that the ClientConn eventually
1511+
// becomes READY.
1512+
func (s) TestPickFirstLeaf_Reconnection(t *testing.T) {
1513+
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
1514+
defer cancel()
1515+
cc := testutils.NewBalancerClientConn(t)
1516+
bal := balancer.Get(pickfirstleaf.Name).Build(cc, balancer.BuildOptions{})
1517+
defer bal.Close()
1518+
ccState := balancer.ClientConnState{
1519+
ResolverState: resolver.State{
1520+
Endpoints: []resolver.Endpoint{
1521+
{Addresses: []resolver.Address{{Addr: "1.1.1.1:1"}}},
1522+
},
1523+
},
1524+
}
1525+
if err := bal.UpdateClientConnState(ccState); err != nil {
1526+
t.Fatalf("UpdateClientConnState(%v) returned error: %v", ccState, err)
1527+
}
1528+
1529+
select {
1530+
case state := <-cc.NewStateCh:
1531+
if got, want := state, connectivity.Connecting; got != want {
1532+
t.Fatalf("Received unexpected ClientConn sate: got %v, want %v", got, want)
1533+
}
1534+
case <-ctx.Done():
1535+
t.Fatal("Context timed out waiting for ClientConn state update.")
1536+
}
1537+
1538+
sc1 := <-cc.NewSubConnCh
1539+
select {
1540+
case <-sc1.ConnectCh:
1541+
case <-ctx.Done():
1542+
t.Fatal("Context timed out waiting for Connect() to be called on sc1.")
1543+
}
1544+
sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
1545+
sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
1546+
1547+
if err := cc.WaitForConnectivityState(ctx, connectivity.Ready); err != nil {
1548+
t.Fatalf("Context timed out waiting for ClientConn to become READY.")
1549+
}
1550+
1551+
// Simulate a connection breakage, this should result the channel
1552+
// transitioning to IDLE.
1553+
sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Idle})
1554+
if err := cc.WaitForConnectivityState(ctx, connectivity.Idle); err != nil {
1555+
t.Fatalf("Context timed out waiting for ClientConn to enter IDLE.")
1556+
}
1557+
1558+
// Calling the idle picker should result in the SubConn being re-connected.
1559+
picker := <-cc.NewPickerCh
1560+
if _, err := picker.Pick(balancer.PickInfo{}); err != balancer.ErrNoSubConnAvailable {
1561+
t.Fatalf("picker.Pick() returned error: %v, want %v", err, balancer.ErrNoSubConnAvailable)
1562+
}
1563+
1564+
select {
1565+
case <-sc1.ConnectCh:
1566+
case <-ctx.Done():
1567+
t.Fatal("Context timed out waiting for Connect() to be called on sc1.")
1568+
}
1569+
1570+
// Send a resolver update, removing the existing SubConn. Since the balancer
1571+
// is connecting, it should create a new SubConn for the new backend
1572+
// address.
1573+
ccState = balancer.ClientConnState{
1574+
ResolverState: resolver.State{
1575+
Endpoints: []resolver.Endpoint{
1576+
{Addresses: []resolver.Address{{Addr: "2.2.2.2:2"}}},
1577+
},
1578+
},
1579+
}
1580+
if err := bal.UpdateClientConnState(ccState); err != nil {
1581+
t.Fatalf("UpdateClientConnState(%v) returned error: %v", ccState, err)
1582+
}
1583+
1584+
var sc2 *testutils.TestSubConn
1585+
select {
1586+
case sc2 = <-cc.NewSubConnCh:
1587+
case <-ctx.Done():
1588+
t.Fatal("Context timed out waiting for new SubConn to be created.")
1589+
}
1590+
1591+
select {
1592+
case <-sc2.ConnectCh:
1593+
case <-ctx.Done():
1594+
t.Fatal("Context timed out waiting for Connect() to be called on sc2.")
1595+
}
1596+
sc2.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
1597+
sc2.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
1598+
if err := cc.WaitForConnectivityState(ctx, connectivity.Ready); err != nil {
1599+
t.Fatalf("Context timed out waiting for ClientConn to become READY.")
1600+
}
1601+
}
1602+
15071603
// healthListenerCapturingCCWrapper is used to capture the health listener so
15081604
// that health updates can be mocked for testing.
15091605
type healthListenerCapturingCCWrapper struct {

0 commit comments

Comments
 (0)