Skip to content

Commit e2406a9

Browse files
rls: only reset backoff on recovery from TRANSIENT_FAILURE
Fix control channel connectivity monitoring to track TRANSIENT_FAILURE state explicitly. Only reset backoff timers when transitioning from TRANSIENT_FAILURE to READY, not for benign state changes like READY → IDLE → READY. Fixes #8693
1 parent 50c6321 commit e2406a9

File tree

2 files changed

+111
-2
lines changed

2 files changed

+111
-2
lines changed

balancer/rls/control_channel.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ func (cc *controlChannel) monitorConnectivityState() {
187187
cc.connectivityStateCh.Load()
188188
cc.logger.Infof("Connectivity state is READY")
189189

190+
// Track whether we've seen TRANSIENT_FAILURE since the last READY state.
191+
// We only want to reset backoff when recovering from an actual failure,
192+
// not when transitioning through benign states like IDLE.
193+
seenTransientFailure := false
194+
190195
for {
191196
s, ok := <-cc.connectivityStateCh.Get()
192197
if !ok {
@@ -197,9 +202,21 @@ func (cc *controlChannel) monitorConnectivityState() {
197202
if s == connectivity.Shutdown {
198203
return
199204
}
205+
206+
// Track if we've entered TRANSIENT_FAILURE state
207+
if s == connectivity.TransientFailure {
208+
seenTransientFailure = true
209+
}
210+
211+
// Only reset backoff if we're returning to READY after a failure
200212
if s == connectivity.Ready {
201-
cc.logger.Infof("Control channel back to READY")
202-
cc.backToReadyFunc()
213+
if seenTransientFailure {
214+
cc.logger.Infof("Control channel back to READY after TRANSIENT_FAILURE")
215+
cc.backToReadyFunc()
216+
seenTransientFailure = false
217+
} else {
218+
cc.logger.Infof("Control channel back to READY (no prior failure)")
219+
}
203220
}
204221

205222
cc.logger.Infof("Connectivity state is %s", s)

balancer/rls/control_channel_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,15 @@ import (
2626
"fmt"
2727
"os"
2828
"regexp"
29+
"sync"
2930
"testing"
3031
"time"
3132

3233
"github.com/google/go-cmp/cmp"
3334
"google.golang.org/grpc"
3435
"google.golang.org/grpc/balancer"
3536
"google.golang.org/grpc/codes"
37+
"google.golang.org/grpc/connectivity"
3638
"google.golang.org/grpc/credentials"
3739
"google.golang.org/grpc/internal"
3840
rlspb "google.golang.org/grpc/internal/proto/grpc_lookup_v1"
@@ -463,3 +465,93 @@ func (s) TestNewControlChannelUnsupportedCredsBundle(t *testing.T) {
463465
t.Fatal("newControlChannel succeeded when expected to fail")
464466
}
465467
}
468+
// TestControlChannelConnectivityStateTransitions verifies that the control
469+
// channel only resets backoff when recovering from TRANSIENT_FAILURE, not
470+
// when going through benign state changes like READY → IDLE → READY.
471+
func (s) TestControlChannelConnectivityStateTransitions(t *testing.T) {
472+
tests := []struct {
473+
name string
474+
states []connectivity.State
475+
wantCallbackCount int
476+
}{
477+
{
478+
name: "READY → TRANSIENT_FAILURE → READY triggers callback",
479+
states: []connectivity.State{
480+
connectivity.TransientFailure,
481+
connectivity.Ready,
482+
},
483+
wantCallbackCount: 1,
484+
},
485+
{
486+
name: "READY → IDLE → READY does not trigger callback",
487+
states: []connectivity.State{
488+
connectivity.Idle,
489+
connectivity.Ready,
490+
},
491+
wantCallbackCount: 0,
492+
},
493+
{
494+
name: "Multiple failures trigger callback each time",
495+
states: []connectivity.State{
496+
connectivity.TransientFailure,
497+
connectivity.Ready,
498+
connectivity.TransientFailure,
499+
connectivity.Ready,
500+
},
501+
wantCallbackCount: 2,
502+
},
503+
{
504+
name: "IDLE between failures doesn't affect callback",
505+
states: []connectivity.State{
506+
connectivity.TransientFailure,
507+
connectivity.Idle,
508+
connectivity.Ready,
509+
},
510+
wantCallbackCount: 1,
511+
},
512+
}
513+
514+
for _, tt := range tests {
515+
t.Run(tt.name, func(t *testing.T) {
516+
// Start an RLS server
517+
rlsServer, _ := rlstest.SetupFakeRLSServer(t, nil)
518+
519+
// Setup callback to count invocations
520+
callbackCount := 0
521+
var mu sync.Mutex
522+
callback := func() {
523+
mu.Lock()
524+
callbackCount++
525+
mu.Unlock()
526+
}
527+
528+
// Create control channel
529+
ctrlCh, err := newControlChannel(rlsServer.Address, "", defaultTestTimeout, balancer.BuildOptions{}, callback)
530+
if err != nil {
531+
t.Fatalf("Failed to create control channel: %v", err)
532+
}
533+
defer ctrlCh.close()
534+
535+
// Give the channel time to reach initial READY state
536+
time.Sleep(100 * time.Millisecond)
537+
538+
// Inject the test state sequence
539+
for _, state := range tt.states {
540+
ctrlCh.OnMessage(state)
541+
// Give time for the monitoring goroutine to process the state
542+
time.Sleep(50 * time.Millisecond)
543+
}
544+
545+
// Give extra time for any pending callbacks
546+
time.Sleep(100 * time.Millisecond)
547+
548+
mu.Lock()
549+
gotCallbackCount := callbackCount
550+
mu.Unlock()
551+
552+
if gotCallbackCount != tt.wantCallbackCount {
553+
t.Errorf("Got %d callback invocations, want %d", gotCallbackCount, tt.wantCallbackCount)
554+
}
555+
})
556+
}
557+
}

0 commit comments

Comments
 (0)