Skip to content

Commit a6fcb7e

Browse files
rls: only reset backoff on recovery from TRANSIENT_FAILURE
Fix control channel connectivity monitoring to track TRANSIENT_FAILURE state explicitly. Only reset backoff timers when transitioning from TRANSIENT_FAILURE to READY, not for benign state changes like READY → IDLE → READY. Fixes #8693
1 parent 50c6321 commit a6fcb7e

File tree

2 files changed

+112
-2
lines changed

2 files changed

+112
-2
lines changed

balancer/rls/control_channel.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ func (cc *controlChannel) monitorConnectivityState() {
187187
cc.connectivityStateCh.Load()
188188
cc.logger.Infof("Connectivity state is READY")
189189

190+
// Track whether we've seen TRANSIENT_FAILURE since the last READY state.
191+
// We only want to reset backoff when recovering from an actual failure,
192+
// not when transitioning through benign states like IDLE.
193+
seenTransientFailure := false
194+
190195
for {
191196
s, ok := <-cc.connectivityStateCh.Get()
192197
if !ok {
@@ -197,9 +202,21 @@ func (cc *controlChannel) monitorConnectivityState() {
197202
if s == connectivity.Shutdown {
198203
return
199204
}
205+
206+
// Track if we've entered TRANSIENT_FAILURE state
207+
if s == connectivity.TransientFailure {
208+
seenTransientFailure = true
209+
}
210+
211+
// Only reset backoff if we're returning to READY after a failure
200212
if s == connectivity.Ready {
201-
cc.logger.Infof("Control channel back to READY")
202-
cc.backToReadyFunc()
213+
if seenTransientFailure {
214+
cc.logger.Infof("Control channel back to READY after TRANSIENT_FAILURE")
215+
cc.backToReadyFunc()
216+
seenTransientFailure = false
217+
} else {
218+
cc.logger.Infof("Control channel back to READY (no prior failure)")
219+
}
203220
}
204221

205222
cc.logger.Infof("Connectivity state is %s", s)

balancer/rls/control_channel_test.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,15 @@ import (
2626
"fmt"
2727
"os"
2828
"regexp"
29+
"sync"
2930
"testing"
3031
"time"
3132

3233
"github.com/google/go-cmp/cmp"
3334
"google.golang.org/grpc"
3435
"google.golang.org/grpc/balancer"
3536
"google.golang.org/grpc/codes"
37+
"google.golang.org/grpc/connectivity"
3638
"google.golang.org/grpc/credentials"
3739
"google.golang.org/grpc/internal"
3840
rlspb "google.golang.org/grpc/internal/proto/grpc_lookup_v1"
@@ -463,3 +465,94 @@ func (s) TestNewControlChannelUnsupportedCredsBundle(t *testing.T) {
463465
t.Fatal("newControlChannel succeeded when expected to fail")
464466
}
465467
}
468+
469+
// TestControlChannelConnectivityStateTransitions verifies that the control
470+
// channel only resets backoff when recovering from TRANSIENT_FAILURE, not
471+
// when going through benign state changes like READY → IDLE → READY.
472+
func (s) TestControlChannelConnectivityStateTransitions(t *testing.T) {
473+
tests := []struct {
474+
name string
475+
states []connectivity.State
476+
wantCallbackCount int
477+
}{
478+
{
479+
name: "READY → TRANSIENT_FAILURE → READY triggers callback",
480+
states: []connectivity.State{
481+
connectivity.TransientFailure,
482+
connectivity.Ready,
483+
},
484+
wantCallbackCount: 1,
485+
},
486+
{
487+
name: "READY → IDLE → READY does not trigger callback",
488+
states: []connectivity.State{
489+
connectivity.Idle,
490+
connectivity.Ready,
491+
},
492+
wantCallbackCount: 0,
493+
},
494+
{
495+
name: "Multiple failures trigger callback each time",
496+
states: []connectivity.State{
497+
connectivity.TransientFailure,
498+
connectivity.Ready,
499+
connectivity.TransientFailure,
500+
connectivity.Ready,
501+
},
502+
wantCallbackCount: 2,
503+
},
504+
{
505+
name: "IDLE between failures doesn't affect callback",
506+
states: []connectivity.State{
507+
connectivity.TransientFailure,
508+
connectivity.Idle,
509+
connectivity.Ready,
510+
},
511+
wantCallbackCount: 1,
512+
},
513+
}
514+
515+
for _, tt := range tests {
516+
t.Run(tt.name, func(t *testing.T) {
517+
// Start an RLS server
518+
rlsServer, _ := rlstest.SetupFakeRLSServer(t, nil)
519+
520+
// Setup callback to count invocations
521+
callbackCount := 0
522+
var mu sync.Mutex
523+
callback := func() {
524+
mu.Lock()
525+
callbackCount++
526+
mu.Unlock()
527+
}
528+
529+
// Create control channel
530+
ctrlCh, err := newControlChannel(rlsServer.Address, "", defaultTestTimeout, balancer.BuildOptions{}, callback)
531+
if err != nil {
532+
t.Fatalf("Failed to create control channel: %v", err)
533+
}
534+
defer ctrlCh.close()
535+
536+
// Give the channel time to reach initial READY state
537+
time.Sleep(100 * time.Millisecond)
538+
539+
// Inject the test state sequence
540+
for _, state := range tt.states {
541+
ctrlCh.OnMessage(state)
542+
// Give time for the monitoring goroutine to process the state
543+
time.Sleep(50 * time.Millisecond)
544+
}
545+
546+
// Give extra time for any pending callbacks
547+
time.Sleep(100 * time.Millisecond)
548+
549+
mu.Lock()
550+
gotCallbackCount := callbackCount
551+
mu.Unlock()
552+
553+
if gotCallbackCount != tt.wantCallbackCount {
554+
t.Errorf("Got %d callback invocations, want %d", gotCallbackCount, tt.wantCallbackCount)
555+
}
556+
})
557+
}
558+
}

0 commit comments

Comments
 (0)