grpc · ulascansenturk · Nov 20, 2025 · Nov 21, 2025 · Nov 21, 2025 · Nov 21, 2025
diff --git a/balancer/rls/control_channel.go b/balancer/rls/control_channel.go
@@ -63,18 +63,22 @@ type controlChannel struct {
 	connectivityStateCh *buffer.Unbounded
 	unsubscribe         func()
 	monitorDoneCh       chan struct{}
+	// testOnlyInitialReadyDone is closed when the monitoring goroutine
+	// processes the initial READY state. Only used in tests.
+	testOnlyInitialReadyDone chan struct{}
 }
 
 // newControlChannel creates a controlChannel to rlsServerName and uses
 // serviceConfig, if non-empty, as the default service config for the underlying
 // gRPC channel.
 func newControlChannel(rlsServerName, serviceConfig string, rpcTimeout time.Duration, bOpts balancer.BuildOptions, backToReadyFunc func()) (*controlChannel, error) {
 	ctrlCh := &controlChannel{
-		rpcTimeout:          rpcTimeout,
-		backToReadyFunc:     backToReadyFunc,
-		throttler:           newAdaptiveThrottler(),
-		connectivityStateCh: buffer.NewUnbounded(),
-		monitorDoneCh:       make(chan struct{}),
+		rpcTimeout:               rpcTimeout,
+		backToReadyFunc:          backToReadyFunc,
+		throttler:                newAdaptiveThrottler(),
+		connectivityStateCh:      buffer.NewUnbounded(),
+		monitorDoneCh:            make(chan struct{}),
+		testOnlyInitialReadyDone: make(chan struct{}),
 	}
 	ctrlCh.logger = internalgrpclog.NewPrefixLogger(logger, fmt.Sprintf("[rls-control-channel %p] ", ctrlCh))
 
@@ -187,6 +191,14 @@ func (cc *controlChannel) monitorConnectivityState() {
 	cc.connectivityStateCh.Load()
 	cc.logger.Infof("Connectivity state is READY")
 
+	// Signal tests that initial READY has been processed
+	close(cc.testOnlyInitialReadyDone)
+
+	// Track whether we've seen TRANSIENT_FAILURE since the last READY state.
+	// We only want to reset backoff when recovering from an actual failure,
+	// not when transitioning through benign states like IDLE.
+	seenTransientFailure := false
+
 	for {
 		s, ok := <-cc.connectivityStateCh.Get()
 		if !ok {
@@ -197,9 +209,27 @@ func (cc *controlChannel) monitorConnectivityState() {
 		if s == connectivity.Shutdown {
 			return
 		}
+
+		// Track if we've entered TRANSIENT_FAILURE state
+		if s == connectivity.TransientFailure {
+			seenTransientFailure = true
+		}
+
+		// Only reset backoff when transitioning from TRANSIENT_FAILURE to READY.
+		// This indicates the RLS server has recovered from being unreachable, so
+		// we reset backoff state in all cache entries to allow pending RPCs to
+		// proceed immediately. We skip benign transitions like READY → IDLE → READY
+		// since those don't represent actual failures.
 		if s == connectivity.Ready {
-			cc.logger.Infof("Control channel back to READY")
-			cc.backToReadyFunc()
+			if seenTransientFailure {
+				cc.logger.Infof("Control channel back to READY after TRANSIENT_FAILURE")
+				if cc.backToReadyFunc != nil {
+					cc.backToReadyFunc()
+				}
+				seenTransientFailure = false
+			} else {
+				cc.logger.Infof("Control channel back to READY (no prior failure)")
+			}
 		}
 
 		cc.logger.Infof("Connectivity state is %s", s)

diff --git a/balancer/rls/control_channel_test.go b/balancer/rls/control_channel_test.go
@@ -26,13 +26,15 @@ import (
 	"fmt"
 	"os"
 	"regexp"
+	"sync"
 	"testing"
 	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/balancer"
 	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/connectivity"
 	"google.golang.org/grpc/credentials"
 	"google.golang.org/grpc/internal"
 	rlspb "google.golang.org/grpc/internal/proto/grpc_lookup_v1"
@@ -463,3 +465,129 @@ func (s) TestNewControlChannelUnsupportedCredsBundle(t *testing.T) {
 		t.Fatal("newControlChannel succeeded when expected to fail")
 	}
 }
+
+// TestControlChannelConnectivityStateTransitions verifies that the control
+// channel only resets backoff when recovering from TRANSIENT_FAILURE, not
+// when going through benign state changes like READY → IDLE → READY.
+func (s) TestControlChannelConnectivityStateTransitions(t *testing.T) {
+	tests := []struct {
+		name              string
+		states            []connectivity.State
+		wantCallbackCount int
+	}{
+		{
+			name: "READY → TRANSIENT_FAILURE → READY triggers callback",
+			states: []connectivity.State{
+				connectivity.TransientFailure,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 1,
+		},
+		{
+			name: "READY → IDLE → READY does not trigger callback",
+			states: []connectivity.State{
+				connectivity.Idle,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 0,
+		},
+		{
+			name: "Multiple failures trigger callback each time",
+			states: []connectivity.State{
+				connectivity.TransientFailure,
+				connectivity.Ready,
+				connectivity.TransientFailure,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 2,
+		},
+		{
+			name: "IDLE between failures doesn't affect callback",
+			states: []connectivity.State{
+				connectivity.TransientFailure,
+				connectivity.Idle,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Start an RLS server
+			rlsServer, _ := rlstest.SetupFakeRLSServer(t, nil)
+
+			// Setup callback to count invocations
+			var mu sync.Mutex
+			var callbackCount int
+			// Buffered channel large enough to never block
+			callbackInvoked := make(chan struct{}, 100)
+			callback := func() {
+				mu.Lock()
+				callbackCount++
+				mu.Unlock()
+				// Send to channel - should never block with large buffer
+				callbackInvoked <- struct{}{}
+			}
+
+			// Create control channel
+			ctrlCh, err := newControlChannel(rlsServer.Address, "", defaultTestTimeout, balancer.BuildOptions{}, callback)
+			if err != nil {
+				t.Fatalf("Failed to create control channel: %v", err)
+			}
+			defer ctrlCh.close()
+
+			// Wait for the monitoring goroutine to process the initial READY state
+			// before injecting test states. This ensures our injected states are
+			// processed in the main monitoring loop, not consumed during initialization.
+			select {
+			case <-ctrlCh.testOnlyInitialReadyDone:
+				// Initial READY processed by monitoring goroutine
+			case <-time.After(defaultTestTimeout):
+				t.Fatal("Timeout waiting for monitoring goroutine to process initial READY state")
+			}
+
+			// Inject all test states
+			for _, state := range tt.states {
+				ctrlCh.OnMessage(state)
+			}
+
+			// Wait for all expected callbacks with timeout
+			callbackTimeout := time.NewTimer(defaultTestTimeout)
+			defer callbackTimeout.Stop()
+
+			receivedCallbacks := 0
+			for receivedCallbacks < tt.wantCallbackCount {
+				select {
+				case <-callbackInvoked:
+					receivedCallbacks++
+				case <-callbackTimeout.C:
+					mu.Lock()
+					got := callbackCount
+					mu.Unlock()
+					t.Fatalf("Timeout waiting for callbacks: expected %d, received %d via channel, callback count is %d", tt.wantCallbackCount, receivedCallbacks, got)
+				}
+			}
+
+			// Verify final callback count matches expected
+			mu.Lock()
+			gotCallbackCount := callbackCount
+			mu.Unlock()
+
+			if gotCallbackCount != tt.wantCallbackCount {
+				t.Errorf("Got %d callback invocations, want %d", gotCallbackCount, tt.wantCallbackCount)
+			}
+
+			// Ensure no extra callbacks are invoked
+			select {
+			case <-callbackInvoked:
+				mu.Lock()
+				final := callbackCount
+				mu.Unlock()
+				t.Fatalf("Received more callbacks than expected: got %d, want %d", final, tt.wantCallbackCount)
+			case <-time.After(50 * time.Millisecond):
+				// Expected: no more callbacks
+			}
+		})
+	}
+}