Skip to content

Commit 1b8b98e

Browse files
authored
xdsclient: fix race in ADS stream flow control causing indefinite blocking (#8605)
Fixes #8594 The above issue clearly describes the condition under which the race manifests. The changes in this PR are as follows: - Remove the `readyCh` field in the flow control that was previously used to block when waiting for flow control. Instead use a condition variable. - Have two bits of state inside the flow control: - One to indicate if there is a pending update that is waiting consumption by all watchers - One to indicate that the stream is closed - The flow control objects no longer needs to be recreated every time a new stream is created - The flow control object is stopped when the `adsStreamImpl` is stopped This PR also makes other minor changes: - Fix a flaky test by ensuring that the test stream implementation unblocks from a `Recv` call when the underlying stream context is cancelled - Couple of logging improvements RELEASE NOTES: - xdsclient: fix a race in the ADS stream implementation that could result in resource-not-found errors, causing the gRPC client channel to move to `TransientFailure`
1 parent d0ebcdf commit 1b8b98e

File tree

4 files changed

+85
-73
lines changed

4 files changed

+85
-73
lines changed

internal/xds/clients/xdsclient/ads_stream.go

Lines changed: 80 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"context"
2323
"fmt"
2424
"sync"
25-
"sync/atomic"
2625
"time"
2726

2827
"google.golang.org/grpc/grpclog"
@@ -103,11 +102,11 @@ type adsStreamImpl struct {
103102
requestCh *buffer.Unbounded // Subscriptions and unsubscriptions are pushed here.
104103
runnerDoneCh chan struct{} // Notify completion of runner goroutine.
105104
cancel context.CancelFunc // To cancel the context passed to the runner goroutine.
105+
fc *adsFlowControl // Flow control for ADS stream.
106106

107107
// Guards access to the below fields (and to the contents of the map).
108108
mu sync.Mutex
109109
resourceTypeState map[ResourceType]*resourceTypeState // Map of resource types to their state.
110-
fc *adsFlowControl // Flow control for ADS stream.
111110
firstRequest bool // False after the first request is sent out.
112111
}
113112

@@ -135,6 +134,7 @@ func newADSStreamImpl(opts adsStreamOpts) *adsStreamImpl {
135134
streamCh: make(chan clients.Stream, 1),
136135
requestCh: buffer.NewUnbounded(),
137136
runnerDoneCh: make(chan struct{}),
137+
fc: newADSFlowControl(),
138138
resourceTypeState: make(map[ResourceType]*resourceTypeState),
139139
}
140140

@@ -150,6 +150,7 @@ func newADSStreamImpl(opts adsStreamOpts) *adsStreamImpl {
150150
// Stop blocks until the stream is closed and all spawned goroutines exit.
151151
func (s *adsStreamImpl) Stop() {
152152
s.cancel()
153+
s.fc.stop()
153154
s.requestCh.Close()
154155
<-s.runnerDoneCh
155156
s.logger.Infof("Shutdown ADS stream")
@@ -240,9 +241,6 @@ func (s *adsStreamImpl) runner(ctx context.Context) {
240241
}
241242

242243
s.mu.Lock()
243-
// Flow control is a property of the underlying streaming RPC call and
244-
// needs to be initialized everytime a new one is created.
245-
s.fc = newADSFlowControl(s.logger)
246244
s.firstRequest = true
247245
s.mu.Unlock()
248246

@@ -256,7 +254,7 @@ func (s *adsStreamImpl) runner(ctx context.Context) {
256254

257255
// Backoff state is reset upon successful receipt of at least one
258256
// message from the server.
259-
if s.recv(ctx, stream) {
257+
if s.recv(stream) {
260258
return backoff.ErrResetBackoff
261259
}
262260
return nil
@@ -318,11 +316,13 @@ func (s *adsStreamImpl) sendNew(stream clients.Stream, typ ResourceType) error {
318316
// This allows us to batch writes for requests which are generated as part
319317
// of local processing of a received response.
320318
state := s.resourceTypeState[typ]
321-
if s.fc.pending.Load() {
319+
bufferRequest := func() {
322320
select {
323321
case state.bufferedRequests <- struct{}{}:
324322
default:
325323
}
324+
}
325+
if s.fc.runIfPending(bufferRequest) {
326326
return nil
327327
}
328328

@@ -477,18 +477,19 @@ func (s *adsStreamImpl) sendMessageLocked(stream clients.Stream, names []string,
477477
//
478478
// It returns a boolean indicating whether at least one message was received
479479
// from the server.
480-
func (s *adsStreamImpl) recv(ctx context.Context, stream clients.Stream) bool {
480+
func (s *adsStreamImpl) recv(stream clients.Stream) bool {
481481
msgReceived := false
482482
for {
483-
// Wait for ADS stream level flow control to be available, and send out
484-
// a request if anything was buffered while we were waiting for local
485-
// processing of the previous response to complete.
486-
if !s.fc.wait(ctx) {
483+
// Wait for ADS stream level flow control to be available.
484+
if s.fc.wait() {
487485
if s.logger.V(2) {
488-
s.logger.Infof("ADS stream context canceled")
486+
s.logger.Infof("ADS stream stopped while waiting for flow control")
489487
}
490488
return msgReceived
491489
}
490+
491+
// Send out a request if anything was buffered while we were waiting for
492+
// local processing of the previous response to complete.
492493
s.sendBuffered(stream)
493494

494495
resources, url, version, nonce, err := s.recvMessage(stream)
@@ -508,8 +509,8 @@ func (s *adsStreamImpl) recv(ctx context.Context, stream clients.Stream) bool {
508509
}
509510
var resourceNames []string
510511
var nackErr error
511-
s.fc.setPending()
512-
resourceNames, nackErr = s.eventHandler.onResponse(resp, s.fc.onDone)
512+
s.fc.setPending(true)
513+
resourceNames, nackErr = s.eventHandler.onResponse(resp, sync.OnceFunc(func() { s.fc.setPending(false) }))
513514
if xdsresource.ErrType(nackErr) == xdsresource.ErrorTypeResourceTypeUnsupported {
514515
// A general guiding principle is that if the server sends
515516
// something the client didn't actually subscribe to, then the
@@ -707,69 +708,84 @@ func resourceNames(m map[string]*xdsresource.ResourceWatchState) []string {
707708
return ret
708709
}
709710

710-
// adsFlowControl implements ADS stream level flow control that enables the
711-
// transport to block the reading of the next message off of the stream until
712-
// the previous update is consumed by all watchers.
711+
// adsFlowControl implements ADS stream level flow control that enables the ADS
712+
// stream to block the reading of the next message until the previous update is
713+
// consumed by all watchers.
713714
//
714-
// The lifetime of the flow control is tied to the lifetime of the stream.
715+
// The lifetime of the flow control is tied to the lifetime of the stream. When
716+
// the stream is closed, it is the responsibility of the caller to stop the flow
717+
// control. This ensures that any goroutine blocked on the flow control's wait
718+
// method is unblocked.
715719
type adsFlowControl struct {
716-
logger *igrpclog.PrefixLogger
717-
718-
// Whether the most recent update is pending consumption by all watchers.
719-
pending atomic.Bool
720-
// Channel used to notify when all the watchers have consumed the most
721-
// recent update. Wait() blocks on reading a value from this channel.
722-
readyCh chan struct{}
720+
mu sync.Mutex
721+
// cond is used to signal when the most recent update has been consumed, or
722+
// the flow control has been stopped (in which case, waiters should be
723+
// unblocked as well).
724+
cond *sync.Cond
725+
pending bool // indicates if the most recent update is pending consumption
726+
stopped bool // indicates if the ADS stream has been stopped
723727
}
724728

725729
// newADSFlowControl returns a new adsFlowControl.
726-
func newADSFlowControl(logger *igrpclog.PrefixLogger) *adsFlowControl {
727-
return &adsFlowControl{
728-
logger: logger,
729-
readyCh: make(chan struct{}, 1),
730-
}
730+
func newADSFlowControl() *adsFlowControl {
731+
fc := &adsFlowControl{}
732+
fc.cond = sync.NewCond(&fc.mu)
733+
return fc
731734
}
732735

733-
// setPending changes the internal state to indicate that there is an update
734-
// pending consumption by all watchers.
735-
func (fc *adsFlowControl) setPending() {
736-
fc.pending.Store(true)
736+
// stop marks the flow control as stopped and signals the condition variable to
737+
// unblock any goroutine waiting on it.
738+
func (fc *adsFlowControl) stop() {
739+
fc.mu.Lock()
740+
defer fc.mu.Unlock()
741+
742+
fc.stopped = true
743+
fc.cond.Broadcast()
737744
}
738745

739-
// wait blocks until all the watchers have consumed the most recent update and
740-
// returns true. If the context expires before that, it returns false.
741-
func (fc *adsFlowControl) wait(ctx context.Context) bool {
742-
// If there is no pending update, there is no need to block.
743-
if !fc.pending.Load() {
744-
// If all watchers finished processing the most recent update before the
745-
// `recv` goroutine made the next call to `Wait()`, there would be an
746-
// entry in the readyCh channel that needs to be drained to ensure that
747-
// the next call to `Wait()` doesn't unblock before it actually should.
748-
select {
749-
case <-fc.readyCh:
750-
default:
751-
}
752-
return true
746+
// setPending changes the internal state to indicate whether there is an update
747+
// pending consumption by all watchers. If there is no longer a pending update,
748+
// the condition variable is signaled to allow the recv method to proceed.
749+
func (fc *adsFlowControl) setPending(pending bool) {
750+
fc.mu.Lock()
751+
defer fc.mu.Unlock()
752+
753+
if fc.stopped {
754+
return
753755
}
754756

755-
select {
756-
case <-ctx.Done():
757+
fc.pending = pending
758+
if !pending {
759+
fc.cond.Broadcast()
760+
}
761+
}
762+
763+
func (fc *adsFlowControl) runIfPending(f func()) bool {
764+
fc.mu.Lock()
765+
defer fc.mu.Unlock()
766+
767+
if fc.stopped {
757768
return false
758-
case <-fc.readyCh:
759-
return true
760769
}
770+
771+
// If there's a pending update, run the function while still holding the
772+
// lock. This ensures that the pending state does not change between the
773+
// check and the function call.
774+
if fc.pending {
775+
f()
776+
}
777+
return fc.pending
761778
}
762779

763-
// onDone indicates that all watchers have consumed the most recent update.
764-
func (fc *adsFlowControl) onDone() {
765-
select {
766-
// Writes to the readyCh channel should not block ideally. The default
767-
// branch here is to appease the paranoid mind.
768-
case fc.readyCh <- struct{}{}:
769-
default:
770-
if fc.logger.V(2) {
771-
fc.logger.Infof("ADS stream flow control readyCh is full")
772-
}
780+
// wait blocks until all the watchers have consumed the most recent update.
781+
// Returns true if the flow control was stopped while waiting, false otherwise.
782+
func (fc *adsFlowControl) wait() bool {
783+
fc.mu.Lock()
784+
defer fc.mu.Unlock()
785+
786+
for fc.pending && !fc.stopped {
787+
fc.cond.Wait()
773788
}
774-
fc.pending.Store(false)
789+
790+
return fc.stopped
775791
}

internal/xds/clients/xdsclient/test/ads_stream_flow_control_test.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ package xdsclient_test
2020

2121
import (
2222
"context"
23-
"errors"
2423
"fmt"
2524
"slices"
2625
"sort"
@@ -125,7 +124,6 @@ func (t *transport) NewStream(ctx context.Context, method string) (clients.Strea
125124
stream := &stream{
126125
stream: s,
127126
recvCh: make(chan struct{}, 1),
128-
doneCh: make(chan struct{}),
129127
}
130128
t.adsStreamCh <- stream
131129

@@ -138,9 +136,7 @@ func (t *transport) Close() {
138136

139137
type stream struct {
140138
stream grpc.ClientStream
141-
142139
recvCh chan struct{}
143-
doneCh <-chan struct{}
144140
}
145141

146142
func (s *stream) Send(msg []byte) error {
@@ -150,8 +146,8 @@ func (s *stream) Send(msg []byte) error {
150146
func (s *stream) Recv() ([]byte, error) {
151147
select {
152148
case s.recvCh <- struct{}{}:
153-
case <-s.doneCh:
154-
return nil, errors.New("Recv() called after the test has finished")
149+
case <-s.stream.Context().Done():
150+
// Unblock the recv() once the stream is done.
155151
}
156152

157153
var typedRes []byte

internal/xds/clients/xdsclient/xdsclient.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ func (c *XDSClient) releaseChannel(serverConfig *ServerConfig, state *channelSta
319319
c.channelsMu.Lock()
320320

321321
if c.logger.V(2) {
322-
c.logger.Infof("Received request to release a reference to an xdsChannel for server config %q", serverConfig)
322+
c.logger.Infof("Received request to release a reference to an xdsChannel for server config %+v", serverConfig)
323323
}
324324
deInitLocked(state)
325325

internal/xds/xdsclient/tests/dump_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func makeGenericXdsConfig(typeURL, name, version string, status v3adminpb.Client
6363
}
6464

6565
func checkResourceDump(ctx context.Context, want *v3statuspb.ClientStatusResponse, pool *xdsclient.Pool) error {
66-
var cmpOpts = cmp.Options{
66+
cmpOpts := cmp.Options{
6767
protocmp.Transform(),
6868
protocmp.IgnoreFields((*v3statuspb.ClientConfig_GenericXdsConfig)(nil), "last_updated"),
6969
protocmp.IgnoreFields((*v3adminpb.UpdateFailureState)(nil), "last_update_attempt", "details"),
@@ -89,7 +89,7 @@ func checkResourceDump(ctx context.Context, want *v3statuspb.ClientStatusRespons
8989
if diff == "" {
9090
return nil
9191
}
92-
lastErr = fmt.Errorf("received unexpected resource dump, diff (-got, +want):\n%s, got: %s\n want:%s", diff, pretty.ToJSON(got), pretty.ToJSON(want))
92+
lastErr = fmt.Errorf("received unexpected resource dump, diff (-want, +got):\n%s, got: %s\n want:%s", diff, pretty.ToJSON(got), pretty.ToJSON(want))
9393
}
9494
return fmt.Errorf("timeout when waiting for resource dump to reach expected state: %v", lastErr)
9595
}

0 commit comments

Comments
 (0)