Commit c61e6b4

sql: always wait for all goroutines in fingerprintSpan
This commit fixes an oversight from 1cf6236. In particular, in that change we ensured that we never leak the worker goroutines in fingerprintSpan by cancelling the context in the coordinator and checking for that in the workers. However, it was possible for the coordinator to exit before the workers, which can lead to undefined behavior down the line (in a test failure we just saw usage of the trace span after finish). This commit fixes the oversight by always blocking the coordinator until all its workers exit.

Release note: None
1 parent dda5160 commit c61e6b4
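For context, below is a minimal, standalone sketch of the coordinator/worker shape this commit fixes. It uses golang.org/x/sync/errgroup in place of CockroachDB's ctxgroup, and the names here (coordinate, processSpan, maxWorkerCount, string "spans") are illustrative placeholders rather than the actual fingerprintSpan code; the point is the deferred grp.Wait() that keeps the coordinator from returning before its workers do.

package main

import (
    "context"
    "fmt"

    "golang.org/x/sync/errgroup"
)

const maxWorkerCount = 4

// processSpan stands in for the per-span work a worker performs.
func processSpan(ctx context.Context, span string) error {
    if err := ctx.Err(); err != nil {
        return err
    }
    fmt.Println("fingerprinted", span)
    return nil
}

func coordinate(ctx context.Context, spans []string) (retErr error) {
    // workCh divides the work between the workers; the coordinator closes it
    // once all spans have been handed out.
    workCh := make(chan string)
    ctx, cancel := context.WithCancel(ctx)
    defer cancel() // runs last; releases the context once the workers are done

    grp, ctx := errgroup.WithContext(ctx)
    for i := 0; i < maxWorkerCount; i++ {
        grp.Go(func() error {
            for {
                select {
                case span, ok := <-workCh:
                    if !ok {
                        // The coordinator closed workCh: no more work.
                        return nil
                    }
                    if err := processSpan(ctx, span); err != nil {
                        return err
                    }
                case <-ctx.Done():
                    return ctx.Err()
                }
            }
        })
    }

    // The crux of the fix: no matter how the coordinator returns, signal the
    // workers to stop on error and then block until every worker has exited,
    // so the coordinator can never tear down shared state (such as a tracing
    // span) while a worker is still using it.
    defer func() {
        if retErr != nil {
            cancel()
        }
        grpErr := grp.Wait()
        if retErr == nil {
            retErr = grpErr
        }
    }()

    for _, span := range spans {
        select {
        case workCh <- span:
        case <-ctx.Done():
            // errgroup cancels ctx once a worker fails, so this send cannot
            // block forever even if every worker has already exited.
            return ctx.Err()
        }
    }
    close(workCh)
    return nil
}

func main() {
    if err := coordinate(context.Background(), []string{"a", "b", "c"}); err != nil {
        fmt.Println("error:", err)
    }
}

Guarding the send into workCh with a select on the group's context, as in this sketch, is one way to avoid the hypothetical deadlock mentioned in the TODO in the diff below: if every worker exits with an error, the group context is canceled and the coordinator's send unblocks instead of waiting forever.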

1 file changed: +23 -5 lines changed


pkg/sql/fingerprint_span.go

Lines changed: 23 additions & 5 deletions
@@ -133,15 +133,12 @@ func (p *planner) fingerprintSpanFanout(
     fingerprintPartition := func(
         partition roachpb.Spans,
     ) func(ctx context.Context) error {
-        return func(ctx context.Context) error {
+        return func(ctx context.Context) (retErr error) {
             // workCh is used to divide up the partition between workers. It is
             // closed whenever there is no work to do. It might not be closed if
             // the coordinator encounters an error.
             workCh := make(chan roachpb.Span)
-            // The context will be canceled when the coordinator exits
-            // guaranteeing that the worker goroutines aren't leaked.
             ctx, cancel := context.WithCancel(ctx)
-            defer cancel()

             grp := ctxgroup.WithContext(ctx)
             for range maxWorkerCount {
@@ -169,6 +166,27 @@ func (p *planner) fingerprintSpanFanout(
                     }
                 })
             }
+            defer func() {
+                // Either workCh is closed (meaning that we've processed all the
+                // work), or we hit an error in the loop below. If it's the
+                // latter, we need to cancel the context to signal to the
+                // workers to shutdown ASAP.
+                if retErr != nil {
+                    cancel()
+                }
+                // Regardless of how we got here, ensure that we always block
+                // until all workers exit.
+                // TODO(yuzefovich): refactor the logic here so that the
+                // coordinator goroutine had a single return point. This will
+                // also allow us to prevent a hypothetical scenario where we're
+                // blocked forever (i.e. until the context is canceled) writing
+                // into workCh which can happen if all worker goroutines exit
+                // due to an error.
+                grpErr := grp.Wait()
+                if retErr == nil {
+                    retErr = grpErr
+                }
+            }()

             for _, part := range partition {
                 rdi, err := p.execCfg.RangeDescIteratorFactory.NewLazyIterator(ctx, part, 64)
@@ -195,7 +213,7 @@ func (p *planner) fingerprintSpanFanout(
                 }
             }
             close(workCh)
-            return grp.Wait()
+            return nil
         }
     }
