Commit 1137362

craig[bot] and stevendanna committed
157259: kvcoord: deflake rangefeed tests r=pav-kv,tbg,sumeerbhola a=stevendanna

This aims at fixing recent failures in:

    TestRangeFeedMetricsManagement
    TestMuxRangeFeedCatchupScanQuotaReleased

A clear root cause was found for the former, while the latter appears to be related to slowdowns caused by recreating catch-up scan iterators when running under the race detector.

Fixes cockroachdb#156852
Fixes cockroachdb#157003
Fixes cockroachdb#157064

Co-authored-by: Steven Danna <[email protected]>
2 parents 4c40f45 + 2b5e3ab commit 1137362
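
To make the deflaking approach concrete before the diff: the test helper now returns an error channel alongside its cleanup function, and waiters select on completion, error, or timeout, so a failed rangefeed fails the test immediately instead of burning the whole timeout. Below is a minimal, self-contained sketch of that pattern (an illustration only; waitWithTimeout and the other names are simplified stand-ins, not code from this commit):

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitWithTimeout mirrors the shape of the test helper after this commit:
// finish normally, fail fast on a feed error, or time out.
func waitWithTimeout(done <-chan struct{}, errCh <-chan error, timeout time.Duration) error {
	select {
	case <-done:
		return nil
	case err := <-errCh:
		return fmt.Errorf("feed failed: %w", err)
	case <-time.After(timeout):
		return errors.New("timed out")
	}
}

func main() {
	done := make(chan struct{})
	// Buffered so the feed goroutine never blocks if nobody is waiting yet.
	errCh := make(chan error, 1)
	go func() { errCh <- errors.New("rangefeed terminated") }()
	fmt.Println(waitWithTimeout(done, errCh, time.Second))
}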

File tree

5 files changed (+120, -40 lines)

pkg/kv/kvclient/kvcoord/dist_sender_rangefeed_test.go

Lines changed: 48 additions & 38 deletions
@@ -29,7 +29,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
-	"github.com/cockroachdb/cockroach/pkg/util"
 	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
@@ -129,14 +128,19 @@ func rangeFeed(
 	startFrom hlc.Timestamp,
 	onValue func(event kvcoord.RangeFeedMessage),
 	opts ...kvcoord.RangeFeedOption,
-) func() {
+) (chan error, func()) {
 	ds := dsI.(*kvcoord.DistSender)
 	events := make(chan kvcoord.RangeFeedMessage)
 	ctx, cancel := context.WithCancel(context.WithValue(context.Background(), testFeedCtxKey{}, struct{}{}))
 
+	errCh := make(chan error, 1)
 	g := ctxgroup.WithContext(ctx)
-	g.GoCtx(func(ctx context.Context) (err error) {
-		return ds.RangeFeed(ctx, []kvcoord.SpanTimePair{{Span: sp, StartAfter: startFrom}}, events, opts...)
+	g.GoCtx(func(ctx context.Context) error {
+		err := ds.RangeFeed(ctx, []kvcoord.SpanTimePair{{Span: sp, StartAfter: startFrom}}, events, opts...)
+		if err != nil {
+			errCh <- err
+		}
+		return err
 	})
 	g.GoCtx(func(ctx context.Context) error {
 		for {
@@ -149,7 +153,7 @@ func rangeFeed(
 		}
 	})
 
-	return func() {
+	return errCh, func() {
 		cancel()
 		_ = g.Wait()
 	}
@@ -176,19 +180,17 @@ func observeNValues(n int) (chan struct{}, func(ev kvcoord.RangeFeedMessage)) {
 	}
 }
 
-func channelWaitWithTimeout(t *testing.T, ch chan struct{}) {
+func channelWaitWithTimeout(t *testing.T, ch chan struct{}, errCh chan error) {
 	t.Helper()
-	timeOut := 30 * time.Second
-	if util.RaceEnabled {
-		timeOut *= 10
-	}
-	if syncutil.DeadlockEnabled {
-		timeOut = 2 * deadlock.Opts.DeadlockTimeout
-	}
+	timeOut := testutils.SucceedsSoonDuration()
 	select {
 	case <-ch:
+	case err := <-errCh:
+		if err != nil {
+			t.Fatalf("unexpected error while waiting on channel: %v", err)
+		}
 	case <-time.After(timeOut):
-		t.Fatal("test timed out")
+		t.Fatalf("test timed out after %s", timeOut)
 	}
 }
 
@@ -220,7 +222,7 @@ func TestMuxRangeFeedConnectsToNodeOnce(t *testing.T) {
 	// test cluster nodes.
 	sqlDB.ExecMultiple(t,
 		`SET CLUSTER SETTING kv.rangefeed.enabled = true`,
-		`ALTER DATABASE defaultdb CONFIGURE ZONE USING num_replicas = 1`,
+		`ALTER DATABASE defaultdb CONFIGURE ZONE USING num_replicas = 1`,
 		`CREATE TABLE foo (key INT PRIMARY KEY)`,
 		`INSERT INTO foo (key) SELECT * FROM generate_series(1, 1000)`,
 		`ALTER TABLE foo SPLIT AT (SELECT * FROM generate_series(100, 900, 100))`,
@@ -241,9 +243,9 @@ func TestMuxRangeFeedConnectsToNodeOnce(t *testing.T) {
 	fooSpan := fooDesc.PrimaryIndexSpan(keys.SystemSQLCodec)
 
 	allSeen, onValue := observeNValues(1000)
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startTime, onValue)
+	errCh, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startTime, onValue)
 	defer closeFeed()
-	channelWaitWithTimeout(t, allSeen)
+	channelWaitWithTimeout(t, allSeen, errCh)
 	closeFeed() // Explicitly shutdown the feed to make sure counters no longer change.
 
 	// Verify we connected to each node once.
@@ -254,7 +256,7 @@ func TestMuxRangeFeedConnectsToNodeOnce(t *testing.T) {
 	}
 }
 
-func TestMuxRangeCatchupScanQuotaReleased(t *testing.T) {
+func TestMuxRangeFeedCatchupScanQuotaReleased(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
 
@@ -289,20 +291,23 @@ func TestMuxRangeCatchupScanQuotaReleased(t *testing.T) {
 	noValuesExpected := func(event kvcoord.RangeFeedMessage) {
 		panic("received value when none expected")
 	}
-	const numErrsToReturn = 100
+	const numErrsToReturn = 42
 	var numErrors atomic.Int32
 	enoughErrors := make(chan struct{})
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startTime, noValuesExpected,
+	errCh, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startTime, noValuesExpected,
 		kvcoord.TestingWithOnRangefeedEvent(
 			func(_ context.Context, _ roachpb.Span, _ int64, event *kvpb.RangeFeedEvent) (skip bool, _ error) {
+				if event.Error != nil {
+					return false, nil
+				}
 				*event = transientErrEvent
 				if numErrors.Add(1) == numErrsToReturn {
 					close(enoughErrors)
 				}
 				return false, nil
 			}))
 	defer closeFeed()
-	channelWaitWithTimeout(t, enoughErrors)
+	channelWaitWithTimeout(t, enoughErrors, errCh)
 }
 
 // TestMuxRangeFeedDoesNotStallOnError tests that the mux rangefeed
@@ -413,14 +418,14 @@ func TestMuxRangeFeedDoesNotStallOnError(t *testing.T) {
 
 	shouldError.Store(true)
 	allSeen, onValue := observeNValues(100)
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startFrom, onValue)
+	errCh, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startFrom, onValue)
 	defer closeFeed()
-	channelWaitWithTimeout(t, allSeen)
+	channelWaitWithTimeout(t, allSeen, errCh)
 }
 
 // Test to make sure the various metrics used by rangefeed are correctly
 // updated during the lifetime of the rangefeed and when the rangefeed completes.
-func TestRangeFeedMetricsManagement(t *testing.T) {
+func TestMuxRangeFeedMetricsManagement(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
 
@@ -434,6 +439,9 @@ func TestRangeFeedMetricsManagement(t *testing.T) {
 	sqlDB := sqlutils.MakeSQLRunner(tc.ServerConn(0))
 	startTime := ts.Clock().Now()
 
+	kvserver.RangefeedEnabled.Override(
+		context.Background(), &tc.SystemLayer(0).ClusterSettings().SV, true)
+
 	// Insert 1000 rows, and split them into 10 ranges.
 	const numRanges = 10
 	sqlDB.ExecMultiple(t,
@@ -502,8 +510,9 @@ func TestRangeFeedMetricsManagement(t *testing.T) {
 		return skipSet.stuck.Contains(k)
 	}
 
+	frontierAdvanced := make(chan struct{}, 1)
 	ignoreValues := func(event kvcoord.RangeFeedMessage) {}
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startTime, ignoreValues,
+	errCh, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startTime, ignoreValues,
 		kvcoord.TestingWithRangeFeedMetrics(&metrics),
 		kvcoord.TestingWithOnRangefeedEvent(
 			func(ctx context.Context, s roachpb.Span, _ int64, event *kvpb.RangeFeedEvent) (skip bool, _ error) {
@@ -535,10 +544,16 @@ func TestRangeFeedMetricsManagement(t *testing.T) {
 					return false, nil
 				}
 
-				_, err := frontier.Forward(checkpoint.Span, checkpoint.ResolvedTS)
+				advanced, err := frontier.Forward(checkpoint.Span, checkpoint.ResolvedTS)
 				if err != nil {
 					return false, err
 				}
+				if advanced {
+					select {
+					case frontierAdvanced <- struct{}{}:
+					default:
+					}
+				}
 
 				if numCatchupBlocked.Add(1) <= numCatchupToBlock {
 					// Mux rangefeed can't block single range, so just skip this event
@@ -558,12 +573,7 @@ func TestRangeFeedMetricsManagement(t *testing.T) {
 
 	// Wait for the test frontier to advance. Once it advances,
 	// we know the rangefeed is started, all ranges are running (even if some of them are blocked).
-	testutils.SucceedsSoon(t, func() error {
-		if frontier.Frontier().IsEmpty() {
-			return errors.Newf("waiting for frontier advance: %s", frontier.String())
-		}
-		return nil
-	})
+	channelWaitWithTimeout(t, frontierAdvanced, errCh)
 
 	// At this point, we know the rangefeed for all ranges are running.
 	require.EqualValues(t, numRanges, metrics.RangefeedRanges.Value(), frontier.String())
@@ -575,9 +585,9 @@ func TestRangeFeedMetricsManagement(t *testing.T) {
 	require.EqualValues(t, numCatchupToBlock, metrics.RangefeedCatchupRanges.Value())
 }
 
-// TestRangefeedRangeObserver ensures the kvcoord.WithRangeObserver option
+// TestMuxRangefeedRangeObserver ensures the kvcoord.WithRangeObserver option
 // works correctly.
-func TestRangefeedRangeObserver(t *testing.T) {
+func TestMuxRangefeedRangeObserver(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
 
@@ -640,7 +650,7 @@ func TestRangefeedRangeObserver(t *testing.T) {
 		})
 	}
 
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, ts.Clock().Now(), ignoreValues,
+	_, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, ts.Clock().Now(), ignoreValues,
 		kvcoord.WithRangeObserver(observer))
 	defer closeFeed()
 
@@ -735,7 +745,7 @@ func TestMuxRangeFeedCanCloseStream(t *testing.T) {
 	ignoreValues := func(event kvcoord.RangeFeedMessage) {}
 	var numRestartStreams atomic.Int32
 
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, ts.Clock().Now(), ignoreValues,
+	_, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, ts.Clock().Now(), ignoreValues,
 		kvcoord.TestingWithMuxRangeFeedRequestSenderCapture(
 			// We expect a single mux sender since we have 1 node in this test.
 			func(nodeID roachpb.NodeID, capture func(request *kvpb.RangeFeedRequest) error) {
@@ -863,13 +873,13 @@ func TestMuxRangeFeedDoesNotDeadlockWithLocalStreams(t *testing.T) {
 	fooSpan := fooDesc.PrimaryIndexSpan(keys.SystemSQLCodec)
 
 	allSeen, onValue := observeNValues(1000)
-	closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startFrom, onValue,
+	_, closeFeed := rangeFeed(ts.DistSenderI(), fooSpan, startFrom, onValue,
 		kvcoord.TestingWithBeforeSendRequest(func() {
 			// Prior to sending rangefeed request, block for just a bit
 			// to make deadlock more likely.
			time.Sleep(100 * time.Millisecond)
 		}),
 	)
 	defer closeFeed()
-	channelWaitWithTimeout(t, allSeen)
+	channelWaitWithTimeout(t, allSeen, nil)
 }

pkg/kv/kvserver/rangefeed/catchup_scan.go

Lines changed: 22 additions & 2 deletions
@@ -9,6 +9,7 @@ import (
 	"bytes"
 	"context"
 	"time"
+	"unsafe"
 
 	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -238,6 +239,7 @@ func (i *CatchUpSnapshot) CatchUpScan(
 		unsafeKey := iter.UnsafeKey()
 		sameKey := bytes.Equal(unsafeKey.Key, lastKey)
 		if !sameKey {
+
 			// If so, output events for the last key encountered.
 			if err := outputEvents(); err != nil {
 				return err
@@ -253,6 +255,23 @@ func (i *CatchUpSnapshot) CatchUpScan(
 		recreateIter := false
 		// recreateIter => now is initialized.
 		var now crtime.Mono
+
+		// Under race tests we want to recreate the iterator more often to
+		// increase test coverage of this code. Doing this on every iteration
+		// proved a bit slow.
+		forceRecreateUnderTest := false
+		if util.RaceEnabled {
+			// Hash the pointer of the current iterator into the range [0, 3] to
+			// give us a ~25% chance of recreating the iterator.
+			//
+			// unsafe.Pointer's documentation implies that this uintptr is not
+			// technically guaranteed to be stable across multiple calls. So, we
+			// could make a different decision for the same iterator. That's OK, the
+			// goal here is to force recreation occasionally, nothing more.
+			ptr := uintptr(unsafe.Pointer(&iter))
+			forceRecreateUnderTest = util.FibHash(uint64(ptr), 2) == 0
+		}
+
 		// We sample the current time only when readmitted is true, to avoid
 		// doing it in every iteration of the loop. In practice, readmitted is
 		// true after every 100ms of cpu time, and there is no wall time limit
@@ -262,12 +281,13 @@ func (i *CatchUpSnapshot) CatchUpScan(
 		// Pace. We also need to consider the CPU cost of recreating the
 		// iterator, and using the cpu time to amortize that cost seems
 		// reasonable.
-		if (readmitted && i.iterRecreateDuration > 0) || util.RaceEnabled {
+		shouldSampleTime := (readmitted && i.iterRecreateDuration > 0) || forceRecreateUnderTest
+		if shouldSampleTime {
 			// If enough walltime has elapsed, close iter and reopen. This
 			// prevents the iter from holding on to Pebble memtables for too long,
 			// which can cause OOMs.
 			now = crtime.NowMono()
-			recreateIter = now.Sub(lastIterCreateTime) > i.iterRecreateDuration || util.RaceEnabled
+			recreateIter = now.Sub(lastIterCreateTime) > i.iterRecreateDuration || forceRecreateUnderTest
 		}
 		if recreateIter {
 			lastIterCreateTime = now
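
As a rough sanity check of the "~25% chance" claim in the comment above, here is a small standalone sketch (not part of the commit; fibHash simply mirrors util.FibHash, which this commit adds in pkg/util/hash.go below) that buckets random 64-bit inputs into 2^2 = 4 values and counts how often bucket 0 comes up, i.e. how often the catch-up iterator would be force-recreated under race builds:

package main

import (
	"fmt"
	"math/rand"
)

// fibHash mirrors util.FibHash from this commit: a multiplicative (Fibonacci)
// hash whose result is confined to [0, 2^bits) by keeping only the top bits.
func fibHash(v uint64, bits int) uint64 {
	return (v * 11400714819323198485) >> (64 - bits)
}

func main() {
	const n = 1_000_000
	hits := 0
	for i := 0; i < n; i++ {
		// bits=2 gives four buckets, so "== 0" should fire roughly 25% of
		// the time for well-distributed inputs.
		if fibHash(rand.Uint64(), 2) == 0 {
			hits++
		}
	}
	fmt.Printf("forced recreation fraction: %.3f\n", float64(hits)/float64(n))
}

In the commit itself the input is the address of the iterator variable rather than a random number, which keeps the decision cheap and avoids threading a random number generator into the scan loop while still recreating the iterator only occasionally.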

pkg/util/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ go_test(
     srcs = [
         "every_n_test.go",
         "fast_int_map_test.go",
+        "hash_test.go",
         "slices_test.go",
         "smalltrace_test.go",
         "strings_test.go",

pkg/util/hash.go

Lines changed: 9 additions & 0 deletions
@@ -56,3 +56,12 @@ func (f *FNV64) Add(c uint64) {
 func (f *FNV64) Sum() uint64 {
 	return f.sum
 }
+
+// Fibonacci Hash is a multiplicative hash that maps the given `v` into 2^bits
+// values.
+//
+// Reference:
+// - https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+func FibHash(v uint64, bits int) uint64 {
+	return (v * 11400714819323198485) >> (64 - bits)
+}

pkg/util/hash_test.go

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+// Copyright 2016 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+package util
+
+import (
+	"testing"
+	"unsafe"
+
+	"github.com/cockroachdb/cockroach/pkg/util/randutil"
+	"github.com/stretchr/testify/require"
+)
+
+func TestFibHash(t *testing.T) {
+	rng, _ := randutil.NewTestRand()
+	counts := make(map[uint64]int)
+	for range 1000 {
+		h := FibHash(rng.Uint64(), 3)
+		counts[h]++
+	}
+
+	// Let's test using a ptr value. We use this slice to keep the pointers alive
+	// so that we don't just get the same pointer each time.
+	vs := make([]*int, 0, 1000)
+	for range 1000 {
+		v := new(int)
+		vs = append(vs, v)
+		h := FibHash(uint64(uintptr(unsafe.Pointer(v))), 3)
+		counts[h]++
+	}
+	require.Equal(t, 1000, len(vs))
+	// We can only make some weak assertions about the distribution here.
+	for k, v := range counts {
+		require.True(t, k < 8)
+		perfectDistributionCount := 2000 / 8
+		require.Less(t, v, perfectDistributionCount*2)
+	}
+}
