Skip to content

Commit 6c52edf

Browse files
committed
db: EFOS and excise race bug fix
Fixes #5390
1 parent 056b059 commit 6c52edf

File tree

5 files changed

+163
-37
lines changed

5 files changed

+163
-37
lines changed

db.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,12 @@ type DB struct {
501501
// sstables.
502502
cumulativePinnedCount uint64
503503
cumulativePinnedSize uint64
504+
505+
// The ongoing excises. These are under snapshots since this is only
506+
// needed to coordinate with EventuallyFileOnlySnapshot creation.
507+
ongoingExcises map[SeqNum]KeyRange
508+
// Signalled when an ongoingExcise is removed.
509+
ongoingExcisesRemovedCond *sync.Cond
504510
}
505511

506512
tableStats struct {
@@ -2812,3 +2818,14 @@ func (d *DB) DebugCurrentVersion() *manifest.Version {
28122818
defer d.mu.Unlock()
28132819
return d.mu.versions.currentVersion()
28142820
}
2821+
2822+
func (d *DB) removeFromOngoingExcises(seqNum base.SeqNum) {
2823+
d.mu.Lock()
2824+
defer d.mu.Unlock()
2825+
_, ok := d.mu.snapshots.ongoingExcises[seqNum]
2826+
if !ok {
2827+
panic(errors.AssertionFailedf("pebble: no ongoing excise for seqnum %d", seqNum))
2828+
}
2829+
delete(d.mu.snapshots.ongoingExcises, seqNum)
2830+
d.mu.snapshots.ongoingExcisesRemovedCond.Broadcast()
2831+
}

ingest.go

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1526,7 +1526,9 @@ func (d *DB) ingest(ctx context.Context, args ingestArgs) (IngestOperationStats,
15261526
// asFlushable indicates whether the sstable was ingested as a flushable.
15271527
var asFlushable bool
15281528
var waitFlushStart crtime.Mono
1529+
var assignedSeqNum SeqNum
15291530
prepare := func(seqNum base.SeqNum) {
1531+
assignedSeqNum = seqNum
15301532
// Note that d.commit.mu is held by commitPipeline when calling prepare.
15311533

15321534
// Determine the set of bounds we care about for the purpose of checking
@@ -1547,14 +1549,35 @@ func (d *DB) ingest(ctx context.Context, args ingestArgs) (IngestOperationStats,
15471549
}
15481550

15491551
d.mu.Lock()
1550-
defer d.mu.Unlock()
1551-
1552+
defer func() {
1553+
if args.ExciseSpan.Valid() && !asFlushable {
1554+
// This can conflict with a concurrent EFOS creation (unlike a
1555+
// flushable ingest, which does not destructively excise).
1556+
_, ok := d.mu.snapshots.ongoingExcises[seqNum]
1557+
if ok {
1558+
panic(errors.AssertionFailedf("pebble: excise with seqnum %s already in map", seqNum))
1559+
}
1560+
d.mu.snapshots.ongoingExcises[seqNum] = args.ExciseSpan
1561+
}
1562+
d.mu.Unlock()
1563+
}()
15521564
if args.ExciseSpan.Valid() {
15531565
// Check if any of the currently-open EventuallyFileOnlySnapshots
1554-
// overlap in key ranges with the excise span. If so, we need to
1555-
// check for memtable overlaps with all bounds of that
1556-
// EventuallyFileOnlySnapshot in addition to the ingestion's own
1557-
// bounds too.
1566+
// overlap in key ranges with the excise span. If so, we need to check
1567+
// for memtable overlaps with *all* bounds of that
1568+
// EventuallyFileOnlySnapshot in addition to the ingestion's own bounds
1569+
// too. This is needed because excise is a destructive operation that
1570+
// mutates the LSM in a non-additive way (unlike ingests and writes).
1571+
//
1572+
// Say there is an existing EFOS for the key-range [a, d) where a is in
1573+
// the memtable and visible to the EFOS, and a later excise wants to
1574+
// excise [c, g) and there is nothing overlapping with it in the
1575+
// memtable. By adding [a, d) to the overlap bounds, we ensure that the
1576+
// excise waits until all of that span is flushed from the memtables,
1577+
// which makes the EFOS transition to a FOS before the excise modifies
1578+
// the LSM. If the excise were to not wait, the [c, d) state that the
1579+
// EFOS needs in the LSM when transitioning to FOS will already have
1580+
// been excised.
15581581
overlapBounds = append(overlapBounds, exciseOverlapBounds(
15591582
d.cmp, &d.mu.snapshots.snapshotList, args.ExciseSpan, seqNum)...)
15601583
}
@@ -1735,6 +1758,13 @@ func (d *DB) ingest(ctx context.Context, args ingestArgs) (IngestOperationStats,
17351758
d.commit.ingestSem <- struct{}{}
17361759
d.commit.AllocateSeqNum(seqNumCount, prepare, apply)
17371760
<-d.commit.ingestSem
1761+
if args.ExciseSpan.Valid() && !asFlushable {
1762+
// NB: this must happen after the assignedSeqNum has become visible, so
1763+
// that any concurrent EFOS creation that acquires d.mu after the removal
1764+
// of this excise gets a visible seqnum after the excise. The
1765+
// assignedSeqNum becomes visible in AllocateSeqNum.
1766+
d.removeFromOngoingExcises(assignedSeqNum)
1767+
}
17381768

17391769
if err != nil {
17401770
if err2 := ingestCleanup(d.objProvider, loadResult.local); err2 != nil {

open.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"math"
1414
"os"
1515
"slices"
16+
"sync"
1617
"sync/atomic"
1718
"time"
1819

@@ -242,7 +243,9 @@ func Open(dirname string, opts *Options) (db *DB, err error) {
242243
d.mu.compact.cond.L = &d.mu.Mutex
243244
d.mu.compact.inProgress = make(map[compaction]struct{})
244245
d.mu.compact.noOngoingFlushStartTime = crtime.NowMono()
245-
d.mu.snapshots.init()
246+
d.mu.snapshots.snapshotList.init()
247+
d.mu.snapshots.ongoingExcises = make(map[SeqNum]KeyRange)
248+
d.mu.snapshots.ongoingExcisesRemovedCond = sync.NewCond(&d.mu.Mutex)
246249
// logSeqNum is the next sequence number that will be assigned.
247250
// Start assigning sequence numbers from base.SeqNumStart to leave
248251
// room for reserved sequence numbers (see comments around

snapshot.go

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,12 +234,83 @@ type EventuallyFileOnlySnapshot struct {
234234
}
235235

236236
func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot {
237-
isFileOnly := true
238-
237+
var snapshotSeqNum, exciseSeqNumToWaitFor base.SeqNum
238+
// tryGetSnapshotSeqNum attempts to initialize a snapshotSeqNum. The
239+
// snapshotSeqNum should be ignored if exciseSeqNumToWaitFor is > 0. In this
240+
// case the caller should wait for this seqnum to become visible and call
241+
// this function again.
242+
tryGetSnapshotSeqNum := func() {
243+
// In AllocateSeqNum, for an ingest-and-excise, some seqnums are
244+
// allocated, say starting at N. Then AllocateSeqNum calls prepare, which
245+
// acquires DB.mu and grabs all the existing EFOS that have not
246+
// transitioned to FOS (which are in d.mu.snapshots.snapshotList) and
247+
// overlap with the excise. After releasing DB.mu, in apply, the
248+
// ingest-and-excise waits for all the previous EFOSs to transition to FOS
249+
// (as a side effect of waiting for a memtable flush to complete). Note,
250+
// the visible seqnum is still <= N-1, and will not be bumped to N until
251+
// the ingest-and-excise completes. Any EFOS that gets created after
252+
// prepare looked at the existing EFOSs, but before the ingest-and-excise
253+
// completes, will be created with EventuallyFileOnlySnapshot.seqNum=N-1,
254+
// and may not transition to FOS until after the excise, which is
255+
// incorrect (the version at the transition to FOS has already experienced
256+
// the excise). To avoid this incorrectness, tryGetSnapshotSeqNum is
257+
// called in a loop, until there are no such ongoing excises. The loop can
258+
// starve EFOS creation if the keyRanges keep overlapping with new ongoing
259+
// ingest-and-excises. So EFOS should only be used when ingest-and-excises
260+
// are rare over the keyRanges.
261+
//
262+
// Improving this starvation behavior would require this snapshot to
263+
// register itself in a way that blocks future ingest-and-excises. The
264+
// blocking would need to be done before allocating the seqnum, since
265+
// blocking after can delay (latency sensitive) writes that get a seqnum
266+
// later than the ingest-and-excise. Then the problem shifts to not
267+
// starving the ingest-and-excise if EFOS creation is frequent. We observe
268+
// that in the CockroachDB use case (a) ingest-and-excises are not very
269+
// frequent, so EFOS starvation is unlikely, (b) EFOS creation is not
270+
// latency sensitive. Hence, we ignore this starvation problem.
271+
snapshotSeqNum = d.mu.versions.visibleSeqNum.Load()
272+
// Check if any of the keyRanges overlap with an ongoing
273+
// ingest-and-excise.
274+
//
275+
// NB: The zero seqnum cannot occur in practice, since base.SeqNumStart >
276+
// 0.
277+
exciseSeqNumToWaitFor = base.SeqNum(0)
278+
for seqNum, span := range d.mu.snapshots.ongoingExcises {
279+
if base.Visible(seqNum, snapshotSeqNum, base.SeqNumMax) {
280+
// Skip this excise, since this is visible to the snapshot.
281+
continue
282+
}
283+
// INVARIANT: seqNum >= snapshotSeqNum.
284+
if seqNum <= exciseSeqNumToWaitFor {
285+
// We are already waiting for a later excise.
286+
continue
287+
}
288+
for i := range keyRanges {
289+
if keyRanges[i].OverlapsKeyRange(d.cmp, span) {
290+
exciseSeqNumToWaitFor = seqNum
291+
break
292+
}
293+
}
294+
}
295+
}
239296
d.mu.Lock()
240297
defer d.mu.Unlock()
241-
seqNum := d.mu.versions.visibleSeqNum.Load()
242-
// Check if any of the keyRanges overlap with a memtable.
298+
for {
299+
// This call updates snapshotSeqNum and exciseSeqNumToWaitFor.
300+
tryGetSnapshotSeqNum()
301+
if exciseSeqNumToWaitFor == 0 {
302+
break
303+
}
304+
for !base.Visible(exciseSeqNumToWaitFor, d.mu.versions.visibleSeqNum.Load(), base.SeqNumMax) {
305+
d.mu.snapshots.ongoingExcisesRemovedCond.Wait()
306+
}
307+
}
308+
isFileOnly := true
309+
// Check if any of the keyRanges overlap with a memtable. It is possible
310+
// (with very low probability) that all these memtable have seqnums later
311+
// than seqNum, and the overlap is a false positive. This is harmless, and
312+
// this EFOS will transition to FOS, when the false positive memtable is
313+
// flushed.
243314
for i := range d.mu.mem.queue {
244315
d.mu.mem.queue[i].computePossibleOverlaps(func(bounded) shouldContinue {
245316
isFileOnly = false
@@ -248,7 +319,7 @@ func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFil
248319
}
249320
es := &EventuallyFileOnlySnapshot{
250321
db: d,
251-
seqNum: seqNum,
322+
seqNum: snapshotSeqNum,
252323
protectedRanges: keyRanges,
253324
closed: make(chan struct{}),
254325
}
@@ -258,7 +329,7 @@ func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFil
258329
} else {
259330
s := &Snapshot{
260331
db: d,
261-
seqNum: seqNum,
332+
seqNum: snapshotSeqNum,
262333
}
263334
s.efos = es
264335
es.mu.snap = s

snapshot_test.go

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -384,11 +384,8 @@ func TestNewSnapshotRace(t *testing.T) {
384384
require.NoError(t, d.Close())
385385
}
386386

387-
// Test for https://github.com/cockroachdb/pebble/issues/5390.
387+
// Test for fix to https://github.com/cockroachdb/pebble/issues/5390.
388388
func TestEFOSAndExciseRace(t *testing.T) {
389-
// Skipping since: panic: unexpected excise of an
390-
// EventuallyFileOnlySnapshot's bounds.
391-
t.Skip("#5390")
392389
o := &Options{
393390
FS: vfs.NewMem(),
394391
L0CompactionThreshold: 10,
@@ -414,25 +411,33 @@ func TestEFOSAndExciseRace(t *testing.T) {
414411
<-beforeIngestApply
415412
// IngestAndExcise is blocked, after waiting for memtable flush.
416413
//
417-
// Create a EFOS that overlaps with the excise.
418-
snap := d.NewEventuallyFileOnlySnapshot([]KeyRange{{Start: []byte("a"), End: []byte("d")}})
419-
defer snap.Close()
420-
checkIter := func() {
421-
iter, err := snap.NewIter(nil)
422-
// The iter sees a and c, since it precedes the excise.
423-
require.NoError(t, err)
424-
require.True(t, iter.First())
425-
require.Equal(t, []byte("a"), iter.Key())
426-
require.True(t, iter.Next())
427-
require.Equal(t, []byte("c"), iter.Key())
428-
require.False(t, iter.Next())
429-
require.NoError(t, iter.Close())
430-
}
431-
checkIter()
432-
// Unblock the excise.
414+
// Create a EFOS that overlaps with the excise. It will block due to the
415+
// ongoing excise.
416+
snapDone := make(chan struct{})
417+
go func() {
418+
snap := d.NewEventuallyFileOnlySnapshot([]KeyRange{{Start: []byte("a"), End: []byte("d")}})
419+
defer func() {
420+
require.NoError(t, snap.Close())
421+
snapDone <- struct{}{}
422+
}()
423+
checkIter := func() {
424+
iter, err := snap.NewIter(nil)
425+
// The iter only sees a, since it comes after the excise.
426+
require.NoError(t, err)
427+
require.True(t, iter.First())
428+
require.Equal(t, []byte("a"), iter.Key())
429+
require.False(t, iter.Next())
430+
require.NoError(t, iter.Close())
431+
}
432+
checkIter()
433+
// Flush and transition to FOS.
434+
require.NoError(t, d.Flush())
435+
require.NoError(t, snap.WaitForFileOnlySnapshot(context.Background(), 0))
436+
checkIter()
437+
}()
438+
time.Sleep(100 * time.Millisecond)
439+
// Unblock the excise, which eventually unblocks the snapshot creation.
433440
waitForCh <- struct{}{}
434-
// Flush and transition to FOS.
435-
require.NoError(t, d.Flush())
436-
require.NoError(t, snap.WaitForFileOnlySnapshot(context.Background(), 0))
437-
checkIter()
441+
// Wait for the snapshot checking to finish.
442+
<-snapDone
438443
}

0 commit comments

Comments
 (0)