Skip to content

Commit c4b8017

Browse files
committed
db: avoid stepping beyond iteration prefix in levelIter
During prefix iteration mode, avoid stepping the levelIter beyond the current iteration prefix. This fixes a bug whereby a levelIter.Next could position the levelIter at a file beyond the iteration prefix, after which a subsequent SeekPrefixGE to a later key with TrySeekUsingNext would fail to observe keys in the skipped file. Fixes #3610.
1 parent 5ee10bd commit c4b8017

File tree

4 files changed

+82
-36
lines changed

4 files changed

+82
-36
lines changed

data_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,16 @@ func runDBDefineCmdReuseFS(td *datadriven.TestData, opts *Options) (*DB, error)
842842
for _, levelOpts := range opts.Levels {
843843
levelOpts.BlockSize = size
844844
}
845+
case "bloom-bits-per-key":
846+
v, err := strconv.Atoi(arg.Vals[0])
847+
if err != nil {
848+
return nil, err
849+
}
850+
fp := bloom.FilterPolicy(v)
851+
opts.Filters = map[string]FilterPolicy{fp.Name(): fp}
852+
for i := range opts.Levels {
853+
opts.Levels[i].FilterPolicy = fp
854+
}
845855
case "format-major-version":
846856
fmv, err := strconv.Atoi(arg.Vals[0])
847857
if err != nil {

internal/base/iterator.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,15 @@ type InternalIterator interface {
107107
// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
108108
// that key is greater than or equal to the lower bound.
109109
//
110-
// The prefix argument is used by some InternalIterator implementations (e.g.
111-
// sstable.Reader) to avoid expensive operations. This operation is only
112-
// useful when a user-defined Split function is supplied to the Comparer for
113-
// the DB. The supplied prefix will be the prefix of the given key returned by
114-
// that Split function. If the iterator is able to determine that no key with
115-
// the prefix exists, it can return (nil,nilv). Unlike SeekGE, this is not an
116-
// indication that iteration is exhausted.
110+
// The prefix argument is used by some InternalIterator implementations
111+
// (e.g. sstable.Reader) to avoid expensive operations. This operation is
112+
// only useful when a user-defined Split function is supplied to the
113+
// Comparer for the DB. The supplied prefix will be the prefix of the given
114+
// key returned by that Split function. If the iterator is able to determine
115+
// that no key with the prefix exists, it can return (nil,nilv). Unlike
116+
// SeekGE, this is not an indication that iteration is exhausted. The prefix
117+
// byte slice is guaranteed to be stable until the next absolute positioning
118+
// operation.
117119
//
118120
// Note that the iterator may return keys not matching the prefix. It is up
119121
// to the caller to check if the prefix matches.

level_iter.go

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -31,21 +31,13 @@ type internalIterOpts struct {
3131
//
3232
// levelIter is used during compaction and as part of the Iterator
3333
// implementation. When used as part of the Iterator implementation, level
34-
// iteration needs to "pause" at sstable boundaries if a range deletion iterator
35-
// is open. In this case, we materialize a "synthetic" InternalKV to return from
36-
// levelIter. This prevents mergingIter from advancing past the sstable until
37-
// the sstable contains the smallest (or largest for reverse iteration) key in
38-
// the merged heap. Note that mergingIter treats a range deletion tombstone
39-
// returned by the point iterator as a no-op.
40-
//
41-
// SeekPrefixGE presents the need for a second type of pausing. If an sstable
42-
// iterator returns "not found" for a SeekPrefixGE operation, we don't want to
43-
// advance to the next sstable as the "not found" does not indicate that all of
44-
// the keys in the sstable are less than the search key. Advancing to the next
45-
// sstable would cause us to skip over range tombstones, violating
46-
// correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the
47-
// kind InternalKeyKindRangeDeletion which will be used to pause the levelIter
48-
// at the sstable until the mergingIter is ready to advance past it.
34+
// iteration needs to "pause" at range deletion boundaries if the file contains
35+
// range deletions. In this case, the levelIter uses a keyspan.InterleavingIter
36+
// to materialize InternalKVs at the start and end boundaries of range deletions.
37+
// This prevents mergingIter from advancing past the sstable until the sstable
38+
// contains the smallest (or largest for reverse iteration) key in the merged
39+
// heap. Note that mergingIter treats a range deletion tombstone returned by the
40+
// point iterator as a no-op.
4941
type levelIter struct {
5042
// The context is stored here since (a) iterators are expected to be
5143
// short-lived (since they pin sstables), (b) plumbing a context into every
@@ -60,6 +52,9 @@ type levelIter struct {
6052
// recent call to SetBounds.
6153
lower []byte
6254
upper []byte
55+
// prefix holds the iteration prefix when the most recent absolute
56+
// positioning method was a SeekPrefixGE.
57+
prefix []byte
6358
// The iterator options for the currently open table. If
6459
// tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
6560
// does not lie within the table bounds.
@@ -161,6 +156,7 @@ func (l *levelIter) init(
161156
l.err = nil
162157
l.level = level
163158
l.logger = opts.getLogger()
159+
l.prefix = nil
164160
l.lower = opts.LowerBound
165161
l.upper = opts.UpperBound
166162
l.tableOpts.TableFilter = opts.TableFilter
@@ -629,6 +625,7 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV
629625

630626
l.err = nil // clear cached iteration error
631627
l.exhaustedDir = 0
628+
l.prefix = nil
632629
// NB: the top-level Iterator has already adjusted key based on
633630
// IterOptions.LowerBound.
634631
loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
@@ -651,9 +648,9 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba
651648
if invariants.Enabled && l.lower != nil && l.cmp(key, l.lower) < 0 {
652649
panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower))
653650
}
654-
655651
l.err = nil // clear cached iteration error
656652
l.exhaustedDir = 0
653+
l.prefix = prefix
657654

658655
// NB: the top-level Iterator has already adjusted key based on
659656
// IterOptions.LowerBound.
@@ -673,19 +670,6 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba
673670
if err := l.iter.Error(); err != nil {
674671
return nil
675672
}
676-
// It is possible that we are here because bloom filter matching failed. In
677-
// that case it is likely that all keys matching the prefix are wholly
678-
// within the current file and cannot be in the subsequent file. In that
679-
// case we don't want to go to the next file, since loading and seeking in
680-
// there has some cost. Additionally, for sparse key spaces, loading the
681-
// next file will defeat the optimization for the next SeekPrefixGE that is
682-
// called with flags.TrySeekUsingNext(), since for sparse key spaces it is
683-
// likely that the next key will also be contained in the current file.
684-
n := l.split(l.iterFile.LargestPointKey.UserKey)
685-
if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 {
686-
l.exhaustedForward()
687-
return nil
688-
}
689673
return l.verify(l.skipEmptyFileForward())
690674
}
691675

@@ -696,6 +680,7 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV
696680

697681
l.err = nil // clear cached iteration error
698682
l.exhaustedDir = 0
683+
l.prefix = nil
699684

700685
// NB: the top-level Iterator has already adjusted key based on
701686
// IterOptions.UpperBound.
@@ -716,6 +701,7 @@ func (l *levelIter) First() *base.InternalKV {
716701

717702
l.err = nil // clear cached iteration error
718703
l.exhaustedDir = 0
704+
l.prefix = nil
719705

720706
// NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
721707
// set.
@@ -736,6 +722,7 @@ func (l *levelIter) Last() *base.InternalKV {
736722

737723
l.err = nil // clear cached iteration error
738724
l.exhaustedDir = 0
725+
l.prefix = nil
739726

740727
// NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
741728
// set.
@@ -835,6 +822,27 @@ func (l *levelIter) skipEmptyFileForward() *base.InternalKV {
835822
l.exhaustedForward()
836823
return nil
837824
}
825+
826+
// If the iterator is in prefix iteration mode, it's possible that we
827+
// are here because bloom filter matching failed. In that case it is
828+
// likely that all keys matching the prefix are wholly within the
829+
// current file and cannot be in a subsequent file. In that case we
830+
// don't want to go to the next file, since loading and seeking in there
831+
// has some cost.
832+
//
833+
// This is not just an optimization. We must not advance to the next
834+
// file if the current file might possibly contain keys relevant to any
835+
// prefix greater than our current iteration prefix. If we did, a
836+
// subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip
837+
// the file's relevant keys.
838+
if l.prefix != nil {
839+
fileLargestPrefix := l.iterFile.LargestPointKey.UserKey[:l.split(l.iterFile.LargestPointKey.UserKey)]
840+
if l.cmp(fileLargestPrefix, l.prefix) > 0 {
841+
l.exhaustedForward()
842+
return nil
843+
}
844+
}
845+
838846
// Current file was exhausted. Move to the next file.
839847
if l.loadFile(l.files.Next(), +1) == noFileLoaded {
840848
l.exhaustedForward()

testdata/iter_histories/prefix_iteration

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,3 +336,29 @@ seek-prefix-ge c@9
336336
----
337337
.
338338
.
339+
340+
# Regression test for #3610.
341+
#
342+
# Similar to the above case, this test consists of two SeekPrefixGEs for
343+
# ascending keys, resulting in TrySeekUsingNext()=true for the second seek.
344+
# Previously, during the first SeekPrefixGE the mergingIter could Next the
345+
# levelIter beyond the file containing point keys relevant to both seeks.
346+
347+
define bloom-bits-per-key=100
348+
L4
349+
350+
L5
351+
352+
353+
----
354+
L4:
355+
000004:[b@0#10,SET-b@0#10,SET]
356+
L5:
357+
000005:[b@8#3,RANGEDEL-c@3#0,SET]
358+
359+
combined-iter
360+
seek-prefix-ge b@10
361+
seek-prefix-ge c@10
362+
----
363+
b@0: (b@0, .)
364+
c@3: (c@3, .)

0 commit comments

Comments
 (0)