@@ -31,21 +31,13 @@ type internalIterOpts struct {
3131//
3232// levelIter is used during compaction and as part of the Iterator
3333// implementation. When used as part of the Iterator implementation, level
34- // iteration needs to "pause" at sstable boundaries if a range deletion iterator
35- // is open. In this case, we materialize a "synthetic" InternalKV to return from
36- // levelIter. This prevents mergingIter from advancing past the sstable until
37- // the sstable contains the smallest (or largest for reverse iteration) key in
38- // the merged heap. Note that mergingIter treats a range deletion tombstone
39- // returned by the point iterator as a no-op.
40- //
41- // SeekPrefixGE presents the need for a second type of pausing. If an sstable
42- // iterator returns "not found" for a SeekPrefixGE operation, we don't want to
43- // advance to the next sstable as the "not found" does not indicate that all of
44- // the keys in the sstable are less than the search key. Advancing to the next
45- // sstable would cause us to skip over range tombstones, violating
46- // correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the
47- // kind InternalKeyKindRangeDeletion which will be used to pause the levelIter
48- // at the sstable until the mergingIter is ready to advance past it.
34+ // iteration needs to "pause" at range deletion boundaries if the file contains
35+ // range deletions. In this case, the levelIter uses a keyspan.InterleavingIter
36+ // to materialize InternalKVs at start and end boundaries of range deletions.
37+ // This prevents mergingIter from advancing past the sstable until the sstable
38+ // contains the smallest (or largest for reverse iteration) key in the merged
39+ // heap. Note that mergingIter treats a range deletion tombstone returned by the
40+ // point iterator as a no-op.
4941type levelIter struct {
5042 // The context is stored here since (a) iterators are expected to be
5143 // short-lived (since they pin sstables), (b) plumbing a context into every
@@ -60,6 +52,9 @@ type levelIter struct {
6052 // recent call to SetBounds.
6153 lower []byte
6254 upper []byte
55+ // prefix holds the iteration prefix when the most recent absolute
56+ // positioning method was a SeekPrefixGE.
57+ prefix []byte
6358 // The iterator options for the currently open table. If
6459 // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
6560 // does not lie within the table bounds.
@@ -161,6 +156,7 @@ func (l *levelIter) init(
161156 l .err = nil
162157 l .level = level
163158 l .logger = opts .getLogger ()
159+ l .prefix = nil
164160 l .lower = opts .LowerBound
165161 l .upper = opts .UpperBound
166162 l .tableOpts .TableFilter = opts .TableFilter
@@ -629,6 +625,7 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV
629625
630626 l .err = nil // clear cached iteration error
631627 l .exhaustedDir = 0
628+ l .prefix = nil
632629 // NB: the top-level Iterator has already adjusted key based on
633630 // IterOptions.LowerBound.
634631 loadFileIndicator := l .loadFile (l .findFileGE (key , flags ), + 1 )
@@ -651,9 +648,9 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba
651648 if invariants .Enabled && l .lower != nil && l .cmp (key , l .lower ) < 0 {
652649 panic (errors .AssertionFailedf ("levelIter SeekGE to key %q violates lower bound %q" , key , l .lower ))
653650 }
654-
655651 l .err = nil // clear cached iteration error
656652 l .exhaustedDir = 0
653+ l .prefix = prefix
657654
658655 // NB: the top-level Iterator has already adjusted key based on
659656 // IterOptions.LowerBound.
@@ -673,19 +670,6 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba
673670 if err := l .iter .Error (); err != nil {
674671 return nil
675672 }
676- // It is possible that we are here because bloom filter matching failed. In
677- // that case it is likely that all keys matching the prefix are wholly
678- // within the current file and cannot be in the subsequent file. In that
679- // case we don't want to go to the next file, since loading and seeking in
680- // there has some cost. Additionally, for sparse key spaces, loading the
681- // next file will defeat the optimization for the next SeekPrefixGE that is
682- // called with flags.TrySeekUsingNext(), since for sparse key spaces it is
683- // likely that the next key will also be contained in the current file.
684- n := l .split (l .iterFile .LargestPointKey .UserKey )
685- if l .cmp (prefix , l .iterFile .LargestPointKey .UserKey [:n ]) < 0 {
686- l .exhaustedForward ()
687- return nil
688- }
689673 return l .verify (l .skipEmptyFileForward ())
690674}
691675
@@ -696,6 +680,7 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV
696680
697681 l .err = nil // clear cached iteration error
698682 l .exhaustedDir = 0
683+ l .prefix = nil
699684
700685 // NB: the top-level Iterator has already adjusted key based on
701686 // IterOptions.UpperBound.
@@ -716,6 +701,7 @@ func (l *levelIter) First() *base.InternalKV {
716701
717702 l .err = nil // clear cached iteration error
718703 l .exhaustedDir = 0
704+ l .prefix = nil
719705
720706 // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
721707 // set.
@@ -736,6 +722,7 @@ func (l *levelIter) Last() *base.InternalKV {
736722
737723 l .err = nil // clear cached iteration error
738724 l .exhaustedDir = 0
725+ l .prefix = nil
739726
740727 // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
741728 // set.
@@ -835,6 +822,27 @@ func (l *levelIter) skipEmptyFileForward() *base.InternalKV {
835822 l .exhaustedForward ()
836823 return nil
837824 }
825+
826+ // If the iterator is in prefix iteration mode, it's possible that we
827+ // are here because bloom filter matching failed. In that case it is
828+ // likely that all keys matching the prefix are wholly within the
829+ // current file and cannot be in a subsequent file. In that case we
830+ // don't want to go to the next file, since loading and seeking in there
831+ // has some cost.
832+ //
833+ // This is not just an optimization. We must not advance to the next
834+ // file if the current file might possibly contain keys relevant to any
835+ // prefix greater than our current iteration prefix. If we did, a
836+ // subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip
837+ // the file's relevant keys.
838+ if l .prefix != nil {
839+ fileLargestPrefix := l .iterFile .LargestPointKey .UserKey [:l .split (l .iterFile .LargestPointKey .UserKey )]
840+ if l .cmp (fileLargestPrefix , l .prefix ) > 0 {
841+ l .exhaustedForward ()
842+ return nil
843+ }
844+ }
845+
838846 // Current file was exhausted. Move to the next file.
839847 if l .loadFile (l .files .Next (), + 1 ) == noFileLoaded {
840848 l .exhaustedForward ()
0 commit comments