Skip to content

Commit a6d2952

Browse files
manifest: add range annotations
This change adds a "range annotation" feature to Annotators. A range annotation is a computation that aggregates some value over a specific key range within a level. Range annotations use the same B-tree caching behavior as regular annotations, so queries remain fast even with thousands of tables because they avoid a sequential iteration over a level's files. This PR only sets up range annotations without changing any existing behavior. See #3793 for some potential use cases.

`BenchmarkNumFilesRangeAnnotation` shows that range annotations are significantly faster than using `version.Overlaps` to aggregate over a key range:

```
pkg: github.com/cockroachdb/pebble/internal/manifest
BenchmarkNumFilesRangeAnnotation/annotator-10    306010      4015 ns/op     48 B/op    6 allocs/op
BenchmarkNumFilesRangeAnnotation/overlaps-10       2223    513519 ns/op    336 B/op    8 allocs/op
```
1 parent 2884026 commit a6d2952

File tree

7 files changed

+286
-79
lines changed

7 files changed

+286
-79
lines changed

internal/manifest/annotator.go

Lines changed: 110 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44

55
package manifest
66

7+
import (
8+
"sort"
9+
10+
"github.com/cockroachdb/pebble/internal/base"
11+
)
12+
713
// The Annotator type defined below is used by other packages to lazily
814
// compute a value over a B-Tree. Each node of the B-Tree stores one
915
// `annotation` per annotator, containing the result of the computation over
@@ -24,6 +30,10 @@ package manifest
2430
// computed incrementally as edits are applied to a level.
2531
type Annotator[T any] struct {
2632
Aggregator AnnotationAggregator[T]
33+
34+
// scratch is used to hold the aggregated annotation value when computing
35+
// range annotations in order to avoid additional allocations.
36+
scratch *T
2737
}
2838

2939
// An AnnotationAggregator defines how an annotation should be accumulated
@@ -116,6 +126,80 @@ func (a *Annotator[T]) nodeAnnotation(n *node) (_ *T, cacheOK bool) {
116126
return t, annot.valid
117127
}
118128

129+
// accumulateRangeAnnotation computes this annotator's annotation across all
130+
// files in the node's subtree which overlap with the range defined by bounds.
131+
// The computed annotation is accumulated into a.scratch.
132+
func (a *Annotator[T]) accumulateRangeAnnotation(
133+
n *node,
134+
cmp base.Compare,
135+
bounds base.UserKeyBounds,
136+
// fullyWithinLowerBound and fullyWithinUpperBound indicate whether this
137+
// node's subtree is already known to be within each bound.
138+
fullyWithinLowerBound bool,
139+
fullyWithinUpperBound bool,
140+
) {
141+
// If this node's subtree is fully within the bounds, compute a regular
142+
// annotation.
143+
if fullyWithinLowerBound && fullyWithinUpperBound {
144+
v, _ := a.nodeAnnotation(n)
145+
a.scratch = a.Aggregator.Merge(v, a.scratch)
146+
return
147+
}
148+
149+
// We will accumulate annotations from each item in the end-exclusive
150+
// range [leftItem, rightItem).
151+
leftItem, rightItem := 0, int(n.count)
152+
if !fullyWithinLowerBound {
153+
// leftItem is the index of the first item that overlaps the lower bound.
154+
leftItem = sort.Search(int(n.count), func(i int) bool {
155+
return cmp(bounds.Start, n.items[i].Largest.UserKey) <= 0
156+
})
157+
}
158+
if !fullyWithinUpperBound {
159+
// rightItem is the index of the first item that does not overlap the
160+
// upper bound.
161+
rightItem = sort.Search(int(n.count), func(i int) bool {
162+
return !bounds.End.IsUpperBoundFor(cmp, n.items[i].Smallest.UserKey)
163+
})
164+
}
165+
166+
// Accumulate annotations from every item that overlaps the bounds.
167+
for i := leftItem; i < rightItem; i++ {
168+
v, _ := a.Aggregator.Accumulate(n.items[i], a.scratch)
169+
a.scratch = v
170+
}
171+
172+
if !n.leaf {
173+
// We will accumulate annotations from each child in the end-inclusive
174+
// range [leftChild, rightChild].
175+
leftChild, rightChild := leftItem, rightItem
176+
// If the lower bound overlaps with the item at leftItem, there is no
177+
// need to accumulate annotations from the child to its left.
178+
if leftItem < int(n.count) && cmp(bounds.Start, n.items[leftItem].Smallest.UserKey) >= 0 {
179+
leftChild++
180+
}
181+
// If the upper bound spans beyond the item at rightItem, we must also
182+
// accumulate annotations from the child to its right.
183+
if rightItem < int(n.count) && bounds.End.IsUpperBoundFor(cmp, n.items[rightItem].Largest.UserKey) {
184+
rightChild++
185+
}
186+
187+
for i := leftChild; i <= rightChild; i++ {
188+
a.accumulateRangeAnnotation(
189+
n.children[i],
190+
cmp,
191+
bounds,
192+
// If this child is to the right of leftItem, then its entire
193+
// subtree is within the lower bound.
194+
fullyWithinLowerBound || i > leftItem,
195+
// If this child is to the left of rightItem, then its entire
196+
// subtree is within the upper bound.
197+
fullyWithinUpperBound || i < rightItem,
198+
)
199+
}
200+
}
201+
}
202+
119203
// InvalidateAnnotation removes any existing cached annotations from this
120204
// annotator from a node's subtree.
121205
func (a *Annotator[T]) invalidateNodeAnnotation(n *node) {
@@ -142,8 +226,8 @@ func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T {
142226
return v
143227
}
144228

145-
// LevelAnnotation calculates the annotation defined by this Annotator for all
146-
// files across the given levels. A pointer to the Annotator is used as the
229+
// MultiLevelAnnotation calculates the annotation defined by this Annotator for
230+
// all files across the given levels. A pointer to the Annotator is used as the
147231
// key for pre-calculated values, so the same Annotator must be used to avoid
148232
// duplicate computation. Annotation must not be called concurrently, and in
149233
// practice this is achieved by requiring callers to hold DB.mu.
@@ -158,6 +242,22 @@ func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T {
158242
return aggregated
159243
}
160244

245+
// LevelRangeAnnotation calculates the annotation defined by this Annotator for
246+
// the files within LevelMetadata which overlap the key range given by bounds
247+
// (end-inclusive or end-exclusive). A pointer to the Annotator is used as the key for
248+
// pre-calculated values, so the same Annotator must be used to avoid duplicate
249+
// computation. Annotation must not be called concurrently, and in practice this
250+
// is achieved by requiring callers to hold DB.mu.
251+
func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds base.UserKeyBounds) *T {
252+
if lm.Empty() {
253+
return a.Aggregator.Zero(nil)
254+
}
255+
256+
a.scratch = a.Aggregator.Zero(a.scratch)
257+
a.accumulateRangeAnnotation(lm.tree.root, lm.tree.cmp, bounds, false, false)
258+
return a.scratch
259+
}
260+
161261
// InvalidateAnnotation clears any cached annotations defined by Annotator. A
162262
// pointer to the Annotator is used as the key for pre-calculated values, so
163263
// the same Annotator must be used to clear the appropriate cached annotation.
@@ -206,6 +306,14 @@ func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *An
206306
}
207307
}
208308

309+
// NumFilesAnnotator is an Annotator which computes an annotation value
310+
// equal to the number of files included in the annotation. In particular, it
311+
// can be used to efficiently calculate the number of files in a given key
312+
// range using range annotations.
313+
var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
314+
return 1, true
315+
})
316+
209317
// PickFileAggregator implements the AnnotationAggregator interface. It defines
210318
// an aggregator that picks a single file from a set of eligible files.
211319
type PickFileAggregator struct {

internal/manifest/annotator_test.go

Lines changed: 127 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,54 +5,47 @@
55
package manifest
66

77
import (
8+
"math/rand"
89
"testing"
910

1011
"github.com/cockroachdb/pebble/internal/base"
1112
"github.com/stretchr/testify/require"
1213
)
1314

14-
func makeTestLevelMetadata(count int) (LevelMetadata, []*FileMetadata) {
15-
files := make([]*FileMetadata, count)
16-
for i := 0; i < count; i++ {
17-
files[i] = newItem(key(i))
15+
// makeTestVersion creates a version with numFiles files in level 6.
16+
func makeTestVersion(numFiles int) (*Version, []*FileMetadata) {
17+
files := make([]*FileMetadata, numFiles)
18+
for i := 0; i < numFiles; i++ {
19+
// Each file spans 10 keys, e.g. [0->9], [10->19], etc.
20+
files[i] = (&FileMetadata{}).ExtendPointKeyBounds(
21+
base.DefaultComparer.Compare, key(i*10), key(i*10+9),
22+
)
23+
files[i].InitPhysicalBacking()
1824
}
1925

20-
lm := MakeLevelMetadata(base.DefaultComparer.Compare, 6, files)
21-
return lm, files
22-
}
26+
var levelFiles [7][]*FileMetadata
27+
levelFiles[6] = files
2328

24-
// NumFilesAnnotator is an Annotator which computes an annotation value
25-
// equal to the number of files included in the annotation.
26-
var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
27-
return 1, true
28-
})
29+
v := NewVersion(base.DefaultComparer, 0, levelFiles)
30+
return v, files
31+
}
2932

3033
func TestNumFilesAnnotator(t *testing.T) {
3134
const count = 1000
32-
lm, _ := makeTestLevelMetadata(0)
35+
v, _ := makeTestVersion(0)
3336

3437
for i := 1; i <= count; i++ {
35-
lm.tree.Insert(newItem(key(i)))
36-
numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
38+
v.Levels[6].tree.Insert(newItem(key(i)))
39+
numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
3740
require.EqualValues(t, i, numFiles)
3841
}
39-
40-
numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
41-
require.EqualValues(t, count, numFiles)
42-
43-
numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
44-
require.EqualValues(t, count, numFiles)
45-
46-
lm.tree.Delete(newItem(key(count / 2)))
47-
numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
48-
require.EqualValues(t, count-1, numFiles)
4942
}
5043

5144
func BenchmarkNumFilesAnnotator(b *testing.B) {
52-
lm, _ := makeTestLevelMetadata(0)
45+
v, _ := makeTestVersion(0)
5346
for i := 1; i <= b.N; i++ {
54-
lm.tree.Insert(newItem(key(i)))
55-
numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
47+
v.Levels[6].tree.Insert(newItem(key(i)))
48+
numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
5649
require.EqualValues(b, uint64(i), numFiles)
5750
}
5851
}
@@ -70,12 +63,115 @@ func TestPickFileAggregator(t *testing.T) {
7063
},
7164
}
7265

73-
lm, files := makeTestLevelMetadata(1)
66+
v, files := makeTestVersion(1)
7467

7568
for i := 1; i <= count; i++ {
76-
lm.tree.Insert(newItem(key(i)))
77-
pickedFile := a.LevelAnnotation(lm)
69+
v.Levels[6].tree.Insert(newItem(key(i)))
70+
pickedFile := a.LevelAnnotation(v.Levels[6])
7871
// The picked file should always be the one with the smallest key.
7972
require.Same(t, files[0], pickedFile)
8073
}
8174
}
75+
76+
func bounds(i int, j int, exclusive bool) base.UserKeyBounds {
77+
b := base.UserKeyBoundsEndExclusiveIf(key(i).UserKey, key(j).UserKey, exclusive)
78+
return b
79+
}
80+
81+
func randomBounds(rng *rand.Rand, count int) base.UserKeyBounds {
82+
first := rng.Intn(count)
83+
second := rng.Intn(count)
84+
exclusive := rng.Intn(2) == 0
85+
return bounds(min(first, second), max(first, second), exclusive)
86+
}
87+
88+
func requireMatchOverlaps(t *testing.T, v *Version, bounds base.UserKeyBounds) {
89+
overlaps := v.Overlaps(6, bounds)
90+
numFiles := *NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], bounds)
91+
require.EqualValues(t, overlaps.length, numFiles)
92+
}
93+
94+
func TestNumFilesRangeAnnotationEmptyRanges(t *testing.T) {
95+
const count = 5_000
96+
v, files := makeTestVersion(count)
97+
98+
// Delete files containing key ranges [0, 999] and [24_000, 25_999].
99+
for i := 0; i < 100; i++ {
100+
v.Levels[6].tree.Delete(files[i])
101+
}
102+
for i := 2400; i < 2600; i++ {
103+
v.Levels[6].tree.Delete(files[i])
104+
}
105+
106+
// Ranges that are completely empty.
107+
requireMatchOverlaps(t, v, bounds(1, 999, false))
108+
requireMatchOverlaps(t, v, bounds(0, 1000, true))
109+
requireMatchOverlaps(t, v, bounds(50_000, 60_000, false))
110+
requireMatchOverlaps(t, v, bounds(24_500, 25_500, false))
111+
requireMatchOverlaps(t, v, bounds(24_000, 26_000, true))
112+
113+
// Partial overlaps with empty ranges.
114+
requireMatchOverlaps(t, v, bounds(0, 1000, false))
115+
requireMatchOverlaps(t, v, bounds(20, 1001, true))
116+
requireMatchOverlaps(t, v, bounds(20, 1010, true))
117+
requireMatchOverlaps(t, v, bounds(23_000, 27_000, true))
118+
requireMatchOverlaps(t, v, bounds(25_000, 40_000, false))
119+
requireMatchOverlaps(t, v, bounds(25_500, 26_001, true))
120+
121+
// Ranges which only span a single table.
122+
requireMatchOverlaps(t, v, bounds(45_000, 45_000, true))
123+
requireMatchOverlaps(t, v, bounds(30_000, 30_001, true))
124+
requireMatchOverlaps(t, v, bounds(23_000, 23_000, false))
125+
}
126+
127+
func TestNumFilesRangeAnnotationRandomized(t *testing.T) {
128+
const count = 10_000
129+
const numIterations = 10_000
130+
131+
v, _ := makeTestVersion(count)
132+
133+
rng := rand.New(rand.NewSource(int64(0)))
134+
for i := 0; i < numIterations; i++ {
135+
requireMatchOverlaps(t, v, randomBounds(rng, count*11))
136+
}
137+
}
138+
139+
func BenchmarkNumFilesRangeAnnotation(b *testing.B) {
140+
const count = 100_000
141+
v, files := makeTestVersion(count)
142+
143+
rng := rand.New(rand.NewSource(int64(0)))
144+
b.Run("annotator", func(b *testing.B) {
145+
for i := 0; i < b.N; i++ {
146+
b := randomBounds(rng, count*11)
147+
// Randomly delete and reinsert a file to verify that range
148+
// annotations are still fast despite small mutations.
149+
toDelete := rng.Intn(count)
150+
v.Levels[6].tree.Delete(files[toDelete])
151+
152+
NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], b)
153+
154+
v.Levels[6].tree.Insert(files[toDelete])
155+
}
156+
})
157+
158+
// Also benchmark an equivalent aggregation using version.Overlaps to show
159+
// the difference in performance.
160+
b.Run("overlaps", func(b *testing.B) {
161+
for i := 0; i < b.N; i++ {
162+
b := randomBounds(rng, count*11)
163+
toDelete := rng.Intn(count)
164+
v.Levels[6].tree.Delete(files[toDelete])
165+
166+
overlaps := v.Overlaps(6, b)
167+
iter := overlaps.Iter()
168+
numFiles := 0
169+
for f := iter.First(); f != nil; f = iter.Next() {
170+
numFiles++
171+
}
172+
173+
v.Levels[6].tree.Insert(files[toDelete])
174+
}
175+
})
176+
177+
}

0 commit comments

Comments
 (0)