Skip to content

Commit 232b685

Browse files
committed
sstable: new property collector API
We simplify the property collector API: an instance now only needs to maintain a single state, not three (data block, index, table). The sstable writer uses multiple separate instances in parallel. The instance for index blocks uses a new `AddCollected` API which adds computed properties from the data blocks. Similarly, the table properties are calculated in a separate instance using `AddCollected` with the properties from the index blocks and the range key block. This new interface will allow re-deriving index and table properties for a partial file download which copies whole data blocks. In the future, this would also allow copying entire data blocks during compactions.
1 parent 89e5644 commit 232b685

File tree

8 files changed

+282
-455
lines changed

8 files changed

+282
-455
lines changed

iterator_test.go

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -255,22 +255,23 @@ func (c *minSeqNumPropertyCollector) AddRangeKeys(span sstable.Span) error {
255255
return nil
256256
}
257257

258-
func (c *minSeqNumPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) {
259-
return nil, nil
258+
func (c *minSeqNumPropertyCollector) Finish(buf []byte) []byte {
259+
return binary.AppendUvarint(buf, uint64(c.minSeqNum))
260260
}
261261

262-
func (c *minSeqNumPropertyCollector) AddPrevDataBlockToIndexBlock() {}
263-
264-
func (c *minSeqNumPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
265-
return nil, nil
266-
}
267-
268-
func (c *minSeqNumPropertyCollector) FinishTable(buf []byte) ([]byte, error) {
269-
return binary.AppendUvarint(buf, uint64(c.minSeqNum)), nil
262+
func (c *minSeqNumPropertyCollector) AddCollected(prop []byte) error {
263+
res, n := binary.Uvarint(prop)
264+
if n <= 0 {
265+
panic("could not decode")
266+
}
267+
if c.minSeqNum == 0 || c.minSeqNum > base.SeqNum(res) {
268+
c.minSeqNum = base.SeqNum(res)
269+
}
270+
return nil
270271
}
271272

272273
func (c *minSeqNumPropertyCollector) AddCollectedWithSuffixReplacement(
273-
oldProp []byte, oldSuffix, newSuffix []byte,
274+
oldProp []byte, newSuffix []byte,
274275
) error {
275276
return errors.Errorf("not implemented")
276277
}

sstable/block_property.go

Lines changed: 61 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -11,23 +11,26 @@ import (
1111
"sync"
1212
"unsafe"
1313

14+
"github.com/cockroachdb/errors"
1415
"github.com/cockroachdb/pebble/internal/base"
1516
"github.com/cockroachdb/pebble/internal/invariants"
1617
"github.com/cockroachdb/pebble/internal/keyspan"
1718
)
1819

1920
// Block properties are an optional user-facing feature that can be used to
20-
// filter data blocks (and whole sstables) from an Iterator before they are
21-
// loaded. They do not apply to range delete blocks. These are expected to
22-
// very concisely represent a set of some attribute value contained within the
23-
// key or value, such that the set includes all the attribute values in the
24-
// block. This has some similarities with OLAP pruning approaches that
25-
// maintain min-max attribute values for some column (which concisely
26-
// represent a set), that is then used to prune at query time. In Pebble's
27-
// case, data blocks are small, typically 25-50KB, so these properties should
28-
// reduce their precision in order to be concise -- a good rule of thumb is to
29-
// not consume more than 50-100 bytes across all properties maintained for a
30-
// block, i.e., a 500x reduction compared to loading the data block.
21+
// filter data blocks or index blocks or whole sstables from an Iterator before
22+
// they are loaded.
23+
//
24+
// Block properties are expected to very concisely represent a set of some
25+
// attribute value contained within the key or value, such that the set includes
26+
// all the attribute values in the block. This has some similarities with OLAP
27+
// pruning approaches that maintain min-max attribute values for some column
28+
// (which concisely represent a set), that is then used to prune at query time.
29+
// In Pebble's case, data blocks are small, typically 25-50KB, so these
30+
// properties should reduce their precision in order to be concise -- a good
31+
// rule of thumb is to not consume more than 50-100 bytes across all properties
32+
// maintained for a block, i.e., a 500x reduction compared to loading the data
33+
// block.
3134
//
3235
// A block property must be assigned a unique name, which is encoded and
3336
// stored in the sstable. This name must be unique among all user-properties
@@ -37,17 +40,12 @@ import (
3740
// considered semantically identical. The caller is free to choose the
3841
// semantics of an empty byte slice e.g. they could use it to represent the
3942
// empty set or the universal set, whichever they think is more common and
40-
// therefore better to encode more concisely. The serialization of the
41-
// property for the various Finish*() calls in a BlockPropertyCollector
42-
// implementation should be identical, since the corresponding
43-
// BlockPropertyFilter implementation is not told the context in which it is
44-
// deserializing the property.
43+
// therefore better to encode more concisely.
4544
//
46-
// Block properties are more general than table properties and should be
47-
// preferred over using table properties. A BlockPropertyCollector can achieve
48-
// identical behavior to table properties by returning the nil slice from
49-
// FinishDataBlock and FinishIndexBlock, and interpret them as the universal
50-
// set in BlockPropertyFilter, and return a non-universal set in FinishTable.
45+
// Block properties are hierarchical: the properties for an index block must be
46+
// derivable just from the properties of the data blocks it contains. Similarly,
47+
// the table properties must be derivable just from the properties of the index
48+
// blocks and the range key block.
5149
//
5250
// Block property filtering is nondeterministic because the separation of keys
5351
// into blocks is nondeterministic. Clients use block-property filters to
@@ -83,19 +81,10 @@ import (
8381
// compactions. If Pebble is configured with such value separation, block
8482
// properties must only apply to the key, and will be provided a nil value.
8583

86-
// BlockPropertyCollector is used when writing a sstable.
87-
//
88-
// - All calls to Add are included in the next FinishDataBlock, after which
89-
// the next data block is expected to start.
90-
//
91-
// - The index entry generated for the data block, which contains the return
92-
// value from FinishDataBlock, is not immediately included in the current
93-
// index block. It is included when AddPrevDataBlockToIndexBlock is called.
94-
// An alternative would be to return an opaque handle from FinishDataBlock
95-
// and pass it to a new AddToIndexBlock method, which requires more
96-
// plumbing, and passing of an interface{} results in a undesirable heap
97-
// allocation. AddPrevDataBlockToIndexBlock must be called before keys are
98-
// added to the new data block.
84+
// BlockPropertyCollector is used when writing a sstable. Multiple
85+
// BlockPropertyCollector instances are used for each property, according to the
86+
// various levels (data/range blocks, index blocks, table). The lowest levels
87+
// use AddPointKey()/AddRangeKeys() while the other levels use AddCollected().
9988
type BlockPropertyCollector interface {
10089
// Name returns the name of the block property collector.
10190
Name() string
@@ -104,16 +93,20 @@ type BlockPropertyCollector interface {
10493
// sstable. The callee can assume that these are in sorted order.
10594
AddPointKey(key InternalKey, value []byte) error
10695

107-
// AddRangeKeys is called for each range span added to the sstable. The range
108-
// key properties are stored separately and don't contribute to data block
109-
// properties. They are only used when FinishTable is called.
110-
// TODO(radu): clean up this subtle semantic.
96+
// AddRangeKeys is called for each range span added to a range key block in
97+
// the sstable. The callee can assume these are fragmented and in sorted
98+
// order.
11199
AddRangeKeys(span keyspan.Span) error
112100

101+
// AddCollected adds previously collected property data. For example, when
102+
// calculating properties for index blocks, AddCollected is called with the
103+
// results of Finish for each data block.
104+
AddCollected(prop []byte) error
105+
113106
// AddCollectedWithSuffixReplacement adds previously collected property data
114-
// and updates it to reflect a change of suffix on all keys: the old property
115-
// data is assumed to be constructed from keys that all have the same
116-
// oldSuffix and is recalculated to reflect the same keys but with newSuffix.
107+
// after updating to reflect a change of suffix on all keys: the property data
108+
// is recalculated to reflect the same keys it was computed from but with
109+
// newSuffix.
117110
//
118111
// A collector which supports this method must be able to derive its updated
119112
// value from its old value and the change being made to the suffix, without
@@ -129,29 +122,15 @@ type BlockPropertyCollector interface {
129122
// This method is optional (if it is not implemented, it always returns an
130123
// error). SupportsSuffixReplacement() can be used to check if this method is
131124
// implemented.
132-
AddCollectedWithSuffixReplacement(oldProp []byte, oldSuffix, newSuffix []byte) error
125+
AddCollectedWithSuffixReplacement(oldProp []byte, newSuffix []byte) error
133126

134127
// SupportsSuffixReplacement returns whether the collector supports the
135128
// AddCollectedWithSuffixReplacement method.
136129
SupportsSuffixReplacement() bool
137130

138-
// FinishDataBlock is called when all the entries have been added to a
139-
// data block. Subsequent Add calls will be for the next data block. It
140-
// returns the property value for the finished block.
141-
FinishDataBlock(buf []byte) ([]byte, error)
142-
143-
// AddPrevDataBlockToIndexBlock adds the entry corresponding to the
144-
// previous FinishDataBlock to the current index block.
145-
AddPrevDataBlockToIndexBlock()
146-
147-
// FinishIndexBlock is called when an index block, containing all the
148-
// key-value pairs since the last FinishIndexBlock, will no longer see new
149-
// entries. It returns the property value for the index block.
150-
FinishIndexBlock(buf []byte) ([]byte, error)
151-
152-
// FinishTable is called when the sstable is finished, and returns the
153-
// property value for the sstable.
154-
FinishTable(buf []byte) ([]byte, error)
131+
// Finish appends the property value to buf and resets the collector to an
132+
// empty state.
133+
Finish(buf []byte) []byte
155134
}
156135

157136
// BlockPropertyFilter is used in an Iterator to filter sstables and blocks
@@ -233,9 +212,7 @@ type BlockIntervalCollector struct {
233212
mapper IntervalMapper
234213
suffixReplacer BlockIntervalSuffixReplacer
235214

236-
blockInterval BlockInterval
237-
indexInterval BlockInterval
238-
tableInterval BlockInterval
215+
interval BlockInterval
239216
}
240217

241218
var _ BlockPropertyCollector = &BlockIntervalCollector{}
@@ -291,7 +268,7 @@ func (b *BlockIntervalCollector) AddPointKey(key InternalKey, value []byte) erro
291268
if err != nil {
292269
return err
293270
}
294-
b.blockInterval.UnionWith(interval)
271+
b.interval.UnionWith(interval)
295272
return nil
296273
}
297274

@@ -306,14 +283,27 @@ func (b *BlockIntervalCollector) AddRangeKeys(span Span) error {
306283
}
307284
// Range keys are unioned into this instance's single interval; the sstable
308285
// writer is expected to use a dedicated instance for the range key block.
309-
b.tableInterval.UnionWith(interval)
286+
b.interval.UnionWith(interval)
287+
return nil
288+
}
289+
290+
// AddCollected is part of the BlockPropertyCollector interface.
291+
func (b *BlockIntervalCollector) AddCollected(prop []byte) error {
292+
i, err := decodeBlockInterval(prop)
293+
if err != nil {
294+
return err
295+
}
296+
b.interval.UnionWith(i)
310297
return nil
311298
}
312299

313300
// AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface.
314301
func (b *BlockIntervalCollector) AddCollectedWithSuffixReplacement(
315-
oldProp []byte, oldSuffix, newSuffix []byte,
302+
oldProp []byte, newSuffix []byte,
316303
) error {
304+
if b.suffixReplacer == nil {
305+
return errors.Errorf("%s does not support suffix replacement", b.name)
306+
}
317307
i, err := decodeBlockInterval(oldProp)
318308
if err != nil {
319309
return err
@@ -322,7 +312,7 @@ func (b *BlockIntervalCollector) AddCollectedWithSuffixReplacement(
322312
if err != nil {
323313
return err
324314
}
325-
b.blockInterval.UnionWith(i)
315+
b.interval.UnionWith(i)
326316
return nil
327317
}
328318

@@ -331,30 +321,11 @@ func (b *BlockIntervalCollector) SupportsSuffixReplacement() bool {
331321
return b.suffixReplacer != nil
332322
}
333323

334-
// FinishDataBlock is part of the BlockPropertyCollector interface.
335-
func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
336-
buf = encodeBlockInterval(b.blockInterval, buf)
337-
b.tableInterval.UnionWith(b.blockInterval)
338-
return buf, nil
339-
}
340-
341-
// AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
342-
// interface.
343-
func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
344-
b.indexInterval.UnionWith(b.blockInterval)
345-
b.blockInterval = BlockInterval{}
346-
}
347-
348-
// FinishIndexBlock implements the BlockPropertyCollector interface.
349-
func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
350-
buf = encodeBlockInterval(b.indexInterval, buf)
351-
b.indexInterval = BlockInterval{}
352-
return buf, nil
353-
}
354-
355-
// FinishTable implements the BlockPropertyCollector interface.
356-
func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
357-
return encodeBlockInterval(b.tableInterval, buf), nil
324+
// Finish is part of the BlockPropertyCollector interface.
325+
func (b *BlockIntervalCollector) Finish(buf []byte) []byte {
326+
result := encodeBlockInterval(b.interval, buf)
327+
b.interval = BlockInterval{}
328+
return result
358329
}
359330

360331
// BlockInterval represents the [Lower, Upper) interval of 64-bit values

sstable/block_property_obsolete.go

Lines changed: 17 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,7 @@ import (
1616
// For an explanation of obsolete keys, see the comment for TableFormatPebblev4
1717
// which explains obsolete keys.
1818
type obsoleteKeyBlockPropertyCollector struct {
19-
blockIsNonObsolete bool
20-
indexIsNonObsolete bool
21-
tableIsNonObsolete bool
19+
hasNonObsoletePoint bool
2220
}
2321

2422
var _ BlockPropertyCollector = (*obsoleteKeyBlockPropertyCollector)(nil)
@@ -42,44 +40,37 @@ func (o *obsoleteKeyBlockPropertyCollector) AddRangeKeys(span Span) error {
4240

4341
// AddPoint is an out-of-band method that is specific to this collector.
4442
func (o *obsoleteKeyBlockPropertyCollector) AddPoint(isObsolete bool) {
45-
o.blockIsNonObsolete = o.blockIsNonObsolete || !isObsolete
43+
o.hasNonObsoletePoint = o.hasNonObsoletePoint || !isObsolete
4644
}
4745

48-
// FinishDataBlock is part of the BlockPropertyCollector interface.
49-
func (o *obsoleteKeyBlockPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) {
50-
o.tableIsNonObsolete = o.tableIsNonObsolete || o.blockIsNonObsolete
51-
return obsoleteKeyBlockPropertyEncode(!o.blockIsNonObsolete, buf), nil
46+
// Finish is part of the BlockPropertyCollector interface.
47+
func (o *obsoleteKeyBlockPropertyCollector) Finish(buf []byte) []byte {
48+
res := obsoleteKeyBlockPropertyEncode(!o.hasNonObsoletePoint, buf)
49+
o.hasNonObsoletePoint = false
50+
return res
5251
}
5352

54-
// AddPrevDataBlockToIndexBlock is part of the BlockPropertyCollector interface.
55-
func (o *obsoleteKeyBlockPropertyCollector) AddPrevDataBlockToIndexBlock() {
56-
o.indexIsNonObsolete = o.indexIsNonObsolete || o.blockIsNonObsolete
57-
o.blockIsNonObsolete = false
58-
}
59-
60-
// FinishIndexBlock is part of the BlockPropertyCollector interface.
61-
func (o *obsoleteKeyBlockPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
62-
buf = obsoleteKeyBlockPropertyEncode(!o.indexIsNonObsolete, buf)
63-
o.indexIsNonObsolete = false
64-
return buf, nil
65-
}
66-
67-
// FinishTable is part of the BlockPropertyCollector interface.
68-
func (o *obsoleteKeyBlockPropertyCollector) FinishTable(buf []byte) ([]byte, error) {
69-
return obsoleteKeyBlockPropertyEncode(!o.tableIsNonObsolete, buf), nil
53+
// AddCollected is part of the BlockPropertyCollector interface.
54+
func (o *obsoleteKeyBlockPropertyCollector) AddCollected(oldProp []byte) error {
55+
isObsolete, err := obsoleteKeyBlockPropertyDecode(oldProp)
56+
if err != nil {
57+
return err
58+
}
59+
o.hasNonObsoletePoint = o.hasNonObsoletePoint || !isObsolete
60+
return nil
7061
}
7162

7263
// AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface.
7364
func (o *obsoleteKeyBlockPropertyCollector) AddCollectedWithSuffixReplacement(
74-
oldProp []byte, oldSuffix, newSuffix []byte,
65+
oldProp []byte, newSuffix []byte,
7566
) error {
7667
// Verify the property is valid.
7768
_, err := obsoleteKeyBlockPropertyDecode(oldProp)
7869
if err != nil {
7970
return err
8071
}
8172
// Suffix rewriting currently loses the obsolete bit.
82-
o.blockIsNonObsolete = true
73+
o.hasNonObsoletePoint = true
8374
return nil
8475
}
8576

0 commit comments

Comments
 (0)