Skip to content

Commit 5242eb1

Browse files
committed
colblk: use key value columnar block for properties
This patch uses the new, drop-in columnar block to replace the row format of properties blocks. Fixes: #4425
1 parent 825c04c commit 5242eb1

24 files changed

+406
-240
lines changed

format_major_version.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,9 @@ const (
208208
// format TableFormatPebblev6.
209209
//
210210
// The TableFormatPebblev6 sstable format introduces a checksum within the
211-
// sstable footer, and allows inclusion of blob handle references within the
212-
// value column of a sstable block.
211+
// sstable footer, allows inclusion of blob handle references within the
212+
// value column of a sstable block, and supports columnar meta index +
213+
// properties blocks.
213214
//
214215
// This format major version does not yet enable use of value separation.
215216
FormatTableFormatV6

sstable/colblk/key_value_block_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,29 @@ func TestMetaIndexBlock(t *testing.T) {
4848
}
4949
})
5050
}
51+
52+
func TestPropertiesBlock(t *testing.T) {
53+
var decoder KeyValueBlockDecoder
54+
var buf bytes.Buffer
55+
datadriven.RunTest(t, "testdata/properties_block", func(t *testing.T, d *datadriven.TestData) string {
56+
buf.Reset()
57+
switch d.Cmd {
58+
case "build":
59+
var w KeyValueBlockWriter
60+
w.Init()
61+
for _, line := range strings.Split(d.Input, "\n") {
62+
fields := strings.Fields(line)
63+
key := []byte(fields[0])
64+
value := []byte(fields[1])
65+
w.AddKV(key, value)
66+
}
67+
68+
data := w.Finish(w.Rows())
69+
decoder.Init(data)
70+
fmt.Fprint(&buf, decoder.DebugString())
71+
return buf.String()
72+
default:
73+
return fmt.Sprintf("unknown command: %s", d.Cmd)
74+
}
75+
})
76+
}
sstable/colblk/testdata/properties_block

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
build
2+
rocksdb.deleted.keys 14
3+
rocksdb.filter.size 2
4+
rocksdb.index.size 3
5+
rocksdb.num.data.blocks 8
6+
----
7+
key-value-block-decoder
8+
└── key value block header
9+
├── columnar block header
10+
│ ├── 000-001: x 01 # version 1
11+
│ ├── 001-003: x 0200 # 2 columns
12+
│ ├── 003-007: x 04000000 # 4 rows
13+
│ ├── 007-008: b 00000011 # col 0: bytes
14+
│ ├── 008-012: x 11000000 # col 0: page start 17
15+
│ ├── 012-013: b 00000011 # col 1: bytes
16+
│ └── 013-017: x 67000000 # col 1: page start 103
17+
├── data for column 0 (bytes)
18+
│ ├── offsets table
19+
│ │ ├── 017-018: x 01 # encoding: 1b
20+
│ │ ├── 018-019: x 00 # data[0] = 0 [23 overall]
21+
│ │ ├── 019-020: x 14 # data[1] = 20 [43 overall]
22+
│ │ ├── 020-021: x 27 # data[2] = 39 [62 overall]
23+
│ │ ├── 021-022: x 39 # data[3] = 57 [80 overall]
24+
│ │ └── 022-023: x 50 # data[4] = 80 [103 overall]
25+
│ └── data
26+
│ ├── 023-033: x 726f636b7364622e6465 # data[0]: rocksdb.deleted.keys
27+
│ ├── 033-043: x 6c657465642e6b657973 # (continued...)
28+
│ ├── 043-053: x 726f636b7364622e6669 # data[1]: rocksdb.filter.size
29+
│ ├── 053-062: x 6c7465722e73697a65 # (continued...)
30+
│ ├── 062-072: x 726f636b7364622e696e # data[2]: rocksdb.index.size
31+
│ ├── 072-080: x 6465782e73697a65 # (continued...)
32+
│ ├── 080-090: x 726f636b7364622e6e75 # data[3]: rocksdb.num.data.blocks
33+
│ ├── 090-100: x 6d2e646174612e626c6f # (continued...)
34+
│ └── 100-103: x 636b73 # (continued...)
35+
├── data for column 1 (bytes)
36+
│ ├── offsets table
37+
│ │ ├── 103-104: x 01 # encoding: 1b
38+
│ │ ├── 104-105: x 00 # data[0] = 0 [109 overall]
39+
│ │ ├── 105-106: x 02 # data[1] = 2 [111 overall]
40+
│ │ ├── 106-107: x 03 # data[2] = 3 [112 overall]
41+
│ │ ├── 107-108: x 04 # data[3] = 4 [113 overall]
42+
│ │ └── 108-109: x 05 # data[4] = 5 [114 overall]
43+
│ └── data
44+
│ ├── 109-111: x 3134 # data[0]: 14
45+
│ ├── 111-112: x 32 # data[1]: 2
46+
│ ├── 112-113: x 33 # data[2]: 3
47+
│ └── 113-114: x 38 # data[3]: 8
48+
└── 114-115: x 00 # block padding byte

sstable/colblk_writer.go

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,16 +1011,25 @@ func (w *RawColumnWriter) Close() (err error) {
10111011
}
10121012
}
10131013

1014-
var raw rowblk.Writer
1015-
// The restart interval is set to infinity because the properties block
1016-
// is always read sequentially and cached in a heap located object. This
1017-
// reduces table size without a significant impact on performance.
1018-
raw.RestartInterval = propertiesBlockRestartInterval
1014+
var toWrite []byte
10191015
w.props.CompressionOptions = rocksDBCompressionOptions
1020-
if err := w.props.save(w.opts.TableFormat, &raw); err != nil {
1021-
return err
1016+
if w.opts.TableFormat >= TableFormatPebblev6 {
1017+
var cw colblk.KeyValueBlockWriter
1018+
cw.Init()
1019+
w.props.saveToColWriter(w.opts.TableFormat, &cw)
1020+
toWrite = cw.Finish(cw.Rows())
1021+
} else {
1022+
var raw rowblk.Writer
1023+
// The restart interval is set to infinity because the properties block
1024+
// is always read sequentially and cached in a heap located object. This
1025+
// reduces table size without a significant impact on performance.
1026+
raw.RestartInterval = propertiesBlockRestartInterval
1027+
if err = w.props.saveToRowWriter(w.opts.TableFormat, &raw); err != nil {
1028+
return err
1029+
}
1030+
toWrite = raw.Finish()
10221031
}
1023-
if _, err := w.layout.WritePropertiesBlock(raw.Finish()); err != nil {
1032+
if _, err = w.layout.WritePropertiesBlock(toWrite); err != nil {
10241033
return err
10251034
}
10261035
}

sstable/format.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const (
2929
TableFormatPebblev3 // Value blocks.
3030
TableFormatPebblev4 // DELSIZED tombstones.
3131
TableFormatPebblev5 // Columnar blocks.
32-
TableFormatPebblev6 // Checksum footer + blob value handles + columnar metaindex.
32+
TableFormatPebblev6 // Checksum footer + blob value handles + columnar metaindex/properties.
3333
NumTableFormats
3434

3535
TableFormatMax = NumTableFormats - 1

sstable/layout.go

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -254,10 +254,23 @@ func (l *Layout) Describe(
254254
if err != nil {
255255
return err
256256
}
257-
iter, _ := rowblk.NewRawIter(r.Comparer.Compare, h.BlockData())
258-
iter.Describe(tpNode, func(w io.Writer, key *base.InternalKey, value []byte, enc rowblk.KVEncoding) {
259-
fmt.Fprintf(w, "%05d %s (%d)", enc.Offset, key.UserKey, enc.Length)
260-
})
257+
if r.tableFormat >= TableFormatPebblev6 {
258+
var decoder colblk.KeyValueBlockDecoder
259+
decoder.Init(h.BlockData())
260+
offset := 0
261+
for i := 0; i < decoder.BlockDecoder().Rows(); i++ {
262+
key := decoder.KeyAt(i)
263+
value := decoder.ValueAt(i)
264+
length := len(key) + len(value)
265+
tpNode.Childf("%05d %s (%d)", offset, key, length)
266+
offset += length
267+
}
268+
} else {
269+
iter, _ := rowblk.NewRawIter(r.Comparer.Compare, h.BlockData())
270+
iter.Describe(tpNode, func(w io.Writer, key *base.InternalKey, value []byte, enc rowblk.KVEncoding) {
271+
fmt.Fprintf(w, "%05d %s (%d)", enc.Offset, key.UserKey, enc.Length)
272+
})
273+
}
261274

262275
case "meta-index":
263276
if b.Handle != r.metaindexBH {
@@ -286,18 +299,16 @@ func (l *Layout) Describe(
286299
bh, n = block.DecodeHandle(value)
287300
}
288301
if n == 0 || n != len(value) {
289-
s := fmt.Sprintf("%04d [err: %s]\n", i, err)
290-
tpNode.Child(s)
302+
tpNode.Childf("%04d [err: %s]\n", i, err)
291303
continue
292304
}
293305
var vbihStr string
294306
if isValueBlocksIndexHandle {
295307
vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)",
296308
vbih.BlockNumByteLength, vbih.BlockOffsetByteLength, vbih.BlockLengthByteLength)
297309
}
298-
s := fmt.Sprintf("%04d %s block:%d/%d%s\n",
310+
tpNode.Childf("%04d %s block:%d/%d%s\n",
299311
i, key, bh.Offset, bh.Length, vbihStr)
300-
tpNode.Child(s)
301312
}
302313
} else {
303314
iter, _ := rowblk.NewRawIter(r.Comparer.Compare, h.BlockData())
@@ -562,8 +573,14 @@ func decodeLayout(comparer *base.Comparer, data []byte, tableFormat TableFormat)
562573
if err != nil {
563574
return Layout{}, errors.Wrap(err, "decompressing properties")
564575
}
565-
if err := props.load(decompressedProps, map[string]struct{}{}); err != nil {
566-
return Layout{}, err
576+
if tableFormat >= TableFormatPebblev6 {
577+
if err = props.load(decompressedProps, map[string]struct{}{}); err != nil {
578+
return Layout{}, err
579+
}
580+
} else {
581+
if err = props.loadFromRowBlock(decompressedProps, map[string]struct{}{}); err != nil {
582+
return Layout{}, err
583+
}
567584
}
568585

569586
if props.IndexType == twoLevelIndex {
@@ -816,33 +833,38 @@ func (w *layoutWriter) WriteFilterBlock(f filterWriter) (bh block.Handle, err er
816833
if err != nil {
817834
return block.Handle{}, err
818835
}
819-
return w.writeNamedBlock(b, f.metaName())
836+
return w.writeNamedBlock(b, block.NoCompression, f.metaName())
820837
}
821838

822839
// WritePropertiesBlock constructs a trailer for the provided properties block
823840
// and writes the block and trailer to the writer. It automatically adds the
824841
// properties block to the file's meta index when the writer is finished.
825842
func (w *layoutWriter) WritePropertiesBlock(b []byte) (block.Handle, error) {
826-
return w.writeNamedBlock(b, metaPropertiesName)
843+
if w.tableFormat >= TableFormatPebblev6 {
844+
return w.writeNamedBlock(b, w.compression, metaPropertiesName)
845+
}
846+
return w.writeNamedBlock(b, block.NoCompression, metaPropertiesName)
827847
}
828848

829849
// WriteRangeKeyBlock constructs a trailer for the provided range key block and
830850
// writes the block and trailer to the writer. It automatically adds the range
831851
// key block to the file's meta index when the writer is finished.
832852
func (w *layoutWriter) WriteRangeKeyBlock(b []byte) (block.Handle, error) {
833-
return w.writeNamedBlock(b, metaRangeKeyName)
853+
return w.writeNamedBlock(b, block.NoCompression, metaRangeKeyName)
834854
}
835855

836856
// WriteRangeDeletionBlock constructs a trailer for the provided range deletion
837857
// block and writes the block and trailer to the writer. It automatically adds
838858
// the range deletion block to the file's meta index when the writer is
839859
// finished.
840860
func (w *layoutWriter) WriteRangeDeletionBlock(b []byte) (block.Handle, error) {
841-
return w.writeNamedBlock(b, metaRangeDelV2Name)
861+
return w.writeNamedBlock(b, block.NoCompression, metaRangeDelV2Name)
842862
}
843863

844-
func (w *layoutWriter) writeNamedBlock(b []byte, name string) (bh block.Handle, err error) {
845-
bh, err = w.writeBlock(b, block.NoCompression, &w.buf)
864+
func (w *layoutWriter) writeNamedBlock(
865+
b []byte, compression block.Compression, name string,
866+
) (bh block.Handle, err error) {
867+
bh, err = w.writeBlock(b, compression, &w.buf)
846868
if err == nil {
847869
w.recordToMetaindex(name, bh)
848870
}

sstable/properties.go

Lines changed: 66 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,16 @@ import (
88
"bytes"
99
"encoding/binary"
1010
"fmt"
11+
"maps"
1112
"math"
1213
"reflect"
14+
"slices"
1315
"sort"
1416
"strings"
1517
"unsafe"
1618

1719
"github.com/cockroachdb/pebble/internal/intern"
20+
"github.com/cockroachdb/pebble/sstable/colblk"
1821
"github.com/cockroachdb/pebble/sstable/rowblk"
1922
)
2023

@@ -271,7 +274,7 @@ func (p *Properties) String() string {
271274
return buf.String()
272275
}
273276

274-
func (p *Properties) load(b []byte, deniedUserProperties map[string]struct{}) error {
277+
func (p *Properties) loadFromRowBlock(b []byte, deniedUserProperties map[string]struct{}) error {
275278
i, err := rowblk.NewRawIter(bytes.Compare, b)
276279
if err != nil {
277280
return err
@@ -309,6 +312,44 @@ func (p *Properties) load(b []byte, deniedUserProperties map[string]struct{}) er
309312
return nil
310313
}
311314

315+
func (p *Properties) load(b []byte, deniedUserProperties map[string]struct{}) error {
316+
var decoder colblk.KeyValueBlockDecoder
317+
decoder.Init(b)
318+
p.Loaded = make(map[uintptr]struct{})
319+
v := reflect.ValueOf(p).Elem()
320+
321+
for i := 0; i < decoder.BlockDecoder().Rows(); i++ {
322+
key := decoder.KeyAt(i)
323+
value := decoder.ValueAt(i)
324+
if f, ok := propTagMap[string(key)]; ok {
325+
p.Loaded[f.Offset] = struct{}{}
326+
field := v.FieldByIndex(f.Index)
327+
switch f.Type.Kind() {
328+
case reflect.Bool:
329+
field.SetBool(bytes.Equal(value, propBoolTrue))
330+
case reflect.Uint32:
331+
field.SetUint(uint64(binary.LittleEndian.Uint32(value)))
332+
case reflect.Uint64:
333+
n, _ := binary.Uvarint(value)
334+
field.SetUint(n)
335+
case reflect.String:
336+
field.SetString(intern.Bytes(value))
337+
default:
338+
panic("not reached")
339+
}
340+
continue
341+
}
342+
if p.UserProperties == nil {
343+
p.UserProperties = make(map[string]string)
344+
}
345+
346+
if _, denied := deniedUserProperties[string(key)]; !denied {
347+
p.UserProperties[intern.Bytes(key)] = string(value)
348+
}
349+
}
350+
return nil
351+
}
352+
312353
func (p *Properties) saveBool(m map[string][]byte, offset uintptr, value bool) {
313354
tag := propOffsetTagMap[offset]
314355
if value {
@@ -342,7 +383,7 @@ func (p *Properties) saveString(m map[string][]byte, offset uintptr, value strin
342383
m[propOffsetTagMap[offset]] = []byte(value)
343384
}
344385

345-
func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) error {
386+
func (p *Properties) accumulateProps(tblFormat TableFormat) ([]string, map[string][]byte) {
346387
m := make(map[string][]byte)
347388
for k, v := range p.UserProperties {
348389
m[k] = []byte(v)
@@ -438,11 +479,14 @@ func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) error {
438479
m["rocksdb.format.version"] = singleZeroSlice
439480
}
440481

441-
keys := make([]string, 0, len(m))
442-
for key := range m {
443-
keys = append(keys, key)
444-
}
482+
keys := slices.Collect(maps.Keys(m))
445483
sort.Strings(keys)
484+
485+
return keys, m
486+
}
487+
488+
func (p *Properties) saveToRowWriter(tblFormat TableFormat, w *rowblk.Writer) error {
489+
keys, m := p.accumulateProps(tblFormat)
446490
for _, key := range keys {
447491
if err := w.AddRawString(key, m[key]); err != nil {
448492
return err
@@ -451,6 +495,22 @@ func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) error {
451495
return nil
452496
}
453497

498+
func (p *Properties) saveToColWriter(tblFormat TableFormat, w *colblk.KeyValueBlockWriter) {
499+
keys, m := p.accumulateProps(tblFormat)
500+
for _, key := range keys {
501+
// Zero-length keys are unsupported. See below about StringData.
502+
if len(key) == 0 {
503+
continue
504+
}
505+
// Use an unsafe conversion to avoid allocating. AddKV is not
506+
// supposed to modify the given slice, so the unsafe conversion
507+
// is okay. Note that unsafe.StringData panics if len(key) == 0,
508+
// so we explicitly skip zero-length keys above. They shouldn't
509+
// occur in practice.
510+
w.AddKV(unsafe.Slice(unsafe.StringData(key), len(key)), m[key])
511+
}
512+
}
513+
454514
var (
455515
singleZeroSlice = []byte{0x00}
456516
maxInt32Slice = binary.AppendUvarint([]byte(nil), math.MaxInt32)

sstable/properties_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,10 @@ func TestPropertiesSave(t *testing.T) {
103103
// Check that we can save properties and read them back.
104104
var w rowblk.Writer
105105
w.RestartInterval = propertiesBlockRestartInterval
106-
require.NoError(t, e.save(TableFormatPebblev2, &w))
106+
require.NoError(t, e.saveToRowWriter(TableFormatPebblev2, &w))
107107
var props Properties
108108

109-
require.NoError(t, props.load(w.Finish(), make(map[string]struct{})))
109+
require.NoError(t, props.loadFromRowBlock(w.Finish(), make(map[string]struct{})))
110110
props.Loaded = nil
111111
if diff := pretty.Diff(*e, props); diff != nil {
112112
t.Fatalf("%s", strings.Join(diff, "\n"))
@@ -130,13 +130,13 @@ func TestPropertiesSave(t *testing.T) {
130130
func BenchmarkPropertiesLoad(b *testing.B) {
131131
var w rowblk.Writer
132132
w.RestartInterval = propertiesBlockRestartInterval
133-
require.NoError(b, testProps.save(TableFormatPebblev2, &w))
133+
require.NoError(b, testProps.saveToRowWriter(TableFormatPebblev2, &w))
134134
block := w.Finish()
135135

136136
b.ResetTimer()
137137
p := &Properties{}
138138
for i := 0; i < b.N; i++ {
139139
*p = Properties{}
140-
require.NoError(b, p.load(block, nil))
140+
require.NoError(b, p.loadFromRowBlock(block, nil))
141141
}
142142
}

0 commit comments

Comments
 (0)