Skip to content

Commit cede562

Browse files
committed
sstable/tieredmeta: add tiering histogram using t-digest
Add StatsHistogram, a quantile sketch backed by t-digest, to track byte distribution by TieringAttribute. The histogram supports CDF and quantile queries to estimate bytes above/below thresholds for tiering decisions. Add TieringHistogramBlockWriter to encode multiple histograms per sstable or blob file, keyed by (KindAndTier, SpanID). The block will support histogram types for sstable keys, inline values, and blob references (hot/cold).
1 parent 8594f9b commit cede562

File tree

8 files changed

+609
-4
lines changed

8 files changed

+609
-4
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ require (
44
github.com/DataDog/zstd v1.5.7
55
github.com/HdrHistogram/hdrhistogram-go v1.1.2
66
github.com/RaduBerinde/axisds v0.0.0-20260105221726-1be486564c85
7+
github.com/RaduBerinde/tdigest v0.0.0-20251022152254-90e030c3a314
78
github.com/cespare/xxhash/v2 v2.2.0
89
github.com/cockroachdb/crlib v0.0.0-20251122031428-fe658a2dbda1
910
github.com/cockroachdb/datadriven v1.0.3-0.20251123150250-ddff6747b112

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ github.com/RaduBerinde/axisds v0.0.0-20260105221726-1be486564c85 h1:OjUilXBeTEMa
1111
github.com/RaduBerinde/axisds v0.0.0-20260105221726-1be486564c85/go.mod h1:gAhfbTwNsV064J6aon3TDJLzeh5IxfptXalQVnjjnWQ=
1212
github.com/RaduBerinde/btreemap v0.0.0-20260105202824-d3184786f603 h1:fSdiBlO4Bad28mJOPlAynvfgdDC9v+yRlzSFHvvjKYI=
1313
github.com/RaduBerinde/btreemap v0.0.0-20260105202824-d3184786f603/go.mod h1:0tr7FllbE9gJkHq7CVeeDDFAFKQVy5RnCSSNBOvdqbc=
14+
github.com/RaduBerinde/tdigest v0.0.0-20251022152254-90e030c3a314 h1:RcSNrxDZ1ZyEfpGwrpWqd3YWfMjQbKOtT+p3Wht6XN0=
15+
github.com/RaduBerinde/tdigest v0.0.0-20251022152254-90e030c3a314/go.mod h1:RClmoWh7JPAzCuExyOvKDCRVYGz3D11DcpzEkq4N3RA=
1416
github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 h1:xlwdaKcTNVW4PtpQb8aKA4Pjy0CdJHEqvFbAnvR5m2g=
1517
github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794/go.mod h1:7e+I0LQFUI9AXWxOfsQROs9xPhoJtbsyWcjJqDd4KPY=
1618
github.com/aclements/go-perfevent v0.0.0-20240301234650-f7843625020f h1:JjxwchlOepwsUWcQwD2mLUAGE9aCp0/ehy6yCHFBOvo=

internal/base/internal.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,11 @@ type InternalKV struct {
672672
// The zero value is reserved to mean "no attribute" or "unknown".
673673
type TieringAttribute uint64
674674

675+
// TieringSpanID is an identifier that provides a context for interpreting a
676+
// TieringAttribute value. At any point in time, the span policies determine
677+
// a set of non-overlapping key regions with distinct TieringSpanIDs.
678+
type TieringSpanID uint64
679+
675680
// KVMeta describes optional metadata associated with an `InternalKV`.
676681
// It's currently produced only by sstable-backed iterators and is not embedded
677682
// within `InternalKV` to avoid overhead on the common iteration path.
@@ -683,7 +688,7 @@ type TieringAttribute uint64
683688
// These methods exist to support compaction-only logic (eg, `compaction.Iter`).
684689
// Regular iteration should use the standard methods that do not surface metadata.
685690
type KVMeta struct {
686-
TieringSpanID uint64
691+
TieringSpanID TieringSpanID
687692
TieringAttribute TieringAttribute
688693
}
689694

internal/testkeys/testkeys.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ func ExtractKVMeta(value []byte) base.KVMeta {
555555
if e != nil {
556556
panic(fmt.Sprintf("invalid tiering span in KV metadata in %q", value))
557557
}
558-
res.TieringSpanID = v
558+
res.TieringSpanID = base.TieringSpanID(v)
559559
v, e = strconv.ParseUint(m[2], 10, 64)
560560
if e != nil {
561561
panic(fmt.Sprintf("invalid tiering attr in KV metadata in %q", value))

sstable/colblk/data_block.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ func (w *DataBlockEncoder) Add(
708708
w.values.Put(value)
709709
}
710710
if w.columnConfig.SupportsTiering() && meta != (base.KVMeta{}) {
711-
w.tieringSpanIDs.Set(w.rows, meta.TieringSpanID)
711+
w.tieringSpanIDs.Set(w.rows, uint64(meta.TieringSpanID))
712712
w.tieringAttributes.Set(w.rows, uint64(meta.TieringAttribute))
713713
}
714714
if len(ikey.UserKey) > int(w.maximumKeyLength) {
@@ -1518,7 +1518,7 @@ func (i *DataBlockIter) decodeMeta() base.KVMeta {
15181518
return base.KVMeta{}
15191519
}
15201520
return base.KVMeta{
1521-
TieringSpanID: i.tieringSpanIDs.At(i.row),
1521+
TieringSpanID: base.TieringSpanID(i.tieringSpanIDs.At(i.row)),
15221522
TieringAttribute: base.TieringAttribute(i.tieringAttributes.At(i.row)),
15231523
}
15241524
}

sstable/tieredmeta/histogram.go

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package tieredmeta
6+
7+
import (
8+
"encoding/binary"
9+
10+
"github.com/RaduBerinde/tdigest"
11+
"github.com/cockroachdb/pebble/internal/base"
12+
"github.com/pkg/errors"
13+
)
14+
15+
// digestDelta is the compression argument for t-digest, used to specify the
16+
// tradeoff between accuracy and memory consumption: meaning higher values give
17+
// more accuracy, but use more memory/space. Specifically, the digest has at most
18+
// 2*compression centroids, with ~1.3*compression in practice.
19+
//
20+
// A value of 100 is a common default for t-digest implementations, which should
21+
// provide at least ±1% quantile accuracy (and much better at the tails).
22+
const digestDelta = 100
23+
24+
// StatsHistogram is a quantile sketch using t-digest to track the distribution
25+
// of a bytes value (with a caller-determined meaning) by TieringAttribute. It
26+
// can efficiently answer questions like:
27+
// - What fraction of bytes have attribute <= threshold? (CDF)
28+
// - What attribute value covers q% of bytes? (Quantile)
29+
// - How many bytes are "cold" vs "hot"?
30+
//
31+
// The sketch is mergeable, making it easy to combine statistics from
32+
// multiple files without bucket alignment issues.
33+
type StatsHistogram struct {
34+
// TotalBytes is the sum of all bytes recorded (including BytesNoAttr).
35+
TotalBytes uint64
36+
// TotalCount is the number of records added.
37+
TotalCount uint64
38+
// BytesNoAttr tracks bytes with attribute=0 separately, as this typically
39+
// indicates unset/special values that shouldn't be tiered.
40+
BytesNoAttr uint64
41+
// digest is the t-digest for non-zero attributes (read-only).
42+
digest tdigest.TDigest
43+
}
44+
45+
// CDF returns the fraction of non-zero bytes with attribute <= threshold.
46+
// Returns a value in [0, 1].
47+
// Example: CDF(coldThreshold) tells you what fraction of bytes are "cold".
48+
func (s *StatsHistogram) CDF(threshold base.TieringAttribute) float64 {
49+
if s.BytesWithAttr() == 0 {
50+
return 0
51+
}
52+
return s.digest.CDF(float64(threshold))
53+
}
54+
55+
// Quantile returns the attribute value at quantile ~q (where q is in [0, 1]) of
56+
// bytes with a tiering attribute set.
57+
// Example: Quantile(0.5) returns a value close to the median value.
58+
func (s *StatsHistogram) Quantile(q float64) base.TieringAttribute {
59+
return base.TieringAttribute(s.digest.Quantile(q))
60+
}
61+
62+
// BytesWithAttr returns bytes that have a tiering attribute set, excluding
63+
// bytes with attr=0 (which typically indicates unset/special values).
64+
func (s *StatsHistogram) BytesWithAttr() uint64 {
65+
return s.TotalBytes - s.BytesNoAttr
66+
}
67+
68+
// BytesBelowThreshold estimates how many bytes (excluding NoAttrBytes) have
69+
// attribute <= threshold. This is useful for tiering decisions.
70+
func (s *StatsHistogram) BytesBelowThreshold(threshold base.TieringAttribute) uint64 {
71+
return uint64(s.CDF(threshold) * float64(s.BytesWithAttr()))
72+
}
73+
74+
// BytesAboveThreshold estimates how many non-zero bytes have attribute > threshold.
75+
// This is useful for tiering decisions.
76+
func (s *StatsHistogram) BytesAboveThreshold(threshold base.TieringAttribute) uint64 {
77+
return s.BytesWithAttr() - s.BytesBelowThreshold(threshold)
78+
}
79+
80+
// encode serializes the histogram to bytes. The encoding format is:
81+
//
82+
// <total bytes> <total count> <zero count> <digest size> <digest data>
83+
func (s *StatsHistogram) encode() []byte {
84+
// Calculate the size needed for the t-digest.
85+
digestSize := s.digest.SerializedSize()
86+
87+
// Pre-allocate with estimated capacity (4 uvarints + digest).
88+
// Each uvarint is at most 10 bytes for uint64.
89+
buf := make([]byte, 0, 4*binary.MaxVarintLen64+digestSize)
90+
91+
buf = binary.AppendUvarint(buf, s.TotalBytes)
92+
buf = binary.AppendUvarint(buf, s.TotalCount)
93+
buf = binary.AppendUvarint(buf, s.BytesNoAttr)
94+
buf = binary.AppendUvarint(buf, uint64(digestSize))
95+
buf = s.digest.Serialize(buf)
96+
97+
return buf
98+
}
99+
100+
// DecodeStatsHistogram decodes a StatsHistogram from the provided buffer.
101+
// The encoding format is:
102+
//
103+
// <total bytes> <total count> <zero bytes> <digest size> <digest data>
104+
func DecodeStatsHistogram(buf []byte) (StatsHistogram, error) {
105+
var h StatsHistogram
106+
var n int
107+
108+
h.TotalBytes, n = binary.Uvarint(buf)
109+
if n <= 0 {
110+
return StatsHistogram{}, base.CorruptionErrorf("cannot decode TotalBytes from buf %x", buf)
111+
}
112+
buf = buf[n:]
113+
114+
h.TotalCount, n = binary.Uvarint(buf)
115+
if n <= 0 {
116+
return StatsHistogram{}, base.CorruptionErrorf("cannot decode TotalCount from buf %x", buf)
117+
}
118+
buf = buf[n:]
119+
120+
h.BytesNoAttr, n = binary.Uvarint(buf)
121+
if n <= 0 {
122+
return StatsHistogram{}, base.CorruptionErrorf("cannot decode BytesNoAttr from buf %x", buf)
123+
}
124+
buf = buf[n:]
125+
126+
digestSize, n := binary.Uvarint(buf)
127+
if n <= 0 {
128+
return StatsHistogram{}, base.CorruptionErrorf("cannot decode digest size from buf %x", buf)
129+
}
130+
buf = buf[n:]
131+
132+
if int(digestSize) != len(buf) {
133+
return StatsHistogram{}, errors.Errorf("histogram digest size %d does not match remaining data %d",
134+
digestSize, len(buf))
135+
}
136+
137+
if digestSize > 0 {
138+
var digest tdigest.TDigest
139+
_, err := tdigest.Deserialize(&digest, buf[:digestSize])
140+
if err != nil {
141+
return StatsHistogram{}, err
142+
}
143+
h.digest = digest
144+
}
145+
146+
return h, nil
147+
}
148+
149+
// Merge combines another histogram into this one using a t-digest Merger.
150+
// This is used to aggregate statistics from multiple files.
151+
func (s *StatsHistogram) Merge(other *StatsHistogram) {
152+
s.TotalBytes += other.TotalBytes
153+
s.TotalCount += other.TotalCount
154+
s.BytesNoAttr += other.BytesNoAttr
155+
156+
merger := tdigest.MakeMerger(digestDelta)
157+
merger.Merge(&s.digest)
158+
merger.Merge(&other.digest)
159+
merged := merger.Digest()
160+
s.digest = merged
161+
}
162+
163+
// histogramWriter wraps a tdigest.Builder for building during sstable/blob file
164+
// writing.
165+
type histogramWriter struct {
166+
builder tdigest.Builder
167+
totalBytes uint64
168+
totalCount uint64
169+
zeroBytes uint64
170+
}
171+
172+
func newHistogramWriter() *histogramWriter {
173+
return &histogramWriter{
174+
builder: tdigest.MakeBuilder(digestDelta),
175+
}
176+
}
177+
178+
// record adds a data point to the histogram. Records with attr=0 are tracked
179+
// separately and excluded from the t-digest - as zero typically indicates
180+
// an unset or special value that shouldn't influence tiering decisions.
181+
func (w *histogramWriter) record(attr base.TieringAttribute, bytes uint64) {
182+
w.totalCount++
183+
w.totalBytes += bytes
184+
if attr == 0 {
185+
w.zeroBytes += bytes
186+
return
187+
}
188+
w.builder.Add(float64(attr), float64(bytes))
189+
}
190+
191+
func (w *histogramWriter) encode() []byte {
192+
digest := w.builder.Digest()
193+
h := StatsHistogram{
194+
TotalBytes: w.totalBytes,
195+
TotalCount: w.totalCount,
196+
BytesNoAttr: w.zeroBytes,
197+
digest: digest,
198+
}
199+
return h.encode()
200+
}

0 commit comments

Comments
 (0)