Skip to content

Commit c2d5a00

Browse files
committed
sstable/tieredmeta: add tiering histogram using t-digest
Add StatsHistogram, a quantile sketch backed by t-digest, to track byte distribution by TieringAttribute. The histogram supports CDF and quantile queries to estimate bytes above/below thresholds for tiering decisions. Add TieringHistogramBlockWriter to encode multiple histograms per sstable or blob file, keyed by (KindAndTier, SpanID). The block will support histogram types for sstable keys, inline values, and blob references (hot/cold).
1 parent 97fbda9 commit c2d5a00

File tree

8 files changed

+511
-4
lines changed

8 files changed

+511
-4
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ require (
3636

3737
require (
3838
github.com/RaduBerinde/btreemap v0.0.0-20250419174037-3d62b7205d54 // indirect
39+
github.com/RaduBerinde/tdigest v0.0.0-20251022152254-90e030c3a314 // indirect
3940
github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 // indirect
4041
github.com/beorn7/perks v1.0.1 // indirect
4142
github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ github.com/RaduBerinde/axisds v0.0.0-20250419182453-5135a0650657 h1:8XBWWQD+vFF+
1111
github.com/RaduBerinde/axisds v0.0.0-20250419182453-5135a0650657/go.mod h1:UHGJonU9z4YYGKJxSaC6/TNcLOBptpmM5m2Cksbnw0Y=
1212
github.com/RaduBerinde/btreemap v0.0.0-20250419174037-3d62b7205d54 h1:bsU8Tzxr/PNz75ayvCnxKZWEYdLMPDkUgticP4a4Bvk=
1313
github.com/RaduBerinde/btreemap v0.0.0-20250419174037-3d62b7205d54/go.mod h1:0tr7FllbE9gJkHq7CVeeDDFAFKQVy5RnCSSNBOvdqbc=
14+
github.com/RaduBerinde/tdigest v0.0.0-20251022152254-90e030c3a314 h1:RcSNrxDZ1ZyEfpGwrpWqd3YWfMjQbKOtT+p3Wht6XN0=
15+
github.com/RaduBerinde/tdigest v0.0.0-20251022152254-90e030c3a314/go.mod h1:RClmoWh7JPAzCuExyOvKDCRVYGz3D11DcpzEkq4N3RA=
1416
github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 h1:xlwdaKcTNVW4PtpQb8aKA4Pjy0CdJHEqvFbAnvR5m2g=
1517
github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794/go.mod h1:7e+I0LQFUI9AXWxOfsQROs9xPhoJtbsyWcjJqDd4KPY=
1618
github.com/aclements/go-perfevent v0.0.0-20240301234650-f7843625020f h1:JjxwchlOepwsUWcQwD2mLUAGE9aCp0/ehy6yCHFBOvo=

internal/base/internal.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,9 @@ type InternalKV struct {
640640
// The zero value is reserved to mean "no attribute" or "unknown".
641641
type TieringAttribute uint64
642642

643+
// TieringSpanID TODO comment
644+
type TieringSpanID uint64
645+
643646
// KVMeta describes optional metadata associated with an `InternalKV`.
644647
// It's currently produced only by sstable-backed iterators and is not embedded
645648
// within `InternalKV` to avoid overhead on the common iteration path.
@@ -651,7 +654,7 @@ type TieringAttribute uint64
651654
// These methods exist to support compaction-only logic (eg, `compaction.Iter`).
652655
// Regular iteration should use the standard methods that do not surface metadata.
653656
type KVMeta struct {
654-
TieringSpanID uint64
657+
TieringSpanID TieringSpanID
655658
TieringAttribute TieringAttribute
656659
}
657660

internal/testkeys/testkeys.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ func ExtractKVMeta(value []byte) base.KVMeta {
555555
if e != nil {
556556
panic(fmt.Sprintf("invalid tiering span in KV metadata in %q", value))
557557
}
558-
res.TieringSpanID = v
558+
res.TieringSpanID = base.TieringSpanID(v)
559559
v, e = strconv.ParseUint(m[2], 10, 64)
560560
if e != nil {
561561
panic(fmt.Sprintf("invalid tiering attr in KV metadata in %q", value))

sstable/colblk/data_block.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ func (w *DataBlockEncoder) Add(
708708
w.values.Put(value)
709709
}
710710
if w.columnConfig.SupportsTiering() && meta != (base.KVMeta{}) {
711-
w.tieringSpanIDs.Set(w.rows, meta.TieringSpanID)
711+
w.tieringSpanIDs.Set(w.rows, uint64(meta.TieringSpanID))
712712
w.tieringAttributes.Set(w.rows, uint64(meta.TieringAttribute))
713713
}
714714
if len(ikey.UserKey) > int(w.maximumKeyLength) {
@@ -1518,7 +1518,7 @@ func (i *DataBlockIter) decodeMeta() base.KVMeta {
15181518
return base.KVMeta{}
15191519
}
15201520
return base.KVMeta{
1521-
TieringSpanID: i.tieringSpanIDs.At(i.row),
1521+
TieringSpanID: base.TieringSpanID(i.tieringSpanIDs.At(i.row)),
15221522
TieringAttribute: base.TieringAttribute(i.tieringAttributes.At(i.row)),
15231523
}
15241524
}

sstable/tieredmeta/histogram.go

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package tieredmeta
6+
7+
import (
8+
"encoding/binary"
9+
10+
"github.com/RaduBerinde/tdigest"
11+
"github.com/cockroachdb/pebble/internal/base"
12+
"github.com/pkg/errors"
13+
)
14+
15+
// CompressionLevel is the compression argument for t-digest, used to specify the
16+
// tradeoff between accuracy and memory consumption: meaning higher values give
17+
// more accuracy but use more memory/space. The default value is 100.
18+
const CompressionLevel = 100
19+
20+
// StatsHistogram is a quantile sketch using t-digest to track the distribution
21+
// of bytes by TieringAttribute. It can efficiently answer questions like:
22+
// - What fraction of bytes have attribute <= threshold? (CDF)
23+
// - What attribute value covers q% of bytes? (Quantile)
24+
// - How many bytes are "cold" vs "hot"?
25+
//
26+
// The sketch is mergeable, making it easy to combine statistics from
27+
// multiple files without bucket alignment issues.
28+
type StatsHistogram struct {
29+
// TotalBytes is the sum of all bytes recorded (including ZeroBytes).
30+
TotalBytes uint64
31+
// TotalCount is the number of records added.
32+
TotalCount uint64
33+
// ZeroBytes tracks bytes with attribute=0 separately, as this typically
34+
// indicates unset/special values that shouldn't be tiered.
35+
ZeroBytes uint64
36+
// digest is the t-digest for non-zero attributes (read-only).
37+
// May be nil if no non-zero samples have been recorded.
38+
digest *tdigest.TDigest
39+
}
40+
41+
// CDF returns the fraction of non-zero bytes with attribute <= threshold.
42+
// Returns a value in [0, 1].
43+
// Example: CDF(coldThreshold) tells you what fraction of bytes are "hot".
44+
func (s *StatsHistogram) CDF(threshold base.TieringAttribute) float64 {
45+
if s.digest == nil || s.NonZeroBytes() == 0 {
46+
return 0
47+
}
48+
return s.digest.CDF(float64(threshold))
49+
}
50+
51+
// Quantile returns the attribute value at quantile q (where q is in [0, 1]).
52+
// Example: Quantile(0.5) returns the median attribute value by bytes.
53+
func (s *StatsHistogram) Quantile(q float64) base.TieringAttribute {
54+
if s.digest == nil {
55+
return 0
56+
}
57+
return base.TieringAttribute(s.digest.Quantile(q))
58+
}
59+
60+
// NonZeroBytes returns the total bytes with non-zero attributes.
61+
func (s *StatsHistogram) NonZeroBytes() uint64 {
62+
return s.TotalBytes - s.ZeroBytes
63+
}
64+
65+
// BytesBelowThreshold estimates how many non-zero bytes have attribute <= threshold.
66+
// This is useful for tiering decisions.
67+
func (s *StatsHistogram) BytesBelowThreshold(threshold base.TieringAttribute) uint64 {
68+
return uint64(s.CDF(threshold) * float64(s.NonZeroBytes()))
69+
}
70+
71+
// BytesAboveThreshold estimates how many non-zero bytes have attribute > threshold.
72+
// This is useful for tiering decisions.
73+
func (s *StatsHistogram) BytesAboveThreshold(threshold base.TieringAttribute) uint64 {
74+
return s.NonZeroBytes() - s.BytesBelowThreshold(threshold)
75+
}
76+
77+
// encode serializes the histogram to bytes.
78+
//
79+
// <TotalBytes> <TotalCount> <ZeroBytes> <DigestSize> <DigestData>
80+
func (s *StatsHistogram) encode() []byte {
81+
// Calculate the size needed for the t-digest.
82+
var digestSize int
83+
if s.digest != nil {
84+
digestSize = s.digest.SerializedSize()
85+
}
86+
87+
// Pre-allocate with estimated capacity (4 uvarints + digest).
88+
// Each uvarint is at most 10 bytes for uint64.
89+
buf := make([]byte, 0, 4*binary.MaxVarintLen64+digestSize)
90+
91+
buf = binary.AppendUvarint(buf, s.TotalBytes)
92+
buf = binary.AppendUvarint(buf, s.TotalCount)
93+
buf = binary.AppendUvarint(buf, s.ZeroBytes)
94+
buf = binary.AppendUvarint(buf, uint64(digestSize))
95+
96+
// Serialize the t-digest if present.
97+
if s.digest != nil {
98+
buf = s.digest.Serialize(buf)
99+
}
100+
101+
return buf
102+
}
103+
104+
// DecodeStatsHistogram decodes a StatsHistogram from the provided buffer.
105+
// The encoding format is:
106+
//
107+
// <total bytes> <total count> <zero bytes> <digest size> <digest data>
108+
func DecodeStatsHistogram(buf []byte) (StatsHistogram, error) {
109+
var h StatsHistogram
110+
var n int
111+
112+
h.TotalBytes, n = binary.Uvarint(buf)
113+
buf = buf[n:]
114+
h.TotalCount, n = binary.Uvarint(buf)
115+
buf = buf[n:]
116+
h.ZeroBytes, n = binary.Uvarint(buf)
117+
buf = buf[n:]
118+
digestSize, n := binary.Uvarint(buf)
119+
buf = buf[n:]
120+
121+
if int(digestSize) > len(buf) {
122+
return StatsHistogram{}, errors.Errorf("histogram digest size %d exceeds remaining data %d",
123+
digestSize, len(buf))
124+
}
125+
126+
if digestSize > 0 {
127+
var digest tdigest.TDigest
128+
_, err := tdigest.Deserialize(&digest, buf[:digestSize])
129+
if err != nil {
130+
return StatsHistogram{}, err
131+
}
132+
h.digest = &digest
133+
}
134+
135+
return h, nil
136+
}
137+
138+
// Merge combines another histogram into this one using a t-digest Merger.
139+
// This is used to aggregate statistics from multiple files.
140+
func (s *StatsHistogram) Merge(other *StatsHistogram) {
141+
s.TotalBytes += other.TotalBytes
142+
s.TotalCount += other.TotalCount
143+
s.ZeroBytes += other.ZeroBytes
144+
145+
// Merge using the Merger.
146+
merger := tdigest.MakeMerger(CompressionLevel)
147+
merger.Merge(s.digest)
148+
merger.Merge(other.digest)
149+
merged := merger.Digest()
150+
s.digest = &merged
151+
}
152+
153+
// histogramWriter wraps a tdigest.Builder for building during sstable/blob file
154+
// writing.
155+
type histogramWriter struct {
156+
builder tdigest.Builder
157+
totalBytes uint64
158+
totalCount uint64
159+
zeroBytes uint64
160+
}
161+
162+
func newHistogramWriter() *histogramWriter {
163+
return &histogramWriter{
164+
builder: tdigest.MakeBuilder(CompressionLevel),
165+
}
166+
}
167+
168+
func (w *histogramWriter) record(attr base.TieringAttribute, bytes uint64) {
169+
w.totalCount++
170+
w.totalBytes += bytes
171+
if attr == 0 {
172+
w.zeroBytes += bytes
173+
return
174+
}
175+
// Add to builder with bytes as the weight
176+
w.builder.Add(float64(attr), float64(bytes))
177+
}
178+
179+
func (w *histogramWriter) encode() []byte {
180+
digest := w.builder.Digest()
181+
h := StatsHistogram{
182+
TotalBytes: w.totalBytes,
183+
TotalCount: w.totalCount,
184+
ZeroBytes: w.zeroBytes,
185+
digest: &digest,
186+
}
187+
return h.encode()
188+
}

0 commit comments

Comments
 (0)