Commit 2884026
compact: add point tombstone density compaction heuristic
This change adds a heuristic to compact point tombstones based on their density across the LSM. We add a new table property called `TombstoneDenseBlocksRatio` and a corresponding field in `TableStats` that tracks the ratio of data blocks in each table which are considered tombstone-dense. This value is calculated on the fly while tables are being written, so no extra I/O is required later on to compute it.

A data block is considered tombstone-dense if it fulfills either of the following criteria:
1. The block contains at least `options.Experimental.NumDeletionsThreshold` point tombstones. The default value is `100`.
2. The ratio of the uncompressed size of point tombstones to the uncompressed size of the block is at least `options.Experimental.DeletionSizeRatioThreshold`. For example, with the default value of `0.5`, a data block of size 4KB would be considered tombstone-dense if it contains at least 2KB of point tombstones.

The intuition for these criteria is best described in [this discussion](#918 (comment)), which highlights that dense tombstone clusters are bad because they a) waste CPU when skipping over tombstones, and b) waste I/O because we end up loading more blocks per live key. The two criteria above are meant to tackle these two issues respectively; the count-based threshold prevents CPU waste, and the size-based threshold prevents I/O waste.

A table is considered eligible for the new tombstone compaction type if its ratio of tombstone-dense blocks is at least `options.Experimental.TombstoneDenseCompactionThreshold`. The default value is `0.05`. We use an Annotator in a similar way to elision-only compactions in order to prioritize compacting the table with the most tombstone-dense blocks if there are multiple eligible tables. The default here was chosen through experimentation on CockroachDB KV workloads: with a lower value we were compacting too aggressively, leading to very high write amplification, while higher values led to very few noticeable performance improvements.

Fixes: #918
1 parent 1010734 commit 2884026
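
The two per-block criteria reduce to a small predicate. Below is a minimal sketch in Go, assuming the default thresholds; `isTombstoneDense` and its parameters are illustrative names for this page, not the sstable writer's actual internals:

```go
package main

import "fmt"

// Default thresholds from the commit message:
// options.Experimental.NumDeletionsThreshold and
// options.Experimental.DeletionSizeRatioThreshold.
const (
	defaultNumDeletionsThreshold      = 100
	defaultDeletionSizeRatioThreshold = 0.5
)

// isTombstoneDense reports whether a data block counts as tombstone-dense:
// it either contains at least the threshold number of point tombstones, or
// point tombstones make up at least the threshold fraction of the block's
// uncompressed size.
func isTombstoneDense(numDeletions int, tombstoneBytes, blockBytes uint64) bool {
	if numDeletions >= defaultNumDeletionsThreshold {
		return true
	}
	if blockBytes == 0 {
		return false
	}
	return float64(tombstoneBytes)/float64(blockBytes) >= defaultDeletionSizeRatioThreshold
}

func main() {
	// The worked example from the commit message: a 4KB block holding 2KB of
	// point tombstones meets the size-ratio criterion.
	fmt.Println(isTombstoneDense(10, 2<<10, 4<<10)) // true
}
```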

25 files changed: +379 −148 lines changed

compaction.go

Lines changed: 3 additions & 0 deletions
@@ -135,6 +135,7 @@ const (
 	compactionKindDeleteOnly
 	compactionKindElisionOnly
 	compactionKindRead
+	compactionKindTombstoneDensity
 	compactionKindRewrite
 	compactionKindIngestedFlushable
 )
@@ -153,6 +154,8 @@ func (k compactionKind) String() string {
 		return "elision-only"
 	case compactionKindRead:
 		return "read"
+	case compactionKindTombstoneDensity:
+		return "tombstone-density"
 	case compactionKindRewrite:
 		return "rewrite"
 	case compactionKindIngestedFlushable:

compaction_picker.go

Lines changed: 110 additions & 44 deletions
@@ -14,6 +14,7 @@ import (
 	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/humanize"
+	"github.com/cockroachdb/pebble/internal/invariants"
 	"github.com/cockroachdb/pebble/internal/manifest"
 )

@@ -585,6 +586,7 @@ func newCompactionPickerByScore(
 		virtualBackings: virtualBackings,
 	}
 	p.initLevelMaxBytes(inProgressCompactions)
+	p.initTombstoneDensityAnnotator(opts)
 	return p
 }

@@ -665,6 +667,11 @@ type compactionPickerByScore struct {
 	// levelMaxBytes holds the dynamically adjusted max bytes setting for each
 	// level.
 	levelMaxBytes [numLevels]int64
+	// tombstoneDensityAnnotator holds the annotator for choosing tombstone
+	// density compactions.
+	// NB: This is declared here rather than globally because
+	// options.Experimental.TombstoneDenseCompactionThreshold is not known until runtime.
+	tombstoneDensityAnnotator *manifest.Annotator[fileMetadata]
 }

 var _ compactionPicker = &compactionPickerByScore{}
@@ -1280,6 +1287,13 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact
 		}
 	}

+	// Check for files which contain excessive point tombstones that could slow
+	// down reads. Unlike elision-only compactions, these compactions may select
+	// a file at any level rather than only the lowest level.
+	if pc := p.pickTombstoneDensityCompaction(env); pc != nil {
+		return pc
+	}
+
 	// Check for L6 files with tombstones that may be elided. These files may
 	// exist if a snapshot prevented the elision of a tombstone or because of
 	// a move compaction. These are low-priority compactions because they
@@ -1408,6 +1422,52 @@ var markedForCompactionAnnotator = &manifest.Annotator[fileMetadata]{
 	},
 }

+// pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata
+// with various checks to ensure that the file still exists in the expected level
+// and isn't already being compacted.
+func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
+	candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
+) *pickedCompaction {
+	if candidate == nil || candidate.IsCompacting() {
+		return nil
+	}
+
+	var inputs manifest.LevelSlice
+	if startLevel == 0 {
+		// Overlapping L0 files must also be compacted alongside the candidate.
+		inputs = p.vers.Overlaps(0, candidate.UserKeyBounds())
+	} else {
+		inputs = p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
+	}
+	if invariants.Enabled {
+		found := false
+		inputs.Each(func(f *fileMetadata) {
+			if f.FileNum == candidate.FileNum {
+				found = true
+			}
+		})
+		if !found {
+			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel))
+		}
+	}
+
+	pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
+	pc.kind = kind
+	pc.startLevel.files = inputs
+	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
+
+	// Fail-safe to protect against compacting the same sstable concurrently.
+	if inputRangeAlreadyCompacting(env, pc) {
+		return nil
+	}
+
+	if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
+		return nil
+	}
+
+	return pc
+}
+
 // pickElisionOnlyCompaction looks for compactions of sstables in the
 // bottommost level containing obsolete records that may now be dropped.
 func (p *compactionPickerByScore) pickElisionOnlyCompaction(
@@ -1420,28 +1480,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
 	if candidate == nil {
 		return nil
 	}
-	if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
-		return nil
-	}
-	lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
-	if lf.Empty() {
-		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
-	}
-
-	// Construct a picked compaction of the elision candidate's atomic
-	// compaction unit.
-	pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
-	pc.kind = compactionKindElisionOnly
-	pc.startLevel.files = lf
-	if anyTablesCompacting(lf) {
+	if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
 		return nil
 	}
-	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
-	// Fail-safe to protect against compacting the same sstable concurrently.
-	if !inputRangeAlreadyCompacting(env, pc) {
-		return pc
-	}
-	return nil
+	return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
 }

 // pickRewriteCompaction attempts to construct a compaction that
@@ -1456,36 +1498,60 @@ func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *
 			// Try the next level.
 			continue
 		}
-		if candidate.IsCompacting() {
-			// Try the next level.
-			continue
-		}
-		lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
-		if lf.Empty() {
-			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
+		pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
+		if pc != nil {
+			return pc
 		}
+	}
+	return nil
+}

-		inputs := lf
-		if anyTablesCompacting(inputs) {
-			// Try the next level.
-			continue
-		}
+func (p *compactionPickerByScore) initTombstoneDensityAnnotator(opts *Options) {
+	p.tombstoneDensityAnnotator = &manifest.Annotator[fileMetadata]{
+		Aggregator: manifest.PickFileAggregator{
+			Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) {
+				if f.IsCompacting() {
+					return false, true
+				}
+				if !f.StatsValid() {
+					return false, false
+				}
+				return f.Stats.TombstoneDenseBlocksRatio > opts.Experimental.TombstoneDenseCompactionThreshold, true
+			},
+			Compare: func(a, b *fileMetadata) bool {
+				return a.Stats.TombstoneDenseBlocksRatio > b.Stats.TombstoneDenseBlocksRatio
+			},
+		},
+	}
+}

-		pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
-		pc.outputLevel.level = l
-		pc.kind = compactionKindRewrite
-		pc.startLevel.files = inputs
-		pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
+// pickTombstoneDensityCompaction looks for a compaction that eliminates
+// regions of extremely high point tombstone density. For each level, it picks
+// a file where the ratio of tombstone-dense blocks is at least
+// options.Experimental.TombstoneDenseCompactionThreshold, prioritizing
+// compaction of files with higher ratios of tombstone-dense blocks.
+func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
+	env compactionEnv,
+) (pc *pickedCompaction) {
+	if p.opts.Experimental.TombstoneDenseCompactionThreshold <= 0 {
+		// Tombstone density compactions are disabled.
+		return nil
+	}

-	// Fail-safe to protect against compacting the same sstable concurrently.
-	if !inputRangeAlreadyCompacting(env, pc) {
-		if pc.startLevel.level == 0 {
-			pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
-		}
-		return pc
+	var candidate *fileMetadata
+	var level int
+	// NB: we don't consider the lowest level because elision-only compactions
+	// handle that case.
+	for l := 0; l < numLevels-1; l++ {
+		f := p.tombstoneDensityAnnotator.LevelAnnotation(p.vers.Levels[l])
+		newCandidate := p.tombstoneDensityAnnotator.Aggregator.Merge(f, candidate)
+		if newCandidate != candidate {
+			candidate = newCandidate
+			level = l
 		}
 	}
-	return nil
+
+	return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity)
 }

 // pickAutoLPositive picks an automatic compaction for the candidate
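
To make the selection order concrete, here is a self-contained sketch of what the annotator computes, under simplified assumptions: a flat slice of tables replaces `manifest.Annotator`'s cached per-level aggregation, and the `table` struct and `pickDensest` helper are hypothetical names for this page:

```go
package main

import "fmt"

type table struct {
	level      int
	compacting bool
	ratio      float64 // Stats.TombstoneDenseBlocksRatio
}

// pickDensest mirrors the selection order of pickTombstoneDensityCompaction:
// among non-compacting tables above the bottommost level whose dense-block
// ratio exceeds the threshold, pick the one with the highest ratio.
func pickDensest(tables []table, numLevels int, threshold float64) *table {
	var best *table
	for i := range tables {
		t := &tables[i]
		// The lowest level is skipped because elision-only compactions
		// already handle it.
		if t.level >= numLevels-1 || t.compacting || t.ratio <= threshold {
			continue
		}
		if best == nil || t.ratio > best.ratio {
			best = t
		}
	}
	return best
}

func main() {
	tables := []table{
		{level: 2, ratio: 0.10},
		{level: 4, ratio: 0.30},
		{level: 6, ratio: 0.90}, // bottommost: left to elision-only compactions
	}
	fmt.Println(*pickDensest(tables, 7, 0.05)) // {4 false 0.3}
}
```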

compaction_picker_test.go

Lines changed: 1 addition & 0 deletions
@@ -517,6 +517,7 @@ func TestCompactionPickerL0(t *testing.T) {
 	}
 	vs.picker = picker
 	picker.initLevelMaxBytes(inProgressCompactions)
+	picker.initTombstoneDensityAnnotator(opts)

 	var buf bytes.Buffer
 	fmt.Fprint(&buf, version.String())

internal/manifest/version.go

Lines changed: 10 additions & 0 deletions
@@ -75,6 +75,16 @@ type TableStats struct {
 	ValueBlocksSize uint64
 	// CompressionType is the compression type of the table.
 	CompressionType block.Compression
+	// TombstoneDenseBlocksRatio is the ratio of data blocks in this table
+	// that fulfill at least one of the following criteria:
+	// 1. The block contains at least options.Experimental.NumDeletionsThreshold
+	//    point tombstones.
+	// 2. The ratio of the uncompressed size of point tombstones to the
+	//    uncompressed size of the block is at least
+	//    options.Experimental.DeletionSizeRatioThreshold.
+	// This statistic is used to determine eligibility for a tombstone density
+	// compaction.
+	TombstoneDenseBlocksRatio float64
 }

 // boundType represents the type of key (point or range) present as the smallest
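
Since the statistic is computed while the table is written, the writer only needs two counters and a division at the end, which is why no extra I/O is required later. A minimal sketch, with `denseBlockTracker` as a hypothetical stand-in for the sstable writer's actual bookkeeping:

```go
package main

import "fmt"

// denseBlockTracker accumulates per-block density verdicts as data blocks
// are finished; it is an illustrative type, not Pebble's writer state.
type denseBlockTracker struct {
	numBlocks      int // data blocks finished so far
	numDenseBlocks int // of those, how many were tombstone-dense
}

// finishBlock records one completed data block and whether it met either
// tombstone-density criterion.
func (t *denseBlockTracker) finishBlock(dense bool) {
	t.numBlocks++
	if dense {
		t.numDenseBlocks++
	}
}

// ratio is the value that would land in TableStats.TombstoneDenseBlocksRatio.
func (t *denseBlockTracker) ratio() float64 {
	if t.numBlocks == 0 {
		return 0
	}
	return float64(t.numDenseBlocks) / float64(t.numBlocks)
}

func main() {
	var t denseBlockTracker
	for i := 0; i < 100; i++ {
		t.finishBlock(i < 7) // suppose 7 of 100 blocks are tombstone-dense
	}
	fmt.Println(t.ratio()) // 0.07, above the default 0.05 threshold
}
```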

metrics.go

Lines changed: 13 additions & 11 deletions
@@ -153,16 +153,17 @@ type Metrics struct {

 	Compact struct {
 		// The total number of compactions, and per-compaction type counts.
-		Count             int64
-		DefaultCount      int64
-		DeleteOnlyCount   int64
-		ElisionOnlyCount  int64
-		CopyCount         int64
-		MoveCount         int64
-		ReadCount         int64
-		RewriteCount      int64
-		MultiLevelCount   int64
-		CounterLevelCount int64
+		Count                 int64
+		DefaultCount          int64
+		DeleteOnlyCount       int64
+		ElisionOnlyCount      int64
+		CopyCount             int64
+		MoveCount             int64
+		ReadCount             int64
+		TombstoneDensityCount int64
+		RewriteCount          int64
+		MultiLevelCount       int64
+		CounterLevelCount     int64
 		// An estimate of the number of bytes that need to be compacted for the LSM
 		// to reach a stable state.
 		EstimatedDebt uint64
@@ -580,12 +581,13 @@ func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) {
 		redact.Safe(m.Compact.NumInProgress),
 		humanize.Bytes.Int64(m.Compact.InProgressBytes))

-	w.Printf(" default: %d delete: %d elision: %d move: %d read: %d rewrite: %d copy: %d multi-level: %d\n",
+	w.Printf(" default: %d delete: %d elision: %d move: %d read: %d tombstone-density: %d rewrite: %d copy: %d multi-level: %d\n",
 		redact.Safe(m.Compact.DefaultCount),
 		redact.Safe(m.Compact.DeleteOnlyCount),
 		redact.Safe(m.Compact.ElisionOnlyCount),
 		redact.Safe(m.Compact.MoveCount),
 		redact.Safe(m.Compact.ReadCount),
+		redact.Safe(m.Compact.TombstoneDensityCount),
 		redact.Safe(m.Compact.RewriteCount),
 		redact.Safe(m.Compact.CopyCount),
 		redact.Safe(m.Compact.MultiLevelCount))
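
A brief usage sketch of where the new counter surfaces, assuming this commit's `Metrics` struct; `db.Metrics()` is Pebble's existing accessor, and the directory name is arbitrary:

```go
package main

import (
	"fmt"
	"log"

	"github.com/cockroachdb/pebble"
)

func main() {
	db, err := pebble.Open("demo", &pebble.Options{})
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// TombstoneDensityCount is reported alongside the other per-kind
	// compaction counts added to the Compact struct above.
	m := db.Metrics()
	fmt.Printf("tombstone-density compactions: %d\n", m.Compact.TombstoneDensityCount)
}
```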

metrics_test.go

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ func exampleMetrics() Metrics {
 	m.Compact.ElisionOnlyCount = 29
 	m.Compact.MoveCount = 30
 	m.Compact.ReadCount = 31
+	m.Compact.TombstoneDensityCount = 16
 	m.Compact.RewriteCount = 32
 	m.Compact.CopyCount = 33
 	m.Compact.MultiLevelCount = 34

options.go

Lines changed: 54 additions & 0 deletions
@@ -599,6 +599,38 @@ type Options struct {
 	// gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB).
 	ReadSamplingMultiplier int64

+	// NumDeletionsThreshold defines the minimum number of point tombstones
+	// that must be present in a single data block for that block to be
+	// considered tombstone-dense for the purposes of triggering a
+	// tombstone density compaction. Data blocks may also be considered
+	// tombstone-dense if they meet the criteria defined by
+	// DeletionSizeRatioThreshold below. Tombstone-dense blocks are identified
+	// when sstables are written, and so this is effectively an option for
+	// sstable writers. The default value is 100.
+	NumDeletionsThreshold int
+
+	// DeletionSizeRatioThreshold defines the minimum ratio of the size of
+	// point tombstones to the size of a data block that must be reached
+	// for that block to be considered tombstone-dense for the purposes of
+	// triggering a tombstone density compaction. Data blocks may also be
+	// considered tombstone-dense if they meet the criteria defined by
+	// NumDeletionsThreshold above. Tombstone-dense blocks are identified
+	// when sstables are written, and so this is effectively an option for
+	// sstable writers. The default value is 0.5.
+	DeletionSizeRatioThreshold float32
+
+	// TombstoneDenseCompactionThreshold is the minimum fraction of data
+	// blocks in a table that must be tombstone-dense for that table to be
+	// eligible for a tombstone density compaction. It should be defined as a
+	// ratio out of 1. The default value is 0.05.
+	//
+	// If multiple tables are eligible for a tombstone density compaction,
+	// tables with a higher fraction of tombstone-dense blocks are
+	// prioritized for compaction.
+	//
+	// A zero or negative value disables tombstone density compactions.
+	TombstoneDenseCompactionThreshold float64
+
 	// TableCacheShards is the number of shards per table cache.
 	// Reducing the value can reduce the number of idle goroutines per DB
 	// instance which can be useful in scenarios with a lot of DB instances
@@ -1268,6 +1300,15 @@ func (o *Options) EnsureDefaults() *Options {
 	if o.Experimental.ReadSamplingMultiplier == 0 {
 		o.Experimental.ReadSamplingMultiplier = 1 << 4
 	}
+	if o.Experimental.NumDeletionsThreshold == 0 {
+		o.Experimental.NumDeletionsThreshold = sstable.DefaultNumDeletionsThreshold
+	}
+	if o.Experimental.DeletionSizeRatioThreshold == 0 {
+		o.Experimental.DeletionSizeRatioThreshold = sstable.DefaultDeletionSizeRatioThreshold
+	}
+	if o.Experimental.TombstoneDenseCompactionThreshold == 0 {
+		o.Experimental.TombstoneDenseCompactionThreshold = 0.05
+	}
 	if o.Experimental.TableCacheShards <= 0 {
 		o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0)
 	}
@@ -1395,6 +1436,9 @@ func (o *Options) String() string {
 	}
 	fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate)
 	fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier)
+	fmt.Fprintf(&buf, " num_deletions_threshold=%d\n", o.Experimental.NumDeletionsThreshold)
+	fmt.Fprintf(&buf, " deletion_size_ratio_threshold=%f\n", o.Experimental.DeletionSizeRatioThreshold)
+	fmt.Fprintf(&buf, " tombstone_dense_compaction_threshold=%f\n", o.Experimental.TombstoneDenseCompactionThreshold)
 	// We no longer care about strict_wal_tail, but set it to true in case an
 	// older version reads the options.
 	fmt.Fprintf(&buf, " strict_wal_tail=%t\n", true)
@@ -1711,6 +1755,14 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error {
 			o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64)
 		case "read_sampling_multiplier":
 			o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64)
+		case "num_deletions_threshold":
+			o.Experimental.NumDeletionsThreshold, err = strconv.Atoi(value)
+		case "deletion_size_ratio_threshold":
+			val, parseErr := strconv.ParseFloat(value, 32)
+			o.Experimental.DeletionSizeRatioThreshold = float32(val)
+			err = parseErr
+		case "tombstone_dense_compaction_threshold":
+			o.Experimental.TombstoneDenseCompactionThreshold, err = strconv.ParseFloat(value, 64)
 		case "table_cache_shards":
 			o.Experimental.TableCacheShards, err = strconv.Atoi(value)
 		case "table_format":
@@ -1985,6 +2037,8 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab
 	writerOpts.FilterType = levelOpts.FilterType
 	writerOpts.IndexBlockSize = levelOpts.IndexBlockSize
 	writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses
+	writerOpts.NumDeletionsThreshold = o.Experimental.NumDeletionsThreshold
+	writerOpts.DeletionSizeRatioThreshold = o.Experimental.DeletionSizeRatioThreshold
 	return writerOpts
 }

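Finally, a hedged configuration sketch using the new experimental knobs; the values shown merely restate the defaults, and the directory name is arbitrary:

```go
package main

import (
	"log"

	"github.com/cockroachdb/pebble"
)

func main() {
	opts := &pebble.Options{}
	// Zero values are replaced by EnsureDefaults (100, 0.5, and 0.05), so
	// these assignments are purely illustrative. Because EnsureDefaults maps
	// a zero TombstoneDenseCompactionThreshold to 0.05, passing a negative
	// value is the way to disable tombstone density compactions.
	opts.Experimental.NumDeletionsThreshold = 100
	opts.Experimental.DeletionSizeRatioThreshold = 0.5
	opts.Experimental.TombstoneDenseCompactionThreshold = 0.05

	db, err := pebble.Open("demo", opts)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}
```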