Skip to content

Commit a5edd16

Browse files
committed
pebble, sstable: preserve blob values only if inputs match output policy
Now that the value separation policy used by each sstable is persisted, we can decide to preserve blob references during a compaction only when the compaction's value separation policy matches that of its inputs. Resolves: #4814
1 parent 6c52edf commit a5edd16

File tree

10 files changed

+243
-76
lines changed

10 files changed

+243
-76
lines changed

compaction.go

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ type tableCompaction struct {
250250
// b) rewrite blob files: The compaction will write eligible values to new
251251
// blob files. This consumes more write bandwidth because all values are
252252
// rewritten. However it restores locality.
253-
getValueSeparation func(JobID, *tableCompaction) compact.ValueSeparation
253+
getValueSeparation func(JobID, *tableCompaction, ValueStoragePolicy) compact.ValueSeparation
254254

255255
// startLevel is the level that is being compacted. Inputs from startLevel
256256
// and outputLevel will be merged to produce a set of outputLevel files.
@@ -542,7 +542,7 @@ func (c *tableCompaction) makeInfo(jobID JobID) CompactionInfo {
542542
return info
543543
}
544544

545-
type getValueSeparation func(JobID, *tableCompaction) compact.ValueSeparation
545+
type getValueSeparation func(JobID, *tableCompaction, ValueStoragePolicy) compact.ValueSeparation
546546

547547
// newCompaction constructs a compaction from the provided picked compaction.
548548
//
@@ -3300,10 +3300,7 @@ func (d *DB) runDefaultTableCompaction(
33003300
d.mu.Unlock()
33013301
defer d.mu.Lock()
33023302

3303-
// Determine whether we should separate values into blob files.
3304-
valueSeparation := c.getValueSeparation(jobID, c)
3305-
3306-
result := d.compactAndWrite(jobID, c, snapshots, valueSeparation)
3303+
result := d.compactAndWrite(jobID, c, snapshots)
33073304
if result.Err == nil {
33083305
ve, result.Err = c.makeVersionEdit(result)
33093306
}
@@ -3344,10 +3341,7 @@ func (d *DB) runDefaultTableCompaction(
33443341
// compactAndWrite runs the data part of a compaction, where we set up a
33453342
// compaction iterator and use it to write output tables.
33463343
func (d *DB) compactAndWrite(
3347-
jobID JobID,
3348-
c *tableCompaction,
3349-
snapshots compact.Snapshots,
3350-
valueSeparation compact.ValueSeparation,
3344+
jobID JobID, c *tableCompaction, snapshots compact.Snapshots,
33513345
) (result compact.Result) {
33523346
suggestedCacheReaders := blob.SuggestedCachedReaders(len(c.inputs))
33533347
// Compactions use a pool of buffers to read blocks, avoiding polluting the
@@ -3442,25 +3436,36 @@ func (d *DB) compactAndWrite(
34423436
}
34433437
runner := compact.NewRunner(runnerCfg, iter)
34443438

3445-
var spanPolicyValid bool
3446-
var spanPolicy SpanPolicy
3447-
// If spanPolicyValid is true and spanPolicyEndKey is empty, then spanPolicy
3448-
// applies for the rest of the keyspace.
3439+
// Determine the value separation policy for this compaction.
3440+
// We pass the value storage span policy if one applies for the entire
3441+
// compaction keyspace in order to preserve blob references.
3442+
// Note that the value storage policy may change per table if
3443+
// there are different span policies in effect for this output
3444+
// range.
3445+
var compactionValueStoragePolicy ValueStoragePolicy
34493446
var spanPolicyEndKey []byte
3447+
var spanPolicy SpanPolicy
3448+
spanPolicy, spanPolicyEndKey, err = d.opts.Experimental.SpanPolicyFunc(c.bounds.Start)
3449+
if err != nil {
3450+
return runner.Finish().WithError(err)
3451+
}
3452+
if len(spanPolicyEndKey) == 0 || d.cmp(c.bounds.End.Key, spanPolicyEndKey) < 0 {
3453+
compactionValueStoragePolicy = spanPolicy.ValueStoragePolicy
3454+
}
34503455

3456+
valueSeparation := c.getValueSeparation(jobID, c, compactionValueStoragePolicy)
34513457
for runner.MoreDataToWrite() {
34523458
if c.cancel.Load() {
34533459
return runner.Finish().WithError(ErrCancelledCompaction)
34543460
}
34553461
// Create a new table.
34563462
firstKey := runner.FirstKey()
3457-
if !spanPolicyValid || (len(spanPolicyEndKey) > 0 && d.cmp(firstKey, spanPolicyEndKey) >= 0) {
3463+
if len(spanPolicyEndKey) > 0 && d.cmp(firstKey, spanPolicyEndKey) >= 0 {
34583464
var err error
34593465
spanPolicy, spanPolicyEndKey, err = d.opts.Experimental.SpanPolicyFunc(firstKey)
34603466
if err != nil {
34613467
return runner.Finish().WithError(err)
34623468
}
3463-
spanPolicyValid = true
34643469
}
34653470

34663471
writerOpts := d.makeWriterOptions(c.outputLevel.level)

data_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -912,7 +912,10 @@ func runDBDefineCmd(td *datadriven.TestData, opts *Options) (*DB, error) {
912912
// all the tables, we can construct all the referenced blob files and add
913913
// them to the final version edit.
914914
valueSeparator := &defineDBValueSeparator{
915-
pbr: &preserveBlobReferences{},
915+
pbr: &preserveBlobReferences{
916+
originalValueSeprationKind: sstable.ValueSeparationDefault,
917+
minimumValueSize: d.opts.Experimental.ValueSeparationPolicy().MinimumSize,
918+
},
916919
metas: make(map[base.BlobFileID]*manifest.PhysicalBlobFile),
917920
}
918921

@@ -936,7 +939,7 @@ func runDBDefineCmd(td *datadriven.TestData, opts *Options) (*DB, error) {
936939
if err != nil {
937940
return err
938941
}
939-
c.getValueSeparation = func(JobID, *tableCompaction) compact.ValueSeparation {
942+
c.getValueSeparation = func(JobID, *tableCompaction, ValueStoragePolicy) compact.ValueSeparation {
940943
return valueSeparator
941944
}
942945
// NB: define allows the test to exactly specify which keys go

internal/manifest/table_metadata.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,13 @@ type TableBackingProperties struct {
413413
TombstoneDenseBlocksRatio float64
414414

415415
CompressionStats block.CompressionStats
416+
417+
// ValueSeparationKind is the value separation policy used when writing the table.
418+
ValueSeparationKind sstable.ValueSeparationKind
419+
// ValueSeparationMinSize is the minimum value size for which values were
420+
// separated when writing the table. This value is 0 if the policy used
421+
// does not write blob files.
422+
ValueSeparationMinSize uint64
416423
}
417424

418425
// NumPointDeletions is the number of point deletions in the sstable. For virtual
@@ -447,6 +454,8 @@ func (b *TableBacking) PopulateProperties(props *sstable.Properties) *TableBacki
447454
NumRangeKeyDels: props.NumRangeKeyDels,
448455
NumRangeKeySets: props.NumRangeKeySets,
449456
ValueBlocksSize: props.ValueBlocksSize,
457+
ValueSeparationKind: sstable.ValueSeparationKind(props.ValueSeparationKind),
458+
ValueSeparationMinSize: props.ValueSeparationMinSize,
450459
}
451460
if props.NumDataBlocks != 0 {
452461
b.props.TombstoneDenseBlocksRatio = float64(props.NumTombstoneDenseBlocks) / float64(props.NumDataBlocks)

internal/manifest/table_metadata_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ func TestTableMetadataSize(t *testing.T) {
198198
structSize, tableMetadataSize)
199199
}
200200

201-
const tableBackingSize = 160
201+
const tableBackingSize = 176
202202
if structSize := unsafe.Sizeof(TableBacking{}); structSize != tableBackingSize {
203203
t.Errorf("TableBacking struct size (%d bytes) is not expected size (%d bytes)",
204204
structSize, tableBackingSize)

sstable/properties.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,6 @@ const (
247247
// ValueSeparationSpanPolicy indicates that values were separated
248248
// based on a span policy during writing.
249249
ValueSeparationSpanPolicy
250-
251-
// ValueSeparationPreservedRefs indicates that existing blob references
252-
// were preserved without creating new blob files.
253-
ValueSeparationPreservedRefs
254250
)
255251

256252
var (

0 commit comments

Comments
 (0)