Skip to content

Commit d4de1d1

Browse files
committed
batcheval: add ComputeStatsDiff arg to addsstable request
ComputeStatsDiff causes the server to compute the effect this sst will have on the range's mvcc stats, even in the presence of overlapping keys. This flag cannot be passed with the MVCCStats, DisallowShadowingBelow, or DisallowShadowing fields. This flag assumes that any key in the sst that is shadowed by a key in the engine is also a duplicate. As an example, accurate stats will be computed here: - sst: a3,a2 and eng: a2,a1 but not for, as a1 is not a duplicate: - sst: a1 and eng: a2 Informs #145548 Release note: none
1 parent 975861f commit d4de1d1

File tree

3 files changed

+285
-199
lines changed

3 files changed

+285
-199
lines changed

pkg/kv/kvpb/api.proto

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2112,6 +2112,19 @@ message AddSSTableRequest {
21122112
//
21132113
// TODO(dt,msbutler,bilal): This is unsupported.
21142114
util.hlc.Timestamp ignore_keys_above_timestamp = 12 [(gogoproto.nullable) = false];
2115+
2116+
// ComputeStatsDiff causes the server to compute the effect this
2117+
// sst will have on the range's mvcc stats, even in the presence of
2118+
// overlapping keys. This flag cannot be passed with the MVCCStats,
2119+
// DisallowShadowingBelow, or DisallowShadowing fields.
2120+
//
2121+
// This flag assumes that any key in the sst that is shadowed by a key in the
2122+
// engine is also a duplicate. As an example, accurate stats will be computed
2123+
// here:
2124+
// - sst: a3,a2 and eng: a2,a1
2125+
// but not for, as a1 is not a duplicate:
2126+
// - sst: a1 and eng: a2
2127+
bool compute_stats_diff = 13 [(gogoproto.customname) = "ComputeStatsDiff"];
21152128

21162129
reserved 10, 11;
21172130
}

pkg/kv/kvserver/batcheval/cmd_add_sstable.go

Lines changed: 151 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,10 @@ func EvalAddSSTable(
172172
}
173173
}
174174

175+
if err := checkSSTSpanBounds(ctx, sst, start, end); err != nil {
176+
return result.Result{}, err
177+
}
178+
175179
// If requested and necessary, rewrite the SST's MVCC timestamps to the
176180
// request timestamp. This ensures the writes comply with the timestamp cache
177181
// and closed timestamp, i.e. by not writing to timestamps that have already
@@ -194,10 +198,16 @@ func EvalAddSSTable(
194198
}
195199
}
196200

197-
var statsDelta enginepb.MVCCStats
198201
maxLockConflicts := storage.MaxConflictsPerLockConflictError.Get(&cArgs.EvalCtx.ClusterSettings().SV)
199202
targetLockConflictBytes := storage.TargetBytesPerLockConflictError.Get(&cArgs.EvalCtx.ClusterSettings().SV)
200203
checkConflicts := args.DisallowConflicts || !args.DisallowShadowingBelow.IsEmpty()
204+
205+
// checkConflictsStatsDelta is a delta between the SST-only statistics and
206+
// their effect when applied. In other words:
207+
//
208+
// sstOnlyStats + checkConflictsStatsDelta equals the actual stats
209+
// contribution of the sst.
210+
var checkConflictsStatsDelta enginepb.MVCCStats
201211
if checkConflicts {
202212
// If requested, check for MVCC conflicts with existing keys. This enforces
203213
// all MVCC invariants by returning WriteTooOldError for any existing
@@ -207,10 +217,10 @@ func EvalAddSSTable(
207217
// Additionally, if DisallowShadowingBelow is set, it will not write
208218
// above existing/visible values (but will write above tombstones).
209219
//
210-
// If the overlap between the ingested SST and the engine is large (i.e.
211-
// the SST is wide in keyspace), or if the ingested SST is very small,
212-
// use prefix seeks in CheckSSTConflicts. This ends up being more performant
213-
// as it avoids expensive seeks with index/data block loading in the common
220+
// If the overlap between the ingested SST and the engine is large (i.e. the
221+
// SST is wide in keyspace), or if the ingested SST is very small, use
222+
// prefix seeks in CheckSSTConflicts. This ends up being more performant as
223+
// it avoids expensive seeks with index/data block loading in the common
214224
// case of no conflicts.
215225
usePrefixSeek := false
216226
bytes, err := cArgs.EvalCtx.GetApproximateDiskBytes(start.Key, end.Key)
@@ -229,9 +239,9 @@ func EvalAddSSTable(
229239
args.Key, args.EndKey, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey())
230240

231241
log.VEventf(ctx, 2, "checking conflicts for SSTable [%s,%s)", start.Key, end.Key)
232-
statsDelta, err = storage.CheckSSTConflicts(ctx, sst, readWriter, start, end, leftPeekBound, rightPeekBound,
242+
checkConflictsStatsDelta, err = storage.CheckSSTConflicts(ctx, sst, readWriter, start, end, leftPeekBound, rightPeekBound,
233243
args.DisallowShadowingBelow, sstTimestamp, maxLockConflicts, targetLockConflictBytes, usePrefixSeek)
234-
statsDelta.Add(sstReqStatsDelta)
244+
checkConflictsStatsDelta.Add(sstReqStatsDelta)
235245
if err != nil {
236246
return result.Result{}, errors.Wrap(err, "checking for key collisions")
237247
}
@@ -250,112 +260,65 @@ func EvalAddSSTable(
250260
}
251261
}
252262

253-
// Verify that the keys in the sstable are within the range specified by the
254-
// request header, and if the request did not include pre-computed stats,
255-
// compute the expected MVCC stats delta of ingesting the SST.
256-
sstIter, err := storage.NewMemSSTIterator(sst, true /* verify */, storage.IterOptions{
257-
KeyTypes: storage.IterKeyTypePointsAndRanges,
258-
LowerBound: keys.MinKey,
259-
UpperBound: keys.MaxKey,
260-
})
261-
if err != nil {
262-
return result.Result{}, err
263-
}
264-
defer sstIter.Close()
265-
266-
// Check that the first key is in the expected range.
267-
sstIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
268-
if ok, err := sstIter.Valid(); err != nil {
269-
return result.Result{}, err
270-
} else if ok {
271-
if unsafeKey := sstIter.UnsafeKey(); unsafeKey.Less(start) {
272-
return result.Result{}, errors.Errorf("first key %s not in request range [%s,%s)",
273-
unsafeKey.Key, start.Key, end.Key)
274-
}
275-
}
276-
277-
// Get the MVCCStats for the SST being ingested.
278-
var stats enginepb.MVCCStats
279-
if args.MVCCStats != nil {
280-
stats = *args.MVCCStats
281-
} else {
282-
log.VEventf(ctx, 2, "computing MVCCStats for SSTable [%s,%s)", start.Key, end.Key)
283-
stats, err = storage.ComputeStatsForIter(sstIter, h.Timestamp.WallTime)
284-
if err != nil {
285-
return result.Result{}, errors.Wrap(err, "computing SSTable MVCC stats")
286-
}
287-
}
288-
289-
sstIter.SeekGE(end)
290-
if ok, err := sstIter.Valid(); err != nil {
291-
return result.Result{}, err
292-
} else if ok {
293-
return result.Result{}, errors.Errorf("last key %s not in request range [%s,%s)",
294-
sstIter.UnsafeKey(), start.Key, end.Key)
295-
}
296-
297-
// The above MVCCStats represents what is in this new SST.
298-
//
299-
// *If* the keys in the SST do not conflict with keys currently in this range,
300-
// then adding the stats for this SST to the range stats should yield the
301-
// correct overall stats.
302-
//
303-
// *However*, if the keys in this range *do* overlap with keys already in this
304-
// range, then adding the SST semantically *replaces*, rather than adds, those
305-
// keys, and the net effect on the stats is not so simple.
263+
// stats will represent the contribution of MVCC stats from the SST to the
264+
// range. We assume these stats are not estimates if we:
306265
//
307-
// To perfectly compute the correct net stats, you could a) determine the
308-
// stats for the span of the existing range that this SST covers and subtract
309-
// it from the range's stats, then b) use a merging iterator that reads from
310-
// the SST and then underlying range and compute the stats of that merged
311-
// span, and then add those stats back in. That would result in correct stats
312-
// that reflect the merging semantics when the SST "shadows" an existing key.
266+
// 1. Call ComputeSSTStatsDiff below, which computes the exact stats diff of
267+
// the sst, even in the presence of duplicate, shadowing, and shadowed keys in
268+
// the ingesting keyspace. Note that this computation will throw an error if
269+
// there is a range key in the _incoming sst_.
313270
//
314-
// If the underlying range is mostly empty, this isn't terribly expensive --
315-
// computing the existing stats to subtract is cheap, as there is little or no
316-
// existing data to traverse and b) is also pretty cheap -- the merging
317-
// iterator can quickly iterate the in-memory SST.
271+
// TODO(msbutler): In the first iteration, mvcc stats estimates will be
272+
// returned if the _underlying_ range contains range keys.
318273
//
319-
// However, if the underlying range is _not_ empty, then this is not cheap:
320-
// recomputing its stats involves traversing lots of data, and iterating the
321-
// merged iterator has to constantly go back and forth to the iterator.
274+
// 2. Call for CheckForSSTConflicts above, which asserts the sst does not
275+
// conflict with any underlying keys, as defined by DisallowShadowing and
276+
// DisallowShadowingBelow. Note that the checkConflictsStatsDelta is a delta
277+
// between the SST-only statistics and their effect when applied, which when
278+
// added to the SST statistics, will adjust them for existing keys and values.
279+
// Thus, we still need to compute the sst-only statistics on this branch. Also
280+
// note that CheckForSSTConflicts can still return estimates in certain corner
281+
// cases.
322282
//
323-
// If we assume that most SSTs don't shadow too many keys, then the error of
324-
// simply adding the SST stats directly to the range stats is minimal. In the
325-
// worst-case, when we retry a whole SST, then it could be overcounting the
326-
// entire file, but we can hope that that is rare. In the worst case, it may
327-
// cause splitting an under-filled range that would later merge when the
328-
// over-count is fixed.
329-
//
330-
// We can indicate that these stats contain this estimation using the flag in
331-
// the MVCC stats so that later re-computations will not be surprised to find
332-
// any discrepancies.
333-
//
334-
// Callers can trigger such a re-computation to fixup any discrepancies (and
335-
// remove the ContainsEstimates flag) after they are done ingesting files by
336-
// sending an explicit recompute.
337-
//
338-
// There is a significant performance win to be achieved by ensuring that the
339-
// stats computed are not estimates as it prevents recomputation on splits.
340-
// Running AddSSTable with disallowShadowing=true gets us close to this as we
341-
// do not allow colliding keys to be ingested. However, in the situation that
342-
// two SSTs have KV(s) which "perfectly" shadow an existing key (equal ts and
343-
// value), we do not consider this a collision. While the KV would just
344-
// overwrite the existing data, the stats would be added to the cumulative
345-
// stats of the AddSSTable command, causing a double count for such KVs.
346-
// Therefore, we compute the stats for these "skipped" KVs on-the-fly while
347-
// checking for the collision condition in C++ and subtract them from the
348-
// stats of the SST being ingested before adding them to the running
349-
// cumulative for this command. These stats can then be marked as accurate.
283+
// While computing stats in this request requires a potentially expensive scan
284+
// of the range, it ensures a healthy allocator that makes good decisions on
285+
// when to split/merge or gc a range (etc). Stats estimates may not be harmful
286+
// if we assume that most SSTs don't shadow too many keys, so the error of
287+
// simply adding the SST stats directly to the range stats is minimal.
288+
var stats enginepb.MVCCStats
289+
log.VEventf(ctx, 2, "computing MVCCStats for SSTable [%s,%s)", start.Key, end.Key)
350290
if checkConflicts {
351-
stats.Add(statsDelta)
352-
if statsDelta.ContainsEstimates == 0 {
353-
stats.ContainsEstimates = 0
291+
stats.Add(checkConflictsStatsDelta)
292+
}
293+
if !(checkConflicts || args.ComputeStatsDiff) {
294+
// If CheckSSTConflicts or ComputeStatsDiff are not used to compute stats,
295+
// then the stats do not account for overlapping keys in the engine, so we
296+
// have to assume they are estimates.
297+
stats.ContainsEstimates++
298+
}
299+
if args.ComputeStatsDiff {
300+
if checkConflicts {
301+
return result.Result{}, errors.New(
302+
"AddSSTableRequest.ComputeStatsDiff cannot be used with DisallowConflicts or DisallowShadowingBelow")
303+
}
304+
if args.MVCCStats != nil {
305+
return result.Result{}, errors.New(
306+
"AddSSTableRequest.ComputeStatsDiff cannot be used with precomputed MVCCStats")
354307
}
308+
statsDiff, err := computeSSTStatsDiffWrapper(ctx, sst, readWriter, h.Timestamp.WallTime, start, end)
309+
if err != nil {
310+
return result.Result{}, errors.Wrap(err, "computing SST stats diff")
311+
}
312+
stats.Add(statsDiff)
313+
} else if args.MVCCStats != nil {
314+
stats.Add(*args.MVCCStats)
355315
} else {
356-
stats.ContainsEstimates++
316+
sstStats, err := computeSSTStats(ctx, sst, h.Timestamp.WallTime)
317+
if err != nil {
318+
return result.Result{}, errors.Wrap(err, "computing SST stats")
319+
}
320+
stats.Add(sstStats)
357321
}
358-
359322
ms.Add(stats)
360323

361324
var mvccHistoryMutation *kvserverpb.ReplicatedEvalResult_MVCCHistoryMutation
@@ -505,6 +468,89 @@ func EvalAddSSTable(
505468
}, nil
506469
}
507470

471+
func computeSSTStatsDiffWrapper(
472+
ctx context.Context,
473+
sst []byte,
474+
readWriter storage.ReadWriter,
475+
nowNanos int64,
476+
start, end storage.MVCCKey,
477+
) (enginepb.MVCCStats, error) {
478+
stats, err := storage.ComputeSSTStatsDiff(
479+
ctx, sst, readWriter, nowNanos, start, end)
480+
if errors.Is(err, storage.ComputeSSTStatsDiffReaderHasRangeKeys) {
481+
// Fall back to stats estimates if there are range keys in the engine.
482+
log.VEventf(ctx, 2, "computing SST stats as estimates after detecting range keys in engine")
483+
sstStats, err := computeSSTStats(ctx, sst, nowNanos)
484+
if err != nil {
485+
return enginepb.MVCCStats{}, errors.Wrap(err, "computing SST stats after detecting range keys in engine")
486+
}
487+
sstStats.ContainsEstimates = 1
488+
return sstStats, nil
489+
} else if err != nil {
490+
return enginepb.MVCCStats{}, err
491+
}
492+
return stats, nil
493+
}
494+
495+
func computeSSTStats(ctx context.Context, sst []byte, nowNanos int64) (enginepb.MVCCStats, error) {
496+
sstIter, err := storage.NewMemSSTIterator(sst, true /* verify */, storage.IterOptions{
497+
KeyTypes: storage.IterKeyTypePointsAndRanges,
498+
LowerBound: keys.MinKey,
499+
UpperBound: keys.MaxKey,
500+
})
501+
if err != nil {
502+
return enginepb.MVCCStats{}, err
503+
}
504+
defer sstIter.Close()
505+
506+
sstIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
507+
if ok, err := sstIter.Valid(); err != nil {
508+
return enginepb.MVCCStats{}, err
509+
} else if ok {
510+
// TODO(msbutler): this implies we tolerate ingesting an empty addstable.
511+
// Perhaps we should reject empty addstable requests?
512+
sstStats, err := storage.ComputeStatsForIter(sstIter, nowNanos)
513+
if err != nil {
514+
return enginepb.MVCCStats{}, errors.Wrap(err, "computing SSTable MVCC stats")
515+
}
516+
return sstStats, nil
517+
}
518+
return enginepb.MVCCStats{}, nil
519+
}
520+
521+
// checkSSTSpanBounds verifies that the keys in the sstable are within the
522+
// span specified by the request header.
523+
func checkSSTSpanBounds(ctx context.Context, sst []byte, start, end storage.MVCCKey) error {
524+
sstIter, err := storage.NewMemSSTIterator(sst, true /* verify */, storage.IterOptions{
525+
KeyTypes: storage.IterKeyTypePointsAndRanges,
526+
LowerBound: keys.MinKey,
527+
UpperBound: keys.MaxKey,
528+
})
529+
if err != nil {
530+
return err
531+
}
532+
defer sstIter.Close()
533+
534+
// Check that the first key is in the expected span.
535+
sstIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
536+
if ok, err := sstIter.Valid(); err != nil {
537+
return err
538+
} else if ok {
539+
if unsafeKey := sstIter.UnsafeKey(); unsafeKey.Less(start) {
540+
return errors.Errorf("first key %s not in request range [%s,%s)",
541+
unsafeKey.Key, start.Key, end.Key)
542+
}
543+
}
544+
sstIter.SeekGE(end)
545+
if ok, err := sstIter.Valid(); err != nil {
546+
return err
547+
} else if ok {
548+
return errors.Errorf("last key %s not in request range [%s,%s)",
549+
sstIter.UnsafeKey(), start.Key, end.Key)
550+
}
551+
return err
552+
}
553+
508554
// assertSSTContents checks that the SST contains expected inputs:
509555
//
510556
// * Only SST set operations (not explicitly verified).

0 commit comments

Comments
 (0)