@@ -172,6 +172,10 @@ func EvalAddSSTable(
172
172
}
173
173
}
174
174
175
+ if err := checkSSTSpanBounds (ctx , sst , start , end ); err != nil {
176
+ return result.Result {}, err
177
+ }
178
+
175
179
// If requested and necessary, rewrite the SST's MVCC timestamps to the
176
180
// request timestamp. This ensures the writes comply with the timestamp cache
177
181
// and closed timestamp, i.e. by not writing to timestamps that have already
@@ -194,10 +198,16 @@ func EvalAddSSTable(
194
198
}
195
199
}
196
200
197
- var statsDelta enginepb.MVCCStats
198
201
maxLockConflicts := storage .MaxConflictsPerLockConflictError .Get (& cArgs .EvalCtx .ClusterSettings ().SV )
199
202
targetLockConflictBytes := storage .TargetBytesPerLockConflictError .Get (& cArgs .EvalCtx .ClusterSettings ().SV )
200
203
checkConflicts := args .DisallowConflicts || ! args .DisallowShadowingBelow .IsEmpty ()
204
+
205
+ // checkConflictsStatsDelta is a delta between the SST-only statistics and
206
+ // their effect when applied. In other words:
207
+ //
208
+ // sstOnlyStats + checkConflictsStatsDelta equals the actual stats
209
+ // contribution of the sst.
210
+ var checkConflictsStatsDelta enginepb.MVCCStats
201
211
if checkConflicts {
202
212
// If requested, check for MVCC conflicts with existing keys. This enforces
203
213
// all MVCC invariants by returning WriteTooOldError for any existing
@@ -207,10 +217,10 @@ func EvalAddSSTable(
207
217
// Additionally, if DisallowShadowingBelow is set, it will not write
208
218
// above existing/visible values (but will write above tombstones).
209
219
//
210
- // If the overlap between the ingested SST and the engine is large (i.e.
211
- // the SST is wide in keyspace), or if the ingested SST is very small,
212
- // use prefix seeks in CheckSSTConflicts. This ends up being more performant
213
- // as it avoids expensive seeks with index/data block loading in the common
220
+ // If the overlap between the ingested SST and the engine is large (i.e. the
221
+ // SST is wide in keyspace), or if the ingested SST is very small, use
222
+ // prefix seeks in CheckSSTConflicts. This ends up being more performant as
223
+ // it avoids expensive seeks with index/data block loading in the common
214
224
// case of no conflicts.
215
225
usePrefixSeek := false
216
226
bytes , err := cArgs .EvalCtx .GetApproximateDiskBytes (start .Key , end .Key )
@@ -229,9 +239,9 @@ func EvalAddSSTable(
229
239
args .Key , args .EndKey , desc .StartKey .AsRawKey (), desc .EndKey .AsRawKey ())
230
240
231
241
log .VEventf (ctx , 2 , "checking conflicts for SSTable [%s,%s)" , start .Key , end .Key )
232
- statsDelta , err = storage .CheckSSTConflicts (ctx , sst , readWriter , start , end , leftPeekBound , rightPeekBound ,
242
+ checkConflictsStatsDelta , err = storage .CheckSSTConflicts (ctx , sst , readWriter , start , end , leftPeekBound , rightPeekBound ,
233
243
args .DisallowShadowingBelow , sstTimestamp , maxLockConflicts , targetLockConflictBytes , usePrefixSeek )
234
- statsDelta .Add (sstReqStatsDelta )
244
+ checkConflictsStatsDelta .Add (sstReqStatsDelta )
235
245
if err != nil {
236
246
return result.Result {}, errors .Wrap (err , "checking for key collisions" )
237
247
}
@@ -250,112 +260,65 @@ func EvalAddSSTable(
250
260
}
251
261
}
252
262
253
- // Verify that the keys in the sstable are within the range specified by the
254
- // request header, and if the request did not include pre-computed stats,
255
- // compute the expected MVCC stats delta of ingesting the SST.
256
- sstIter , err := storage .NewMemSSTIterator (sst , true /* verify */ , storage.IterOptions {
257
- KeyTypes : storage .IterKeyTypePointsAndRanges ,
258
- LowerBound : keys .MinKey ,
259
- UpperBound : keys .MaxKey ,
260
- })
261
- if err != nil {
262
- return result.Result {}, err
263
- }
264
- defer sstIter .Close ()
265
-
266
- // Check that the first key is in the expected range.
267
- sstIter .SeekGE (storage.MVCCKey {Key : keys .MinKey })
268
- if ok , err := sstIter .Valid (); err != nil {
269
- return result.Result {}, err
270
- } else if ok {
271
- if unsafeKey := sstIter .UnsafeKey (); unsafeKey .Less (start ) {
272
- return result.Result {}, errors .Errorf ("first key %s not in request range [%s,%s)" ,
273
- unsafeKey .Key , start .Key , end .Key )
274
- }
275
- }
276
-
277
- // Get the MVCCStats for the SST being ingested.
278
- var stats enginepb.MVCCStats
279
- if args .MVCCStats != nil {
280
- stats = * args .MVCCStats
281
- } else {
282
- log .VEventf (ctx , 2 , "computing MVCCStats for SSTable [%s,%s)" , start .Key , end .Key )
283
- stats , err = storage .ComputeStatsForIter (sstIter , h .Timestamp .WallTime )
284
- if err != nil {
285
- return result.Result {}, errors .Wrap (err , "computing SSTable MVCC stats" )
286
- }
287
- }
288
-
289
- sstIter .SeekGE (end )
290
- if ok , err := sstIter .Valid (); err != nil {
291
- return result.Result {}, err
292
- } else if ok {
293
- return result.Result {}, errors .Errorf ("last key %s not in request range [%s,%s)" ,
294
- sstIter .UnsafeKey (), start .Key , end .Key )
295
- }
296
-
297
- // The above MVCCStats represents what is in this new SST.
298
- //
299
- // *If* the keys in the SST do not conflict with keys currently in this range,
300
- // then adding the stats for this SST to the range stats should yield the
301
- // correct overall stats.
302
- //
303
- // *However*, if the keys in this range *do* overlap with keys already in this
304
- // range, then adding the SST semantically *replaces*, rather than adds, those
305
- // keys, and the net effect on the stats is not so simple.
263
+ // stats will represent the contribution of MVCC stats from the SST to the
264
+ // range. We assume these stats are not estimates if we:
306
265
//
307
- // To perfectly compute the correct net stats, you could a) determine the
308
- // stats for the span of the existing range that this SST covers and subtract
309
- // it from the range's stats, then b) use a merging iterator that reads from
310
- // the SST and then underlying range and compute the stats of that merged
311
- // span, and then add those stats back in. That would result in correct stats
312
- // that reflect the merging semantics when the SST "shadows" an existing key.
266
+ // 1. Call ComputeSSTStatsDiff below, which computes the exact stats diff of
267
+ // the sst, even in the presence of duplicate, shadowing, and shadowed keys in
268
+ // the ingesting keyspace. Note that this computation will throw an error if
269
+ // there is a range key in the _incoming sst_.
313
270
//
314
- // If the underlying range is mostly empty, this isn't terribly expensive --
315
- // computing the existing stats to subtract is cheap, as there is little or no
316
- // existing data to traverse and b) is also pretty cheap -- the merging
317
- // iterator can quickly iterate the in-memory SST.
271
+ // TODO(msbutler): In the first iteration, mvcc stats estimates will be
272
+ // returned if the _underlying_ range contains range keys.
318
273
//
319
- // However, if the underlying range is _not_ empty, then this is not cheap:
320
- // recomputing its stats involves traversing lots of data, and iterating the
321
- // merged iterator has to constantly go back and forth to the iterator.
274
+ // 2. Call for CheckForSSTConflicts above, which asserts the sst does not
275
+ // conflict with any underlying keys, as defined by DisallowShadowing and
276
+ // DisallowShadowingBelow. Note that the checkConflictsStatsDelta is a delta
277
+ // between the SST-only statistics and their effect when applied, which when
278
+ // added to the SST statistics, will adjust them for existing keys and values.
279
+ // Thus, we still need to compute the sst-only statistics on this branch. Also
280
+ // note that CheckForSSTConflicts can still return estimates in certain corner
281
+ // cases.
322
282
//
323
- // If we assume that most SSTs don't shadow too many keys, then the error of
324
- // simply adding the SST stats directly to the range stats is minimal. In the
325
- // worst-case, when we retry a whole SST, then it could be overcounting the
326
- // entire file, but we can hope that that is rare. In the worst case, it may
327
- // cause splitting an under-filled range that would later merge when the
328
- // over-count is fixed.
329
- //
330
- // We can indicate that these stats contain this estimation using the flag in
331
- // the MVCC stats so that later re-computations will not be surprised to find
332
- // any discrepancies.
333
- //
334
- // Callers can trigger such a re-computation to fixup any discrepancies (and
335
- // remove the ContainsEstimates flag) after they are done ingesting files by
336
- // sending an explicit recompute.
337
- //
338
- // There is a significant performance win to be achieved by ensuring that the
339
- // stats computed are not estimates as it prevents recomputation on splits.
340
- // Running AddSSTable with disallowShadowing=true gets us close to this as we
341
- // do not allow colliding keys to be ingested. However, in the situation that
342
- // two SSTs have KV(s) which "perfectly" shadow an existing key (equal ts and
343
- // value), we do not consider this a collision. While the KV would just
344
- // overwrite the existing data, the stats would be added to the cumulative
345
- // stats of the AddSSTable command, causing a double count for such KVs.
346
- // Therefore, we compute the stats for these "skipped" KVs on-the-fly while
347
- // checking for the collision condition in C++ and subtract them from the
348
- // stats of the SST being ingested before adding them to the running
349
- // cumulative for this command. These stats can then be marked as accurate.
283
+ // While computing stats in this request requires a potentially expensive scan
284
+ // of the range, it ensures a healthy allocator that makes good decisions on
285
+ // when to split/merge or gc a range (etc). Stats estimates may not be harmful
286
+ // if we assume that most SSTs don't shadow too many keys, so the error of
287
+ // simply adding the SST stats directly to the range stats is minimal.
288
+ var stats enginepb.MVCCStats
289
+ log .VEventf (ctx , 2 , "computing MVCCStats for SSTable [%s,%s)" , start .Key , end .Key )
350
290
if checkConflicts {
351
- stats .Add (statsDelta )
352
- if statsDelta .ContainsEstimates == 0 {
353
- stats .ContainsEstimates = 0
291
+ stats .Add (checkConflictsStatsDelta )
292
+ }
293
+ if ! (checkConflicts || args .ComputeStatsDiff ) {
294
+ // If CheckSSTConflicts or ComputeStatsDiff are not used to compute stats,
295
+ // then the stats do not account for overlapping keys in the engine, so we
296
+ // have to assume they are estimates.
297
+ stats .ContainsEstimates ++
298
+ }
299
+ if args .ComputeStatsDiff {
300
+ if checkConflicts {
301
+ return result.Result {}, errors .New (
302
+ "AddSSTableRequest.ComputeStatsDiff cannot be used with DisallowConflicts or DisallowShadowingBelow" )
303
+ }
304
+ if args .MVCCStats != nil {
305
+ return result.Result {}, errors .New (
306
+ "AddSSTableRequest.ComputeStatsDiff cannot be used with precomputed MVCCStats" )
354
307
}
308
+ statsDiff , err := computeSSTStatsDiffWrapper (ctx , sst , readWriter , h .Timestamp .WallTime , start , end )
309
+ if err != nil {
310
+ return result.Result {}, errors .Wrap (err , "computing SST stats diff" )
311
+ }
312
+ stats .Add (statsDiff )
313
+ } else if args .MVCCStats != nil {
314
+ stats .Add (* args .MVCCStats )
355
315
} else {
356
- stats .ContainsEstimates ++
316
+ sstStats , err := computeSSTStats (ctx , sst , h .Timestamp .WallTime )
317
+ if err != nil {
318
+ return result.Result {}, errors .Wrap (err , "computing SST stats" )
319
+ }
320
+ stats .Add (sstStats )
357
321
}
358
-
359
322
ms .Add (stats )
360
323
361
324
var mvccHistoryMutation * kvserverpb.ReplicatedEvalResult_MVCCHistoryMutation
@@ -505,6 +468,89 @@ func EvalAddSSTable(
505
468
}, nil
506
469
}
507
470
471
+ func computeSSTStatsDiffWrapper (
472
+ ctx context.Context ,
473
+ sst []byte ,
474
+ readWriter storage.ReadWriter ,
475
+ nowNanos int64 ,
476
+ start , end storage.MVCCKey ,
477
+ ) (enginepb.MVCCStats , error ) {
478
+ stats , err := storage .ComputeSSTStatsDiff (
479
+ ctx , sst , readWriter , nowNanos , start , end )
480
+ if errors .Is (err , storage .ComputeSSTStatsDiffReaderHasRangeKeys ) {
481
+ // Fall back to stats estimates if there are range keys in the engine.
482
+ log .VEventf (ctx , 2 , "computing SST stats as estimates after detecting range keys in engine" )
483
+ sstStats , err := computeSSTStats (ctx , sst , nowNanos )
484
+ if err != nil {
485
+ return enginepb.MVCCStats {}, errors .Wrap (err , "computing SST stats after detecting range keys in engine" )
486
+ }
487
+ sstStats .ContainsEstimates = 1
488
+ return sstStats , nil
489
+ } else if err != nil {
490
+ return enginepb.MVCCStats {}, err
491
+ }
492
+ return stats , nil
493
+ }
494
+
495
+ func computeSSTStats (ctx context.Context , sst []byte , nowNanos int64 ) (enginepb.MVCCStats , error ) {
496
+ sstIter , err := storage .NewMemSSTIterator (sst , true /* verify */ , storage.IterOptions {
497
+ KeyTypes : storage .IterKeyTypePointsAndRanges ,
498
+ LowerBound : keys .MinKey ,
499
+ UpperBound : keys .MaxKey ,
500
+ })
501
+ if err != nil {
502
+ return enginepb.MVCCStats {}, err
503
+ }
504
+ defer sstIter .Close ()
505
+
506
+ sstIter .SeekGE (storage.MVCCKey {Key : keys .MinKey })
507
+ if ok , err := sstIter .Valid (); err != nil {
508
+ return enginepb.MVCCStats {}, err
509
+ } else if ok {
510
+ // TODO(msbutler): this implies we tolerate ingesting an empty addstable.
511
+ // Perhaps we should reject empty addstable requests?
512
+ sstStats , err := storage .ComputeStatsForIter (sstIter , nowNanos )
513
+ if err != nil {
514
+ return enginepb.MVCCStats {}, errors .Wrap (err , "computing SSTable MVCC stats" )
515
+ }
516
+ return sstStats , nil
517
+ }
518
+ return enginepb.MVCCStats {}, nil
519
+ }
520
+
521
+ // checkSSTSpanBounds verifies that the keys in the sstable are within the
522
+ // span specified by the request header.
523
+ func checkSSTSpanBounds (ctx context.Context , sst []byte , start , end storage.MVCCKey ) error {
524
+ sstIter , err := storage .NewMemSSTIterator (sst , true /* verify */ , storage.IterOptions {
525
+ KeyTypes : storage .IterKeyTypePointsAndRanges ,
526
+ LowerBound : keys .MinKey ,
527
+ UpperBound : keys .MaxKey ,
528
+ })
529
+ if err != nil {
530
+ return err
531
+ }
532
+ defer sstIter .Close ()
533
+
534
+ // Check that the first key is in the expected span.
535
+ sstIter .SeekGE (storage.MVCCKey {Key : keys .MinKey })
536
+ if ok , err := sstIter .Valid (); err != nil {
537
+ return err
538
+ } else if ok {
539
+ if unsafeKey := sstIter .UnsafeKey (); unsafeKey .Less (start ) {
540
+ return errors .Errorf ("first key %s not in request range [%s,%s)" ,
541
+ unsafeKey .Key , start .Key , end .Key )
542
+ }
543
+ }
544
+ sstIter .SeekGE (end )
545
+ if ok , err := sstIter .Valid (); err != nil {
546
+ return err
547
+ } else if ok {
548
+ return errors .Errorf ("last key %s not in request range [%s,%s)" ,
549
+ sstIter .UnsafeKey (), start .Key , end .Key )
550
+ }
551
+ return err
552
+ }
553
+
508
554
// assertSSTContents checks that the SST contains expected inputs:
509
555
//
510
556
// * Only SST set operations (not explicitly verified).
0 commit comments