@@ -130,6 +130,11 @@ func EvalAddSSTable(
130
130
) (result.Result , error ) {
131
131
args := cArgs .Args .(* kvpb.AddSSTableRequest )
132
132
h := cArgs .Header
133
+
134
+ if err := args .Validate (h ); err != nil {
135
+ return result.Result {}, err
136
+ }
137
+
133
138
ms := cArgs .Stats
134
139
start , end := storage.MVCCKey {Key : args .Key }, storage.MVCCKey {Key : args .EndKey }
135
140
sst := args .Data
@@ -172,6 +177,10 @@ func EvalAddSSTable(
172
177
}
173
178
}
174
179
180
+ if err := checkSSTSpanBounds (ctx , sst , start , end ); err != nil {
181
+ return result.Result {}, err
182
+ }
183
+
175
184
// If requested and necessary, rewrite the SST's MVCC timestamps to the
176
185
// request timestamp. This ensures the writes comply with the timestamp cache
177
186
// and closed timestamp, i.e. by not writing to timestamps that have already
@@ -194,10 +203,16 @@ func EvalAddSSTable(
194
203
}
195
204
}
196
205
197
- var statsDelta enginepb.MVCCStats
198
206
maxLockConflicts := storage .MaxConflictsPerLockConflictError .Get (& cArgs .EvalCtx .ClusterSettings ().SV )
199
207
targetLockConflictBytes := storage .TargetBytesPerLockConflictError .Get (& cArgs .EvalCtx .ClusterSettings ().SV )
200
208
checkConflicts := args .DisallowConflicts || ! args .DisallowShadowingBelow .IsEmpty ()
209
+
210
+ // checkConflictsStatsDelta is a delta between the SST-only statistics and
211
+ // their effect when applied. In other words:
212
+ //
213
+ // sstOnlyStats + checkConflictsStatsDelta equals the actual stats
214
+ // contribution of the sst.
215
+ var checkConflictsStatsDelta enginepb.MVCCStats
201
216
if checkConflicts {
202
217
// If requested, check for MVCC conflicts with existing keys. This enforces
203
218
// all MVCC invariants by returning WriteTooOldError for any existing
@@ -207,10 +222,10 @@ func EvalAddSSTable(
207
222
// Additionally, if DisallowShadowingBelow is set, it will not write
208
223
// above existing/visible values (but will write above tombstones).
209
224
//
210
- // If the overlap between the ingested SST and the engine is large (i.e.
211
- // the SST is wide in keyspace), or if the ingested SST is very small,
212
- // use prefix seeks in CheckSSTConflicts. This ends up being more performant
213
- // as it avoids expensive seeks with index/data block loading in the common
225
+ // If the overlap between the ingested SST and the engine is large (i.e. the
226
+ // SST is wide in keyspace), or if the ingested SST is very small, use
227
+ // prefix seeks in CheckSSTConflicts. This ends up being more performant as
228
+ // it avoids expensive seeks with index/data block loading in the common
214
229
// case of no conflicts.
215
230
usePrefixSeek := false
216
231
bytes , err := cArgs .EvalCtx .GetApproximateDiskBytes (start .Key , end .Key )
@@ -229,9 +244,9 @@ func EvalAddSSTable(
229
244
args .Key , args .EndKey , desc .StartKey .AsRawKey (), desc .EndKey .AsRawKey ())
230
245
231
246
log .VEventf (ctx , 2 , "checking conflicts for SSTable [%s,%s)" , start .Key , end .Key )
232
- statsDelta , err = storage .CheckSSTConflicts (ctx , sst , readWriter , start , end , leftPeekBound , rightPeekBound ,
247
+ checkConflictsStatsDelta , err = storage .CheckSSTConflicts (ctx , sst , readWriter , start , end , leftPeekBound , rightPeekBound ,
233
248
args .DisallowShadowingBelow , sstTimestamp , maxLockConflicts , targetLockConflictBytes , usePrefixSeek )
234
- statsDelta .Add (sstReqStatsDelta )
249
+ checkConflictsStatsDelta .Add (sstReqStatsDelta )
235
250
if err != nil {
236
251
return result.Result {}, errors .Wrap (err , "checking for key collisions" )
237
252
}
@@ -250,112 +265,57 @@ func EvalAddSSTable(
250
265
}
251
266
}
252
267
253
- // Verify that the keys in the sstable are within the range specified by the
254
- // request header, and if the request did not include pre-computed stats,
255
- // compute the expected MVCC stats delta of ingesting the SST.
256
- sstIter , err := storage .NewMemSSTIterator (sst , true /* verify */ , storage.IterOptions {
257
- KeyTypes : storage .IterKeyTypePointsAndRanges ,
258
- LowerBound : keys .MinKey ,
259
- UpperBound : keys .MaxKey ,
260
- })
261
- if err != nil {
262
- return result.Result {}, err
263
- }
264
- defer sstIter .Close ()
265
-
266
- // Check that the first key is in the expected range.
267
- sstIter .SeekGE (storage.MVCCKey {Key : keys .MinKey })
268
- if ok , err := sstIter .Valid (); err != nil {
269
- return result.Result {}, err
270
- } else if ok {
271
- if unsafeKey := sstIter .UnsafeKey (); unsafeKey .Less (start ) {
272
- return result.Result {}, errors .Errorf ("first key %s not in request range [%s,%s)" ,
273
- unsafeKey .Key , start .Key , end .Key )
274
- }
275
- }
276
-
277
- // Get the MVCCStats for the SST being ingested.
278
- var stats enginepb.MVCCStats
279
- if args .MVCCStats != nil {
280
- stats = * args .MVCCStats
281
- } else {
282
- log .VEventf (ctx , 2 , "computing MVCCStats for SSTable [%s,%s)" , start .Key , end .Key )
283
- stats , err = storage .ComputeStatsForIter (sstIter , h .Timestamp .WallTime )
284
- if err != nil {
285
- return result.Result {}, errors .Wrap (err , "computing SSTable MVCC stats" )
286
- }
287
- }
288
-
289
- sstIter .SeekGE (end )
290
- if ok , err := sstIter .Valid (); err != nil {
291
- return result.Result {}, err
292
- } else if ok {
293
- return result.Result {}, errors .Errorf ("last key %s not in request range [%s,%s)" ,
294
- sstIter .UnsafeKey (), start .Key , end .Key )
295
- }
296
-
297
- // The above MVCCStats represents what is in this new SST.
298
- //
299
- // *If* the keys in the SST do not conflict with keys currently in this range,
300
- // then adding the stats for this SST to the range stats should yield the
301
- // correct overall stats.
302
- //
303
- // *However*, if the keys in this range *do* overlap with keys already in this
304
- // range, then adding the SST semantically *replaces*, rather than adds, those
305
- // keys, and the net effect on the stats is not so simple.
306
- //
307
- // To perfectly compute the correct net stats, you could a) determine the
308
- // stats for the span of the existing range that this SST covers and subtract
309
- // it from the range's stats, then b) use a merging iterator that reads from
310
- // the SST and then underlying range and compute the stats of that merged
311
- // span, and then add those stats back in. That would result in correct stats
312
- // that reflect the merging semantics when the SST "shadows" an existing key.
313
- //
314
- // If the underlying range is mostly empty, this isn't terribly expensive --
315
- // computing the existing stats to subtract is cheap, as there is little or no
316
- // existing data to traverse and b) is also pretty cheap -- the merging
317
- // iterator can quickly iterate the in-memory SST.
268
+ // stats will represent the contribution of MVCC stats from the SST to the
269
+ // range. We assume these stats are not estimates if we:
318
270
//
319
- // However, if the underlying range is _not_ empty, then this is not cheap:
320
- // recomputing its stats involves traversing lots of data, and iterating the
321
- // merged iterator has to constantly go back and forth to the iterator.
271
+ // 1. Call ComputeSSTStatsDiff below, which computes the exact stats diff of
272
+ // the sst, even in the presence of duplicate, shadowing, and shadowed keys in
273
+ // the ingesting keyspace. Note that this computation will throw an error if
274
+ // there is a range key in the _incoming sst_.
322
275
//
323
- // If we assume that most SSTs don't shadow too many keys, then the error of
324
- // simply adding the SST stats directly to the range stats is minimal. In the
325
- // worst-case, when we retry a whole SST, then it could be overcounting the
326
- // entire file, but we can hope that that is rare. In the worst case, it may
327
- // cause splitting an under-filled range that would later merge when the
328
- // over-count is fixed.
276
+ // TODO(msbutler): In the first iteration, mvcc stats estimates will be
277
+ // returned if the _underlying_ range contains range keys.
329
278
//
330
- // We can indicate that these stats contain this estimation using the flag in
331
- // the MVCC stats so that later re-computations will not be surprised to find
332
- // any discrepancies.
279
+ // 2. Call for CheckForSSTConflicts above, which asserts the sst does not
280
+ // conflict with any underlying keys, as defined by DisallowShadowing and
281
+ // DisallowShadowingBelow. Note that the checkConflictsStatsDelta is a delta
282
+ // between the SST-only statistics and their effect when applied, which when
283
+ // added to the SST statistics, will adjust them for existing keys and values.
284
+ // Thus, we still need to compute the sst-only statistics on this branch. Also
285
+ // note that CheckForSSTConflicts can still return estimates in certain corner
286
+ // cases.
333
287
//
334
- // Callers can trigger such a re-computation to fixup any discrepancies (and
335
- // remove the ContainsEstimates flag) after they are done ingesting files by
336
- // sending an explicit recompute.
337
- //
338
- // There is a significant performance win to be achieved by ensuring that the
339
- // stats computed are not estimates as it prevents recomputation on splits.
340
- // Running AddSSTable with disallowShadowing=true gets us close to this as we
341
- // do not allow colliding keys to be ingested. However, in the situation that
342
- // two SSTs have KV(s) which "perfectly" shadow an existing key (equal ts and
343
- // value), we do not consider this a collision. While the KV would just
344
- // overwrite the existing data, the stats would be added to the cumulative
345
- // stats of the AddSSTable command, causing a double count for such KVs.
346
- // Therefore, we compute the stats for these "skipped" KVs on-the-fly while
347
- // checking for the collision condition in C++ and subtract them from the
348
- // stats of the SST being ingested before adding them to the running
349
- // cumulative for this command. These stats can then be marked as accurate.
288
+ // While computing stats in this request requires a potentially expensive scan
289
+ // of the range, it ensures a healthy allocator that makes good decisions on
290
+ // when to split/merge or gc a range (etc). Stats estimates may not be harmful
291
+ // if we assume that most SSTs don't shadow too many keys, so the error of
292
+ // simply adding the SST stats directly to the range stats is minimal.
293
+ var stats enginepb.MVCCStats
294
+ log .VEventf (ctx , 2 , "computing MVCCStats for SSTable [%s,%s)" , start .Key , end .Key )
350
295
if checkConflicts {
351
- stats .Add (statsDelta )
352
- if statsDelta .ContainsEstimates == 0 {
353
- stats .ContainsEstimates = 0
296
+ stats .Add (checkConflictsStatsDelta )
297
+ }
298
+ if args .ComputeStatsDiff {
299
+ statsDiff , err := computeSSTStatsDiffWithFallback (ctx , sst , readWriter , h .Timestamp .WallTime , start , end )
300
+ if err != nil {
301
+ return result.Result {}, errors .Wrap (err , "computing SST stats diff" )
302
+ }
303
+ stats .Add (statsDiff )
304
+ } else if args .MVCCStats != nil {
305
+ stats .Add (* args .MVCCStats )
306
+ if ! checkConflicts {
307
+ stats .ContainsEstimates ++
354
308
}
355
309
} else {
356
- stats .ContainsEstimates ++
310
+ sstStats , err := computeSSTStats (ctx , sst , h .Timestamp .WallTime )
311
+ if err != nil {
312
+ return result.Result {}, errors .Wrap (err , "computing SST stats" )
313
+ }
314
+ stats .Add (sstStats )
315
+ if ! checkConflicts {
316
+ stats .ContainsEstimates ++
317
+ }
357
318
}
358
-
359
319
ms .Add (stats )
360
320
361
321
var mvccHistoryMutation * kvserverpb.ReplicatedEvalResult_MVCCHistoryMutation
@@ -505,6 +465,89 @@ func EvalAddSSTable(
505
465
}, nil
506
466
}
507
467
468
+ func computeSSTStatsDiffWithFallback (
469
+ ctx context.Context ,
470
+ sst []byte ,
471
+ readWriter storage.ReadWriter ,
472
+ nowNanos int64 ,
473
+ start , end storage.MVCCKey ,
474
+ ) (enginepb.MVCCStats , error ) {
475
+ stats , err := storage .ComputeSSTStatsDiff (
476
+ ctx , sst , readWriter , nowNanos , start , end )
477
+ if errors .Is (err , storage .ComputeSSTStatsDiffReaderHasRangeKeys ) {
478
+ // Fall back to stats estimates if there are range keys in the engine.
479
+ log .VEventf (ctx , 2 , "computing SST stats as estimates after detecting range keys in engine" )
480
+ sstStats , err := computeSSTStats (ctx , sst , nowNanos )
481
+ if err != nil {
482
+ return enginepb.MVCCStats {}, errors .Wrap (err , "computing SST stats after detecting range keys in engine" )
483
+ }
484
+ sstStats .ContainsEstimates = 1
485
+ return sstStats , nil
486
+ } else if err != nil {
487
+ return enginepb.MVCCStats {}, err
488
+ }
489
+ return stats , nil
490
+ }
491
+
492
+ func computeSSTStats (ctx context.Context , sst []byte , nowNanos int64 ) (enginepb.MVCCStats , error ) {
493
+ sstIter , err := storage .NewMemSSTIterator (sst , true /* verify */ , storage.IterOptions {
494
+ KeyTypes : storage .IterKeyTypePointsAndRanges ,
495
+ LowerBound : keys .MinKey ,
496
+ UpperBound : keys .MaxKey ,
497
+ })
498
+ if err != nil {
499
+ return enginepb.MVCCStats {}, err
500
+ }
501
+ defer sstIter .Close ()
502
+
503
+ sstIter .SeekGE (storage.MVCCKey {Key : keys .MinKey })
504
+ if ok , err := sstIter .Valid (); err != nil {
505
+ return enginepb.MVCCStats {}, err
506
+ } else if ok {
507
+ // TODO(msbutler): this implies we tolerate ingesting an empty addstable.
508
+ // Perhaps we should reject empty addstable requests?
509
+ sstStats , err := storage .ComputeStatsForIter (sstIter , nowNanos )
510
+ if err != nil {
511
+ return enginepb.MVCCStats {}, errors .Wrap (err , "computing SSTable MVCC stats" )
512
+ }
513
+ return sstStats , nil
514
+ }
515
+ return enginepb.MVCCStats {}, nil
516
+ }
517
+
518
+ // checkSSTSpanBounds verifies that the keys in the sstable are within the
519
+ // span specified by [start, end].
520
+ func checkSSTSpanBounds (ctx context.Context , sst []byte , start , end storage.MVCCKey ) error {
521
+ sstIter , err := storage .NewMemSSTIterator (sst , true /* verify */ , storage.IterOptions {
522
+ KeyTypes : storage .IterKeyTypePointsAndRanges ,
523
+ LowerBound : keys .MinKey ,
524
+ UpperBound : keys .MaxKey ,
525
+ })
526
+ if err != nil {
527
+ return err
528
+ }
529
+ defer sstIter .Close ()
530
+
531
+ // Check that the first key is in the expected span.
532
+ sstIter .SeekGE (storage.MVCCKey {Key : keys .MinKey })
533
+ if ok , err := sstIter .Valid (); err != nil {
534
+ return err
535
+ } else if ok {
536
+ if unsafeKey := sstIter .UnsafeKey (); unsafeKey .Less (start ) {
537
+ return errors .Errorf ("first key %s not in request range [%s,%s)" ,
538
+ unsafeKey .Key , start .Key , end .Key )
539
+ }
540
+ }
541
+ sstIter .SeekGE (end )
542
+ if ok , err := sstIter .Valid (); err != nil {
543
+ return err
544
+ } else if ok {
545
+ return errors .Errorf ("last key %s not in request range [%s,%s)" ,
546
+ sstIter .UnsafeKey (), start .Key , end .Key )
547
+ }
548
+ return err
549
+ }
550
+
508
551
// assertSSTContents checks that the SST contains expected inputs:
509
552
//
510
553
// * Only SST set operations (not explicitly verified).
0 commit comments