Merge #147919 #148695

craig[bot] · msbutler · rafiss · craig[bot] · commit 3cc3a222ecbf · 2025-06-23T19:04:09.000Z
147919: batcheval: add ComputeStatsDiff arg to addsstable r=steven a=msbutler ComputeStatsDiff causes the server to compute the effect this sst will have on the range's mvcc stats, even in the presence of overlapping keys. This flag cannot be passed with the MVCCStats, DisallowShadowingBelow, or DisallowShadowing fields. This flag assumes that any key in the sst that is shadowed by a key in the engine is also a duplicate. As an example, accurate stats will be computed here: - sst: a@3,a@2 and eng: a@2,a@1 but not for, as a@1 is not a duplicate: - sst: a@1 and eng: a@2 Informs #145548 Release note: none 148695: roachtest: mark psycopg test as flaky r=rafiss a=rafiss fixes #148343 fixes #146895 Release note: None Co-authored-by: Michael Butler <butler@cockroachlabs.com> Co-authored-by: Rafi Shamim <rafi@cockroachlabs.com>
diff --git a/pkg/cmd/roachtest/tests/psycopg_blocklist.go b/pkg/cmd/roachtest/tests/psycopg_blocklist.go
@@ -24,6 +24,7 @@ var psycopgIgnoreList = blocklist{
 	`tests.pool.test_pool.test_reconnect_after_grow_failed`:                                          "requires insecure mode",
 	`tests.pool.test_pool.test_reconnect_failure[False]`:                                             "requires insecure mode",
 	`tests.pool.test_pool.test_refill_on_check`:                                                      "requires insecure mode",
+	`tests.pool.test_pool.test_shrink`:                                                               "flaky; see #146895",
 	`tests.pool.test_pool.test_stats_connect`:                                                        "requires insecure mode",
 	`tests.pool.test_pool_async.test_connect_check_timeout[asyncio]`:                                 "requires insecure mode",
 	`tests.pool.test_pool_async.test_reconnect[asyncio]`:                                             "requires insecure mode",
diff --git a/pkg/kv/kvpb/api.go b/pkg/kv/kvpb/api.go
@@ -2574,3 +2574,17 @@ func validateExclusionTimestampForBatch(ts hlc.Timestamp, h Header) error {
 	}
 	return nil
 }
+
+// Validate rejects requests that set ComputeStatsDiff together with DisallowConflicts,
+// DisallowShadowingBelow, or precomputed MVCCStats. The header bh is not consulted.
+func (r *AddSSTableRequest) Validate(bh Header) error {
+	switch {
+	case !r.ComputeStatsDiff:
+		return nil
+	case r.DisallowConflicts || r.DisallowShadowingBelow.IsSet():
+		return errors.New("invalid AddSSTableRequest: ComputeStatsDiff cannot be used with DisallowConflicts or DisallowShadowingBelow")
+	case r.MVCCStats != nil:
+		return errors.New("invalid AddSSTableRequest: ComputeStatsDiff cannot be used with precomputed MVCCStats")
+	}
+	return nil
+}
diff --git a/pkg/kv/kvpb/api.proto b/pkg/kv/kvpb/api.proto
@@ -2127,6 +2127,26 @@ message AddSSTableRequest {
   //
   // TODO(dt,msbutler,bilal): This is unsupported.
   util.hlc.Timestamp ignore_keys_above_timestamp = 12 [(gogoproto.nullable) = false];
+  
+  // ComputeStatsDiff causes the server to compute the effect this
+  // SST will have on the range's MVCC stats, even in the presence of
+  // overlapping keys. This flag cannot be passed with the MVCCStats,
+  // DisallowShadowingBelow, or DisallowConflicts fields.
+  //
+  // This flag assumes that any key in the sst that is shadowed by a key in the
+  // engine is also a duplicate. As an example, accurate stats will be computed
+  // here: 
+  // - sst: a@3,a@2 and eng: a@2,a@1
+  //
+  // but not here, as a@1 is not a duplicate:
+  // - sst: a@1 and eng: a@2
+  //
+  // In the latter case, we silently create inaccurate stats currently. A TODO
+  // in storage.ComputeSSTStatsDiff is to detect this case and increment
+  // ContainsEstimates. At least for PCR, the first client of this flag, we
+  // expect the edge case to be quite rare, and would only occur if we're
+  // ingesting data older than the ingesting range's GC TTL.
+  bool compute_stats_diff = 13 [(gogoproto.customname) = "ComputeStatsDiff"];
 
   reserved 10, 11;
 }
diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go
@@ -130,6 +130,11 @@ func EvalAddSSTable(
 ) (result.Result, error) {
 	args := cArgs.Args.(*kvpb.AddSSTableRequest)
 	h := cArgs.Header
+
+	if err := args.Validate(h); err != nil {
+		return result.Result{}, err
+	}
+
 	ms := cArgs.Stats
 	start, end := storage.MVCCKey{Key: args.Key}, storage.MVCCKey{Key: args.EndKey}
 	sst := args.Data
@@ -172,6 +177,10 @@ func EvalAddSSTable(
 		}
 	}
 
+	if err := checkSSTSpanBounds(ctx, sst, start, end); err != nil {
+		return result.Result{}, err
+	}
+
 	// If requested and necessary, rewrite the SST's MVCC timestamps to the
 	// request timestamp. This ensures the writes comply with the timestamp cache
 	// and closed timestamp, i.e. by not writing to timestamps that have already
@@ -194,10 +203,16 @@ func EvalAddSSTable(
 		}
 	}
 
-	var statsDelta enginepb.MVCCStats
 	maxLockConflicts := storage.MaxConflictsPerLockConflictError.Get(&cArgs.EvalCtx.ClusterSettings().SV)
 	targetLockConflictBytes := storage.TargetBytesPerLockConflictError.Get(&cArgs.EvalCtx.ClusterSettings().SV)
 	checkConflicts := args.DisallowConflicts || !args.DisallowShadowingBelow.IsEmpty()
+
+	// checkConflictsStatsDelta is a delta between the SST-only statistics and
+	// their effect when applied. In other words:
+	//
+	// sstOnlyStats + checkConflictsStatsDelta equals the actual stats
+	// contribution of the sst.
+	var checkConflictsStatsDelta enginepb.MVCCStats
 	if checkConflicts {
 		// If requested, check for MVCC conflicts with existing keys. This enforces
 		// all MVCC invariants by returning WriteTooOldError for any existing
@@ -207,10 +222,10 @@ func EvalAddSSTable(
 		// Additionally, if DisallowShadowingBelow is set, it will not write
 		// above existing/visible values (but will write above tombstones).
 		//
-		// If the overlap between the ingested SST and the engine is large (i.e.
-		// the SST is wide in keyspace), or if the ingested SST is very small,
-		// use prefix seeks in CheckSSTConflicts. This ends up being more performant
-		// as it avoids expensive seeks with index/data block loading in the common
+		// If the overlap between the ingested SST and the engine is large (i.e. the
+		// SST is wide in keyspace), or if the ingested SST is very small, use
+		// prefix seeks in CheckSSTConflicts. This ends up being more performant as
+		// it avoids expensive seeks with index/data block loading in the common
 		// case of no conflicts.
 		usePrefixSeek := false
 		bytes, err := cArgs.EvalCtx.GetApproximateDiskBytes(start.Key, end.Key)
@@ -229,9 +244,9 @@ func EvalAddSSTable(
 			args.Key, args.EndKey, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey())
 
 		log.VEventf(ctx, 2, "checking conflicts for SSTable [%s,%s)", start.Key, end.Key)
-		statsDelta, err = storage.CheckSSTConflicts(ctx, sst, readWriter, start, end, leftPeekBound, rightPeekBound,
+		checkConflictsStatsDelta, err = storage.CheckSSTConflicts(ctx, sst, readWriter, start, end, leftPeekBound, rightPeekBound,
 			args.DisallowShadowingBelow, sstTimestamp, maxLockConflicts, targetLockConflictBytes, usePrefixSeek)
-		statsDelta.Add(sstReqStatsDelta)
+		checkConflictsStatsDelta.Add(sstReqStatsDelta)
 		if err != nil {
 			return result.Result{}, errors.Wrap(err, "checking for key collisions")
 		}
@@ -250,112 +265,57 @@ func EvalAddSSTable(
 		}
 	}
 
-	// Verify that the keys in the sstable are within the range specified by the
-	// request header, and if the request did not include pre-computed stats,
-	// compute the expected MVCC stats delta of ingesting the SST.
-	sstIter, err := storage.NewMemSSTIterator(sst, true /* verify */, storage.IterOptions{
-		KeyTypes:   storage.IterKeyTypePointsAndRanges,
-		LowerBound: keys.MinKey,
-		UpperBound: keys.MaxKey,
-	})
-	if err != nil {
-		return result.Result{}, err
-	}
-	defer sstIter.Close()
-
-	// Check that the first key is in the expected range.
-	sstIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
-	if ok, err := sstIter.Valid(); err != nil {
-		return result.Result{}, err
-	} else if ok {
-		if unsafeKey := sstIter.UnsafeKey(); unsafeKey.Less(start) {
-			return result.Result{}, errors.Errorf("first key %s not in request range [%s,%s)",
-				unsafeKey.Key, start.Key, end.Key)
-		}
-	}
-
-	// Get the MVCCStats for the SST being ingested.
-	var stats enginepb.MVCCStats
-	if args.MVCCStats != nil {
-		stats = *args.MVCCStats
-	} else {
-		log.VEventf(ctx, 2, "computing MVCCStats for SSTable [%s,%s)", start.Key, end.Key)
-		stats, err = storage.ComputeStatsForIter(sstIter, h.Timestamp.WallTime)
-		if err != nil {
-			return result.Result{}, errors.Wrap(err, "computing SSTable MVCC stats")
-		}
-	}
-
-	sstIter.SeekGE(end)
-	if ok, err := sstIter.Valid(); err != nil {
-		return result.Result{}, err
-	} else if ok {
-		return result.Result{}, errors.Errorf("last key %s not in request range [%s,%s)",
-			sstIter.UnsafeKey(), start.Key, end.Key)
-	}
-
-	// The above MVCCStats represents what is in this new SST.
-	//
-	// *If* the keys in the SST do not conflict with keys currently in this range,
-	// then adding the stats for this SST to the range stats should yield the
-	// correct overall stats.
-	//
-	// *However*, if the keys in this range *do* overlap with keys already in this
-	// range, then adding the SST semantically *replaces*, rather than adds, those
-	// keys, and the net effect on the stats is not so simple.
-	//
-	// To perfectly compute the correct net stats, you could a) determine the
-	// stats for the span of the existing range that this SST covers and subtract
-	// it from the range's stats, then b) use a merging iterator that reads from
-	// the SST and then underlying range and compute the stats of that merged
-	// span, and then add those stats back in. That would result in correct stats
-	// that reflect the merging semantics when the SST "shadows" an existing key.
-	//
-	// If the underlying range is mostly empty, this isn't terribly expensive --
-	// computing the existing stats to subtract is cheap, as there is little or no
-	// existing data to traverse and b) is also pretty cheap -- the merging
-	// iterator can quickly iterate the in-memory SST.
+	// stats will represent the contribution of MVCC stats from the SST to the
+	// range. We assume these stats are not estimates if we:
 	//
-	// However, if the underlying range is _not_ empty, then this is not cheap:
-	// recomputing its stats involves traversing lots of data, and iterating the
-	// merged iterator has to constantly go back and forth to the iterator.
+	// 1. Call ComputeSSTStatsDiff below, which computes the exact stats diff of
+	// the sst, even in the presence of duplicate, shadowing, and shadowed keys in
+	// the ingesting keyspace. Note that this computation will throw an error if
+	// there is a range key in the _incoming sst_.
 	//
-	// If we assume that most SSTs don't shadow too many keys, then the error of
-	// simply adding the SST stats directly to the range stats is minimal. In the
-	// worst-case, when we retry a whole SST, then it could be overcounting the
-	// entire file, but we can hope that that is rare. In the worst case, it may
-	// cause splitting an under-filled range that would later merge when the
-	// over-count is fixed.
+	// TODO(msbutler): In the first iteration, mvcc stats estimates will be
+	// returned if the _underlying_ range contains range keys.
 	//
-	// We can indicate that these stats contain this estimation using the flag in
-	// the MVCC stats so that later re-computations will not be surprised to find
-	// any discrepancies.
+	// 2. Call CheckSSTConflicts above, which asserts the sst does not
+	// conflict with any underlying keys, as defined by DisallowConflicts and
+	// DisallowShadowingBelow. Note that the checkConflictsStatsDelta is a delta
+	// between the SST-only statistics and their effect when applied, which when
+	// added to the SST statistics, will adjust them for existing keys and values.
+	// Thus, we still need to compute the sst-only statistics on this branch. Also
+	// note that CheckSSTConflicts can still return estimates in certain corner
+	// cases.
 	//
-	// Callers can trigger such a re-computation to fixup any discrepancies (and
-	// remove the ContainsEstimates flag) after they are done ingesting files by
-	// sending an explicit recompute.
-	//
-	// There is a significant performance win to be achieved by ensuring that the
-	// stats computed are not estimates as it prevents recomputation on splits.
-	// Running AddSSTable with disallowShadowing=true gets us close to this as we
-	// do not allow colliding keys to be ingested. However, in the situation that
-	// two SSTs have KV(s) which "perfectly" shadow an existing key (equal ts and
-	// value), we do not consider this a collision. While the KV would just
-	// overwrite the existing data, the stats would be added to the cumulative
-	// stats of the AddSSTable command, causing a double count for such KVs.
-	// Therefore, we compute the stats for these "skipped" KVs on-the-fly while
-	// checking for the collision condition in C++ and subtract them from the
-	// stats of the SST being ingested before adding them to the running
-	// cumulative for this command. These stats can then be marked as accurate.
+	// While computing stats in this request requires a potentially expensive scan
+	// of the range, it ensures a healthy allocator that makes good decisions on
+	// when to split/merge or gc a range (etc). Stats estimates may not be harmful
+	// if we assume that most SSTs don't shadow too many keys, so the error of
+	// simply adding the SST stats directly to the range stats is minimal.
+	var stats enginepb.MVCCStats
+	log.VEventf(ctx, 2, "computing MVCCStats for SSTable [%s,%s)", start.Key, end.Key)
 	if checkConflicts {
-		stats.Add(statsDelta)
-		if statsDelta.ContainsEstimates == 0 {
-			stats.ContainsEstimates = 0
+		stats.Add(checkConflictsStatsDelta)
+	}
+	if args.ComputeStatsDiff {
+		statsDiff, err := computeSSTStatsDiffWithFallback(ctx, sst, readWriter, h.Timestamp.WallTime, start, end)
+		if err != nil {
+			return result.Result{}, errors.Wrap(err, "computing SST stats diff")
+		}
+		stats.Add(statsDiff)
+	} else if args.MVCCStats != nil {
+		stats.Add(*args.MVCCStats)
+		if !checkConflicts {
+			stats.ContainsEstimates++
 		}
 	} else {
-		stats.ContainsEstimates++
+		sstStats, err := computeSSTStats(ctx, sst, h.Timestamp.WallTime)
+		if err != nil {
+			return result.Result{}, errors.Wrap(err, "computing SST stats")
+		}
+		stats.Add(sstStats)
+		if !checkConflicts {
+			stats.ContainsEstimates++
+		}
 	}
-
 	ms.Add(stats)
 
 	var mvccHistoryMutation *kvserverpb.ReplicatedEvalResult_MVCCHistoryMutation
@@ -505,6 +465,89 @@ func EvalAddSSTable(
 	}, nil
 }
 
+// computeSSTStatsDiffWithFallback falls back to estimated sst-only stats if the engine has range keys.
+func computeSSTStatsDiffWithFallback(
+	ctx context.Context,
+	sst []byte,
+	readWriter storage.ReadWriter,
+	nowNanos int64,
+	start, end storage.MVCCKey,
+) (enginepb.MVCCStats, error) {
+	diff, err := storage.ComputeSSTStatsDiff(ctx, sst, readWriter, nowNanos, start, end)
+	switch {
+	case err == nil:
+		return diff, nil
+	case !errors.Is(err, storage.ComputeSSTStatsDiffReaderHasRangeKeys):
+		return enginepb.MVCCStats{}, err
+	}
+	log.VEventf(ctx, 2, "computing SST stats as estimates after detecting range keys in engine")
+	estimate, err := computeSSTStats(ctx, sst, nowNanos)
+	if err != nil {
+		return enginepb.MVCCStats{}, errors.Wrap(err, "computing SST stats after detecting range keys in engine")
+	}
+	estimate.ContainsEstimates = 1
+	return estimate, nil
+}
+
+// computeSSTStats computes the MVCC stats of the sst's contents; an sst with no keys yields zero stats.
+func computeSSTStats(ctx context.Context, sst []byte, nowNanos int64) (enginepb.MVCCStats, error) {
+	iter, err := storage.NewMemSSTIterator(sst, true /* verify */, storage.IterOptions{
+		KeyTypes:   storage.IterKeyTypePointsAndRanges,
+		LowerBound: keys.MinKey,
+		UpperBound: keys.MaxKey,
+	})
+	if err != nil {
+		return enginepb.MVCCStats{}, err
+	}
+	defer iter.Close()
+	iter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
+	if hasKeys, err := iter.Valid(); err != nil {
+		return enginepb.MVCCStats{}, err
+	} else if !hasKeys {
+		// TODO(msbutler): this implies we tolerate ingesting an empty addstable.
+		// Perhaps we should reject empty addstable requests?
+		return enginepb.MVCCStats{}, nil
+	}
+	sstStats, err := storage.ComputeStatsForIter(iter, nowNanos)
+	if err != nil {
+		return enginepb.MVCCStats{}, errors.Wrap(err, "computing SSTable MVCC stats")
+	}
+	return sstStats, nil
+}
+
+// checkSSTSpanBounds returns an error if the sstable's first key sorts before
+// start, or if the sstable contains any key at or after end: [start, end).
+func checkSSTSpanBounds(ctx context.Context, sst []byte, start, end storage.MVCCKey) error {
+	sstIter, err := storage.NewMemSSTIterator(sst, true /* verify */, storage.IterOptions{
+		KeyTypes:   storage.IterKeyTypePointsAndRanges,
+		LowerBound: keys.MinKey,
+		UpperBound: keys.MaxKey,
+	})
+	if err != nil {
+		return err
+	}
+	defer sstIter.Close()
+
+	// Check that the first key is in the expected span.
+	sstIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
+	if ok, err := sstIter.Valid(); err != nil {
+		return err
+	} else if ok {
+		if unsafeKey := sstIter.UnsafeKey(); unsafeKey.Less(start) {
+			return errors.Errorf("first key %s not in request range [%s,%s)",
+				unsafeKey.Key, start.Key, end.Key)
+		}
+	}
+	sstIter.SeekGE(end)
+	if ok, err := sstIter.Valid(); err != nil {
+		return err
+	} else if ok {
+		return errors.Errorf("last key %s not in request range [%s,%s)",
+			sstIter.UnsafeKey(), start.Key, end.Key)
+	}
+	return nil
+}
+
 // assertSSTContents checks that the SST contains expected inputs:
 //
 // * Only SST set operations (not explicitly verified).
diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable_test.go