Merge #148538 #148857

craig[bot] · mgartner · annrpom · craig[bot] · commit ce35c63b8c86 · 2025-06-26T15:28:36.000Z
148538: sql: add index join and lookup join assertions r=mgartner a=mgartner #### sql: add fetch row count assertions in vectorized index joiner Assertions have been added to the vectorized index joiner that ensure that an index join fetches the expected number of rows. If an index join has a locking wait policy of `SKIP LOCKED`, it should fetch no more than the number of input rows. Otherwise, an index join should fetch exactly one row for each input row. If these assertions fail, the query results may be incorrect, e.g., due to index corruption, and an internal error is preferred over a successful result. Informs #135696 Release note: None #### sql: add fetch row count assertions in row-by-row index joiner Assertions have been added to the join reader that ensure that an index join fetches the expected number of rows. Informs #135696 Release note: None #### sql: add fetch row count assertions in row-by-row lookup joiner Assertions have been added to the join reader that ensure that a lookup join on key columns fetches the expected number of rows. Fixes #135696 Release note: None 148857: storage: enable value separation by default r=annrpom a=annrpom Epic: none Release note (ops change): The `storage.value_separation.enabled` cluster setting is now true by default. This enables value separation for sstables, where values exceeding a certain size threshold are stored in separate blob files rather than inline in the sstable. This helps improve write performance (write-amp) by avoiding rewriting such values during compactions. Co-authored-by: Marcus Gartner <marcus@cockroachlabs.com> Co-authored-by: Annie Pompa <annie@cockroachlabs.com>
diff --git a/pkg/sql/colfetcher/index_join.go b/pkg/sql/colfetcher/index_join.go
@@ -17,6 +17,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvstreamer"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/settings"
+	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
 	"github.com/cockroachdb/cockroach/pkg/sql/colexec/colexecspan"
 	"github.com/cockroachdb/cockroach/pkg/sql/colexecerror"
@@ -63,6 +64,16 @@ type ColIndexJoin struct {
 	// and may not correspond to batch boundaries.
 	startIdx int
 
+	// scanRowCounts contains the expected and actual number of rows fetched for
+	// the current lookup scan. The expected row count is equal to the number of
+	// input rows that have been consumed to create the spans. These counts are
+	// used to make assertions that prevent returning incorrect results due to
+	// index corruption. See assertScanRowCounts() for more details.
+	scanRowCounts struct {
+		expected int64
+		actual   int64
+	}
+
 	// limitHintHelper is used in limiting batches of input rows in the presence
 	// of hard and soft limits.
 	limitHintHelper execinfra.LimitHintHelper
@@ -120,6 +131,10 @@ type ColIndexJoin struct {
 	// table because the scan might synthesize additional implicit system columns.
 	ResultTypes []*types.T
 
+	// lockingWaitPolicy is the wait policy for the cFetcher's underlying
+	// row.KVFetcher.
+	lockingWaitPolicy descpb.ScanLockingWaitPolicy
+
 	// maintainOrdering is true when the index join is required to maintain its
 	// input ordering, in which case the ordering of the spans cannot be changed.
 	maintainOrdering bool
@@ -204,8 +219,11 @@ func (s *ColIndexJoin) Next() coldata.Batch {
 				sort.Sort(spans)
 			}
 
-			// Index joins will always return exactly one output row per input row.
+			// For memory accounting, we assume the index join will return
+			// exactly one output row per input row. This is true most of the
+			// time, except when the locking wait policy is SKIP LOCKED.
 			s.cf.setEstimatedRowCount(uint64(rowCount))
+			s.scanRowCounts.expected = rowCount
 			// Note that the fetcher takes ownership of the spans slice - it
 			// will modify it and perform the memory accounting. We don't care
 			// about the modification here, but we want to be conscious about
@@ -239,8 +257,11 @@ func (s *ColIndexJoin) Next() coldata.Batch {
 				// still has the references to it.
 				s.spanAssembler.AccountForSpans()
 				s.state = indexJoinConstructingSpans
+				s.assertScanRowCounts()
+				s.scanRowCounts.actual = 0
 				continue
 			}
+			s.scanRowCounts.actual += int64(n)
 			s.mu.Lock()
 			s.mu.rowsRead += int64(n)
 			s.mu.Unlock()
@@ -312,6 +333,30 @@ func (s *ColIndexJoin) getRowSize(idx int) int64 {
 	return rowSize
 }
 
+// assertScanRowCounts guards against silently producing wrong results (for
+// example, because of a corrupt index) by comparing the number of rows the
+// current lookup scan fetched with the number of input rows it expected to
+// match. An index join normally fetches exactly one row per input row. With
+// a SKIP LOCKED wait policy the fetched count may be lower than the input
+// count, but it must never be higher.
+func (s *ColIndexJoin) assertScanRowCounts() {
+	expected := s.scanRowCounts.expected
+	actual := s.scanRowCounts.actual
+	skipLocked := s.lockingWaitPolicy == descpb.ScanLockingWaitPolicy_SKIP_LOCKED
+	switch {
+	case skipLocked && actual > expected:
+		colexecerror.InternalError(errors.AssertionFailedf(
+			"expected to fetch no more than %d rows, found %d",
+			expected, actual,
+		))
+	case !skipLocked && actual != expected:
+		colexecerror.InternalError(errors.AssertionFailedf(
+			"expected to fetch %d rows, found %d",
+			expected, actual,
+		))
+	}
+}
+
 // getBatchSize calculates the size of the entire current batch. Note that it
 // accounts only for the size of the data itself, and ignores extra overhead
 // such as selection vectors or byte offsets. getBatchSize is not exactly
@@ -616,16 +661,17 @@ func NewColIndexJoin(
 	)
 
 	op := &ColIndexJoin{
-		OneInputNode:     colexecop.NewOneInputNode(input),
-		flowCtx:          flowCtx,
-		processorID:      processorID,
-		cf:               fetcher,
-		spanAssembler:    spanAssembler,
-		ResultTypes:      tableArgs.typs,
-		maintainOrdering: spec.MaintainOrdering,
-		txn:              txn,
-		usesStreamer:     useStreamer,
-		limitHintHelper:  execinfra.MakeLimitHintHelper(spec.LimitHint, post),
+		OneInputNode:      colexecop.NewOneInputNode(input),
+		flowCtx:           flowCtx,
+		processorID:       processorID,
+		cf:                fetcher,
+		spanAssembler:     spanAssembler,
+		ResultTypes:       tableArgs.typs,
+		maintainOrdering:  spec.MaintainOrdering,
+		txn:               txn,
+		usesStreamer:      useStreamer,
+		limitHintHelper:   execinfra.MakeLimitHintHelper(spec.LimitHint, post),
+		lockingWaitPolicy: spec.LockingWaitPolicy,
 	}
 	op.mem.inputBatchSizeLimit = getIndexJoinBatchSize(
 		useStreamer, flowCtx.EvalCtx.TestingKnobs.ForceProductionValues, flowCtx.EvalCtx.SessionData(),
diff --git a/pkg/sql/rowexec/joinreader.go b/pkg/sql/rowexec/joinreader.go
@@ -209,6 +209,13 @@ type joinReader struct {
 	// curBatchInputRowCount is the number of input rows in the current batch.
 	curBatchInputRowCount int64
 
+	// lookupColumnsAreKey is true when the lookup columns form a key in the
+	// target table, in which case each lookup has at most one result.
+	lookupColumnsAreKey bool
+
+	// lockingWaitPolicy is the wait policy for the underlying rowFetcher.
+	lockingWaitPolicy descpb.ScanLockingWaitPolicy
+
 	// State variables for each batch of input rows.
 	scratchInputRows rowenc.EncDatumRows
 	// resetScratchWhenReadingInput tracks whether scratchInputRows needs to be
@@ -377,6 +384,8 @@ func newJoinReader(
 		outputGroupContinuationForLeftRow:   spec.OutputGroupContinuationForLeftRow,
 		parallelize:                         parallelize,
 		readerType:                          readerType,
+		lookupColumnsAreKey:                 spec.LookupColumnsAreKey,
+		lockingWaitPolicy:                   spec.LockingWaitPolicy,
 		txn:                                 txn,
 		usesStreamer:                        useStreamer,
 		limitHintHelper:                     execinfra.MakeLimitHintHelper(spec.LimitHint, post),
@@ -930,6 +939,12 @@ func (jr *joinReader) readInput() (
 		jr.resetScratchWhenReadingInput = false
 	}
 
+	// Assert that the correct number of rows were fetched in the last batch.
+	if err := jr.assertBatchRowCounts(); err != nil {
+		jr.MoveToDraining(err)
+		return jrStateUnknown, nil, jr.DrainHelper()
+	}
+
 	// Read the next batch of input rows.
 	for {
 		var encDatumRow rowenc.EncDatumRow
@@ -1103,6 +1118,33 @@ func (jr *joinReader) readInput() (
 	return jrFetchingLookupRows, outRow, nil
 }
 
+// assertBatchRowCounts compares the number of rows read for the current
+// batch with the number of input rows in that batch, and returns an
+// assertion failure when the two are inconsistent. This surfaces problems
+// such as a corrupt lookup index instead of silently returning bad results.
+func (jr *joinReader) assertBatchRowCounts() error {
+	fetched, want := jr.curBatchRowsRead, jr.curBatchInputRowCount
+	isIndexJoin := jr.readerType == indexJoinReaderType
+	skipLocked := jr.lockingWaitPolicy == descpb.ScanLockingWaitPolicy_SKIP_LOCKED
+	switch {
+	case isIndexJoin && !skipLocked && fetched != want:
+		// Without SKIP LOCKED, an index join must fetch exactly one row for
+		// each input row.
+		return errors.AssertionFailedf(
+			"expected to fetch %d rows, found %d",
+			want, fetched,
+		)
+	case ((isIndexJoin && skipLocked) || jr.lookupColumnsAreKey) && fetched > want:
+		// An index join with SKIP LOCKED, or a lookup join whose lookup
+		// columns form a key, may fetch at most one row for each input row.
+		return errors.AssertionFailedf(
+			"expected to fetch no more than %d rows, found %d",
+			want, fetched,
+		)
+	}
+	return nil
+}
+
 var noHomeRegionError = pgerror.Newf(pgcode.QueryHasNoHomeRegion,
 	"Query has no home region. Try using a lower LIMIT value or running the query from a different region. %s",
 	sqlerrors.EnforceHomeRegionFurtherInfo)
diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go
@@ -423,10 +423,10 @@ var (
 	valueSeparationEnabled = settings.RegisterBoolSetting(
 		settings.SystemVisible,
 		"storage.value_separation.enabled",
-		"(experimental) whether or not values may be separated into blob files; "+
+		"whether or not values may be separated into blob files; "+
 			"requires columnar blocks to be enabled",
 		metamorphic.ConstantWithTestBool(
-			"storage.value_separation.enabled", false), /* defaultValue */
+			"storage.value_separation.enabled", true /* defaultValue */),
 	)
 	valueSeparationMinimumSize = settings.RegisterIntSetting(
 		settings.SystemVisible,
@@ -448,7 +448,7 @@ var (
 		settings.SystemVisible,
 		"storage.value_separation.rewrite_minimum_age",
 		"the minimum age of a blob file before it is eligible for a rewrite compaction",
-		5*time.Minute, // 5 minutes
+		5*time.Minute,
 		settings.DurationWithMinimum(0),
 	)
 	valueSeparationCompactionGarbageThreshold = settings.RegisterIntSetting(