sql: add fetch row count assertions in vectorized index joiner

mgartner · mgartner · commit c117e9324f76 · 2025-06-25T14:20:43.000-04:00
Assertions have been added to the vectorized index joiner that ensure that an index join fetches the expected number of rows. If an index join has a locking wait policy of `SKIP LOCKED`, it should fetch no more than the number of input rows. Otherwise, an index join should fetch exactly one row for each input row. If these assertions fail, the query results may be incorrect, e.g., due to index corruption, and an internal error is preferred over a successful result. Informs #135696 Release note: None
diff --git a/pkg/sql/colfetcher/index_join.go b/pkg/sql/colfetcher/index_join.go
@@ -17,6 +17,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvstreamer"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/settings"
+	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
 	"github.com/cockroachdb/cockroach/pkg/sql/colexec/colexecspan"
 	"github.com/cockroachdb/cockroach/pkg/sql/colexecerror"
@@ -63,6 +64,16 @@ type ColIndexJoin struct {
 	// and may not correspond to batch boundaries.
 	startIdx int
 
+	// scanRowCounts contains the expected and actual number of rows fetched for
+	// the current lookup scan. The expected row count is equal to the number of
+	// input rows that have been consumed to create the spans. These counts are
+	// used to make assertions that prevent returning incorrect results due to
+	// index corruption. See assertScanRowCounts() for more details.
+	scanRowCounts struct {
+		expected int64
+		actual   int64
+	}
+
 	// limitHintHelper is used in limiting batches of input rows in the presence
 	// of hard and soft limits.
 	limitHintHelper execinfra.LimitHintHelper
@@ -120,6 +131,10 @@ type ColIndexJoin struct {
 	// table because the scan might synthesize additional implicit system columns.
 	ResultTypes []*types.T
 
+	// lockingWaitPolicy is the wait policy for the cFetcher's underlying
+	// row.KVFetcher.
+	lockingWaitPolicy descpb.ScanLockingWaitPolicy
+
 	// maintainOrdering is true when the index join is required to maintain its
 	// input ordering, in which case the ordering of the spans cannot be changed.
 	maintainOrdering bool
@@ -204,8 +219,11 @@ func (s *ColIndexJoin) Next() coldata.Batch {
 				sort.Sort(spans)
 			}
 
-			// Index joins will always return exactly one output row per input row.
+			// For memory accounting, we assume the index join will return
+			// exactly one output row per input row. This is true most of the
+			// time, except when the locking wait policy is SKIP LOCKED.
 			s.cf.setEstimatedRowCount(uint64(rowCount))
+			s.scanRowCounts.expected = rowCount
 			// Note that the fetcher takes ownership of the spans slice - it
 			// will modify it and perform the memory accounting. We don't care
 			// about the modification here, but we want to be conscious about
@@ -239,8 +257,11 @@ func (s *ColIndexJoin) Next() coldata.Batch {
 				// still has the references to it.
 				s.spanAssembler.AccountForSpans()
 				s.state = indexJoinConstructingSpans
+				s.assertScanRowCounts()
+				s.scanRowCounts.actual = 0
 				continue
 			}
+			s.scanRowCounts.actual += int64(n)
 			s.mu.Lock()
 			s.mu.rowsRead += int64(n)
 			s.mu.Unlock()
@@ -312,6 +333,30 @@ func (s *ColIndexJoin) getRowSize(idx int) int64 {
 	return rowSize
 }
 
+// assertScanRowCounts performs assertions to prevent silently returning
+// incorrect results, e.g., if an index is corrupt. In the common case, the
+// number of fetched rows in an index join should be equal to the number of
+// input rows. The only exception is when the locking wait policy is
+// SKIP LOCKED, in which case the number of fetched rows may be less than
+// the number of input rows, but never greater.
+func (s *ColIndexJoin) assertScanRowCounts() {
+	if s.lockingWaitPolicy == descpb.ScanLockingWaitPolicy_SKIP_LOCKED {
+		if s.scanRowCounts.actual > s.scanRowCounts.expected {
+			colexecerror.InternalError(errors.AssertionFailedf(
+				"expected to fetch no more than %d rows, found %d",
+				s.scanRowCounts.expected, s.scanRowCounts.actual,
+			))
+		}
+	} else {
+		if s.scanRowCounts.actual != s.scanRowCounts.expected {
+			colexecerror.InternalError(errors.AssertionFailedf(
+				"expected to fetch %d rows, found %d",
+				s.scanRowCounts.expected, s.scanRowCounts.actual,
+			))
+		}
+	}
+}
+
 // getBatchSize calculates the size of the entire current batch. Note that it
 // accounts only for the size of the data itself, and ignores extra overhead
 // such as selection vectors or byte offsets. getBatchSize is not exactly
@@ -616,16 +661,17 @@ func NewColIndexJoin(
 	)
 
 	op := &ColIndexJoin{
-		OneInputNode:     colexecop.NewOneInputNode(input),
-		flowCtx:          flowCtx,
-		processorID:      processorID,
-		cf:               fetcher,
-		spanAssembler:    spanAssembler,
-		ResultTypes:      tableArgs.typs,
-		maintainOrdering: spec.MaintainOrdering,
-		txn:              txn,
-		usesStreamer:     useStreamer,
-		limitHintHelper:  execinfra.MakeLimitHintHelper(spec.LimitHint, post),
+		OneInputNode:      colexecop.NewOneInputNode(input),
+		flowCtx:           flowCtx,
+		processorID:       processorID,
+		cf:                fetcher,
+		spanAssembler:     spanAssembler,
+		ResultTypes:       tableArgs.typs,
+		maintainOrdering:  spec.MaintainOrdering,
+		txn:               txn,
+		usesStreamer:      useStreamer,
+		limitHintHelper:   execinfra.MakeLimitHintHelper(spec.LimitHint, post),
+		lockingWaitPolicy: spec.LockingWaitPolicy,
 	}
 	op.mem.inputBatchSizeLimit = getIndexJoinBatchSize(
 		useStreamer, flowCtx.EvalCtx.TestingKnobs.ForceProductionValues, flowCtx.EvalCtx.SessionData(),