opt: make outside-of-histogram estimates more pessimistic

DrewKimball · DrewKimball · commit 84f790ea3d6d · 2025-10-22T22:41:47.000-05:00
This commit makes rowcount estimation fall back on distinct count estimates when a constraint includes zero histogram values. This biases the optimizer toward less risky plans when less information is known about the filtered values. The pessimistic logic is triggered when the estimate derived from a histogram is smaller than `table_row_count / 10,000`. This threshold is chosen because we choose samples such that we expect to sample *nearly* every value with multiplicity down to `table_row_count / 10,000` (see computeNumberSamples). Selectivity estimates from a histogram below this resolution are suspect, since there is increasing likelihood that a value was missed either due to being omitted from the sample, or due to staleness. Informs #130201 Release note (sql change): Added a clamp for row-count estimates over very large tables so that the optimizer assumes that at least one distinct value will be scanned. This reduces the chances of a catastrophic underestimate. The new logic is off by default, gated by a session setting `optimizer_clamp_low_histogram_selectivity`.
diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go
@@ -4438,6 +4438,10 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedHoistJoinProject(val bool) {
 	m.data.OptimizerUseImprovedHoistJoinProject = val
 }
 
+func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool) {
+	m.data.OptimizerClampLowHistogramSelectivity = val
+}
+
 // Utility functions related to scrubbing sensitive information on SQL Stats.
 
 // quantizeCounts ensures that the Count field in the
diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema
@@ -4189,6 +4189,7 @@ opt_split_scan_limit                                             2048
 optimizer                                                        on
 optimizer_always_use_histograms                                  on
 optimizer_check_input_min_row_count                              1
+optimizer_clamp_low_histogram_selectivity                        off
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on
 optimizer_enable_lock_elision                                    on
 optimizer_hoist_uncorrelated_equality_subqueries                 on
diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog
@@ -3138,6 +3138,7 @@ on_update_rehome_row_enabled                                     on
 opt_split_scan_limit                                             2048                NULL      NULL        NULL        string
 optimizer_always_use_histograms                                  on                  NULL      NULL        NULL        string
 optimizer_check_input_min_row_count                              1                   NULL      NULL        NULL        string
+optimizer_clamp_low_histogram_selectivity                        off                 NULL      NULL        NULL        string
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on                  NULL      NULL        NULL        string
 optimizer_enable_lock_elision                                    on                  NULL      NULL        NULL        string
 optimizer_hoist_uncorrelated_equality_subqueries                 on                  NULL      NULL        NULL        string
@@ -3383,6 +3384,7 @@ on_update_rehome_row_enabled                                     on
 opt_split_scan_limit                                             2048                NULL  user     NULL      2048                2048
 optimizer_always_use_histograms                                  on                  NULL  user     NULL      on                  on
 optimizer_check_input_min_row_count                              1                   NULL  user     NULL      1                   1
+optimizer_clamp_low_histogram_selectivity                        off                 NULL  user     NULL      off                 off
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on                  NULL  user     NULL      on                  on
 optimizer_enable_lock_elision                                    on                  NULL  user     NULL      on                  on
 optimizer_hoist_uncorrelated_equality_subqueries                 on                  NULL  user     NULL      on                  on
@@ -3619,6 +3621,7 @@ opt_split_scan_limit                                             NULL    NULL
 optimizer                                                        NULL    NULL     NULL     NULL        NULL
 optimizer_always_use_histograms                                  NULL    NULL     NULL     NULL        NULL
 optimizer_check_input_min_row_count                              NULL    NULL     NULL     NULL        NULL
+optimizer_clamp_low_histogram_selectivity                        NULL    NULL     NULL     NULL        NULL
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  NULL    NULL     NULL     NULL        NULL
 optimizer_enable_lock_elision                                    NULL    NULL     NULL     NULL        NULL
 optimizer_hoist_uncorrelated_equality_subqueries                 NULL    NULL     NULL     NULL        NULL
diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source
@@ -155,6 +155,7 @@ on_update_rehome_row_enabled                                     on
 opt_split_scan_limit                                             2048
 optimizer_always_use_histograms                                  on
 optimizer_check_input_min_row_count                              1
+optimizer_clamp_low_histogram_selectivity                        off
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on
 optimizer_enable_lock_elision                                    on
 optimizer_hoist_uncorrelated_equality_subqueries                 on
diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go
@@ -212,6 +212,7 @@ type Memo struct {
 	disableSlowCascadeFastPathForRBRTables     bool
 	useImprovedHoistJoinProject                bool
 	rowSecurity                                bool
+	clampLowHistogramSelectivity               bool
 
 	// txnIsoLevel is the isolation level under which the plan was created. This
 	// affects the planning of some locking operations, so it must be included in
@@ -320,6 +321,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
 		disableSlowCascadeFastPathForRBRTables:     evalCtx.SessionData().OptimizerDisableCrossRegionCascadeFastPathForRBRTables,
 		useImprovedHoistJoinProject:                evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject,
 		rowSecurity:                                evalCtx.SessionData().RowSecurity,
+		clampLowHistogramSelectivity:               evalCtx.SessionData().OptimizerClampLowHistogramSelectivity,
 		txnIsoLevel:                                evalCtx.TxnIsoLevel,
 	}
 	m.metadata.Init()
@@ -496,6 +498,7 @@ func (m *Memo) IsStale(
 		m.disableSlowCascadeFastPathForRBRTables != evalCtx.SessionData().OptimizerDisableCrossRegionCascadeFastPathForRBRTables ||
 		m.useImprovedHoistJoinProject != evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject ||
 		m.rowSecurity != evalCtx.SessionData().RowSecurity ||
+		m.clampLowHistogramSelectivity != evalCtx.SessionData().OptimizerClampLowHistogramSelectivity ||
 		m.txnIsoLevel != evalCtx.TxnIsoLevel {
 		return true, nil
 	}
diff --git a/pkg/sql/opt/memo/memo_test.go b/pkg/sql/opt/memo/memo_test.go
@@ -595,6 +595,11 @@ func TestMemoIsStale(t *testing.T) {
 	evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject = false
 	notStale()
 
+	evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = true
+	stale()
+	evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = false
+	notStale()
+
 	// User no longer has access to view.
 	catalog.View(tree.NewTableNameWithSchema("t", catconstants.PublicSchemaName, "abcview")).Revoked = true
 	_, err = o.Memo().IsStale(ctx, &evalCtx, catalog)
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -90,6 +90,16 @@ const (
 	// values from the spans a check constraint is allowed to have in order to build
 	// a histogram from it.
 	maxValuesForFullHistogramFromCheckConstraint = tabledesc.MaxBucketAllowed
+
+	// histogramPessimisticThreshold determines the cutoff point below which the
+	// selectivity estimate of a histogram is overridden with a more pessimistic
+	// estimate. This is to avoid over-fitting to a stale or inaccurate histogram.
+	//
+	// The value (1 in 10,000) was chosen because we choose sample sizes according
+	// to table size such that we expect to *nearly* always sample all values with
+	// multiplicity >= row_count/10000. Cardinality estimates below this threshold
+	// are increasingly likely to be inaccurate. See also computeNumberSamples.
+	histogramPessimisticThreshold = 1.0 / 10000.0
 )
 
 // statisticsBuilder is responsible for building the statistics that are
@@ -761,7 +771,10 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
 					// is tracked here: https://github.com/cockroachdb/cockroach/issues/50655
 					col := cols.SingleColumn()
 					colStat.Histogram = &props.Histogram{}
-					colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
+					// Track the minimum number of rows for which histogram selectivity
+					// estimates are trusted.
+					resolution := histogramPessimisticThreshold * stats.RowCount
+					colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram(), resolution)
 				}
 
 				// Make sure the distinct count is at least 1, for the same reason as
@@ -786,7 +799,18 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
 					invCols := opt.MakeColSet(invCol)
 					if invColStat, ok := stats.ColStats.Add(invCols); ok {
 						invColStat.Histogram = &props.Histogram{}
-						invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram())
+						// Track the minimum number of rows for which histogram selectivity
+						// estimates are trusted.
+						//
+						// NOTE: an inverted index can have multiple entries per table row.
+						// However, we still use the number of table rows here because the
+						// max multiplicity of a missed value is proportional to the number
+						// of table rows, not the number of inverted index entries. For
+						// example, the arrays [10, 20, 30] and [20, 40, 60] result in six
+						// inverted index entries, but only a maximum multiplicity of two
+						// for the value "20".
+						resolution := histogramPessimisticThreshold * stats.RowCount
+						invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram(), resolution)
 						// Set inverted entry counts from the histogram. Make sure the
 						// distinct count is at least 1, for the same reason as the row
 						// count above.
@@ -4558,10 +4582,15 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
 		newCount := newHist.ValuesCount()
 		oldCount := oldHist.ValuesCount()
 
-		// Calculate the selectivity of the predicate. Nulls are already included
-		// in the histogram, so we do not need to account for them separately.
+		// Calculate the selectivity of the predicate using the histogram. Nulls
+		// are already included in the histogram, so we do not need to account for
+		// them separately.
 		predicateSelectivity := props.MakeSelectivityFromFraction(newCount, oldCount)
 
+		// Possibly clamp the selectivity to a higher value to avoid overly
+		// optimistic estimates.
+		predicateSelectivity = sb.clampSelForHistogram(inputColStat, colStat, s, predicateSelectivity)
+
 		// The maximum possible selectivity of the entire expression is the minimum
 		// selectivity of all individual predicates.
 		selectivityUpperBound = props.MinSelectivity(selectivityUpperBound, predicateSelectivity)
@@ -4572,6 +4601,34 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
 	return selectivity, selectivityUpperBound
 }
 
+// clampSelForHistogram clamps the selectivity estimate derived from a histogram
+// to a minimum value. This accounts for the possibility that the histogram is
+// missing values due to sampling or staleness. See also
+// histogramPessimisticThreshold.
+func (sb *statisticsBuilder) clampSelForHistogram(
+	oldColStat, newColStat *props.ColumnStatistic, s *props.Statistics, originalSel props.Selectivity,
+) (clampedSel props.Selectivity) {
+	clampedSel = originalSel
+	oldHist, newHist := oldColStat.Histogram, newColStat.Histogram
+	if sb.evalCtx.SessionData().OptimizerClampLowHistogramSelectivity &&
+		newHist.ValuesCount() < oldHist.Resolution() {
+		// NOTE: columns with histograms are skipped when considering distinct
+		// counts in selectivityFromSingleColDistinctCounts, so this doesn't
+		// double count the effect of the predicate.
+		resClamp := props.MakeSelectivityFromFraction(newColStat.DistinctCount, oldColStat.DistinctCount)
+
+		// Cap the selectivity so that the row count estimate is no more than the
+		// pessimistic threshold. This can result in a lower estimate if the
+		// multiplicities of the filtered values really are low compared to the
+		// average multiplicity.
+		resClamp = props.MinSelectivity(resClamp,
+			props.MakeSelectivityFromFraction(oldHist.Resolution(), s.RowCount),
+		)
+		clampedSel = props.MaxSelectivity(clampedSel, resClamp)
+	}
+	return clampedSel
+}
+
 // selectivityFromMaxFrequencies calculates the selectivity of an equality
 // filters by using the maximum frequency of the histograms of the constrained
 // columns. This represents a worst-case selectivity estimate and is used to
@@ -5332,7 +5389,10 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
 			colStat.NullCount = nullCount
 			if useHistogram {
 				colStat.Histogram = &props.Histogram{}
-				colStat.Histogram.Init(sb.evalCtx, firstColID, histogram)
+				// Track the minimum number of rows for which histogram selectivity
+				// estimates are trusted.
+				resolution := histogramPessimisticThreshold * statistics.RowCount
+				colStat.Histogram.Init(sb.evalCtx, firstColID, histogram, resolution)
 			}
 			sb.finalizeFromRowCountAndDistinctCounts(colStat, statistics)
 			tabMeta.AddCheckConstraintsStats(firstColID, colStat)
diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go
@@ -38,6 +38,13 @@ type Histogram struct {
 	selectivity float64
 	buckets     []cat.HistogramBucket
 	col         opt.ColumnID
+	// resolution is the number of rows below which selectivity estimates based on
+	// this histogram should fall back to a more pessimistic distinct-count based
+	// estimate. This is used to avoid overfitting to histograms that may be
+	// missing values due to sampling or staleness. This number roughly
+	// corresponds to the highest expected multiplicity of any value missing from
+	// the histogram.
+	resolution float64
 }
 
 func (h *Histogram) String() string {
@@ -49,14 +56,17 @@ func (h *Histogram) String() string {
 }
 
 // Init initializes the histogram with data from the catalog.
-func (h *Histogram) Init(evalCtx *eval.Context, col opt.ColumnID, buckets []cat.HistogramBucket) {
+func (h *Histogram) Init(
+	evalCtx *eval.Context, col opt.ColumnID, buckets []cat.HistogramBucket, resolution float64,
+) {
 	// This initialization pattern ensures that fields are not unwittingly
 	// reused. Field reuse must be explicit.
 	*h = Histogram{
 		evalCtx:     evalCtx,
 		col:         col,
 		selectivity: 1,
 		buckets:     buckets,
+		resolution:  resolution,
 	}
 }
 
@@ -134,6 +144,13 @@ func (h *Histogram) ValuesCount() float64 {
 	return count
 }
 
+// Resolution returns the minimum row count for which selectivity estimates
+// based on this histogram should be trusted. See the resolution field comment
+// for details.
+func (h *Histogram) Resolution() float64 {
+	return h.resolution
+}
+
 // EqEstimate returns the estimated number of rows that equal the given
 // datum. If the datum is equal to a bucket's upperbound, it returns the
 // bucket's NumEq. If the datum falls in the range of a bucket's upper and lower
@@ -329,6 +346,7 @@ func (h *Histogram) filter(
 		evalCtx:     h.evalCtx,
 		col:         h.col,
 		selectivity: h.selectivity,
+		resolution:  h.resolution,
 	}
 	if bucketCount == 0 {
 		return filtered
diff --git a/pkg/sql/opt/props/histogram_test.go b/pkg/sql/opt/props/histogram_test.go
@@ -29,7 +29,7 @@ func TestEqEstimate(t *testing.T) {
 	evalCtx := eval.MakeTestingEvalContext(cluster.MakeTestingClusterSettings())
 
 	emptyHist := &Histogram{}
-	emptyHist.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{})
+	emptyHist.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{}, 0 /* resolution */)
 
 	if eq := emptyHist.EqEstimate(ctx, tree.NewDInt(0)); eq != 0 {
 		t.Errorf("expected %f but found %f", 0.0, eq)
@@ -45,7 +45,7 @@ func TestEqEstimate(t *testing.T) {
 		{NumRange: 40, DistinctRange: 7, NumEq: 35, UpperBound: tree.NewDInt(42)},
 	}
 	h := &Histogram{}
-	h.Init(&evalCtx, opt.ColumnID(1), histData)
+	h.Init(&evalCtx, opt.ColumnID(1), histData, 0 /* resolution */)
 
 	testData := []struct {
 		datum    tree.Datum
@@ -139,7 +139,7 @@ func TestCanFilter(t *testing.T) {
 	}
 
 	h := Histogram{}
-	h.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{})
+	h.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{}, 0 /* resolution */)
 	for _, tc := range testData {
 		c := constraint.ParseConstraint(&evalCtx, tc.constraint)
 		colIdx, _, ok := h.CanFilter(ctx, &c)
@@ -170,7 +170,7 @@ func TestHistogram(t *testing.T) {
 		{NumRange: 40, DistinctRange: 7, NumEq: 35, UpperBound: tree.NewDInt(42)},
 	}
 	h := &Histogram{}
-	h.Init(&evalCtx, opt.ColumnID(1), histData)
+	h.Init(&evalCtx, opt.ColumnID(1), histData, 0 /* resolution */)
 	count, expected := h.ValuesCount(), float64(91)
 	if count != expected {
 		t.Fatalf("expected %f but found %f", expected, count)
@@ -1212,7 +1212,7 @@ func BenchmarkHistogram(b *testing.B) {
 			for _, bucketCount := range bucketCounts {
 				b.Run(fmt.Sprintf("buckets=%v", bucketCount), func(b *testing.B) {
 					h := Histogram{}
-					h.Init(&evalCtx, opt.ColumnID(1), makeBuckets(typ, bucketCount))
+					h.Init(&evalCtx, opt.ColumnID(1), makeBuckets(typ, bucketCount), 0 /* resolution */)
 					c := makeConstraint(typ, bucketCount)
 					b.Run("DistinctValuesCount", func(b *testing.B) {
 						for i := 0; i < b.N; i++ {
diff --git a/pkg/sql/opt/props/statistics.go b/pkg/sql/opt/props/statistics.go
@@ -283,7 +283,9 @@ func (c *ColumnStatistic) CopyFromOther(other *ColumnStatistic, evalCtx *eval.Co
 	c.NullCount = other.NullCount
 	if other.Histogram != nil && c.Cols.Len() == 1 {
 		c.Histogram = &Histogram{}
-		c.Histogram.Init(evalCtx, c.Cols.SingleColumn(), other.Histogram.buckets)
+		c.Histogram.Init(
+			evalCtx, c.Cols.SingleColumn(), other.Histogram.buckets, other.Histogram.resolution,
+		)
 	}
 }
 
diff --git a/pkg/sql/opt/xform/testdata/coster/outside-histogram b/pkg/sql/opt/xform/testdata/coster/outside-histogram
diff --git a/pkg/sql/sessiondatapb/local_only_session_data.proto b/pkg/sql/sessiondatapb/local_only_session_data.proto
diff --git a/pkg/sql/vars.go b/pkg/sql/vars.go

Original file line number	Diff line number	Diff line change
`@@ -4438,6 +4438,10 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedHoistJoinProject(val bool) {`
`4438`	`4438`	`m.data.OptimizerUseImprovedHoistJoinProject = val`
`4439`	`4439`	`}`
`4440`	`4440`
	`4441`	`+func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool) {`
	`4442`	`+ m.data.OptimizerClampLowHistogramSelectivity = val`
	`4443`	`+}`
	`4444`	`+`
`4441`	`4445`	`// Utility functions related to scrubbing sensitive information on SQL Stats.`
`4442`	`4446`
`4443`	`4447`	`// quantizeCounts ensures that the Count field in the`
Original file line number	Diff line number	Diff line change
`@@ -283,7 +283,9 @@ func (c *ColumnStatistic) CopyFromOther(other *ColumnStatistic, evalCtx *eval.Co`
`283`	`283`	`c.NullCount = other.NullCount`
`284`	`284`	`if other.Histogram != nil && c.Cols.Len() == 1 {`
`285`	`285`	`c.Histogram = &Histogram{}`
`286`		`- c.Histogram.Init(evalCtx, c.Cols.SingleColumn(), other.Histogram.buckets)`
	`286`	`+ c.Histogram.Init(`
	`287`	`+ evalCtx, c.Cols.SingleColumn(), other.Histogram.buckets, other.Histogram.resolution,`
	`288`	`+ )`
`287`	`289`	`}`
`288`	`290`	`}`
`289`	`291`