opt: use pessimistic estimates for inequality filters

DrewKimball · DrewKimball · commit 0bdda9271cd4 · 2025-10-22T22:44:00.000-05:00
This commit changes row count estimates for inequality filters, so that we expect at least `rowCount / (bucketCount * 100)` rows to "survive" the filter. This is in line with Postgres, which clamps inequality rowcount estimates in a similar fashion. Informs #130201 Release note (sql change): Added a clamp for the estimated selectivity of inequality predicates that are unbounded on one or both sides (ex: `x > 5`). This reduces the chances of a catastrophic underestimate causing the optimizer to choose a poorly-constrained scan. The new logic is off by default, gated by the session setting `optimizer_clamp_inequality_selectivity`.
diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go
@@ -4442,6 +4442,10 @@ func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool)
 	m.data.OptimizerClampLowHistogramSelectivity = val
 }
 
+func (m *sessionDataMutator) SetOptimizerClampInequalitySelectivity(val bool) {
+	m.data.OptimizerClampInequalitySelectivity = val
+}
+
 // Utility functions related to scrubbing sensitive information on SQL Stats.
 
 // quantizeCounts ensures that the Count field in the
diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema
@@ -4189,6 +4189,7 @@ opt_split_scan_limit                                             2048
 optimizer                                                        on
 optimizer_always_use_histograms                                  on
 optimizer_check_input_min_row_count                              1
+optimizer_clamp_inequality_selectivity                           off
 optimizer_clamp_low_histogram_selectivity                        off
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on
 optimizer_enable_lock_elision                                    on
diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog
@@ -3138,6 +3138,7 @@ on_update_rehome_row_enabled                                     on
 opt_split_scan_limit                                             2048                NULL      NULL        NULL        string
 optimizer_always_use_histograms                                  on                  NULL      NULL        NULL        string
 optimizer_check_input_min_row_count                              1                   NULL      NULL        NULL        string
+optimizer_clamp_inequality_selectivity                           off                 NULL      NULL        NULL        string
 optimizer_clamp_low_histogram_selectivity                        off                 NULL      NULL        NULL        string
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on                  NULL      NULL        NULL        string
 optimizer_enable_lock_elision                                    on                  NULL      NULL        NULL        string
@@ -3384,6 +3385,7 @@ on_update_rehome_row_enabled                                     on
 opt_split_scan_limit                                             2048                NULL  user     NULL      2048                2048
 optimizer_always_use_histograms                                  on                  NULL  user     NULL      on                  on
 optimizer_check_input_min_row_count                              1                   NULL  user     NULL      1                   1
+optimizer_clamp_inequality_selectivity                           off                 NULL  user     NULL      off                 off
 optimizer_clamp_low_histogram_selectivity                        off                 NULL  user     NULL      off                 off
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on                  NULL  user     NULL      on                  on
 optimizer_enable_lock_elision                                    on                  NULL  user     NULL      on                  on
@@ -3621,6 +3623,7 @@ opt_split_scan_limit                                             NULL    NULL
 optimizer                                                        NULL    NULL     NULL     NULL        NULL
 optimizer_always_use_histograms                                  NULL    NULL     NULL     NULL        NULL
 optimizer_check_input_min_row_count                              NULL    NULL     NULL     NULL        NULL
+optimizer_clamp_inequality_selectivity                           NULL    NULL     NULL     NULL        NULL
 optimizer_clamp_low_histogram_selectivity                        NULL    NULL     NULL     NULL        NULL
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  NULL    NULL     NULL     NULL        NULL
 optimizer_enable_lock_elision                                    NULL    NULL     NULL     NULL        NULL
diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source
@@ -155,6 +155,7 @@ on_update_rehome_row_enabled                                     on
 opt_split_scan_limit                                             2048
 optimizer_always_use_histograms                                  on
 optimizer_check_input_min_row_count                              1
+optimizer_clamp_inequality_selectivity                           off
 optimizer_clamp_low_histogram_selectivity                        off
 optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables  on
 optimizer_enable_lock_elision                                    on
diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go
@@ -213,6 +213,7 @@ type Memo struct {
 	useImprovedHoistJoinProject                bool
 	rowSecurity                                bool
 	clampLowHistogramSelectivity               bool
+	clampInequalitySelectivity                 bool
 
 	// txnIsoLevel is the isolation level under which the plan was created. This
 	// affects the planning of some locking operations, so it must be included in
@@ -322,6 +323,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
 		useImprovedHoistJoinProject:                evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject,
 		rowSecurity:                                evalCtx.SessionData().RowSecurity,
 		clampLowHistogramSelectivity:               evalCtx.SessionData().OptimizerClampLowHistogramSelectivity,
+		clampInequalitySelectivity:                 evalCtx.SessionData().OptimizerClampInequalitySelectivity,
 		txnIsoLevel:                                evalCtx.TxnIsoLevel,
 	}
 	m.metadata.Init()
@@ -499,6 +501,7 @@ func (m *Memo) IsStale(
 		m.useImprovedHoistJoinProject != evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject ||
 		m.rowSecurity != evalCtx.SessionData().RowSecurity ||
 		m.clampLowHistogramSelectivity != evalCtx.SessionData().OptimizerClampLowHistogramSelectivity ||
+		m.clampInequalitySelectivity != evalCtx.SessionData().OptimizerClampInequalitySelectivity ||
 		m.txnIsoLevel != evalCtx.TxnIsoLevel {
 		return true, nil
 	}
diff --git a/pkg/sql/opt/memo/memo_test.go b/pkg/sql/opt/memo/memo_test.go
@@ -600,6 +600,11 @@ func TestMemoIsStale(t *testing.T) {
 	evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = false
 	notStale()
 
+	evalCtx.SessionData().OptimizerClampInequalitySelectivity = true
+	stale()
+	evalCtx.SessionData().OptimizerClampInequalitySelectivity = false
+	notStale()
+
 	// User no longer has access to view.
 	catalog.View(tree.NewTableNameWithSchema("t", catconstants.PublicSchemaName, "abcview")).Revoked = true
 	_, err = o.Memo().IsStale(ctx, &evalCtx, catalog)
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -100,6 +100,19 @@ const (
 	// multiplicity >= row_count/10000. Cardinality estimates below this threshold
 	// are increasingly likely to be inaccurate. See also computeNumberSamples.
 	histogramPessimisticThreshold = 1.0 / 10000.0
+
+	// histogramUnboundedInequalityMinSelectivity determines the minimum selectivity
+	// estimate that can be derived from an unbounded (above or below) inequality
+	// used to filter a histogram. Similar to histogramPessimisticThreshold, this
+	// is to avoid over-fitting to a stale or inaccurate histogram.
+	//
+	// The value (1 in 10,000) was chosen based on similar logic in Postgres,
+	// which clamps the selectivity to at least 1 / (bucket_count * 100). Postgres
+	// uses 100 histogram buckets by default, so the number comes out to 1 in
+	// 10,000. We avoid using the number of histogram buckets directly to avoid
+	// arbitrary variation in the selectivity clamp depending on user settings
+	// and partial stat collections.
+	histogramUnboundedInequalityMinSelectivity = 1.0 / 10000.0
 )
 
 // statisticsBuilder is responsible for building the statistics that are
@@ -4626,6 +4639,16 @@ func (sb *statisticsBuilder) clampSelForHistogram(
 		)
 		clampedSel = props.MaxSelectivity(clampedSel, resClamp)
 	}
+
+	tightUpperBound, tightLowerBound := newHist.TightBounds()
+	if sb.evalCtx.SessionData().OptimizerClampInequalitySelectivity &&
+		(!tightUpperBound || !tightLowerBound) {
+		// Similar to Postgres, assume that an open-ended inequality predicate will
+		// scan at least 1/10000th of the table. This accounts for the possibility
+		// that the histogram missed extreme values due to sampling or staleness.
+		inequalityClamp := props.MakeSelectivity(histogramUnboundedInequalityMinSelectivity)
+		clampedSel = props.MaxSelectivity(clampedSel, inequalityClamp)
+	}
 	return clampedSel
 }
 
diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go
@@ -45,6 +45,13 @@ type Histogram struct {
 	// corresponds to the highest expected multiplicity of any value missing from
 	// the histogram.
 	resolution float64
+	// hasTightUB and hasTightLB indicate whether there are guaranteed upper and
+	// lower bounds, respectively, on the range of values in the underlying dataset
+	// (possibly, though not necessarily, equal to the upper and lower bounds of the
+	// histogram). This holds for a histogram derived after a filter that restricts
+	// the column's values to a finite range. Contrast this with the histogram
+	// derived from a table sample, which may miss extreme values or become stale.
+	hasTightUB, hasTightLB bool
 }
 
 func (h *Histogram) String() string {
@@ -70,21 +77,21 @@ func (h *Histogram) Init(
 	}
 }
 
-// bucketCount returns the number of buckets in the histogram.
-func (h *Histogram) bucketCount() int {
+// BucketCount returns the number of buckets in the histogram.
+func (h *Histogram) BucketCount() int {
 	return len(h.buckets)
 }
 
 // numEq returns NumEq for the ith histogram bucket, with the histogram's
 // selectivity applied. i must be greater than or equal to 0 and less than
-// bucketCount.
+// BucketCount.
 func (h *Histogram) numEq(i int) float64 {
 	return h.buckets[i].NumEq * h.selectivity
 }
 
 // numRange returns NumRange for the ith histogram bucket, with the histogram's
 // selectivity applied. i must be greater than or equal to 0 and less than
-// bucketCount.
+// BucketCount.
 func (h *Histogram) numRange(i int) float64 {
 	// The first bucket always has a zero value for NumRange, so the lower bound
 	// of the histogram is the upper bound of the first bucket. We only check this
@@ -100,7 +107,7 @@ func (h *Histogram) numRange(i int) float64 {
 
 // distinctRange returns DistinctRange for the ith histogram bucket, with the
 // histogram's selectivity applied. i must be greater than or equal to 0 and
-// less than bucketCount.
+// less than BucketCount.
 func (h *Histogram) distinctRange(i int) float64 {
 	n := h.buckets[i].NumRange
 	d := h.buckets[i].DistinctRange
@@ -127,7 +134,7 @@ func (h *Histogram) distinctRange(i int) float64 {
 }
 
 // upperBound returns UpperBound for the ith histogram bucket. i must be
-// greater than or equal to 0 and less than bucketCount.
+// greater than or equal to 0 and less than BucketCount.
 func (h *Histogram) upperBound(i int) tree.Datum {
 	return h.buckets[i].UpperBound
 }
@@ -151,6 +158,16 @@ func (h *Histogram) Resolution() float64 {
 	return h.resolution
 }
 
+// TightBounds reports, separately for the upper and lower side, whether the
+// histogram has been constrained such that there is a guaranteed finite bound
+// on the values in the histogram column. Note that a guaranteed bound may not
+// match the histogram's maximum or minimum value. This information can be used
+// to determine how to clamp row-count estimates for inequality filters to avoid
+// over-fitting on stale or inaccurate histograms.
+func (h *Histogram) TightBounds() (tightUpper, tightLower bool) {
+	return h.hasTightUB, h.hasTightLB
+}
+
 // EqEstimate returns the estimated number of rows that equal the given
 // datum. If the datum is equal to a bucket's upperbound, it returns the
 // bucket's NumEq. If the datum falls in the range of a bucket's upper and lower
@@ -332,6 +349,33 @@ func (h *Histogram) CanFilter(
 	return 0, exactPrefix, false
 }
 
+// checkSpanBounds reports whether the given spans bound the histogram column
+// from above and below, in that order, using only the first span's start key
+// and the last span's end key. The result can be used to decide how to clamp
+// row-count estimates for inequality filters on stale or inaccurate histograms.
+func checkSpanBounds(
+	spanCount int, getSpan func(int) *constraint.Span, desc bool, colOffset int,
+) (hasUpperBound, hasLowerBound bool) {
+	if spanCount == 0 {
+		return false, false
+	}
+	firstSpan := getSpan(0)
+	lastSpan := getSpan(spanCount - 1)
+	hasBound := func(key constraint.Key) bool {
+		// NULL values are not considered bounds in this context, since they order
+		// before (or after) all non-NULL values and are not included in histograms.
+		return key.Length() > colOffset && key.Value(colOffset) != tree.DNull
+	}
+	if desc {
+		hasUpperBound = hasBound(firstSpan.StartKey())
+		hasLowerBound = hasBound(lastSpan.EndKey())
+	} else {
+		hasLowerBound = hasBound(firstSpan.StartKey())
+		hasUpperBound = hasBound(lastSpan.EndKey())
+	}
+	return hasUpperBound, hasLowerBound
+}
+
 func (h *Histogram) filter(
 	ctx context.Context,
 	spanCount int,
@@ -341,12 +385,21 @@ func (h *Histogram) filter(
 	prefix []tree.Datum,
 	columns constraint.Columns,
 ) *Histogram {
-	bucketCount := h.bucketCount()
+	bucketCount := h.BucketCount()
 	filtered := &Histogram{
 		evalCtx:     h.evalCtx,
 		col:         h.col,
 		selectivity: h.selectivity,
 		resolution:  h.resolution,
+		hasTightLB:  h.hasTightLB,
+		hasTightUB:  h.hasTightUB,
+	}
+	spanUB, spanLB := checkSpanBounds(spanCount, getSpan, desc, colOffset)
+	if spanUB {
+		filtered.hasTightUB = true
+	}
+	if spanLB {
+		filtered.hasTightLB = true
 	}
 	if bucketCount == 0 {
 		return filtered
@@ -665,7 +718,7 @@ func (hi *histogramIter) init(h *Histogram, desc bool) {
 		desc: desc,
 	}
 	if desc {
-		hi.idx = h.bucketCount()
+		hi.idx = h.BucketCount()
 	}
 	hi.next()
 }
@@ -709,7 +762,7 @@ func (hi *histogramIter) next() (ok bool) {
 		hi.eub, hi.ub, hi.elb, hi.lb = getBounds()
 	} else {
 		hi.idx++
-		if hi.idx >= hi.h.bucketCount() {
+		if hi.idx >= hi.h.BucketCount() {
 			return false
 		}
 		// If iter.desc=false, the lower bounds are less than the upper bounds.
diff --git a/pkg/sql/opt/xform/testdata/coster/outside-histogram b/pkg/sql/opt/xform/testdata/coster/outside-histogram
diff --git a/pkg/sql/sessiondatapb/local_only_session_data.proto b/pkg/sql/sessiondatapb/local_only_session_data.proto
diff --git a/pkg/sql/vars.go b/pkg/sql/vars.go

Original file line number	Diff line number	Diff line change
`@@ -4442,6 +4442,10 @@ func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool)`
`4442`	`4442`	`m.data.OptimizerClampLowHistogramSelectivity = val`
`4443`	`4443`	`}`
`4444`	`4444`
	`4445`	`+func (m *sessionDataMutator) SetOptimizerClampInequalitySelectivity(val bool) {`
	`4446`	`+ m.data.OptimizerClampInequalitySelectivity = val`
	`4447`	`+}`
	`4448`	`+`
`4445`	`4449`	`// Utility functions related to scrubbing sensitive information on SQL Stats.`
`4446`	`4450`
`4447`	`4451`	`// quantizeCounts ensures that the Count field in the`