Skip to content

Commit 0bdda92

Browse files
committed
opt: use pessimistic estimates for inequality filters
This commit changes row count estimates for inequality filters, so that we expect at least `rowCount / (bucketCount * 100)` rows to "survive" the filter. This is in line with Postgres, which clamps inequality rowcount estimates in a similar fashion. Informs #130201 Release note (sql change): Added a clamp for the estimated selectivity of inequality predicates that are unbounded on one or both sides (ex: `x > 5`). This reduces the chances of a catastrophic underestimate causing the optimizer to choose a poorly-constrained scan. The new logic is off by default, gated by the session setting `optimizer_clamp_inequality_selectivity`.
1 parent 84f790e commit 0bdda92

File tree

11 files changed

+759
-12
lines changed

11 files changed

+759
-12
lines changed

pkg/sql/exec_util.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4442,6 +4442,10 @@ func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool)
44424442
m.data.OptimizerClampLowHistogramSelectivity = val
44434443
}
44444444

4445+
func (m *sessionDataMutator) SetOptimizerClampInequalitySelectivity(val bool) {
4446+
m.data.OptimizerClampInequalitySelectivity = val
4447+
}
4448+
44454449
// Utility functions related to scrubbing sensitive information on SQL Stats.
44464450

44474451
// quantizeCounts ensures that the Count field in the

pkg/sql/logictest/testdata/logic_test/information_schema

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4189,6 +4189,7 @@ opt_split_scan_limit 2048
41894189
optimizer on
41904190
optimizer_always_use_histograms on
41914191
optimizer_check_input_min_row_count 1
4192+
optimizer_clamp_inequality_selectivity off
41924193
optimizer_clamp_low_histogram_selectivity off
41934194
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on
41944195
optimizer_enable_lock_elision on

pkg/sql/logictest/testdata/logic_test/pg_catalog

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3138,6 +3138,7 @@ on_update_rehome_row_enabled on
31383138
opt_split_scan_limit 2048 NULL NULL NULL string
31393139
optimizer_always_use_histograms on NULL NULL NULL string
31403140
optimizer_check_input_min_row_count 1 NULL NULL NULL string
3141+
optimizer_clamp_inequality_selectivity off NULL NULL NULL string
31413142
optimizer_clamp_low_histogram_selectivity off NULL NULL NULL string
31423143
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on NULL NULL NULL string
31433144
optimizer_enable_lock_elision on NULL NULL NULL string
@@ -3384,6 +3385,7 @@ on_update_rehome_row_enabled on
33843385
opt_split_scan_limit 2048 NULL user NULL 2048 2048
33853386
optimizer_always_use_histograms on NULL user NULL on on
33863387
optimizer_check_input_min_row_count 1 NULL user NULL 1 1
3388+
optimizer_clamp_inequality_selectivity off NULL user NULL off off
33873389
optimizer_clamp_low_histogram_selectivity off NULL user NULL off off
33883390
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on NULL user NULL on on
33893391
optimizer_enable_lock_elision on NULL user NULL on on
@@ -3621,6 +3623,7 @@ opt_split_scan_limit NULL NULL
36213623
optimizer NULL NULL NULL NULL NULL
36223624
optimizer_always_use_histograms NULL NULL NULL NULL NULL
36233625
optimizer_check_input_min_row_count NULL NULL NULL NULL NULL
3626+
optimizer_clamp_inequality_selectivity NULL NULL NULL NULL NULL
36243627
optimizer_clamp_low_histogram_selectivity NULL NULL NULL NULL NULL
36253628
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables NULL NULL NULL NULL NULL
36263629
optimizer_enable_lock_elision NULL NULL NULL NULL NULL

pkg/sql/logictest/testdata/logic_test/show_source

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ on_update_rehome_row_enabled on
155155
opt_split_scan_limit 2048
156156
optimizer_always_use_histograms on
157157
optimizer_check_input_min_row_count 1
158+
optimizer_clamp_inequality_selectivity off
158159
optimizer_clamp_low_histogram_selectivity off
159160
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on
160161
optimizer_enable_lock_elision on

pkg/sql/opt/memo/memo.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ type Memo struct {
213213
useImprovedHoistJoinProject bool
214214
rowSecurity bool
215215
clampLowHistogramSelectivity bool
216+
clampInequalitySelectivity bool
216217

217218
// txnIsoLevel is the isolation level under which the plan was created. This
218219
// affects the planning of some locking operations, so it must be included in
@@ -322,6 +323,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
322323
useImprovedHoistJoinProject: evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject,
323324
rowSecurity: evalCtx.SessionData().RowSecurity,
324325
clampLowHistogramSelectivity: evalCtx.SessionData().OptimizerClampLowHistogramSelectivity,
326+
clampInequalitySelectivity: evalCtx.SessionData().OptimizerClampInequalitySelectivity,
325327
txnIsoLevel: evalCtx.TxnIsoLevel,
326328
}
327329
m.metadata.Init()
@@ -499,6 +501,7 @@ func (m *Memo) IsStale(
499501
m.useImprovedHoistJoinProject != evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject ||
500502
m.rowSecurity != evalCtx.SessionData().RowSecurity ||
501503
m.clampLowHistogramSelectivity != evalCtx.SessionData().OptimizerClampLowHistogramSelectivity ||
504+
m.clampInequalitySelectivity != evalCtx.SessionData().OptimizerClampInequalitySelectivity ||
502505
m.txnIsoLevel != evalCtx.TxnIsoLevel {
503506
return true, nil
504507
}

pkg/sql/opt/memo/memo_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,11 @@ func TestMemoIsStale(t *testing.T) {
600600
evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = false
601601
notStale()
602602

603+
evalCtx.SessionData().OptimizerClampInequalitySelectivity = true
604+
stale()
605+
evalCtx.SessionData().OptimizerClampInequalitySelectivity = false
606+
notStale()
607+
603608
// User no longer has access to view.
604609
catalog.View(tree.NewTableNameWithSchema("t", catconstants.PublicSchemaName, "abcview")).Revoked = true
605610
_, err = o.Memo().IsStale(ctx, &evalCtx, catalog)

pkg/sql/opt/memo/statistics_builder.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,19 @@ const (
100100
// multiplicity >= row_count/10000. Cardinality estimates below this threshold
101101
// are increasingly likely to be inaccurate. See also computeNumberSamples.
102102
histogramPessimisticThreshold = 1.0 / 10000.0
103+
104+
// histogramUnboundedInequalityMinSelectivity determines the minimum selectivity
105+
// estimate that can be derived from an unbounded (above or below) inequality
106+
// used to filter a histogram. Similar to histogramPessimisticThreshold, this
107+
// is to avoid over-fitting to a stale or inaccurate histogram.
108+
//
109+
// The value (1 in 10,000) was chosen based on similar logic in Postgres,
110+
// which clamps the selectivity to at least 1 / (bucket_count * 100). Postgres uses 100
111+
// histogram buckets by default, so the number comes out to 10,000. We avoid
112+
// using the number of histogram buckets directly to avoid arbitrary
113+
// variation in the selectivity cap depending on user settings and partial
114+
// stat collections.
115+
histogramUnboundedInequalityMinSelectivity = 1.0 / 10000.0
103116
)
104117

105118
// statisticsBuilder is responsible for building the statistics that are
@@ -4626,6 +4639,16 @@ func (sb *statisticsBuilder) clampSelForHistogram(
46264639
)
46274640
clampedSel = props.MaxSelectivity(clampedSel, resClamp)
46284641
}
4642+
4643+
tightUpperBound, tightLowerBound := newHist.TightBounds()
4644+
if sb.evalCtx.SessionData().OptimizerClampInequalitySelectivity &&
4645+
(!tightUpperBound || !tightLowerBound) {
4646+
// Similar to Postgres, assume that an open-ended inequality predicate will
4647+
// scan at least 1/10000th of the table. This accounts for the possibility
4648+
// that the histogram missed extreme values due to sampling or staleness.
4649+
inequalityClamp := props.MakeSelectivity(histogramUnboundedInequalityMinSelectivity)
4650+
clampedSel = props.MaxSelectivity(clampedSel, inequalityClamp)
4651+
}
46294652
return clampedSel
46304653
}
46314654

pkg/sql/opt/props/histogram.go

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ type Histogram struct {
4545
// corresponds to the highest expected multiplicity of any value missing from
4646
// the histogram.
4747
resolution float64
48+
// hasTightUB and hasTightLB indicate whether there are guaranteed upper
49+
// bounds on the range of values in the underlying dataset (possibly, though
50+
// not necessarily, equal to the upper and lower bounds of the histogram).
51+
// This is the case for a histogram derived after a filter that restricts the
52+
// column's values to a finite range. Contrast this with the histogram derived
53+
// from a table sample, which may miss extreme values or become stale.
54+
hasTightUB, hasTightLB bool
4855
}
4956

5057
func (h *Histogram) String() string {
@@ -70,21 +77,21 @@ func (h *Histogram) Init(
7077
}
7178
}
7279

73-
// bucketCount returns the number of buckets in the histogram.
74-
func (h *Histogram) bucketCount() int {
80+
// BucketCount returns the number of buckets in the histogram.
81+
func (h *Histogram) BucketCount() int {
7582
return len(h.buckets)
7683
}
7784

7885
// numEq returns NumEq for the ith histogram bucket, with the histogram's
7986
// selectivity applied. i must be greater than or equal to 0 and less than
80-
// bucketCount.
87+
// BucketCount.
8188
func (h *Histogram) numEq(i int) float64 {
8289
return h.buckets[i].NumEq * h.selectivity
8390
}
8491

8592
// numRange returns NumRange for the ith histogram bucket, with the histogram's
8693
// selectivity applied. i must be greater than or equal to 0 and less than
87-
// bucketCount.
94+
// BucketCount.
8895
func (h *Histogram) numRange(i int) float64 {
8996
// The first bucket always has a zero value for NumRange, so the lower bound
9097
// of the histogram is the upper bound of the first bucket. We only check this
@@ -100,7 +107,7 @@ func (h *Histogram) numRange(i int) float64 {
100107

101108
// distinctRange returns DistinctRange for the ith histogram bucket, with the
102109
// histogram's selectivity applied. i must be greater than or equal to 0 and
103-
// less than bucketCount.
110+
// less than BucketCount.
104111
func (h *Histogram) distinctRange(i int) float64 {
105112
n := h.buckets[i].NumRange
106113
d := h.buckets[i].DistinctRange
@@ -127,7 +134,7 @@ func (h *Histogram) distinctRange(i int) float64 {
127134
}
128135

129136
// upperBound returns UpperBound for the ith histogram bucket. i must be
130-
// greater than or equal to 0 and less than bucketCount.
137+
// greater than or equal to 0 and less than BucketCount.
131138
func (h *Histogram) upperBound(i int) tree.Datum {
132139
return h.buckets[i].UpperBound
133140
}
@@ -151,6 +158,16 @@ func (h *Histogram) Resolution() float64 {
151158
return h.resolution
152159
}
153160

161+
// TightBounds returns whether the histogram has been constrained such that
162+
// there are guaranteed finite upper and lower bounds on the values in the
163+
// histogram column. Note that the guaranteed bounds may not match the
164+
// histogram's maximum and minimum values. This information can be used to
165+
// determine how to clamp row-count estimates for inequality filters to avoid
166+
// over-fitting on stale or inaccurate histograms.
167+
func (h *Histogram) TightBounds() (tightUpper, tightLower bool) {
168+
return h.hasTightUB, h.hasTightLB
169+
}
170+
154171
// EqEstimate returns the estimated number of rows that equal the given
155172
// datum. If the datum is equal to a bucket's upperbound, it returns the
156173
// bucket's NumEq. If the datum falls in the range of a bucket's upper and lower
@@ -332,6 +349,33 @@ func (h *Histogram) CanFilter(
332349
return 0, exactPrefix, false
333350
}
334351

352+
// checkSpanBounds determines whether the given spans bound the histogram column
353+
// above and below. This can be used to determine how to clamp row-count
354+
// estimates for inequality filters to avoid over-fitting on stale or inaccurate
355+
// histograms.
356+
func checkSpanBounds(
357+
spanCount int, getSpan func(int) *constraint.Span, desc bool, colOffset int,
358+
) (hasUpperBound, hasLowerBound bool) {
359+
if spanCount == 0 {
360+
return false, false
361+
}
362+
firstSpan := getSpan(0)
363+
lastSpan := getSpan(spanCount - 1)
364+
hasBound := func(key constraint.Key) bool {
365+
// A NULL value is not considered a bound in this context, since NULLs order
366+
// before (or after) all non-NULL values and are not included in histograms.
367+
return key.Length() > colOffset && key.Value(colOffset) != tree.DNull
368+
}
369+
if desc {
370+
hasUpperBound = hasBound(firstSpan.StartKey())
371+
hasLowerBound = hasBound(lastSpan.EndKey())
372+
} else {
373+
hasLowerBound = hasBound(firstSpan.StartKey())
374+
hasUpperBound = hasBound(lastSpan.EndKey())
375+
}
376+
return hasUpperBound, hasLowerBound
377+
}
378+
335379
func (h *Histogram) filter(
336380
ctx context.Context,
337381
spanCount int,
@@ -341,12 +385,21 @@ func (h *Histogram) filter(
341385
prefix []tree.Datum,
342386
columns constraint.Columns,
343387
) *Histogram {
344-
bucketCount := h.bucketCount()
388+
bucketCount := h.BucketCount()
345389
filtered := &Histogram{
346390
evalCtx: h.evalCtx,
347391
col: h.col,
348392
selectivity: h.selectivity,
349393
resolution: h.resolution,
394+
hasTightLB: h.hasTightLB,
395+
hasTightUB: h.hasTightUB,
396+
}
397+
spanUB, spanLB := checkSpanBounds(spanCount, getSpan, desc, colOffset)
398+
if spanUB {
399+
filtered.hasTightUB = true
400+
}
401+
if spanLB {
402+
filtered.hasTightLB = true
350403
}
351404
if bucketCount == 0 {
352405
return filtered
@@ -665,7 +718,7 @@ func (hi *histogramIter) init(h *Histogram, desc bool) {
665718
desc: desc,
666719
}
667720
if desc {
668-
hi.idx = h.bucketCount()
721+
hi.idx = h.BucketCount()
669722
}
670723
hi.next()
671724
}
@@ -709,7 +762,7 @@ func (hi *histogramIter) next() (ok bool) {
709762
hi.eub, hi.ub, hi.elb, hi.lb = getBounds()
710763
} else {
711764
hi.idx++
712-
if hi.idx >= hi.h.bucketCount() {
765+
if hi.idx >= hi.h.BucketCount() {
713766
return false
714767
}
715768
// If iter.desc=false, the lower bounds are less than the upper bounds.

0 commit comments

Comments
 (0)