Skip to content

Commit 84f790e

Browse files
committed
opt: make outside-of-histogram estimates more pessimistic
This commit makes rowcount estimation fall back on distinct count estimates when a constraint includes zero histogram values. This biases the optimizer toward less risky plans when less information is known about the filtered values. The pessimistic logic is triggered when the estimate derived from a histogram is smaller than `table_row_count / 10,000`. This threshold is chosen because we choose samples such that we expect to sample *nearly* every value with multiplicity down to `table_row_count / 10,000` (see computeNumberSamples). Selectivity estimates from a histogram below this resolution are suspect, since there is increasing likelihood that a value was missed either due to being omitted from the sample, or due to staleness. Informs #130201 Release note (sql change): Added a clamp for row-count estimates over very large tables so that the optimizer assumes that at least one distinct value will be scanned. This reduces the chances of a catastrophic underestimate. The new logic is off by default, gated by a session setting `optimizer_clamp_low_histogram_selectivity`.
1 parent 5a6ba82 commit 84f790e

File tree

13 files changed

+803
-15
lines changed

13 files changed

+803
-15
lines changed

pkg/sql/exec_util.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4438,6 +4438,10 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedHoistJoinProject(val bool) {
44384438
m.data.OptimizerUseImprovedHoistJoinProject = val
44394439
}
44404440

4441+
func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool) {
4442+
m.data.OptimizerClampLowHistogramSelectivity = val
4443+
}
4444+
44414445
// Utility functions related to scrubbing sensitive information on SQL Stats.
44424446

44434447
// quantizeCounts ensures that the Count field in the

pkg/sql/logictest/testdata/logic_test/information_schema

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4189,6 +4189,7 @@ opt_split_scan_limit 2048
41894189
optimizer on
41904190
optimizer_always_use_histograms on
41914191
optimizer_check_input_min_row_count 1
4192+
optimizer_clamp_low_histogram_selectivity off
41924193
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on
41934194
optimizer_enable_lock_elision on
41944195
optimizer_hoist_uncorrelated_equality_subqueries on

pkg/sql/logictest/testdata/logic_test/pg_catalog

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3138,6 +3138,7 @@ on_update_rehome_row_enabled on
31383138
opt_split_scan_limit 2048 NULL NULL NULL string
31393139
optimizer_always_use_histograms on NULL NULL NULL string
31403140
optimizer_check_input_min_row_count 1 NULL NULL NULL string
3141+
optimizer_clamp_low_histogram_selectivity off NULL NULL NULL string
31413142
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on NULL NULL NULL string
31423143
optimizer_enable_lock_elision on NULL NULL NULL string
31433144
optimizer_hoist_uncorrelated_equality_subqueries on NULL NULL NULL string
@@ -3383,6 +3384,7 @@ on_update_rehome_row_enabled on
33833384
opt_split_scan_limit 2048 NULL user NULL 2048 2048
33843385
optimizer_always_use_histograms on NULL user NULL on on
33853386
optimizer_check_input_min_row_count 1 NULL user NULL 1 1
3387+
optimizer_clamp_low_histogram_selectivity off NULL user NULL off off
33863388
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on NULL user NULL on on
33873389
optimizer_enable_lock_elision on NULL user NULL on on
33883390
optimizer_hoist_uncorrelated_equality_subqueries on NULL user NULL on on
@@ -3619,6 +3621,7 @@ opt_split_scan_limit NULL NULL
36193621
optimizer NULL NULL NULL NULL NULL
36203622
optimizer_always_use_histograms NULL NULL NULL NULL NULL
36213623
optimizer_check_input_min_row_count NULL NULL NULL NULL NULL
3624+
optimizer_clamp_low_histogram_selectivity NULL NULL NULL NULL NULL
36223625
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables NULL NULL NULL NULL NULL
36233626
optimizer_enable_lock_elision NULL NULL NULL NULL NULL
36243627
optimizer_hoist_uncorrelated_equality_subqueries NULL NULL NULL NULL NULL

pkg/sql/logictest/testdata/logic_test/show_source

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ on_update_rehome_row_enabled on
155155
opt_split_scan_limit 2048
156156
optimizer_always_use_histograms on
157157
optimizer_check_input_min_row_count 1
158+
optimizer_clamp_low_histogram_selectivity off
158159
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on
159160
optimizer_enable_lock_elision on
160161
optimizer_hoist_uncorrelated_equality_subqueries on

pkg/sql/opt/memo/memo.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ type Memo struct {
212212
disableSlowCascadeFastPathForRBRTables bool
213213
useImprovedHoistJoinProject bool
214214
rowSecurity bool
215+
clampLowHistogramSelectivity bool
215216

216217
// txnIsoLevel is the isolation level under which the plan was created. This
217218
// affects the planning of some locking operations, so it must be included in
@@ -320,6 +321,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
320321
disableSlowCascadeFastPathForRBRTables: evalCtx.SessionData().OptimizerDisableCrossRegionCascadeFastPathForRBRTables,
321322
useImprovedHoistJoinProject: evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject,
322323
rowSecurity: evalCtx.SessionData().RowSecurity,
324+
clampLowHistogramSelectivity: evalCtx.SessionData().OptimizerClampLowHistogramSelectivity,
323325
txnIsoLevel: evalCtx.TxnIsoLevel,
324326
}
325327
m.metadata.Init()
@@ -496,6 +498,7 @@ func (m *Memo) IsStale(
496498
m.disableSlowCascadeFastPathForRBRTables != evalCtx.SessionData().OptimizerDisableCrossRegionCascadeFastPathForRBRTables ||
497499
m.useImprovedHoistJoinProject != evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject ||
498500
m.rowSecurity != evalCtx.SessionData().RowSecurity ||
501+
m.clampLowHistogramSelectivity != evalCtx.SessionData().OptimizerClampLowHistogramSelectivity ||
499502
m.txnIsoLevel != evalCtx.TxnIsoLevel {
500503
return true, nil
501504
}

pkg/sql/opt/memo/memo_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,11 @@ func TestMemoIsStale(t *testing.T) {
595595
evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject = false
596596
notStale()
597597

598+
evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = true
599+
stale()
600+
evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = false
601+
notStale()
602+
598603
// User no longer has access to view.
599604
catalog.View(tree.NewTableNameWithSchema("t", catconstants.PublicSchemaName, "abcview")).Revoked = true
600605
_, err = o.Memo().IsStale(ctx, &evalCtx, catalog)

pkg/sql/opt/memo/statistics_builder.go

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,16 @@ const (
9090
// values from the spans a check constraint is allowed to have in order to build
9191
// a histogram from it.
9292
maxValuesForFullHistogramFromCheckConstraint = tabledesc.MaxBucketAllowed
93+
94+
// histogramPessimisticThreshold determines the cutoff point below which the
95+
// selectivity estimate of a histogram is overridden with a more pessimistic
96+
// estimate. This is to avoid over-fitting to a stale or inaccurate histogram.
97+
//
98+
// The value (1 in 10,000) was chosen because we choose sample sizes according
99+
// to table size such that we expect to *nearly* always sample all values with
100+
// multiplicity >= row_count/10000. Cardinality estimates below this threshold
101+
// are increasingly likely to be inaccurate. See also computeNumberSamples.
102+
histogramPessimisticThreshold = 1.0 / 10000.0
93103
)
94104

95105
// statisticsBuilder is responsible for building the statistics that are
@@ -761,7 +771,10 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
761771
// is tracked here: https://github.com/cockroachdb/cockroach/issues/50655
762772
col := cols.SingleColumn()
763773
colStat.Histogram = &props.Histogram{}
764-
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
774+
// Track the minimum number of rows for which histogram selectivity
775+
// estimates are trusted.
776+
resolution := histogramPessimisticThreshold * stats.RowCount
777+
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram(), resolution)
765778
}
766779

767780
// Make sure the distinct count is at least 1, for the same reason as
@@ -786,7 +799,18 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
786799
invCols := opt.MakeColSet(invCol)
787800
if invColStat, ok := stats.ColStats.Add(invCols); ok {
788801
invColStat.Histogram = &props.Histogram{}
789-
invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram())
802+
// Track the minimum number of rows for which histogram selectivity
803+
// estimates are trusted.
804+
//
805+
// NOTE: an inverted index can have multiple entries per table row.
806+
// However, we still use the number of table rows here because the
807+
// max multiplicity of a missed value is proportional to the number
808+
// of table rows, not the number of inverted index entries. For
809+
// example, the arrays [10, 20, 30] and [20, 40, 60] result in six
810+
// inverted index entries, but only a maximum multiplicity of two
811+
// for the value "20".
812+
resolution := histogramPessimisticThreshold * stats.RowCount
813+
invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram(), resolution)
790814
// Set inverted entry counts from the histogram. Make sure the
791815
// distinct count is at least 1, for the same reason as the row
792816
// count above.
@@ -4558,10 +4582,15 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45584582
newCount := newHist.ValuesCount()
45594583
oldCount := oldHist.ValuesCount()
45604584

4561-
// Calculate the selectivity of the predicate. Nulls are already included
4562-
// in the histogram, so we do not need to account for them separately.
4585+
// Calculate the selectivity of the predicate using the histogram. Nulls
4586+
// are already included in the histogram, so we do not need to account for
4587+
// them separately.
45634588
predicateSelectivity := props.MakeSelectivityFromFraction(newCount, oldCount)
45644589

4590+
// Possibly clamp the selectivity to a higher value to avoid overly
4591+
// optimistic estimates.
4592+
predicateSelectivity = sb.clampSelForHistogram(inputColStat, colStat, s, predicateSelectivity)
4593+
45654594
// The maximum possible selectivity of the entire expression is the minimum
45664595
// selectivity of all individual predicates.
45674596
selectivityUpperBound = props.MinSelectivity(selectivityUpperBound, predicateSelectivity)
@@ -4572,6 +4601,34 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45724601
return selectivity, selectivityUpperBound
45734602
}
45744603

4604+
// clampSelForHistogram clamps the selectivity estimate derived from a histogram
4605+
// to a minimum value. This accounts for the possibility that the histogram is
4606+
// missing values due to sampling or staleness. See also
4607+
// histogramPessimisticThreshold.
4608+
func (sb *statisticsBuilder) clampSelForHistogram(
4609+
oldColStat, newColStat *props.ColumnStatistic, s *props.Statistics, originalSel props.Selectivity,
4610+
) (clampedSel props.Selectivity) {
4611+
clampedSel = originalSel
4612+
oldHist, newHist := oldColStat.Histogram, newColStat.Histogram
4613+
if sb.evalCtx.SessionData().OptimizerClampLowHistogramSelectivity &&
4614+
newHist.ValuesCount() < oldHist.Resolution() {
4615+
// NOTE: columns with histograms are skipped when considering distinct
4616+
// counts in selectivityFromSingleColDistinctCounts, so this doesn't
4617+
// double count the effect of the predicate.
4618+
resClamp := props.MakeSelectivityFromFraction(newColStat.DistinctCount, oldColStat.DistinctCount)
4619+
4620+
// Cap the selectivity so that the row count estimate is no more than the
4621+
// pessimistic threshold. This can result in a lower estimate if the
4622+
// multiplicities of the filtered values really are low compared to the
4623+
// average multiplicity.
4624+
resClamp = props.MinSelectivity(resClamp,
4625+
props.MakeSelectivityFromFraction(oldHist.Resolution(), s.RowCount),
4626+
)
4627+
clampedSel = props.MaxSelectivity(clampedSel, resClamp)
4628+
}
4629+
return clampedSel
4630+
}
4631+
45754632
// selectivityFromMaxFrequencies calculates the selectivity of equality
45764633
// filters by using the maximum frequency of the histograms of the constrained
45774634
// columns. This represents a worst-case selectivity estimate and is used to
@@ -5332,7 +5389,10 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
53325389
colStat.NullCount = nullCount
53335390
if useHistogram {
53345391
colStat.Histogram = &props.Histogram{}
5335-
colStat.Histogram.Init(sb.evalCtx, firstColID, histogram)
5392+
// Track the minimum number of rows for which histogram selectivity
5393+
// estimates are trusted.
5394+
resolution := histogramPessimisticThreshold * statistics.RowCount
5395+
colStat.Histogram.Init(sb.evalCtx, firstColID, histogram, resolution)
53365396
}
53375397
sb.finalizeFromRowCountAndDistinctCounts(colStat, statistics)
53385398
tabMeta.AddCheckConstraintsStats(firstColID, colStat)

pkg/sql/opt/props/histogram.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@ type Histogram struct {
3838
selectivity float64
3939
buckets []cat.HistogramBucket
4040
col opt.ColumnID
41+
// resolution is the number of rows below which selectivity estimates based on
42+
// this histogram should fall back to a more pessimistic distinct-count based
43+
// estimate. This is used to avoid overfitting to histograms that may be
44+
// missing values due to sampling or staleness. This number roughly
45+
// corresponds to the highest expected multiplicity of any value missing from
46+
// the histogram.
47+
resolution float64
4148
}
4249

4350
func (h *Histogram) String() string {
@@ -49,14 +56,17 @@ func (h *Histogram) String() string {
4956
}
5057

5158
// Init initializes the histogram with data from the catalog.
52-
func (h *Histogram) Init(evalCtx *eval.Context, col opt.ColumnID, buckets []cat.HistogramBucket) {
59+
func (h *Histogram) Init(
60+
evalCtx *eval.Context, col opt.ColumnID, buckets []cat.HistogramBucket, resolution float64,
61+
) {
5362
// This initialization pattern ensures that fields are not unwittingly
5463
// reused. Field reuse must be explicit.
5564
*h = Histogram{
5665
evalCtx: evalCtx,
5766
col: col,
5867
selectivity: 1,
5968
buckets: buckets,
69+
resolution: resolution,
6070
}
6171
}
6272

@@ -134,6 +144,13 @@ func (h *Histogram) ValuesCount() float64 {
134144
return count
135145
}
136146

147+
// Resolution returns the minimum row count for which selectivity estimates
148+
// based on this histogram should be trusted. See the resolution field comment
149+
// for details.
150+
func (h *Histogram) Resolution() float64 {
151+
return h.resolution
152+
}
153+
137154
// EqEstimate returns the estimated number of rows that equal the given
138155
// datum. If the datum is equal to a bucket's upperbound, it returns the
139156
// bucket's NumEq. If the datum falls in the range of a bucket's upper and lower
@@ -329,6 +346,7 @@ func (h *Histogram) filter(
329346
evalCtx: h.evalCtx,
330347
col: h.col,
331348
selectivity: h.selectivity,
349+
resolution: h.resolution,
332350
}
333351
if bucketCount == 0 {
334352
return filtered

pkg/sql/opt/props/histogram_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func TestEqEstimate(t *testing.T) {
2929
evalCtx := eval.MakeTestingEvalContext(cluster.MakeTestingClusterSettings())
3030

3131
emptyHist := &Histogram{}
32-
emptyHist.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{})
32+
emptyHist.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{}, 0 /* resolution */)
3333

3434
if eq := emptyHist.EqEstimate(ctx, tree.NewDInt(0)); eq != 0 {
3535
t.Errorf("expected %f but found %f", 0.0, eq)
@@ -45,7 +45,7 @@ func TestEqEstimate(t *testing.T) {
4545
{NumRange: 40, DistinctRange: 7, NumEq: 35, UpperBound: tree.NewDInt(42)},
4646
}
4747
h := &Histogram{}
48-
h.Init(&evalCtx, opt.ColumnID(1), histData)
48+
h.Init(&evalCtx, opt.ColumnID(1), histData, 0 /* resolution */)
4949

5050
testData := []struct {
5151
datum tree.Datum
@@ -139,7 +139,7 @@ func TestCanFilter(t *testing.T) {
139139
}
140140

141141
h := Histogram{}
142-
h.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{})
142+
h.Init(&evalCtx, opt.ColumnID(1), []cat.HistogramBucket{}, 0 /* resolution */)
143143
for _, tc := range testData {
144144
c := constraint.ParseConstraint(&evalCtx, tc.constraint)
145145
colIdx, _, ok := h.CanFilter(ctx, &c)
@@ -170,7 +170,7 @@ func TestHistogram(t *testing.T) {
170170
{NumRange: 40, DistinctRange: 7, NumEq: 35, UpperBound: tree.NewDInt(42)},
171171
}
172172
h := &Histogram{}
173-
h.Init(&evalCtx, opt.ColumnID(1), histData)
173+
h.Init(&evalCtx, opt.ColumnID(1), histData, 0 /* resolution */)
174174
count, expected := h.ValuesCount(), float64(91)
175175
if count != expected {
176176
t.Fatalf("expected %f but found %f", expected, count)
@@ -1212,7 +1212,7 @@ func BenchmarkHistogram(b *testing.B) {
12121212
for _, bucketCount := range bucketCounts {
12131213
b.Run(fmt.Sprintf("buckets=%v", bucketCount), func(b *testing.B) {
12141214
h := Histogram{}
1215-
h.Init(&evalCtx, opt.ColumnID(1), makeBuckets(typ, bucketCount))
1215+
h.Init(&evalCtx, opt.ColumnID(1), makeBuckets(typ, bucketCount), 0 /* resolution */)
12161216
c := makeConstraint(typ, bucketCount)
12171217
b.Run("DistinctValuesCount", func(b *testing.B) {
12181218
for i := 0; i < b.N; i++ {

pkg/sql/opt/props/statistics.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,9 @@ func (c *ColumnStatistic) CopyFromOther(other *ColumnStatistic, evalCtx *eval.Co
283283
c.NullCount = other.NullCount
284284
if other.Histogram != nil && c.Cols.Len() == 1 {
285285
c.Histogram = &Histogram{}
286-
c.Histogram.Init(evalCtx, c.Cols.SingleColumn(), other.Histogram.buckets)
286+
c.Histogram.Init(
287+
evalCtx, c.Cols.SingleColumn(), other.Histogram.buckets, other.Histogram.resolution,
288+
)
287289
}
288290
}
289291

0 commit comments

Comments
 (0)