@@ -90,6 +90,16 @@ const (
9090 // values from the spans a check constraint is allowed to have in order to build
9191 // a histogram from it.
9292 maxValuesForFullHistogramFromCheckConstraint = tabledesc .MaxBucketAllowed
93+
94+ // histogramPessimisticThreshold determines the cutoff point below which the
95+ // selectivity estimate of a histogram is overridden with a more pessimistic
96+ // estimate. This is to avoid over-fitting to a stale or inaccurate histogram.
97+ //
98+ // The value (1 in 10,000) was chosen because we choose sample sizes according
99+ // to table size such that we expect to *nearly* always sample all values with
100+ // multiplicity >= row_count/10000. Cardinality estimates below this threshold
101+ // are increasingly likely to be inaccurate. See also computeNumberSamples.
102+ histogramPessimisticThreshold = 1.0 / 10000.0
93103)
94104
95105// statisticsBuilder is responsible for building the statistics that are
@@ -761,7 +771,10 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
761771 // is tracked here: https://github.com/cockroachdb/cockroach/issues/50655
762772 col := cols .SingleColumn ()
763773 colStat .Histogram = & props.Histogram {}
764- colStat .Histogram .Init (sb .evalCtx , col , stat .Histogram ())
774+ // Track the minimum number of rows for which histogram selectivity
775+ // estimates are trusted.
776+ resolution := histogramPessimisticThreshold * stats .RowCount
777+ colStat .Histogram .Init (sb .evalCtx , col , stat .Histogram (), resolution )
765778 }
766779
767780 // Make sure the distinct count is at least 1, for the same reason as
@@ -786,7 +799,18 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
786799 invCols := opt .MakeColSet (invCol )
787800 if invColStat , ok := stats .ColStats .Add (invCols ); ok {
788801 invColStat .Histogram = & props.Histogram {}
789- invColStat .Histogram .Init (sb .evalCtx , invCol , stat .Histogram ())
802+ // Track the minimum number of rows for which histogram selectivity
803+ // estimates are trusted.
804+ //
805+ // NOTE: an inverted index can have multiple entries per table row.
806+ // However, we still use the number of table rows here because the
807+ // max multiplicity of a missed value is proportional to the number
808+ // of table rows, not the number of inverted index entries. For
809+ // example, the arrays [10, 20, 30] and [20, 40, 60] result in six
810+ // inverted index entries, but only a maximum multiplicity of two
811+ // for the value "20".
812+ resolution := histogramPessimisticThreshold * stats .RowCount
813+ invColStat .Histogram .Init (sb .evalCtx , invCol , stat .Histogram (), resolution )
790814 // Set inverted entry counts from the histogram. Make sure the
791815 // distinct count is at least 1, for the same reason as the row
792816 // count above.
@@ -4558,10 +4582,15 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45584582 newCount := newHist .ValuesCount ()
45594583 oldCount := oldHist .ValuesCount ()
45604584
4561- // Calculate the selectivity of the predicate. Nulls are already included
4562- // in the histogram, so we do not need to account for them separately.
4585+ // Calculate the selectivity of the predicate using the histogram. Nulls
4586+ // are already included in the histogram, so we do not need to account for
4587+ // them separately.
45634588 predicateSelectivity := props .MakeSelectivityFromFraction (newCount , oldCount )
45644589
4590+ // Possibly clamp the selectivity to a higher value to avoid overly
4591+ // optimistic estimates.
4592+ predicateSelectivity = sb .clampSelForHistogram (inputColStat , colStat , s , predicateSelectivity )
4593+
45654594 // The maximum possible selectivity of the entire expression is the minimum
45664595 // selectivity of all individual predicates.
45674596 selectivityUpperBound = props .MinSelectivity (selectivityUpperBound , predicateSelectivity )
@@ -4572,6 +4601,34 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45724601 return selectivity , selectivityUpperBound
45734602}
45744603
4604+ // clampSelForHistogram clamps the selectivity estimate derived from a histogram
4605+ // to a minimum value. This accounts for the possibility that the histogram is
4606+ // missing values due to sampling or staleness. See also
4607+ // histogramPessimisticThreshold.
4608+ func (sb * statisticsBuilder ) clampSelForHistogram (
4609+ oldColStat , newColStat * props.ColumnStatistic , s * props.Statistics , originalSel props.Selectivity ,
4610+ ) (clampedSel props.Selectivity ) {
4611+ clampedSel = originalSel
4612+ oldHist , newHist := oldColStat .Histogram , newColStat .Histogram
4613+ if sb .evalCtx .SessionData ().OptimizerClampLowHistogramSelectivity &&
4614+ newHist .ValuesCount () < oldHist .Resolution () {
4615+ // NOTE: columns with histograms are skipped when considering distinct
4616+ // counts in selectivityFromSingleColDistinctCounts, so this doesn't
4617+ // double count the effect of the predicate.
4618+ resClamp := props .MakeSelectivityFromFraction (newColStat .DistinctCount , oldColStat .DistinctCount )
4619+
4620+ // Cap the selectivity so that the row count estimate is no more than the
4621+ // pessimistic threshold. This can result in a lower estimate if the
4622+ // multiplicities of the filtered values really are low compared to the
4623+ // average multiplicity.
4624+ resClamp = props .MinSelectivity (resClamp ,
4625+ props .MakeSelectivityFromFraction (oldHist .Resolution (), s .RowCount ),
4626+ )
4627+ clampedSel = props .MaxSelectivity (clampedSel , resClamp )
4628+ }
4629+ return clampedSel
4630+ }
4631+
45754632// selectivityFromMaxFrequencies calculates the selectivity of equality
45764633// filters by using the maximum frequency of the histograms of the constrained
45774634// columns. This represents a worst-case selectivity estimate and is used to
@@ -5332,7 +5389,10 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
53325389 colStat .NullCount = nullCount
53335390 if useHistogram {
53345391 colStat .Histogram = & props.Histogram {}
5335- colStat .Histogram .Init (sb .evalCtx , firstColID , histogram )
5392+ // Track the minimum number of rows for which histogram selectivity
5393+ // estimates are trusted.
5394+ resolution := histogramPessimisticThreshold * statistics .RowCount
5395+ colStat .Histogram .Init (sb .evalCtx , firstColID , histogram , resolution )
53365396 }
53375397 sb .finalizeFromRowCountAndDistinctCounts (colStat , statistics )
53385398 tabMeta .AddCheckConstraintsStats (firstColID , colStat )
0 commit comments