@@ -45,6 +45,13 @@ type Histogram struct {
4545 // corresponds to the highest expected multiplicity of any value missing from
4646 // the histogram.
4747 resolution float64
48+ // hasTightUB and hasTightLB indicate whether there are guaranteed upper and
49+ // lower bounds on the range of values in the underlying dataset (possibly,
50+ // though not necessarily, equal to the histogram's upper and lower bounds).
51+ // This is the case for a histogram derived after a filter that restricts the
52+ // column's values to a finite range. Contrast this with the histogram derived
53+ // from a table sample, which may miss extreme values or become stale.
54+ hasTightUB , hasTightLB bool
4855}
4956
5057func (h * Histogram ) String () string {
@@ -70,21 +77,21 @@ func (h *Histogram) Init(
7077 }
7178}
7279
73- // bucketCount returns the number of buckets in the histogram.
74- func (h * Histogram ) bucketCount () int {
80+ // BucketCount returns the number of buckets in the histogram.
81+ func (h * Histogram ) BucketCount () int {
7582 return len (h .buckets )
7683}
7784
7885// numEq returns NumEq for the ith histogram bucket, with the histogram's
7986// selectivity applied. i must be greater than or equal to 0 and less than
80- // bucketCount .
87+ // BucketCount .
8188func (h * Histogram ) numEq (i int ) float64 {
8289 return h .buckets [i ].NumEq * h .selectivity
8390}
8491
8592// numRange returns NumRange for the ith histogram bucket, with the histogram's
8693// selectivity applied. i must be greater than or equal to 0 and less than
87- // bucketCount .
94+ // BucketCount .
8895func (h * Histogram ) numRange (i int ) float64 {
8996 // The first bucket always has a zero value for NumRange, so the lower bound
9097 // of the histogram is the upper bound of the first bucket. We only check this
@@ -100,7 +107,7 @@ func (h *Histogram) numRange(i int) float64 {
100107
101108// distinctRange returns DistinctRange for the ith histogram bucket, with the
102109// histogram's selectivity applied. i must be greater than or equal to 0 and
103- // less than bucketCount .
110+ // less than BucketCount .
104111func (h * Histogram ) distinctRange (i int ) float64 {
105112 n := h .buckets [i ].NumRange
106113 d := h .buckets [i ].DistinctRange
@@ -127,7 +134,7 @@ func (h *Histogram) distinctRange(i int) float64 {
127134}
128135
129136// upperBound returns UpperBound for the ith histogram bucket. i must be
130- // greater than or equal to 0 and less than bucketCount .
137+ // greater than or equal to 0 and less than BucketCount .
131138func (h * Histogram ) upperBound (i int ) tree.Datum {
132139 return h .buckets [i ].UpperBound
133140}
@@ -151,6 +158,16 @@ func (h *Histogram) Resolution() float64 {
151158 return h .resolution
152159}
153160
161+ // TightBounds returns whether the histogram has been constrained such that
162+ // there are guaranteed finite upper and lower bounds on the values in the
163+ // histogram column. Note that the guaranteed bounds may not match the
164+ // histogram's maximum and minimum values. This information can be used to
165+ // determine how to clamp row-count estimates for inequality filters to avoid
166+ // over-fitting on stale or inaccurate histograms.
167+ func (h * Histogram ) TightBounds () (tightUpper , tightLower bool ) {
168+ return h .hasTightUB , h .hasTightLB
169+ }
170+
154171// EqEstimate returns the estimated number of rows that equal the given
155172// datum. If the datum is equal to a bucket's upperbound, it returns the
156173// bucket's NumEq. If the datum falls in the range of a bucket's upper and lower
@@ -332,6 +349,33 @@ func (h *Histogram) CanFilter(
332349 return 0 , exactPrefix , false
333350}
334351
352+ // checkSpanBounds determines whether the given spans bound the histogram column
353+ // above and below. This can be used to determine how to clamp row-count
354+ // estimates for inequality filters to avoid over-fitting on stale or inaccurate
355+ // histograms.
356+ func checkSpanBounds (
357+ spanCount int , getSpan func (int ) * constraint.Span , desc bool , colOffset int ,
358+ ) (hasUpperBound , hasLowerBound bool ) {
359+ if spanCount == 0 {
360+ return false , false
361+ }
362+ firstSpan := getSpan (0 )
363+ lastSpan := getSpan (spanCount - 1 )
364+ hasBound := func (key constraint.Key ) bool {
365+ // A NULL value is not considered a bound in this context, since they order
366+ // before (or after) all non-NULL values and are not included in histograms.
367+ return key .Length () > colOffset && key .Value (colOffset ) != tree .DNull
368+ }
369+ if desc {
370+ hasUpperBound = hasBound (firstSpan .StartKey ())
371+ hasLowerBound = hasBound (lastSpan .EndKey ())
372+ } else {
373+ hasLowerBound = hasBound (firstSpan .StartKey ())
374+ hasUpperBound = hasBound (lastSpan .EndKey ())
375+ }
376+ return hasUpperBound , hasLowerBound
377+ }
378+
335379func (h * Histogram ) filter (
336380 ctx context.Context ,
337381 spanCount int ,
@@ -341,12 +385,21 @@ func (h *Histogram) filter(
341385 prefix []tree.Datum ,
342386 columns constraint.Columns ,
343387) * Histogram {
344- bucketCount := h .bucketCount ()
388+ bucketCount := h .BucketCount ()
345389 filtered := & Histogram {
346390 evalCtx : h .evalCtx ,
347391 col : h .col ,
348392 selectivity : h .selectivity ,
349393 resolution : h .resolution ,
394+ hasTightLB : h .hasTightLB ,
395+ hasTightUB : h .hasTightUB ,
396+ }
397+ spanUB , spanLB := checkSpanBounds (spanCount , getSpan , desc , colOffset )
398+ if spanUB {
399+ filtered .hasTightUB = true
400+ }
401+ if spanLB {
402+ filtered .hasTightLB = true
350403 }
351404 if bucketCount == 0 {
352405 return filtered
@@ -665,7 +718,7 @@ func (hi *histogramIter) init(h *Histogram, desc bool) {
665718 desc : desc ,
666719 }
667720 if desc {
668- hi .idx = h .bucketCount ()
721+ hi .idx = h .BucketCount ()
669722 }
670723 hi .next ()
671724}
@@ -709,7 +762,7 @@ func (hi *histogramIter) next() (ok bool) {
709762 hi .eub , hi .ub , hi .elb , hi .lb = getBounds ()
710763 } else {
711764 hi .idx ++
712- if hi .idx >= hi .h .bucketCount () {
765+ if hi .idx >= hi .h .BucketCount () {
713766 return false
714767 }
715768 // If iter.desc=false, the lower bounds are less than the upper bounds.
0 commit comments