Skip to content

Commit 05a1a3a

Browse files
author
Aman Sinha
committed
DRILL-7242: Handle additional boundary cases and compute better estimates when popular values span multiple buckets.
Address review comments. close #1785
1 parent 0195d1f commit 05a1a3a

File tree

5 files changed

+215
-40
lines changed

5 files changed

+215
-40
lines changed

exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,5 @@ public interface Histogram {
3838
* @param totalRowCount
3939
* @return estimated selectivity or NULL if it could not be estimated for any reason
4040
*/
41-
Double estimatedSelectivity(final RexNode filter, final long totalRowCount);
41+
Double estimatedSelectivity(final RexNode filter, final long totalRowCount, final long ndv);
4242
}

exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java

Lines changed: 87 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
import org.apache.calcite.rex.RexLiteral;
3131
import com.tdunning.math.stats.MergingDigest;
3232
import org.apache.calcite.sql.SqlOperator;
33+
import org.apache.drill.shaded.guava.com.google.common.annotations.VisibleForTesting;
3334
import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
35+
import org.apache.drill.shaded.guava.com.google.common.collect.BoundType;
3436
import org.apache.drill.shaded.guava.com.google.common.collect.Range;
3537

3638
/**
@@ -85,6 +87,11 @@ public Double[] getBuckets() {
8587
return buckets;
8688
}
8789

90+
@VisibleForTesting
91+
protected void setBucketValue(int index, Double value) {
92+
buckets[index] = value;
93+
}
94+
8895
/**
8996
* Get the number of buckets in the histogram
9097
* number of buckets is 1 less than the total # entries in the buckets array since last
@@ -105,7 +112,7 @@ public int getNumBuckets() {
105112
* first and last bucket may be partially covered and all other buckets in the middle are fully covered.
106113
*/
107114
@Override
108-
public Double estimatedSelectivity(final RexNode columnFilter, final long totalRowCount) {
115+
public Double estimatedSelectivity(final RexNode columnFilter, final long totalRowCount, final long ndv) {
109116
if (numRowsPerBucket == 0) {
110117
return null;
111118
}
@@ -127,7 +134,7 @@ public Double estimatedSelectivity(final RexNode columnFilter, final long totalR
127134
int unknown = unknownFilterList.size();
128135

129136
if (valuesRange.hasLowerBound() || valuesRange.hasUpperBound()) {
130-
numSelectedRows = getSelectedRows(valuesRange);
137+
numSelectedRows = getSelectedRows(valuesRange, ndv);
131138
} else {
132139
numSelectedRows = 0;
133140
}
@@ -178,101 +185,143 @@ private Range<Double> getValuesRange(List<RexNode> filterList, Range<Double> ful
178185
return currentRange;
179186
}
180187

181-
private long getSelectedRows(final Range range) {
182-
final int numBuckets = buckets.length - 1;
188+
@VisibleForTesting
189+
protected long getSelectedRows(final Range range, final long ndv) {
183190
double startBucketFraction = 1.0;
184191
double endBucketFraction = 1.0;
185192
long numRows = 0;
186193
int result;
187194
Double lowValue = null;
188195
Double highValue = null;
189-
final int first = 0;
190-
final int last = buckets.length - 1;
191-
int startBucket = first;
192-
int endBucket = last;
196+
final int firstStartPointIndex = 0;
197+
final int lastEndPointIndex = buckets.length - 1;
198+
int startBucket = firstStartPointIndex;
199+
int endBucket = lastEndPointIndex - 1;
193200

194201
if (range.hasLowerBound()) {
195202
lowValue = (Double) range.lowerEndpoint();
196203

197-
// if low value is greater than the end point of the last bucket then none of the rows qualify
198-
if (lowValue.compareTo(buckets[last]) > 0) {
204+
// if low value is greater than the end point of the last bucket or if it is equal but the range is open (i.e.
205+
// predicate is of type > 5 where 5 is the end point of last bucket) then none of the rows qualify
206+
result = lowValue.compareTo(buckets[lastEndPointIndex]);
207+
if (result > 0 || result == 0 && range.lowerBoundType() == BoundType.OPEN) {
199208
return 0;
200209
}
201-
202-
result = lowValue.compareTo(buckets[first]);
210+
result = lowValue.compareTo(buckets[firstStartPointIndex]);
203211

204212
// if low value is less than or equal to the first bucket's start point then start with the first bucket and all
205213
// rows in first bucket are included
206214
if (result <= 0) {
207-
startBucket = first;
215+
startBucket = firstStartPointIndex;
208216
startBucketFraction = 1.0;
209217
} else {
210-
// Use a simplified logic where we treat > and >= the same when computing selectivity since the
211-
// difference is going to be very small for reasonable sized data sets
212-
startBucket = getContainingBucket(lowValue, numBuckets);
218+
startBucket = getContainingBucket(lowValue, lastEndPointIndex, true);
219+
213220
// expecting start bucket to be >= 0 since other conditions have been handled previously
214221
Preconditions.checkArgument(startBucket >= 0, "Expected start bucket id >= 0");
215-
startBucketFraction = ((double) (buckets[startBucket + 1] - lowValue)) / (buckets[startBucket + 1] - buckets[startBucket]);
222+
223+
if (buckets[startBucket + 1].doubleValue() == buckets[startBucket].doubleValue()) {
224+
// if start and end points of the bucket are the same, consider entire bucket
225+
startBucketFraction = 1.0;
226+
} else if (range.lowerBoundType() == BoundType.CLOSED && buckets[startBucket + 1].doubleValue() == lowValue.doubleValue()) {
227+
// predicate is of type >= 5.0 and 5.0 happens to be the end point of this bucket (i.e. the start point of the next bucket)
228+
// In this case, use the overall NDV to approximate
229+
startBucketFraction = 1.0 / ndv;
230+
} else {
231+
startBucketFraction = ((double) (buckets[startBucket + 1] - lowValue)) / (buckets[startBucket + 1] - buckets[startBucket]);
232+
}
216233
}
217234
}
218235

219236
if (range.hasUpperBound()) {
220237
highValue = (Double) range.upperEndpoint();
221238

222-
// if the high value is less than the start point of the first bucket then none of the rows qualify
223-
if (highValue.compareTo(buckets[first]) < 0) {
239+
// if the high value is less than the start point of the first bucket or if it is equal but the range is open (i.e.
240+
// predicate is of type < 1 where 1 is the start point of the first bucket) then none of the rows qualify
241+
result = highValue.compareTo(buckets[firstStartPointIndex]);
242+
if (result < 0 || (result == 0 && range.upperBoundType() == BoundType.OPEN)) {
224243
return 0;
225244
}
226245

227-
result = highValue.compareTo(buckets[last]);
246+
result = highValue.compareTo(buckets[lastEndPointIndex]);
228247

229248
// if high value is greater than or equal to the last bucket's end point then include the last bucket and all rows in
230249
// last bucket qualify
231250
if (result >= 0) {
232-
endBucket = last;
251+
endBucket = lastEndPointIndex - 1;
233252
endBucketFraction = 1.0;
234253
} else {
235-
// Use a simplified logic where we treat < and <= the same when computing selectivity since the
236-
// difference is going to be very small for reasonable sized data sets
237-
endBucket = getContainingBucket(highValue, numBuckets);
254+
endBucket = getContainingBucket(highValue, lastEndPointIndex, false);
255+
238256
// expecting end bucket to be >= 0 since other conditions have been handled previously
239257
Preconditions.checkArgument(endBucket >= 0, "Expected end bucket id >= 0");
240-
endBucketFraction = ((double)(highValue - buckets[endBucket])) / (buckets[endBucket + 1] - buckets[endBucket]);
258+
259+
if (buckets[endBucket + 1].doubleValue() == buckets[endBucket].doubleValue()) {
260+
// if start and end points of the bucket are the same, consider entire bucket
261+
endBucketFraction = 1.0;
262+
} else if (range.upperBoundType() == BoundType.CLOSED && buckets[endBucket].doubleValue() == highValue.doubleValue()) {
263+
// predicate is of type <= 5.0 and 5.0 happens to be the start point of the bucket
264+
// In this case, use the overall NDV to approximate
265+
endBucketFraction = 1.0/ndv;
266+
} else {
267+
endBucketFraction = ((double) (highValue - buckets[endBucket])) / (buckets[endBucket + 1] - buckets[endBucket]);
268+
}
241269
}
242270
}
243271

244-
Preconditions.checkArgument(startBucket <= endBucket);
272+
Preconditions.checkArgument(startBucket >= 0 && startBucket + 1 <= lastEndPointIndex, "Invalid startBucket: " + startBucket);
273+
Preconditions.checkArgument(endBucket >= 0 && endBucket + 1 <= lastEndPointIndex, "Invalid endBucket: " + endBucket);
274+
Preconditions.checkArgument(startBucket <= endBucket,
275+
"Start bucket: " + startBucket + " should be less than or equal to end bucket: " + endBucket);
245276

246-
// if the endBucketId corresponds to the last endpoint, then adjust it to be one less
247-
if (endBucket == last) {
248-
endBucket = last - 1;
249-
}
250-
if (startBucket == endBucket && highValue != null && lowValue != null) {
277+
if (startBucket == endBucket) {
251278
// if the start and end buckets are the same, interpolate based on the difference between the high and low value
252-
numRows = (long) ((highValue - lowValue) / (buckets[endBucket + 1] - buckets[startBucket]) * numRowsPerBucket);
279+
if (highValue != null && lowValue != null) {
280+
numRows = (long) ((highValue - lowValue) / (buckets[startBucket + 1] - buckets[startBucket]) * numRowsPerBucket);
281+
} else if (highValue != null) {
282+
numRows = (long) (endBucketFraction * numRowsPerBucket);
283+
} else {
284+
numRows = (long) (startBucketFraction * numRowsPerBucket);
285+
}
253286
} else {
254-
numRows = (long) ((startBucketFraction + endBucketFraction) * numRowsPerBucket + (endBucket - startBucket - 1) * numRowsPerBucket);
287+
int numIntermediateBuckets = (endBucket > startBucket + 1) ? (endBucket - startBucket - 1) : 0;
288+
numRows = (long) ((startBucketFraction + endBucketFraction) * numRowsPerBucket + numIntermediateBuckets * numRowsPerBucket);
255289
}
256290

257291
return numRows;
258292
}
259293

260-
private int getContainingBucket(final Double value, final int numBuckets) {
294+
/**
295+
* Get the start point of the containing bucket for the supplied value. If there are multiple buckets with the
296+
* same start point, return either the first matching or last matching depending on firstMatching flag
297+
* @param value the input double value
298+
* @param lastEndPointIndex index of the final end point in the buckets array (i.e. buckets.length - 1)
299+
* @param firstMatching If true, return the first bucket that matches the specified criteria otherwise return the last one
300+
* @return index of either the first or last matching bucket if a match was found, otherwise return -1
301+
*/
302+
private int getContainingBucket(final Double value, final int lastEndPointIndex, final boolean firstMatching) {
261303
int i = 0;
262304
int containing_bucket = -1;
305+
263306
// check which bucket this value falls in
264-
for (; i <= numBuckets; i++) {
307+
for (; i <= lastEndPointIndex; i++) {
265308
int result = buckets[i].compareTo(value);
266309
if (result > 0) {
267310
containing_bucket = i - 1;
268311
break;
269312
} else if (result == 0) {
270-
containing_bucket = i;
271-
break;
313+
// if we are already at the lastEndPointIndex, mark the containing bucket
314+
// as i-1 because the containing bucket should correspond to the start point of the bucket
315+
// (recall that lastEndPointIndex is the final end point of the last bucket)
316+
containing_bucket = (i == lastEndPointIndex) ? i - 1 : i;
317+
if (firstMatching) {
318+
// break if we are only interested in the first matching bucket
319+
break;
320+
}
272321
}
273322
}
274323
return containing_bucket;
275-
}
324+
}
276325

277326
private Double getLiteralValue(final RexNode filter) {
278327
Double value = null;

exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,8 @@ private double computeRangeSelectivity(TableMetadata tableMetadata, RexNode orPr
311311
Histogram histogram = columnStatistics != null ? (Histogram) columnStatistics.getStatistic(ColumnStatisticsKind.HISTOGRAM) : null;
312312
if (histogram != null) {
313313
Double totalCount = (Double) columnStatistics.getStatistic(ColumnStatisticsKind.ROWCOUNT);
314-
Double sel = histogram.estimatedSelectivity(orPred, totalCount.longValue());
314+
Double ndv = (Double) columnStatistics.getStatistic(ColumnStatisticsKind.NDV);
315+
Double sel = histogram.estimatedSelectivity(orPred, totalCount.longValue(), ndv.longValue());
315316
if (sel != null) {
316317
return sel;
317318
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.drill.exec.planner.common;
19+
20+
import org.apache.drill.categories.PlannerTest;
21+
22+
import org.apache.drill.shaded.guava.com.google.common.collect.BoundType;
23+
import org.junit.Test;
24+
import org.junit.experimental.categories.Category;
25+
import org.junit.Assert;
26+
import org.apache.drill.shaded.guava.com.google.common.collect.Range;
27+
28+
29+
@Category(PlannerTest.class)
30+
public class TestNumericEquiDepthHistogram {
31+
32+
@Test
33+
public void testHistogramWithUniqueEndpoints() throws Exception {
34+
int numBuckets = 10;
35+
int numRowsPerBucket = 250;
36+
long ndv = 25;
37+
38+
// init array with numBuckets + 1 values
39+
Double[] buckets = {1.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0};
40+
41+
NumericEquiDepthHistogram histogram = new NumericEquiDepthHistogram(numBuckets);
42+
43+
for (int i = 0; i < buckets.length; i++) {
44+
histogram.setBucketValue(i, buckets[i]);
45+
}
46+
histogram.setNumRowsPerBucket(numRowsPerBucket);
47+
48+
// Range: <= 1.0
49+
Range<Double> range1 = Range.atMost(new Double(1.0));
50+
long result1 = histogram.getSelectedRows(range1, ndv);
51+
long expected1 = 10;
52+
Assert.assertEquals(expected1, result1);
53+
54+
// Range: >= 100.0
55+
Range<Double> range2 = Range.atLeast(new Double(100.0));
56+
long result2 = histogram.getSelectedRows(range2, ndv);
57+
long expected2 = 10;
58+
Assert.assertEquals(expected2, result2);
59+
}
60+
61+
@Test
62+
public void testHistogramWithDuplicateEndpoints() throws Exception {
63+
int numBuckets = 10;
64+
int numRowsPerBucket = 250;
65+
long ndv = 25;
66+
67+
// init array with numBuckets + 1 values
68+
Double[] buckets = {10.0, 10.0, 10.0, 20.0, 20.0, 50.0, 55.0, 55.0, 60.0, 100.0, 100.0};
69+
70+
NumericEquiDepthHistogram histogram = new NumericEquiDepthHistogram(numBuckets);
71+
72+
for (int i = 0; i < buckets.length; i++) {
73+
histogram.setBucketValue(i, buckets[i]);
74+
}
75+
histogram.setNumRowsPerBucket(numRowsPerBucket);
76+
77+
// Range: <= 10.0
78+
Range<Double> range1 = Range.atMost(new Double(10.0));
79+
long result1 = histogram.getSelectedRows(range1, ndv);
80+
long expected1 = 510; // 2 full buckets plus the exact match with start point of 3rd bucket
81+
Assert.assertEquals(expected1, result1);
82+
83+
// Range: >= 100.0
84+
Range<Double> range2 = Range.atLeast(new Double(100.0));
85+
long result2 = histogram.getSelectedRows(range2, ndv);
86+
long expected2 = 250;
87+
Assert.assertEquals(expected2, result2);
88+
89+
// Range: < 10.0
90+
Range<Double> range3 = Range.lessThan(new Double(10.0));
91+
long result3 = histogram.getSelectedRows(range3, ndv);
92+
long expected3 = 0;
93+
Assert.assertEquals(expected3, result3);
94+
95+
// Range: > 100.0
96+
Range<Double> range4 = Range.greaterThan(new Double(100.0));
97+
long result4 = histogram.getSelectedRows(range4, ndv);
98+
long expected4 = 0;
99+
Assert.assertEquals(expected4, result4);
100+
101+
// Range: >= 20.0 AND <= 55.0
102+
Range<Double> range5 = Range.range(new Double(20.0), BoundType.CLOSED, new Double(55.0), BoundType.CLOSED);
103+
long result5 = histogram.getSelectedRows(range5, ndv);
104+
long expected5 = 1010;
105+
Assert.assertEquals(expected5, result5);
106+
107+
// Range: BETWEEN 15 AND 80
108+
Range<Double> range6 = Range.range(new Double(15.0), BoundType.CLOSED, new Double(80.0), BoundType.CLOSED);
109+
long result6 = histogram.getSelectedRows(range6, ndv);
110+
long expected6 = 1500;
111+
Assert.assertEquals(expected6, result6);
112+
}
113+
}

exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,18 @@ public void testHistogramWithDataTypes1() throws Exception {
425425
"Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
426426
PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan4, new String[]{});
427427

428+
// col > end_point of last bucket
429+
query = "select 1 from dfs.tmp.employee1 where store_id > 24";
430+
String[] expectedPlan5 = {"Filter\\(condition.*\\).*rowcount = 1.0,.*",
431+
"Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
432+
PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan5, new String[]{});
433+
434+
// col < start_point of first bucket
435+
query = "select 1 from dfs.tmp.employee1 where store_id < 1";
436+
String[] expectedPlan6 = {"Filter\\(condition.*\\).*rowcount = 1.0,.*",
437+
"Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
438+
PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan6, new String[]{});
439+
428440
} finally {
429441
test("ALTER SESSION SET `planner.slice_target` = " + ExecConstants.SLICE_TARGET_DEFAULT);
430442
}

0 commit comments

Comments
 (0)