Commit 0a354cb

Enabling histogram collection for PointRangeQuery (#14560)
1 parent a5e5e9a commit 0a354cb

File tree

4 files changed: 169 additions, 14 deletions

  lucene/CHANGES.txt
  lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollector.java
  lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/PointTreeBulkCollector.java
  lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/plain/histograms/TestHistogramCollectorManager.java

lucene/CHANGES.txt

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ Optimizations
 ---------------------
 * GITHUB#14418: Quick exit on filter query matching no docs when rewriting knn query. (Pan Guixin)
 
-* GITHUB#14439: Efficient Histogram Collection using multi range traversal over PointTrees (Ankit Jain)
+* GITHUB#14439, GITHUB#14560: Efficient Histogram Collection using multi range traversal over PointTrees (Ankit Jain)
 
 * GITHUB#14268: PointInSetQuery early exit on non-matching segments. (hanbj)
 
lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollector.java

Lines changed: 54 additions & 2 deletions
@@ -17,7 +17,10 @@
 package org.apache.lucene.sandbox.facet.plain.histograms;
 
 import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.concurrent.ConcurrentMap;
+import java.util.function.Function;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.DocValuesSkipper;
 import org.apache.lucene.index.DocValuesType;
@@ -27,10 +30,16 @@
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.internal.hppc.LongIntHashMap;
+import org.apache.lucene.queries.function.FunctionScoreQuery;
+import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.CollectionTerminatedException;
 import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DocIdStream;
+import org.apache.lucene.search.IndexOrDocValuesQuery;
 import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.PointRangeQuery;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorable;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.Weight;
@@ -67,13 +76,15 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
     // We can use multi range traversal logic to collect the histogram on numeric
     // field indexed as point for MATCH_ALL cases. In future, this can be extended
     // for Point Range Query cases as well
-    if (weight != null && weight.count(context) == context.reader().maxDoc()) {
+    final PointRangeQuery pointRangeQuery = getPointRangeQuery(field);
+    if (isMatchAll(context) || pointRangeQuery != null) {
       final PointValues pointValues = context.reader().getPointValues(field);
       if (PointTreeBulkCollector.canCollectEfficiently(pointValues, bucketWidth)) {
         // In case of intra segment concurrency, only one collector should collect
         // documents for all the partitions to avoid duplications across collectors
         if (leafBulkCollected.putIfAbsent(context, true) == null) {
-          PointTreeBulkCollector.collect(pointValues, bucketWidth, counts, maxBuckets);
+          PointTreeBulkCollector.collect(
+              pointValues, pointRangeQuery, bucketWidth, counts, maxBuckets);
         }
         // Either the collection is finished on this collector, or some other collector
         // already started that collection, so this collector can finish early!
@@ -330,4 +341,45 @@ static void checkMaxBuckets(int size, int maxBuckets) {
   public void setWeight(Weight weight) {
     this.weight = weight;
   }
+
+  private boolean isMatchAll(LeafReaderContext context) throws IOException {
+    return weight != null && weight.count(context) == context.reader().maxDoc();
+  }
+
+  private static final Map<Class<?>, Function<Query, Query>> queryWrappers;
+
+  // Initialize the wrapper map for unwrapping the query
+  static {
+    queryWrappers = new HashMap<>();
+    queryWrappers.put(BoostQuery.class, q -> ((BoostQuery) q).getQuery());
+    queryWrappers.put(ConstantScoreQuery.class, q -> ((ConstantScoreQuery) q).getQuery());
+    queryWrappers.put(FunctionScoreQuery.class, q -> ((FunctionScoreQuery) q).getWrappedQuery());
+    queryWrappers.put(
+        IndexOrDocValuesQuery.class, q -> ((IndexOrDocValuesQuery) q).getIndexQuery());
+  }
+
+  /** Recursively unwraps query into the concrete form for applying the optimization */
+  private static Query unwrapIntoConcreteQuery(Query query) {
+    while (queryWrappers.containsKey(query.getClass())) {
+      query = queryWrappers.get(query.getClass()).apply(query);
+    }
+
+    return query;
+  }
+
+  private PointRangeQuery getPointRangeQuery(final String field) {
+    if (weight == null || weight.getQuery() == null) {
+      return null;
+    }
+
+    final Query concreteQuery = unwrapIntoConcreteQuery(weight.getQuery());
+
+    if (concreteQuery instanceof PointRangeQuery prq) {
+      if (prq.getField().equals(field)) {
+        return prq;
+      }
+    }
+
+    return null;
+  }
 }
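Before this change the bulk PointTree path was only taken for MATCH_ALL segments; the new getPointRangeQuery/unwrapIntoConcreteQuery logic also accepts a PointRangeQuery on the histogram field, even when it is wrapped in BoostQuery, ConstantScoreQuery, FunctionScoreQuery or IndexOrDocValuesQuery. The sketch below is not part of the commit: it is a minimal usage example, assuming a hypothetical field named "price" indexed both as a LongPoint and as numeric doc values, with the class name PriceHistogramExample invented for illustration.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.internal.hppc.LongIntHashMap;
import org.apache.lucene.sandbox.facet.plain.histograms.HistogramCollectorManager;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class PriceHistogramExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
      for (long price = 0; price < 5_000; price++) {
        Document doc = new Document();
        // Index the value both as a point (enables the PointTree bulk path)
        // and as numeric doc values (used by the default per-document path).
        doc.add(new LongPoint("price", price));
        doc.add(new NumericDocValuesField("price", price));
        w.addDocument(doc);
      }
      w.commit();
      try (DirectoryReader reader = DirectoryReader.open(w)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // A point range query on the same field, wrapped in ConstantScoreQuery;
        // the collector unwraps it and can still take the optimized path.
        Query range = new ConstantScoreQuery(LongPoint.newRangeQuery("price", 1_500, 3_499));
        LongIntHashMap counts =
            searcher.search(range, new HistogramCollectorManager("price", 1_000));
        System.out.println(counts); // keys are bucket ordinals: floorDiv(value, 1000)
      }
    }
  }
}

Whether the bulk PointTree path or the per-document doc-values path ends up being used is internal to the collector; both are expected to produce the same bucket counts.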

lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/PointTreeBulkCollector.java

Lines changed: 40 additions & 11 deletions
@@ -22,7 +22,9 @@
 import java.util.function.Function;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.internal.hppc.LongIntHashMap;
+import org.apache.lucene.search.CollectionTerminatedException;
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.util.NumericUtils;
 
 /**
@@ -76,15 +78,23 @@ static boolean canCollectEfficiently(final PointValues pointValues, final long bucketWidth)
 
   static void collect(
       final PointValues pointValues,
+      final PointRangeQuery prq,
       final long bucketWidth,
      final LongIntHashMap collectorCounts,
       final int maxBuckets)
       throws IOException {
     final Function<byte[], Long> byteToLong = bytesToLong(pointValues.getBytesPerDimension());
+    long leafMin = byteToLong.apply(pointValues.getMinPackedValue());
+    long leafMax = byteToLong.apply(pointValues.getMaxPackedValue());
+    if (prq != null) {
+      leafMin = Math.max(leafMin, byteToLong.apply(prq.getLowerPoint()));
+      leafMax = Math.min(leafMax, byteToLong.apply(prq.getUpperPoint()));
+    }
     BucketManager collector =
         new BucketManager(
             collectorCounts,
-            byteToLong.apply(pointValues.getMinPackedValue()),
+            leafMin,
+            leafMax + 1, // the max value is exclusive for collector
             bucketWidth,
             byteToLong,
             maxBuckets);
@@ -135,6 +145,11 @@ public void visit(int docID) {
       public void visit(int docID, byte[] packedValue) throws IOException {
         if (!collector.withinUpperBound(packedValue)) {
           collector.finalizePreviousBucket(packedValue);
+          // If the packedValue is not within upper bound even after updating upper bound,
+          // we have exhausted the max value and should throw early termination error
+          if (!collector.withinUpperBound(packedValue)) {
+            throw new CollectionTerminatedException();
+          }
         }
 
         if (collector.withinRange(packedValue)) {
@@ -146,6 +161,11 @@ public void visit(int docID, byte[] packedValue) throws IOException {
       public void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException {
         if (!collector.withinUpperBound(packedValue)) {
           collector.finalizePreviousBucket(packedValue);
+          // If the packedValue is not within upper bound even after updating upper bound,
+          // we have exhausted the max value and should throw early termination error
+          if (!collector.withinUpperBound(packedValue)) {
+            throw new CollectionTerminatedException();
+          }
         }
 
         if (collector.withinRange(packedValue)) {
@@ -157,9 +177,14 @@ public void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException {
 
       @Override
       public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
-        // try to find the first range that may collect values from this cell
+        // Try to find the first range that may collect values from this cell
        if (!collector.withinUpperBound(minPackedValue)) {
           collector.finalizePreviousBucket(minPackedValue);
+          // If the minPackedValue is not within upper bound even after updating upper bound,
+          // we have exhausted the max value and should throw early termination error
+          if (!collector.withinUpperBound(minPackedValue)) {
+            throw new CollectionTerminatedException();
+          }
         }
 
         // Not possible to have the CELL_OUTSIDE_QUERY, as bucket lower bound is updated
@@ -176,6 +201,7 @@ private static class BucketManager {
     private final LongIntHashMap collectorCounts;
     private int counter = 0;
     private long startValue;
+    private long maxValue;
     private long endValue;
     private int nonZeroBuckets = 0;
     private int maxBuckets;
@@ -185,13 +211,16 @@ private static class BucketManager {
     public BucketManager(
         LongIntHashMap collectorCounts,
         long minValue,
+        long maxValue,
         long bucketWidth,
         Function<byte[], Long> byteToLong,
         int maxBuckets) {
       this.collectorCounts = collectorCounts;
       this.bucketWidth = bucketWidth;
-      this.startValue = Math.floorDiv(minValue, bucketWidth) * bucketWidth;
-      this.endValue = startValue + bucketWidth;
+      this.startValue = minValue;
+      this.endValue =
+          Math.min((Math.floorDiv(startValue, bucketWidth) + 1) * bucketWidth, maxValue);
+      this.maxValue = maxValue;
       this.byteToLong = byteToLong;
       this.maxBuckets = maxBuckets;
     }
@@ -205,19 +234,19 @@ private void countNode(int count) {
     }
 
     private void finalizePreviousBucket(byte[] packedValue) {
-      // TODO: Can counter ever be 0?
+      // counter can be 0 for first bucket in case
+      // of Point Range Query
       if (counter > 0) {
         collectorCounts.addTo(Math.floorDiv(startValue, bucketWidth), counter);
-        if (packedValue != null) {
-          startValue = byteToLong.apply(packedValue);
-          // Align the start value with bucket width
-          startValue = Math.floorDiv(startValue, bucketWidth) * bucketWidth;
-          endValue = startValue + bucketWidth;
-        }
         nonZeroBuckets++;
         counter = 0;
         HistogramCollector.checkMaxBuckets(nonZeroBuckets, maxBuckets);
       }
+
+      if (packedValue != null) {
+        startValue = byteToLong.apply(packedValue);
+        endValue = Math.min((Math.floorDiv(startValue, bucketWidth) + 1) * bucketWidth, maxValue);
+      }
     }
 
     private boolean withinLowerBound(byte[] value) {
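The BucketManager changes above stop aligning the first bucket's start to a bucket boundary and instead clamp each bucket to the effective [min, max) range, which may now come from the PointRangeQuery rather than the leaf's packed min/max, so the first and last buckets can be partial. The standalone snippet below (not Lucene code, hypothetical numbers) just replays that arithmetic for bucketWidth = 1000 and a query range of [1500, 3499].

public class BucketBoundsSketch {
  public static void main(String[] args) {
    long bucketWidth = 1000;
    long leafMin = 0, leafMax = 4999;           // segment's packed min/max
    long queryLower = 1500, queryUpper = 3499;  // PointRangeQuery bounds

    // Effective range: intersection of the leaf range and the query range.
    long startValue = Math.max(leafMin, queryLower);    // 1500
    long maxValue = Math.min(leafMax, queryUpper) + 1;  // 3500, exclusive

    // First bucket ends at the next bucket boundary, but never past maxValue.
    long endValue =
        Math.min((Math.floorDiv(startValue, bucketWidth) + 1) * bucketWidth, maxValue); // 2000

    // Counts are recorded against the bucket key floorDiv(startValue, bucketWidth).
    System.out.println("first bucket key = " + Math.floorDiv(startValue, bucketWidth)); // 1
    System.out.println("first bucket covers [" + startValue + ", " + endValue + ")");   // [1500, 2000)
  }
}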

lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/plain/histograms/TestHistogramCollectorManager.java

Lines changed: 74 additions & 0 deletions
@@ -26,16 +26,23 @@
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.internal.hppc.LongIntHashMap;
+import org.apache.lucene.queries.function.FunctionScoreQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DoubleValuesSource;
+import org.apache.lucene.search.IndexOrDocValuesQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.NumericUtils;
 
 public class TestHistogramCollectorManager extends LuceneTestCase {
 
@@ -136,6 +143,8 @@ public void testMultiRangePointTreeCollector() throws IOException {
     DirectoryReader reader = DirectoryReader.open(w);
     w.close();
     IndexSearcher searcher = newSearcher(reader);
+
+    // Validate the MATCH_ALL case
     LongIntHashMap actualCounts =
         searcher.search(new MatchAllDocsQuery(), new HistogramCollectorManager("f", 1000));
     LongIntHashMap expectedCounts = new LongIntHashMap();
@@ -144,6 +153,71 @@
     }
     assertEquals(expectedCounts, actualCounts);
 
+    // Validate the Point Range Query case
+    int lowerBound = random().nextInt(0, 1500);
+    int upperBound = random().nextInt(3500, 5000);
+
+    byte[] lowerPoint = new byte[Long.BYTES];
+    byte[] upperPoint = new byte[Long.BYTES];
+    NumericUtils.longToSortableBytes(lowerBound, lowerPoint, 0);
+    NumericUtils.longToSortableBytes(upperBound, upperPoint, 0);
+    final PointRangeQuery prq =
+        new PointRangeQuery("f", lowerPoint, upperPoint, 1) {
+          @Override
+          protected String toString(int dimension, byte[] value) {
+            return Long.toString(NumericUtils.sortableBytesToLong(value, 0));
+          }
+        };
+
+    actualCounts = searcher.search(prq, new HistogramCollectorManager("f", 1000));
+    expectedCounts = new LongIntHashMap();
+    for (long value : values) {
+      if (value >= lowerBound && value <= upperBound) {
+        expectedCounts.addTo(Math.floorDiv(value, 1000), 1);
+      }
+    }
+    assertEquals(expectedCounts, actualCounts);
+
+    // Validate the BoostQuery case
+    actualCounts =
+        searcher.search(new BoostQuery(prq, 1.5f), new HistogramCollectorManager("f", 1000));
+    // Don't need to compute expectedCounts again as underlying point range
+    // query is not changing
+    assertEquals(expectedCounts, actualCounts);
+
+    // Validate the ConstantScoreQuery case
+    actualCounts =
+        searcher.search(new ConstantScoreQuery(prq), new HistogramCollectorManager("f", 1000));
+    // Don't need to compute expectedCounts again as underlying point range query is not changing
+    assertEquals(expectedCounts, actualCounts);
+
+    // Validate the FunctionScoreQuery case
+    actualCounts =
+        searcher.search(
+            new FunctionScoreQuery(prq, DoubleValuesSource.SCORES),
+            new HistogramCollectorManager("f", 1000));
+    // Don't need to compute expectedCounts again as underlying point range query is not changing
+    assertEquals(expectedCounts, actualCounts);
+
+    // Validate the IndexOrDocValuesQuery case
+    actualCounts =
+        searcher.search(
+            new IndexOrDocValuesQuery(prq, prq), new HistogramCollectorManager("f", 1000));
+    // Don't need to compute expectedCounts again as underlying point range query is not changing
+    assertEquals(expectedCounts, actualCounts);
+
+    // Validate the recursive wrapping case
+    actualCounts =
+        searcher.search(
+            new ConstantScoreQuery(
+                new BoostQuery(
+                    new FunctionScoreQuery(
+                        new IndexOrDocValuesQuery(prq, prq), DoubleValuesSource.SCORES),
+                    1.5f)),
+            new HistogramCollectorManager("f", 1000));
+    // Don't need to compute expectedCounts again as underlying point range query is not changing
+    assertEquals(expectedCounts, actualCounts);
+
     reader.close();
     dir.close();
   }
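The new assertions derive expectedCounts by filtering the indexed values against the random lower/upper bounds and bucketing with Math.floorDiv(value, 1000). As a worked illustration only (fixed, hypothetical inputs rather than the test's randomized bounds and values), the snippet below shows the counts one would expect for one document per integer in [0, 5000) and an inclusive range of [1500, 3500].

public class ExpectedCountsSketch {
  public static void main(String[] args) {
    long bucketWidth = 1000;
    long lowerBound = 1500, upperBound = 3500;
    java.util.Map<Long, Integer> expected = new java.util.TreeMap<>();
    for (long value = 0; value < 5000; value++) {
      if (value >= lowerBound && value <= upperBound) {
        expected.merge(Math.floorDiv(value, bucketWidth), 1, Integer::sum);
      }
    }
    // Prints {1=500, 2=1000, 3=501}: bucket 1 covers [1500, 1999],
    // bucket 2 covers [2000, 2999], bucket 3 covers [3000, 3500].
    System.out.println(expected);
  }
}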
