Skip to content

Commit 7a21f53

Browse files
jainankitk authored and stefanvodita committed
Logic for collecting Histogram efficiently using Point Trees (apache#14439)
1 parent bb3167e commit 7a21f53

File tree

8 files changed

+446
-3
lines changed

8 files changed

+446
-3
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ Optimizations
2929
---------------------
3030
* GITHUB#14418: Quick exit on filter query matching no docs when rewriting knn query. (Pan Guixin)
3131

32+
* GITHUB#14439: Efficient Histogram Collection using multi range traversal over PointTrees (Ankit Jain)
33+
3234
* GITHUB#14268: PointInSetQuery early exit on non-matching segments. (hanbj)
3335

3436
* GITHUB#14425: KeywordField.newSetQuery() reuses prefixed terms (Mikhail Khludnev)

lucene/benchmark-jmh/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ description = 'Lucene JMH micro-benchmarking module'
2424
dependencies {
2525
moduleImplementation project(':lucene:core')
2626
moduleImplementation project(':lucene:expressions')
27+
moduleImplementation project(':lucene:sandbox')
2728

2829
moduleImplementation deps.jmh.core
2930
annotationProcessor deps.jmh.annprocess

lucene/benchmark-jmh/src/java/module-info.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
requires jdk.unsupported;
2222
requires org.apache.lucene.core;
2323
requires org.apache.lucene.expressions;
24+
requires org.apache.lucene.sandbox;
2425

2526
exports org.apache.lucene.benchmark.jmh;
2627
exports org.apache.lucene.benchmark.jmh.jmh_generated;
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.sandbox.facet.plain.histograms.HistogramCollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

/**
 * JMH benchmark for {@link HistogramCollectorManager}, comparing histogram collection over a
 * point-indexed field (eligible for the PointTree bulk-collection path) against a
 * doc-values-only field (per-document collection).
 */
@State(Scope.Thread)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 1, time = 1)
@Measurement(iterations = 3, time = 3)
public class HistogramCollectorBenchmark {
  // Directory backing the test index; opened in setup(), closed in tearDown().
  Directory dir;
  // Reader over the single force-merged segment built in setup().
  IndexReader reader;
  // Temp directory holding the index files; recursively deleted in tearDown().
  Path path;

  /**
   * Builds a single-segment index of {@code params.docCount} documents with random values in
   * {@code [0, docCount)} on field {@code "f"}, indexed either as a point or as doc values
   * depending on {@code params.pointEnabled}.
   */
  @Setup(Level.Trial)
  public void setup(BenchmarkParams params) throws Exception {
    path = Files.createTempDirectory("forUtil");
    // Assign the field (the original code declared a shadowing local here, which left the
    // field null and leaked the directory in tearDown()).
    dir = MMapDirectory.open(path);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig());
    Random r = new Random(0); // fixed seed for reproducible runs

    for (int i = 0; i < params.docCount; i++) {
      Document doc = new Document();
      long value = r.nextInt(0, params.docCount);
      if (params.pointEnabled) {
        // Adding indexed point field to verify multi range collector
        doc.add(new LongPoint("f", value));
      } else {
        doc.add(NumericDocValuesField.indexedField("f", value));
      }
      w.addDocument(doc);
    }
    // Force merging into single segment for testing more documents in segment scenario
    w.forceMerge(1, true);
    reader = DirectoryReader.open(w);
    w.close();
  }

  /** Closes the reader and directory, then removes the temp index files. */
  @TearDown(Level.Trial)
  public void tearDown() throws Exception {
    reader.close();
    if (dir != null) {
      dir.close();
      dir = null;
    }

    // Clean up the segment files before next run; delete children before parents.
    if (Files.exists(path)) {
      try (Stream<Path> walk = Files.walk(path)) {
        walk.sorted(Comparator.reverseOrder())
            .forEach(
                file -> {
                  try {
                    Files.delete(file);
                  } catch (
                      @SuppressWarnings("unused")
                      IOException unused) {
                    // Best-effort cleanup; ignore files we cannot delete.
                  }
                });
      }
    }
  }

  /** Cartesian benchmark parameter space (point vs doc-values, index size, bucket width). */
  @State(Scope.Benchmark)
  public static class BenchmarkParams {
    // Test with both point enabled and disabled
    @Param({"true", "false"})
    public boolean pointEnabled;

    @Param({"500000", "5000000"})
    public int docCount;

    @Param({"5000", "25000"})
    public long bucketWidth;
  }

  /** Measured operation: collect a full-index histogram via a MatchAllDocsQuery. */
  @Benchmark
  public void collectHistogram(BenchmarkParams params) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.search(
        new MatchAllDocsQuery(), new HistogramCollectorManager("f", params.bucketWidth, 10000));
  }
}

lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollector.java

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@
1717
package org.apache.lucene.sandbox.facet.plain.histograms;
1818

1919
import java.io.IOException;
20+
import java.util.concurrent.ConcurrentMap;
2021
import org.apache.lucene.index.DocValues;
2122
import org.apache.lucene.index.DocValuesSkipper;
2223
import org.apache.lucene.index.DocValuesType;
2324
import org.apache.lucene.index.FieldInfo;
2425
import org.apache.lucene.index.LeafReaderContext;
2526
import org.apache.lucene.index.NumericDocValues;
27+
import org.apache.lucene.index.PointValues;
2628
import org.apache.lucene.index.SortedNumericDocValues;
2729
import org.apache.lucene.internal.hppc.LongIntHashMap;
2830
import org.apache.lucene.search.CollectionTerminatedException;
@@ -31,18 +33,26 @@
3133
import org.apache.lucene.search.LeafCollector;
3234
import org.apache.lucene.search.Scorable;
3335
import org.apache.lucene.search.ScoreMode;
36+
import org.apache.lucene.search.Weight;
3437

3538
final class HistogramCollector implements Collector {
3639

3740
private final String field;
3841
private final long bucketWidth;
3942
private final int maxBuckets;
4043
private final LongIntHashMap counts;
41-
42-
HistogramCollector(String field, long bucketWidth, int maxBuckets) {
44+
private final ConcurrentMap<LeafReaderContext, Boolean> leafBulkCollected;
45+
private Weight weight;
46+
47+
HistogramCollector(
48+
String field,
49+
long bucketWidth,
50+
int maxBuckets,
51+
ConcurrentMap<LeafReaderContext, Boolean> leafBulkCollected) {
4352
this.field = field;
4453
this.bucketWidth = bucketWidth;
4554
this.maxBuckets = maxBuckets;
55+
this.leafBulkCollected = leafBulkCollected;
4656
this.counts = new LongIntHashMap();
4757
}
4858

@@ -53,11 +63,30 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOExcept
5363
// The segment has no values, nothing to do.
5464
throw new CollectionTerminatedException();
5565
}
66+
67+
// We can use multi range traversal logic to collect the histogram on numeric
68+
// field indexed as point for MATCH_ALL cases. In future, this can be extended
69+
// for Point Range Query cases as well
70+
if (weight != null && weight.count(context) == context.reader().maxDoc()) {
71+
final PointValues pointValues = context.reader().getPointValues(field);
72+
if (PointTreeBulkCollector.canCollectEfficiently(pointValues, bucketWidth)) {
73+
// In case of intra segment concurrency, only one collector should collect
74+
// documents for all the partitions to avoid duplications across collectors
75+
if (leafBulkCollected.putIfAbsent(context, true) == null) {
76+
PointTreeBulkCollector.collect(pointValues, bucketWidth, counts, maxBuckets);
77+
}
78+
// Either the collection is finished on this collector, or some other collector
79+
// already started that collection, so this collector can finish early!
80+
throw new CollectionTerminatedException();
81+
}
82+
}
83+
5684
if (fi.getDocValuesType() != DocValuesType.NUMERIC
5785
&& fi.getDocValuesType() != DocValuesType.SORTED_NUMERIC) {
5886
throw new IllegalStateException(
5987
"Expected numeric field, but got doc-value type: " + fi.getDocValuesType());
6088
}
89+
6190
SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field);
6291
NumericDocValues singleton = DocValues.unwrapSingleton(values);
6392
if (singleton == null) {
@@ -296,4 +325,9 @@ static void checkMaxBuckets(int size, int maxBuckets) {
296325
+ maxBuckets);
297326
}
298327
}
328+
329+
@Override
330+
public void setWeight(Weight weight) {
331+
this.weight = weight;
332+
}
299333
}

lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollectorManager.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
import java.io.IOException;
2020
import java.util.Collection;
2121
import java.util.Objects;
22+
import java.util.concurrent.ConcurrentHashMap;
23+
import java.util.concurrent.ConcurrentMap;
2224
import org.apache.lucene.document.FieldType;
25+
import org.apache.lucene.index.LeafReaderContext;
2326
import org.apache.lucene.internal.hppc.LongIntHashMap;
2427
import org.apache.lucene.internal.hppc.LongIntHashMap.LongIntCursor;
2528
import org.apache.lucene.search.CollectorManager;
@@ -50,6 +53,7 @@ public final class HistogramCollectorManager
5053
private final String field;
5154
private final long bucketWidth;
5255
private final int maxBuckets;
56+
private final ConcurrentMap<LeafReaderContext, Boolean> leafBulkCollected;
5357

5458
/**
5559
* Compute a histogram of the distribution of the values of the given {@code field} according to
@@ -76,11 +80,12 @@ public HistogramCollectorManager(String field, long bucketWidth, int maxBuckets)
7680
throw new IllegalArgumentException("maxBuckets must be at least 1, got: " + maxBuckets);
7781
}
7882
this.maxBuckets = maxBuckets;
83+
this.leafBulkCollected = new ConcurrentHashMap<>();
7984
}
8085

8186
@Override
8287
public HistogramCollector newCollector() throws IOException {
83-
return new HistogramCollector(field, bucketWidth, maxBuckets);
88+
return new HistogramCollector(field, bucketWidth, maxBuckets, leafBulkCollected);
8489
}
8590

8691
@Override

0 commit comments

Comments
 (0)