Skip to content

Commit 7a21f53

Browse files
jainankitk authored and stefanvodita committed
Logic for collecting Histogram efficiently using Point Trees (apache#14439)
1 parent bb3167e commit 7a21f53

File tree

8 files changed

+446
-3
lines changed

8 files changed

+446
-3
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ Optimizations
2929
---------------------
3030
* GITHUB#14418: Quick exit on filter query matching no docs when rewriting knn query. (Pan Guixin)
3131

32+
* GITHUB#14439: Efficient Histogram Collection using multi range traversal over PointTrees (Ankit Jain)
33+
3234
* GITHUB#14268: PointInSetQuery early exit on non-matching segments. (hanbj)
3335

3436
* GITHUB#14425: KeywordField.newSetQuery() reuses prefixed terms (Mikhail Khludnev)

lucene/benchmark-jmh/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ description = 'Lucene JMH micro-benchmarking module'
2424
dependencies {
2525
moduleImplementation project(':lucene:core')
2626
moduleImplementation project(':lucene:expressions')
27+
moduleImplementation project(':lucene:sandbox')
2728

2829
moduleImplementation deps.jmh.core
2930
annotationProcessor deps.jmh.annprocess

lucene/benchmark-jmh/src/java/module-info.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
requires jdk.unsupported;
2222
requires org.apache.lucene.core;
2323
requires org.apache.lucene.expressions;
24+
requires org.apache.lucene.sandbox;
2425

2526
exports org.apache.lucene.benchmark.jmh;
2627
exports org.apache.lucene.benchmark.jmh.jmh_generated;
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.sandbox.facet.plain.histograms.HistogramCollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

/**
 * JMH benchmark for {@link HistogramCollectorManager}, comparing histogram collection over a
 * point-indexed field (eligible for the PointTree bulk-collection path) against a
 * doc-values-only field (per-document collection).
 */
@State(Scope.Thread)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 1, time = 1)
@Measurement(iterations = 3, time = 3)
public class HistogramCollectorBenchmark {
  // Directory backing the test index; opened in setup(), closed in tearDown().
  Directory dir;
  // Reader over the single force-merged segment built in setup().
  IndexReader reader;
  // Temp directory holding the index files; recursively deleted in tearDown().
  Path path;

  /**
   * Builds a single-segment index of {@code params.docCount} documents with random values in
   * {@code [0, docCount)} on field {@code "f"}, indexed either as a point or as doc values
   * depending on {@code params.pointEnabled}.
   */
  @Setup(Level.Trial)
  public void setup(BenchmarkParams params) throws Exception {
    path = Files.createTempDirectory("forUtil");
    // Assign the field (the original code declared a shadowing local here, which left the
    // field null and leaked the directory in tearDown()).
    dir = MMapDirectory.open(path);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig());
    Random r = new Random(0); // fixed seed for reproducible runs

    for (int i = 0; i < params.docCount; i++) {
      Document doc = new Document();
      long value = r.nextInt(0, params.docCount);
      if (params.pointEnabled) {
        // Adding indexed point field to verify multi range collector
        doc.add(new LongPoint("f", value));
      } else {
        doc.add(NumericDocValuesField.indexedField("f", value));
      }
      w.addDocument(doc);
    }
    // Force merging into single segment for testing more documents in segment scenario
    w.forceMerge(1, true);
    reader = DirectoryReader.open(w);
    w.close();
  }

  /** Closes the reader and directory, then removes the temp index files. */
  @TearDown(Level.Trial)
  public void tearDown() throws Exception {
    reader.close();
    if (dir != null) {
      dir.close();
      dir = null;
    }

    // Clean up the segment files before next run; delete children before parents.
    if (Files.exists(path)) {
      try (Stream<Path> walk = Files.walk(path)) {
        walk.sorted(Comparator.reverseOrder())
            .forEach(
                file -> {
                  try {
                    Files.delete(file);
                  } catch (
                      @SuppressWarnings("unused")
                      IOException unused) {
                    // Best-effort cleanup; ignore files we cannot delete.
                  }
                });
      }
    }
  }

  /** Cartesian benchmark parameter space (point vs doc-values, index size, bucket width). */
  @State(Scope.Benchmark)
  public static class BenchmarkParams {
    // Test with both point enabled and disabled
    @Param({"true", "false"})
    public boolean pointEnabled;

    @Param({"500000", "5000000"})
    public int docCount;

    @Param({"5000", "25000"})
    public long bucketWidth;
  }

  /** Measured operation: collect a full-index histogram via a MatchAllDocsQuery. */
  @Benchmark
  public void collectHistogram(BenchmarkParams params) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.search(
        new MatchAllDocsQuery(), new HistogramCollectorManager("f", params.bucketWidth, 10000));
  }
}

lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollector.java

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@
1717
package org.apache.lucene.sandbox.facet.plain.histograms;
1818

1919
import java.io.IOException;
20+
import java.util.concurrent.ConcurrentMap;
2021
import org.apache.lucene.index.DocValues;
2122
import org.apache.lucene.index.DocValuesSkipper;
2223
import org.apache.lucene.index.DocValuesType;
2324
import org.apache.lucene.index.FieldInfo;
2425
import org.apache.lucene.index.LeafReaderContext;
2526
import org.apache.lucene.index.NumericDocValues;
27+
import org.apache.lucene.index.PointValues;
2628
import org.apache.lucene.index.SortedNumericDocValues;
2729
import org.apache.lucene.internal.hppc.LongIntHashMap;
2830
import org.apache.lucene.search.CollectionTerminatedException;
@@ -31,18 +33,26 @@
3133
import org.apache.lucene.search.LeafCollector;
3234
import org.apache.lucene.search.Scorable;
3335
import org.apache.lucene.search.ScoreMode;
36+
import org.apache.lucene.search.Weight;
3437

3538
final class HistogramCollector implements Collector {
3639

3740
private final String field;
3841
private final long bucketWidth;
3942
private final int maxBuckets;
4043
private final LongIntHashMap counts;
41-
42-
HistogramCollector(String field, long bucketWidth, int maxBuckets) {
44+
private final ConcurrentMap<LeafReaderContext, Boolean> leafBulkCollected;
45+
private Weight weight;
46+
47+
HistogramCollector(
48+
String field,
49+
long bucketWidth,
50+
int maxBuckets,
51+
ConcurrentMap<LeafReaderContext, Boolean> leafBulkCollected) {
4352
this.field = field;
4453
this.bucketWidth = bucketWidth;
4554
this.maxBuckets = maxBuckets;
55+
this.leafBulkCollected = leafBulkCollected;
4656
this.counts = new LongIntHashMap();
4757
}
4858

@@ -53,11 +63,30 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOExcept
5363
// The segment has no values, nothing to do.
5464
throw new CollectionTerminatedException();
5565
}
66+
67+
// We can use multi range traversal logic to collect the histogram on numeric
68+
// field indexed as point for MATCH_ALL cases. In future, this can be extended
69+
// for Point Range Query cases as well
70+
if (weight != null && weight.count(context) == context.reader().maxDoc()) {
71+
final PointValues pointValues = context.reader().getPointValues(field);
72+
if (PointTreeBulkCollector.canCollectEfficiently(pointValues, bucketWidth)) {
73+
// In case of intra segment concurrency, only one collector should collect
74+
// documents for all the partitions to avoid duplications across collectors
75+
if (leafBulkCollected.putIfAbsent(context, true) == null) {
76+
PointTreeBulkCollector.collect(pointValues, bucketWidth, counts, maxBuckets);
77+
}
78+
// Either the collection is finished on this collector, or some other collector
79+
// already started that collection, so this collector can finish early!
80+
throw new CollectionTerminatedException();
81+
}
82+
}
83+
5684
if (fi.getDocValuesType() != DocValuesType.NUMERIC
5785
&& fi.getDocValuesType() != DocValuesType.SORTED_NUMERIC) {
5886
throw new IllegalStateException(
5987
"Expected numeric field, but got doc-value type: " + fi.getDocValuesType());
6088
}
89+
6190
SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field);
6291
NumericDocValues singleton = DocValues.unwrapSingleton(values);
6392
if (singleton == null) {
@@ -296,4 +325,9 @@ static void checkMaxBuckets(int size, int maxBuckets) {
296325
+ maxBuckets);
297326
}
298327
}
328+
329+
@Override
330+
public void setWeight(Weight weight) {
331+
this.weight = weight;
332+
}
299333
}

lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/plain/histograms/HistogramCollectorManager.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
import java.io.IOException;
2020
import java.util.Collection;
2121
import java.util.Objects;
22+
import java.util.concurrent.ConcurrentHashMap;
23+
import java.util.concurrent.ConcurrentMap;
2224
import org.apache.lucene.document.FieldType;
25+
import org.apache.lucene.index.LeafReaderContext;
2326
import org.apache.lucene.internal.hppc.LongIntHashMap;
2427
import org.apache.lucene.internal.hppc.LongIntHashMap.LongIntCursor;
2528
import org.apache.lucene.search.CollectorManager;
@@ -50,6 +53,7 @@ public final class HistogramCollectorManager
5053
private final String field;
5154
private final long bucketWidth;
5255
private final int maxBuckets;
56+
private final ConcurrentMap<LeafReaderContext, Boolean> leafBulkCollected;
5357

5458
/**
5559
* Compute a histogram of the distribution of the values of the given {@code field} according to
@@ -76,11 +80,12 @@ public HistogramCollectorManager(String field, long bucketWidth, int maxBuckets)
7680
throw new IllegalArgumentException("maxBuckets must be at least 1, got: " + maxBuckets);
7781
}
7882
this.maxBuckets = maxBuckets;
83+
this.leafBulkCollected = new ConcurrentHashMap<>();
7984
}
8085

8186
@Override
8287
public HistogramCollector newCollector() throws IOException {
83-
return new HistogramCollector(field, bucketWidth, maxBuckets);
88+
return new HistogramCollector(field, bucketWidth, maxBuckets, leafBulkCollected);
8489
}
8590

8691
@Override

0 commit comments

Comments
 (0)