99
1010package org .elasticsearch .search .aggregations .bucket .sampler .random ;
1111
12+ import org .apache .lucene .search .BooleanClause ;
13+ import org .apache .lucene .search .BooleanQuery ;
1214import org .apache .lucene .search .CollectionTerminatedException ;
1315import org .apache .lucene .search .DocIdSetIterator ;
16+ import org .apache .lucene .search .ScoreMode ;
1417import org .apache .lucene .search .Scorer ;
1518import org .apache .lucene .search .Weight ;
1619import org .apache .lucene .util .Bits ;
17- import org .elasticsearch .common .CheckedSupplier ;
1820import org .elasticsearch .common .util .LongArray ;
1921import org .elasticsearch .search .aggregations .AggregationExecutionContext ;
2022import org .elasticsearch .search .aggregations .Aggregator ;
@@ -34,14 +36,13 @@ public class RandomSamplerAggregator extends BucketsAggregator implements Single
3436 private final int seed ;
3537 private final Integer shardSeed ;
3638 private final double probability ;
37- private final CheckedSupplier < Weight , IOException > weightSupplier ;
39+ private Weight weight ;
3840
3941 RandomSamplerAggregator (
4042 String name ,
4143 int seed ,
4244 Integer shardSeed ,
4345 double probability ,
44- CheckedSupplier <Weight , IOException > weightSupplier ,
4546 AggregatorFactories factories ,
4647 AggregationContext context ,
4748 Aggregator parent ,
@@ -56,10 +57,33 @@ public class RandomSamplerAggregator extends BucketsAggregator implements Single
5657 RandomSamplerAggregationBuilder .NAME + " aggregation [" + name + "] must have sub aggregations configured"
5758 );
5859 }
59- this .weightSupplier = weightSupplier ;
6060 this .shardSeed = shardSeed ;
6161 }
6262
63+ /**
64+ * This creates the query weight which will be used in the aggregator.
65+ *
66+ * This weight is a boolean query between {@link RandomSamplingQuery} and the configured top level query of the search. This allows
67+ * the aggregation to iterate the documents directly, thus sampling in the background instead of the foreground.
68+ * @return weight to be used, is cached for additional usages
69+ * @throws IOException when building the weight or queries fails;
70+ */
71+ private Weight getWeight () throws IOException {
72+ if (weight == null ) {
73+ RandomSamplingQuery query = new RandomSamplingQuery (
74+ probability ,
75+ seed ,
76+ shardSeed == null ? context .shardRandomSeed () : shardSeed
77+ );
78+ ScoreMode scoreMode = scoreMode ();
79+ BooleanQuery booleanQuery = new BooleanQuery .Builder ().add (query , BooleanClause .Occur .FILTER )
80+ .add (context .query (), scoreMode .needsScores () ? BooleanClause .Occur .MUST : BooleanClause .Occur .FILTER )
81+ .build ();
82+ weight = context .searcher ().createWeight (context .searcher ().rewrite (booleanQuery ), scoreMode , 1f );
83+ }
84+ return weight ;
85+ }
86+
6387 @ Override
6488 public InternalAggregation [] buildAggregations (LongArray owningBucketOrds ) throws IOException {
6589 return buildAggregationsForSingleBucket (
@@ -112,7 +136,7 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
112136 };
113137 }
114138 // TODO know when sampling would be much slower and skip sampling: https://github.com/elastic/elasticsearch/issues/84353
115- Scorer scorer = weightSupplier . get ().scorer (aggCtx .getLeafReaderContext ());
139+ Scorer scorer = getWeight ().scorer (aggCtx .getLeafReaderContext ());
116140 // This means there are no docs to iterate, possibly due to the fields not existing
117141 if (scorer == null ) {
118142 return LeafBucketCollector .NO_OP_COLLECTOR ;
0 commit comments