Skip to content

Commit 0e21f15

Browse files
authored
Remove supersetSize and subsetSize from InternalSignificantTerms.Bucket (#117574) (#117874)
Those fields are only used to update the score and are not serialized in the bucket, so they can be removed.
1 parent c1b9842 commit 0e21f15

File tree

14 files changed

+127
-262
lines changed

14 files changed

+127
-262
lines changed

server/src/internalClusterTest/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ public void testScriptScore() throws ExecutionException, InterruptedException, I
495495
for (SignificantTerms.Bucket bucket : sigTerms.getBuckets()) {
496496
assertThat(
497497
bucket.getSignificanceScore(),
498-
is((double) bucket.getSubsetDf() + bucket.getSubsetSize() + bucket.getSupersetDf() + bucket.getSupersetSize())
498+
is((double) bucket.getSubsetDf() + sigTerms.getSubsetSize() + bucket.getSupersetDf() + sigTerms.getSupersetSize())
499499
);
500500
}
501501
}

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,7 @@ SignificantStringTerms.Bucket[] buildBuckets(int size) {
985985

986986
@Override
987987
SignificantStringTerms.Bucket buildEmptyTemporaryBucket() {
988-
return new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format, 0);
988+
return new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, null, format, 0);
989989
}
990990

991991
private long subsetSize(long owningBucketOrd) {
@@ -994,22 +994,19 @@ private long subsetSize(long owningBucketOrd) {
994994
}
995995

996996
@Override
997-
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd, GlobalOrdLookupFunction lookupGlobalOrd)
998-
throws IOException {
997+
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd, GlobalOrdLookupFunction lookupGlobalOrd) {
999998
long subsetSize = subsetSize(owningBucketOrd);
1000999
return (spare, globalOrd, bucketOrd, docCount) -> {
10011000
spare.bucketOrd = bucketOrd;
10021001
oversizedCopy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
10031002
spare.subsetDf = docCount;
1004-
spare.subsetSize = subsetSize;
10051003
spare.supersetDf = backgroundFrequencies.freq(spare.termBytes);
1006-
spare.supersetSize = supersetSize;
10071004
/*
10081005
* During shard-local down-selection we use subset/superset stats
10091006
* that are for this shard only. Back at the central reducer these
10101007
* properties will be updated with global stats.
10111008
*/
1012-
spare.updateScore(significanceHeuristic);
1009+
spare.updateScore(significanceHeuristic, subsetSize, supersetSize);
10131010
};
10141011
}
10151012

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedSignificantTerms.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ protected InternalMappedSignificantTerms(StreamInput in, Bucket.Reader<B> bucket
5959
subsetSize = in.readVLong();
6060
supersetSize = in.readVLong();
6161
significanceHeuristic = in.readNamedWriteable(SignificanceHeuristic.class);
62-
buckets = in.readCollectionAsList(stream -> bucketReader.read(stream, subsetSize, supersetSize, format));
62+
buckets = in.readCollectionAsList(stream -> bucketReader.read(stream, format));
6363
}
6464

6565
@Override
@@ -91,12 +91,12 @@ public B getBucketByKey(String term) {
9191
}
9292

9393
@Override
94-
protected long getSubsetSize() {
94+
public long getSubsetSize() {
9595
return subsetSize;
9696
}
9797

9898
@Override
99-
protected long getSupersetSize() {
99+
public long getSupersetSize() {
100100
return supersetSize;
101101
}
102102

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalSignificantTerms.java

Lines changed: 6 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,11 @@ public abstract static class Bucket<B extends Bucket<B>> extends InternalMultiBu
5353
*/
5454
@FunctionalInterface
5555
public interface Reader<B extends Bucket<B>> {
56-
B read(StreamInput in, long subsetSize, long supersetSize, DocValueFormat format) throws IOException;
56+
B read(StreamInput in, DocValueFormat format) throws IOException;
5757
}
5858

5959
long subsetDf;
60-
long subsetSize;
6160
long supersetDf;
62-
long supersetSize;
6361
/**
6462
* Ordinal of the bucket while it is being built. Not used after it is
6563
* returned from {@link Aggregator#buildAggregations(org.elasticsearch.common.util.LongArray)} and not
@@ -70,16 +68,7 @@ public interface Reader<B extends Bucket<B>> {
7068
protected InternalAggregations aggregations;
7169
final transient DocValueFormat format;
7270

73-
protected Bucket(
74-
long subsetDf,
75-
long subsetSize,
76-
long supersetDf,
77-
long supersetSize,
78-
InternalAggregations aggregations,
79-
DocValueFormat format
80-
) {
81-
this.subsetSize = subsetSize;
82-
this.supersetSize = supersetSize;
71+
protected Bucket(long subsetDf, long supersetDf, InternalAggregations aggregations, DocValueFormat format) {
8372
this.subsetDf = subsetDf;
8473
this.supersetDf = supersetDf;
8574
this.aggregations = aggregations;
@@ -89,9 +78,7 @@ protected Bucket(
8978
/**
9079
* Read from a stream.
9180
*/
92-
protected Bucket(StreamInput in, long subsetSize, long supersetSize, DocValueFormat format) {
93-
this.subsetSize = subsetSize;
94-
this.supersetSize = supersetSize;
81+
protected Bucket(StreamInput in, DocValueFormat format) {
9582
this.format = format;
9683
}
9784

@@ -105,20 +92,10 @@ public long getSupersetDf() {
10592
return supersetDf;
10693
}
10794

108-
@Override
109-
public long getSupersetSize() {
110-
return supersetSize;
111-
}
112-
113-
@Override
114-
public long getSubsetSize() {
115-
return subsetSize;
116-
}
117-
11895
// TODO we should refactor to remove this, since buckets should be immutable after they are generated.
11996
// This can lead to confusing bugs if the bucket is re-created (via createBucket() or similar) without
12097
// the score
121-
void updateScore(SignificanceHeuristic significanceHeuristic) {
98+
void updateScore(SignificanceHeuristic significanceHeuristic, long subsetSize, long supersetSize) {
12299
score = significanceHeuristic.getScore(subsetDf, subsetSize, supersetDf, supersetSize);
123100
}
124101

@@ -262,13 +239,11 @@ public InternalAggregation get() {
262239
buckets.forEach(entry -> {
263240
final B b = createBucket(
264241
entry.value.subsetDf[0],
265-
globalSubsetSize,
266242
entry.value.supersetDf[0],
267-
globalSupersetSize,
268243
entry.value.reducer.getAggregations(),
269244
entry.value.reducer.getProto()
270245
);
271-
b.updateScore(heuristic);
246+
b.updateScore(heuristic, globalSubsetSize, globalSupersetSize);
272247
if (((b.score > 0) && (b.subsetDf >= minDocCount)) || reduceContext.isFinalReduce() == false) {
273248
final B removed = ordered.insertWithOverflow(b);
274249
if (removed == null) {
@@ -317,9 +292,7 @@ public InternalAggregation finalizeSampling(SamplingContext samplingContext) {
317292
.map(
318293
b -> createBucket(
319294
samplingContext.scaleUp(b.subsetDf),
320-
subsetSize,
321295
samplingContext.scaleUp(b.supersetDf),
322-
supersetSize,
323296
InternalAggregations.finalizeSampling(b.aggregations, samplingContext),
324297
b
325298
)
@@ -328,14 +301,7 @@ public InternalAggregation finalizeSampling(SamplingContext samplingContext) {
328301
);
329302
}
330303

331-
abstract B createBucket(
332-
long subsetDf,
333-
long subsetSize,
334-
long supersetDf,
335-
long supersetSize,
336-
InternalAggregations aggregations,
337-
B prototype
338-
);
304+
abstract B createBucket(long subsetDf, long supersetDf, InternalAggregations aggregations, B prototype);
339305

340306
protected abstract A create(long subsetSize, long supersetSize, List<B> buckets);
341307

@@ -344,10 +310,6 @@ abstract B createBucket(
344310
*/
345311
protected abstract B[] createBucketsArray(int size);
346312

347-
protected abstract long getSubsetSize();
348-
349-
protected abstract long getSupersetSize();
350-
351313
protected abstract SignificanceHeuristic getSignificanceHeuristic();
352314

353315
@Override

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
import java.util.function.BiConsumer;
4848
import java.util.function.Function;
4949
import java.util.function.LongConsumer;
50-
import java.util.function.Supplier;
5150

5251
import static org.elasticsearch.search.aggregations.InternalOrder.isKeyOrder;
5352

@@ -296,7 +295,7 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro
296295
try (ObjectArrayPriorityQueue<B> ordered = buildPriorityQueue(size)) {
297296
B spare = null;
298297
BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningOrd);
299-
Supplier<B> emptyBucketBuilder = emptyBucketBuilder(owningOrd);
298+
BucketUpdater<B> bucketUpdater = bucketUpdater(owningOrd);
300299
while (ordsEnum.next()) {
301300
long docCount = bucketDocCount(ordsEnum.ord());
302301
otherDocCounts.increment(ordIdx, docCount);
@@ -305,9 +304,9 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro
305304
}
306305
if (spare == null) {
307306
checkRealMemoryCBForInternalBucket();
308-
spare = emptyBucketBuilder.get();
307+
spare = buildEmptyBucket();
309308
}
310-
updateBucket(spare, ordsEnum, docCount);
309+
bucketUpdater.updateBucket(spare, ordsEnum, docCount);
311310
spare = ordered.insertWithOverflow(spare);
312311
}
313312

@@ -348,9 +347,9 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro
348347
abstract void collectZeroDocEntriesIfNeeded(long owningBucketOrd, boolean excludeDeletedDocs) throws IOException;
349348

350349
/**
351-
* Build an empty temporary bucket.
350+
* Build an empty bucket.
352351
*/
353-
abstract Supplier<B> emptyBucketBuilder(long owningBucketOrd);
352+
abstract B buildEmptyBucket();
354353

355354
/**
356355
* Build a {@link PriorityQueue} to sort the buckets. After we've
@@ -362,7 +361,7 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro
362361
* Update fields in {@code spare} to reflect information collected for
363362
* this bucket ordinal.
364363
*/
365-
abstract void updateBucket(B spare, BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum, long docCount) throws IOException;
364+
abstract BucketUpdater<B> bucketUpdater(long owningBucketOrd);
366365

367366
/**
368367
* Build an array to hold the "top" buckets for each ordinal.
@@ -399,6 +398,10 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro
399398
abstract R buildEmptyResult();
400399
}
401400

401+
interface BucketUpdater<B extends InternalMultiBucketAggregation.InternalBucket> {
402+
void updateBucket(B spare, BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum, long docCount) throws IOException;
403+
}
404+
402405
/**
403406
* Builds results for the standard {@code terms} aggregation.
404407
*/
@@ -490,8 +493,8 @@ private void collectZeroDocEntries(BinaryDocValues values, Bits liveDocs, int ma
490493
}
491494

492495
@Override
493-
Supplier<StringTerms.Bucket> emptyBucketBuilder(long owningBucketOrd) {
494-
return () -> new StringTerms.Bucket(new BytesRef(), 0, null, showTermDocCountError, 0, format);
496+
StringTerms.Bucket buildEmptyBucket() {
497+
return new StringTerms.Bucket(new BytesRef(), 0, null, showTermDocCountError, 0, format);
495498
}
496499

497500
@Override
@@ -500,10 +503,12 @@ ObjectArrayPriorityQueue<StringTerms.Bucket> buildPriorityQueue(int size) {
500503
}
501504

502505
@Override
503-
void updateBucket(StringTerms.Bucket spare, BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum, long docCount) throws IOException {
504-
ordsEnum.readValue(spare.termBytes);
505-
spare.docCount = docCount;
506-
spare.bucketOrd = ordsEnum.ord();
506+
BucketUpdater<StringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
507+
return (spare, ordsEnum, docCount) -> {
508+
ordsEnum.readValue(spare.termBytes);
509+
spare.docCount = docCount;
510+
spare.bucketOrd = ordsEnum.ord();
511+
};
507512
}
508513

509514
@Override
@@ -615,9 +620,8 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
615620
void collectZeroDocEntriesIfNeeded(long owningBucketOrd, boolean excludeDeletedDocs) throws IOException {}
616621

617622
@Override
618-
Supplier<SignificantStringTerms.Bucket> emptyBucketBuilder(long owningBucketOrd) {
619-
long subsetSize = subsetSizes.get(owningBucketOrd);
620-
return () -> new SignificantStringTerms.Bucket(new BytesRef(), 0, subsetSize, 0, 0, null, format, 0);
623+
SignificantStringTerms.Bucket buildEmptyBucket() {
624+
return new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, null, format, 0);
621625
}
622626

623627
@Override
@@ -626,20 +630,20 @@ ObjectArrayPriorityQueue<SignificantStringTerms.Bucket> buildPriorityQueue(int s
626630
}
627631

628632
@Override
629-
void updateBucket(SignificantStringTerms.Bucket spare, BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum, long docCount)
630-
throws IOException {
631-
632-
ordsEnum.readValue(spare.termBytes);
633-
spare.bucketOrd = ordsEnum.ord();
634-
spare.subsetDf = docCount;
635-
spare.supersetDf = backgroundFrequencies.freq(spare.termBytes);
636-
spare.supersetSize = supersetSize;
637-
/*
638-
* During shard-local down-selection we use subset/superset stats
639-
* that are for this shard only. Back at the central reducer these
640-
* properties will be updated with global stats.
641-
*/
642-
spare.updateScore(significanceHeuristic);
633+
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
634+
long subsetSize = subsetSizes.get(owningBucketOrd);
635+
return (spare, ordsEnum, docCount) -> {
636+
ordsEnum.readValue(spare.termBytes);
637+
spare.bucketOrd = ordsEnum.ord();
638+
spare.subsetDf = docCount;
639+
spare.supersetDf = backgroundFrequencies.freq(spare.termBytes);
640+
/*
641+
* During shard-local down-selection we use subset/superset stats
642+
* that are for this shard only. Back at the central reducer these
643+
* properties will be updated with global stats.
644+
*/
645+
spare.updateScore(significanceHeuristic, subsetSize, supersetSize);
646+
};
643647
}
644648

645649
@Override

0 commit comments

Comments (0)