-
Notifications
You must be signed in to change notification settings - Fork 25.7k
ESQL: Add TopN support for exponential histograms #137313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
d2e8ffa
bae17ea
6906c53
8e75b89
fc60990
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -94,6 +94,20 @@ public ExponentialHistogram getExponentialHistogram(int valueIndex, ExponentialH | |
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void serializeExponentialHistogram(int valueIndex, SerializedOutput out, BytesRef scratch) { | ||
| long valueCount = valueCounts.getLong(valueCounts.getFirstValueIndex(valueIndex)); | ||
| out.appendLong(valueCounts.getLong(valueCounts.getFirstValueIndex(valueIndex))); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you are confusing this with the value count returned via |
||
| out.appendDouble(sums.getDouble(sums.getFirstValueIndex(valueIndex))); | ||
| out.appendDouble(zeroThresholds.getDouble(zeroThresholds.getFirstValueIndex(valueIndex))); | ||
| if (valueCount > 0) { | ||
| // min / max are only non-null for non-empty histograms | ||
| out.appendDouble(minima.getDouble(minima.getFirstValueIndex(valueIndex))); | ||
| out.appendDouble(maxima.getDouble(maxima.getFirstValueIndex(valueIndex))); | ||
| } | ||
| out.appendBytesRef(encodedHistograms.getBytesRef(encodedHistograms.getFirstValueIndex(valueIndex), scratch)); | ||
| } | ||
|
|
||
| @Override | ||
| protected void closeInternal() { | ||
| Releasables.close(getSubBlocks()); | ||
|
|
@@ -376,5 +390,4 @@ public int hashCode() { | |
| // this ensures proper equality with null blocks and should be unique enough for practical purposes | ||
| return encodedHistograms.hashCode(); | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
|
|
||
| package org.elasticsearch.compute.data; | ||
|
|
||
| import org.apache.lucene.util.BytesRef; | ||
| import org.elasticsearch.exponentialhistogram.ExponentialHistogram; | ||
|
|
||
| /** | ||
|
|
@@ -27,6 +28,16 @@ public sealed interface ExponentialHistogramBlock extends Block permits Constant | |
| */ | ||
| ExponentialHistogram getExponentialHistogram(int valueIndex, ExponentialHistogramScratch scratch); | ||
|
|
||
| /** | ||
| * Serializes the exponential histogram at the given index into the provided output, so that it can be read back | ||
| * via {@link ExponentialHistogramBlockBuilder#deserializeAndAppend(SerializedInput)}. | ||
| * | ||
| * @param valueIndex | ||
| * @param out | ||
| * @param scratch | ||
| */ | ||
| void serializeExponentialHistogram(int valueIndex, SerializedOutput out, BytesRef scratch); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See #137313 (comment). |
||
|
|
||
| static boolean equals(ExponentialHistogramBlock blockA, ExponentialHistogramBlock blockB) { | ||
| if (blockA == blockB) { | ||
| return true; | ||
|
|
@@ -42,4 +53,27 @@ static boolean equals(ExponentialHistogramBlock blockA, ExponentialHistogramBloc | |
| }; | ||
| } | ||
|
|
||
| /** | ||
| * Abstraction to use for writing individual values via {@link #serializeExponentialHistogram(int, SerializedOutput, BytesRef)}. | ||
| */ | ||
| interface SerializedOutput { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer not to have these interfaces, but they're okay.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really don't want to expose the zero-threshold from the block to prevent it becoming a maintenance nightmare if we do changes to the disk format. That's why I want to keep the knowledge of its existence local to the block implementation. I though pulling in a direct dependency on the |
||
| void appendDouble(double value); | ||
|
|
||
| void appendLong(long value); | ||
|
|
||
| void appendBytesRef(BytesRef bytesRef); | ||
| } | ||
|
|
||
| /** | ||
| * Abstraction to use for reading individual serialized via | ||
| * {@link ExponentialHistogramBlockBuilder#deserializeAndAppend(SerializedInput)}. | ||
| */ | ||
| interface SerializedInput { | ||
| double readDouble(); | ||
|
|
||
| long readLong(); | ||
|
|
||
| BytesRef readBytesRef(BytesRef tempBytesRef); | ||
JonasKunz marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the Elastic License | ||
| * 2.0; you may not use this file except in compliance with the Elastic License | ||
| * 2.0. | ||
| */ | ||
|
|
||
| package org.elasticsearch.compute.operator.topn; | ||
|
|
||
| import org.apache.lucene.util.BytesRef; | ||
| import org.elasticsearch.compute.data.Block; | ||
| import org.elasticsearch.compute.data.BlockFactory; | ||
| import org.elasticsearch.compute.data.ExponentialHistogramBlock; | ||
| import org.elasticsearch.compute.data.ExponentialHistogramBlockBuilder; | ||
|
|
||
| public class ResultBuilderForExponentialHistogram implements ResultBuilder { | ||
|
|
||
| private final ExponentialHistogramBlockBuilder builder; | ||
| private final ReusableTopNEncoderInput reusableInput = new ReusableTopNEncoderInput(); | ||
|
|
||
| ResultBuilderForExponentialHistogram(BlockFactory blockFactory, int positions) { | ||
| this.builder = blockFactory.newExponentialHistogramBlockBuilder(positions); | ||
| } | ||
|
|
||
| @Override | ||
| public void decodeKey(BytesRef keys) { | ||
| throw new AssertionError("ExponentialHistogramBlock can't be a key"); | ||
| } | ||
|
|
||
| @Override | ||
| public void decodeValue(BytesRef values) { | ||
| int count = TopNEncoder.DEFAULT_UNSORTABLE.decodeVInt(values); | ||
| if (count == 0) { | ||
| builder.appendNull(); | ||
| return; | ||
| } | ||
| assert count == 1 : "ExponentialHistogramBlock does not support multi values"; | ||
| reusableInput.inputValues = values; | ||
| builder.deserializeAndAppend(reusableInput); | ||
| } | ||
|
|
||
| @Override | ||
| public Block build() { | ||
| return builder.build(); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return "ResultBuilderForExponentialHistogram"; | ||
| } | ||
|
|
||
| @Override | ||
| public void close() { | ||
| builder.close(); | ||
| } | ||
|
|
||
| private class ReusableTopNEncoderInput implements ExponentialHistogramBlock.SerializedInput { | ||
JonasKunz marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| BytesRef inputValues; | ||
|
|
||
| @Override | ||
| public double readDouble() { | ||
| return TopNEncoder.DEFAULT_UNSORTABLE.decodeDouble(inputValues); | ||
| } | ||
|
|
||
| @Override | ||
| public long readLong() { | ||
| return TopNEncoder.DEFAULT_UNSORTABLE.decodeLong(inputValues); | ||
| } | ||
|
|
||
| @Override | ||
| public BytesRef readBytesRef(BytesRef tempBytesRef) { | ||
| return TopNEncoder.DEFAULT_UNSORTABLE.decodeBytesRef(inputValues, tempBytesRef); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the Elastic License | ||
| * 2.0; you may not use this file except in compliance with the Elastic License | ||
| * 2.0. | ||
| */ | ||
|
|
||
| package org.elasticsearch.compute.operator.topn; | ||
|
|
||
| import org.apache.lucene.util.BytesRef; | ||
| import org.elasticsearch.compute.data.ExponentialHistogramBlock; | ||
| import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; | ||
|
|
||
| public class ValueExtractorForExponentialHistogram implements ValueExtractor { | ||
| private final ExponentialHistogramBlock block; | ||
|
|
||
| private final BytesRef scratch = new BytesRef(); | ||
| private final ReusableTopNEncoderOutput reusableOutput = new ReusableTopNEncoderOutput(); | ||
|
|
||
| ValueExtractorForExponentialHistogram(TopNEncoder encoder, ExponentialHistogramBlock block) { | ||
| assert encoder == TopNEncoder.DEFAULT_UNSORTABLE; | ||
| this.block = block; | ||
| } | ||
|
|
||
| @Override | ||
| public void writeValue(BreakingBytesRefBuilder values, int position) { | ||
| // number of multi-values first for compatibility with ValueExtractorForNull | ||
| if (block.isNull(position)) { | ||
| TopNEncoder.DEFAULT_UNSORTABLE.encodeVInt(0, values); | ||
| } else { | ||
| assert block.getValueCount(position) == 1 : "Multi-valued ExponentialHistogram blocks are not supported in TopN"; | ||
| TopNEncoder.DEFAULT_UNSORTABLE.encodeVInt(1, values); | ||
| int valueIndex = block.getFirstValueIndex(position); | ||
| reusableOutput.target = values; | ||
| block.serializeExponentialHistogram(valueIndex, reusableOutput, scratch); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return "ValueExtractorForExponentialHistogram"; | ||
| } | ||
|
|
||
| private static final class ReusableTopNEncoderOutput implements ExponentialHistogramBlock.SerializedOutput { | ||
| BreakingBytesRefBuilder target; | ||
|
|
||
| @Override | ||
| public void appendDouble(double value) { | ||
| TopNEncoder.DEFAULT_UNSORTABLE.encodeDouble(value, target); | ||
| } | ||
|
|
||
| @Override | ||
| public void appendLong(long value) { | ||
| TopNEncoder.DEFAULT_UNSORTABLE.encodeLong(value, target); | ||
| } | ||
|
|
||
| @Override | ||
| public void appendBytesRef(BytesRef value) { | ||
| TopNEncoder.DEFAULT_UNSORTABLE.encodeBytesRef(value, target); | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
valueIndexshould bepositioninstead.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The intended usage pattern for
serializeExponentialHistogramis just like for getters on blocks:So
valueIndexinstead ofpositionis correct here.Right now it is true for exponential histogram blocks that we just use the positions directly as
valueIndex, but that will change when we support multi-values.See also this comment: #133393 (comment)
I'll add a comment in
ExponentialHistogramArrayBlockexplaining the mapping of positions and valueIndices to positions in the sub-blocks.