From 6f2a058bd521e73131b9edb8ccf80a13b4fadabd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20Cea=20Fontenla?= Date: Thu, 28 Nov 2024 11:40:12 +0100 Subject: [PATCH] Backport 9022ccc --- docs/changelog/114317.yaml | 5 + .../kibana/definition/categorize.json | 4 +- .../esql/functions/types/categorize.asciidoc | 4 +- muted-tests.yml | 15 - .../AbstractCategorizeBlockHash.java | 105 ++++ .../aggregation/blockhash/BlockHash.java | 28 +- .../blockhash/CategorizeRawBlockHash.java | 137 +++++ .../CategorizedIntermediateBlockHash.java | 77 +++ .../operator/HashAggregationOperator.java | 9 + .../GroupingAggregatorFunctionTestCase.java | 1 + .../blockhash/BlockHashTestCase.java | 34 ++ .../aggregation/blockhash/BlockHashTests.java | 22 +- .../blockhash/CategorizeBlockHashTests.java | 406 ++++++++++++++ .../HashAggregationOperatorTests.java | 1 + .../xpack/esql/CsvTestsDataLoader.java | 2 + .../src/main/resources/categorize.csv-spec | 526 +++++++++++++++++- .../resources/mapping-mv_sample_data.json | 16 + .../src/main/resources/mv_sample_data.csv | 8 + .../grouping/CategorizeEvaluator.java | 145 ----- .../xpack/esql/action/EsqlCapabilities.java | 5 +- .../function/grouping/Categorize.java | 76 +-- .../rules/logical/CombineProjections.java | 38 +- .../optimizer/rules/logical/FoldNull.java | 2 + ...laceAggregateNestedExpressionWithEval.java | 31 +- .../physical/local/InsertFieldExtraction.java | 17 +- .../AbstractPhysicalOperationProviders.java | 42 +- .../xpack/esql/analysis/VerifierTests.java | 6 +- .../function/AbstractAggregationTestCase.java | 3 +- .../function/AbstractFunctionTestCase.java | 19 +- .../AbstractScalarFunctionTestCase.java | 1 + .../expression/function/TestCaseSupplier.java | 83 ++- .../function/grouping/CategorizeTests.java | 16 +- .../optimizer/LogicalPlanOptimizerTests.java | 61 ++ .../rules/logical/FoldNullTests.java | 13 + .../categorization/TokenListCategorizer.java | 24 + 35 files changed, 1660 insertions(+), 322 deletions(-) create mode 100644 docs/changelog/114317.yaml create mode 100644 x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java create mode 100644 x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java create mode 100644 x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java create mode 100644 x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTestCase.java create mode 100644 x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-mv_sample_data.json create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_sample_data.csv delete mode 100644 x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeEvaluator.java diff --git a/docs/changelog/114317.yaml b/docs/changelog/114317.yaml new file mode 100644 index 0000000000000..9c73fe513e197 --- /dev/null +++ b/docs/changelog/114317.yaml @@ -0,0 +1,5 @@ +pr: 114317 +summary: "ESQL: CATEGORIZE as a `BlockHash`" +area: ES|QL +type: enhancement +issues: [] diff --git a/docs/reference/esql/functions/kibana/definition/categorize.json b/docs/reference/esql/functions/kibana/definition/categorize.json index 386b178d3753f..ca3971a6e05a3 100644 --- 
a/docs/reference/esql/functions/kibana/definition/categorize.json +++ b/docs/reference/esql/functions/kibana/definition/categorize.json @@ -14,7 +14,7 @@ } ], "variadic" : false, - "returnType" : "integer" + "returnType" : "keyword" }, { "params" : [ @@ -26,7 +26,7 @@ } ], "variadic" : false, - "returnType" : "integer" + "returnType" : "keyword" } ], "preview" : false, diff --git a/docs/reference/esql/functions/types/categorize.asciidoc b/docs/reference/esql/functions/types/categorize.asciidoc index 4917ed313e6d7..5b64971cbc482 100644 --- a/docs/reference/esql/functions/types/categorize.asciidoc +++ b/docs/reference/esql/functions/types/categorize.asciidoc @@ -5,6 +5,6 @@ [%header.monospaced.styled,format=dsv,separator=|] |=== field | result -keyword | integer -text | integer +keyword | keyword +text | keyword |=== diff --git a/muted-tests.yml b/muted-tests.yml index 336dd2696df91..ddfa55d93f4ee 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -193,12 +193,6 @@ tests: - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT method: test {p0=indices.split/40_routing_partition_size/more than 1} issue: https://github.com/elastic/elasticsearch/issues/113841 -- class: org.elasticsearch.xpack.esql.qa.mixed.MixedClusterEsqlSpecIT - method: test {categorize.Categorize SYNC} - issue: https://github.com/elastic/elasticsearch/issues/113722 -- class: org.elasticsearch.xpack.esql.qa.mixed.MixedClusterEsqlSpecIT - method: test {categorize.Categorize ASYNC} - issue: https://github.com/elastic/elasticsearch/issues/116373 - class: org.elasticsearch.kibana.KibanaThreadPoolIT method: testBlockedThreadPoolsRejectUserRequests issue: https://github.com/elastic/elasticsearch/issues/113939 @@ -254,12 +248,6 @@ tests: - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT method: test {p0=search/380_sort_segments_on_timestamp/Test that index segments are NOT sorted on timestamp field when @timestamp field is dynamically added} issue: https://github.com/elastic/elasticsearch/issues/116221 -- class: org.elasticsearch.xpack.esql.qa.multi_node.EsqlSpecIT - method: test {categorize.Categorize SYNC} - issue: https://github.com/elastic/elasticsearch/issues/113054 -- class: org.elasticsearch.xpack.esql.qa.multi_node.EsqlSpecIT - method: test {categorize.Categorize ASYNC} - issue: https://github.com/elastic/elasticsearch/issues/113054 - class: org.elasticsearch.ingest.common.IngestCommonClientYamlTestSuiteIT method: test {yaml=ingest/310_reroute_processor/Test remove then add reroute processor with and without lazy rollover} issue: https://github.com/elastic/elasticsearch/issues/116158 @@ -272,9 +260,6 @@ tests: - class: org.elasticsearch.xpack.deprecation.DeprecationHttpIT method: testDeprecatedSettingsReturnWarnings issue: https://github.com/elastic/elasticsearch/issues/108628 -- class: org.elasticsearch.xpack.esql.ccq.MultiClusterSpecIT - method: test {categorize.Categorize} - issue: https://github.com/elastic/elasticsearch/issues/116434 - class: org.elasticsearch.xpack.apmdata.APMYamlTestSuiteIT method: test {yaml=/10_apm/Test template reinstallation} issue: https://github.com/elastic/elasticsearch/issues/116445 diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java new file mode 100644 index 0000000000000..22d3a10facb06 --- /dev/null +++ 
b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java @@ -0,0 +1,105 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.compute.aggregation.blockhash; + +import org.apache.lucene.util.BytesRefBuilder; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.BitArray; +import org.elasticsearch.common.util.BytesRefHash; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.data.BlockFactory; +import org.elasticsearch.compute.data.BytesRefVector; +import org.elasticsearch.compute.data.IntBlock; +import org.elasticsearch.compute.data.IntVector; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.core.ReleasableIterator; +import org.elasticsearch.xpack.ml.aggs.categorization.CategorizationBytesRefHash; +import org.elasticsearch.xpack.ml.aggs.categorization.CategorizationPartOfSpeechDictionary; +import org.elasticsearch.xpack.ml.aggs.categorization.SerializableTokenListCategory; +import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer; + +import java.io.IOException; + +/** + * Base BlockHash implementation for {@code Categorize} grouping function. + */ +public abstract class AbstractCategorizeBlockHash extends BlockHash { + // TODO: this should probably also take an emitBatchSize + private final int channel; + private final boolean outputPartial; + protected final TokenListCategorizer.CloseableTokenListCategorizer categorizer; + + AbstractCategorizeBlockHash(BlockFactory blockFactory, int channel, boolean outputPartial) { + super(blockFactory); + this.channel = channel; + this.outputPartial = outputPartial; + this.categorizer = new TokenListCategorizer.CloseableTokenListCategorizer( + new CategorizationBytesRefHash(new BytesRefHash(2048, blockFactory.bigArrays())), + CategorizationPartOfSpeechDictionary.getInstance(), + 0.70f + ); + } + + protected int channel() { + return channel; + } + + @Override + public Block[] getKeys() { + return new Block[] { outputPartial ? buildIntermediateBlock() : buildFinalBlock() }; + } + + @Override + public IntVector nonEmpty() { + return IntVector.range(0, categorizer.getCategoryCount(), blockFactory); + } + + @Override + public BitArray seenGroupIds(BigArrays bigArrays) { + throw new UnsupportedOperationException(); + } + + @Override + public final ReleasableIterator lookup(Page page, ByteSizeValue targetBlockSize) { + throw new UnsupportedOperationException(); + } + + /** + * Serializes the intermediate state into a single BytesRef block, or an empty Null block if there are no categories. + */ + private Block buildIntermediateBlock() { + if (categorizer.getCategoryCount() == 0) { + return blockFactory.newConstantNullBlock(0); + } + try (BytesStreamOutput out = new BytesStreamOutput()) { + // TODO be more careful here. + out.writeVInt(categorizer.getCategoryCount()); + for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { + category.writeTo(out); + } + // We're returning a block with N positions just because the Page must have all blocks with the same position count! 
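+ // Only position 0 is read back by CategorizedIntermediateBlockHash; the remaining positions are duplicates that exist just to satisfy that invariant.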
+ return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), categorizer.getCategoryCount()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Block buildFinalBlock() { + try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) { + BytesRefBuilder scratch = new BytesRefBuilder(); + for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { + scratch.copyChars(category.getRegex()); + result.appendBytesRef(scratch.get()); + scratch.clear(); + } + return result.build().asBlock(); + } + } +} diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java index 919cb92f79260..ef0f3ceb112c4 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java @@ -14,6 +14,7 @@ import org.elasticsearch.common.util.Int3Hash; import org.elasticsearch.common.util.LongHash; import org.elasticsearch.common.util.LongLongHash; +import org.elasticsearch.compute.aggregation.AggregatorMode; import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; import org.elasticsearch.compute.aggregation.SeenGroupIds; import org.elasticsearch.compute.data.Block; @@ -58,9 +59,7 @@ * leave a big gap, even if we never see {@code null}. *
</p>
*/ -public abstract sealed class BlockHash implements Releasable, SeenGroupIds // - permits BooleanBlockHash, BytesRefBlockHash, DoubleBlockHash, IntBlockHash, LongBlockHash, BytesRef2BlockHash, BytesRef3BlockHash, // - NullBlockHash, PackedValuesBlockHash, BytesRefLongBlockHash, LongLongBlockHash, TimeSeriesBlockHash { +public abstract class BlockHash implements Releasable, SeenGroupIds { protected final BlockFactory blockFactory; @@ -107,7 +106,15 @@ public abstract sealed class BlockHash implements Releasable, SeenGroupIds // @Override public abstract BitArray seenGroupIds(BigArrays bigArrays); - public record GroupSpec(int channel, ElementType elementType) {} + /** + * @param isCategorize Whether this group is a CATEGORIZE() or not. + * May be changed in the future when more stateful grouping functions are added. + */ + public record GroupSpec(int channel, ElementType elementType, boolean isCategorize) { + public GroupSpec(int channel, ElementType elementType) { + this(channel, elementType, false); + } + } /** * Creates a specialized hash table that maps one or more {@link Block}s to ids. @@ -159,6 +166,19 @@ public static BlockHash buildPackedValuesBlockHash(List<GroupSpec> groups, Block return new PackedValuesBlockHash(groups, blockFactory, emitBatchSize); } + /** + * Builds a BlockHash for the Categorize grouping function. + */ + public static BlockHash buildCategorizeBlockHash(List<GroupSpec> groups, AggregatorMode aggregatorMode, BlockFactory blockFactory) { + if (groups.size() != 1) { + throw new IllegalArgumentException("only a single CATEGORIZE group can be used"); + } + + return aggregatorMode.isInputPartial() + ? new CategorizedIntermediateBlockHash(groups.get(0).channel, blockFactory, aggregatorMode.isOutputPartial()) + : new CategorizeRawBlockHash(groups.get(0).channel, blockFactory, aggregatorMode.isOutputPartial()); + } + /** * Creates a specialized hash table that maps a {@link Block} of the given input element type to ids. */ diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java new file mode 100644 index 0000000000000..bf633e0454384 --- /dev/null +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java @@ -0,0 +1,137 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.compute.aggregation.blockhash; + +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.data.BlockFactory; +import org.elasticsearch.compute.data.BytesRefBlock; +import org.elasticsearch.compute.data.BytesRefVector; +import org.elasticsearch.compute.data.IntBlock; +import org.elasticsearch.compute.data.IntVector; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.core.Releasable; +import org.elasticsearch.core.Releasables; +import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.index.analysis.CustomAnalyzer; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer; +import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer; + +/** + * BlockHash implementation for {@code Categorize} grouping function. + *
<p>
+ * This implementation expects rows, and can't deserialize intermediate states coming from other nodes. + *
</p>
+ */ +public class CategorizeRawBlockHash extends AbstractCategorizeBlockHash { + private final CategorizeEvaluator evaluator; + + CategorizeRawBlockHash(int channel, BlockFactory blockFactory, boolean outputPartial) { + super(blockFactory, channel, outputPartial); + CategorizationAnalyzer analyzer = new CategorizationAnalyzer( + // TODO: should be the same analyzer as used in Production + new CustomAnalyzer( + TokenizerFactory.newFactory("whitespace", WhitespaceTokenizer::new), + new CharFilterFactory[0], + new TokenFilterFactory[0] + ), + true + ); + this.evaluator = new CategorizeEvaluator(analyzer, categorizer, blockFactory); + } + + @Override + public void add(Page page, GroupingAggregatorFunction.AddInput addInput) { + try (IntBlock result = (IntBlock) evaluator.eval(page.getBlock(channel()))) { + addInput.add(0, result); + } + } + + @Override + public void close() { + evaluator.close(); + } + + /** + * Similar implementation to an Evaluator. + */ + public static final class CategorizeEvaluator implements Releasable { + private final CategorizationAnalyzer analyzer; + + private final TokenListCategorizer.CloseableTokenListCategorizer categorizer; + + private final BlockFactory blockFactory; + + public CategorizeEvaluator( + CategorizationAnalyzer analyzer, + TokenListCategorizer.CloseableTokenListCategorizer categorizer, + BlockFactory blockFactory + ) { + this.analyzer = analyzer; + this.categorizer = categorizer; + this.blockFactory = blockFactory; + } + + public Block eval(BytesRefBlock vBlock) { + BytesRefVector vVector = vBlock.asVector(); + if (vVector == null) { + return eval(vBlock.getPositionCount(), vBlock); + } + IntVector vector = eval(vBlock.getPositionCount(), vVector); + return vector.asBlock(); + } + + public IntBlock eval(int positionCount, BytesRefBlock vBlock) { + try (IntBlock.Builder result = blockFactory.newIntBlockBuilder(positionCount)) { + BytesRef vScratch = new BytesRef(); + for (int p = 0; p < positionCount; p++) { + if (vBlock.isNull(p)) { + result.appendNull(); + continue; + } + int first = vBlock.getFirstValueIndex(p); + int count = vBlock.getValueCount(p); + if (count == 1) { + result.appendInt(process(vBlock.getBytesRef(first, vScratch))); + continue; + } + int end = first + count; + result.beginPositionEntry(); + for (int i = first; i < end; i++) { + result.appendInt(process(vBlock.getBytesRef(i, vScratch))); + } + result.endPositionEntry(); + } + return result.build(); + } + } + + public IntVector eval(int positionCount, BytesRefVector vVector) { + try (IntVector.FixedBuilder result = blockFactory.newIntVectorFixedBuilder(positionCount)) { + BytesRef vScratch = new BytesRef(); + for (int p = 0; p < positionCount; p++) { + result.appendInt(p, process(vVector.getBytesRef(p, vScratch))); + } + return result.build(); + } + } + + private int process(BytesRef v) { + return categorizer.computeCategory(v.utf8ToString(), analyzer).getId(); + } + + @Override + public void close() { + Releasables.closeExpectNoException(analyzer, categorizer); + } + } +} diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java new file mode 100644 index 0000000000000..1bca34a70e5fa --- /dev/null +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java @@ -0,0 +1,77 @@ +/* + * Copyright 
Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.compute.aggregation.blockhash; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; +import org.elasticsearch.compute.data.BlockFactory; +import org.elasticsearch.compute.data.BytesRefBlock; +import org.elasticsearch.compute.data.IntBlock; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.xpack.ml.aggs.categorization.SerializableTokenListCategory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * BlockHash implementation for {@code Categorize} grouping function. + *
<p>
+ * This implementation expects a single intermediate state in a block, as generated by {@link AbstractCategorizeBlockHash}. + *
</p>
+ */ +public class CategorizedIntermediateBlockHash extends AbstractCategorizeBlockHash { + + CategorizedIntermediateBlockHash(int channel, BlockFactory blockFactory, boolean outputPartial) { + super(blockFactory, channel, outputPartial); + } + + @Override + public void add(Page page, GroupingAggregatorFunction.AddInput addInput) { + if (page.getPositionCount() == 0) { + // No categories + return; + } + BytesRefBlock categorizerState = page.getBlock(channel()); + Map idMap = readIntermediate(categorizerState.getBytesRef(0, new BytesRef())); + try (IntBlock.Builder newIdsBuilder = blockFactory.newIntBlockBuilder(idMap.size())) { + for (int i = 0; i < idMap.size(); i++) { + newIdsBuilder.appendInt(idMap.get(i)); + } + try (IntBlock newIds = newIdsBuilder.build()) { + addInput.add(0, newIds); + } + } + } + + /** + * Read intermediate state from a block. + * + * @return a map from the old category id to the new one. The old ids go from 0 to {@code size - 1}. + */ + private Map readIntermediate(BytesRef bytes) { + Map idMap = new HashMap<>(); + try (StreamInput in = new BytesArray(bytes).streamInput()) { + int count = in.readVInt(); + for (int oldCategoryId = 0; oldCategoryId < count; oldCategoryId++) { + int newCategoryId = categorizer.mergeWireCategory(new SerializableTokenListCategory(in)).getId(); + idMap.put(oldCategoryId, newCategoryId); + } + return idMap; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() { + categorizer.close(); + } +} diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/HashAggregationOperator.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/HashAggregationOperator.java index 03a4ca2b0ad5e..a69e8ca767014 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/HashAggregationOperator.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/HashAggregationOperator.java @@ -14,6 +14,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.compute.Describable; +import org.elasticsearch.compute.aggregation.AggregatorMode; import org.elasticsearch.compute.aggregation.GroupingAggregator; import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; import org.elasticsearch.compute.aggregation.blockhash.BlockHash; @@ -39,11 +40,19 @@ public class HashAggregationOperator implements Operator { public record HashAggregationOperatorFactory( List groups, + AggregatorMode aggregatorMode, List aggregators, int maxPageSize ) implements OperatorFactory { @Override public Operator get(DriverContext driverContext) { + if (groups.stream().anyMatch(BlockHash.GroupSpec::isCategorize)) { + return new HashAggregationOperator( + aggregators, + () -> BlockHash.buildCategorizeBlockHash(groups, aggregatorMode, driverContext.blockFactory()), + driverContext + ); + } return new HashAggregationOperator( aggregators, () -> BlockHash.build(groups, driverContext.blockFactory(), maxPageSize, false), diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/GroupingAggregatorFunctionTestCase.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/GroupingAggregatorFunctionTestCase.java index cb190dfffafb9..1e97bdf5a2e79 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/GroupingAggregatorFunctionTestCase.java +++ 
b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/GroupingAggregatorFunctionTestCase.java @@ -105,6 +105,7 @@ private Operator.OperatorFactory simpleWithMode( } return new HashAggregationOperator.HashAggregationOperatorFactory( List.of(new BlockHash.GroupSpec(0, ElementType.LONG)), + mode, List.of(supplier.groupingAggregatorFactory(mode)), randomPageSize() ); diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTestCase.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTestCase.java new file mode 100644 index 0000000000000..fa93c0aa1c375 --- /dev/null +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTestCase.java @@ -0,0 +1,34 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.compute.aggregation.blockhash; + +import org.elasticsearch.common.breaker.CircuitBreaker; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.MockBigArrays; +import org.elasticsearch.common.util.PageCacheRecycler; +import org.elasticsearch.compute.data.MockBlockFactory; +import org.elasticsearch.indices.breaker.CircuitBreakerService; +import org.elasticsearch.test.ESTestCase; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public abstract class BlockHashTestCase extends ESTestCase { + + final CircuitBreaker breaker = newLimitedBreaker(ByteSizeValue.ofGb(1)); + final BigArrays bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, mockBreakerService(breaker)); + final MockBlockFactory blockFactory = new MockBlockFactory(breaker, bigArrays); + + // A breaker service that always returns the given breaker for getBreaker(CircuitBreaker.REQUEST) + private static CircuitBreakerService mockBreakerService(CircuitBreaker breaker) { + CircuitBreakerService breakerService = mock(CircuitBreakerService.class); + when(breakerService.getBreaker(CircuitBreaker.REQUEST)).thenReturn(breaker); + return breakerService; + } +} diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTests.java index 088e791348840..ede2d68ca2367 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/BlockHashTests.java @@ -11,11 +11,7 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.common.breaker.CircuitBreaker; import org.elasticsearch.common.unit.ByteSizeValue; -import org.elasticsearch.common.util.BigArrays; -import org.elasticsearch.common.util.MockBigArrays; -import org.elasticsearch.common.util.PageCacheRecycler; import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; import org.elasticsearch.compute.data.Block; import org.elasticsearch.compute.data.BooleanBlock; @@ -26,7 +22,6 @@ import org.elasticsearch.compute.data.IntBlock; import 
org.elasticsearch.compute.data.IntVector; import org.elasticsearch.compute.data.LongBlock; -import org.elasticsearch.compute.data.MockBlockFactory; import org.elasticsearch.compute.data.OrdinalBytesRefBlock; import org.elasticsearch.compute.data.OrdinalBytesRefVector; import org.elasticsearch.compute.data.Page; @@ -34,8 +29,6 @@ import org.elasticsearch.core.Releasable; import org.elasticsearch.core.ReleasableIterator; import org.elasticsearch.core.Releasables; -import org.elasticsearch.indices.breaker.CircuitBreakerService; -import org.elasticsearch.test.ESTestCase; import org.junit.After; import java.util.ArrayList; @@ -54,14 +47,8 @@ import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.startsWith; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; -public class BlockHashTests extends ESTestCase { - - final CircuitBreaker breaker = new MockBigArrays.LimitedBreaker("esql-test-breaker", ByteSizeValue.ofGb(1)); - final BigArrays bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, mockBreakerService(breaker)); - final MockBlockFactory blockFactory = new MockBlockFactory(breaker, bigArrays); +public class BlockHashTests extends BlockHashTestCase { @ParametersFactory public static List params() { @@ -1534,13 +1521,6 @@ private void assertKeys(Block[] actualKeys, Object[][] expectedKeys) { } } - // A breaker service that always returns the given breaker for getBreaker(CircuitBreaker.REQUEST) - static CircuitBreakerService mockBreakerService(CircuitBreaker breaker) { - CircuitBreakerService breakerService = mock(CircuitBreakerService.class); - when(breakerService.getBreaker(CircuitBreaker.REQUEST)).thenReturn(breaker); - return breakerService; - } - IntVector intRange(int startInclusive, int endExclusive) { return IntVector.range(startInclusive, endExclusive, TestBlockFactory.getNonBreakingInstance()); } diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java new file mode 100644 index 0000000000000..de8a2a44266fe --- /dev/null +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java @@ -0,0 +1,406 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.compute.aggregation.blockhash; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.breaker.CircuitBreaker; +import org.elasticsearch.common.collect.Iterators; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.MockBigArrays; +import org.elasticsearch.common.util.PageCacheRecycler; +import org.elasticsearch.compute.aggregation.AggregatorMode; +import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; +import org.elasticsearch.compute.aggregation.MaxLongAggregatorFunctionSupplier; +import org.elasticsearch.compute.aggregation.SumLongAggregatorFunctionSupplier; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.data.BlockFactory; +import org.elasticsearch.compute.data.BytesRefBlock; +import org.elasticsearch.compute.data.BytesRefVector; +import org.elasticsearch.compute.data.ElementType; +import org.elasticsearch.compute.data.IntBlock; +import org.elasticsearch.compute.data.IntVector; +import org.elasticsearch.compute.data.LongBlock; +import org.elasticsearch.compute.data.LongVector; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.compute.operator.CannedSourceOperator; +import org.elasticsearch.compute.operator.Driver; +import org.elasticsearch.compute.operator.DriverContext; +import org.elasticsearch.compute.operator.HashAggregationOperator; +import org.elasticsearch.compute.operator.LocalSourceOperator; +import org.elasticsearch.compute.operator.PageConsumerOperator; +import org.elasticsearch.core.Releasables; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.elasticsearch.compute.operator.OperatorTestCase.runDriver; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasSize; + +public class CategorizeBlockHashTests extends BlockHashTestCase { + + public void testCategorizeRaw() { + final Page page; + final int positions = 7; + try (BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(positions)) { + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.1")); + builder.appendBytesRef(new BytesRef("Connection error")); + builder.appendBytesRef(new BytesRef("Connection error")); + builder.appendBytesRef(new BytesRef("Connection error")); + builder.appendBytesRef(new BytesRef("Disconnected")); + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.2")); + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.3")); + page = new Page(builder.build()); + } + + try (BlockHash hash = new CategorizeRawBlockHash(0, blockFactory, true)) { + hash.add(page, new GroupingAggregatorFunction.AddInput() { + @Override + public void add(int positionOffset, IntBlock groupIds) { + assertEquals(groupIds.getPositionCount(), positions); + + assertEquals(0, groupIds.getInt(0)); + assertEquals(1, groupIds.getInt(1)); + assertEquals(1, groupIds.getInt(2)); + assertEquals(1, groupIds.getInt(3)); + assertEquals(2, groupIds.getInt(4)); + assertEquals(0, groupIds.getInt(5)); + assertEquals(0, groupIds.getInt(6)); + } + + @Override + public void add(int positionOffset, IntVector groupIds) { + add(positionOffset, groupIds.asBlock()); + } + + @Override + public void close() { + fail("hashes should not close AddInput"); + } + }); + } finally { + page.releaseBlocks(); + } + + // TODO: 
randomize and try multiple pages. + // TODO: assert the state of the BlockHash after adding pages. Including the categorizer state. + // TODO: also test the lookup method and other stuff. + } + + public void testCategorizeIntermediate() { + Page page1; + int positions1 = 7; + try (BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(positions1)) { + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.1")); + builder.appendBytesRef(new BytesRef("Connection error")); + builder.appendBytesRef(new BytesRef("Connection error")); + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.2")); + builder.appendBytesRef(new BytesRef("Connection error")); + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.3")); + builder.appendBytesRef(new BytesRef("Connected to 10.1.0.4")); + page1 = new Page(builder.build()); + } + Page page2; + int positions2 = 5; + try (BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(positions2)) { + builder.appendBytesRef(new BytesRef("Disconnected")); + builder.appendBytesRef(new BytesRef("Connected to 10.2.0.1")); + builder.appendBytesRef(new BytesRef("Disconnected")); + builder.appendBytesRef(new BytesRef("Connected to 10.3.0.2")); + builder.appendBytesRef(new BytesRef("System shutdown")); + page2 = new Page(builder.build()); + } + + Page intermediatePage1, intermediatePage2; + + // Fill intermediatePages with the intermediate state from the raw hashes + try ( + BlockHash rawHash1 = new CategorizeRawBlockHash(0, blockFactory, true); + BlockHash rawHash2 = new CategorizeRawBlockHash(0, blockFactory, true) + ) { + rawHash1.add(page1, new GroupingAggregatorFunction.AddInput() { + @Override + public void add(int positionOffset, IntBlock groupIds) { + assertEquals(groupIds.getPositionCount(), positions1); + assertEquals(0, groupIds.getInt(0)); + assertEquals(1, groupIds.getInt(1)); + assertEquals(1, groupIds.getInt(2)); + assertEquals(0, groupIds.getInt(3)); + assertEquals(1, groupIds.getInt(4)); + assertEquals(0, groupIds.getInt(5)); + assertEquals(0, groupIds.getInt(6)); + } + + @Override + public void add(int positionOffset, IntVector groupIds) { + add(positionOffset, groupIds.asBlock()); + } + + @Override + public void close() { + fail("hashes should not close AddInput"); + } + }); + intermediatePage1 = new Page(rawHash1.getKeys()[0]); + + rawHash2.add(page2, new GroupingAggregatorFunction.AddInput() { + @Override + public void add(int positionOffset, IntBlock groupIds) { + assertEquals(groupIds.getPositionCount(), positions2); + assertEquals(0, groupIds.getInt(0)); + assertEquals(1, groupIds.getInt(1)); + assertEquals(0, groupIds.getInt(2)); + assertEquals(1, groupIds.getInt(3)); + assertEquals(2, groupIds.getInt(4)); + } + + @Override + public void add(int positionOffset, IntVector groupIds) { + add(positionOffset, groupIds.asBlock()); + } + + @Override + public void close() { + fail("hashes should not close AddInput"); + } + }); + intermediatePage2 = new Page(rawHash2.getKeys()[0]); + } finally { + page1.releaseBlocks(); + page2.releaseBlocks(); + } + + try (BlockHash intermediateHash = new CategorizedIntermediateBlockHash(0, blockFactory, true)) { + intermediateHash.add(intermediatePage1, new GroupingAggregatorFunction.AddInput() { + @Override + public void add(int positionOffset, IntBlock groupIds) { + Set values = IntStream.range(0, groupIds.getPositionCount()) + .map(groupIds::getInt) + .boxed() + .collect(Collectors.toSet()); + assertEquals(values, Set.of(0, 1)); + } + + @Override + public void add(int positionOffset, 
IntVector groupIds) { + add(positionOffset, groupIds.asBlock()); + } + + @Override + public void close() { + fail("hashes should not close AddInput"); + } + }); + + intermediateHash.add(intermediatePage2, new GroupingAggregatorFunction.AddInput() { + @Override + public void add(int positionOffset, IntBlock groupIds) { + Set values = IntStream.range(0, groupIds.getPositionCount()) + .map(groupIds::getInt) + .boxed() + .collect(Collectors.toSet()); + // The category IDs {0, 1, 2} should map to groups {0, 2, 3}, because + // 0 matches an existing category (Connected to ...), and the others are new. + assertEquals(values, Set.of(0, 2, 3)); + } + + @Override + public void add(int positionOffset, IntVector groupIds) { + add(positionOffset, groupIds.asBlock()); + } + + @Override + public void close() { + fail("hashes should not close AddInput"); + } + }); + } finally { + intermediatePage1.releaseBlocks(); + intermediatePage2.releaseBlocks(); + } + } + + public void testCategorize_withDriver() { + BigArrays bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, ByteSizeValue.ofMb(256)).withCircuitBreaking(); + CircuitBreaker breaker = bigArrays.breakerService().getBreaker(CircuitBreaker.REQUEST); + DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays)); + + LocalSourceOperator.BlockSupplier input1 = () -> { + try ( + BytesRefVector.Builder textsBuilder = driverContext.blockFactory().newBytesRefVectorBuilder(10); + LongVector.Builder countsBuilder = driverContext.blockFactory().newLongVectorBuilder(10) + ) { + textsBuilder.appendBytesRef(new BytesRef("a")); + textsBuilder.appendBytesRef(new BytesRef("b")); + textsBuilder.appendBytesRef(new BytesRef("words words words goodbye jan")); + textsBuilder.appendBytesRef(new BytesRef("words words words goodbye nik")); + textsBuilder.appendBytesRef(new BytesRef("words words words goodbye tom")); + textsBuilder.appendBytesRef(new BytesRef("words words words hello jan")); + textsBuilder.appendBytesRef(new BytesRef("c")); + textsBuilder.appendBytesRef(new BytesRef("d")); + countsBuilder.appendLong(1); + countsBuilder.appendLong(2); + countsBuilder.appendLong(800); + countsBuilder.appendLong(80); + countsBuilder.appendLong(8000); + countsBuilder.appendLong(900); + countsBuilder.appendLong(30); + countsBuilder.appendLong(4); + return new Block[] { textsBuilder.build().asBlock(), countsBuilder.build().asBlock() }; + } + }; + LocalSourceOperator.BlockSupplier input2 = () -> { + try ( + BytesRefVector.Builder textsBuilder = driverContext.blockFactory().newBytesRefVectorBuilder(10); + LongVector.Builder countsBuilder = driverContext.blockFactory().newLongVectorBuilder(10) + ) { + textsBuilder.appendBytesRef(new BytesRef("words words words hello nik")); + textsBuilder.appendBytesRef(new BytesRef("words words words hello nik")); + textsBuilder.appendBytesRef(new BytesRef("c")); + textsBuilder.appendBytesRef(new BytesRef("words words words goodbye chris")); + textsBuilder.appendBytesRef(new BytesRef("d")); + textsBuilder.appendBytesRef(new BytesRef("e")); + countsBuilder.appendLong(9); + countsBuilder.appendLong(90); + countsBuilder.appendLong(3); + countsBuilder.appendLong(8); + countsBuilder.appendLong(40); + countsBuilder.appendLong(5); + return new Block[] { textsBuilder.build().asBlock(), countsBuilder.build().asBlock() }; + } + }; + + List intermediateOutput = new ArrayList<>(); + + Driver driver = new Driver( + driverContext, + new LocalSourceOperator(input1), + List.of( + new 
HashAggregationOperator.HashAggregationOperatorFactory( + List.of(makeGroupSpec()), + AggregatorMode.INITIAL, + List.of( + new SumLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL), + new MaxLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL) + ), + 16 * 1024 + ).get(driverContext) + ), + new PageConsumerOperator(intermediateOutput::add), + () -> {} + ); + runDriver(driver); + + driver = new Driver( + driverContext, + new LocalSourceOperator(input2), + List.of( + new HashAggregationOperator.HashAggregationOperatorFactory( + List.of(makeGroupSpec()), + AggregatorMode.INITIAL, + List.of( + new SumLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL), + new MaxLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL) + ), + 16 * 1024 + ).get(driverContext) + ), + new PageConsumerOperator(intermediateOutput::add), + () -> {} + ); + runDriver(driver); + + List finalOutput = new ArrayList<>(); + + driver = new Driver( + driverContext, + new CannedSourceOperator(intermediateOutput.iterator()), + List.of( + new HashAggregationOperator.HashAggregationOperatorFactory( + List.of(makeGroupSpec()), + AggregatorMode.FINAL, + List.of( + new SumLongAggregatorFunctionSupplier(List.of(1, 2)).groupingAggregatorFactory(AggregatorMode.FINAL), + new MaxLongAggregatorFunctionSupplier(List.of(3, 4)).groupingAggregatorFactory(AggregatorMode.FINAL) + ), + 16 * 1024 + ).get(driverContext) + ), + new PageConsumerOperator(finalOutput::add), + () -> {} + ); + runDriver(driver); + + assertThat(finalOutput, hasSize(1)); + assertThat(finalOutput.get(0).getBlockCount(), equalTo(3)); + BytesRefBlock outputTexts = finalOutput.get(0).getBlock(0); + LongBlock outputSums = finalOutput.get(0).getBlock(1); + LongBlock outputMaxs = finalOutput.get(0).getBlock(2); + assertThat(outputSums.getPositionCount(), equalTo(outputTexts.getPositionCount())); + assertThat(outputMaxs.getPositionCount(), equalTo(outputTexts.getPositionCount())); + Map sums = new HashMap<>(); + Map maxs = new HashMap<>(); + for (int i = 0; i < outputTexts.getPositionCount(); i++) { + sums.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputSums.getLong(i)); + maxs.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputMaxs.getLong(i)); + } + assertThat( + sums, + equalTo( + Map.of( + ".*?a.*?", + 1L, + ".*?b.*?", + 2L, + ".*?c.*?", + 33L, + ".*?d.*?", + 44L, + ".*?e.*?", + 5L, + ".*?words.+?words.+?words.+?goodbye.*?", + 8888L, + ".*?words.+?words.+?words.+?hello.*?", + 999L + ) + ) + ); + assertThat( + maxs, + equalTo( + Map.of( + ".*?a.*?", + 1L, + ".*?b.*?", + 2L, + ".*?c.*?", + 30L, + ".*?d.*?", + 40L, + ".*?e.*?", + 5L, + ".*?words.+?words.+?words.+?goodbye.*?", + 8000L, + ".*?words.+?words.+?words.+?hello.*?", + 900L + ) + ) + ); + Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks)); + } + + private BlockHash.GroupSpec makeGroupSpec() { + return new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true); + } +} diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java index f2fa94c1feb08..b2f4ad594936e 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java +++ 
b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java @@ -54,6 +54,7 @@ protected Operator.OperatorFactory simpleWithMode(AggregatorMode mode) { return new HashAggregationOperator.HashAggregationOperatorFactory( List.of(new BlockHash.GroupSpec(0, ElementType.LONG)), + mode, List.of( new SumLongAggregatorFunctionSupplier(sumChannels).groupingAggregatorFactory(mode), new MaxLongAggregatorFunctionSupplier(maxChannels).groupingAggregatorFactory(mode) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java index df04e1397556e..32e244f4b729d 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java @@ -59,6 +59,7 @@ public class CsvTestsDataLoader { private static final TestsDataset ALERTS = new TestsDataset("alerts"); private static final TestsDataset UL_LOGS = new TestsDataset("ul_logs"); private static final TestsDataset SAMPLE_DATA = new TestsDataset("sample_data"); + private static final TestsDataset MV_SAMPLE_DATA = new TestsDataset("mv_sample_data"); private static final TestsDataset SAMPLE_DATA_STR = SAMPLE_DATA.withIndex("sample_data_str") .withTypeMapping(Map.of("client_ip", "keyword")); private static final TestsDataset SAMPLE_DATA_TS_LONG = SAMPLE_DATA.withIndex("sample_data_ts_long") @@ -103,6 +104,7 @@ public class CsvTestsDataLoader { Map.entry(LANGUAGES.indexName, LANGUAGES), Map.entry(UL_LOGS.indexName, UL_LOGS), Map.entry(SAMPLE_DATA.indexName, SAMPLE_DATA), + Map.entry(MV_SAMPLE_DATA.indexName, MV_SAMPLE_DATA), Map.entry(ALERTS.indexName, ALERTS), Map.entry(SAMPLE_DATA_STR.indexName, SAMPLE_DATA_STR), Map.entry(SAMPLE_DATA_TS_LONG.indexName, SAMPLE_DATA_TS_LONG), diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec index 8e0fcd78f0322..89d9026423204 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec @@ -1,14 +1,524 @@ -categorize -required_capability: categorize +standard aggs +required_capability: categorize_v2 FROM sample_data - | SORT message ASC - | STATS count=COUNT(), values=MV_SORT(VALUES(message)) BY category=CATEGORIZE(message) + | STATS count=COUNT(), + sum=SUM(event_duration), + avg=AVG(event_duration), + count_distinct=COUNT_DISTINCT(event_duration) + BY category=CATEGORIZE(message) + | SORT count DESC, category +; + +count:long | sum:long | avg:double | count_distinct:long | category:keyword + 3 | 7971589 | 2657196.3333333335 | 3 | .*?Connected.+?to.*? + 3 | 14027356 | 4675785.333333333 | 3 | .*?Connection.+?error.*? + 1 | 1232382 | 1232382.0 | 1 | .*?Disconnected.*? +; + +values aggs +required_capability: categorize_v2 + +FROM sample_data + | STATS values=MV_SORT(VALUES(message)), + top=TOP(event_duration, 2, "DESC") + BY category=CATEGORIZE(message) + | SORT category +; + +values:keyword | top:long | category:keyword +[Connected to 10.1.0.1, Connected to 10.1.0.2, Connected to 10.1.0.3] | [3450233, 2764889] | .*?Connected.+?to.*? +[Connection error] | [8268153, 5033755] | .*?Connection.+?error.*? +[Disconnected] | 1232382 | .*?Disconnected.*? 
+; + +mv +required_capability: categorize_v2 + +FROM mv_sample_data + | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(message) + | SORT category +; + +COUNT():long | SUM(event_duration):long | category:keyword + 7 | 23231327 | .*?Banana.*? + 3 | 7971589 | .*?Connected.+?to.*? + 3 | 14027356 | .*?Connection.+?error.*? + 1 | 1232382 | .*?Disconnected.*? +; + +row mv +required_capability: categorize_v2 + +ROW message = ["connected to a", "connected to b", "disconnected"], str = ["a", "b", "c"] + | STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message) + | SORT category +; + +COUNT():long | VALUES(str):keyword | category:keyword + 2 | [a, b, c] | .*?connected.+?to.*? + 1 | [a, b, c] | .*?disconnected.*? +; + +with multiple indices +required_capability: categorize_v2 +required_capability: union_types + +FROM sample_data* + | STATS COUNT() BY category=CATEGORIZE(message) + | SORT category +; + +COUNT():long | category:keyword + 12 | .*?Connected.+?to.*? + 12 | .*?Connection.+?error.*? + 4 | .*?Disconnected.*? +; + +mv with many values +required_capability: categorize_v2 + +FROM employees + | STATS COUNT() BY category=CATEGORIZE(job_positions) + | SORT category + | LIMIT 5 +; + +COUNT():long | category:keyword + 18 | .*?Accountant.*? + 13 | .*?Architect.*? + 11 | .*?Business.+?Analyst.*? + 13 | .*?Data.+?Scientist.*? + 10 | .*?Head.+?Human.+?Resources.*? +; + +# Throws when calling AbstractCategorizeBlockHash.seenGroupIds() - Requires nulls support? +mv with many values-Ignore +required_capability: categorize_v2 + +FROM employees + | STATS SUM(languages) BY category=CATEGORIZE(job_positions) + | SORT category DESC + | LIMIT 3 +; + +SUM(languages):integer | category:keyword + 43 | .*?Accountant.*? + 46 | .*?Architect.*? + 35 | .*?Business.+?Analyst.*? +; + +mv via eval +required_capability: categorize_v2 + +FROM sample_data + | EVAL message = MV_APPEND(message, "Banana") + | STATS COUNT() BY category=CATEGORIZE(message) + | SORT category +; + +COUNT():long | category:keyword + 7 | .*?Banana.*? + 3 | .*?Connected.+?to.*? + 3 | .*?Connection.+?error.*? + 1 | .*?Disconnected.*? +; + +mv via eval const +required_capability: categorize_v2 + +FROM sample_data + | EVAL message = ["Banana", "Bread"] + | STATS COUNT() BY category=CATEGORIZE(message) + | SORT category +; + +COUNT():long | category:keyword + 7 | .*?Banana.*? + 7 | .*?Bread.*? +; + +mv via eval const without aliases +required_capability: categorize_v2 + +FROM sample_data + | EVAL message = ["Banana", "Bread"] + | STATS COUNT() BY CATEGORIZE(message) + | SORT `CATEGORIZE(message)` +; + +COUNT():long | CATEGORIZE(message):keyword + 7 | .*?Banana.*? + 7 | .*?Bread.*? +; + +mv const in parameter +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY c = CATEGORIZE(["Banana", "Bread"]) + | SORT c +; + +COUNT():long | c:keyword + 7 | .*?Banana.*? + 7 | .*?Bread.*? +; + +agg alias shadowing +required_capability: categorize_v2 + +FROM sample_data + | STATS c = COUNT() BY c = CATEGORIZE(["Banana", "Bread"]) + | SORT c +; + +warning:Line 2:9: Field 'c' shadowed by field at line 2:24 + +c:keyword +.*?Banana.*? +.*?Bread.*? +; + +chained aggregations using categorize +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(message) + | STATS COUNT() BY category=CATEGORIZE(category) + | SORT category +; + +COUNT():long | category:keyword + 1 | .*?\.\*\?Connected\.\+\?to\.\*\?.*? + 1 | .*?\.\*\?Connection\.\+\?error\.\*\?.*? + 1 | .*?\.\*\?Disconnected\.\*\?.*? 
+; + +stats without aggs +required_capability: categorize_v2 + +FROM sample_data + | STATS BY category=CATEGORIZE(message) + | SORT category +; + +category:keyword +.*?Connected.+?to.*? +.*?Connection.+?error.*? +.*?Disconnected.*? +; + +text field +required_capability: categorize_v2 + +FROM hosts + | STATS COUNT() BY category=CATEGORIZE(host_group) + | SORT category +; + +COUNT():long | category:keyword + 2 | .*?DB.+?servers.*? + 2 | .*?Gateway.+?instances.*? + 5 | .*?Kubernetes.+?cluster.*? +; + +on TO_UPPER +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(TO_UPPER(message)) + | SORT category +; + +COUNT():long | category:keyword + 3 | .*?CONNECTED.+?TO.*? + 3 | .*?CONNECTION.+?ERROR.*? + 1 | .*?DISCONNECTED.*? +; + +on CONCAT +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " banana")) + | SORT category +; + +COUNT():long | category:keyword + 3 | .*?Connected.+?to.+?banana.*? + 3 | .*?Connection.+?error.+?banana.*? + 1 | .*?Disconnected.+?banana.*? +; + +on CONCAT with unicode +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " 👍🏽😊")) + | SORT category +; + +COUNT():long | category:keyword + 3 | .*?Connected.+?to.+?👍🏽😊.*? + 3 | .*?Connection.+?error.+?👍🏽😊.*? + 1 | .*?Disconnected.+?👍🏽😊.*? +; + +on REVERSE(CONCAT()) +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(REVERSE(CONCAT(message, " 👍🏽😊"))) + | SORT category +; + +COUNT():long | category:keyword + 1 | .*?😊👍🏽.+?detcennocsiD.*? + 3 | .*?😊👍🏽.+?ot.+?detcennoC.*? + 3 | .*?😊👍🏽.+?rorre.+?noitcennoC.*? +; + +and then TO_LOWER +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(message) + | EVAL category=TO_LOWER(category) + | SORT category +; + +COUNT():long | category:keyword + 3 | .*?connected.+?to.*? + 3 | .*?connection.+?error.*? + 1 | .*?disconnected.*? +; + +# Throws NPE - Requires nulls support +on const empty string-Ignore +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE("") + | SORT category +; + +COUNT():long | category:keyword + 7 | .*?.*? +; + +# Throws NPE - Requires nulls support +on const empty string from eval-Ignore +required_capability: categorize_v2 + +FROM sample_data + | EVAL x = "" + | STATS COUNT() BY category=CATEGORIZE(x) + | SORT category +; + +COUNT():long | category:keyword + 7 | .*?.*? 
+; + +# Doesn't give the correct results - Requires nulls support +on null-Ignore +required_capability: categorize_v2 + +FROM sample_data + | EVAL x = null + | STATS COUNT() BY category=CATEGORIZE(x) + | SORT category +; + +COUNT():long | category:keyword + 7 | null +; + +# Doesn't give the correct results - Requires nulls support +on null string-Ignore +required_capability: categorize_v2 + +FROM sample_data + | EVAL x = null::string + | STATS COUNT() BY category=CATEGORIZE(x) + | SORT category +; + +COUNT():long | category:keyword + 7 | null +; + +filtering out all data +required_capability: categorize_v2 + +FROM sample_data + | WHERE @timestamp < "2023-10-23T00:00:00Z" + | STATS COUNT() BY category=CATEGORIZE(message) + | SORT category +; + +COUNT():long | category:keyword +; + +filtering out all data with constant +required_capability: categorize_v2 + +FROM sample_data + | STATS COUNT() BY category=CATEGORIZE(message) + | WHERE false +; + +COUNT():long | category:keyword +; + +drop output columns +required_capability: categorize_v2 + +FROM sample_data + | STATS count=COUNT() BY category=CATEGORIZE(message) + | EVAL x=1 + | DROP count, category +; + +x:integer +1 +1 +1 +; + +category value processing +required_capability: categorize_v2 + +ROW message = ["connected to a", "connected to b", "disconnected"] + | STATS COUNT() BY category=CATEGORIZE(message) + | EVAL category = TO_UPPER(category) | SORT category ; -count:long | values:keyword | category:integer -3 | [Connected to 10.1.0.1, Connected to 10.1.0.2, Connected to 10.1.0.3] | 0 -3 | [Connection error] | 1 -1 | [Disconnected] | 2 +COUNT():long | category:keyword + 2 | .*?CONNECTED.+?TO.*? + 1 | .*?DISCONNECTED.*? +; + +row aliases +required_capability: categorize_v2 + +ROW message = "connected to a" + | EVAL x = message + | STATS COUNT() BY category=CATEGORIZE(x) + | EVAL y = category + | SORT y +; + +COUNT():long | category:keyword | y:keyword + 1 | .*?connected.+?to.+?a.*? | .*?connected.+?to.+?a.*? +; + +from aliases +required_capability: categorize_v2 + +FROM sample_data + | EVAL x = message + | STATS COUNT() BY category=CATEGORIZE(x) + | EVAL y = category + | SORT y +; + +COUNT():long | category:keyword | y:keyword + 3 | .*?Connected.+?to.*? | .*?Connected.+?to.*? + 3 | .*?Connection.+?error.*? | .*?Connection.+?error.*? + 1 | .*?Disconnected.*? | .*?Disconnected.*? +; + +row aliases with keep +required_capability: categorize_v2 + +ROW message = "connected to a" + | EVAL x = message + | KEEP x + | STATS COUNT() BY category=CATEGORIZE(x) + | EVAL y = category + | KEEP `COUNT()`, y + | SORT y +; + +COUNT():long | y:keyword + 1 | .*?connected.+?to.+?a.*? +; + +from aliases with keep +required_capability: categorize_v2 + +FROM sample_data + | EVAL x = message + | KEEP x + | STATS COUNT() BY category=CATEGORIZE(x) + | EVAL y = category + | KEEP `COUNT()`, y + | SORT y +; + +COUNT():long | y:keyword + 3 | .*?Connected.+?to.*? + 3 | .*?Connection.+?error.*? + 1 | .*?Disconnected.*? +; + +row rename +required_capability: categorize_v2 + +ROW message = "connected to a" + | RENAME message as x + | STATS COUNT() BY category=CATEGORIZE(x) + | RENAME category as y + | SORT y +; + +COUNT():long | y:keyword + 1 | .*?connected.+?to.+?a.*? +; + +from rename +required_capability: categorize_v2 + +FROM sample_data + | RENAME message as x + | STATS COUNT() BY category=CATEGORIZE(x) + | RENAME category as y + | SORT y +; + +COUNT():long | y:keyword + 3 | .*?Connected.+?to.*? + 3 | .*?Connection.+?error.*? + 1 | .*?Disconnected.*? 
+; + +row drop +required_capability: categorize_v2 + +ROW message = "connected to a" + | STATS c = COUNT() BY category=CATEGORIZE(message) + | DROP category + | SORT c +; + +c:long +1 +; + +from drop +required_capability: categorize_v2 + +FROM sample_data + | STATS c = COUNT() BY category=CATEGORIZE(message) + | DROP category + | SORT c +; + +c:long +1 +3 +3 ; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-mv_sample_data.json b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-mv_sample_data.json new file mode 100644 index 0000000000000..838a8ba09b45a --- /dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-mv_sample_data.json @@ -0,0 +1,16 @@ +{ + "properties": { + "@timestamp": { + "type": "date" + }, + "client_ip": { + "type": "ip" + }, + "event_duration": { + "type": "long" + }, + "message": { + "type": "keyword" + } + } +} \ No newline at end of file diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_sample_data.csv b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_sample_data.csv new file mode 100644 index 0000000000000..c02a4a7a5845f --- /dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_sample_data.csv @@ -0,0 +1,8 @@ +@timestamp:date ,client_ip:ip,event_duration:long,message:keyword +2023-10-23T13:55:01.543Z,172.21.3.15 ,1756467,[Connected to 10.1.0.1, Banana] +2023-10-23T13:53:55.832Z,172.21.3.15 ,5033755,[Connection error, Banana] +2023-10-23T13:52:55.015Z,172.21.3.15 ,8268153,[Connection error, Banana] +2023-10-23T13:51:54.732Z,172.21.3.15 , 725448,[Connection error, Banana] +2023-10-23T13:33:34.937Z,172.21.0.5 ,1232382,[Disconnected, Banana] +2023-10-23T12:27:28.948Z,172.21.2.113,2764889,[Connected to 10.1.0.2, Banana] +2023-10-23T12:15:03.360Z,172.21.2.162,3450233,[Connected to 10.1.0.3, Banana] diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeEvaluator.java deleted file mode 100644 index c6349907f9b4b..0000000000000 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeEvaluator.java +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License -// 2.0; you may not use this file except in compliance with the Elastic License -// 2.0. 
-package org.elasticsearch.xpack.esql.expression.function.grouping; - -import java.lang.IllegalArgumentException; -import java.lang.Override; -import java.lang.String; -import java.util.function.Function; -import org.apache.lucene.util.BytesRef; -import org.elasticsearch.compute.data.Block; -import org.elasticsearch.compute.data.BytesRefBlock; -import org.elasticsearch.compute.data.BytesRefVector; -import org.elasticsearch.compute.data.IntBlock; -import org.elasticsearch.compute.data.IntVector; -import org.elasticsearch.compute.data.Page; -import org.elasticsearch.compute.operator.DriverContext; -import org.elasticsearch.compute.operator.EvalOperator; -import org.elasticsearch.compute.operator.Warnings; -import org.elasticsearch.core.Releasables; -import org.elasticsearch.xpack.esql.core.tree.Source; -import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer; -import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer; - -/** - * {@link EvalOperator.ExpressionEvaluator} implementation for {@link Categorize}. - * This class is generated. Do not edit it. - */ -public final class CategorizeEvaluator implements EvalOperator.ExpressionEvaluator { - private final Source source; - - private final EvalOperator.ExpressionEvaluator v; - - private final CategorizationAnalyzer analyzer; - - private final TokenListCategorizer.CloseableTokenListCategorizer categorizer; - - private final DriverContext driverContext; - - private Warnings warnings; - - public CategorizeEvaluator(Source source, EvalOperator.ExpressionEvaluator v, - CategorizationAnalyzer analyzer, - TokenListCategorizer.CloseableTokenListCategorizer categorizer, DriverContext driverContext) { - this.source = source; - this.v = v; - this.analyzer = analyzer; - this.categorizer = categorizer; - this.driverContext = driverContext; - } - - @Override - public Block eval(Page page) { - try (BytesRefBlock vBlock = (BytesRefBlock) v.eval(page)) { - BytesRefVector vVector = vBlock.asVector(); - if (vVector == null) { - return eval(page.getPositionCount(), vBlock); - } - return eval(page.getPositionCount(), vVector).asBlock(); - } - } - - public IntBlock eval(int positionCount, BytesRefBlock vBlock) { - try(IntBlock.Builder result = driverContext.blockFactory().newIntBlockBuilder(positionCount)) { - BytesRef vScratch = new BytesRef(); - position: for (int p = 0; p < positionCount; p++) { - if (vBlock.isNull(p)) { - result.appendNull(); - continue position; - } - if (vBlock.getValueCount(p) != 1) { - if (vBlock.getValueCount(p) > 1) { - warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); - } - result.appendNull(); - continue position; - } - result.appendInt(Categorize.process(vBlock.getBytesRef(vBlock.getFirstValueIndex(p), vScratch), this.analyzer, this.categorizer)); - } - return result.build(); - } - } - - public IntVector eval(int positionCount, BytesRefVector vVector) { - try(IntVector.FixedBuilder result = driverContext.blockFactory().newIntVectorFixedBuilder(positionCount)) { - BytesRef vScratch = new BytesRef(); - position: for (int p = 0; p < positionCount; p++) { - result.appendInt(p, Categorize.process(vVector.getBytesRef(p, vScratch), this.analyzer, this.categorizer)); - } - return result.build(); - } - } - - @Override - public String toString() { - return "CategorizeEvaluator[" + "v=" + v + "]"; - } - - @Override - public void close() { - Releasables.closeExpectNoException(v, analyzer, categorizer); - } - - private Warnings warnings() { - if 
(warnings == null) { - this.warnings = Warnings.createWarnings( - driverContext.warningsMode(), - source.source().getLineNumber(), - source.source().getColumnNumber(), - source.text() - ); - } - return warnings; - } - - static class Factory implements EvalOperator.ExpressionEvaluator.Factory { - private final Source source; - - private final EvalOperator.ExpressionEvaluator.Factory v; - - private final Function analyzer; - - private final Function categorizer; - - public Factory(Source source, EvalOperator.ExpressionEvaluator.Factory v, - Function analyzer, - Function categorizer) { - this.source = source; - this.v = v; - this.analyzer = analyzer; - this.categorizer = categorizer; - } - - @Override - public CategorizeEvaluator get(DriverContext context) { - return new CategorizeEvaluator(source, v.get(context), analyzer.apply(context), categorizer.apply(context), context); - } - - @Override - public String toString() { - return "CategorizeEvaluator[" + "v=" + v + "]"; - } - } -} diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 9c24daec58a84..bc779bd54a30c 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -395,8 +395,11 @@ public enum Cap { /** * Supported the text categorization function "CATEGORIZE". + *
+ * <p>
+ * This capability was initially named `CATEGORIZE`, and got renamed after the function started correctly returning keywords.
+ * </p>
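+ * <p>
+ * Illustrative gate, mirroring how the verifier tests in this change check for it:
+ * {@code assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled());}
+ * </p>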
*/ - CATEGORIZE(Build.current().isSnapshot()), + CATEGORIZE_V2(Build.current().isSnapshot()), /** * QSTR function diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java index 75a9883a77102..31b603ecef889 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java @@ -7,20 +7,10 @@ package org.elasticsearch.xpack.esql.expression.function.grouping; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.BytesRefHash; -import org.elasticsearch.compute.ann.Evaluator; -import org.elasticsearch.compute.ann.Fixed; import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator; -import org.elasticsearch.index.analysis.CharFilterFactory; -import org.elasticsearch.index.analysis.CustomAnalyzer; -import org.elasticsearch.index.analysis.TokenFilterFactory; -import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.xpack.esql.capabilities.Validatable; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; @@ -29,10 +19,6 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; -import org.elasticsearch.xpack.ml.aggs.categorization.CategorizationBytesRefHash; -import org.elasticsearch.xpack.ml.aggs.categorization.CategorizationPartOfSpeechDictionary; -import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer; -import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer; import java.io.IOException; import java.util.List; @@ -42,16 +28,16 @@ /** * Categorizes text messages. - * - * This implementation is incomplete and comes with the following caveats: - * - it only works correctly on a single node. - * - when running on multiple nodes, category IDs of the different nodes are - * aggregated, even though the same ID can correspond to a totally different - * category - * - the output consists of category IDs, which should be replaced by category - * regexes or keys - * - * TODO(jan, nik): fix this + *
+ * <p>
+ * This function has no evaluators, as it works like an aggregation (accumulates values, stores intermediate states, etc.).
+ * </p>
+ * <p>
+ * For the implementation, see:
+ * </p>
+ * <ul>
+ *     <li>{@link org.elasticsearch.compute.aggregation.blockhash.CategorizedIntermediateBlockHash}</li>
+ *     <li>{@link org.elasticsearch.compute.aggregation.blockhash.CategorizeRawBlockHash}</li>
+ * </ul>
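+ * <p>
+ * Illustrative usage, as exercised by the csv-spec tests in this change:
+ * </p>
+ * <pre>{@code
+ * FROM sample_data
+ * | STATS COUNT() BY category=CATEGORIZE(message)
+ * | SORT category
+ * }</pre>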
*/ public class Categorize extends GroupingFunction implements Validatable { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry( @@ -62,7 +48,7 @@ public class Categorize extends GroupingFunction implements Validatable { private final Expression field; - @FunctionInfo(returnType = { "integer" }, description = "Categorizes text messages.") + @FunctionInfo(returnType = "keyword", description = "Categorizes text messages.") public Categorize( Source source, @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field @@ -88,43 +74,13 @@ public String getWriteableName() { @Override public boolean foldable() { - return field.foldable(); - } - - @Evaluator - static int process( - BytesRef v, - @Fixed(includeInToString = false, build = true) CategorizationAnalyzer analyzer, - @Fixed(includeInToString = false, build = true) TokenListCategorizer.CloseableTokenListCategorizer categorizer - ) { - String s = v.utf8ToString(); - try (TokenStream ts = analyzer.tokenStream("text", s)) { - return categorizer.computeCategory(ts, s.length(), 1).getId(); - } catch (IOException e) { - throw new RuntimeException(e); - } + // Categorize cannot be currently folded + return false; } @Override public ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { - return new CategorizeEvaluator.Factory( - source(), - toEvaluator.apply(field), - context -> new CategorizationAnalyzer( - // TODO(jan): get the correct analyzer in here, see CategorizationAnalyzerConfig::buildStandardCategorizationAnalyzer - new CustomAnalyzer( - TokenizerFactory.newFactory("whitespace", WhitespaceTokenizer::new), - new CharFilterFactory[0], - new TokenFilterFactory[0] - ), - true - ), - context -> new TokenListCategorizer.CloseableTokenListCategorizer( - new CategorizationBytesRefHash(new BytesRefHash(2048, context.bigArrays())), - CategorizationPartOfSpeechDictionary.getInstance(), - 0.70f - ) - ); + throw new UnsupportedOperationException("CATEGORIZE is only evaluated during aggregations"); } @Override @@ -134,11 +90,11 @@ protected TypeResolution resolveType() { @Override public DataType dataType() { - return DataType.INTEGER; + return DataType.KEYWORD; } @Override - public Expression replaceChildren(List newChildren) { + public Categorize replaceChildren(List newChildren) { return new Categorize(source(), newChildren.get(0)); } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/CombineProjections.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/CombineProjections.java index 1c256012baeb0..be7096538fb9a 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/CombineProjections.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/CombineProjections.java @@ -15,6 +15,7 @@ import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Expressions; import org.elasticsearch.xpack.esql.core.expression.NamedExpression; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.plan.logical.Aggregate; import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; import org.elasticsearch.xpack.esql.plan.logical.Project; @@ -61,12 +62,15 @@ protected LogicalPlan rule(UnaryPlan plan) { if (plan instanceof Aggregate a) { if (child instanceof Project p) { var groupings = 
a.groupings(); - List groupingAttrs = new ArrayList<>(a.groupings().size()); + List groupingAttrs = new ArrayList<>(a.groupings().size()); for (Expression grouping : groupings) { if (grouping instanceof Attribute attribute) { groupingAttrs.add(attribute); + } else if (grouping instanceof Alias as && as.child() instanceof Categorize) { + groupingAttrs.add(as); } else { - // After applying ReplaceAggregateNestedExpressionWithEval, groupings can only contain attributes. + // After applying ReplaceAggregateNestedExpressionWithEval, + // groupings (except Categorize) can only contain attributes. throw new EsqlIllegalArgumentException("Expected an Attribute, got {}", grouping); } } @@ -137,23 +141,33 @@ private static List combineProjections(List combineUpperGroupingsAndLowerProjections( - List upperGroupings, + List upperGroupings, List lowerProjections ) { // Collect the alias map for resolving the source (f1 = 1, f2 = f1, etc..) - AttributeMap aliases = new AttributeMap<>(); + AttributeMap aliases = new AttributeMap<>(); for (NamedExpression ne : lowerProjections) { - // Projections are just aliases for attributes, so casting is safe. - aliases.put(ne.toAttribute(), (Attribute) Alias.unwrap(ne)); + // record the alias + aliases.put(ne.toAttribute(), Alias.unwrap(ne)); } - // Replace any matching attribute directly with the aliased attribute from the projection. - AttributeSet replaced = new AttributeSet(); - for (Attribute attr : upperGroupings) { - // All substitutions happen before; groupings must be attributes at this point. - replaced.add(aliases.resolve(attr, attr)); + AttributeSet seen = new AttributeSet(); + List replaced = new ArrayList<>(); + for (NamedExpression ne : upperGroupings) { + // Duplicated attributes are ignored. + if (ne instanceof Attribute attribute) { + var newExpression = aliases.resolve(attribute, attribute); + if (newExpression instanceof Attribute newAttribute && seen.add(newAttribute) == false) { + // Already seen, skip + continue; + } + replaced.add(newExpression); + } else { + // For grouping functions, this will replace nested properties too + replaced.add(ne.transformUp(Attribute.class, a -> aliases.resolve(a, a))); + } } - return new ArrayList<>(replaced); + return replaced; } /** diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNull.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNull.java index 0f08cd66444a3..638fa1b8db456 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNull.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNull.java @@ -13,6 +13,7 @@ import org.elasticsearch.xpack.esql.core.expression.Literal; import org.elasticsearch.xpack.esql.core.expression.Nullability; import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.In; public class FoldNull extends OptimizerRules.OptimizerExpressionRule { @@ -42,6 +43,7 @@ public Expression rule(Expression e) { } } else if (e instanceof Alias == false && e.nullable() == Nullability.TRUE + && e instanceof Categorize == false && Expressions.anyMatch(e.children(), Expressions::isNull)) { return Literal.of(e, null); } diff --git 
a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java index 173940af19935..985e68252a1f9 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java @@ -13,6 +13,7 @@ import org.elasticsearch.xpack.esql.core.expression.NamedExpression; import org.elasticsearch.xpack.esql.core.util.Holder; import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.expression.function.grouping.GroupingFunction; import org.elasticsearch.xpack.esql.plan.logical.Aggregate; import org.elasticsearch.xpack.esql.plan.logical.Eval; @@ -46,15 +47,29 @@ protected LogicalPlan rule(Aggregate aggregate) { // start with the groupings since the aggs might duplicate it for (int i = 0, s = newGroupings.size(); i < s; i++) { Expression g = newGroupings.get(i); - // move the alias into an eval and replace it with its attribute + // Move the alias into an eval and replace it with its attribute. + // Exception: Categorize is internal to the aggregation and remains in the groupings. We move its child expression into an eval. if (g instanceof Alias as) { - groupingChanged = true; - var attr = as.toAttribute(); - evals.add(as); - evalNames.put(as.name(), attr); - newGroupings.set(i, attr); - if (as.child() instanceof GroupingFunction gf) { - groupingAttributes.put(gf, attr); + if (as.child() instanceof Categorize cat) { + if (cat.field() instanceof Attribute == false) { + groupingChanged = true; + var fieldAs = new Alias(as.source(), as.name(), cat.field(), null, true); + var fieldAttr = fieldAs.toAttribute(); + evals.add(fieldAs); + evalNames.put(fieldAs.name(), fieldAttr); + Categorize replacement = cat.replaceChildren(List.of(fieldAttr)); + newGroupings.set(i, as.replaceChild(replacement)); + groupingAttributes.put(cat, fieldAttr); + } + } else { + groupingChanged = true; + var attr = as.toAttribute(); + evals.add(as); + evalNames.put(as.name(), attr); + newGroupings.set(i, attr); + if (as.child() instanceof GroupingFunction gf) { + groupingAttributes.put(gf, attr); + } } } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/InsertFieldExtraction.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/InsertFieldExtraction.java index ea9cd76bcb9bc..72573821dfeb8 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/InsertFieldExtraction.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/InsertFieldExtraction.java @@ -12,6 +12,7 @@ import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; import org.elasticsearch.xpack.esql.core.expression.TypedAttribute; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.optimizer.rules.physical.ProjectAwayColumns; import org.elasticsearch.xpack.esql.plan.physical.AggregateExec; import 
org.elasticsearch.xpack.esql.plan.physical.EsQueryExec; @@ -58,11 +59,17 @@ public PhysicalPlan apply(PhysicalPlan plan) { * make sure the fields are loaded for the standard hash aggregator. */ if (p instanceof AggregateExec agg && agg.groupings().size() == 1) { - var leaves = new LinkedList<>(); - // TODO: this seems out of place - agg.aggregates().stream().filter(a -> agg.groupings().contains(a) == false).forEach(a -> leaves.addAll(a.collectLeaves())); - var remove = agg.groupings().stream().filter(g -> leaves.contains(g) == false).toList(); - missing.removeAll(Expressions.references(remove)); + // CATEGORIZE requires the standard hash aggregator as well. + if (agg.groupings().get(0).anyMatch(e -> e instanceof Categorize) == false) { + var leaves = new LinkedList<>(); + // TODO: this seems out of place + agg.aggregates() + .stream() + .filter(a -> agg.groupings().contains(a) == false) + .forEach(a -> leaves.addAll(a.collectLeaves())); + var remove = agg.groupings().stream().filter(g -> leaves.contains(g) == false).toList(); + missing.removeAll(Expressions.references(remove)); + } } // add extractor diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java index 94a9246a56f83..a7418654f6b0e 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java @@ -29,6 +29,7 @@ import org.elasticsearch.xpack.esql.evaluator.EvalMapper; import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction; import org.elasticsearch.xpack.esql.expression.function.aggregate.Count; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.plan.physical.AggregateExec; import org.elasticsearch.xpack.esql.plan.physical.ExchangeSourceExec; import org.elasticsearch.xpack.esql.planner.LocalExecutionPlanner.LocalExecutionPlannerContext; @@ -52,6 +53,7 @@ public final PhysicalOperation groupingPhysicalOperation( PhysicalOperation source, LocalExecutionPlannerContext context ) { + // The layout this operation will produce. Layout.Builder layout = new Layout.Builder(); Operator.OperatorFactory operatorFactory = null; AggregatorMode aggregatorMode = aggregateExec.getMode(); @@ -95,12 +97,17 @@ public final PhysicalOperation groupingPhysicalOperation( List aggregatorFactories = new ArrayList<>(); List groupSpecs = new ArrayList<>(aggregateExec.groupings().size()); for (Expression group : aggregateExec.groupings()) { - var groupAttribute = Expressions.attribute(group); - if (groupAttribute == null) { + Attribute groupAttribute = Expressions.attribute(group); + // In case of `... BY groupAttribute = CATEGORIZE(sourceGroupAttribute)` the actual source attribute is different. + Attribute sourceGroupAttribute = (aggregatorMode.isInputPartial() == false + && group instanceof Alias as + && as.child() instanceof Categorize categorize) ? 
Expressions.attribute(categorize.field()) : groupAttribute; + if (sourceGroupAttribute == null) { throw new EsqlIllegalArgumentException("Unexpected non-named expression[{}] as grouping in [{}]", group, aggregateExec); } - Layout.ChannelSet groupAttributeLayout = new Layout.ChannelSet(new HashSet<>(), groupAttribute.dataType()); - groupAttributeLayout.nameIds().add(groupAttribute.id()); + Layout.ChannelSet groupAttributeLayout = new Layout.ChannelSet(new HashSet<>(), sourceGroupAttribute.dataType()); + groupAttributeLayout.nameIds() + .add(group instanceof Alias as && as.child() instanceof Categorize ? groupAttribute.id() : sourceGroupAttribute.id()); /* * Check for aliasing in aggregates which occurs in two cases (due to combining project + stats): @@ -119,7 +126,7 @@ public final PhysicalOperation groupingPhysicalOperation( // check if there's any alias used in grouping - no need for the final reduction since the intermediate data // is in the output form // if the group points to an alias declared in the aggregate, use the alias child as source - else if (aggregatorMode == AggregatorMode.INITIAL || aggregatorMode == AggregatorMode.INTERMEDIATE) { + else if (aggregatorMode.isOutputPartial()) { if (groupAttribute.semanticEquals(a.toAttribute())) { groupAttribute = attr; break; @@ -129,8 +136,8 @@ else if (aggregatorMode == AggregatorMode.INITIAL || aggregatorMode == Aggregato } } layout.append(groupAttributeLayout); - Layout.ChannelAndType groupInput = source.layout.get(groupAttribute.id()); - groupSpecs.add(new GroupSpec(groupInput == null ? null : groupInput.channel(), groupAttribute)); + Layout.ChannelAndType groupInput = source.layout.get(sourceGroupAttribute.id()); + groupSpecs.add(new GroupSpec(groupInput == null ? null : groupInput.channel(), sourceGroupAttribute, group)); } if (aggregatorMode == AggregatorMode.FINAL) { @@ -164,6 +171,7 @@ else if (aggregatorMode == AggregatorMode.INITIAL || aggregatorMode == Aggregato } else { operatorFactory = new HashAggregationOperatorFactory( groupSpecs.stream().map(GroupSpec::toHashGroupSpec).toList(), + aggregatorMode, aggregatorFactories, context.pageSize(aggregateExec.estimatedRowSize()) ); @@ -178,10 +186,14 @@ else if (aggregatorMode == AggregatorMode.INITIAL || aggregatorMode == Aggregato /*** * Creates a standard layout for intermediate aggregations, typically used across exchanges. * Puts the group first, followed by each aggregation. - * - * It's similar to the code above (groupingPhysicalOperation) but ignores the factory creation. + *
+ * <p>
+ * It's similar to the code above (groupingPhysicalOperation) but ignores the factory creation.
+ * </p>
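+ * <p>
+ * For example (illustrative, following the optimizer tests in this change): for
+ * {@code STATS s = SUM(salary) BY cat = CATEGORIZE(first_name)}, the group attribute {@code cat}
+ * comes first, followed by the intermediate state attributes of the {@code SUM} aggregation.
+ * </p>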
*/ public static List intermediateAttributes(List aggregates, List groupings) { + // TODO: This should take CATEGORIZE into account: + // it currently works because the CATEGORIZE intermediate state is just 1 block with the same type as the function return, + // so the attribute generated here is the expected one var aggregateMapper = new AggregateMapper(); List attrs = new ArrayList<>(); @@ -304,12 +316,20 @@ private static AggregatorFunctionSupplier supplier(AggregateFunction aggregateFu throw new EsqlIllegalArgumentException("aggregate functions must extend ToAggregator"); } - private record GroupSpec(Integer channel, Attribute attribute) { + /** + * The input configuration of this group. + * + * @param channel The source channel of this group + * @param attribute The attribute, source of this group + * @param expression The expression being used to group + */ + private record GroupSpec(Integer channel, Attribute attribute, Expression expression) { BlockHash.GroupSpec toHashGroupSpec() { if (channel == null) { throw new EsqlIllegalArgumentException("planned to use ordinals but tried to use the hash instead"); } - return new BlockHash.GroupSpec(channel, elementType()); + + return new BlockHash.GroupSpec(channel, elementType(), Alias.unwrap(expression) instanceof Categorize); } ElementType elementType() { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index f25b19c4e5d1c..355073fcc873f 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -1821,7 +1821,7 @@ public void testIntervalAsString() { } public void testCategorizeSingleGrouping() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); query("from test | STATS COUNT(*) BY CATEGORIZE(first_name)"); query("from test | STATS COUNT(*) BY cat = CATEGORIZE(first_name)"); @@ -1850,7 +1850,7 @@ public void testCategorizeSingleGrouping() { } public void testCategorizeNestedGrouping() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); query("from test | STATS COUNT(*) BY CATEGORIZE(LENGTH(first_name)::string)"); @@ -1865,7 +1865,7 @@ public void testCategorizeNestedGrouping() { } public void testCategorizeWithinAggregations() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); query("from test | STATS MV_COUNT(cat), COUNT(*) BY cat = CATEGORIZE(first_name)"); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractAggregationTestCase.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractAggregationTestCase.java index db5d8e03458ea..df1675ba22568 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractAggregationTestCase.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractAggregationTestCase.java @@ -111,7 +111,8 @@ protected static List 
withNoRowsExpectingNull(List anyNullIsNull( oc.getExpectedTypeError(), null, null, - null + null, + oc.canBuildEvaluator() ); })); @@ -260,7 +261,8 @@ protected static List anyNullIsNull( oc.getExpectedTypeError(), null, null, - null + null, + oc.canBuildEvaluator() ); })); } @@ -648,18 +650,7 @@ protected static List randomizeBytesRefsOffset(List data, String expectedTypeError) Class foldingExceptionClass, String foldingExceptionMessage, Object extra + ) { + this( + data, + evaluatorToString, + expectedType, + matcher, + expectedWarnings, + expectedBuildEvaluatorWarnings, + expectedTypeError, + foldingExceptionClass, + foldingExceptionMessage, + extra, + data.stream().allMatch(d -> d.forceLiteral || DataType.isRepresentable(d.type)) + ); + } + + TestCase( + List data, + Matcher evaluatorToString, + DataType expectedType, + Matcher matcher, + String[] expectedWarnings, + String[] expectedBuildEvaluatorWarnings, + String expectedTypeError, + Class foldingExceptionClass, + String foldingExceptionMessage, + Object extra, + boolean canBuildEvaluator ) { this.source = Source.EMPTY; this.data = data; @@ -1442,10 +1470,10 @@ public static TestCase typeError(List data, String expectedTypeError) this.expectedWarnings = expectedWarnings; this.expectedBuildEvaluatorWarnings = expectedBuildEvaluatorWarnings; this.expectedTypeError = expectedTypeError; - this.canBuildEvaluator = data.stream().allMatch(d -> d.forceLiteral || DataType.isRepresentable(d.type)); this.foldingExceptionClass = foldingExceptionClass; this.foldingExceptionMessage = foldingExceptionMessage; this.extra = extra; + this.canBuildEvaluator = canBuildEvaluator; } public Source getSource() { @@ -1520,6 +1548,25 @@ public Object extra() { return extra; } + /** + * Build a new {@link TestCase} with new {@link #data}. + */ + public TestCase withData(List data) { + return new TestCase( + data, + evaluatorToString, + expectedType, + matcher, + expectedWarnings, + expectedBuildEvaluatorWarnings, + expectedTypeError, + foldingExceptionClass, + foldingExceptionMessage, + extra, + canBuildEvaluator + ); + } + /** * Build a new {@link TestCase} with new {@link #extra()}. */ @@ -1534,7 +1581,8 @@ public TestCase withExtra(Object extra) { expectedTypeError, foldingExceptionClass, foldingExceptionMessage, - extra + extra, + canBuildEvaluator ); } @@ -1549,7 +1597,8 @@ public TestCase withWarning(String warning) { expectedTypeError, foldingExceptionClass, foldingExceptionMessage, - extra + extra, + canBuildEvaluator ); } @@ -1568,7 +1617,8 @@ public TestCase withBuildEvaluatorWarning(String warning) { expectedTypeError, foldingExceptionClass, foldingExceptionMessage, - extra + extra, + canBuildEvaluator ); } @@ -1592,7 +1642,30 @@ public TestCase withFoldingException(Class clazz, String me expectedTypeError, clazz, message, - extra + extra, + canBuildEvaluator + ); + } + + /** + * Build a new {@link TestCase} that can't build an evaluator. + *
+ * <p>
+ * Useful for special cases that can't be executed, but should still be considered.
+ * </p>
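+ * <p>
+ * Illustrative use, as in the Categorize tests in this change:
+ * {@code new TestCaseSupplier.TestCase(...).withoutEvaluator()}.
+ * </p>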
+ */ + public TestCase withoutEvaluator() { + return new TestCase( + data, + evaluatorToString, + expectedType, + matcher, + expectedWarnings, + expectedBuildEvaluatorWarnings, + expectedTypeError, + foldingExceptionClass, + foldingExceptionMessage, + extra, + false ); } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java index f93389d5cb659..d29ac635e4bb7 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java @@ -23,6 +23,12 @@ import static org.hamcrest.Matchers.equalTo; +/** + * Dummy test implementation for Categorize. Used just to generate documentation. + *
+ * <p>
+ * Most test cases are currently skipped as this function can't build an evaluator.
+ * </p>
+ */ public class CategorizeTests extends AbstractScalarFunctionTestCase { public CategorizeTests(@Name("TestCase") Supplier testCaseSupplier) { this.testCase = testCaseSupplier.get(); @@ -37,11 +43,11 @@ public static Iterable parameters() { "text with " + dataType.typeName(), List.of(dataType), () -> new TestCaseSupplier.TestCase( - List.of(new TestCaseSupplier.TypedData(new BytesRef("blah blah blah"), dataType, "f")), - "CategorizeEvaluator[v=Attribute[channel=0]]", - DataType.INTEGER, - equalTo(0) - ) + List.of(new TestCaseSupplier.TypedData(new BytesRef(""), dataType, "field")), + "", + DataType.KEYWORD, + equalTo(new BytesRef("")) + ).withoutEvaluator() ) ); } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java index c642db1afe286..eaf724456dbb2 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java @@ -57,6 +57,7 @@ import org.elasticsearch.xpack.esql.expression.function.aggregate.ToPartial; import org.elasticsearch.xpack.esql.expression.function.aggregate.Values; import org.elasticsearch.xpack.esql.expression.function.grouping.Bucket; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToDouble; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToInteger; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong; @@ -1203,6 +1204,33 @@ public void testCombineProjectionWithAggregationFirstAndAliasedGroupingUsedInAgg assertThat(Expressions.names(agg.groupings()), contains("first_name")); } + /** + * Expects + * Limit[1000[INTEGER]] + * \_Aggregate[STANDARD,[CATEGORIZE(first_name{f}#18) AS cat],[SUM(salary{f}#22,true[BOOLEAN]) AS s, cat{r}#10]] + * \_EsRelation[test][_meta_field{f}#23, emp_no{f}#17, first_name{f}#18, ..] + */ + public void testCombineProjectionWithCategorizeGrouping() { + var plan = plan(""" + from test + | eval k = first_name, k1 = k + | stats s = sum(salary) by cat = CATEGORIZE(k) + | keep s, cat + """); + + var limit = as(plan, Limit.class); + var agg = as(limit.child(), Aggregate.class); + assertThat(agg.child(), instanceOf(EsRelation.class)); + + assertThat(Expressions.names(agg.aggregates()), contains("s", "cat")); + assertThat(Expressions.names(agg.groupings()), contains("cat")); + + var categorizeAlias = as(agg.groupings().get(0), Alias.class); + var categorize = as(categorizeAlias.child(), Categorize.class); + var categorizeField = as(categorize.field(), FieldAttribute.class); + assertThat(categorizeField.name(), is("first_name")); + } + /** * Expects * Limit[1000[INTEGER]] @@ -3909,6 +3937,39 @@ public void testNestedExpressionsInGroups() { assertThat(eval.fields().get(0).name(), is("emp_no % 2")); } + /** + * Expects + * Limit[1000[INTEGER]] + * \_Aggregate[STANDARD,[CATEGORIZE(CATEGORIZE(CONCAT(first_name, "abc")){r$}#18) AS CATEGORIZE(CONCAT(first_name, "abc"))],[CO + * UNT(salary{f}#13,true[BOOLEAN]) AS c, CATEGORIZE(CONCAT(first_name, "abc")){r}#3]] + * \_Eval[[CONCAT(first_name{f}#9,[61 62 63][KEYWORD]) AS CATEGORIZE(CONCAT(first_name, "abc"))]] + * \_EsRelation[test][_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, ge..] 
+ */ + public void testNestedExpressionsInGroupsWithCategorize() { + var plan = optimizedPlan(""" + from test + | stats c = count(salary) by CATEGORIZE(CONCAT(first_name, "abc")) + """); + + var limit = as(plan, Limit.class); + var agg = as(limit.child(), Aggregate.class); + var groupings = agg.groupings(); + var categorizeAlias = as(groupings.get(0), Alias.class); + var categorize = as(categorizeAlias.child(), Categorize.class); + var aggs = agg.aggregates(); + assertThat(aggs.get(1), is(categorizeAlias.toAttribute())); + + var eval = as(agg.child(), Eval.class); + assertThat(eval.fields(), hasSize(1)); + var evalFieldAlias = as(eval.fields().get(0), Alias.class); + var evalField = as(evalFieldAlias.child(), Concat.class); + + assertThat(evalFieldAlias.name(), is("CATEGORIZE(CONCAT(first_name, \"abc\"))")); + assertThat(categorize.field(), is(evalFieldAlias.toAttribute())); + assertThat(evalField.source().text(), is("CONCAT(first_name, \"abc\")")); + assertThat(categorizeAlias.source(), is(evalFieldAlias.source())); + } + /** * Expects * Limit[1000[INTEGER]] diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java index 89117b5d4e729..ae31576184938 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java @@ -28,6 +28,8 @@ import org.elasticsearch.xpack.esql.expression.function.aggregate.Percentile; import org.elasticsearch.xpack.esql.expression.function.aggregate.SpatialCentroid; import org.elasticsearch.xpack.esql.expression.function.aggregate.Sum; +import org.elasticsearch.xpack.esql.expression.function.grouping.Bucket; +import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToString; import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateExtract; import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat; @@ -267,6 +269,17 @@ public void testNullFoldableDoesNotApplyToIsNullAndNotNull() { } } + public void testNullBucketGetsFolded() { + FoldNull foldNull = new FoldNull(); + assertEquals(NULL, foldNull.rule(new Bucket(EMPTY, NULL, NULL, NULL, NULL))); + } + + public void testNullCategorizeGroupingNotFolded() { + FoldNull foldNull = new FoldNull(); + Categorize categorize = new Categorize(EMPTY, NULL); + assertEquals(categorize, foldNull.rule(categorize)); + } + private void assertNullLiteral(Expression expression) { assertEquals(Literal.class, expression.getClass()); assertNull(expression.fold()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java index d0088edcb0805..e4257270ce641 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java @@ -19,6 +19,7 @@ import org.elasticsearch.search.aggregations.AggregationReduceContext; import org.elasticsearch.search.aggregations.InternalAggregations; import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategory.TokenAndWeight; +import 
org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -83,6 +84,8 @@ public void close() { @Nullable private final CategorizationPartOfSpeechDictionary partOfSpeechDictionary; + private final List categoriesById; + /** * Categories stored in such a way that the most common are accessed first. * This is implemented as an {@link ArrayList} with bespoke ordering rather @@ -108,9 +111,18 @@ public TokenListCategorizer( this.lowerThreshold = threshold; this.upperThreshold = (1.0f + threshold) / 2.0f; this.categoriesByNumMatches = new ArrayList<>(); + this.categoriesById = new ArrayList<>(); cacheRamUsage(0); } + public TokenListCategory computeCategory(String s, CategorizationAnalyzer analyzer) { + try (TokenStream ts = analyzer.tokenStream("text", s)) { + return computeCategory(ts, s.length(), 1); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + public TokenListCategory computeCategory(TokenStream ts, int unfilteredStringLen, long numDocs) throws IOException { assert partOfSpeechDictionary != null : "This version of computeCategory should only be used when a part-of-speech dictionary is available"; @@ -301,6 +313,7 @@ private synchronized TokenListCategory computeCategory( maxUnfilteredStringLen, numDocs ); + categoriesById.add(newCategory); categoriesByNumMatches.add(newCategory); cacheRamUsage(newCategory.ramBytesUsed()); return repositionCategory(newCategory, newIndex); @@ -412,6 +425,17 @@ static float similarity(List left, int leftWeight, List toCategories(int size) { + return categoriesByNumMatches.stream() + .limit(size) + .map(category -> new SerializableTokenListCategory(category, bytesRefHash)) + .toList(); + } + + public List toCategoriesById() { + return categoriesById.stream().map(category -> new SerializableTokenListCategory(category, bytesRefHash)).toList(); + } + public InternalCategorizationAggregation.Bucket[] toOrderedBuckets(int size) { return categoriesByNumMatches.stream() .limit(size)