
Commit defe807

ES|QL categorize with multiple groupings.

1 parent 7ffac3b

8 files changed, +289 -30 lines changed

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java

Lines changed: 7 additions & 5 deletions

@@ -174,13 +174,15 @@ public static BlockHash buildCategorizeBlockHash(
         List<GroupSpec> groups,
         AggregatorMode aggregatorMode,
         BlockFactory blockFactory,
-        AnalysisRegistry analysisRegistry
+        AnalysisRegistry analysisRegistry,
+        int emitBatchSize
     ) {
-        if (groups.size() != 1) {
-            throw new IllegalArgumentException("only a single CATEGORIZE group can used");
+        if (groups.size() == 1) {
+            return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry);
+        } else {
+            assert groups.get(0).isCategorize();
+            return new CategorizePackedValuesBlockHash(groups, blockFactory, aggregatorMode, analysisRegistry, emitBatchSize);
         }
-
-        return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry);
     }

     /**
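
The factory now dispatches on the number of group specs instead of rejecting anything but one: a lone CATEGORIZE keeps the existing single-column hash, and extra groupings route to the new combined hash. A minimal standalone sketch of that shape, with hypothetical stand-ins for the real classes (whose constructors also take the block factory, aggregator mode, analysis registry, and emit batch size shown above):

import java.util.List;

// Hypothetical stand-ins for the classes in org.elasticsearch.compute.aggregation.blockhash.
interface Hash {}
record SingleCategorizeHash(int channel) implements Hash {}
record CategorizeThenPackHash(List<Spec> specs) implements Hash {}
record Spec(int channel, boolean isCategorize) {}

class DispatchSketch {
    static Hash build(List<Spec> groups) {
        if (groups.size() == 1) {
            return new SingleCategorizeHash(groups.get(0).channel());
        }
        // With multiple groupings, CATEGORIZE is expected to be the first spec
        // (the real code asserts this).
        assert groups.get(0).isCategorize();
        return new CategorizeThenPackHash(groups);
    }
}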

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java

Lines changed: 25 additions & 22 deletions

@@ -44,7 +44,7 @@
 import java.util.Objects;

 /**
- * Base BlockHash implementation for {@code Categorize} grouping function.
+ * BlockHash implementation for {@code Categorize} grouping function.
  */
 public class CategorizeBlockHash extends BlockHash {

@@ -95,12 +95,14 @@ public class CategorizeBlockHash extends BlockHash {
         }
     }

+    boolean seenNull() {
+        return seenNull;
+    }
+
     @Override
     public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
-        if (aggregatorMode.isInputPartial() == false) {
-            addInitial(page, addInput);
-        } else {
-            addIntermediate(page, addInput);
+        try (IntBlock block = add(page)) {
+            addInput.add(0, block);
         }
     }

@@ -129,29 +131,28 @@ public void close() {
         Releasables.close(evaluator, categorizer);
     }

+    private IntBlock add(Page page) {
+        return aggregatorMode.isInputPartial() == false ? addInitial(page) : addIntermediate(page);
+    }
+
     /**
      * Adds initial (raw) input to the state.
      */
-    private void addInitial(Page page, GroupingAggregatorFunction.AddInput addInput) {
-        try (IntBlock result = (IntBlock) evaluator.eval(page.getBlock(channel))) {
-            addInput.add(0, result);
-        }
+    IntBlock addInitial(Page page) {
+        return (IntBlock) evaluator.eval(page.getBlock(channel));
     }

     /**
      * Adds intermediate state to the state.
      */
-    private void addIntermediate(Page page, GroupingAggregatorFunction.AddInput addInput) {
+    private IntBlock addIntermediate(Page page) {
         if (page.getPositionCount() == 0) {
-            return;
+            return null;
         }
         BytesRefBlock categorizerState = page.getBlock(channel);
         if (categorizerState.areAllValuesNull()) {
             seenNull = true;
-            try (var newIds = blockFactory.newConstantIntVector(NULL_ORD, 1)) {
-                addInput.add(0, newIds);
-            }
-            return;
+            return blockFactory.newConstantIntBlockWith(NULL_ORD, 1);
         }

         Map<Integer, Integer> idMap = readIntermediate(categorizerState.getBytesRef(0, new BytesRef()));
@@ -161,9 +162,7 @@ private void addIntermediate(Page page, GroupingAggregatorFunction.AddInput addI
         for (int i = fromId; i < toId; i++) {
             newIdsBuilder.appendInt(idMap.get(i));
         }
-        try (IntBlock newIds = newIdsBuilder.build()) {
-            addInput.add(0, newIds);
-        }
+        return newIdsBuilder.build();
     }
 }

@@ -172,7 +171,7 @@ private void addIntermediate(Page page, GroupingAggregatorFunction.AddInput addI
      *
      * @return a map from the old category id to the new one. The old ids go from 0 to {@code size - 1}.
      */
-    private Map<Integer, Integer> readIntermediate(BytesRef bytes) {
+    Map<Integer, Integer> readIntermediate(BytesRef bytes) {
         Map<Integer, Integer> idMap = new HashMap<>();
         try (StreamInput in = new BytesArray(bytes).streamInput()) {
             if (in.readBoolean()) {
@@ -198,15 +197,19 @@ private Block buildIntermediateBlock() {
         if (categorizer.getCategoryCount() == 0) {
             return blockFactory.newConstantNullBlock(seenNull ? 1 : 0);
         }
+        int positionCount = categorizer.getCategoryCount() + (seenNull ? 1 : 0);
+        // We're returning a block with N positions just because the Page must have all blocks with the same position count!
+        return blockFactory.newConstantBytesRefBlockWith(serializeCategorizer(), positionCount);
+    }
+
+    BytesRef serializeCategorizer() {
         try (BytesStreamOutput out = new BytesStreamOutput()) {
             out.writeBoolean(seenNull);
             out.writeVInt(categorizer.getCategoryCount());
             for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
                 category.writeTo(out);
             }
-            // We're returning a block with N positions just because the Page must have all blocks with the same position count!
-            int positionCount = categorizer.getCategoryCount() + (seenNull ? 1 : 0);
-            return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), positionCount);
+            return out.bytes().toBytesRef();
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
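
For orientation: the intermediate state produced by serializeCategorizer() is just the seenNull flag, a category count, and the categories in id order, and readIntermediate() replays those categories into the local categorizer while recording how each incoming id maps to a local one. A rough standalone analogue of that framing, assuming plain java.io streams in place of Elasticsearch's BytesStreamOutput/StreamInput and strings in place of SerializableTokenListCategory:

import java.io.*;
import java.util.*;

class CategorizerStateSketch {
    // Write: seenNull flag, category count, then each category in id order.
    static byte[] serialize(boolean seenNull, List<String> categories) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(bytes)) {
            out.writeBoolean(seenNull);
            out.writeInt(categories.size()); // the real code uses a variable-length int
            for (String category : categories) {
                out.writeUTF(category);
            }
        }
        return bytes.toByteArray();
    }

    // Read the state back, merging into `local` and returning old-id -> new-id.
    static Map<Integer, Integer> readIntermediate(byte[] state, List<String> local) throws IOException {
        Map<Integer, Integer> idMap = new HashMap<>();
        try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(state))) {
            boolean seenNull = in.readBoolean(); // the real code reserves id 0 (NULL_ORD) when set
            int count = in.readInt();
            for (int oldId = 0; oldId < count; oldId++) {
                String category = in.readUTF();
                int newId = local.indexOf(category);
                if (newId < 0) {
                    newId = local.size();
                    local.add(category);
                }
                idMap.put(oldId, newId);
            }
        }
        return idMap;
    }
}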
x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java

Lines changed: 156 additions & 0 deletions

@@ -0,0 +1,156 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.compute.aggregation.blockhash;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.common.io.stream.BytesStreamOutput;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.util.BigArrays;
+import org.elasticsearch.common.util.BitArray;
+import org.elasticsearch.compute.aggregation.AggregatorMode;
+import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
+import org.elasticsearch.compute.data.Block;
+import org.elasticsearch.compute.data.BlockFactory;
+import org.elasticsearch.compute.data.BytesRefBlock;
+import org.elasticsearch.compute.data.ElementType;
+import org.elasticsearch.compute.data.IntBlock;
+import org.elasticsearch.compute.data.IntVector;
+import org.elasticsearch.compute.data.Page;
+import org.elasticsearch.core.ReleasableIterator;
+import org.elasticsearch.core.Releasables;
+import org.elasticsearch.index.analysis.AnalysisRegistry;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * BlockHash implementation for {@code Categorize} grouping function as first
+ * grouping expression, followed by one or more other grouping expressions.
+ */
+public class CategorizePackedValuesBlockHash extends BlockHash {
+
+    private final AggregatorMode aggregatorMode;
+    private final List<GroupSpec> specs;
+    private final CategorizeBlockHash categorizeBlockHash;
+    private final PackedValuesBlockHash packedValuesBlockHash;
+
+    CategorizePackedValuesBlockHash(
+        List<GroupSpec> specs,
+        BlockFactory blockFactory,
+        AggregatorMode aggregatorMode,
+        AnalysisRegistry analysisRegistry,
+        int emitBatchSize
+    ) {
+        super(blockFactory);
+        this.aggregatorMode = aggregatorMode;
+        this.specs = specs;
+        categorizeBlockHash = new CategorizeBlockHash(blockFactory, specs.get(0).channel(), aggregatorMode, analysisRegistry);
+
+        List<GroupSpec> newSpecs = new ArrayList<>(specs);
+        newSpecs.set(0, new GroupSpec(-1, ElementType.INT));
+        packedValuesBlockHash = new PackedValuesBlockHash(newSpecs, blockFactory, emitBatchSize);
+
+        // TODO: close stuff upon failure
+    }
+
+    @Override
+    public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
+        try (IntBlock categories = getCategories(page)) {
+            packedValuesBlockHash.add(page.appendBlock(categories), addInput);
+        }
+    }
+
+    private IntBlock getCategories(Page page) {
+        if (aggregatorMode.isInputPartial() == false) {
+            return categorizeBlockHash.addInitial(page);
+        } else {
+            BytesRefBlock stateBlock = page.getBlock(0);
+            BytesRef stateBytes = stateBlock.getBytesRef(0, new BytesRef());
+
+            try (StreamInput in = new BytesArray(stateBytes).streamInput()) {
+                BytesRef categorizerState = in.readBytesRef();
+                Map<Integer, Integer> idMap = categorizeBlockHash.readIntermediate(categorizerState);
+                int[] oldIds = in.readIntArray();
+                try (IntBlock.Builder newIds = blockFactory.newIntBlockBuilder(page.getPositionCount())) {
+                    for (int oldId : oldIds) {
+                        newIds.appendInt(idMap.get(oldId));
+                    }
+                    return newIds.build();
+                }
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Override
+    public Block[] getKeys() {
+        Block[] keys = packedValuesBlockHash.getKeys();
+        if (aggregatorMode.isOutputPartial() == false) {
+            try (
+                BytesRefBlock regexes = (BytesRefBlock) categorizeBlockHash.getKeys()[0];
+                BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(keys[0].getPositionCount())
+            ) {
+                IntVector idsVector = (IntVector) keys[0].asVector();
+                int idsOffset = categorizeBlockHash.seenNull() ? 0 : -1;
+                BytesRef scratch = new BytesRef();
+                for (int i = 0; i < idsVector.getPositionCount(); i++) {
+                    int id = idsVector.getInt(i);
+                    if (id == 0) {
+                        builder.appendNull();
+                    } else {
+                        builder.appendBytesRef(regexes.getBytesRef(id + idsOffset, scratch));
+                    }
+                }
+                keys[0].close();
+                keys[0] = builder.build();
+            }
+        } else {
+            BytesRef state;
+            try (BytesStreamOutput out = new BytesStreamOutput()) {
+                out.writeBytesRef(categorizeBlockHash.serializeCategorizer());
+                IntVector idsVector = (IntVector) keys[0].asVector();
+                int[] idsArray = new int[idsVector.getPositionCount()];
+                for (int i = 0; i < idsVector.getPositionCount(); i++) {
+                    idsArray[i] = idsVector.getInt(i);
+                }
+                out.writeIntArray(idsArray);
+                state = out.bytes().toBytesRef();
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            keys[0].close();
+            keys[0] = blockFactory.newConstantBytesRefBlockWith(state, keys[0].getPositionCount());
+        }
+        return keys;
+    }
+
+    @Override
+    public IntVector nonEmpty() {
+        return packedValuesBlockHash.nonEmpty();
+    }
+
+    @Override
+    public BitArray seenGroupIds(BigArrays bigArrays) {
+        return packedValuesBlockHash.seenGroupIds(bigArrays);
+    }
+
+    @Override
+    public final ReleasableIterator<IntBlock> lookup(Page page, ByteSizeValue targetBlockSize) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void close() {
+        Releasables.close(categorizeBlockHash, packedValuesBlockHash);
+    }
+}
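
Putting the pieces together: add() turns the message column into an IntBlock of category ids (raw input) or remaps ids arriving from another node (partial input), appends that block to the page, and lets PackedValuesBlockHash group on the ids together with the remaining columns; getKeys() then swaps the id column back for the category regexes in the final output. A standalone sketch of that last substitution, with plain arrays standing in for blocks:

class KeyRebuildSketch {
    // Mirrors the final-output branch of getKeys(): ordinal 0 is the null group,
    // and the regex index shifts by one when no null was ever seen.
    static String[] idsToRegexes(int[] ids, String[] regexes, boolean seenNull) {
        int idsOffset = seenNull ? 0 : -1;
        String[] keys = new String[ids.length];
        for (int i = 0; i < ids.length; i++) {
            keys[i] = ids[i] == 0 ? null : regexes[ids[i] + idsOffset];
        }
        return keys;
    }
}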

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/data/Page.java

Lines changed: 3 additions & 0 deletions

@@ -131,6 +131,9 @@ public <B extends Block> B getBlock(int blockIndex) {
         if (blocksReleased) {
             throw new IllegalStateException("can't read released page");
         }
+        if (blockIndex < 0) {
+            blockIndex += blocks.length;
+        }
         @SuppressWarnings("unchecked")
         B block = (B) blocks[blockIndex];
         if (block.isReleased()) {
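
getBlock() now accepts negative indices that count back from the end, so page.getBlock(-1) returns the last block; this is what lets CategorizePackedValuesBlockHash reference the freshly appended category-id block through GroupSpec(-1, ElementType.INT) without knowing how many blocks the page started with. A minimal sketch of the resolution rule:

class NegativeIndexSketch {
    // Mirrors the new bounds handling in Page.getBlock(): negative indices wrap.
    static int resolve(int blockIndex, int blockCount) {
        if (blockIndex < 0) {
            blockIndex += blockCount;
        }
        return blockIndex;
    }

    public static void main(String[] args) {
        System.out.println(resolve(-1, 4)); // prints 3: the last block of a 4-block page
    }
}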

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/HashAggregationOperator.java

Lines changed: 7 additions & 1 deletion

@@ -51,7 +51,13 @@ public Operator get(DriverContext driverContext) {
         if (groups.stream().anyMatch(BlockHash.GroupSpec::isCategorize)) {
             return new HashAggregationOperator(
                 aggregators,
-                () -> BlockHash.buildCategorizeBlockHash(groups, aggregatorMode, driverContext.blockFactory(), analysisRegistry),
+                () -> BlockHash.buildCategorizeBlockHash(
+                    groups,
+                    aggregatorMode,
+                    driverContext.blockFactory(),
+                    analysisRegistry,
+                    maxPageSize
+                ),
                 driverContext
             );
         }

x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec

Lines changed: 81 additions & 0 deletions

@@ -592,3 +592,84 @@ COUNT():long | x:keyword
 3 | [.*?Connection.+?error.*?,.*?Connection.+?error.*?]
 1 | [.*?Disconnected.*?,.*?Disconnected.*?]
 ;
+
+multiple groupings with categorize and ip
+required_capability: categorize_multiple_groupings
+
+FROM sample_data
+| STATS count=COUNT() BY category=CATEGORIZE(message), client_ip
+| SORT category, client_ip
+;
+
+count:long | category:keyword | client_ip:ip
+1 | .*?Connected.+?to.*? | 172.21.2.113
+1 | .*?Connected.+?to.*? | 172.21.2.162
+1 | .*?Connected.+?to.*? | 172.21.3.15
+3 | .*?Connection.+?error.*? | 172.21.3.15
+1 | .*?Disconnected.*? | 172.21.0.5
+;
+
+multiple groupings with categorize and bucketed timestamp
+required_capability: categorize_multiple_groupings
+
+FROM sample_data
+| STATS count=COUNT() BY category=CATEGORIZE(message), timestamp=BUCKET(@timestamp, 1 HOUR)
+| SORT category, timestamp
+;
+
+count:long | category:keyword | timestamp:datetime
+2 | .*?Connected.+?to.*? | 2023-10-23T12:00:00.000Z
+1 | .*?Connected.+?to.*? | 2023-10-23T13:00:00.000Z
+3 | .*?Connection.+?error.*? | 2023-10-23T13:00:00.000Z
+1 | .*?Disconnected.*? | 2023-10-23T13:00:00.000Z
+;
+
+multiple groupings with categorize and nulls
+required_capability: categorize_multiple_groupings
+
+FROM employees
+| STATS SUM(languages) BY category=CATEGORIZE(job_positions), gender
+| SORT category DESC, gender ASC
+| LIMIT 5
+;
+
+SUM(languages):long | category:keyword | gender:keyword
+11 | null | F
+16 | null | M
+14 | .*?Tech.+?Lead.*? | F
+23 | .*?Tech.+?Lead.*? | M
+9 | .*?Tech.+?Lead.*? | null
+;
+
+multiple groupings with categorize and a field that's always null
+required_capability: categorize_multiple_groupings
+
+FROM sample_data
+| EVAL nullfield = null
+| STATS count=COUNT() BY category=CATEGORIZE(nullfield), client_ip
+| SORT client_ip
+;
+
+count:long | category:keyword | client_ip:ip
+1 | null | 172.21.0.5
+1 | null | 172.21.2.113
+1 | null | 172.21.2.162
+4 | null | 172.21.3.15
+;
+
+
+multiple groupings with categorize and the same text field
+required_capability: categorize_multiple_groupings
+
+FROM sample_data
+| STATS count=COUNT() BY category=CATEGORIZE(message), message
+| SORT message
+;
+
+count:long | category:keyword | message:keyword
+1 | .*?Connected.+?to.*? | Connected to 10.1.0.1
+1 | .*?Connected.+?to.*? | Connected to 10.1.0.2
+1 | .*?Connected.+?to.*? | Connected to 10.1.0.3
+3 | .*?Connection.+?error.*? | Connection error
+1 | .*?Disconnected.*? | Disconnected
+;
