
Commit a309133

Move new block hashes to typical location

This makes them easier to test.

1 parent f2d1806 commit a309133

File tree

4 files changed: +325 -174 lines changed
AbstractCategorizeBlockHash.java
@@ -0,0 +1,70 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.core.ReleasableIterator;
import org.elasticsearch.xpack.ml.aggs.categorization.SerializableTokenListCategory;
import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer;

import java.io.IOException;

public abstract class AbstractCategorizeBlockHash extends BlockHash {
    private final boolean outputPartial;
    protected final TokenListCategorizer.CloseableTokenListCategorizer categorizer;

    AbstractCategorizeBlockHash(
        BlockFactory blockFactory,
        boolean outputPartial,
        TokenListCategorizer.CloseableTokenListCategorizer categorizer
    ) {
        super(blockFactory);
        this.outputPartial = outputPartial;
        this.categorizer = categorizer;
    }

    @Override
    public Block[] getKeys() {
        if (outputPartial) {
            // NOCOMMIT load partial
            Block state = null;
            Block keys; // NOCOMMIT do we even need to send the keys? it's just going to be 0 to the length of state
            // return new Block[] {new CompositeBlock()};
            return null;
        }

        // NOCOMMIT load final
        return new Block[0];
    }

    @Override
    public final ReleasableIterator<IntBlock> lookup(Page page, ByteSizeValue targetBlockSize) {
        throw new UnsupportedOperationException();
    }

    // Serializes the categorizer state into a single constant BytesRef block:
    // a VInt category count followed by each SerializableTokenListCategory.
    private Block buildIntermediateBlock(BlockFactory blockFactory, int positionCount) {
        if (categorizer.getCategoryCount() == 0) {
            return blockFactory.newConstantNullBlock(positionCount);
        }
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            // TODO be more careful here.
            out.writeVInt(categorizer.getCategoryCount());
            for (SerializableTokenListCategory category : categorizer.toCategories(categorizer.getCategoryCount())) {
                category.writeTo(out);
            }
            return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), positionCount);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
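For context, buildIntermediateBlock writes a simple wire format: a VInt category count followed by each serialized category. Below is a minimal round-trip sketch of that framing using the same stream classes; plain strings stand in for SerializableTokenListCategory, so the payload type here is an assumption made for brevity.

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;

import java.io.IOException;
import java.util.List;

public class IntermediateFramingDemo {
    public static void main(String[] args) throws IOException {
        List<String> categories = List.of(".*error.*", ".*shutdown.*");

        // Write side: VInt count, then one entry per category
        // (strings stand in for SerializableTokenListCategory.writeTo).
        BytesRef bytes;
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            out.writeVInt(categories.size());
            for (String c : categories) {
                out.writeString(c);
            }
            bytes = out.bytes().toBytesRef();
        }

        // Read side mirrors readIntermediate in CategorizedIntermediateBlockHash.
        try (StreamInput in = new BytesArray(bytes).streamInput()) {
            int count = in.readVInt();
            for (int id = 0; id < count; id++) {
                System.out.println("category " + id + ": " + in.readString());
            }
        }
    }
}

readIntermediate in CategorizedIntermediateBlockHash below reads exactly this shape, substituting new SerializableTokenListCategory(in) for readString().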
CategorizeRawBlockHash.java
@@ -0,0 +1,160 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BitArray;
import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
import org.elasticsearch.compute.aggregation.Warnings;
import org.elasticsearch.compute.ann.Fixed;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.BytesRefBlock;
import org.elasticsearch.compute.data.BytesRefVector;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.compute.operator.DriverContext;
import org.elasticsearch.compute.operator.EvalOperator;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;

import java.io.IOException;

public class CategorizeRawBlockHash extends AbstractCategorizeBlockHash {
    private final CategorizeEvaluator evaluator;

    CategorizeRawBlockHash(
        BlockFactory blockFactory,
        boolean outputPartial,
        TokenListCategorizer.CloseableTokenListCategorizer categorizer,
        CategorizeEvaluator evaluator
    ) {
        super(blockFactory, outputPartial, categorizer);
        this.evaluator = evaluator;
    }

    @Override
    public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
        IntBlock result = (IntBlock) evaluator.eval(page);
        addInput.add(0, result);
    }

    @Override
    public IntVector nonEmpty() {
        // TODO
        return null;
    }

    @Override
    public BitArray seenGroupIds(BigArrays bigArrays) {
        // TODO
        return null;
    }

    @Override
    public void close() {
        // TODO
    }

    /**
     * NOCOMMIT: Super-duper copy-pasted.
     */
    public static final class CategorizeEvaluator implements EvalOperator.ExpressionEvaluator {
        private final Warnings warnings;

        private final EvalOperator.ExpressionEvaluator v;

        private final CategorizationAnalyzer analyzer;

        private final TokenListCategorizer.CloseableTokenListCategorizer categorizer;

        private final DriverContext driverContext;

        static int process(
            BytesRef v,
            @Fixed(includeInToString = false, build = true) CategorizationAnalyzer analyzer,
            @Fixed(includeInToString = false, build = true) TokenListCategorizer.CloseableTokenListCategorizer categorizer
        ) {
            String s = v.utf8ToString();
            try (TokenStream ts = analyzer.tokenStream("text", s)) {
                return categorizer.computeCategory(ts, s.length(), 1).getId();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        public CategorizeEvaluator(
            EvalOperator.ExpressionEvaluator v,
            CategorizationAnalyzer analyzer,
            TokenListCategorizer.CloseableTokenListCategorizer categorizer,
            DriverContext driverContext
        ) {
            this.v = v;
            this.analyzer = analyzer;
            this.categorizer = categorizer;
            this.driverContext = driverContext;
            this.warnings = Warnings.createWarnings(driverContext.warningsMode(), -1, -1, "");
        }

        @Override
        public Block eval(Page page) {
            try (BytesRefBlock vBlock = (BytesRefBlock) v.eval(page)) {
                BytesRefVector vVector = vBlock.asVector();
                if (vVector == null) {
                    return eval(page.getPositionCount(), vBlock);
                }
                return eval(page.getPositionCount(), vVector).asBlock();
            }
        }

        public IntBlock eval(int positionCount, BytesRefBlock vBlock) {
            try (IntBlock.Builder result = driverContext.blockFactory().newIntBlockBuilder(positionCount)) {
                BytesRef vScratch = new BytesRef();
                position: for (int p = 0; p < positionCount; p++) {
                    if (vBlock.isNull(p)) {
                        result.appendNull();
                        continue position;
                    }
                    if (vBlock.getValueCount(p) != 1) {
                        if (vBlock.getValueCount(p) > 1) {
                            warnings.registerException(new IllegalArgumentException("single-value function encountered multi-value"));
                        }
                        result.appendNull();
                        continue position;
                    }
                    result.appendInt(process(vBlock.getBytesRef(vBlock.getFirstValueIndex(p), vScratch), this.analyzer, this.categorizer));
                }
                return result.build();
            }
        }

        public IntVector eval(int positionCount, BytesRefVector vVector) {
            try (IntVector.FixedBuilder result = driverContext.blockFactory().newIntVectorFixedBuilder(positionCount)) {
                BytesRef vScratch = new BytesRef();
                position: for (int p = 0; p < positionCount; p++) {
                    result.appendInt(p, process(vVector.getBytesRef(p, vScratch), this.analyzer, this.categorizer));
                }
                return result.build();
            }
        }

        @Override
        public String toString() {
            return "CategorizeEvaluator[" + "v=" + v + "]";
        }

        @Override
        public void close() {
            Releasables.closeExpectNoException(v, analyzer, categorizer);
        }
    }
}
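The static process(...) above drives the categorizer through the standard Lucene TokenStream lifecycle. For readers unfamiliar with that contract, here is a self-contained sketch; StandardAnalyzer is an assumption standing in for the CategorizationAnalyzer the real code receives.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;

public class TokenStreamDemo {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("text", "Node stopped: out of memory")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term); // each token the categorizer would see
            }
            ts.end();                     // records final offset state
        }                                 // try-with-resources closes the stream
    }
}

The reset(), end(), and close() calls are required parts of the TokenStream contract; modern Lucene throws IllegalStateException if reset() is skipped.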
CategorizedIntermediateBlockHash.java
@@ -0,0 +1,95 @@

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BitArray;
import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.BytesRefBlock;
import org.elasticsearch.compute.data.CompositeBlock;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.xpack.ml.aggs.categorization.SerializableTokenListCategory;
import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

public class CategorizedIntermediateBlockHash extends AbstractCategorizeBlockHash {
    private final IntBlockHash hash;
    private final int channel;

    CategorizedIntermediateBlockHash(
        BlockFactory blockFactory,
        boolean outputPartial,
        TokenListCategorizer.CloseableTokenListCategorizer categorizer,
        IntBlockHash hash,
        int channel
    ) {
        super(blockFactory, outputPartial, categorizer);
        this.hash = hash;
        this.channel = channel;
    }

    @Override
    public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
        CompositeBlock block = page.getBlock(channel);
        BytesRefBlock groupingState = block.getBlock(0);
        BytesRefBlock groups = block.getBlock(0);
        Map<Integer, Integer> idMap;
        if (groupingState.areAllValuesNull() == false) {
            idMap = readIntermediate(groupingState.getBytesRef(0, new BytesRef()));
        } else {
            idMap = Collections.emptyMap();
        }
        try (IntBlock.Builder newIdsBuilder = blockFactory.newIntBlockBuilder(groups.getTotalValueCount())) {
            for (int i = 0; i < groups.getTotalValueCount(); i++) {
                newIdsBuilder.appendInt(idMap.get(i));
            }
            IntBlock newIds = newIdsBuilder.build();
            addInput.add(0, hash.add(newIds));
        }
    }

    // Reads the VInt-framed category list written by buildIntermediateBlock,
    // merges each category into this hash's categorizer, and records the
    // mapping from the sender's category ids to the merged ids.
    private Map<Integer, Integer> readIntermediate(BytesRef bytes) {
        Map<Integer, Integer> idMap = new HashMap<>();
        try (StreamInput in = new BytesArray(bytes).streamInput()) {
            int count = in.readVInt();
            for (int oldCategoryId = 0; oldCategoryId < count; oldCategoryId++) {
                SerializableTokenListCategory category = new SerializableTokenListCategory(in);
                int newCategoryId = categorizer.mergeWireCategory(category).getId();
                System.err.println("category id map: " + oldCategoryId + " -> " + newCategoryId + " (" + category.getRegex() + ")");
                idMap.put(oldCategoryId, newCategoryId);
            }
            return idMap;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public IntVector nonEmpty() {
        return hash.nonEmpty();
    }

    @Override
    public BitArray seenGroupIds(BigArrays bigArrays) {
        return hash.seenGroupIds(bigArrays);
    }

    @Override
    public void close() {

    }
}
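readIntermediate returns an old-to-new id map, which add(...) uses to rewrite the sender's category ids into this node's merged ids. The remap itself is ordinary map lookup; below is a minimal sketch with hypothetical ids (the map values and incoming group ids are invented for illustration).

import java.util.Arrays;
import java.util.Map;

public class IdRemapDemo {
    public static void main(String[] args) {
        // Hypothetical mapping from the sender's category ids to the ids
        // assigned by this node's categorizer after mergeWireCategory.
        Map<Integer, Integer> idMap = Map.of(0, 3, 1, 0, 2, 1);

        int[] incomingGroups = {0, 2, 1, 0};
        int[] remapped = new int[incomingGroups.length];
        for (int i = 0; i < incomingGroups.length; i++) {
            // Rewrite each old id through the map, as the IntBlock rebuild does.
            remapped[i] = idMap.get(incomingGroups[i]);
        }
        System.out.println(Arrays.toString(remapped)); // prints [3, 1, 0, 3]
    }
}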
