
Commit 239d159

Add aggregator to unit test
1 parent 3f94143 commit 239d159

4 files changed: +70 −31 lines

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java

Lines changed: 3 additions & 3 deletions
@@ -73,10 +73,10 @@ private Block buildIntermediateBlock() {
         try (BytesStreamOutput out = new BytesStreamOutput()) {
             // TODO be more careful here.
             out.writeVInt(categorizer.getCategoryCount());
-            for (SerializableTokenListCategory category : categorizer.toCategories(categorizer.getCategoryCount())) {
+            for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
                 category.writeTo(out);
             }
-            return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), 1);
+            return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), categorizer.getCategoryCount());
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
@@ -85,7 +85,7 @@ private Block buildIntermediateBlock() {
     private Block buildFinalBlock() {
         try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) {
             BytesRefBuilder scratch = new BytesRefBuilder();
-            for (SerializableTokenListCategory category : categorizer.toCategories(categorizer.getCategoryCount())) {
+            for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
                 scratch.copyChars(category.getRegex());
                 result.appendBytesRef(scratch.get());
                 scratch.clear();
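
With this change the intermediate block is a count-prefixed byte layout, written in category-id order, and its position count now equals the category count instead of a constant 1. As a rough illustration only (not code from this commit), the matching read side would pull the count back out and deserialize one category per entry; here `intermediateBlock` is a hypothetical variable for the block built above, and a `SerializableTokenListCategory(StreamInput)` constructor is assumed per the usual Writeable pattern:

    // Hedged sketch: decode the intermediate state produced by buildIntermediateBlock().
    // Assumes SerializableTokenListCategory(StreamInput) exists (standard Writeable pattern).
    BytesRef serialized = intermediateBlock.getBytesRef(0, new BytesRef());
    try (StreamInput in = new BytesArray(serialized).streamInput()) {
        int categoryCount = in.readVInt();                          // matches out.writeVInt(...) above
        List<SerializableTokenListCategory> categories = new ArrayList<>(categoryCount);
        for (int i = 0; i < categoryCount; i++) {
            categories.add(new SerializableTokenListCategory(in));  // entry i corresponds to category id i
        }
    }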

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,7 @@ public class CategorizedIntermediateBlockHash extends AbstractCategorizeBlockHash
         this.hash = new IntBlockHash(channel, blockFactory);
     }
 
+    @Override
     public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
         BytesRefBlock categorizerState = page.getBlock(channel());
         Map<Integer, Integer> idMap;
@@ -60,6 +61,8 @@ public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
         } else {
            idMap = Collections.emptyMap();
         }
+        // TODO: when there are aggregators running, this renumbering doesn't work.
+        // This should renumber the destination IDs only, but it also renumbers the source IDs.
         try (IntBlock.Builder newIdsBuilder = blockFactory.newIntBlockBuilder(idMap.size())) {
             for (int i = 0; i < idMap.size(); i++) {
                 newIdsBuilder.appendInt(idMap.get(i));
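
The idMap built just above translates the category ids embedded in this page's serialized state into the ids of the combined categorizer, and the loop emits one remapped id per position. A standalone illustration of that remapping with plain collections (not the block-based code in this file), using a hypothetical three-category page:

    // Standalone sketch: local (per-page) category id -> id in the combined categorizer.
    Map<Integer, Integer> idMap = Map.of(0, 4, 1, 2, 2, 7);   // hypothetical mapping
    int[] remappedIds = new int[idMap.size()];
    for (int i = 0; i < idMap.size(); i++) {
        remappedIds[i] = idMap.get(i);                        // position i carried local id i; emit its combined id
    }
    // Per the TODO above, once aggregators consume these ids the source ids must not be renumbered this way.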

x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java

Lines changed: 56 additions & 28 deletions
@@ -26,6 +26,7 @@
 import org.elasticsearch.compute.data.ElementType;
 import org.elasticsearch.compute.data.IntBlock;
 import org.elasticsearch.compute.data.IntVector;
+import org.elasticsearch.compute.data.LongBlock;
 import org.elasticsearch.compute.data.LongVector;
 import org.elasticsearch.compute.data.Page;
 import org.elasticsearch.compute.operator.CannedSourceOperator;
@@ -45,13 +46,14 @@
 import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
 import static org.elasticsearch.compute.operator.OperatorTestCase.runDriver;
-import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 
@@ -244,24 +246,41 @@ public void testCategorize_withDriver() {
         DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays));
 
         LocalSourceOperator.BlockSupplier input1 = () -> {
-            try (BytesRefVector.Builder textsBuilder = driverContext.blockFactory().newBytesRefVectorBuilder(10)) {
+            try (
+                BytesRefVector.Builder textsBuilder = driverContext.blockFactory().newBytesRefVectorBuilder(10);
+                LongVector.Builder countsBuilder = driverContext.blockFactory().newLongVectorBuilder(10)
+            ) {
                 textsBuilder.appendBytesRef(new BytesRef("a"));
                 textsBuilder.appendBytesRef(new BytesRef("b"));
                 textsBuilder.appendBytesRef(new BytesRef("words words words goodbye jan"));
                 textsBuilder.appendBytesRef(new BytesRef("words words words goodbye nik"));
                 textsBuilder.appendBytesRef(new BytesRef("words words words hello jan"));
                 textsBuilder.appendBytesRef(new BytesRef("c"));
-                return new Block[] { textsBuilder.build().asBlock() };
+                countsBuilder.appendLong(11);
+                countsBuilder.appendLong(22);
+                countsBuilder.appendLong(800);
+                countsBuilder.appendLong(80);
+                countsBuilder.appendLong(900);
+                countsBuilder.appendLong(30);
+                return new Block[] { textsBuilder.build().asBlock(), countsBuilder.build().asBlock() };
             }
         };
         LocalSourceOperator.BlockSupplier input2 = () -> {
-            try (BytesRefVector.Builder builder = driverContext.blockFactory().newBytesRefVectorBuilder(10)) {
-                builder.appendBytesRef(new BytesRef("words words words hello nik"));
-                builder.appendBytesRef(new BytesRef("c"));
-                builder.appendBytesRef(new BytesRef("words words words goodbye chris"));
-                builder.appendBytesRef(new BytesRef("d"));
-                builder.appendBytesRef(new BytesRef("e"));
-                return new Block[] { builder.build().asBlock() };
+            try (
+                BytesRefVector.Builder textsBuilder = driverContext.blockFactory().newBytesRefVectorBuilder(10);
+                LongVector.Builder countsBuilder = driverContext.blockFactory().newLongVectorBuilder(10)
+            ) {
+                textsBuilder.appendBytesRef(new BytesRef("words words words hello nik"));
+                textsBuilder.appendBytesRef(new BytesRef("c"));
+                textsBuilder.appendBytesRef(new BytesRef("words words words goodbye chris"));
+                textsBuilder.appendBytesRef(new BytesRef("d"));
+                textsBuilder.appendBytesRef(new BytesRef("e"));
+                countsBuilder.appendLong(99);
+                countsBuilder.appendLong(3);
+                countsBuilder.appendLong(8);
+                countsBuilder.appendLong(44);
+                countsBuilder.appendLong(55);
+                return new Block[] { textsBuilder.build().asBlock(), countsBuilder.build().asBlock() };
             }
         };
         List<Page> intermediateOutput = new ArrayList<>();
@@ -273,7 +292,7 @@ public void testCategorize_withDriver() {
             List.of(
                 new HashAggregationOperator.HashAggregationOperatorFactory(
                     List.of(new BlockHash.GroupSpec(0, ElementType.CATEGORY_RAW)),
-                    List.of(),
+                    List.of(new SumLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL)),
                     16 * 1024
                 ).get(driverContext)
             ),
@@ -288,7 +307,7 @@ public void testCategorize_withDriver() {
             List.of(
                 new HashAggregationOperator.HashAggregationOperatorFactory(
                     List.of(new BlockHash.GroupSpec(0, ElementType.CATEGORY_RAW)),
-                    List.of(),
+                    List.of(new SumLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL)),
                     16 * 1024
                 ).get(driverContext)
             ),
@@ -303,7 +322,7 @@ public void testCategorize_withDriver() {
             List.of(
                 new HashAggregationOperator.HashAggregationOperatorFactory(
                     List.of(new BlockHash.GroupSpec(0, ElementType.CATEGORY_INTERMEDIATE)),
-                    List.of(),
+                    List.of(new SumLongAggregatorFunctionSupplier(List.of(1)).groupingAggregatorFactory(AggregatorMode.INITIAL)),
                     16 * 1024
                 ).get(driverContext)
             ),
@@ -313,23 +332,32 @@ public void testCategorize_withDriver() {
         runDriver(driver);
 
         assertThat(finalOutput, hasSize(1));
-        assertThat(finalOutput.get(0).getBlockCount(), equalTo(1));
-        BytesRefBlock block = finalOutput.get(0).getBlock(0);
-        BytesRefVector vector = block.asVector();
-        List<String> values = new ArrayList<>();
-        for (int p = 0; p < vector.getPositionCount(); p++) {
-            values.add(vector.getBytesRef(p, new BytesRef()).utf8ToString());
+        assertThat(finalOutput.get(0).getBlockCount(), equalTo(3));
+        BytesRefVector textsVector = ((BytesRefBlock) finalOutput.get(0).getBlock(0)).asVector();
+        LongVector countsVector = ((LongBlock) finalOutput.get(0).getBlock(1)).asVector();
+        Map<String, Long> counts = new HashMap<>();
+        for (int i = 0; i < countsVector.getPositionCount(); i++) {
+            counts.put(textsVector.getBytesRef(i, new BytesRef()).utf8ToString(), countsVector.getLong(i));
         }
         assertThat(
-            values,
-            containsInAnyOrder(
-                ".*?a.*?",
-                ".*?b.*?",
-                ".*?c.*?",
-                ".*?d.*?",
-                ".*?e.*?",
-                ".*?words.+?words.+?words.+?goodbye.*?",
-                ".*?words.+?words.+?words.+?hello.*?"
+            counts,
+            equalTo(
+                Map.of(
+                    ".*?a.*?",
+                    11,
+                    ".*?b.*?",
+                    22,
+                    ".*?c.*?",
+                    33,
+                    ".*?d.*?",
+                    44,
+                    ".*?e.*?",
+                    55,
+                    ".*?words.+?words.+?words.+?goodbye.*?",
+                    888,
+                    ".*?words.+?words.+?words.+?hello.*?",
+                    999
+                )
             )
         );
         Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks));
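
The expected sums line up with the two input pages above: "c" appears in both inputs with counts 30 and 3 (33), the goodbye messages carry 800 + 80 + 8 = 888, the hello messages carry 900 + 99 = 999, and "a", "b", "d" and "e" each occur once with counts 11, 22, 44 and 55.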

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java

Lines changed: 8 additions & 0 deletions
@@ -84,6 +84,8 @@ public void close() {
     @Nullable
     private final CategorizationPartOfSpeechDictionary partOfSpeechDictionary;
 
+    private final List<TokenListCategory> categoriesById;
+
     /**
      * Categories stored in such a way that the most common are accessed first.
      * This is implemented as an {@link ArrayList} with bespoke ordering rather
@@ -109,6 +111,7 @@ public TokenListCategorizer(
         this.lowerThreshold = threshold;
         this.upperThreshold = (1.0f + threshold) / 2.0f;
         this.categoriesByNumMatches = new ArrayList<>();
+        this.categoriesById = new ArrayList<>();
         cacheRamUsage(0);
     }
 
@@ -310,6 +313,7 @@ private synchronized TokenListCategory computeCategory(
             maxUnfilteredStringLen,
             numDocs
         );
+        categoriesById.add(newCategory);
         categoriesByNumMatches.add(newCategory);
         cacheRamUsage(newCategory.ramBytesUsed());
         return repositionCategory(newCategory, newIndex);
@@ -428,6 +432,10 @@ public List<SerializableTokenListCategory> toCategories(int size) {
             .toList();
     }
 
+    public List<SerializableTokenListCategory> toCategoriesById() {
+        return categoriesById.stream().map(category -> new SerializableTokenListCategory(category, bytesRefHash)).toList();
+    }
+
     public InternalCategorizationAggregation.Bucket[] toOrderedBuckets(int size) {
         return categoriesByNumMatches.stream()
             .limit(size)
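
Unlike toCategories(int), which walks categoriesByNumMatches and therefore reorders as match counts change, toCategoriesById() walks the new append-only categoriesById list, so its iteration order matches creation order and stays stable across calls; that stability is presumably what lets the block hash above serialize categories positionally. A standalone sketch of the property (plain Java, not code from this commit):

    // An append-only list keeps index == creation order, so positions can serve as stable ids.
    List<String> categoriesById = new ArrayList<>();
    categoriesById.add("category 0");   // created first, never moves
    categoriesById.add("category 1");   // created second, never moves
    // A most-matched-first list (like categoriesByNumMatches) reshuffles, so its positions are not stable ids.
    for (int id = 0; id < categoriesById.size(); id++) {
        System.out.println(id + " -> " + categoriesById.get(id));
    }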
