From fc0a549e8413613baf791e767d8ae153701f86bb Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Mon, 27 Oct 2025 14:56:34 -0400 Subject: [PATCH 1/4] ESQL: Block loader for pushing LENGTH Creates a `BlockLoader` for pushing the `LENGTH` function down into the loader for `keyword` fields. It takes advantage of the terms dictionary so we only need to calculate the code point count once per unique term loaded. This `BlockLoader` implementation isn't plugged into the infrastructure for emitting it because we're waiting on the infrastructure we've started in #137002. We'll make a follow up PR to plug this in. We're doing this mostly to demonstrate another function that we can push into field loading, in addition to the vector similarity functions we're building in #137002. We don't expect `LENGTH` to be a super hot function. If it happens to be then this'll help. Before we plug this in we'll have to figure out emitting warnings from functions that we've fused to field loading. Because `LENGTH` can emit a warning, specifically when it hits a multivalued field. --- .../index/mapper/BlockLoader.java | 14 + .../Utf8CodePointsFromOrdsBlockLoader.java | 403 ++++++++++++++++++ .../docvalues/ForceDocAtATime.java | 46 ++ ...tf8CodePointsFromOrdsBlockLoaderTests.java | 135 ++++++ .../elasticsearch/index/mapper/TestBlock.java | 20 + .../read/DelegatingBlockLoaderFactory.java | 11 + 6 files changed, 629 insertions(+) create mode 100644 server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/ForceDocAtATime.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java index 30f8409356d9c..eb5748d3647fc 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java @@ -13,6 +13,7 @@ import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.breaker.CircuitBreakingException; import org.elasticsearch.core.Nullable; import org.elasticsearch.core.Releasable; import org.elasticsearch.index.mapper.blockloader.docvalues.BlockDocValuesReader; @@ -377,6 +378,13 @@ interface Docs { * also a test implementation, but there may be no more other implementations. */ interface BlockFactory { + /** + * Adjust the circuit breaker with the given delta, if the delta is negative, the breaker will + * be adjusted without tripping. + * @throws CircuitBreakingException if the breaker was put above its limit + */ + void adjustBreaker(long delta) throws CircuitBreakingException; + /** * Build a builder to load booleans as loaded from doc values. Doc values * load booleans in sorted order. @@ -486,6 +494,12 @@ interface BlockFactory { */ Block constantBytes(BytesRef value, int count); + /** + * Build a block that contains {@code value} repeated + * {@code count} times. 
+ */ + Block constantInt(int value, int count); + /** * Build a reader for reading {@link SortedDocValues} */ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java new file mode 100644 index 0000000000000..f5d997b7ef3e3 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java @@ -0,0 +1,403 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper.blockloader.docvalues; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; + +import java.io.IOException; +import java.util.Arrays; + +/** + * A count of utf-8 code points for {@code keyword} style fields that are stored as a lookup table. + */ +public class Utf8CodePointsFromOrdsBlockLoader extends BlockDocValuesReader.DocValuesBlockLoader { + /** + * When there are fewer than this many unique values we use much more efficient "low cardinality" + * loaders. This must be fairly small because we build an untracked int[] with at most this many + * entries for a cache. + */ + static final int LOW_CARDINALITY = 1024; + private final String fieldName; + + public Utf8CodePointsFromOrdsBlockLoader(String fieldName) { + this.fieldName = fieldName; + } + + @Override + public IntBuilder builder(BlockFactory factory, int expectedCount) { + return factory.ints(expectedCount); + } + + @Override + public AllReader reader(LeafReaderContext context) throws IOException { + SortedSetDocValues docValues = context.reader().getSortedSetDocValues(fieldName); + if (docValues != null) { + if (docValues.getValueCount() > LOW_CARDINALITY) { + return new ImmediateOrdinals(docValues); + } + SortedDocValues singleton = DocValues.unwrapSingleton(docValues); + if (singleton != null) { + return new SingletonOrdinals(singleton); + } + return new Ordinals(docValues); + } + SortedDocValues singleton = context.reader().getSortedDocValues(fieldName); + if (singleton != null) { + if (singleton.getValueCount() > LOW_CARDINALITY) { + return new ImmediateOrdinals(DocValues.singleton(singleton)); + } + return new SingletonOrdinals(singleton); + } + return new ConstantNullsReader(); + } + + @Override + public String toString() { + return "Utf8CodePointsFromOrds[" + fieldName + "]"; + } + + /** + * Loads low cardinality singleton ordinals in using a cache of code point counts. + *
+ * <ol>
+ *     <li>Build an int[] containing the ordinals</li>
+ *     <li>Sort a copy of the int[] and load the cache for each ordinal</li>
+ *     <li>Walk the int[] in order, reading from the cache to build the page</li>
+ * </ol>
+ */ + private static class SingletonOrdinals extends BlockDocValuesReader { + private final SortedDocValues ordinals; + private final int[] cache; + + SingletonOrdinals(SortedDocValues ordinals) { + this.ordinals = ordinals; + + // TODO track this memory. we can't yet because this isn't Closeable + this.cache = new int[ordinals.getValueCount()]; + Arrays.fill(this.cache, -1); + } + + @Override + public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFiltered) throws IOException { + if (docs.count() - offset == 1) { + return blockForSingleDoc(factory, docs.get(offset)); + } + + // TODO if the cache is full we can skip this three step thing and load immediately from the cache. + + int[] ords = readOrds(factory, docs, offset); + try { + fillCache(factory, ords); + return buildFromCache(factory, cache, ords); + } finally { + factory.adjustBreaker(-RamUsageEstimator.sizeOf(ords)); + } + } + + @Override + public void read(int docId, StoredFields storedFields, Builder builder) throws IOException { + if (ordinals.advanceExact(docId)) { + ((IntBuilder) builder).appendInt(codePointsAtOrd(ordinals.ordValue())); + } else { + builder.appendNull(); + } + } + + @Override + public int docId() { + return ordinals.docID(); + } + + @Override + public String toString() { + return "Utf8CodePointsFromOrds.SingletonOrdinals"; + } + + private Block blockForSingleDoc(BlockFactory factory, int docId) throws IOException { + if (ordinals.advanceExact(docId)) { + return factory.constantInt(codePointsAtOrd(ordinals.ordValue()), 1); + } else { + return factory.constantNulls(1); + } + } + + private int[] readOrds(BlockFactory factory, Docs docs, int offset) throws IOException { + int count = docs.count() - offset; + long size = sizeOfArray(count); + factory.adjustBreaker(size); + int[] ords = null; + try { + ords = new int[count]; + for (int i = offset; i < docs.count(); i++) { + int doc = docs.get(i); + if (ordinals.advanceExact(doc) == false) { + ords[i] = -1; + continue; + } + ords[i] = ordinals.ordValue(); + } + int[] result = ords; + ords = null; + return result; + } finally { + if (ords != null) { + factory.adjustBreaker(-size); + } + } + } + + private void fillCache(BlockFactory factory, int[] ords) throws IOException { + factory.adjustBreaker(RamUsageEstimator.sizeOf(ords)); + try { + int[] sortedOrds = ords.clone(); + Arrays.sort(sortedOrds); + int i = 0; + while (i < sortedOrds.length && sortedOrds[i] < 0) { + i++; + } + while (i < sortedOrds.length) { + // Fill the cache. Duplicates will noop quickly. + codePointsAtOrd(sortedOrds[i++]); + } + } finally { + factory.adjustBreaker(-RamUsageEstimator.sizeOf(ords)); + } + } + + private int codePointsAtOrd(int ord) throws IOException { + if (cache[ord] >= 0) { + return cache[ord]; + } + BytesRef v = ordinals.lookupOrd(ord); + int count = UnicodeUtil.codePointCount(v); + cache[ord] = count; + return count; + } + } + + /** + * Loads low cardinality non-singleton ordinals in using a cache of code point counts. + *
+ * <ol>
+ *     <li>Build an int[] containing the ordinals</li>
+ *     <li>Sort a copy of the int[] and load the cache for each ordinal</li>
+ *     <li>Walk the int[] in order, reading from the cache to build the page</li>
+ * </ol>
+ */ + private static class Ordinals extends BlockDocValuesReader { + private final SortedSetDocValues ordinals; + private final int[] cache; + + Ordinals(SortedSetDocValues ordinals) { + this.ordinals = ordinals; + + // TODO track this memory. we can't yet because this isn't Closeable + this.cache = new int[Math.toIntExact(ordinals.getValueCount())]; + Arrays.fill(this.cache, -1); + } + + @Override + public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFiltered) throws IOException { + if (docs.count() - offset == 1) { + return blockForSingleDoc(factory, docs.get(offset)); + } + + int[] ords = readOrds(factory, docs, offset); + try { + fillCache(factory, ords); + return buildFromCache(factory, cache, ords); + } finally { + factory.adjustBreaker(-RamUsageEstimator.shallowSizeOf(ords)); + } + } + + @Override + public void read(int docId, StoredFields storedFields, Builder builder) throws IOException { + if (ordinals.advanceExact(docId) == false) { + builder.appendNull(); + return; + } + if (ordinals.docValueCount() != 1) { + // TODO warning + builder.appendNull(); + return; + } + ((IntBuilder) builder).appendInt(codePointsAtOrd(Math.toIntExact(ordinals.nextOrd()))); + } + + @Override + public int docId() { + return ordinals.docID(); + } + + @Override + public String toString() { + return "Utf8CodePointsFromOrds.Ordinals"; + } + + private Block blockForSingleDoc(BlockFactory factory, int docId) throws IOException { + if (ordinals.advanceExact(docId) == false) { + return factory.constantNulls(1); + } + if (ordinals.docValueCount() == 1) { + return factory.constantInt(codePointsAtOrd(Math.toIntExact(ordinals.nextOrd())), 1); + } + // TODO warning! + return factory.constantNulls(1); + } + + private int[] readOrds(BlockFactory factory, Docs docs, int offset) throws IOException { + int count = docs.count() - offset; + long size = sizeOfArray(count); + factory.adjustBreaker(size); + int[] ords = null; + try { + ords = new int[docs.count() - offset]; + for (int i = offset; i < docs.count(); i++) { + int doc = docs.get(i); + if (ordinals.advanceExact(doc) == false) { + ords[i] = -1; + continue; + } + if (ordinals.docValueCount() != 1) { + // TODO warning + ords[i] = -1; + continue; + } + ords[i] = Math.toIntExact(ordinals.nextOrd()); + } + int[] result = ords; + ords = null; + return result; + } finally { + if (ords != null) { + factory.adjustBreaker(-size); + } + } + } + + private void fillCache(BlockFactory factory, int[] ords) throws IOException { + factory.adjustBreaker(RamUsageEstimator.sizeOf(ords)); + try { + int[] sortedOrds = ords.clone(); + Arrays.sort(sortedOrds); + int i = 0; + while (i < sortedOrds.length && sortedOrds[i] < 0) { + i++; + } + while (i < sortedOrds.length) { + // Fill the cache. Duplicates will noop quickly. + codePointsAtOrd(sortedOrds[i++]); + } + } finally { + factory.adjustBreaker(-RamUsageEstimator.sizeOf(ords)); + } + } + + private int codePointsAtOrd(int ord) throws IOException { + if (cache[ord] >= 0) { + return cache[ord]; + } + BytesRef v = ordinals.lookupOrd(ord); + int count = UnicodeUtil.codePointCount(v); + cache[ord] = count; + return count; + } + } + + /** + * Loads a count of utf-8 code points for each ordinal doc by doc, without a cache. We use this when there + * are many unique doc values and the cache hit rate is unlikely to be high. 
+ */ + private static class ImmediateOrdinals extends BlockDocValuesReader { + private final SortedSetDocValues ordinals; + + ImmediateOrdinals(SortedSetDocValues ordinals) { + this.ordinals = ordinals; + } + + @Override + public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFiltered) throws IOException { + if (docs.count() - offset == 1) { + return blockForSingleDoc(factory, docs.get(offset)); + } + try (IntBuilder builder = factory.ints(docs.count() - offset)) { + for (int i = offset; i < docs.count(); i++) { + read(docs.get(i), builder); + } + return builder.build(); + } + } + + @Override + public void read(int docId, StoredFields storedFields, Builder builder) throws IOException { + read(docId, (IntBuilder) builder); + } + + private void read(int docId, IntBuilder builder) throws IOException { + if (ordinals.advanceExact(docId) == false) { + builder.appendNull(); + return; + } + if (ordinals.docValueCount() != 1) { + // TODO warning + builder.appendNull(); + return; + } + builder.appendInt(codePointsAtOrd(ordinals.nextOrd())); + } + + @Override + public int docId() { + return ordinals.docID(); + } + + @Override + public String toString() { + return "Utf8CodePointsFromOrds.Immediate"; + } + + private Block blockForSingleDoc(BlockFactory factory, int docId) throws IOException { + if (ordinals.advanceExact(docId) == false) { + return factory.constantNulls(1); + } + if (ordinals.docValueCount() == 1) { + return factory.constantInt(codePointsAtOrd(ordinals.nextOrd()), 1); + } + // TODO warning! + return factory.constantNulls(1); + } + + private int codePointsAtOrd(long ord) throws IOException { + return UnicodeUtil.codePointCount(ordinals.lookupOrd(ord)); + } + } + + private static Block buildFromCache(BlockFactory factory, int[] cache, int[] ords) { + try (IntBuilder builder = factory.ints(ords.length)) { + for (int ord : ords) { + if (ord >= 0) { + builder.appendInt(cache[ord]); + } else { + builder.appendNull(); + } + } + return builder.build(); + } + } + + private static long sizeOfArray(int count) { + return RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + (long) Integer.BYTES * (long) count); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/ForceDocAtATime.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/ForceDocAtATime.java new file mode 100644 index 0000000000000..0a8ffc7ba2941 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/ForceDocAtATime.java @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.mapper.blockloader.docvalues; + +import org.elasticsearch.index.mapper.BlockLoader; + +import java.io.IOException; +import java.util.function.Supplier; + +public class ForceDocAtATime implements BlockLoader.AllReader { + private final Supplier builder; + private final BlockLoader.AllReader delegate; + + public ForceDocAtATime(Supplier builder, BlockLoader.AllReader delegate) { + this.builder = builder; + this.delegate = delegate; + } + + @Override + public BlockLoader.Block read(BlockLoader.BlockFactory factory, BlockLoader.Docs docs, int offset, boolean nullsFiltered) + throws IOException { + try (BlockLoader.Builder builder = this.builder.get()) { + for (int i = 0; i < docs.count(); i++) { + delegate.read(docs.get(i), null, builder); + } + return builder.build(); + } + } + + @Override + public void read(int docId, BlockLoader.StoredFields storedFields, BlockLoader.Builder builder) throws IOException { + delegate.read(docId, storedFields, builder); + } + + @Override + public boolean canReuse(int startingDocID) { + return delegate.canReuse(startingDocID); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java new file mode 100644 index 0000000000000..8cdb81480064d --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java @@ -0,0 +1,135 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.mapper.blockloader.docvalues; + +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.index.mapper.BlockLoader; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.TestBlock; +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.Matcher; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.elasticsearch.index.mapper.blockloader.docvalues.Utf8CodePointsFromOrdsBlockLoader.LOW_CARDINALITY; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasToString; +import static org.hamcrest.Matchers.nullValue; + +public class Utf8CodePointsFromOrdsBlockLoaderTests extends ESTestCase { + @ParametersFactory(argumentFormatting = "blockAtATime=%s, lowCardinality=%s, multiValues=%s, missingValues=%s") + public static List parameters() throws IOException { + List parameters = new ArrayList<>(); + for (boolean blockAtATime : new boolean[] { true, false }) { + for (boolean lowCardinality : new boolean[] { true, false }) { + for (boolean multiValues : new boolean[] { true, false }) { + for (boolean missingValues : new boolean[] { true, false }) { + parameters.add(new Object[] { blockAtATime, lowCardinality, multiValues, missingValues }); + } + } + } + } + return parameters; + } + + private final boolean blockAtATime; + private final boolean lowCardinality; + private final boolean multiValues; + private final boolean missingValues; + + public Utf8CodePointsFromOrdsBlockLoaderTests( + boolean blockAtATime, + boolean highCardinality, + boolean multiValues, + boolean missingValues + ) { + this.blockAtATime = blockAtATime; + this.lowCardinality = highCardinality; + this.multiValues = multiValues; + this.missingValues = missingValues; + } + + public void test() throws IOException { + try (Directory dir = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), dir)) { + int docCount = 10_000; + int cardinality = lowCardinality ? 
between(1, LOW_CARDINALITY) : between(LOW_CARDINALITY + 1, LOW_CARDINALITY * 2); + for (int i = 0; i < docCount; i++) { + List doc = new ArrayList<>(2); + doc.add(field(i % cardinality)); + if (multiValues && i % cardinality == 0) { + doc.add(field((i % cardinality) + 1)); + } + iw.addDocument(doc); + } + if (missingValues) { + iw.addDocument(List.of()); + } + iw.forceMerge(1); + try (DirectoryReader dr = iw.getReader()) { + LeafReaderContext ctx = getOnlyLeafReader(dr).getContext(); + + var stringsLoader = new BytesRefsFromOrdsBlockLoader("field"); + var codePointsLoader = new Utf8CodePointsFromOrdsBlockLoader("field"); + var codePointsReader = codePointsLoader.reader(ctx); + assertThat(codePointsReader, readerMatcher()); + + try ( + TestBlock strings = read(stringsLoader, stringsLoader.reader(ctx), ctx); + TestBlock codePoints = read(codePointsLoader, codePointsReader, ctx); + ) { + for (int i = 0; i < strings.size(); i++) { + Object str = strings.get(i); + if (str instanceof List || str == null) { + assertThat(codePoints.get(i), nullValue()); + continue; + } + BytesRef bytes = (BytesRef) strings.get(i); + assertThat(codePoints.get(i), equalTo(bytes.length)); + } + } + } + } + } + + private TestBlock read(BlockLoader loader, BlockLoader.AllReader reader, LeafReaderContext ctx) throws IOException { + BlockLoader.AllReader toUse = blockAtATime + ? reader + : new ForceDocAtATime(() -> loader.builder(TestBlock.factory(), ctx.reader().numDocs()), reader); + + return (TestBlock) toUse.read(TestBlock.factory(), TestBlock.docs(ctx), 0, false); + } + + private Matcher readerMatcher() { + if (lowCardinality == false) { + return hasToString("Utf8CodePointsFromOrds.Immediate"); + } + if (multiValues) { + return hasToString("Utf8CodePointsFromOrds.Ordinals"); + } + return hasToString("Utf8CodePointsFromOrds.SingletonOrdinals"); + } + + private static KeywordFieldMapper.KeywordField field(int codePointCount) { + return new KeywordFieldMapper.KeywordField( + "field", + new BytesRef("a".repeat(codePointCount)), + KeywordFieldMapper.Defaults.FIELD_TYPE + ); + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/TestBlock.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/TestBlock.java index 060165decab78..7b19a809df5aa 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/mapper/TestBlock.java +++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/TestBlock.java @@ -13,6 +13,7 @@ import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.breaker.CircuitBreakingException; import org.elasticsearch.index.mapper.blockloader.docvalues.BlockDocValuesReader; import org.hamcrest.Matcher; @@ -33,6 +34,11 @@ public class TestBlock implements BlockLoader.Block { public static BlockLoader.BlockFactory factory() { return new BlockLoader.BlockFactory() { + @Override + public void adjustBreaker(long delta) throws CircuitBreakingException { + // Intentionally NOOP + } + @Override public BlockLoader.BooleanBuilder booleansFromDocValues(int expectedCount) { return booleans(expectedCount); @@ -368,6 +374,15 @@ public BlockLoader.Block constantBytes(BytesRef value, int count) { return builder.build(); } + @Override + public BlockLoader.Block constantInt(int value, int count) { + BlockLoader.IntBuilder builder = ints(count); + for (int i = 0; i < count; i++) { + builder.appendInt(value); + } + return builder.build(); + } + @Override public 
BlockLoader.SingletonOrdinalsBuilder singletonOrdinalsBuilder( SortedDocValues ordinals, @@ -481,6 +496,11 @@ public void close() { // TODO assert that we close the test blocks } + @Override + public String toString() { + return "TestBlock" + values; + } + private abstract static class Builder implements BlockLoader.Builder { private final List values = new ArrayList<>(); diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/read/DelegatingBlockLoaderFactory.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/read/DelegatingBlockLoaderFactory.java index 9e5f045b965ee..78d0e3fc04060 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/read/DelegatingBlockLoaderFactory.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/read/DelegatingBlockLoaderFactory.java @@ -10,6 +10,7 @@ import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.breaker.CircuitBreakingException; import org.elasticsearch.compute.data.Block; import org.elasticsearch.compute.data.BlockFactory; import org.elasticsearch.compute.data.BytesRefBlock; @@ -27,6 +28,11 @@ protected DelegatingBlockLoaderFactory(BlockFactory factory) { this.factory = factory; } + @Override + public void adjustBreaker(long delta) throws CircuitBreakingException { + factory.adjustBreaker(delta); + } + @Override public BlockLoader.BooleanBuilder booleansFromDocValues(int expectedCount) { return factory.newBooleanBlockBuilder(expectedCount).mvOrdering(Block.MvOrdering.SORTED_ASCENDING); @@ -68,6 +74,11 @@ public BytesRefBlock constantBytes(BytesRef value, int count) { } } + @Override + public BlockLoader.Block constantInt(int value, int count) { + return factory.newConstantIntVector(value, count).asBlock(); + } + @Override public BlockLoader.DoubleBuilder doublesFromDocValues(int expectedCount) { return factory.newDoubleBlockBuilder(expectedCount).mvOrdering(Block.MvOrdering.SORTED_ASCENDING); From 4b8b98f9c4b980808116f921b1461da1e26e399e Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Mon, 27 Oct 2025 15:25:04 -0400 Subject: [PATCH 2/4] Format --- .../blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java index f5d997b7ef3e3..9a445987b1a0d 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java @@ -30,6 +30,7 @@ public class Utf8CodePointsFromOrdsBlockLoader extends BlockDocValuesReader.DocV * entries for a cache. 
*/ static final int LOW_CARDINALITY = 1024; + private final String fieldName; public Utf8CodePointsFromOrdsBlockLoader(String fieldName) { From 9561a28e5c4b8a9b2a998969f01bcc52613c8830 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 27 Oct 2025 19:49:45 +0000 Subject: [PATCH 3/4] [CI] Auto commit changes from spotless --- .../docvalues/Utf8CodePointsFromOrdsBlockLoader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java index 9a445987b1a0d..afa0745df9c85 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java @@ -30,7 +30,7 @@ public class Utf8CodePointsFromOrdsBlockLoader extends BlockDocValuesReader.DocV * entries for a cache. */ static final int LOW_CARDINALITY = 1024; - + private final String fieldName; public Utf8CodePointsFromOrdsBlockLoader(String fieldName) { From d22593ed0fbdbc33d871d1ea715cb6e24d8a7f06 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Tue, 28 Oct 2025 09:09:15 -0400 Subject: [PATCH 4/4] Load from cache --- .../Utf8CodePointsFromOrdsBlockLoader.java | 68 ++++++++++++++++--- ...tf8CodePointsFromOrdsBlockLoaderTests.java | 51 ++++++++++---- 2 files changed, 97 insertions(+), 22 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java index 9a445987b1a0d..6fd1b97396978 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoader.java @@ -30,7 +30,6 @@ public class Utf8CodePointsFromOrdsBlockLoader extends BlockDocValuesReader.DocV * entries for a cache. */ static final int LOW_CARDINALITY = 1024; - private final String fieldName; public Utf8CodePointsFromOrdsBlockLoader(String fieldName) { @@ -72,16 +71,28 @@ public String toString() { /** * Loads low cardinality singleton ordinals in using a cache of code point counts. + *
+ * If we haven't cached the counts for all ordinals then the process looks like:
+ *
 * <ol>
 *     <li>Build an int[] containing the ordinals</li>
- *     <li>Sort a copy of the int[] and load the cache for each ordinal</li>
+ *     <li>
+ *         Sort a copy of the int[] and load the cache for each ordinal. The sorting
+ *         is important here because ordinals are faster to resolve in ascending order.
+ *     </li>
 *     <li>Walk the int[] in order, reading from the cache to build the page</li>
 * </ol>
+ *
+ * If we have cached the counts for all ordinals we load the
+ * ordinals and look them up in the cache immediately.
*/ private static class SingletonOrdinals extends BlockDocValuesReader { private final SortedDocValues ordinals; private final int[] cache; + private int cacheEntriesFilled; + SingletonOrdinals(SortedDocValues ordinals) { this.ordinals = ordinals; @@ -96,7 +107,9 @@ public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFilt return blockForSingleDoc(factory, docs.get(offset)); } - // TODO if the cache is full we can skip this three step thing and load immediately from the cache. + if (cacheEntriesFilled == cache.length) { + return buildFromFilledCache(factory, docs, offset); + } int[] ords = readOrds(factory, docs, offset); try { @@ -177,6 +190,21 @@ private void fillCache(BlockFactory factory, int[] ords) throws IOException { } } + private Block buildFromFilledCache(BlockFactory factory, Docs docs, int offset) throws IOException { + int count = docs.count() - offset; + try (IntBuilder builder = factory.ints(count)) { + for (int i = offset; i < docs.count(); i++) { + int doc = docs.get(i); + if (ordinals.advanceExact(doc) == false) { + builder.appendNull(); + continue; + } + builder.appendInt(cache[ordinals.ordValue()]); + } + return builder.build(); + } + } + private int codePointsAtOrd(int ord) throws IOException { if (cache[ord] >= 0) { return cache[ord]; @@ -184,22 +212,21 @@ private int codePointsAtOrd(int ord) throws IOException { BytesRef v = ordinals.lookupOrd(ord); int count = UnicodeUtil.codePointCount(v); cache[ord] = count; + cacheEntriesFilled++; return count; } } /** * Loads low cardinality non-singleton ordinals in using a cache of code point counts. - *
- * <ol>
- *     <li>Build an int[] containing the ordinals</li>
- *     <li>Sort a copy of the int[] and load the cache for each ordinal</li>
- *     <li>Walk the int[] in order, reading from the cache to build the page</li>
- * </ol>
+ * See {@link SingletonOrdinals} for the process */ private static class Ordinals extends BlockDocValuesReader { private final SortedSetDocValues ordinals; private final int[] cache; + private int cacheEntriesFilled; + Ordinals(SortedSetDocValues ordinals) { this.ordinals = ordinals; @@ -214,6 +241,10 @@ public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFilt return blockForSingleDoc(factory, docs.get(offset)); } + if (cacheEntriesFilled == cache.length) { + return buildFromFilledCache(factory, docs, offset); + } + int[] ords = readOrds(factory, docs, offset); try { fillCache(factory, ords); @@ -306,6 +337,26 @@ private void fillCache(BlockFactory factory, int[] ords) throws IOException { } } + private Block buildFromFilledCache(BlockFactory factory, Docs docs, int offset) throws IOException { + int count = docs.count() - offset; + try (IntBuilder builder = factory.ints(count)) { + for (int i = offset; i < docs.count(); i++) { + int doc = docs.get(i); + if (ordinals.advanceExact(doc) == false) { + builder.appendNull(); + continue; + } + if (ordinals.docValueCount() != 1) { + // TODO warning + builder.appendNull(); + continue; + } + builder.appendInt(cache[Math.toIntExact(ordinals.nextOrd())]); + } + return builder.build(); + } + } + private int codePointsAtOrd(int ord) throws IOException { if (cache[ord] >= 0) { return cache[ord]; @@ -313,6 +364,7 @@ private int codePointsAtOrd(int ord) throws IOException { BytesRef v = ordinals.lookupOrd(ord); int count = UnicodeUtil.codePointCount(v); cache[ord] = count; + cacheEntriesFilled++; return count; } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java index 8cdb81480064d..4bcd85312e5d6 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/docvalues/Utf8CodePointsFromOrdsBlockLoaderTests.java @@ -86,33 +86,56 @@ public void test() throws IOException { var stringsLoader = new BytesRefsFromOrdsBlockLoader("field"); var codePointsLoader = new Utf8CodePointsFromOrdsBlockLoader("field"); + + var stringsReader = stringsLoader.reader(ctx); var codePointsReader = codePointsLoader.reader(ctx); assertThat(codePointsReader, readerMatcher()); - + BlockLoader.Docs docs = TestBlock.docs(ctx); try ( - TestBlock strings = read(stringsLoader, stringsLoader.reader(ctx), ctx); - TestBlock codePoints = read(codePointsLoader, codePointsReader, ctx); + TestBlock strings = read(stringsLoader, stringsReader, ctx, docs); + TestBlock codePoints = read(codePointsLoader, codePointsReader, ctx, docs); ) { - for (int i = 0; i < strings.size(); i++) { - Object str = strings.get(i); - if (str instanceof List || str == null) { - assertThat(codePoints.get(i), nullValue()); - continue; - } - BytesRef bytes = (BytesRef) strings.get(i); - assertThat(codePoints.get(i), equalTo(bytes.length)); + checkBlocks(strings, codePoints); + } + + stringsReader = stringsLoader.reader(ctx); + codePointsReader = codePointsLoader.reader(ctx); + for (int i = 0; i < ctx.reader().numDocs(); i += 10) { + int[] docsArray = new int[Math.min(10, ctx.reader().numDocs() - i)]; + for (int d = 0; d < docsArray.length; d++) { + docsArray[d] = i + d; + } + docs = TestBlock.docs(docsArray); + try ( + TestBlock strings = read(stringsLoader, 
stringsReader, ctx, docs); + TestBlock codePoints = read(codePointsLoader, codePointsReader, ctx, docs); + ) { + checkBlocks(strings, codePoints); } } } } } - private TestBlock read(BlockLoader loader, BlockLoader.AllReader reader, LeafReaderContext ctx) throws IOException { + private void checkBlocks(TestBlock strings, TestBlock codePoints) { + for (int i = 0; i < strings.size(); i++) { + Object str = strings.get(i); + if (str instanceof List || str == null) { + assertThat(codePoints.get(i), nullValue()); + continue; + } + BytesRef bytes = (BytesRef) strings.get(i); + assertThat(codePoints.get(i), equalTo(bytes.length)); + } + } + + private TestBlock read(BlockLoader loader, BlockLoader.AllReader reader, LeafReaderContext ctx, BlockLoader.Docs docs) + throws IOException { BlockLoader.AllReader toUse = blockAtATime ? reader - : new ForceDocAtATime(() -> loader.builder(TestBlock.factory(), ctx.reader().numDocs()), reader); + : new ForceDocAtATime(() -> loader.builder(TestBlock.factory(), docs.count()), reader); - return (TestBlock) toUse.read(TestBlock.factory(), TestBlock.docs(ctx), 0, false); + return (TestBlock) toUse.read(TestBlock.factory(), docs, 0, false); } private Matcher readerMatcher() {