From dab897f4f1ff293070a3340049aea9677025a5a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Mon, 27 Oct 2025 22:06:57 +0100 Subject: [PATCH 01/10] Add ES93BloomFilterStoredFieldsFormat for efficient field existence checks Introduces a new stored fields format that builds a Bloom filter for a specific field to enable fast existence checks without storing the field itself. This delegates storage of all other fields to another StoredFieldsFormat while maintaining the ability to quickly determine if a document might contain the target field. --- .../bloomfilter/BloomFilterHashFunctions.java | 194 +++++++ .../ES87BloomFilterPostingsFormat.java | 182 +----- .../ES93BloomFilterStoredFieldsFormat.java | 539 ++++++++++++++++++ ...S93BloomFilterStoredFieldsFormatTests.java | 162 ++++++ 4 files changed, 896 insertions(+), 181 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterHashFunctions.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java create mode 100644 server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterHashFunctions.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterHashFunctions.java new file mode 100644 index 0000000000000..8723b9dc5dbe3 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterHashFunctions.java @@ -0,0 +1,194 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.elasticsearch.common.util.ByteUtils; + +public class BloomFilterHashFunctions { + private BloomFilterHashFunctions() {} + + // + // The following Murmur3 implementation is borrowed from commons-codec. + // + /** + * Implementation of the MurmurHash3 128-bit hash functions. + * + *
+     * <p>
+     * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
+     * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
+     * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
+     * </p>
+     *
+     * <p>
+     * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
+     * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
+     * </p>
+     *
+     * <p>
+     * This is public domain code with no copyrights. From home page of
+     * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
+     * </p>
+     *
+     * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
+     * code." </blockquote>
+     *
+     * <p>
+     * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
+     * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
+     * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
+     * </p>
+ * + * @see MurmurHash + * @see Original MurmurHash3 c++ + * code + * @see + * Apache Hive Murmer3 + * @since 1.13 + */ + public static final class MurmurHash3 { + /** + * A default seed to use for the murmur hash algorithm. + * Has the value {@code 104729}. + */ + public static final int DEFAULT_SEED = 104729; + + // Constants for 128-bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + /** No instance methods. */ + private MurmurHash3() {} + + /** + * Generates 64-bit hash from the byte array with the given offset, length and seed by discarding the second value of the 128-bit + * hash. + * + * This version uses the default seed. + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @return The sum of the two 64-bit hashes that make up the hash128 + */ + @SuppressWarnings("fallthrough") + public static long hash64(final byte[] data, final int offset, final int length) { + long h1 = MurmurHash3.DEFAULT_SEED; + long h2 = MurmurHash3.DEFAULT_SEED; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = ByteUtils.readLongLE(data, index); + long k2 = ByteUtils.readLongLE(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (offset + length - index) { + case 15: + k2 ^= ((long) data[index + 14] & 0xff) << 48; + case 14: + k2 ^= ((long) data[index + 13] & 0xff) << 40; + case 13: + k2 ^= ((long) data[index + 12] & 0xff) << 32; + case 12: + k2 ^= ((long) data[index + 11] & 0xff) << 24; + case 11: + k2 ^= ((long) data[index + 10] & 0xff) << 16; + case 10: + k2 ^= ((long) data[index + 9] & 0xff) << 8; + case 9: + k2 ^= data[index + 8] & 0xff; + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= ((long) data[index + 7] & 0xff) << 56; + case 7: + k1 ^= ((long) data[index + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[index + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[index + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[index + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[index + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[index + 1] & 0xff) << 8; + case 1: + k1 ^= data[index] & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + + return h1; + } + + /** + * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. 
+ * + * @param hash The current hash + * @return The final hash + */ + private static long fmix64(long hash) { + hash ^= (hash >>> 33); + hash *= 0xff51afd7ed558ccdL; + hash ^= (hash >>> 33); + hash *= 0xc4ceb9fe1a85ec53L; + hash ^= (hash >>> 33); + return hash; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java index abf68abe51887..65d2c1317b86e 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java @@ -47,7 +47,6 @@ import org.elasticsearch.common.lucene.store.IndexOutputOutputStream; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.ByteArray; -import org.elasticsearch.common.util.ByteUtils; import org.elasticsearch.core.IOUtils; import java.io.Closeable; @@ -548,7 +547,7 @@ static int numBytesForBloomFilter(int bloomFilterSize) { // Uses MurmurHash3-128 to generate a 64-bit hash value, then picks 7 subsets of 31 bits each and returns the values in the // outputs array. This provides us with 7 reasonably independent hashes of the data for the cost of one MurmurHash3 calculation. static int[] hashTerm(BytesRef br, int[] outputs) { - final long hash64 = MurmurHash3.hash64(br.bytes, br.offset, br.length); + final long hash64 = BloomFilterHashFunctions.MurmurHash3.hash64(br.bytes, br.offset, br.length); final int upperHalf = (int) (hash64 >> 32); final int lowerHalf = (int) hash64; // Derive 7 hash outputs by combining the two 64-bit halves, adding the upper half multiplied with different small constants @@ -562,183 +561,4 @@ static int[] hashTerm(BytesRef br, int[] outputs) { outputs[6] = (lowerHalf + 17 * upperHalf) & 0x7FFF_FFFF; return outputs; } - - // - // The following Murmur3 implementation is borrowed from commons-codec. - // - - /** - * Implementation of the MurmurHash3 128-bit hash functions. - * - *
-     * <p>
-     * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
-     * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
-     * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
-     * </p>
-     *
-     * <p>
-     * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
-     * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
-     * </p>
-     *
-     * <p>
-     * This is public domain code with no copyrights. From home page of
-     * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
-     * </p>
-     *
-     * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
-     * code." </blockquote>
-     *
-     * <p>
-     * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
-     * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
-     * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
-     * </p>
- * - * @see MurmurHash - * @see Original MurmurHash3 c++ - * code - * @see - * Apache Hive Murmer3 - * @since 1.13 - */ - public static final class MurmurHash3 { - /** - * A default seed to use for the murmur hash algorithm. - * Has the value {@code 104729}. - */ - public static final int DEFAULT_SEED = 104729; - - // Constants for 128-bit variant - private static final long C1 = 0x87c37b91114253d5L; - private static final long C2 = 0x4cf5ad432745937fL; - private static final int R1 = 31; - private static final int R2 = 27; - private static final int R3 = 33; - private static final int M = 5; - private static final int N1 = 0x52dce729; - private static final int N2 = 0x38495ab5; - - /** No instance methods. */ - private MurmurHash3() {} - - /** - * Generates 64-bit hash from the byte array with the given offset, length and seed by discarding the second value of the 128-bit - * hash. - * - * This version uses the default seed. - * - * @param data The input byte array - * @param offset The first element of array - * @param length The length of array - * @return The sum of the two 64-bit hashes that make up the hash128 - */ - @SuppressWarnings("fallthrough") - public static long hash64(final byte[] data, final int offset, final int length) { - long h1 = MurmurHash3.DEFAULT_SEED; - long h2 = MurmurHash3.DEFAULT_SEED; - final int nblocks = length >> 4; - - // body - for (int i = 0; i < nblocks; i++) { - final int index = offset + (i << 4); - long k1 = ByteUtils.readLongLE(data, index); - long k2 = ByteUtils.readLongLE(data, index + 8); - - // mix functions for k1 - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - h1 ^= k1; - h1 = Long.rotateLeft(h1, R2); - h1 += h2; - h1 = h1 * M + N1; - - // mix functions for k2 - k2 *= C2; - k2 = Long.rotateLeft(k2, R3); - k2 *= C1; - h2 ^= k2; - h2 = Long.rotateLeft(h2, R1); - h2 += h1; - h2 = h2 * M + N2; - } - - // tail - long k1 = 0; - long k2 = 0; - final int index = offset + (nblocks << 4); - switch (offset + length - index) { - case 15: - k2 ^= ((long) data[index + 14] & 0xff) << 48; - case 14: - k2 ^= ((long) data[index + 13] & 0xff) << 40; - case 13: - k2 ^= ((long) data[index + 12] & 0xff) << 32; - case 12: - k2 ^= ((long) data[index + 11] & 0xff) << 24; - case 11: - k2 ^= ((long) data[index + 10] & 0xff) << 16; - case 10: - k2 ^= ((long) data[index + 9] & 0xff) << 8; - case 9: - k2 ^= data[index + 8] & 0xff; - k2 *= C2; - k2 = Long.rotateLeft(k2, R3); - k2 *= C1; - h2 ^= k2; - - case 8: - k1 ^= ((long) data[index + 7] & 0xff) << 56; - case 7: - k1 ^= ((long) data[index + 6] & 0xff) << 48; - case 6: - k1 ^= ((long) data[index + 5] & 0xff) << 40; - case 5: - k1 ^= ((long) data[index + 4] & 0xff) << 32; - case 4: - k1 ^= ((long) data[index + 3] & 0xff) << 24; - case 3: - k1 ^= ((long) data[index + 2] & 0xff) << 16; - case 2: - k1 ^= ((long) data[index + 1] & 0xff) << 8; - case 1: - k1 ^= data[index] & 0xff; - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - h1 ^= k1; - } - - // finalization - h1 ^= length; - h2 ^= length; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - - return h1; - } - - /** - * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. 
- * - * @param hash The current hash - * @return The final hash - */ - private static long fmix64(long hash) { - hash ^= (hash >>> 33); - hash *= 0xff51afd7ed558ccdL; - hash ^= (hash >>> 33); - hash *= 0xc4ceb9fe1a85ec53L; - hash ^= (hash >>> 33); - return hash; - } - } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java new file mode 100644 index 0000000000000..75998d982ccb9 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -0,0 +1,539 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldDataInput; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.lucene.store.IndexOutputOutputStream; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.ByteArray; +import org.elasticsearch.core.IOUtils; +import org.elasticsearch.core.Nullable; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import static org.elasticsearch.index.codec.bloomfilter.BloomFilterHashFunctions.MurmurHash3.hash64; + +/** + * A stored fields format that builds a Bloom filter for a specific field to enable fast + * existence checks, while delegating storage of all other fields to another StoredFieldsFormat. + * + *
+ * <p>The field used to build the Bloom filter is not stored, as only its presence
+ * needs to be tracked for filtering purposes. This reduces storage overhead while maintaining
+ * the ability to quickly determine if a segment might contain the field.
+ *
+ * <h2>File formats</h2>
+ *
+ * <p>Bloom filter stored fields are represented by two files:
+ *
+ * <ol>
+ *   <li>
+ *     <p>Bloom filter data file (extension .sfbf). This file stores the bloom
+ *     filter bitset.
+ *   <li>
+ *     <p>A bloom filter meta file (extension .sfbfm). This file stores metadata about the
+ *     bloom filters stored in the bloom filter data file. The in-memory representation can
+ *     be found in {@link BloomFilterMetadata}.
+ * </ol>
+ */ +public class ES93BloomFilterStoredFieldsFormat extends StoredFieldsFormat { + public static final String STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME = "ES93BloomFilterStoredFieldsFormat"; + public static final String STORED_FIELDS_BLOOM_FILTER_EXTENSION = "sfbf"; + public static final String STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION = "sfbfm"; + private static final int VERSION_START = 1; + private static final int VERSION_CURRENT = VERSION_START; + + // We use prime numbers with the Kirsch-Mitzenmacher technique to obtain multiple hashes from two hash functions + private static final int[] PRIMES = new int[] { 2, 5, 11, 17, 23, 29, 41, 47, 53, 59, 71 }; + private static final int[] powerOfTwoBitSetSizes; + private static final int DEFAULT_NUM_HASH_FUNCTIONS = 7; + private static final byte BLOOM_FILTER_STORED = 1; + private static final byte BLOOM_FILTER_NOT_STORED = 0; + + static { + // Precompute powers of two (2^1 to 2^26) for efficient modulo operations using bitwise AND. + // We start from 2^1 (2 bits) and go up to 2^26 (67,108,864 bits / 8,388,608 bytes = 8 MB) + // as the maximum, staying within positive int range. + powerOfTwoBitSetSizes = new int[27]; + for (int i = 0; i < powerOfTwoBitSetSizes.length; i++) { + powerOfTwoBitSetSizes[i] = 1 << i; + assert powerOfTwoBitSetSizes[i] > 0; + } + } + + private final BigArrays bigArrays; + private final String segmentSuffix; + private final StoredFieldsFormat delegate; + private final String bloomFilterFieldName; + private final int numHashFunctions; + private final int bloomFilterSizeInBits; + + public ES93BloomFilterStoredFieldsFormat( + BigArrays bigArrays, + String segmentSuffix, + StoredFieldsFormat delegate, + ByteSizeValue bloomFilterSize, + String bloomFilterFieldName + ) { + this.bigArrays = bigArrays; + this.segmentSuffix = segmentSuffix; + this.delegate = delegate; + this.bloomFilterFieldName = bloomFilterFieldName; + this.numHashFunctions = DEFAULT_NUM_HASH_FUNCTIONS; + int bloomFilterSizeInBits = 0; + // Find the closest power of 2 that fits the required size + for (int powerOfTwoBitSetSize : powerOfTwoBitSetSizes) { + if (powerOfTwoBitSetSize <= (Math.multiplyExact(bloomFilterSize.getBytes(), Byte.SIZE))) { + bloomFilterSizeInBits = powerOfTwoBitSetSize; + } + } + assert bloomFilterSizeInBits > 0; + this.bloomFilterSizeInBits = bloomFilterSizeInBits; + } + + @Override + public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + return new Reader(directory, si, fn, context, segmentSuffix, delegate.fieldsReader(directory, si, fn, context)); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { + // TODO: compute the bloom filter size based on heuristics and oversize factor + return new Writer( + directory, + si, + context, + segmentSuffix, + bigArrays, + numHashFunctions, + bloomFilterSizeInBits, + bloomFilterFieldName, + delegate.fieldsWriter(directory, si, context) + ); + } + + static class Writer extends StoredFieldsWriter { + private final IndexOutput bloomFilterDataOut; + private final IndexOutput metadataOut; + private final ByteArray buffer; + private final List toClose = new ArrayList<>(); + private final int[] hashes; + private final int numHashFunctions; + private final int bloomFilterSizeInBits; + private final int bloomFilterSizeInBytes; + private final StoredFieldsWriter delegateWriter; + private final String bloomFilterFieldName; + private FieldInfo 
bloomFilterFieldInfo; + + Writer( + Directory directory, + SegmentInfo segmentInfo, + IOContext context, + String segmentSuffix, + BigArrays bigArrays, + int numHashFunctions, + int bloomFilterSizeInBits, + String bloomFilterFieldName, + StoredFieldsWriter delegateWriter + ) throws IOException { + assert isPowerOfTwo(bloomFilterSizeInBits) : "Bloom filter size is not a power of 2"; + assert numHashFunctions <= PRIMES.length + : "Number of hash functions must be <= " + PRIMES.length + " but was " + numHashFunctions; + + this.numHashFunctions = numHashFunctions; + this.hashes = new int[numHashFunctions]; + this.bloomFilterSizeInBits = bloomFilterSizeInBits; + this.bloomFilterSizeInBytes = bloomFilterSizeInBits / Byte.SIZE; + this.bloomFilterFieldName = bloomFilterFieldName; + + this.delegateWriter = delegateWriter; + toClose.add(delegateWriter); + + boolean success = false; + try { + bloomFilterDataOut = directory.createOutput(bloomFilterFileName(segmentInfo, segmentSuffix), context); + toClose.add(bloomFilterDataOut); + CodecUtil.writeIndexHeader( + bloomFilterDataOut, + STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + VERSION_CURRENT, + segmentInfo.getId(), + segmentSuffix + ); + + metadataOut = directory.createOutput(bloomFilterMetadataFileName(segmentInfo, segmentSuffix), context); + toClose.add(metadataOut); + CodecUtil.writeIndexHeader( + metadataOut, + STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + VERSION_CURRENT, + segmentInfo.getId(), + segmentSuffix + ); + + buffer = bigArrays.newByteArray(bloomFilterSizeInBytes, false); + toClose.add(buffer); + + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(toClose); + } + } + } + + @Override + public void startDocument() throws IOException { + delegateWriter.startDocument(); + } + + @Override + public void finishDocument() throws IOException { + delegateWriter.finishDocument(); + } + + @Override + public void writeField(FieldInfo info, int value) throws IOException { + delegateWriter.writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, long value) throws IOException { + delegateWriter.writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, float value) throws IOException { + delegateWriter.writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, double value) throws IOException { + delegateWriter.writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { + delegateWriter.writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, String value) throws IOException { + delegateWriter.writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, BytesRef value) throws IOException { + if (info.getName().equals(bloomFilterFieldName)) { + addToBloomFilter(info, value); + } else { + delegateWriter.writeField(info, value); + } + } + + private void addToBloomFilter(FieldInfo info, BytesRef value) { + bloomFilterFieldInfo = info; + var termHashes = hashTerm(value, hashes); + for (int hash : termHashes) { + final int posInBitArray = hash & (bloomFilterSizeInBits - 1); + final int pos = posInBitArray >> 3; // div 8 + final int mask = 1 << (posInBitArray & 7); // mod 8 + final byte val = (byte) (buffer.get(pos) | mask); + buffer.set(pos, val); + } + } + + @Override + public void finish(int numDocs) throws IOException { + finishBloomFilterStoredFormat(); + delegateWriter.finish(numDocs); + } + + private void 
finishBloomFilterStoredFormat() throws IOException { + BloomFilterMetadata bloomFilterMetadata = null; + if (bloomFilterFieldInfo != null) { + bloomFilterMetadata = new BloomFilterMetadata( + bloomFilterFieldInfo, + bloomFilterDataOut.getFilePointer(), + bloomFilterSizeInBits, + numHashFunctions + ); + + if (buffer.hasArray()) { + bloomFilterDataOut.writeBytes(buffer.array(), 0, bloomFilterSizeInBytes); + } else { + BytesReference.fromByteArray(buffer, bloomFilterSizeInBytes).writeTo(new IndexOutputOutputStream(bloomFilterDataOut)); + } + } + CodecUtil.writeFooter(bloomFilterDataOut); + + if (bloomFilterMetadata != null) { + metadataOut.writeByte(BLOOM_FILTER_STORED); + bloomFilterMetadata.writeTo(metadataOut); + } else { + metadataOut.writeByte(BLOOM_FILTER_NOT_STORED); + } + CodecUtil.writeFooter(metadataOut); + } + + @Override + public int merge(MergeState mergeState) throws IOException { + // Skip merging the bloom filter for now + finishBloomFilterStoredFormat(); + return delegateWriter.merge(mergeState); + } + + @Override + public void close() throws IOException { + IOUtils.close(toClose); + } + + @Override + public long ramBytesUsed() { + return buffer.ramBytesUsed() + delegateWriter.ramBytesUsed(); + } + } + + public static class Reader extends StoredFieldsReader { + @Nullable + private final BloomFilterFieldReader bloomFilterFieldReader; + private final StoredFieldsReader delegateReader; + + Reader( + Directory directory, + SegmentInfo si, + FieldInfos fn, + IOContext context, + String segmentSuffix, + StoredFieldsReader delegateReader + ) throws IOException { + this.delegateReader = delegateReader; + var success = false; + try { + bloomFilterFieldReader = BloomFilterFieldReader.open(directory, si, fn, context, segmentSuffix); + success = true; + } finally { + if (success == false) { + delegateReader.close(); + } + } + } + + @Override + public StoredFieldsReader clone() { + return this; + } + + @Override + public void checkIntegrity() throws IOException { + if (bloomFilterFieldReader != null) { + bloomFilterFieldReader.checkIntegrity(); + } + delegateReader.checkIntegrity(); + } + + @Override + public void close() throws IOException { + IOUtils.close(bloomFilterFieldReader, delegateReader); + } + + @Override + public void document(int docID, StoredFieldVisitor visitor) throws IOException { + delegateReader.document(docID, visitor); + } + + @Nullable + BloomFilterFieldReader getBloomFilterFieldReader() { + return bloomFilterFieldReader; + } + } + + record BloomFilterMetadata(FieldInfo fieldInfo, long fileOffset, int sizeInBits, int numHashFunctions) { + BloomFilterMetadata { + assert fieldInfo != null; + assert isPowerOfTwo(sizeInBits); + } + + int sizeInBytes() { + return sizeInBits / Byte.SIZE; + } + + void writeTo(IndexOutput indexOut) throws IOException { + indexOut.writeVInt(fieldInfo.number); + indexOut.writeVLong(fileOffset); + indexOut.writeVInt(sizeInBits); + indexOut.writeVInt(numHashFunctions); + } + + static BloomFilterMetadata readFrom(IndexInput in, FieldInfos fieldInfos) throws IOException { + final var fieldInfo = fieldInfos.fieldInfo(in.readVInt()); + final long fileOffset = in.readVLong(); + final int bloomFilterSizeInBits = in.readVInt(); + final int numOfHashFunctions = in.readVInt(); + return new BloomFilterMetadata(fieldInfo, fileOffset, bloomFilterSizeInBits, numOfHashFunctions); + } + } + + public static class BloomFilterFieldReader implements Closeable { + private final FieldInfo fieldInfo; + private final IndexInput bloomFilterData; + private final 
RandomAccessInput bloomFilterIn; + private final int bloomFilterSizeInBits; + private final int[] hashes; + + @Nullable + public static BloomFilterFieldReader open( + Directory directory, + SegmentInfo si, + FieldInfos fn, + IOContext context, + String segmentSuffix + ) throws IOException { + List toClose = new ArrayList<>(); + boolean success = false; + try (var metaInput = directory.openChecksumInput(bloomFilterMetadataFileName(si, segmentSuffix))) { + var metadataVersion = CodecUtil.checkIndexHeader( + metaInput, + STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + VERSION_START, + VERSION_CURRENT, + si.getId(), + segmentSuffix + ); + var hasBloomFilter = metaInput.readByte() == BLOOM_FILTER_STORED; + if (hasBloomFilter == false) { + return null; + } + BloomFilterMetadata bloomFilterMetadata = BloomFilterMetadata.readFrom(metaInput, fn); + CodecUtil.checkFooter(metaInput); + + IndexInput bloomFilterData = directory.openInput(bloomFilterFileName(si, segmentSuffix), context); + toClose.add(bloomFilterData); + var bloomFilterDataVersion = CodecUtil.checkIndexHeader( + bloomFilterData, + STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + VERSION_START, + VERSION_CURRENT, + si.getId(), + segmentSuffix + ); + + if (metadataVersion != bloomFilterDataVersion) { + throw new CorruptIndexException( + "Format versions mismatch: meta=" + metadataVersion + ", data=" + bloomFilterDataVersion, + bloomFilterData + ); + } + CodecUtil.checksumEntireFile(bloomFilterData); + + var bloomFilterFieldReader = new BloomFilterFieldReader( + bloomFilterMetadata.fieldInfo(), + bloomFilterData.randomAccessSlice(bloomFilterMetadata.fileOffset(), bloomFilterMetadata.sizeInBytes()), + bloomFilterMetadata.sizeInBits(), + bloomFilterMetadata.numHashFunctions(), + bloomFilterData + ); + success = true; + return bloomFilterFieldReader; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(toClose); + } + } + } + + public BloomFilterFieldReader( + FieldInfo fieldInfo, + RandomAccessInput bloomFilterIn, + int bloomFilterSizeInBits, + int numHashFunctions, + IndexInput bloomFilterData + ) { + this.fieldInfo = Objects.requireNonNull(fieldInfo); + this.bloomFilterIn = bloomFilterIn; + this.bloomFilterSizeInBits = bloomFilterSizeInBits; + this.hashes = new int[numHashFunctions]; + this.bloomFilterData = bloomFilterData; + } + + public boolean mayContainTerm(String field, BytesRef term) throws IOException { + assert fieldInfo.getName().equals(field); + + var termHashes = hashTerm(term, hashes); + + for (int hash : termHashes) { + final int posInBitArray = hash & (bloomFilterSizeInBits - 1); + final int pos = posInBitArray >> 3; // div 8 + final int mask = 1 << (posInBitArray & 7); // mod 8 + final byte bits = bloomFilterIn.readByte(pos); + if ((bits & mask) == 0) { + return false; + } + } + return true; + } + + void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(bloomFilterData); + } + + @Override + public void close() throws IOException { + bloomFilterData.close(); + } + } + + private static int[] hashTerm(BytesRef value, int[] outputs) { + long hash64 = hash64(value.bytes, value.offset, value.length); + // First use output splitting to get two hash values out of a single hash function + int upperHalf = (int) (hash64 >> Integer.SIZE); + int lowerHalf = (int) hash64; + // Then use the Kirsch-Mitzenmacher technique to obtain multiple hashes efficiently + for (int i = 0; i < outputs.length; i++) { + // Use prime numbers as the constant for the KM technique so these don't have a common gcd + outputs[i] = 
(lowerHalf + PRIMES[i] * upperHalf) & 0x7FFF_FFFF; // Clears sign bit, gives positive 31-bit values + } + return outputs; + } + + private static boolean isPowerOfTwo(int value) { + return (value & (value - 1)) == 0; + } + + private static String bloomFilterMetadataFileName(SegmentInfo segmentInfo, String segmentSuffix) { + return IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION); + } + + private static String bloomFilterFileName(SegmentInfo segmentInfo, String segmentSuffix) { + return IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, STORED_FIELDS_BLOOM_FILTER_EXTENSION); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java new file mode 100644 index 0000000000000..165b8232a2f31 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -0,0 +1,162 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReader; +import org.apache.lucene.index.StandardDirectoryReader; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.codecs.asserting.AssertingCodec; +import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.mapper.IdFieldMapper; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.not; +import static org.hamcrest.Matchers.nullValue; + +public class ES93BloomFilterStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { + + static { + LogConfigurator.loadLog4jPlugins(); + LogConfigurator.configureESLogging(); // native access requires logging to be initialized + } + + @Override + protected Codec getCodec() { + return new AssertingCodec() { + @Override + public StoredFieldsFormat storedFieldsFormat() { + 
return new ES93BloomFilterStoredFieldsFormat( + BigArrays.NON_RECYCLING_INSTANCE, + "", + TestUtil.getDefaultCodec().storedFieldsFormat(), + ByteSizeValue.ofKb(2), + IdFieldMapper.NAME + ); + } + }; + } + + @Override + protected void addRandomFields(Document doc) { + + } + + public void testBloomFilterFieldIsNotStoredAndBloomFilterCanBeChecked() throws IOException { + try (var directory = newDirectory()) { + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setCodec(getCodec()); + conf.setMergePolicy(newLogMergePolicy()); + // We want to have at most 1 segment + conf.setMaxBufferedDocs(200); + // The stored fields reader that can be accessed through a StandardDirectoryReader wraps + // the ES93BloomFilterStoredFieldsFormat.Reader. Thus, we need to open it directly from + // the segment info codec and if we use compound files we would need to obtain a compound + // file directory. For simplicity, we just use regular files. + conf.setUseCompoundFile(false); + // We don't use the RandomIndexWriter because we want to control the settings so we get + // deterministic test runs + try (IndexWriter writer = new IndexWriter(directory, conf)) { + FieldType customType = new FieldType(TextField.TYPE_STORED); + customType.setTokenized(false); + Field hostField = newField("host", "", customType); + var fieldType = new FieldType(); + fieldType.setStored(true); + fieldType.setTokenized(false); + Field idField = newField(random(), IdFieldMapper.NAME, getBytesRefFromString(""), fieldType); + + List indexedIds = new ArrayList<>(); + var docCount = atLeast(50); + for (int i = 0; i < docCount; i++) { + Document doc = new Document(); + doc.add(idField); + var id = getBytesRefFromString(UUIDs.randomBase64UUID()); + indexedIds.add(id); + idField.setBytesValue(id); + doc.add(hostField); + hostField.setStringValue("host-" + i); + writer.addDocument(doc); + } + + try (var directoryReader = StandardDirectoryReader.open(writer)) { + for (LeafReaderContext leaf : directoryReader.leaves()) { + try (ES93BloomFilterStoredFieldsFormat.Reader fieldReader = getBloomFilterReaderFrom(leaf)) { + var bloomFilterFieldReader = fieldReader.getBloomFilterFieldReader(); + // the bloom filter reader is null only if the _id field is not stored during indexing + assertThat(bloomFilterFieldReader, is(not(nullValue()))); + + for (BytesRef indexedId : indexedIds) { + assertThat(bloomFilterFieldReader.mayContainTerm(IdFieldMapper.NAME, indexedId), is(true)); + } + assertThat( + bloomFilterFieldReader.mayContainTerm(IdFieldMapper.NAME, getBytesRefFromString("random")), + is(false) + ); + + assertThat( + bloomFilterFieldReader.mayContainTerm(IdFieldMapper.NAME, getBytesRefFromString("12345")), + is(false) + ); + } + } + + var storedFields = directoryReader.storedFields(); + for (int docId = 0; docId < docCount; docId++) { + var document = storedFields.document(docId); + // The _id field is not actually stored, just used to build the bloom filter + assertThat(document.get(IdFieldMapper.NAME), nullValue()); + assertThat(document.get("host"), not(nullValue())); + } + } + } + } + } + + private static BytesRef getBytesRefFromString(String random) { + return new BytesRef(random.getBytes(StandardCharsets.UTF_8)); + } + + private ES93BloomFilterStoredFieldsFormat.Reader getBloomFilterReaderFrom(LeafReaderContext leafReaderContext) throws IOException { + LeafReader reader = leafReaderContext.reader(); + var fieldInfos = reader.getFieldInfos(); + assertThat(reader, 
is(instanceOf(SegmentReader.class))); + SegmentReader segmentReader = (SegmentReader) reader; + SegmentInfo si = segmentReader.getSegmentInfo().info; + + var storedFieldsReader = si.getCodec().storedFieldsFormat().fieldsReader(si.dir, si, fieldInfos, IOContext.DEFAULT); + assertThat(storedFieldsReader, is(instanceOf(ES93BloomFilterStoredFieldsFormat.Reader.class))); + return (ES93BloomFilterStoredFieldsFormat.Reader) storedFieldsReader; + } +} From a1af7b966a245fdceacabf1e723402564fc6ec1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 30 Oct 2025 15:00:22 +0100 Subject: [PATCH 02/10] Review comments Skip skipping the bloom filter field for all data types Compute the bloom filter size in a simpler way --- .../ES93BloomFilterStoredFieldsFormat.java | 70 +++++++++++-------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index 75998d982ccb9..600912fdcece0 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -74,21 +74,10 @@ public class ES93BloomFilterStoredFieldsFormat extends StoredFieldsFormat { // We use prime numbers with the Kirsch-Mitzenmacher technique to obtain multiple hashes from two hash functions private static final int[] PRIMES = new int[] { 2, 5, 11, 17, 23, 29, 41, 47, 53, 59, 71 }; - private static final int[] powerOfTwoBitSetSizes; private static final int DEFAULT_NUM_HASH_FUNCTIONS = 7; private static final byte BLOOM_FILTER_STORED = 1; private static final byte BLOOM_FILTER_NOT_STORED = 0; - - static { - // Precompute powers of two (2^1 to 2^26) for efficient modulo operations using bitwise AND. - // We start from 2^1 (2 bits) and go up to 2^26 (67,108,864 bits / 8,388,608 bytes = 8 MB) - // as the maximum, staying within positive int range. 
- powerOfTwoBitSetSizes = new int[27]; - for (int i = 0; i < powerOfTwoBitSetSizes.length; i++) { - powerOfTwoBitSetSizes[i] = 1 << i; - assert powerOfTwoBitSetSizes[i] > 0; - } - } + private static final ByteSizeValue MAX_BLOOM_FILTER_SIZE = ByteSizeValue.ofMb(8); private final BigArrays bigArrays; private final String segmentSuffix; @@ -109,15 +98,23 @@ public ES93BloomFilterStoredFieldsFormat( this.delegate = delegate; this.bloomFilterFieldName = bloomFilterFieldName; this.numHashFunctions = DEFAULT_NUM_HASH_FUNCTIONS; - int bloomFilterSizeInBits = 0; - // Find the closest power of 2 that fits the required size - for (int powerOfTwoBitSetSize : powerOfTwoBitSetSizes) { - if (powerOfTwoBitSetSize <= (Math.multiplyExact(bloomFilterSize.getBytes(), Byte.SIZE))) { - bloomFilterSizeInBits = powerOfTwoBitSetSize; - } + + if (bloomFilterSize.getBytes() <= 0) { + throw new IllegalArgumentException("bloom filter size must be greater than 0"); } - assert bloomFilterSizeInBits > 0; - this.bloomFilterSizeInBits = bloomFilterSizeInBits; + + var closestPowerOfTwoBloomFilterSizeInBytes = Long.highestOneBit(bloomFilterSize.getBytes()); + if (closestPowerOfTwoBloomFilterSizeInBytes > MAX_BLOOM_FILTER_SIZE.getBytes()) { + throw new IllegalArgumentException( + "bloom filter size [" + + bloomFilterSize + + "] is too large; " + + "must be " + + MAX_BLOOM_FILTER_SIZE + + " or less (rounded to nearest power of two)" + ); + } + this.bloomFilterSizeInBits = (int) Math.multiplyExact(closestPowerOfTwoBloomFilterSizeInBytes, Byte.SIZE); } @Override @@ -165,7 +162,7 @@ static class Writer extends StoredFieldsWriter { String bloomFilterFieldName, StoredFieldsWriter delegateWriter ) throws IOException { - assert isPowerOfTwo(bloomFilterSizeInBits) : "Bloom filter size is not a power of 2"; + assert isPowerOfTwo(bloomFilterSizeInBits) : "Bloom filter size is not a power of 2: " + bloomFilterSizeInBits; assert numHashFunctions <= PRIMES.length : "Number of hash functions must be <= " + PRIMES.length + " but was " + numHashFunctions; @@ -223,43 +220,60 @@ public void finishDocument() throws IOException { @Override public void writeField(FieldInfo info, int value) throws IOException { - delegateWriter.writeField(info, value); + if (isBloomFilterField(info) == false) { + delegateWriter.writeField(info, value); + } } @Override public void writeField(FieldInfo info, long value) throws IOException { - delegateWriter.writeField(info, value); + if (isBloomFilterField(info) == false) { + delegateWriter.writeField(info, value); + } } @Override public void writeField(FieldInfo info, float value) throws IOException { - delegateWriter.writeField(info, value); + if (isBloomFilterField(info) == false) { + delegateWriter.writeField(info, value); + } } @Override public void writeField(FieldInfo info, double value) throws IOException { - delegateWriter.writeField(info, value); + if (isBloomFilterField(info) == false) { + delegateWriter.writeField(info, value); + } } @Override public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { - delegateWriter.writeField(info, value); + if (isBloomFilterField(info) == false) { + delegateWriter.writeField(info, value); + } } @Override public void writeField(FieldInfo info, String value) throws IOException { - delegateWriter.writeField(info, value); + if (isBloomFilterField(info) == false) { + delegateWriter.writeField(info, value); + } } @Override public void writeField(FieldInfo info, BytesRef value) throws IOException { - if 
(info.getName().equals(bloomFilterFieldName)) { + if (isBloomFilterField(info)) { addToBloomFilter(info, value); } else { delegateWriter.writeField(info, value); } } + private boolean isBloomFilterField(FieldInfo info) { + return (bloomFilterFieldInfo != null && bloomFilterFieldInfo.getFieldNumber() == info.getFieldNumber()) + || info.getName().equals(bloomFilterFieldName); + } + private void addToBloomFilter(FieldInfo info, BytesRef value) { bloomFilterFieldInfo = info; var termHashes = hashTerm(value, hashes); From 78eafbf72b33bb715638075d96d1fbb32c8196ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 30 Oct 2025 15:00:50 +0100 Subject: [PATCH 03/10] Add more randomness --- .../bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java index 165b8232a2f31..a24ddd7a2d4cc 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -57,11 +57,12 @@ protected Codec getCodec() { return new AssertingCodec() { @Override public StoredFieldsFormat storedFieldsFormat() { + var bloomFilterSizeInKb = atLeast(2); return new ES93BloomFilterStoredFieldsFormat( BigArrays.NON_RECYCLING_INSTANCE, "", TestUtil.getDefaultCodec().storedFieldsFormat(), - ByteSizeValue.ofKb(2), + ByteSizeValue.ofKb(bloomFilterSizeInKb), IdFieldMapper.NAME ); } From d26a6e1a628b25ab772420542d1a6b1c9550d63b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 30 Oct 2025 15:16:32 +0100 Subject: [PATCH 04/10] Update docs/changelog/137331.yaml --- docs/changelog/137331.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/137331.yaml diff --git a/docs/changelog/137331.yaml b/docs/changelog/137331.yaml new file mode 100644 index 0000000000000..fc267dd859adb --- /dev/null +++ b/docs/changelog/137331.yaml @@ -0,0 +1,5 @@ +pr: 137331 +summary: Add ES93BloomFilterStoredFieldsFormat for efficient field existence checks +area: TSDB +type: enhancement +issues: [] From d68c2ad3b1bfbc1243d36983c4331b8462923e8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 30 Oct 2025 23:08:52 +0100 Subject: [PATCH 05/10] Improve test a bit --- ...S93BloomFilterStoredFieldsFormatTests.java | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java index a24ddd7a2d4cc..75b9af85e9995 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -14,8 +14,8 @@ import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.TextField; +import org.apache.lucene.document.LongField; +import 
org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; @@ -90,24 +90,15 @@ public void testBloomFilterFieldIsNotStoredAndBloomFilterCanBeChecked() throws I // We don't use the RandomIndexWriter because we want to control the settings so we get // deterministic test runs try (IndexWriter writer = new IndexWriter(directory, conf)) { - FieldType customType = new FieldType(TextField.TYPE_STORED); - customType.setTokenized(false); - Field hostField = newField("host", "", customType); - var fieldType = new FieldType(); - fieldType.setStored(true); - fieldType.setTokenized(false); - Field idField = newField(random(), IdFieldMapper.NAME, getBytesRefFromString(""), fieldType); - List indexedIds = new ArrayList<>(); var docCount = atLeast(50); for (int i = 0; i < docCount; i++) { Document doc = new Document(); - doc.add(idField); var id = getBytesRefFromString(UUIDs.randomBase64UUID()); indexedIds.add(id); - idField.setBytesValue(id); - doc.add(hostField); - hostField.setStringValue("host-" + i); + doc.add(new StringField(IdFieldMapper.NAME, id, Field.Store.YES)); + doc.add(new StringField("host", "host-" + i, Field.Store.YES)); + doc.add(new LongField("counter", i, Field.Store.YES)); writer.addDocument(doc); } @@ -139,6 +130,7 @@ public void testBloomFilterFieldIsNotStoredAndBloomFilterCanBeChecked() throws I // The _id field is not actually stored, just used to build the bloom filter assertThat(document.get(IdFieldMapper.NAME), nullValue()); assertThat(document.get("host"), not(nullValue())); + assertThat(document.get("counter"), not(nullValue())); } } } From 49fcc663a5162d045b8a9ba4dd150faf93a111e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Mon, 3 Nov 2025 09:51:26 +0100 Subject: [PATCH 06/10] Add more assertions --- .../bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java index 75b9af85e9995..6796d82150d15 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -40,6 +40,7 @@ import java.util.ArrayList; import java.util.List; +import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.not; @@ -130,7 +131,9 @@ public void testBloomFilterFieldIsNotStoredAndBloomFilterCanBeChecked() throws I // The _id field is not actually stored, just used to build the bloom filter assertThat(document.get(IdFieldMapper.NAME), nullValue()); assertThat(document.get("host"), not(nullValue())); + assertThat(document.get("host"), is(equalTo("host-" + docId))); assertThat(document.get("counter"), not(nullValue())); + assertThat(document.getField("counter").storedValue().getLongValue(), is(equalTo((long) docId))); } } } From 605c93de9cf0df375d6af7d37c48c66155b47740 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Mon, 3 Nov 2025 09:57:05 +0100 Subject: [PATCH 07/10] Start version from 0 --- .../codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index 600912fdcece0..b0bba971277f4 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -69,7 +69,7 @@ public class ES93BloomFilterStoredFieldsFormat extends StoredFieldsFormat { public static final String STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME = "ES93BloomFilterStoredFieldsFormat"; public static final String STORED_FIELDS_BLOOM_FILTER_EXTENSION = "sfbf"; public static final String STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION = "sfbfm"; - private static final int VERSION_START = 1; + private static final int VERSION_START = 0; private static final int VERSION_CURRENT = VERSION_START; // We use prime numbers with the Kirsch-Mitzenmacher technique to obtain multiple hashes from two hash functions From 1e4ea4c7b7d7caaa2537e6c1ef7e53520b0b2d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Mon, 3 Nov 2025 09:58:06 +0100 Subject: [PATCH 08/10] Use toIntExact --- .../codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index b0bba971277f4..054215d42cc10 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -114,7 +114,7 @@ public ES93BloomFilterStoredFieldsFormat( + " or less (rounded to nearest power of two)" ); } - this.bloomFilterSizeInBits = (int) Math.multiplyExact(closestPowerOfTwoBloomFilterSizeInBytes, Byte.SIZE); + this.bloomFilterSizeInBits = Math.toIntExact(Math.multiplyExact(closestPowerOfTwoBloomFilterSizeInBytes, Byte.SIZE)); } @Override From 4293207129d397cffea94e5fc1847ef1711b64c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Mon, 3 Nov 2025 09:59:09 +0100 Subject: [PATCH 09/10] Add assertion for bloom filter name --- .../codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index 054215d42cc10..800193a3d3c38 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -275,6 +275,7 @@ private boolean isBloomFilterField(FieldInfo info) { } private void addToBloomFilter(FieldInfo info, BytesRef value) { + assert info.getName().equals(bloomFilterFieldName): "Expected " + bloomFilterFieldName + " but got " + info; bloomFilterFieldInfo = info; var termHashes = hashTerm(value, hashes); for (int hash : termHashes) { From 65506011d14729672f26f41f57b99ebbdcb4b4d5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Mon, 3 Nov 2025 10:22:35 +0100 Subject: [PATCH 10/10] Extract interface --- .../ES93BloomFilterStoredFieldsFormat.java | 37 ++++++++++++------- ...S93BloomFilterStoredFieldsFormatTests.java | 27 ++++++-------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index 800193a3d3c38..63719af9135c2 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -275,7 +275,7 @@ private boolean isBloomFilterField(FieldInfo info) { } private void addToBloomFilter(FieldInfo info, BytesRef value) { - assert info.getName().equals(bloomFilterFieldName): "Expected " + bloomFilterFieldName + " but got " + info; + assert info.getName().equals(bloomFilterFieldName) : "Expected " + bloomFilterFieldName + " but got " + info; bloomFilterFieldInfo = info; var termHashes = hashTerm(value, hashes); for (int hash : termHashes) { @@ -338,7 +338,7 @@ public long ramBytesUsed() { } } - public static class Reader extends StoredFieldsReader { + private static class Reader extends StoredFieldsReader implements BloomFilterProvider { @Nullable private final BloomFilterFieldReader bloomFilterFieldReader; private final StoredFieldsReader delegateReader; @@ -386,8 +386,8 @@ public void document(int docID, StoredFieldVisitor visitor) throws IOException { delegateReader.document(docID, visitor); } - @Nullable - BloomFilterFieldReader getBloomFilterFieldReader() { + @Override + public BloomFilter getBloomFilter() throws IOException { return bloomFilterFieldReader; } } @@ -418,7 +418,7 @@ static BloomFilterMetadata readFrom(IndexInput in, FieldInfos fieldInfos) throws } } - public static class BloomFilterFieldReader implements Closeable { + static class BloomFilterFieldReader implements BloomFilter { private final FieldInfo fieldInfo; private final IndexInput bloomFilterData; private final RandomAccessInput bloomFilterIn; @@ -426,13 +426,8 @@ public static class BloomFilterFieldReader implements Closeable { private final int[] hashes; @Nullable - public static BloomFilterFieldReader open( - Directory directory, - SegmentInfo si, - FieldInfos fn, - IOContext context, - String segmentSuffix - ) throws IOException { + static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context, String segmentSuffix) + throws IOException { List toClose = new ArrayList<>(); boolean success = false; try (var metaInput = directory.openChecksumInput(bloomFilterMetadataFileName(si, segmentSuffix))) { @@ -486,7 +481,7 @@ public static BloomFilterFieldReader open( } } - public BloomFilterFieldReader( + BloomFilterFieldReader( FieldInfo fieldInfo, RandomAccessInput bloomFilterIn, int bloomFilterSizeInBits, @@ -551,4 +546,20 @@ private static String bloomFilterMetadataFileName(SegmentInfo segmentInfo, Strin private static String bloomFilterFileName(SegmentInfo segmentInfo, String segmentSuffix) { return IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, STORED_FIELDS_BLOOM_FILTER_EXTENSION); } + + public interface BloomFilter extends Closeable { + /** + * Tests whether the given term may exist in the specified field. 
+ * + * @param field the field name to check + * @param term the term to test for membership + * @return true if term may be present, false if definitely absent + */ + boolean mayContainTerm(String field, BytesRef term) throws IOException; + } + + public interface BloomFilterProvider extends Closeable { + @Nullable + BloomFilter getBloomFilter() throws IOException; + } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java index 6796d82150d15..ebcf5fcec5758 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -105,23 +105,17 @@ public void testBloomFilterFieldIsNotStoredAndBloomFilterCanBeChecked() throws I try (var directoryReader = StandardDirectoryReader.open(writer)) { for (LeafReaderContext leaf : directoryReader.leaves()) { - try (ES93BloomFilterStoredFieldsFormat.Reader fieldReader = getBloomFilterReaderFrom(leaf)) { - var bloomFilterFieldReader = fieldReader.getBloomFilterFieldReader(); + try (ES93BloomFilterStoredFieldsFormat.BloomFilterProvider fieldReader = getBloomFilterProvider(leaf)) { + var bloomFilter = fieldReader.getBloomFilter(); // the bloom filter reader is null only if the _id field is not stored during indexing - assertThat(bloomFilterFieldReader, is(not(nullValue()))); + assertThat(bloomFilter, is(not(nullValue()))); for (BytesRef indexedId : indexedIds) { - assertThat(bloomFilterFieldReader.mayContainTerm(IdFieldMapper.NAME, indexedId), is(true)); + assertThat(bloomFilter.mayContainTerm(IdFieldMapper.NAME, indexedId), is(true)); } - assertThat( - bloomFilterFieldReader.mayContainTerm(IdFieldMapper.NAME, getBytesRefFromString("random")), - is(false) - ); - - assertThat( - bloomFilterFieldReader.mayContainTerm(IdFieldMapper.NAME, getBytesRefFromString("12345")), - is(false) - ); + assertThat(bloomFilter.mayContainTerm(IdFieldMapper.NAME, getBytesRefFromString("random")), is(false)); + + assertThat(bloomFilter.mayContainTerm(IdFieldMapper.NAME, getBytesRefFromString("12345")), is(false)); } } @@ -144,7 +138,8 @@ private static BytesRef getBytesRefFromString(String random) { return new BytesRef(random.getBytes(StandardCharsets.UTF_8)); } - private ES93BloomFilterStoredFieldsFormat.Reader getBloomFilterReaderFrom(LeafReaderContext leafReaderContext) throws IOException { + private ES93BloomFilterStoredFieldsFormat.BloomFilterProvider getBloomFilterProvider(LeafReaderContext leafReaderContext) + throws IOException { LeafReader reader = leafReaderContext.reader(); var fieldInfos = reader.getFieldInfos(); assertThat(reader, is(instanceOf(SegmentReader.class))); @@ -152,7 +147,7 @@ private ES93BloomFilterStoredFieldsFormat.Reader getBloomFilterReaderFrom(LeafRe SegmentInfo si = segmentReader.getSegmentInfo().info; var storedFieldsReader = si.getCodec().storedFieldsFormat().fieldsReader(si.dir, si, fieldInfos, IOContext.DEFAULT); - assertThat(storedFieldsReader, is(instanceOf(ES93BloomFilterStoredFieldsFormat.Reader.class))); - return (ES93BloomFilterStoredFieldsFormat.Reader) storedFieldsReader; + assertThat(storedFieldsReader, is(instanceOf(ES93BloomFilterStoredFieldsFormat.BloomFilterProvider.class))); + return ((ES93BloomFilterStoredFieldsFormat.BloomFilterProvider) storedFieldsReader); } }
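
The Kirsch-Mitzenmacher derivation that hashTerm implements above condenses to a few lines once pulled out of the codec plumbing. The following is a minimal, self-contained Java sketch of the same scheme, not the patch's implementation: it uses a plain long[] bitset and an FNV-1a stand-in where the real code hashes BytesRef bytes with MurmurHash3.hash64 into a BigArrays-backed ByteArray, but the probe-position arithmetic (two 32-bit halves of one 64-bit hash, prime multipliers, a sign-bit mask, and a power-of-two AND) is the same.

import java.nio.charset.StandardCharsets;

// Minimal sketch of the probing scheme used by ES93BloomFilterStoredFieldsFormat:
// k probe positions are derived from one 64-bit hash (Kirsch-Mitzenmacher), and
// the bitset size is a power of two so "hash % size" reduces to a bitwise AND.
// The long[] bitset, String input, and FNV-1a hash are simplifications.
final class BloomSketch {
    private static final int[] PRIMES = { 2, 5, 11, 17, 23, 29, 41 };

    private final long[] bits;
    private final int sizeInBits; // must be a power of two

    BloomSketch(int sizeInBits) {
        assert (sizeInBits & (sizeInBits - 1)) == 0 : "size must be a power of two";
        this.sizeInBits = sizeInBits;
        this.bits = new long[sizeInBits / Long.SIZE];
    }

    void add(String term) {
        for (int pos : positions(term)) {
            bits[pos >> 6] |= 1L << (pos & 63); // word index = pos / 64, bit = pos % 64
        }
    }

    boolean mayContain(String term) {
        for (int pos : positions(term)) {
            if ((bits[pos >> 6] & (1L << (pos & 63))) == 0) {
                return false; // a probed bit is clear: term is definitely absent
            }
        }
        return true; // all probed bits set: term is possibly present
    }

    // Kirsch-Mitzenmacher: derive k positions as lowerHalf + PRIMES[i] * upperHalf
    // from the two halves of a single 64-bit hash, then mask down to the bitset size.
    private int[] positions(String term) {
        long hash64 = fnv64(term.getBytes(StandardCharsets.UTF_8)); // stand-in hash
        int lowerHalf = (int) hash64;
        int upperHalf = (int) (hash64 >> 32);
        int[] out = new int[PRIMES.length];
        for (int i = 0; i < out.length; i++) {
            int h = (lowerHalf + PRIMES[i] * upperHalf) & 0x7FFF_FFFF; // positive 31-bit value
            out[i] = h & (sizeInBits - 1); // modulo via AND, size is a power of two
        }
        return out;
    }

    // FNV-1a, used here only so the sketch is dependency-free.
    private static long fnv64(byte[] data) {
        long h = 0xcbf29ce484222325L;
        for (byte b : data) {
            h = (h ^ (b & 0xff)) * 0x100000001b3L;
        }
        return h;
    }

    public static void main(String[] args) {
        BloomSketch filter = new BloomSketch(1 << 14); // 16 Kbit filter
        filter.add("doc-1");
        System.out.println(filter.mayContain("doc-1")); // true
        System.out.println(filter.mayContain("doc-2")); // false, with high probability
    }
}

The power-of-two constraint is what keeps each of the 7 probes a single AND plus one byte read, which is why the format rounds the configured size down via Long.highestOneBit rather than accepting arbitrary sizes.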