Skip to content

Commit dab897f

Browse files
committed
Add ES93BloomFilterStoredFieldsFormat for efficient field existence checks
Introduces a new stored fields format that builds a Bloom filter for a specific field to enable fast existence checks without storing the field itself. This delegates storage of all other fields to another StoredFieldsFormat while maintaining the ability to quickly determine if a document might contain the target field.
1 parent c5b0360 commit dab897f

File tree

4 files changed

+896
-181
lines changed

4 files changed

+896
-181
lines changed
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.codec.bloomfilter;
11+
12+
import org.elasticsearch.common.util.ByteUtils;
13+
14+
public class BloomFilterHashFunctions {
15+
private BloomFilterHashFunctions() {}
16+
17+
//
18+
// The following Murmur3 implementation is borrowed from commons-codec.
19+
//
20+
/**
21+
* Implementation of the MurmurHash3 128-bit hash functions.
22+
*
23+
* <p>
24+
* MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
25+
* operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
26+
* specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
27+
* </p>
28+
*
29+
* <p>
30+
* This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
31+
* {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
32+
* </p>
33+
*
34+
* <p>
35+
* This is public domain code with no copyrights. From home page of
36+
* <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
37+
* </p>
38+
*
39+
* <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
40+
* code." </blockquote>
41+
*
42+
* <p>
43+
* Original adaptation from Apache Hive. That adaptation contains a {@code hash64} method that is not part of the original
44+
* MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
45+
* 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
46+
* </p>
47+
*
48+
* @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
49+
* @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
50+
* code</a>
51+
* @see <a href=
52+
* "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
53+
* Apache Hive Murmur3</a>
54+
* @since 1.13
55+
*/
56+
public static final class MurmurHash3 {
57+
/**
58+
* A default seed to use for the murmur hash algorithm.
59+
* Has the value {@code 104729}.
60+
*/
61+
public static final int DEFAULT_SEED = 104729;
62+
63+
// Constants for 128-bit variant
64+
private static final long C1 = 0x87c37b91114253d5L;
65+
private static final long C2 = 0x4cf5ad432745937fL;
66+
private static final int R1 = 31;
67+
private static final int R2 = 27;
68+
private static final int R3 = 33;
69+
private static final int M = 5;
70+
private static final int N1 = 0x52dce729;
71+
private static final int N2 = 0x38495ab5;
72+
73+
/** No instance methods. */
74+
private MurmurHash3() {}
75+
76+
/**
77+
* Generates a 64-bit hash from the byte array with the given offset and length by discarding the second value of the 128-bit
78+
* hash.
79+
*
80+
* This version uses the default seed.
81+
*
82+
* @param data The input byte array
83+
* @param offset The index of the first byte of the input within the array
84+
* @param length The number of bytes to hash, starting at offset
85+
* @return The sum of the two 64-bit hashes that make up the hash128
86+
*/
87+
@SuppressWarnings("fallthrough")
88+
public static long hash64(final byte[] data, final int offset, final int length) {
89+
long h1 = MurmurHash3.DEFAULT_SEED;
90+
long h2 = MurmurHash3.DEFAULT_SEED;
91+
final int nblocks = length >> 4;
92+
93+
// body
94+
for (int i = 0; i < nblocks; i++) {
95+
final int index = offset + (i << 4);
96+
long k1 = ByteUtils.readLongLE(data, index);
97+
long k2 = ByteUtils.readLongLE(data, index + 8);
98+
99+
// mix functions for k1
100+
k1 *= C1;
101+
k1 = Long.rotateLeft(k1, R1);
102+
k1 *= C2;
103+
h1 ^= k1;
104+
h1 = Long.rotateLeft(h1, R2);
105+
h1 += h2;
106+
h1 = h1 * M + N1;
107+
108+
// mix functions for k2
109+
k2 *= C2;
110+
k2 = Long.rotateLeft(k2, R3);
111+
k2 *= C1;
112+
h2 ^= k2;
113+
h2 = Long.rotateLeft(h2, R1);
114+
h2 += h1;
115+
h2 = h2 * M + N2;
116+
}
117+
118+
// tail
119+
long k1 = 0;
120+
long k2 = 0;
121+
final int index = offset + (nblocks << 4);
122+
switch (offset + length - index) {
123+
case 15:
124+
k2 ^= ((long) data[index + 14] & 0xff) << 48;
125+
case 14:
126+
k2 ^= ((long) data[index + 13] & 0xff) << 40;
127+
case 13:
128+
k2 ^= ((long) data[index + 12] & 0xff) << 32;
129+
case 12:
130+
k2 ^= ((long) data[index + 11] & 0xff) << 24;
131+
case 11:
132+
k2 ^= ((long) data[index + 10] & 0xff) << 16;
133+
case 10:
134+
k2 ^= ((long) data[index + 9] & 0xff) << 8;
135+
case 9:
136+
k2 ^= data[index + 8] & 0xff;
137+
k2 *= C2;
138+
k2 = Long.rotateLeft(k2, R3);
139+
k2 *= C1;
140+
h2 ^= k2;
141+
142+
case 8:
143+
k1 ^= ((long) data[index + 7] & 0xff) << 56;
144+
case 7:
145+
k1 ^= ((long) data[index + 6] & 0xff) << 48;
146+
case 6:
147+
k1 ^= ((long) data[index + 5] & 0xff) << 40;
148+
case 5:
149+
k1 ^= ((long) data[index + 4] & 0xff) << 32;
150+
case 4:
151+
k1 ^= ((long) data[index + 3] & 0xff) << 24;
152+
case 3:
153+
k1 ^= ((long) data[index + 2] & 0xff) << 16;
154+
case 2:
155+
k1 ^= ((long) data[index + 1] & 0xff) << 8;
156+
case 1:
157+
k1 ^= data[index] & 0xff;
158+
k1 *= C1;
159+
k1 = Long.rotateLeft(k1, R1);
160+
k1 *= C2;
161+
h1 ^= k1;
162+
}
163+
164+
// finalization
165+
h1 ^= length;
166+
h2 ^= length;
167+
168+
h1 += h2;
169+
h2 += h1;
170+
171+
h1 = fmix64(h1);
172+
h2 = fmix64(h2);
173+
174+
h1 += h2;
175+
176+
return h1;
177+
}
178+
179+
/**
180+
* Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
181+
*
182+
* @param hash The current hash
183+
* @return The final hash
184+
*/
185+
private static long fmix64(long hash) {
186+
hash ^= (hash >>> 33);
187+
hash *= 0xff51afd7ed558ccdL;
188+
hash ^= (hash >>> 33);
189+
hash *= 0xc4ceb9fe1a85ec53L;
190+
hash ^= (hash >>> 33);
191+
return hash;
192+
}
193+
}
194+
}

server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java

Lines changed: 1 addition & 181 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
import org.elasticsearch.common.lucene.store.IndexOutputOutputStream;
4848
import org.elasticsearch.common.util.BigArrays;
4949
import org.elasticsearch.common.util.ByteArray;
50-
import org.elasticsearch.common.util.ByteUtils;
5150
import org.elasticsearch.core.IOUtils;
5251

5352
import java.io.Closeable;
@@ -548,7 +547,7 @@ static int numBytesForBloomFilter(int bloomFilterSize) {
548547
// Uses MurmurHash3-128 to generate a 64-bit hash value, then picks 7 subsets of 31 bits each and returns the values in the
549548
// outputs array. This provides us with 7 reasonably independent hashes of the data for the cost of one MurmurHash3 calculation.
550549
static int[] hashTerm(BytesRef br, int[] outputs) {
551-
final long hash64 = MurmurHash3.hash64(br.bytes, br.offset, br.length);
550+
final long hash64 = BloomFilterHashFunctions.MurmurHash3.hash64(br.bytes, br.offset, br.length);
552551
final int upperHalf = (int) (hash64 >> 32);
553552
final int lowerHalf = (int) hash64;
554553
// Derive 7 hash outputs by combining the two 32-bit halves, adding the upper half multiplied with different small constants
@@ -562,183 +561,4 @@ static int[] hashTerm(BytesRef br, int[] outputs) {
562561
outputs[6] = (lowerHalf + 17 * upperHalf) & 0x7FFF_FFFF;
563562
return outputs;
564563
}
565-
566-
//
567-
// The following Murmur3 implementation is borrowed from commons-codec.
568-
//
569-
570-
/**
571-
* Implementation of the MurmurHash3 128-bit hash functions.
572-
*
573-
* <p>
574-
* MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
575-
* operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
576-
* specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
577-
* </p>
578-
*
579-
* <p>
580-
* This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
581-
* {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
582-
* </p>
583-
*
584-
* <p>
585-
* This is public domain code with no copyrights. From home page of
586-
* <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
587-
* </p>
588-
*
589-
* <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
590-
* code." </blockquote>
591-
*
592-
* <p>
593-
* Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
594-
* MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
595-
* 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
596-
* </p>
597-
*
598-
* @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
599-
* @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
600-
* code</a>
601-
* @see <a href=
602-
* "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
603-
* Apache Hive Murmer3</a>
604-
* @since 1.13
605-
*/
606-
public static final class MurmurHash3 {
607-
/**
608-
* A default seed to use for the murmur hash algorithm.
609-
* Has the value {@code 104729}.
610-
*/
611-
public static final int DEFAULT_SEED = 104729;
612-
613-
// Constants for 128-bit variant
614-
private static final long C1 = 0x87c37b91114253d5L;
615-
private static final long C2 = 0x4cf5ad432745937fL;
616-
private static final int R1 = 31;
617-
private static final int R2 = 27;
618-
private static final int R3 = 33;
619-
private static final int M = 5;
620-
private static final int N1 = 0x52dce729;
621-
private static final int N2 = 0x38495ab5;
622-
623-
/** No instance methods. */
624-
private MurmurHash3() {}
625-
626-
/**
627-
* Generates 64-bit hash from the byte array with the given offset, length and seed by discarding the second value of the 128-bit
628-
* hash.
629-
*
630-
* This version uses the default seed.
631-
*
632-
* @param data The input byte array
633-
* @param offset The first element of array
634-
* @param length The length of array
635-
* @return The sum of the two 64-bit hashes that make up the hash128
636-
*/
637-
@SuppressWarnings("fallthrough")
638-
public static long hash64(final byte[] data, final int offset, final int length) {
639-
long h1 = MurmurHash3.DEFAULT_SEED;
640-
long h2 = MurmurHash3.DEFAULT_SEED;
641-
final int nblocks = length >> 4;
642-
643-
// body
644-
for (int i = 0; i < nblocks; i++) {
645-
final int index = offset + (i << 4);
646-
long k1 = ByteUtils.readLongLE(data, index);
647-
long k2 = ByteUtils.readLongLE(data, index + 8);
648-
649-
// mix functions for k1
650-
k1 *= C1;
651-
k1 = Long.rotateLeft(k1, R1);
652-
k1 *= C2;
653-
h1 ^= k1;
654-
h1 = Long.rotateLeft(h1, R2);
655-
h1 += h2;
656-
h1 = h1 * M + N1;
657-
658-
// mix functions for k2
659-
k2 *= C2;
660-
k2 = Long.rotateLeft(k2, R3);
661-
k2 *= C1;
662-
h2 ^= k2;
663-
h2 = Long.rotateLeft(h2, R1);
664-
h2 += h1;
665-
h2 = h2 * M + N2;
666-
}
667-
668-
// tail
669-
long k1 = 0;
670-
long k2 = 0;
671-
final int index = offset + (nblocks << 4);
672-
switch (offset + length - index) {
673-
case 15:
674-
k2 ^= ((long) data[index + 14] & 0xff) << 48;
675-
case 14:
676-
k2 ^= ((long) data[index + 13] & 0xff) << 40;
677-
case 13:
678-
k2 ^= ((long) data[index + 12] & 0xff) << 32;
679-
case 12:
680-
k2 ^= ((long) data[index + 11] & 0xff) << 24;
681-
case 11:
682-
k2 ^= ((long) data[index + 10] & 0xff) << 16;
683-
case 10:
684-
k2 ^= ((long) data[index + 9] & 0xff) << 8;
685-
case 9:
686-
k2 ^= data[index + 8] & 0xff;
687-
k2 *= C2;
688-
k2 = Long.rotateLeft(k2, R3);
689-
k2 *= C1;
690-
h2 ^= k2;
691-
692-
case 8:
693-
k1 ^= ((long) data[index + 7] & 0xff) << 56;
694-
case 7:
695-
k1 ^= ((long) data[index + 6] & 0xff) << 48;
696-
case 6:
697-
k1 ^= ((long) data[index + 5] & 0xff) << 40;
698-
case 5:
699-
k1 ^= ((long) data[index + 4] & 0xff) << 32;
700-
case 4:
701-
k1 ^= ((long) data[index + 3] & 0xff) << 24;
702-
case 3:
703-
k1 ^= ((long) data[index + 2] & 0xff) << 16;
704-
case 2:
705-
k1 ^= ((long) data[index + 1] & 0xff) << 8;
706-
case 1:
707-
k1 ^= data[index] & 0xff;
708-
k1 *= C1;
709-
k1 = Long.rotateLeft(k1, R1);
710-
k1 *= C2;
711-
h1 ^= k1;
712-
}
713-
714-
// finalization
715-
h1 ^= length;
716-
h2 ^= length;
717-
718-
h1 += h2;
719-
h2 += h1;
720-
721-
h1 = fmix64(h1);
722-
h2 = fmix64(h2);
723-
724-
h1 += h2;
725-
726-
return h1;
727-
}
728-
729-
/**
730-
* Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
731-
*
732-
* @param hash The current hash
733-
* @return The final hash
734-
*/
735-
private static long fmix64(long hash) {
736-
hash ^= (hash >>> 33);
737-
hash *= 0xff51afd7ed558ccdL;
738-
hash ^= (hash >>> 33);
739-
hash *= 0xc4ceb9fe1a85ec53L;
740-
hash ^= (hash >>> 33);
741-
return hash;
742-
}
743-
}
744564
}

0 commit comments

Comments
 (0)