Skip to content

Commit 2f6ea7a

Browse files
authored
Add ES93BloomFilterStoredFieldsFormat for efficient field existence checks (#137331)
Introduces a new stored fields format that builds a Bloom filter for a specific field to enable fast existence checks without storing the field itself. This delegates storage of all other fields to another StoredFieldsFormat while maintaining the ability to quickly determine if a document might contain the target field.
1 parent f2a05ea commit 2f6ea7a

File tree

5 files changed

+918
-181
lines changed

5 files changed

+918
-181
lines changed

docs/changelog/137331.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 137331
2+
summary: Add ES93BloomFilterStoredFieldsFormat for efficient field existence checks
3+
area: TSDB
4+
type: enhancement
5+
issues: []
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.codec.bloomfilter;
11+
12+
import org.elasticsearch.common.util.ByteUtils;
13+
14+
public class BloomFilterHashFunctions {
15+
private BloomFilterHashFunctions() {}
16+
17+
//
18+
// The following Murmur3 implementation is borrowed from commons-codec.
19+
//
20+
/**
21+
* Implementation of the MurmurHash3 128-bit hash functions.
22+
*
23+
* <p>
24+
* MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
25+
* operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
26+
* specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
27+
* </p>
28+
*
29+
* <p>
30+
* This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
31+
* {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
32+
* </p>
33+
*
34+
* <p>
35+
* This is public domain code with no copyrights. From home page of
36+
* <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
37+
* </p>
38+
*
39+
* <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
40+
* code." </blockquote>
41+
*
42+
* <p>
43+
* Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
44+
* MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
45+
* 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
46+
* </p>
47+
*
48+
* @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
49+
* @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
50+
* code</a>
51+
* @see <a href=
52+
* "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
53+
* Apache Hive Murmer3</a>
54+
* @since 1.13
55+
*/
56+
public static final class MurmurHash3 {
57+
/**
58+
* A default seed to use for the murmur hash algorithm.
59+
* Has the value {@code 104729}.
60+
*/
61+
public static final int DEFAULT_SEED = 104729;
62+
63+
// Constants for 128-bit variant
64+
private static final long C1 = 0x87c37b91114253d5L;
65+
private static final long C2 = 0x4cf5ad432745937fL;
66+
private static final int R1 = 31;
67+
private static final int R2 = 27;
68+
private static final int R3 = 33;
69+
private static final int M = 5;
70+
private static final int N1 = 0x52dce729;
71+
private static final int N2 = 0x38495ab5;
72+
73+
/** No instance methods. */
74+
private MurmurHash3() {}
75+
76+
/**
77+
* Generates 64-bit hash from the byte array with the given offset, length and seed by discarding the second value of the 128-bit
78+
* hash.
79+
*
80+
* This version uses the default seed.
81+
*
82+
* @param data The input byte array
83+
* @param offset The first element of array
84+
* @param length The length of array
85+
* @return The sum of the two 64-bit hashes that make up the hash128
86+
*/
87+
@SuppressWarnings("fallthrough")
88+
public static long hash64(final byte[] data, final int offset, final int length) {
89+
long h1 = MurmurHash3.DEFAULT_SEED;
90+
long h2 = MurmurHash3.DEFAULT_SEED;
91+
final int nblocks = length >> 4;
92+
93+
// body
94+
for (int i = 0; i < nblocks; i++) {
95+
final int index = offset + (i << 4);
96+
long k1 = ByteUtils.readLongLE(data, index);
97+
long k2 = ByteUtils.readLongLE(data, index + 8);
98+
99+
// mix functions for k1
100+
k1 *= C1;
101+
k1 = Long.rotateLeft(k1, R1);
102+
k1 *= C2;
103+
h1 ^= k1;
104+
h1 = Long.rotateLeft(h1, R2);
105+
h1 += h2;
106+
h1 = h1 * M + N1;
107+
108+
// mix functions for k2
109+
k2 *= C2;
110+
k2 = Long.rotateLeft(k2, R3);
111+
k2 *= C1;
112+
h2 ^= k2;
113+
h2 = Long.rotateLeft(h2, R1);
114+
h2 += h1;
115+
h2 = h2 * M + N2;
116+
}
117+
118+
// tail
119+
long k1 = 0;
120+
long k2 = 0;
121+
final int index = offset + (nblocks << 4);
122+
switch (offset + length - index) {
123+
case 15:
124+
k2 ^= ((long) data[index + 14] & 0xff) << 48;
125+
case 14:
126+
k2 ^= ((long) data[index + 13] & 0xff) << 40;
127+
case 13:
128+
k2 ^= ((long) data[index + 12] & 0xff) << 32;
129+
case 12:
130+
k2 ^= ((long) data[index + 11] & 0xff) << 24;
131+
case 11:
132+
k2 ^= ((long) data[index + 10] & 0xff) << 16;
133+
case 10:
134+
k2 ^= ((long) data[index + 9] & 0xff) << 8;
135+
case 9:
136+
k2 ^= data[index + 8] & 0xff;
137+
k2 *= C2;
138+
k2 = Long.rotateLeft(k2, R3);
139+
k2 *= C1;
140+
h2 ^= k2;
141+
142+
case 8:
143+
k1 ^= ((long) data[index + 7] & 0xff) << 56;
144+
case 7:
145+
k1 ^= ((long) data[index + 6] & 0xff) << 48;
146+
case 6:
147+
k1 ^= ((long) data[index + 5] & 0xff) << 40;
148+
case 5:
149+
k1 ^= ((long) data[index + 4] & 0xff) << 32;
150+
case 4:
151+
k1 ^= ((long) data[index + 3] & 0xff) << 24;
152+
case 3:
153+
k1 ^= ((long) data[index + 2] & 0xff) << 16;
154+
case 2:
155+
k1 ^= ((long) data[index + 1] & 0xff) << 8;
156+
case 1:
157+
k1 ^= data[index] & 0xff;
158+
k1 *= C1;
159+
k1 = Long.rotateLeft(k1, R1);
160+
k1 *= C2;
161+
h1 ^= k1;
162+
}
163+
164+
// finalization
165+
h1 ^= length;
166+
h2 ^= length;
167+
168+
h1 += h2;
169+
h2 += h1;
170+
171+
h1 = fmix64(h1);
172+
h2 = fmix64(h2);
173+
174+
h1 += h2;
175+
176+
return h1;
177+
}
178+
179+
/**
180+
* Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
181+
*
182+
* @param hash The current hash
183+
* @return The final hash
184+
*/
185+
private static long fmix64(long hash) {
186+
hash ^= (hash >>> 33);
187+
hash *= 0xff51afd7ed558ccdL;
188+
hash ^= (hash >>> 33);
189+
hash *= 0xc4ceb9fe1a85ec53L;
190+
hash ^= (hash >>> 33);
191+
return hash;
192+
}
193+
}
194+
}

0 commit comments

Comments
 (0)