Skip to content

Commit 11ac7de

Browse files
authored
Add BufferedMurmur3Hasher to reduce allocations when hashing Strings (#133226)
1 parent 6b1560d commit 11ac7de

File tree

4 files changed

+301
-39
lines changed

4 files changed

+301
-39
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/TsidBuilder.java

Lines changed: 7 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
package org.elasticsearch.cluster.routing;
1111

1212
import org.apache.lucene.util.BytesRef;
13-
import org.elasticsearch.common.hash.Murmur3Hasher;
13+
import org.elasticsearch.common.hash.BufferedMurmur3Hasher;
1414
import org.elasticsearch.common.hash.MurmurHash3;
1515
import org.elasticsearch.common.util.ByteUtils;
1616
import org.elasticsearch.index.mapper.RoutingPathFields;
@@ -32,7 +32,7 @@
3232
public class TsidBuilder {
3333

3434
private static final int MAX_TSID_VALUE_FIELDS = 16;
35-
private final Murmur3Hasher murmur3Hasher = new Murmur3Hasher(0L);
35+
private final BufferedMurmur3Hasher murmur3Hasher = new BufferedMurmur3Hasher(0L);
3636

3737
private final List<Dimension> dimensions = new ArrayList<>();
3838

@@ -166,7 +166,7 @@ public <T, E extends Exception> TsidBuilder add(T value, ThrowingTsidFunnel<T, E
166166

167167
private void addDimension(String path, MurmurHash3.Hash128 valueHash) {
168168
murmur3Hasher.reset();
169-
addString(murmur3Hasher, path);
169+
murmur3Hasher.addString(path);
170170
MurmurHash3.Hash128 pathHash = murmur3Hasher.digestHash();
171171
dimensions.add(new Dimension(path, pathHash, valueHash, dimensions.size()));
172172
}
@@ -198,7 +198,7 @@ public MurmurHash3.Hash128 hash() {
198198
Collections.sort(dimensions);
199199
murmur3Hasher.reset();
200200
for (Dimension dim : dimensions) {
201-
addLongs(murmur3Hasher, dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
201+
murmur3Hasher.addLongs(dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
202202
}
203203
return murmur3Hasher.digestHash();
204204
}
@@ -237,7 +237,7 @@ public BytesRef buildTsid() {
237237
murmur3Hasher.reset();
238238
for (int i = 0; i < dimensions.size(); i++) {
239239
Dimension dim = dimensions.get(i);
240-
addLong(murmur3Hasher, dim.pathHash.h1 ^ dim.pathHash.h2);
240+
murmur3Hasher.addLong(dim.pathHash.h1 ^ dim.pathHash.h2);
241241
}
242242
ByteUtils.writeIntLE((int) murmur3Hasher.digestHash(hashBuffer).h1, hash, index);
243243
index += 4;
@@ -253,15 +253,15 @@ public BytesRef buildTsid() {
253253
}
254254
MurmurHash3.Hash128 valueHash = dim.valueHash();
255255
murmur3Hasher.reset();
256-
addLong(murmur3Hasher, valueHash.h1 ^ valueHash.h2);
256+
murmur3Hasher.addLong(valueHash.h1 ^ valueHash.h2);
257257
hash[index++] = (byte) murmur3Hasher.digestHash(hashBuffer).h1;
258258
previousPath = path;
259259
}
260260

261261
murmur3Hasher.reset();
262262
for (int i = 0; i < dimensions.size(); i++) {
263263
Dimension dim = dimensions.get(i);
264-
addLongs(murmur3Hasher, dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
264+
murmur3Hasher.addLongs(dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
265265
}
266266
index = writeHash128(murmur3Hasher.digestHash(hashBuffer), hash, index);
267267
return new BytesRef(hash, 0, index);
@@ -314,33 +314,4 @@ public int compareTo(Dimension o) {
314314
return Integer.compare(insertionOrder, o.insertionOrder);
315315
}
316316
}
317-
318-
// these methods will be replaced with a more optimized version when https://github.com/elastic/elasticsearch/pull/133226 is merged
319-
320-
private static void addString(Murmur3Hasher murmur3Hasher, String path) {
321-
BytesRef bytesRef = new BytesRef(path);
322-
murmur3Hasher.update(bytesRef.bytes, bytesRef.offset, bytesRef.length);
323-
}
324-
325-
private static void addLong(Murmur3Hasher murmur3Hasher, long value) {
326-
byte[] bytes = new byte[8];
327-
ByteUtils.writeLongLE(value, bytes, 0);
328-
murmur3Hasher.update(bytes);
329-
}
330-
331-
private static void addLongs(Murmur3Hasher murmur3Hasher, long v1, long v2) {
332-
byte[] bytes = new byte[16];
333-
ByteUtils.writeLongLE(v1, bytes, 0);
334-
ByteUtils.writeLongLE(v2, bytes, 8);
335-
murmur3Hasher.update(bytes);
336-
}
337-
338-
private static void addLongs(Murmur3Hasher murmur3Hasher, long v1, long v2, long v3, long v4) {
339-
byte[] bytes = new byte[32];
340-
ByteUtils.writeLongLE(v1, bytes, 0);
341-
ByteUtils.writeLongLE(v2, bytes, 8);
342-
ByteUtils.writeLongLE(v3, bytes, 16);
343-
ByteUtils.writeLongLE(v4, bytes, 24);
344-
murmur3Hasher.update(bytes);
345-
}
346317
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.common.hash;
11+
12+
import org.apache.lucene.util.UnicodeUtil;
13+
import org.elasticsearch.common.util.ByteUtils;
14+
15+
/**
16+
* A buffered Murmur3 hasher that allows hashing strings and longs efficiently.
17+
* It uses a byte array buffer to reduce allocations for converting strings and longs to bytes before passing them to the hasher.
18+
* The buffer also allows for more efficient execution by minimizing the number of times the underlying hasher is updated,
19+
* and by maximizing the amount of data processed in each update call.
20+
*/
21+
public class BufferedMurmur3Hasher extends Murmur3Hasher {
22+
23+
public static final int DEFAULT_BUFFER_SIZE = 32 * 4; // 32 characters, each character may take up to 4 bytes in UTF-8
24+
/**
25+
* The buffer used for holding the UTF-8 encoded strings before passing them to the hasher.
26+
* Should be sized so that it can hold the longest UTF-8 encoded string that is expected to be hashed,
27+
* to avoid re-sizing the buffer.
28+
* But should also be small enough to not waste memory in case the keys are short.
29+
*/
30+
private byte[] buffer;
31+
private int pos;
32+
33+
public BufferedMurmur3Hasher(long seed) {
34+
this(seed, DEFAULT_BUFFER_SIZE);
35+
}
36+
37+
/**
38+
* Constructs a BufferedMurmur3Hasher with a specified seed and buffer size.
39+
*
40+
* @param seed the seed for the Murmur3 hash function
41+
* @param bufferSize the size of the buffer in bytes, must be at least 32
42+
*/
43+
public BufferedMurmur3Hasher(long seed, int bufferSize) {
44+
super(seed);
45+
if (bufferSize < 32) {
46+
throw new IllegalArgumentException("Buffer size must be at least 32 bytes");
47+
}
48+
this.buffer = new byte[bufferSize];
49+
}
50+
51+
@Override
52+
public MurmurHash3.Hash128 digestHash(MurmurHash3.Hash128 hash) {
53+
flush();
54+
return super.digestHash(hash);
55+
}
56+
57+
@Override
58+
public void reset() {
59+
super.reset();
60+
pos = 0;
61+
}
62+
63+
/**
64+
* Adds a string to the hasher.
65+
* The string is converted to UTF-8 and written into the buffer.
66+
* The buffer is resized if necessary to accommodate the UTF-8 encoded string.
67+
*
68+
* @param value the string value to add
69+
*/
70+
public void addString(String value) {
71+
int requiredBufferLength = UnicodeUtil.maxUTF8Length(value.length());
72+
ensureCapacity(requiredBufferLength);
73+
flushIfRemainingCapacityLowerThan(requiredBufferLength);
74+
pos = UnicodeUtil.UTF16toUTF8(value, 0, value.length(), buffer, pos);
75+
}
76+
77+
/**
78+
* Adds a long value to the hasher.
79+
* The long is written in little-endian format.
80+
*
81+
* @param value the long value to add
82+
*/
83+
public void addLong(long value) {
84+
flushIfRemainingCapacityLowerThan(Long.BYTES);
85+
ByteUtils.writeLongLE(value, buffer, pos);
86+
pos += Long.BYTES;
87+
}
88+
89+
/**
90+
* Adds two long values to the hasher.
91+
* Each long is written in little-endian format.
92+
*
93+
* @param v1 the first long value to add
94+
* @param v2 the second long value to add
95+
*/
96+
public void addLongs(long v1, long v2) {
97+
flushIfRemainingCapacityLowerThan(Long.BYTES * 2);
98+
ByteUtils.writeLongLE(v1, buffer, pos);
99+
ByteUtils.writeLongLE(v2, buffer, pos + 8);
100+
pos += Long.BYTES * 2;
101+
}
102+
103+
/**
104+
* Adds four long values to the hasher.
105+
* Each long is written in little-endian format.
106+
*
107+
* @param v1 the first long value to add
108+
* @param v2 the second long value to add
109+
* @param v3 the third long value to add
110+
* @param v4 the fourth long value to add
111+
*/
112+
public void addLongs(long v1, long v2, long v3, long v4) {
113+
flushIfRemainingCapacityLowerThan(Long.BYTES * 4);
114+
ByteUtils.writeLongLE(v1, buffer, pos);
115+
ByteUtils.writeLongLE(v2, buffer, pos + 8);
116+
ByteUtils.writeLongLE(v3, buffer, pos + 16);
117+
ByteUtils.writeLongLE(v4, buffer, pos + 24);
118+
pos += Long.BYTES * 4;
119+
}
120+
121+
private void ensureCapacity(int requiredBufferLength) {
122+
if (buffer.length < requiredBufferLength) {
123+
flush();
124+
buffer = new byte[requiredBufferLength];
125+
}
126+
}
127+
128+
private void flush() {
129+
if (pos > 0) {
130+
update(buffer, 0, pos);
131+
pos = 0;
132+
}
133+
}
134+
135+
private void flushIfRemainingCapacityLowerThan(int requiredCapacity) {
136+
if (buffer.length - pos < requiredCapacity) {
137+
flush();
138+
}
139+
}
140+
}

server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import org.elasticsearch.common.util.ByteUtils;
1313

1414
import java.math.BigInteger;
15-
import java.util.Objects;
1615

1716
/**
1817
* MurmurHash3 hashing functions.
@@ -56,12 +55,12 @@ public boolean equals(Object other) {
5655
return false;
5756
}
5857
Hash128 that = (Hash128) other;
59-
return Objects.equals(this.h1, that.h1) && Objects.equals(this.h2, that.h2);
58+
return this.h1 == that.h1 && this.h2 == that.h2;
6059
}
6160

6261
@Override
6362
public int hashCode() {
64-
return Objects.hash(h1, h2);
63+
return (int) (h1 ^ h2);
6564
}
6665

6766
@Override

0 commit comments

Comments
 (0)