Skip to content

Commit f790311

Browse files
authored
Add tsid builder (elastic#133344)
1 parent 84a4bf4 commit f790311

File tree

3 files changed

+485
-0
lines changed

3 files changed

+485
-0
lines changed
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.cluster.routing;
11+
12+
import org.apache.lucene.util.BytesRef;
13+
import org.elasticsearch.common.hash.Murmur3Hasher;
14+
import org.elasticsearch.common.hash.MurmurHash3;
15+
import org.elasticsearch.common.util.ByteUtils;
16+
import org.elasticsearch.index.mapper.RoutingPathFields;
17+
import org.elasticsearch.xcontent.XContentString;
18+
19+
import java.util.ArrayList;
20+
import java.util.Collections;
21+
import java.util.List;
22+
23+
/**
24+
* A builder for creating time series identifiers (TSIDs) based on dimensions.
25+
* This class allows adding various types of dimensions (int, long, double, boolean, string, bytes)
26+
* and builds a TSID that is a hash of the dimension names and values.
27+
* Important properties of TSIDs are that they cluster similar time series together,
28+
* which helps with storage efficiency,
29+
* and that they minimize the risk of hash collisions.
30+
* At the same time, they should be short to be efficient in terms of storage and processing.
31+
*/
32+
public class TsidBuilder {
33+
34+
private static final int MAX_TSID_VALUE_FIELDS = 16;
35+
private final Murmur3Hasher murmur3Hasher = new Murmur3Hasher(0L);
36+
37+
private final List<Dimension> dimensions = new ArrayList<>();
38+
39+
public static TsidBuilder newBuilder() {
40+
return new TsidBuilder();
41+
}
42+
43+
/**
44+
* Adds an integer dimension to the TSID.
45+
*
46+
* @param path the path of the dimension
47+
* @param value the integer value of the dimension
48+
* @return the TsidBuilder instance for method chaining
49+
*/
50+
public TsidBuilder addIntDimension(String path, int value) {
51+
addDimension(path, new MurmurHash3.Hash128(1, value));
52+
return this;
53+
}
54+
55+
/**
56+
* Adds a long dimension to the TSID.
57+
*
58+
* @param path the path of the dimension
59+
* @param value the long value of the dimension
60+
* @return the TsidBuilder instance for method chaining
61+
*/
62+
public TsidBuilder addLongDimension(String path, long value) {
63+
addDimension(path, new MurmurHash3.Hash128(1, value));
64+
return this;
65+
}
66+
67+
/**
68+
* Adds a double dimension to the TSID.
69+
*
70+
* @param path the path of the dimension
71+
* @param value the double value of the dimension
72+
* @return the TsidBuilder instance for method chaining
73+
*/
74+
public TsidBuilder addDoubleDimension(String path, double value) {
75+
addDimension(path, new MurmurHash3.Hash128(2, Double.doubleToLongBits(value)));
76+
return this;
77+
}
78+
79+
/**
80+
* Adds a boolean dimension to the TSID.
81+
*
82+
* @param path the path of the dimension
83+
* @param value the boolean value of the dimension
84+
* @return the TsidBuilder instance for method chaining
85+
*/
86+
public TsidBuilder addBooleanDimension(String path, boolean value) {
87+
addDimension(path, new MurmurHash3.Hash128(3, value ? 1 : 0));
88+
return this;
89+
}
90+
91+
/**
92+
* Adds a string dimension to the TSID.
93+
*
94+
* @param path the path of the dimension
95+
* @param value the string value of the dimension
96+
* @return the TsidBuilder instance for method chaining
97+
*/
98+
public TsidBuilder addStringDimension(String path, String value) {
99+
addStringDimension(path, new BytesRef(value));
100+
return this;
101+
}
102+
103+
private void addStringDimension(String path, BytesRef value) {
104+
addStringDimension(path, value.bytes, value.offset, value.length);
105+
}
106+
107+
/**
108+
* Adds a string dimension to the TSID.
109+
*
110+
* @param path the path of the dimension
111+
* @param value the UTF8Bytes value of the dimension
112+
* @return the TsidBuilder instance for method chaining
113+
*/
114+
public TsidBuilder addStringDimension(String path, XContentString.UTF8Bytes value) {
115+
addStringDimension(path, value.bytes(), value.offset(), value.length());
116+
return this;
117+
}
118+
119+
/**
120+
* Adds a string dimension to the TSID using a byte array.
121+
* The value is provided as UTF-8 encoded bytes[].
122+
*
123+
* @param path the path of the dimension
124+
* @param utf8Bytes the UTF-8 encoded bytes of the string value
125+
* @param offset the offset in the byte array where the string starts
126+
* @param length the length of the string in bytes
127+
* @return the TsidBuilder instance for method chaining
128+
*/
129+
public TsidBuilder addStringDimension(String path, byte[] utf8Bytes, int offset, int length) {
130+
murmur3Hasher.reset();
131+
murmur3Hasher.update(utf8Bytes, offset, length);
132+
MurmurHash3.Hash128 hash128 = murmur3Hasher.digestHash();
133+
addDimension(path, hash128);
134+
return this;
135+
}
136+
137+
/**
138+
* Adds a value to the TSID using a funnel.
139+
* This allows for complex types to be added to the TSID.
140+
*
141+
* @param value the value to add
142+
* @param funnel the funnel that describes how to add the value
143+
* @param <T> the type of the value
144+
* @return the TsidBuilder instance for method chaining
145+
*/
146+
public <T> TsidBuilder add(T value, TsidFunnel<T> funnel) {
147+
funnel.add(value, this);
148+
return this;
149+
}
150+
151+
/**
152+
* Adds a value to the TSID using a funnel that can throw exceptions.
153+
* This allows for complex types to be added to the TSID.
154+
*
155+
* @param value the value to add
156+
* @param funnel the funnel that describes how to add the value
157+
* @param <T> the type of the value
158+
* @param <E> the type of exception that can be thrown
159+
* @return the TsidBuilder instance for method chaining
160+
* @throws E if an exception occurs while adding the value
161+
*/
162+
public <T, E extends Exception> TsidBuilder add(T value, ThrowingTsidFunnel<T, E> funnel) throws E {
163+
funnel.add(value, this);
164+
return this;
165+
}
166+
167+
private void addDimension(String path, MurmurHash3.Hash128 valueHash) {
168+
murmur3Hasher.reset();
169+
addString(murmur3Hasher, path);
170+
MurmurHash3.Hash128 pathHash = murmur3Hasher.digestHash();
171+
dimensions.add(new Dimension(path, pathHash, valueHash, dimensions.size()));
172+
}
173+
174+
/**
175+
* Adds all dimensions from another TsidBuilder to this one.
176+
* If the other builder is null or has no dimensions, this method does nothing.
177+
*
178+
* @param other the other TsidBuilder to add dimensions from
179+
* @return this TsidBuilder instance for method chaining
180+
*/
181+
public TsidBuilder addAll(TsidBuilder other) {
182+
if (other == null || other.dimensions.isEmpty()) {
183+
return this;
184+
}
185+
dimensions.addAll(other.dimensions);
186+
return this;
187+
}
188+
189+
/**
190+
* Computes the hash of the dimensions added to this builder.
191+
* The hash is a 128-bit value that is computed based on the dimension names and values.
192+
*
193+
* @return a HashValue128 representing the hash of the dimensions
194+
* @throws IllegalArgumentException if no dimensions have been added
195+
*/
196+
public MurmurHash3.Hash128 hash() {
197+
throwIfEmpty();
198+
Collections.sort(dimensions);
199+
murmur3Hasher.reset();
200+
for (Dimension dim : dimensions) {
201+
addLongs(murmur3Hasher, dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
202+
}
203+
return murmur3Hasher.digestHash();
204+
}
205+
206+
/**
207+
* Builds a time series identifier (TSID) based on the dimensions added to this builder.
208+
* This is a slight adaptation of {@link RoutingPathFields#buildHash()} but creates shorter tsids.
209+
* The TSID is a hash that includes:
210+
* <ul>
211+
* <li>
212+
* A hash of the dimension field names (4 bytes).
213+
* This is to cluster time series that are using the same dimensions together, which makes the encodings more effective.
214+
* </li>
215+
* <li>
216+
* A hash of the dimension field values (1 byte each, up to a maximum of 16 fields).
217+
* This is to cluster time series with similar values together, also helping with making encodings more effective.
218+
* </li>
219+
* <li>
220+
* A hash of all names and values combined (16 bytes).
221+
* This is to avoid hash collisions.
222+
* </li>
223+
* </ul>
224+
*
225+
* @return a BytesRef containing the TSID
226+
* @throws IllegalArgumentException if no dimensions have been added
227+
*/
228+
public BytesRef buildTsid() {
229+
throwIfEmpty();
230+
int numberOfValues = Math.min(MAX_TSID_VALUE_FIELDS, dimensions.size());
231+
byte[] hash = new byte[4 + numberOfValues + 16];
232+
int index = 0;
233+
234+
Collections.sort(dimensions);
235+
236+
MurmurHash3.Hash128 hashBuffer = new MurmurHash3.Hash128();
237+
murmur3Hasher.reset();
238+
for (int i = 0; i < dimensions.size(); i++) {
239+
Dimension dim = dimensions.get(i);
240+
addLong(murmur3Hasher, dim.pathHash.h1 ^ dim.pathHash.h2);
241+
}
242+
ByteUtils.writeIntLE((int) murmur3Hasher.digestHash(hashBuffer).h1, hash, index);
243+
index += 4;
244+
245+
// similarity hash for values
246+
String previousPath = null;
247+
for (int i = 0; i < numberOfValues; i++) {
248+
Dimension dim = dimensions.get(i);
249+
String path = dim.path();
250+
if (path.equals(previousPath)) {
251+
// only add the first value for array fields
252+
continue;
253+
}
254+
MurmurHash3.Hash128 valueHash = dim.valueHash();
255+
murmur3Hasher.reset();
256+
addLong(murmur3Hasher, valueHash.h1 ^ valueHash.h2);
257+
hash[index++] = (byte) murmur3Hasher.digestHash(hashBuffer).h1;
258+
previousPath = path;
259+
}
260+
261+
murmur3Hasher.reset();
262+
for (int i = 0; i < dimensions.size(); i++) {
263+
Dimension dim = dimensions.get(i);
264+
addLongs(murmur3Hasher, dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
265+
}
266+
index = writeHash128(murmur3Hasher.digestHash(hashBuffer), hash, index);
267+
return new BytesRef(hash, 0, index);
268+
}
269+
270+
private void throwIfEmpty() {
271+
if (dimensions.isEmpty()) {
272+
throw new IllegalArgumentException("Dimensions are empty");
273+
}
274+
}
275+
276+
private static int writeHash128(MurmurHash3.Hash128 hash128, byte[] buffer, int index) {
277+
ByteUtils.writeLongLE(hash128.h2, buffer, index);
278+
index += 8;
279+
ByteUtils.writeLongLE(hash128.h1, buffer, index);
280+
index += 8;
281+
return index;
282+
}
283+
284+
/**
285+
* A functional interface that describes how objects of a complex type are added to a TSID.
286+
*
287+
* @param <T> the type of the value
288+
*/
289+
@FunctionalInterface
290+
public interface TsidFunnel<T> {
291+
void add(T value, TsidBuilder tsidBuilder);
292+
}
293+
294+
/**
295+
* A functional interface that describes how objects of a complex type are added to a TSID,
296+
* allowing for exceptions to be thrown during the process.
297+
*
298+
* @param <T> the type of the value
299+
* @param <E> the type of exception that can be thrown
300+
*/
301+
@FunctionalInterface
302+
public interface ThrowingTsidFunnel<T, E extends Exception> {
303+
void add(T value, TsidBuilder tsidBuilder) throws E;
304+
}
305+
306+
private record Dimension(String path, MurmurHash3.Hash128 pathHash, MurmurHash3.Hash128 valueHash, int insertionOrder)
307+
implements
308+
Comparable<Dimension> {
309+
@Override
310+
public int compareTo(Dimension o) {
311+
int i = path.compareTo(o.path);
312+
if (i != 0) return i;
313+
// ensures array values are in the order as they appear in the source
314+
return Integer.compare(insertionOrder, o.insertionOrder);
315+
}
316+
}
317+
318+
// these methods will be replaced with a more optimized version when https://github.com/elastic/elasticsearch/pull/133226 is merged
319+
320+
private static void addString(Murmur3Hasher murmur3Hasher, String path) {
321+
BytesRef bytesRef = new BytesRef(path);
322+
murmur3Hasher.update(bytesRef.bytes, bytesRef.offset, bytesRef.length);
323+
}
324+
325+
private static void addLong(Murmur3Hasher murmur3Hasher, long value) {
326+
byte[] bytes = new byte[8];
327+
ByteUtils.writeLongLE(value, bytes, 0);
328+
murmur3Hasher.update(bytes);
329+
}
330+
331+
private static void addLongs(Murmur3Hasher murmur3Hasher, long v1, long v2) {
332+
byte[] bytes = new byte[16];
333+
ByteUtils.writeLongLE(v1, bytes, 0);
334+
ByteUtils.writeLongLE(v2, bytes, 8);
335+
murmur3Hasher.update(bytes);
336+
}
337+
338+
private static void addLongs(Murmur3Hasher murmur3Hasher, long v1, long v2, long v3, long v4) {
339+
byte[] bytes = new byte[32];
340+
ByteUtils.writeLongLE(v1, bytes, 0);
341+
ByteUtils.writeLongLE(v2, bytes, 8);
342+
ByteUtils.writeLongLE(v3, bytes, 16);
343+
ByteUtils.writeLongLE(v4, bytes, 24);
344+
murmur3Hasher.update(bytes);
345+
}
346+
}

server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ public static class Hash128 {
2929
/** higher 64 bits part **/
3030
public long h2;
3131

32+
public Hash128() {}
33+
34+
public Hash128(long h1, long h2) {
35+
this.h1 = h1;
36+
this.h2 = h2;
37+
}
38+
3239
public byte[] getBytes() {
3340
byte[] hash = new byte[16];
3441
getBytes(hash, 0);

0 commit comments

Comments
 (0)