diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/TsidBuilder.java b/server/src/main/java/org/elasticsearch/cluster/routing/TsidBuilder.java new file mode 100644 index 0000000000000..173400ddfcbbd --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/TsidBuilder.java @@ -0,0 +1,346 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.cluster.routing; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.hash.Murmur3Hasher; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.util.ByteUtils; +import org.elasticsearch.index.mapper.RoutingPathFields; +import org.elasticsearch.xcontent.XContentString; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * A builder for creating time series identifiers (TSIDs) based on dimensions. + * This class allows adding various types of dimensions (int, long, double, boolean, string, bytes) + * and builds a TSID that is a hash of the dimension names and values. + * Important properties of TSIDs are that they cluster similar time series together, + * which helps with storage efficiency, + * and that they minimize the risk of hash collisions. + * At the same time, they should be short to be efficient in terms of storage and processing. + */ +public class TsidBuilder { + + private static final int MAX_TSID_VALUE_FIELDS = 16; + private final Murmur3Hasher murmur3Hasher = new Murmur3Hasher(0L); + + private final List dimensions = new ArrayList<>(); + + public static TsidBuilder newBuilder() { + return new TsidBuilder(); + } + + /** + * Adds an integer dimension to the TSID. + * + * @param path the path of the dimension + * @param value the integer value of the dimension + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addIntDimension(String path, int value) { + addDimension(path, new MurmurHash3.Hash128(1, value)); + return this; + } + + /** + * Adds a long dimension to the TSID. + * + * @param path the path of the dimension + * @param value the long value of the dimension + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addLongDimension(String path, long value) { + addDimension(path, new MurmurHash3.Hash128(1, value)); + return this; + } + + /** + * Adds a double dimension to the TSID. + * + * @param path the path of the dimension + * @param value the double value of the dimension + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addDoubleDimension(String path, double value) { + addDimension(path, new MurmurHash3.Hash128(2, Double.doubleToLongBits(value))); + return this; + } + + /** + * Adds a boolean dimension to the TSID. + * + * @param path the path of the dimension + * @param value the boolean value of the dimension + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addBooleanDimension(String path, boolean value) { + addDimension(path, new MurmurHash3.Hash128(3, value ? 1 : 0)); + return this; + } + + /** + * Adds a string dimension to the TSID. + * + * @param path the path of the dimension + * @param value the string value of the dimension + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addStringDimension(String path, String value) { + addStringDimension(path, new BytesRef(value)); + return this; + } + + private void addStringDimension(String path, BytesRef value) { + addStringDimension(path, value.bytes, value.offset, value.length); + } + + /** + * Adds a string dimension to the TSID. + * + * @param path the path of the dimension + * @param value the UTF8Bytes value of the dimension + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addStringDimension(String path, XContentString.UTF8Bytes value) { + addStringDimension(path, value.bytes(), value.offset(), value.length()); + return this; + } + + /** + * Adds a string dimension to the TSID using a byte array. + * The value is provided as UTF-8 encoded bytes[]. + * + * @param path the path of the dimension + * @param utf8Bytes the UTF-8 encoded bytes of the string value + * @param offset the offset in the byte array where the string starts + * @param length the length of the string in bytes + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder addStringDimension(String path, byte[] utf8Bytes, int offset, int length) { + murmur3Hasher.reset(); + murmur3Hasher.update(utf8Bytes, offset, length); + MurmurHash3.Hash128 hash128 = murmur3Hasher.digestHash(); + addDimension(path, hash128); + return this; + } + + /** + * Adds a value to the TSID using a funnel. + * This allows for complex types to be added to the TSID. + * + * @param value the value to add + * @param funnel the funnel that describes how to add the value + * @param the type of the value + * @return the TsidBuilder instance for method chaining + */ + public TsidBuilder add(T value, TsidFunnel funnel) { + funnel.add(value, this); + return this; + } + + /** + * Adds a value to the TSID using a funnel that can throw exceptions. + * This allows for complex types to be added to the TSID. + * + * @param value the value to add + * @param funnel the funnel that describes how to add the value + * @param the type of the value + * @param the type of exception that can be thrown + * @return the TsidBuilder instance for method chaining + * @throws E if an exception occurs while adding the value + */ + public TsidBuilder add(T value, ThrowingTsidFunnel funnel) throws E { + funnel.add(value, this); + return this; + } + + private void addDimension(String path, MurmurHash3.Hash128 valueHash) { + murmur3Hasher.reset(); + addString(murmur3Hasher, path); + MurmurHash3.Hash128 pathHash = murmur3Hasher.digestHash(); + dimensions.add(new Dimension(path, pathHash, valueHash, dimensions.size())); + } + + /** + * Adds all dimensions from another TsidBuilder to this one. + * If the other builder is null or has no dimensions, this method does nothing. + * + * @param other the other TsidBuilder to add dimensions from + * @return this TsidBuilder instance for method chaining + */ + public TsidBuilder addAll(TsidBuilder other) { + if (other == null || other.dimensions.isEmpty()) { + return this; + } + dimensions.addAll(other.dimensions); + return this; + } + + /** + * Computes the hash of the dimensions added to this builder. + * The hash is a 128-bit value that is computed based on the dimension names and values. + * + * @return a HashValue128 representing the hash of the dimensions + * @throws IllegalArgumentException if no dimensions have been added + */ + public MurmurHash3.Hash128 hash() { + throwIfEmpty(); + Collections.sort(dimensions); + murmur3Hasher.reset(); + for (Dimension dim : dimensions) { + addLongs(murmur3Hasher, dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2); + } + return murmur3Hasher.digestHash(); + } + + /** + * Builds a time series identifier (TSID) based on the dimensions added to this builder. + * This is a slight adaptation of {@link RoutingPathFields#buildHash()} but creates shorter tsids. + * The TSID is a hash that includes: + *
    + *
  • + * A hash of the dimension field names (4 bytes). + * This is to cluster time series that are using the same dimensions together, which makes the encodings more effective. + *
  • + *
  • + * A hash of the dimension field values (1 byte each, up to a maximum of 16 fields). + * This is to cluster time series with similar values together, also helping with making encodings more effective. + *
  • + *
  • + * A hash of all names and values combined (16 bytes). + * This is to avoid hash collisions. + *
  • + *
+ * + * @return a BytesRef containing the TSID + * @throws IllegalArgumentException if no dimensions have been added + */ + public BytesRef buildTsid() { + throwIfEmpty(); + int numberOfValues = Math.min(MAX_TSID_VALUE_FIELDS, dimensions.size()); + byte[] hash = new byte[4 + numberOfValues + 16]; + int index = 0; + + Collections.sort(dimensions); + + MurmurHash3.Hash128 hashBuffer = new MurmurHash3.Hash128(); + murmur3Hasher.reset(); + for (int i = 0; i < dimensions.size(); i++) { + Dimension dim = dimensions.get(i); + addLong(murmur3Hasher, dim.pathHash.h1 ^ dim.pathHash.h2); + } + ByteUtils.writeIntLE((int) murmur3Hasher.digestHash(hashBuffer).h1, hash, index); + index += 4; + + // similarity hash for values + String previousPath = null; + for (int i = 0; i < numberOfValues; i++) { + Dimension dim = dimensions.get(i); + String path = dim.path(); + if (path.equals(previousPath)) { + // only add the first value for array fields + continue; + } + MurmurHash3.Hash128 valueHash = dim.valueHash(); + murmur3Hasher.reset(); + addLong(murmur3Hasher, valueHash.h1 ^ valueHash.h2); + hash[index++] = (byte) murmur3Hasher.digestHash(hashBuffer).h1; + previousPath = path; + } + + murmur3Hasher.reset(); + for (int i = 0; i < dimensions.size(); i++) { + Dimension dim = dimensions.get(i); + addLongs(murmur3Hasher, dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2); + } + index = writeHash128(murmur3Hasher.digestHash(hashBuffer), hash, index); + return new BytesRef(hash, 0, index); + } + + private void throwIfEmpty() { + if (dimensions.isEmpty()) { + throw new IllegalArgumentException("Dimensions are empty"); + } + } + + private static int writeHash128(MurmurHash3.Hash128 hash128, byte[] buffer, int index) { + ByteUtils.writeLongLE(hash128.h2, buffer, index); + index += 8; + ByteUtils.writeLongLE(hash128.h1, buffer, index); + index += 8; + return index; + } + + /** + * A functional interface that describes how objects of a complex type are added to a TSID. + * + * @param the type of the value + */ + @FunctionalInterface + public interface TsidFunnel { + void add(T value, TsidBuilder tsidBuilder); + } + + /** + * A functional interface that describes how objects of a complex type are added to a TSID, + * allowing for exceptions to be thrown during the process. + * + * @param the type of the value + * @param the type of exception that can be thrown + */ + @FunctionalInterface + public interface ThrowingTsidFunnel { + void add(T value, TsidBuilder tsidBuilder) throws E; + } + + private record Dimension(String path, MurmurHash3.Hash128 pathHash, MurmurHash3.Hash128 valueHash, int insertionOrder) + implements + Comparable { + @Override + public int compareTo(Dimension o) { + int i = path.compareTo(o.path); + if (i != 0) return i; + // ensures array values are in the order as they appear in the source + return Integer.compare(insertionOrder, o.insertionOrder); + } + } + + // these methods will be replaced with a more optimized version when https://github.com/elastic/elasticsearch/pull/133226 is merged + + private static void addString(Murmur3Hasher murmur3Hasher, String path) { + BytesRef bytesRef = new BytesRef(path); + murmur3Hasher.update(bytesRef.bytes, bytesRef.offset, bytesRef.length); + } + + private static void addLong(Murmur3Hasher murmur3Hasher, long value) { + byte[] bytes = new byte[8]; + ByteUtils.writeLongLE(value, bytes, 0); + murmur3Hasher.update(bytes); + } + + private static void addLongs(Murmur3Hasher murmur3Hasher, long v1, long v2) { + byte[] bytes = new byte[16]; + ByteUtils.writeLongLE(v1, bytes, 0); + ByteUtils.writeLongLE(v2, bytes, 8); + murmur3Hasher.update(bytes); + } + + private static void addLongs(Murmur3Hasher murmur3Hasher, long v1, long v2, long v3, long v4) { + byte[] bytes = new byte[32]; + ByteUtils.writeLongLE(v1, bytes, 0); + ByteUtils.writeLongLE(v2, bytes, 8); + ByteUtils.writeLongLE(v3, bytes, 16); + ByteUtils.writeLongLE(v4, bytes, 24); + murmur3Hasher.update(bytes); + } +} diff --git a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java index 0abbb5d935156..8096a937e1a43 100644 --- a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java +++ b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java @@ -29,6 +29,13 @@ public static class Hash128 { /** higher 64 bits part **/ public long h2; + public Hash128() {} + + public Hash128(long h1, long h2) { + this.h1 = h1; + this.h2 = h2; + } + public byte[] getBytes() { byte[] hash = new byte[16]; getBytes(hash, 0); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/TsidBuilderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/TsidBuilderTests.java new file mode 100644 index 0000000000000..6ed1eca4fa75d --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/TsidBuilderTests.java @@ -0,0 +1,132 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.cluster.routing; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xcontent.Text; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HexFormat; +import java.util.Set; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.notNullValue; + +public class TsidBuilderTests extends ESTestCase { + + public void testAddDimensions() { + TsidBuilder builder = TsidBuilder.newBuilder() + .addStringDimension("test_string", "hello") + .addBooleanDimension("test_bool", true) + .addIntDimension("test_int", 42) + .addLongDimension("test_long", 123456789L) + .addDoubleDimension("test_double", 3.14159) + .addStringDimension("test_array", "value1") + .addStringDimension("test_array", "value2"); + + // if these change, we'll need a new index version + // because it means existing time series will get a new _tsid and will be routed to a different shard + assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256")); // used to make shard routing decisions + BytesRef bytesRef = builder.buildTsid(); + assertThat(bytesRef, notNullValue()); + // 4 bytes for path hash + 1 byte per value (up to 16, only first value for arrays) + 16 bytes for hash + assertThat(bytesRef.length, equalTo(26)); + assertThat( + HexFormat.of().formatHex(bytesRef.bytes, bytesRef.offset, bytesRef.length), + equalTo("bf438ddaa0a8d663fdbb56d2151e7889e42b7a295d065613ded4") // _tsid in hex format + ); + } + + public void testOrderingOfDifferentFieldsDoesNotMatter() { + assertEqualBuilders( + TsidBuilder.newBuilder().addStringDimension("foo", "bar").addStringDimension("baz", "qux"), + TsidBuilder.newBuilder().addStringDimension("baz", "qux").addStringDimension("foo", "bar") + ); + } + + public void testOrderingOfMultiFieldsMatters() { + assertThat( + Set.of( + TsidBuilder.newBuilder().addStringDimension("foo", "bar").addStringDimension("foo", "baz").buildTsid(), + TsidBuilder.newBuilder().addStringDimension("foo", "baz").addStringDimension("foo", "bar").buildTsid() + ), + hasSize(2) + ); + } + + public void testAddStringDimension() { + String stringValue = randomUnicodeOfLengthBetween(0, 1024); + BytesRef bytesRef = new BytesRef(stringValue); + byte[] utf8Bytes = stringValue.getBytes(StandardCharsets.UTF_8); + assertEqualBuilders( + TsidBuilder.newBuilder().addStringDimension("test_string", stringValue), + TsidBuilder.newBuilder().addStringDimension("test_string", new Text(stringValue).bytes()), + TsidBuilder.newBuilder().addStringDimension("test_string", bytesRef.bytes, bytesRef.offset, bytesRef.length), + TsidBuilder.newBuilder().addStringDimension("test_string", utf8Bytes, 0, utf8Bytes.length) + ); + } + + private static void assertEqualBuilders(TsidBuilder... tsidBuilders) { + assertThat(Arrays.stream(tsidBuilders).map(TsidBuilder::buildTsid).distinct().toList(), hasSize(1)); + assertThat(Arrays.stream(tsidBuilders).map(TsidBuilder::hash).distinct().toList(), hasSize(1)); + assertThat(tsidBuilders[0].buildTsid(), notNullValue()); + assertThat(tsidBuilders[0].buildTsid().length, greaterThan(0)); + } + + public void testAddAll() { + TsidBuilder builder1 = TsidBuilder.newBuilder().addStringDimension("foo", "bar"); + TsidBuilder builder2 = TsidBuilder.newBuilder().addStringDimension("baz", "qux"); + assertEqualBuilders( + TsidBuilder.newBuilder().addAll(builder1).addAll(builder2), + TsidBuilder.newBuilder().addStringDimension("foo", "bar").addStringDimension("baz", "qux") + ); + } + + public void testAddAllWithNullOrEmpty() { + assertEqualBuilders( + TsidBuilder.newBuilder().addIntDimension("test", 42), + TsidBuilder.newBuilder().addIntDimension("test", 42).addAll(null).addAll(TsidBuilder.newBuilder()) + ); + } + + public void testExceptionWhenNoDimensions() { + // Test that exception is thrown when no dimensions are added + TsidBuilder builder = TsidBuilder.newBuilder(); + + IllegalArgumentException hashException = expectThrows(IllegalArgumentException.class, builder::hash); + assertTrue(hashException.getMessage().contains("Dimensions are empty")); + + IllegalArgumentException tsidException = expectThrows(IllegalArgumentException.class, builder::buildTsid); + assertTrue(tsidException.getMessage().contains("Dimensions are empty")); + } + + public void testTsidMinSize() { + BytesRef tsid = TsidBuilder.newBuilder().addIntDimension("test_int", 42).buildTsid(); + + // The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash + // Since we only added one dimension, we expect: 4 + 1 + 16 = 21 bytes + assertEquals(21, tsid.length); + } + + public void testTsidMaxSize() { + TsidBuilder tsidBuilder = TsidBuilder.newBuilder(); + for (int i = 0; i < 32; i++) { + tsidBuilder.addStringDimension("dimension_" + i, "value_" + i); + } + + // The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash + // Since we added 32 dimensions, we expect: 4 + 16 + 16 = 36 bytes + assertEquals(36, tsidBuilder.buildTsid().length); + } +}