Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
*/
public class TsidBuilder {

private static final int MAX_TSID_VALUE_FIELDS = 16;
private final BufferedMurmur3Hasher murmur3Hasher = new BufferedMurmur3Hasher(0L);

private final List<Dimension> dimensions = new ArrayList<>();
Expand Down Expand Up @@ -205,19 +204,21 @@ public MurmurHash3.Hash128 hash() {

/**
* Builds a time series identifier (TSID) based on the dimensions added to this builder.
* This is a slight adaptation of {@link RoutingPathFields#buildHash()} but creates shorter tsids.
* This is an adaptation of {@link RoutingPathFields#buildHash()} but creates shorter TSIDs with a fixed size 16 bytes.
* The TSID is a hash that includes:
* <ul>
* <li>
* A hash of the dimension field names (4 bytes).
* A hash of the dimension field names (1 byte).
* This is to cluster time series that are using the same dimensions together, which makes the encodings more effective.
* </li>
* <li>
* A hash of the dimension field values (1 byte each, up to a maximum of 16 fields).
* A 4 bit hash of the first and second dimension field value (1 byte).
* This is to cluster time series with similar values together, also helping with making encodings more effective.
* For OpenTelemetry metrics, this usually uses the _metric_names_hash and the first attributes.* dimension,
* due to the alphabetical sorting of the dimension names.
* </li>
* <li>
* A hash of all names and values combined (16 bytes).
* A hash of all names and values combined (14 bytes).
* This is to avoid hash collisions.
* </li>
* </ul>
Expand All @@ -227,44 +228,40 @@ public MurmurHash3.Hash128 hash() {
*/
public BytesRef buildTsid() {
throwIfEmpty();
int numberOfValues = Math.min(MAX_TSID_VALUE_FIELDS, dimensions.size());
byte[] hash = new byte[4 + numberOfValues + 16];
int index = 0;
byte[] hash = new byte[16];

Collections.sort(dimensions);

MurmurHash3.Hash128 hashBuffer = new MurmurHash3.Hash128();
murmur3Hasher.reset();
// full hash for all dimension names and values for uniqueness
for (int i = 0; i < dimensions.size(); i++) {
Dimension dim = dimensions.get(i);
murmur3Hasher.addLong(dim.pathHash.h1 ^ dim.pathHash.h2);
murmur3Hasher.addLongs(dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
}
ByteUtils.writeIntLE((int) murmur3Hasher.digestHash(hashBuffer).h1, hash, index);
index += 4;
MurmurHash3.Hash128 hash128 = murmur3Hasher.digestHash(hashBuffer);
ByteUtils.writeLongLE(hash128.h1, hash, 0);
ByteUtils.writeLongLE(hash128.h2, hash, 8);

// similarity hash for values
String previousPath = null;
for (int i = 0; i < numberOfValues; i++) {
// similarity hash for dimension names
murmur3Hasher.reset();
for (int i = 0; i < dimensions.size(); i++) {
Dimension dim = dimensions.get(i);
String path = dim.path();
if (path.equals(previousPath)) {
// only add the first value for array fields
continue;
}
MurmurHash3.Hash128 valueHash = dim.valueHash();
murmur3Hasher.reset();
murmur3Hasher.addLong(valueHash.h1 ^ valueHash.h2);
hash[index++] = (byte) murmur3Hasher.digestHash(hashBuffer).h1;
previousPath = path;
murmur3Hasher.addLong(dim.pathHash.h1 ^ dim.pathHash.h2);
}
hash[0] = (byte) murmur3Hasher.digestHash(hashBuffer).h1;

// similarity hash for first two dimensions
murmur3Hasher.reset();
for (int i = 0; i < dimensions.size(); i++) {
Dimension dim = dimensions.get(i);
murmur3Hasher.addLongs(dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
murmur3Hasher.addLong(dimensions.get(0).valueHash().hashCode());
int valueSimilarityHash = (int) murmur3Hasher.digestHash(hashBuffer).h1 & 0x0F;
if (dimensions.size() > 1) {
murmur3Hasher.reset();
murmur3Hasher.addLong(dimensions.get(1).valueHash().hashCode());
valueSimilarityHash = (valueSimilarityHash << 4) | (byte) murmur3Hasher.digestHash(hashBuffer).h1 & 0x0F;
}
index = writeHash128(murmur3Hasher.digestHash(hashBuffer), hash, index);
return new BytesRef(hash, 0, index);
hash[1] = (byte) valueSimilarityHash;
return new BytesRef(hash);
}

private void throwIfEmpty() {
Expand All @@ -273,14 +270,6 @@ private void throwIfEmpty() {
}
}

private static int writeHash128(MurmurHash3.Hash128 hash128, byte[] buffer, int index) {
ByteUtils.writeLongLE(hash128.h2, buffer, index);
index += 8;
ByteUtils.writeLongLE(hash128.h1, buffer, index);
index += 8;
return index;
}

/**
* A functional interface that describes how objects of a complex type are added to a TSID.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,13 @@ public void testAddDimensions() {

// if these change, we'll need a new index version
// because it means existing time series will get a new _tsid and will be routed to a different shard
assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256")); // used to make shard routing decisions
assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256"));
BytesRef bytesRef = builder.buildTsid();
assertThat(bytesRef, notNullValue());
// 4 bytes for path hash + 1 byte per value (up to 16, only first value for arrays) + 16 bytes for hash
assertThat(bytesRef.length, equalTo(26));
assertThat(bytesRef.length, equalTo(16));
assertThat(
HexFormat.of().formatHex(bytesRef.bytes, bytesRef.offset, bytesRef.length),
equalTo("bf438ddaa0a8d663fdbb56d2151e7889e42b7a295d065613ded4") // _tsid in hex format
equalTo("bfa45d065613ded456d2151e7889e42b") // _tsid in hex format
);
}

Expand Down Expand Up @@ -111,22 +110,13 @@ public void testExceptionWhenNoDimensions() {
assertTrue(tsidException.getMessage().contains("Dimensions are empty"));
}

public void testTsidMinSize() {
BytesRef tsid = TsidBuilder.newBuilder().addIntDimension("test_int", 42).buildTsid();

// The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash
// Since we only added one dimension, we expect: 4 + 1 + 16 = 21 bytes
assertEquals(21, tsid.length);
}

public void testTsidMaxSize() {
public void testTsidSize() {
TsidBuilder tsidBuilder = TsidBuilder.newBuilder();
for (int i = 0; i < 32; i++) {
int dimensions = randomIntBetween(1, 64);
for (int i = 0; i < dimensions; i++) {
tsidBuilder.addStringDimension("dimension_" + i, "value_" + i);
}

// The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash
// Since we added 32 dimensions, we expect: 4 + 16 + 16 = 36 bytes
assertEquals(36, tsidBuilder.buildTsid().length);
assertEquals(16, tsidBuilder.buildTsid().length);
}
}