Skip to content

Commit 7862d7f

Browse files
authored
Implement conversion from t-digest to exponential histograms (#137575)
1 parent eee9aa3 commit 7862d7f

File tree

2 files changed

+146
-2
lines changed

2 files changed

+146
-2
lines changed

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/mapper/ParsedHistogramConverter.java

Lines changed: 77 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import java.util.ArrayList;
1313
import java.util.List;
1414

15+
import static org.elasticsearch.exponentialhistogram.ExponentialHistogram.MAX_SCALE;
16+
1517
public class ParsedHistogramConverter {
1618

1719
/**
@@ -42,6 +44,73 @@ public static HistogramParser.ParsedHistogram exponentialToTDigest(ExponentialHi
4244
return new HistogramParser.ParsedHistogram(centroids, counts);
4345
}
4446

47+
/**
48+
* Converts t-digest histograms to exponential histograms, trying to do the inverse
49+
* of {@link #exponentialToTDigest(ExponentialHistogramParser.ParsedExponentialHistogram)}
50+
* as accurately as possible.
51+
* <br>
52+
* On a round-trip conversion from exponential histogram to T-Digest and back,
53+
* the bucket centers will be preserved, however the bucket widths are lost.
54+
* The conversion algorithm works by generating tiny buckets (scale set to MAX_SCALE)
55+
* containing the T-Digest centroids.
56+
*
57+
* @param tDigest the t-digest histogram to convert
58+
* @return the resulting exponential histogram
59+
*/
60+
public static ExponentialHistogramParser.ParsedExponentialHistogram tDigestToExponential(HistogramParser.ParsedHistogram tDigest) {
61+
List<Double> centroids = tDigest.values();
62+
List<Long> counts = tDigest.counts();
63+
64+
int numNegativeCentroids = 0;
65+
while (numNegativeCentroids < centroids.size() && centroids.get(numNegativeCentroids) < 0) {
66+
numNegativeCentroids++;
67+
}
68+
69+
// iterate negative centroids from closest to zero to furthest away,
70+
// which corresponds to ascending exponential histogram bucket indices
71+
int scale = MAX_SCALE;
72+
List<IndexWithCount> negativeBuckets = new ArrayList<>();
73+
for (int i = numNegativeCentroids - 1; i >= 0; i--) {
74+
double centroid = centroids.get(i);
75+
long count = counts.get(i);
76+
assert centroid < 0;
77+
appendCentroidWithCountAsBucket(centroid, count, scale, negativeBuckets);
78+
}
79+
80+
long zeroCount = 0;
81+
int firstPositiveIndex = numNegativeCentroids;
82+
if (firstPositiveIndex < centroids.size() && centroids.get(firstPositiveIndex) == 0) {
83+
// we have a zero-centroid, which we'll map to the zero bucket
84+
zeroCount = counts.get(firstPositiveIndex);
85+
firstPositiveIndex++;
86+
}
87+
88+
List<IndexWithCount> positiveBuckets = new ArrayList<>();
89+
for (int i = firstPositiveIndex; i < centroids.size(); i++) {
90+
double centroid = centroids.get(i);
91+
long count = counts.get(i);
92+
assert centroid > 0;
93+
appendCentroidWithCountAsBucket(centroid, count, scale, positiveBuckets);
94+
}
95+
96+
return new ExponentialHistogramParser.ParsedExponentialHistogram(
97+
scale,
98+
0.0,
99+
zeroCount,
100+
negativeBuckets,
101+
positiveBuckets,
102+
null, // sum, min, max will be estimated
103+
null,
104+
null
105+
);
106+
}
107+
108+
private static void appendCentroidWithCountAsBucket(double centroid, long count, int scale, List<IndexWithCount> outputBuckets) {
109+
long index = ExponentialScaleUtils.computeIndex(centroid, scale);
110+
assert outputBuckets.isEmpty() || outputBuckets.getLast().index() < index;
111+
outputBuckets.add(new IndexWithCount(index, count));
112+
}
113+
45114
private static void appendBucketCentroid(
46115
List<Double> centroids,
47116
List<Long> counts,
@@ -52,7 +121,13 @@ private static void appendBucketCentroid(
52121
double lowerBound = ExponentialScaleUtils.getLowerBucketBoundary(expHistoBucket.index(), scale);
53122
double upperBound = ExponentialScaleUtils.getUpperBucketBoundary(expHistoBucket.index(), scale);
54123
double center = sign * (lowerBound + upperBound) / 2.0;
55-
centroids.add(center);
56-
counts.add(expHistoBucket.count());
124+
// the index + scale representation is higher precision than the centroid representation,
125+
// so we can have multiple exp histogram buckets map to the same centroid.
126+
if (centroids.isEmpty() == false && centroids.getLast() == center) {
127+
counts.add(counts.removeLast() + expHistoBucket.count());
128+
} else {
129+
centroids.add(center);
130+
counts.add(expHistoBucket.count());
131+
}
57132
}
58133
}

x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/mapper/ParsedHistogramConverterTests.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.elasticsearch.exponentialhistogram.ExponentialHistogramMerger;
1717
import org.elasticsearch.exponentialhistogram.ExponentialHistogramTestUtils;
1818
import org.elasticsearch.exponentialhistogram.ExponentialHistogramXContent;
19+
import org.elasticsearch.exponentialhistogram.ExponentialScaleUtils;
1920
import org.elasticsearch.test.ESTestCase;
2021
import org.elasticsearch.xcontent.XContentBuilder;
2122
import org.elasticsearch.xcontent.XContentParser;
@@ -26,13 +27,81 @@
2627
import org.elasticsearch.xpack.oteldata.otlp.docbuilder.MappingHints;
2728

2829
import java.io.IOException;
30+
import java.util.List;
2931
import java.util.stream.LongStream;
3032

3133
import static org.hamcrest.Matchers.closeTo;
3234
import static org.hamcrest.Matchers.equalTo;
35+
import static org.hamcrest.Matchers.lessThan;
3336

3437
public class ParsedHistogramConverterTests extends ESTestCase {
3538

39+
public void testExponentialHistogramRoundTrip() {
40+
ExponentialHistogram input = ExponentialHistogramTestUtils.randomHistogram();
41+
HistogramParser.ParsedHistogram tdigest = ParsedHistogramConverter.exponentialToTDigest(toParsed(input));
42+
ExponentialHistogramParser.ParsedExponentialHistogram output = ParsedHistogramConverter.tDigestToExponential(tdigest);
43+
44+
// the conversion looses the width of the original buckets, but the bucket centers (arithmetic mean of boundaries)
45+
// should be very close
46+
47+
assertThat(output.zeroCount(), equalTo(input.zeroBucket().count()));
48+
assertArithmeticBucketCentersClose(input.negativeBuckets().iterator(), output.negativeBuckets(), output.scale());
49+
assertArithmeticBucketCentersClose(input.positiveBuckets().iterator(), output.positiveBuckets(), output.scale());
50+
}
51+
52+
private static void assertArithmeticBucketCentersClose(
53+
BucketIterator originalBuckets,
54+
List<IndexWithCount> convertedBuckets,
55+
int convertedScale
56+
) {
57+
for (IndexWithCount convertedBucket : convertedBuckets) {
58+
assertThat(originalBuckets.hasNext(), equalTo(true));
59+
60+
double originalCenter = (ExponentialScaleUtils.getLowerBucketBoundary(originalBuckets.peekIndex(), originalBuckets.scale())
61+
+ ExponentialScaleUtils.getUpperBucketBoundary(originalBuckets.peekIndex(), originalBuckets.scale())) / 2.0;
62+
double convertedCenter = (ExponentialScaleUtils.getLowerBucketBoundary(convertedBucket.index(), convertedScale)
63+
+ ExponentialScaleUtils.getUpperBucketBoundary(convertedBucket.index(), convertedScale)) / 2.0;
64+
65+
double relativeError = Math.abs(convertedCenter - originalCenter) / Math.abs(originalCenter);
66+
assertThat(
67+
"original center=" + originalCenter + ", converted center=" + convertedCenter + ", relative error=" + relativeError,
68+
relativeError,
69+
closeTo(0, 0.0000001)
70+
);
71+
72+
originalBuckets.advance();
73+
}
74+
assertThat(originalBuckets.hasNext(), equalTo(false));
75+
}
76+
77+
public void testToExponentialHistogramConversionWithCloseCentroids() {
78+
// build a t-digest with two centroids very close to each other
79+
List<Double> centroids = List.of(1.0, Math.nextAfter(1.0, 2));
80+
List<Long> counts = List.of(1L, 2L);
81+
82+
HistogramParser.ParsedHistogram input = new HistogramParser.ParsedHistogram(centroids, counts);
83+
ExponentialHistogramParser.ParsedExponentialHistogram converted = ParsedHistogramConverter.tDigestToExponential(input);
84+
85+
assertThat(converted.zeroCount(), equalTo(0L));
86+
List<IndexWithCount> posBuckets = converted.positiveBuckets();
87+
assertThat(posBuckets.size(), equalTo(2));
88+
assertThat(posBuckets.get(0).index(), lessThan(posBuckets.get(1).index()));
89+
assertThat(posBuckets.get(0).count(), equalTo(1L));
90+
assertThat(posBuckets.get(1).count(), equalTo(2L));
91+
}
92+
93+
public void testToTDigestConversionMergesCentroids() {
94+
// build a histogram with two buckets very close to zero
95+
ExponentialHistogram input = ExponentialHistogram.builder(ExponentialHistogram.MAX_SCALE, ExponentialHistogramCircuitBreaker.noop())
96+
.setPositiveBucket(ExponentialHistogram.MIN_INDEX, 1)
97+
.setPositiveBucket(ExponentialHistogram.MIN_INDEX + 1, 2)
98+
.build();
99+
// due to rounding errors they end up as the same centroid, but should have the count merged
100+
HistogramParser.ParsedHistogram converted = ParsedHistogramConverter.exponentialToTDigest(toParsed(input));
101+
assertThat(converted.values(), equalTo(List.of(0.0)));
102+
assertThat(converted.counts(), equalTo(List.of(3L)));
103+
}
104+
36105
public void testSameConversionBehaviourAsOtlpMetricsEndpoint() {
37106
// our histograms are sparse, opentelemetry ones are dense.
38107
// to test against the OTLP conversion algorithm, we need to make our random histogram dense enough first

0 commit comments

Comments
 (0)