Implement conversion from t-digest to exponential histograms (#137575)

JonasKunz · web-flow · commit 7862d7f37cd5 · 2025-11-10T10:27:35.000+01:00
diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/mapper/ParsedHistogramConverter.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/mapper/ParsedHistogramConverter.java
@@ -12,6 +12,8 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import static org.elasticsearch.exponentialhistogram.ExponentialHistogram.MAX_SCALE;
+
 public class ParsedHistogramConverter {
 
     /**
@@ -42,6 +44,73 @@ public static HistogramParser.ParsedHistogram exponentialToTDigest(ExponentialHi
         return new HistogramParser.ParsedHistogram(centroids, counts);
     }
 
+    /**
+     * Converts t-digest histograms to exponential histograms, trying to do the inverse
+     * of {@link #exponentialToTDigest(ExponentialHistogramParser.ParsedExponentialHistogram)}
+     * as accurately as possible.
+     * <br>
+     * On a round-trip conversion from exponential histogram to t-digest and back,
+     * the bucket centers are preserved (up to rounding), but the bucket widths are lost.
+     * The conversion generates one tiny bucket (scale set to MAX_SCALE) per t-digest centroid,
+     * so centroids are expected in ascending order and far enough apart to get distinct bucket indices.
+     *
+     * @param tDigest the t-digest histogram to convert
+     * @return the resulting exponential histogram
+     */
+    public static ExponentialHistogramParser.ParsedExponentialHistogram tDigestToExponential(HistogramParser.ParsedHistogram tDigest) {
+        List<Double> centroids = tDigest.values();
+        List<Long> counts = tDigest.counts();
+
+        int numNegativeCentroids = 0;
+        while (numNegativeCentroids < centroids.size() && centroids.get(numNegativeCentroids) < 0) {
+            numNegativeCentroids++;
+        }
+
+        // iterate negative centroids from closest to zero to furthest away,
+        // which corresponds to ascending exponential histogram bucket indices
+        int scale = MAX_SCALE;
+        List<IndexWithCount> negativeBuckets = new ArrayList<>();
+        for (int i = numNegativeCentroids - 1; i >= 0; i--) {
+            double centroid = centroids.get(i);
+            long count = counts.get(i);
+            assert centroid < 0;
+            appendCentroidWithCountAsBucket(centroid, count, scale, negativeBuckets);
+        }
+
+        long zeroCount = 0;
+        int firstPositiveIndex = numNegativeCentroids;
+        if (firstPositiveIndex < centroids.size() && centroids.get(firstPositiveIndex) == 0) {
+            // we have a zero-centroid, which we'll map to the zero bucket
+            zeroCount = counts.get(firstPositiveIndex);
+            firstPositiveIndex++;
+        }
+
+        List<IndexWithCount> positiveBuckets = new ArrayList<>();
+        for (int i = firstPositiveIndex; i < centroids.size(); i++) {
+            double centroid = centroids.get(i);
+            long count = counts.get(i);
+            assert centroid > 0;
+            appendCentroidWithCountAsBucket(centroid, count, scale, positiveBuckets);
+        }
+
+        return new ExponentialHistogramParser.ParsedExponentialHistogram(
+            scale,
+            0.0,
+            zeroCount,
+            negativeBuckets,
+            positiveBuckets,
+            null, // sum, min, max will be estimated
+            null,
+            null
+        );
+    }
+
+    private static void appendCentroidWithCountAsBucket(double centroid, long count, int scale, List<IndexWithCount> outputBuckets) {
+        long index = ExponentialScaleUtils.computeIndex(centroid, scale);
+        assert outputBuckets.isEmpty() || outputBuckets.getLast().index() < index;
+        outputBuckets.add(new IndexWithCount(index, count));
+    }
+
     private static void appendBucketCentroid(
         List<Double> centroids,
         List<Long> counts,
@@ -52,7 +121,13 @@ private static void appendBucketCentroid(
         double lowerBound = ExponentialScaleUtils.getLowerBucketBoundary(expHistoBucket.index(), scale);
         double upperBound = ExponentialScaleUtils.getUpperBucketBoundary(expHistoBucket.index(), scale);
         double center = sign * (lowerBound + upperBound) / 2.0;
-        centroids.add(center);
-        counts.add(expHistoBucket.count());
+        // the index + scale representation is higher precision than the centroid representation,
+        // so we can have multiple exp histogram buckets map to the same centroid.
+        if (centroids.isEmpty() == false && centroids.getLast() == center) {
+            counts.add(counts.removeLast() + expHistoBucket.count());
+        } else {
+            centroids.add(center);
+            counts.add(expHistoBucket.count());
+        }
     }
 }
diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/mapper/ParsedHistogramConverterTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/mapper/ParsedHistogramConverterTests.java
@@ -16,6 +16,7 @@
 import org.elasticsearch.exponentialhistogram.ExponentialHistogramMerger;
 import org.elasticsearch.exponentialhistogram.ExponentialHistogramTestUtils;
 import org.elasticsearch.exponentialhistogram.ExponentialHistogramXContent;
+import org.elasticsearch.exponentialhistogram.ExponentialScaleUtils;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentParser;
@@ -26,13 +27,81 @@
 import org.elasticsearch.xpack.oteldata.otlp.docbuilder.MappingHints;
 
 import java.io.IOException;
+import java.util.List;
 import java.util.stream.LongStream;
 
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.lessThan;
 
 public class ParsedHistogramConverterTests extends ESTestCase {
 
+    public void testExponentialHistogramRoundTrip() {
+        ExponentialHistogram input = ExponentialHistogramTestUtils.randomHistogram();
+        HistogramParser.ParsedHistogram tdigest = ParsedHistogramConverter.exponentialToTDigest(toParsed(input));
+        ExponentialHistogramParser.ParsedExponentialHistogram output = ParsedHistogramConverter.tDigestToExponential(tdigest);
+
+        // the conversion loses the width of the original buckets, but the bucket centers (arithmetic mean of boundaries)
+        // should be very close
+
+        assertThat(output.zeroCount(), equalTo(input.zeroBucket().count()));
+        assertArithmeticBucketCentersClose(input.negativeBuckets().iterator(), output.negativeBuckets(), output.scale());
+        assertArithmeticBucketCentersClose(input.positiveBuckets().iterator(), output.positiveBuckets(), output.scale());
+    }
+
+    private static void assertArithmeticBucketCentersClose(
+        BucketIterator originalBuckets,
+        List<IndexWithCount> convertedBuckets,
+        int convertedScale
+    ) {
+        for (IndexWithCount convertedBucket : convertedBuckets) {
+            assertThat(originalBuckets.hasNext(), equalTo(true));
+
+            double originalCenter = (ExponentialScaleUtils.getLowerBucketBoundary(originalBuckets.peekIndex(), originalBuckets.scale())
+                + ExponentialScaleUtils.getUpperBucketBoundary(originalBuckets.peekIndex(), originalBuckets.scale())) / 2.0;
+            double convertedCenter = (ExponentialScaleUtils.getLowerBucketBoundary(convertedBucket.index(), convertedScale)
+                + ExponentialScaleUtils.getUpperBucketBoundary(convertedBucket.index(), convertedScale)) / 2.0;
+
+            double relativeError = Math.abs(convertedCenter - originalCenter) / Math.abs(originalCenter);
+            assertThat(
+                "original center=" + originalCenter + ", converted center=" + convertedCenter + ", relative error=" + relativeError,
+                relativeError,
+                closeTo(0, 0.0000001)
+            );
+
+            originalBuckets.advance();
+        }
+        assertThat(originalBuckets.hasNext(), equalTo(false));
+    }
+
+    public void testToExponentialHistogramConversionWithCloseCentroids() {
+        // build a t-digest with two centroids that are adjacent doubles (1.0 and the next representable value above it)
+        List<Double> centroids = List.of(1.0, Math.nextAfter(1.0, 2));
+        List<Long> counts = List.of(1L, 2L);
+
+        HistogramParser.ParsedHistogram input = new HistogramParser.ParsedHistogram(centroids, counts);
+        ExponentialHistogramParser.ParsedExponentialHistogram converted = ParsedHistogramConverter.tDigestToExponential(input);
+
+        assertThat(converted.zeroCount(), equalTo(0L));
+        List<IndexWithCount> posBuckets = converted.positiveBuckets();
+        assertThat(posBuckets.size(), equalTo(2));
+        assertThat(posBuckets.get(0).index(), lessThan(posBuckets.get(1).index()));
+        assertThat(posBuckets.get(0).count(), equalTo(1L));
+        assertThat(posBuckets.get(1).count(), equalTo(2L));
+    }
+
+    public void testToTDigestConversionMergesCentroids() {
+        // build a histogram with two buckets very close to zero
+        ExponentialHistogram input = ExponentialHistogram.builder(ExponentialHistogram.MAX_SCALE, ExponentialHistogramCircuitBreaker.noop())
+            .setPositiveBucket(ExponentialHistogram.MIN_INDEX, 1)
+            .setPositiveBucket(ExponentialHistogram.MIN_INDEX + 1, 2)
+            .build();
+        // due to rounding errors they end up as the same centroid, so their counts must be merged into a single entry
+        HistogramParser.ParsedHistogram converted = ParsedHistogramConverter.exponentialToTDigest(toParsed(input));
+        assertThat(converted.values(), equalTo(List.of(0.0)));
+        assertThat(converted.counts(), equalTo(List.of(3L)));
+    }
+
     public void testSameConversionBehaviourAsOtlpMetricsEndpoint() {
         // our histograms are sparse, opentelemetry ones are dense.
         // to test against the OTLP conversion algorithm, we need to make our random histogram dense enough first