elastic
diff --git a/docs/changelog/96794.yaml b/docs/changelog/96794.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc
Lines changed: 29 additions & 0 deletions
diff --git a/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc b/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc
Lines changed: 32 additions & 0 deletions
diff --git a/docs/reference/aggregations/metrics/percentile-rank-aggregation.asciidoc b/docs/reference/aggregations/metrics/percentile-rank-aggregation.asciidoc
Lines changed: 4 additions & 3 deletions
diff --git a/libs/tdigest/src/main/java/org/elasticsearch/tdigest/Dist.java b/libs/tdigest/src/main/java/org/elasticsearch/tdigest/Dist.java
Lines changed: 55 additions & 17 deletions
diff --git a/libs/tdigest/src/main/java/org/elasticsearch/tdigest/HybridDigest.java b/libs/tdigest/src/main/java/org/elasticsearch/tdigest/HybridDigest.java
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,5 @@
+pr: 96794
+summary: Make TDigestState configurable
+area: Aggregations
+type: enhancement
+issues: []
@@ -166,6 +166,35 @@ GET latency/_search
 
 include::percentile-aggregation.asciidoc[tags=t-digest]
 
+==== Execution hint
+
+The default implementation of TDigest is optimized for performance, scaling to millions or even
+billions of sample values while maintaining acceptable accuracy levels (close to 1% relative error
+for millions of samples in some cases). To use an implementation optimized for accuracy instead,
+set the `execution_hint` parameter to `high_accuracy`:
+
+[source,console]
+--------------------------------------------------
+GET latency/_search
+{
+  "size": 0,
+  "aggs": {
+    "load_time_boxplot": {
+      "boxplot": {
+        "field": "load_time",
+        "execution_hint": "high_accuracy"    <1>
+      }
+    }
+  }
+}
+--------------------------------------------------
+// TEST[setup:latency]
+
+<1> Optimize TDigest for accuracy, at the expense of performance
+
+This option can lead to improved accuracy (relative error close to 0.01% for millions of samples in some
+cases), but percentile queries then take 2x-10x longer to complete.
+
 ==== Missing value
 
 The `missing` parameter defines how documents that are missing a value should be treated.
 
@@ -306,6 +306,38 @@ TDigest roughly 64KB in size. In practice data tends to be more random and
 the TDigest will use less memory.
 // end::t-digest[]
 
+[[search-aggregations-metrics-percentile-aggregation-execution-hint]]
+==== Execution hint
+
+The default implementation of TDigest is optimized for performance, scaling to millions or even
+billions of sample values while maintaining acceptable accuracy levels (close to 1% relative error
+for millions of samples in some cases). To use an implementation optimized for accuracy instead,
+set the `execution_hint` parameter to `high_accuracy`:
+
+[source,console]
+--------------------------------------------------
+GET latency/_search
+{
+  "size": 0,
+  "aggs": {
+    "load_time_outlier": {
+      "percentiles": {
+        "field": "load_time",
+        "tdigest": {
+          "execution_hint": "high_accuracy"    <1>
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// TEST[setup:latency]
+
+<1> Optimize TDigest for accuracy, at the expense of performance
+
+This option can lead to improved accuracy (relative error close to 0.01% for millions of samples in some
+cases), but percentile queries then take 2x-10x longer to complete.
+
 ==== HDR Histogram
 
 https://github.com/HdrHistogram/HdrHistogram[HDR Histogram] (High Dynamic Range Histogram) is an alternative implementation
 
@@ -10,9 +10,10 @@ extracted from specific numeric or <<histogram,histogram fields>> in the documen
 
 [NOTE]
 ==================================================
-Please see <<search-aggregations-metrics-percentile-aggregation-approximation>>
-and <<search-aggregations-metrics-percentile-aggregation-compression>> for advice
-regarding approximation and memory use of the percentile ranks aggregation
+Please see <<search-aggregations-metrics-percentile-aggregation-approximation>>,
+<<search-aggregations-metrics-percentile-aggregation-compression>> and
+<<search-aggregations-metrics-percentile-aggregation-execution-hint>> for advice
+regarding approximation, performance and memory use of the percentile ranks aggregation.
 ==================================================
 
 Percentile rank show the percentage of observed values which are below certain
 
@@ -30,30 +30,68 @@
 public class Dist {
 
     private static double cdf(final double x, final int length, Function<Integer, Double> elementGetter) {
+        if (length == 0) {
+            // no data to examine
+            return Double.NaN;
+        }
+        if (length == 1) {
+            double value = elementGetter.apply(0);
+            if (x < value) return 0;
+            if (x > value) return 1;
+            return 0.5;
+        }
+
         if (Double.compare(x, elementGetter.apply(0)) < 0) {
             return 0;
         }
 
-        double n1 = 0.5;
-        int n2 = 0;
-        for (int i = 1; i < length; i++) {
-            double value = elementGetter.apply(i);
-            int compareResult = Double.compare(value, x);
-            if (compareResult > 0) {
-                if (Double.compare(n2, 0) > 0) {
-                    return (n1 + 0.5 * n2) / length;
-                }
-                double previousValue = elementGetter.apply(i - 1);
-                double factor = (x - previousValue) / (value - previousValue);
-                return (n1 + factor) / length;
+        if (Double.compare(x, elementGetter.apply(0)) == 0) {
+            // we have one or more centroids == x, treat them as one
+            // dw will accumulate the weight of all of the centroids at x
+            double dw = 0;
+            for (int i = 0; i < length && Double.compare(elementGetter.apply(i), x) == 0; i++) {
+                dw += 1;
+            }
+            return dw / 2.0 / length;
+        }
+
+        if (x > elementGetter.apply(length - 1)) {
+            return 1;
+        }
+        if (x == elementGetter.apply(length - 1)) {
+            double dw = 0;
+            for (int i = length - 1; i >= 0 && Double.compare(elementGetter.apply(i), x) == 0; i--) {
+                dw += 1;
             }
-            if (compareResult < 0) {
-                n1++;
-            } else {
-                n2++;
+            return (length - dw / 2.0) / length;
+        }
+
+        // initially, we set left width equal to right width
+        double left = (elementGetter.apply(1) - elementGetter.apply(0)) / 2;
+        double weightSoFar = 0;
+
+        for (int i = 0; i < length - 1; i++) {
+            double right = (elementGetter.apply(i + 1) - elementGetter.apply(i)) / 2;
+            if (x < elementGetter.apply(i) + right) {
+                double value = (weightSoFar + AbstractTDigest.interpolate(x, elementGetter.apply(i) - left, elementGetter.apply(i) + right))
+                    / length;
+                return Math.max(value, 0.0);
             }
+            weightSoFar += 1;
+            left = right;
+        }
+
+        // for the last element, assume right width is same as left
+        int lastOffset = length - 1;
+        double right = (elementGetter.apply(lastOffset) - elementGetter.apply(lastOffset - 1)) / 2;
+        if (x < elementGetter.apply(lastOffset) + right) {
+            return (weightSoFar + AbstractTDigest.interpolate(
+                x,
+                elementGetter.apply(lastOffset) - right,
+                elementGetter.apply(lastOffset) + right
+            )) / length;
         }
-        return (length - 0.5 * n2) / length;
+        return 1;
     }
 
     public static double cdf(final double x, double[] data) {
 
@@ -0,0 +1,193 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.tdigest;
+
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Uses a {@link SortingDigest} implementation under the covers for small sample populations, then switches to {@link MergingDigest}.
+ * The {@link SortingDigest} is perfectly accurate and the fastest implementation for up to millions of samples, at the cost of increased
+ * memory footprint as it tracks all samples. Conversely, the {@link MergingDigest} pre-allocates its memory (tens of KBs) and provides
+ * better performance for hundreds of millions of samples and more, while accuracy stays bounded to 0.1-1% for most cases.
+ *
+ * This hybrid  approach provides the best of both worlds, i.e. speedy and accurate percentile calculations for small populations with
+ * bounded memory allocation and acceptable speed and accuracy for larger ones.
+ */
+public class HybridDigest extends AbstractTDigest {
+
+    // See MergingDigest's compression param.
+    private final double compression;
+
+    // Indicates the sample size over which it switches from SortingDigest to MergingDigest.
+    private final long maxSortingSize;
+
+    // This is set to null when the implementation switches to MergingDigest.
+    private SortingDigest sortingDigest = new SortingDigest();
+
+    // This gets initialized when the implementation switches to MergingDigest.
+    private MergingDigest mergingDigest;
+
+    /**
+     * Creates a hybrid digest that uses a {@link SortingDigest} for up to {@param maxSortingSize} samples,
+     * then switches to a {@link MergingDigest}.
+     *
+     * @param compression The compression factor for the MergingDigest
+     * @param maxSortingSize The sample size limit for switching from a {@link SortingDigest} to a {@link MergingDigest} implementation
+     */
+    HybridDigest(double compression, long maxSortingSize) {
+        this.compression = compression;
+        this.maxSortingSize = maxSortingSize;
+    }
+
+    /**
+     * Similar to the constructor above. The limit for switching from a {@link SortingDigest} to a {@link MergingDigest} implementation
+     * is calculated based on the passed compression factor.
+     *
+     * @param compression The compression factor for the MergingDigest
+     */
+    HybridDigest(double compression) {
+        // The default maxSortingSize is calculated so that the SortingDigest will have comparable size with the MergingDigest
+        // at the point where implementations switch, e.g. for default compression 100 SortingDigest allocates ~16kB and MergingDigest
+        // allocates ~15kB.
+        this(compression, Math.round(compression) * 20);
+    }
+
+    @Override
+    public void add(double x, int w) {
+        reserve(w);
+        if (mergingDigest != null) {
+            mergingDigest.add(x, w);
+        } else {
+            sortingDigest.add(x, w);
+        }
+    }
+
+    @Override
+    public void reserve(long size) {
+        if (mergingDigest != null) {
+            mergingDigest.reserve(size);
+            return;
+        }
+        // Check if we need to switch implementations.
+        assert sortingDigest != null;
+        if (sortingDigest.size() + size >= maxSortingSize) {
+            mergingDigest = new MergingDigest(compression);
+            for (double value : sortingDigest.values) {
+                mergingDigest.add(value);
+            }
+            mergingDigest.reserve(size);
+            // Release the allocated SortingDigest.
+            sortingDigest = null;
+        } else {
+            sortingDigest.reserve(size);
+        }
+    }
+
+    @Override
+    public void add(List<? extends TDigest> others) {
+        if (mergingDigest != null) {
+            mergingDigest.add(others);
+        } else {
+            sortingDigest.add(others);
+        }
+    }
+
+    @Override
+    public void compress() {
+        if (mergingDigest != null) {
+            mergingDigest.compress();
+        } else {
+            sortingDigest.compress();
+        }
+    }
+
+    @Override
+    public long size() {
+        if (mergingDigest != null) {
+            return mergingDigest.size();
+        }
+        return sortingDigest.size();
+    }
+
+    @Override
+    public double cdf(double x) {
+        if (mergingDigest != null) {
+            return mergingDigest.cdf(x);
+        }
+        return sortingDigest.cdf(x);
+    }
+
+    @Override
+    public double quantile(double q) {
+        if (mergingDigest != null) {
+            return mergingDigest.quantile(q);
+        }
+        return sortingDigest.quantile(q);
+    }
+
+    @Override
+    public Collection<Centroid> centroids() {
+        if (mergingDigest != null) {
+            return mergingDigest.centroids();
+        }
+        return sortingDigest.centroids();
+    }
+
+    @Override
+    public double compression() {
+        if (mergingDigest != null) {
+            return mergingDigest.compression();
+        }
+        return sortingDigest.compression();
+    }
+
+    @Override
+    public int centroidCount() {
+        if (mergingDigest != null) {
+            return mergingDigest.centroidCount();
+        }
+        return sortingDigest.centroidCount();
+    }
+
+    @Override
+    public double getMin() {
+        if (mergingDigest != null) {
+            return mergingDigest.getMin();
+        }
+        return sortingDigest.getMin();
+    }
+
+    @Override
+    public double getMax() {
+        if (mergingDigest != null) {
+            return mergingDigest.getMax();
+        }
+        return sortingDigest.getMax();
+    }
+
+    @Override
+    public int byteSize() {
+        if (mergingDigest != null) {
+            return mergingDigest.byteSize();
+        }
+        return sortingDigest.byteSize();
+    }
+}