
Commit 92efdcf

Fix and clean percentile computation
1 parent 91193bc commit 92efdcf

4 files changed: +129 -133 lines changed

libs/exponential-histogram/src/main/java/org/elasticsearch/exponentialhistogram/ExpHistoPercentiles.java

Lines changed: 0 additions & 59 deletions
This file was deleted.

libs/exponential-histogram/src/main/java/org/elasticsearch/exponentialhistogram/ExponentialHistogramQuantile.java

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.exponentialhistogram;
+
+public class ExponentialHistogramQuantile {
+
+    /**
+     * Provides a quantile for the distribution represented by the given histogram.
+     *
+     * It returns the value of the element at rank {@code max(0, min(n - 1, (quantile * (n + 1)) - 1))}, where ranks start at 0.
+     * If that rank is fractional, we linearly interpolate the values of the two neighboring ranks based on the fraction.
+     *
+     * @param histo    the histogram representing the distribution
+     * @param quantile the quantile to query, in the range [0, 1]
+     * @return NaN if the histogram is empty, otherwise the value at the given quantile
+     */
+    public static double getQuantile(ExponentialHistogram histo, double quantile) {
+        if (quantile < 0 || quantile > 1) {
+            throw new IllegalArgumentException("quantile must be in range [0, 1]");
+        }
+
+        long zeroCount = histo.zeroBucket().count();
+        long negCount = getTotalCount(histo.negativeBuckets());
+        long posCount = getTotalCount(histo.positiveBuckets());
+
+        long totalCount = zeroCount + negCount + posCount;
+        if (totalCount == 0) {
+            // Can't compute a quantile on an empty histogram
+            return Double.NaN;
+        }
+
+        double exactRank = Math.max(0, Math.min(totalCount - 1, (totalCount + 1) * quantile - 1));
+        long lowerRank = (long) Math.floor(exactRank);
+        long upperRank = (long) Math.ceil(exactRank);
+        double upperFactor = exactRank - lowerRank; // interpolation weight of the upper rank
+
+        return getElementAtRank(histo, lowerRank, negCount, zeroCount) * (1 - upperFactor)
+            + getElementAtRank(histo, upperRank, negCount, zeroCount) * upperFactor;
+    }
+
+    private static double getElementAtRank(ExponentialHistogram histo, long rank, long negCount, long zeroCount) {
+        if (rank < negCount) {
+            return -getBucketMidpointForRank(histo.negativeBuckets(), (negCount - 1) - rank);
+        } else if (rank < (negCount + zeroCount)) {
+            return 0.0;
+        } else {
+            return getBucketMidpointForRank(histo.positiveBuckets(), rank - (negCount + zeroCount));
+        }
+    }
+
+    private static double getBucketMidpointForRank(ExponentialHistogram.BucketIterator buckets, long rank) {
+        long seenCount = 0;
+        while (buckets.hasNext()) {
+            seenCount += buckets.peekCount();
+            if (rank < seenCount) {
+                double prev = ExponentialHistogramUtils.getLowerBucketBoundary(buckets.peekIndex(), buckets.scale());
+                double next = ExponentialHistogramUtils.getLowerBucketBoundary(buckets.peekIndex() + 1, buckets.scale());
+                double result = ExponentialHistogramUtils.getPointOfLeastRelativeError(buckets.peekIndex(), buckets.scale());
+                return result;
+            }
+            buckets.advance();
+        }
+        throw new IllegalStateException("buckets contain fewer elements in total than the desired rank");
+    }
+
+    private static long getTotalCount(ExponentialHistogram.BucketIterator buckets) {
+        long count = 0;
+        while (buckets.hasNext()) {
+            count += buckets.peekCount();
+            buckets.advance();
+        }
+        return count;
+    }
+}
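For orientation, here is a minimal usage sketch of the new API (not part of the commit): it builds a histogram through ExponentialHistogramGenerator.createFor, the same factory the tests below use, and queries it with getQuantile. The concrete values and the bucket count of 100 are assumptions for illustration only.

    // Five values, plenty of buckets, so the bucketing error is negligible.
    ExponentialHistogram histo = ExponentialHistogramGenerator.createFor(100, DoubleStream.of(1, 2, 3, 4, 5));

    // n = 5, quantile = 0.5: rank = max(0, min(4, 0.5 * 6 - 1)) = 2, the third-smallest element.
    double median = ExponentialHistogramQuantile.getQuantile(histo, 0.5);   // ~3

    // n = 5, quantile = 0.99: rank = min(4, 0.99 * 6 - 1) = 4, clamped to the largest element.
    double p99 = ExponentialHistogramQuantile.getQuantile(histo, 0.99);     // ~5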

libs/exponential-histogram/src/test/java/org/elasticsearch/exponentialhistogram/FixedSizeExponentialHistogramTests.java

Lines changed: 3 additions & 3 deletions
@@ -38,9 +38,9 @@ public void testPrintBucketsLinearScale() {
             IntStream.range(-1_000_000, 2_000_000).mapToDouble(Double::valueOf)
         );
 
-        double smallPerc = ExpHistoPercentiles.getPercentile(result, 0.00001);
-        double highPerc = ExpHistoPercentiles.getPercentile(result, 0.9999);
-        double median = ExpHistoPercentiles.getPercentile(result, 0.5);
+        double smallPerc = ExponentialHistogramQuantile.getQuantile(result, 0.00001);
+        double highPerc = ExponentialHistogramQuantile.getQuantile(result, 0.9999);
+        double median = ExponentialHistogramQuantile.getQuantile(result, 0.5);
 
         printMidpoints(result);
     }

libs/exponential-histogram/src/test/java/org/elasticsearch/exponentialhistogram/PercentileAccuracyTests.java renamed to libs/exponential-histogram/src/test/java/org/elasticsearch/exponentialhistogram/QuantileAccuracyTests.java

Lines changed: 45 additions & 71 deletions
@@ -23,66 +23,58 @@
 
 import java.util.Arrays;
 import java.util.Random;
+import java.util.stream.DoubleStream;
+import java.util.stream.IntStream;
 
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.lessThan;
 
-public class PercentileAccuracyTests extends ESTestCase {
+public class QuantileAccuracyTests extends ESTestCase {
 
-    public static final double[] PERCENTILES_TO_TEST = { 0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0 };
+    public static final double[] QUANTILES_TO_TEST = { 0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0 };
+
+    public void testBasicSmall() {
+        DoubleStream values = IntStream.range(1, 10).mapToDouble(Double::valueOf);
+        testQuantileAccuracy(values.toArray(), 100);
+    }
 
-    /**
-     * Test percentile accuracy with uniform distribution
-     */
     public void testUniformDistribution() {
-        testDistributionPercentileAccuracy(new UniformRealDistribution(new Well19937c(42), 0, 100), 50000, 500);
+        testDistributionQuantileAccuracy(new UniformRealDistribution(new Well19937c(42), 0, 100), 50000, 500);
     }
 
-    /**
-     * Test percentile accuracy with normal distribution
-     */
     public void testNormalDistribution() {
-        testDistributionPercentileAccuracy(new NormalDistribution(new Well19937c(42), 100, 15), 50000, 500);
+        testDistributionQuantileAccuracy(new NormalDistribution(new Well19937c(42), 100, 15), 50000, 500);
    }
 
-    /**
-     * Test percentile accuracy with exponential distribution
-     */
     public void testExponentialDistribution() {
-        testDistributionPercentileAccuracy(new ExponentialDistribution(new Well19937c(42), 10), 50000, 500);
+        testDistributionQuantileAccuracy(new ExponentialDistribution(new Well19937c(42), 10), 50000, 500);
     }
 
-    /**
-     * Test percentile accuracy with log-normal distribution
-     */
     public void testLogNormalDistribution() {
-        testDistributionPercentileAccuracy(new LogNormalDistribution(new Well19937c(42), 0, 1), 50000, 500);
+        testDistributionQuantileAccuracy(new LogNormalDistribution(new Well19937c(42), 0, 1), 50000, 500);
     }
 
-    /**
-     * Test percentile accuracy with gamma distribution
-     */
     public void testGammaDistribution() {
-        testDistributionPercentileAccuracy(new GammaDistribution(new Well19937c(42), 2, 5), 50000, 500);
+        testDistributionQuantileAccuracy(new GammaDistribution(new Well19937c(42), 2, 5), 50000, 500);
     }
 
-    /**
-     * Test percentile accuracy with beta distribution
-     */
     public void testBetaDistribution() {
-        testDistributionPercentileAccuracy(new BetaDistribution(new Well19937c(42), 2, 5), 50000, 500);
+        testDistributionQuantileAccuracy(new BetaDistribution(new Well19937c(42), 2, 5), 50000, 500);
     }
 
-    /**
-     * Test percentile accuracy with Weibull distribution
-     */
     public void testWeibullDistribution() {
-        testDistributionPercentileAccuracy(new WeibullDistribution(new Well19937c(42), 2, 5), 50000, 500);
+        testDistributionQuantileAccuracy(new WeibullDistribution(new Well19937c(42), 2, 5), 50000, 500);
+    }
+
+    public void testBigJump() {
+        double[] values = DoubleStream.concat(
+            IntStream.range(0, 18).mapToDouble(Double::valueOf),
+            DoubleStream.of(1_000_000.0)
+        ).toArray();
+
+        testQuantileAccuracy(values, 500);
     }
 
-    /**
-     * Test how bucket count affects percentile accuracy
-     */
     public void testBucketCountImpact() {
         RealDistribution distribution = new LogNormalDistribution(new Well19937c(42), 0, 1);
         int sampleSize = 50000;
@@ -91,32 +83,26 @@ public void testBucketCountImpact() {
         // Test with different bucket counts
         int[] bucketCounts = { 10, 50, 100, 200, 500 };
         for (int bucketCount : bucketCounts) {
-            double maxError = testPercentileAccuracy(values, bucketCount);
+            double maxError = testQuantileAccuracy(values, bucketCount);
             logger.info("Bucket count: " + bucketCount + ", Max relative error: " + maxError);
         }
 
         // Verify that more buckets generally means better accuracy
-        double errorWithFewBuckets = testPercentileAccuracy(values, 20);
-        double errorWithManyBuckets = testPercentileAccuracy(values, 200);
+        double errorWithFewBuckets = testQuantileAccuracy(values, 20);
+        double errorWithManyBuckets = testQuantileAccuracy(values, 200);
         assertThat("More buckets should improve accuracy", errorWithManyBuckets, lessThan(errorWithFewBuckets));
     }
 
-    /**
-     * Test percentile accuracy with mixed positive and negative values
-     */
     public void testMixedSignValues() {
         Random random = new Random(42);
         double[] values = new double[10000];
         for (int i = 0; i < values.length; i++) {
             values[i] = (random.nextDouble() * 200) - 100; // Range from -100 to 100
         }
 
-        testPercentileAccuracy(values, 100);
+        testQuantileAccuracy(values, 100);
     }
 
-    /**
-     * Test percentile accuracy with skewed data
-     */
     public void testSkewedData() {
         // Create a highly skewed dataset
         Random random = new Random(42);
@@ -131,12 +117,9 @@ public void testSkewedData() {
             }
         }
 
-        testPercentileAccuracy(values, 100);
+        testQuantileAccuracy(values, 100);
     }
 
-    /**
-     * Test percentile accuracy with data containing zeros
-     */
     public void testDataWithZeros() {
         Random random = new Random(42);
         double[] values = new double[10000];
@@ -149,32 +132,23 @@ public void testDataWithZeros() {
             }
         }
 
-        testPercentileAccuracy(values, 100);
+        testQuantileAccuracy(values, 100);
     }
 
-    /**
-     * Helper method to test percentile accuracy for a given distribution
-     */
-    private void testDistributionPercentileAccuracy(RealDistribution distribution, int sampleSize, int bucketCount) {
+    private void testDistributionQuantileAccuracy(RealDistribution distribution, int sampleSize, int bucketCount) {
         double[] values = generateSamples(distribution, sampleSize);
-        testPercentileAccuracy(values, bucketCount);
+        testQuantileAccuracy(values, bucketCount);
     }
 
-    /**
-     * Helper method to generate samples from a distribution
-     */
-    private double[] generateSamples(RealDistribution distribution, int sampleSize) {
+    private static double[] generateSamples(RealDistribution distribution, int sampleSize) {
         double[] values = new double[sampleSize];
         for (int i = 0; i < sampleSize; i++) {
             values[i] = distribution.sample();
         }
         return values;
     }
 
-    /**
-     * Helper method to test percentile accuracy for a given dataset
-     */
-    private double testPercentileAccuracy(double[] values, int bucketCount) {
+    private double testQuantileAccuracy(double[] values, int bucketCount) {
         // Create histogram
         ExponentialHistogram histogram = ExponentialHistogramGenerator.createFor(bucketCount, Arrays.stream(values));
 
@@ -185,17 +159,17 @@ private double testPercentileAccuracy(double[] values, int bucketCount) {
         double allowedError = getMaximumRelativeError(values, bucketCount);
         double maxError = 0;
 
-        // Compare histogram percentiles with exact percentiles
-        for (double p : PERCENTILES_TO_TEST) {
+        // Compare histogram quantiles with exact quantiles
+        for (double q : QUANTILES_TO_TEST) {
             double exactValue;
-            if (p == 0) {
+            if (q == 0) {
                 exactValue = Arrays.stream(values).min().getAsDouble();
-            } else if (p == 1) {
+            } else if (q == 1) {
                 exactValue = Arrays.stream(values).max().getAsDouble();
             } else {
-                exactValue = exactPercentile.evaluate(p * 100);
+                exactValue = exactPercentile.evaluate(q * 100);
             }
-            double histoValue = ExpHistoPercentiles.getPercentile(histogram, p);
+            double histoValue = ExponentialHistogramQuantile.getQuantile(histogram, q);
 
             // Skip comparison if exact value is zero to avoid division by zero
             if (Math.abs(exactValue) < 1e-10) {
@@ -207,8 +181,8 @@ private double testPercentileAccuracy(double[] values, int bucketCount) {
 
             logger.info(
                 String.format(
-                    "Percentile %.2f: Exact=%.6f, Histogram=%.6f, Relative Error=%.8f, Allowed Relative Error=%.8f",
-                    p,
+                    "Quantile %.2f: Exact=%.6f, Histogram=%.6f, Relative Error=%.8f, Allowed Relative Error=%.8f",
+                    q,
                     exactValue,
                     histoValue,
                     relativeError,
@@ -217,7 +191,7 @@ private double testPercentileAccuracy(double[] values, int bucketCount) {
             );
 
             assertThat(
-                String.format("Percentile %.2f should be accurate within %.6f%% relative error", p, allowedError * 100),
+                String.format("Quantile %.2f should be accurate within %.6f%% relative error", q, allowedError * 100),
                 histoValue,
                 closeTo(exactValue, Math.abs(exactValue * allowedError))
             );
@@ -231,7 +205,7 @@ private double testPercentileAccuracy(double[] values, int bucketCount) {
     * The error depends on the raw values put into the histogram and the number of buckets allowed.
     * This is an implementation of the error bound computation proven by Theorem 3 in the <a href="https://arxiv.org/pdf/2004.08604">UDDSketch paper</a>
     */
-    private double getMaximumRelativeError(double[] values, int bucketCount) {
+    private static double getMaximumRelativeError(double[] values, int bucketCount) {
         double smallestAbsNegative = Double.MAX_VALUE;
         double largestAbsNegative = 0;
         double smallestPositive = Double.MAX_VALUE;
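To read the accuracy assertion in testQuantileAccuracy concretely (the numbers here are hypothetical, purely to illustrate the closeTo bound used above): if the exact quantile of the raw data is 100.0 and getMaximumRelativeError returns 0.01, the histogram estimate must land within 100.0 ± 1.0.

    // Hypothetical numbers illustrating the bound: |histoValue - exactValue| <= |exactValue| * allowedError
    double exactValue = 100.0;
    double allowedError = 0.01;                               // as computed by getMaximumRelativeError(values, bucketCount)
    double tolerance = Math.abs(exactValue * allowedError);   // 1.0, so any histoValue in [99.0, 101.0] passes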