Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,46 @@ public class ExponentialHistogramUtils {

/**
* Estimates the sum of all values of a histogram just based on the populated buckets.
* Will never return NaN, but might return +/-Infinity if the histogram is too big.
*
* @param negativeBuckets the negative buckets of the histogram
* @param positiveBuckets the positive buckets of the histogram
* @return the estimated sum of all values in the histogram, guaranteed to be zero if there are no buckets
* @return the estimated sum of all values in the histogram, guaranteed to be zero if there are no buckets.
*/
public static double estimateSum(BucketIterator negativeBuckets, BucketIterator positiveBuckets) {
assert negativeBuckets.scale() == positiveBuckets.scale();
double sum = 0.0;
while (negativeBuckets.hasNext()) {
double bucketMidPoint = ExponentialScaleUtils.getPointOfLeastRelativeError(
negativeBuckets.peekIndex(),
negativeBuckets.scale()
);
sum += -bucketMidPoint * negativeBuckets.peekCount();
negativeBuckets.advance();
}
while (positiveBuckets.hasNext()) {
while (negativeBuckets.hasNext() || positiveBuckets.hasNext()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found the two separate loops easier to follow. Is the only reason you're using a single loop to track the highest bucket so that you know which sign to use for the infinity? If so, couldn't you store the max negative and positive index?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To reiterate for other from our slack chat:
Unfortunately the parallel iteration is required, because positive and negative buckets can cancel each other out.
E.g. let's assume that all of the buckets of the histogram below are already at +-Infinity:

negative:
  indices: [999, 1000, 1001, 1002, 1003]
  counts: [1,2, 1, 1, 1]
positive:
  indices: [999, 1000, 1001, 1002, 1003]
  counts: [2,1, 1, 1, 1]

We have to find the largest bucket where the counts do not cancel each other out, in this case it would be index 1000 and the negative range would be the winner.

However, I think I can remove a lot of the mental overhead here by reusing the MergingBucketIterator with some slight adjustments.

long negativeIndex = negativeBuckets.hasNext() ? negativeBuckets.peekIndex() : Long.MAX_VALUE;
long positiveIndex = positiveBuckets.hasNext() ? positiveBuckets.peekIndex() : Long.MAX_VALUE;

double bucketMidPoint = ExponentialScaleUtils.getPointOfLeastRelativeError(
positiveBuckets.peekIndex(),
Math.min(negativeIndex, positiveIndex),
positiveBuckets.scale()
);
sum += bucketMidPoint * positiveBuckets.peekCount();
positiveBuckets.advance();

long countWithSign;
if (negativeIndex == positiveIndex) {
countWithSign = positiveBuckets.peekCount() - negativeBuckets.peekCount();
positiveBuckets.advance();
negativeBuckets.advance();
} else if (negativeIndex < positiveIndex){
countWithSign = -negativeBuckets.peekCount();
negativeBuckets.advance();
} else { // positiveIndex > negativeIndex
countWithSign = positiveBuckets.peekCount();
positiveBuckets.advance();
}
if (countWithSign != 0) {
double toAdd = bucketMidPoint * countWithSign;
if (Double.isFinite(toAdd)) {
sum += toAdd;
} else {
// Avoid NaN in case we end up with e.g. -Infinity+Infinity
// we consider the bucket with the bigger index the winner for the sign
sum = toAdd;
}
}
}
return sum;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
package org.elasticsearch.exponentialhistogram;

import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.equalTo;

public class ExponentialHistogramUtilsTests extends ExponentialHistogramTestCase {

Expand Down Expand Up @@ -55,4 +56,31 @@ public void testRandomDataSumEstimation() {
assertThat(estimatedAverage, closeTo(correctAverage, allowedError));
}
}

public void testInfinityHandling() {
FixedCapacityExponentialHistogram morePositiveValues = createAutoReleasedHistogram(100);
morePositiveValues.resetBuckets(0);
morePositiveValues.tryAddBucket(1999, 1,false);
morePositiveValues.tryAddBucket(2000, 2, false);
morePositiveValues.tryAddBucket(1999, 2,true);
morePositiveValues.tryAddBucket(2000, 2,true);

double sum = ExponentialHistogramUtils.estimateSum(
morePositiveValues.negativeBuckets().iterator(),
morePositiveValues.positiveBuckets().iterator()
);
assertThat(sum, equalTo(Double.POSITIVE_INFINITY));
FixedCapacityExponentialHistogram moreNegativeValues = createAutoReleasedHistogram(100);
moreNegativeValues.resetBuckets(0);
moreNegativeValues.tryAddBucket(1999, 2,false);
moreNegativeValues.tryAddBucket(2000, 2, false);
moreNegativeValues.tryAddBucket(1999, 1,true);
moreNegativeValues.tryAddBucket(2000, 2,true);

sum = ExponentialHistogramUtils.estimateSum(
moreNegativeValues.negativeBuckets().iterator(),
moreNegativeValues.positiveBuckets().iterator()
);
assertThat(sum, equalTo(Double.NEGATIVE_INFINITY));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,7 @@ private static Map<String, Object> createRandomHistogramValue(int maxBucketCount
Map.of("indices", negativeIndices, "counts", negativeCounts)
)
);
if ((positiveIndices.isEmpty() == false || negativeIndices.isEmpty() == false)) {
// we always add the sum field to avoid numeric problems with the estimation due to random buckets
// sum generation is tested in the yaml tests
if (randomBoolean() && (positiveIndices.isEmpty() == false || negativeIndices.isEmpty() == false)) {
result.put("sum", randomDoubleBetween(-1000, 1000, true));
}
return result;
Expand Down