Merge pull request #1144 from prometheus/beorn7/histogram2

beorn7 · web-flow · commit 25bc1886c07f · 2022-10-11T12:57:40.000+02:00
sparse buckets: Fix handling of +Inf/-Inf/NaN observations
diff --git a/prometheus/histogram.go b/prometheus/histogram.go
@@ -577,28 +577,37 @@ func (hc *histogramCounts) observe(v float64, bucket int, doSparse bool) {
 		atomic.AddUint64(&hc.buckets[bucket], 1)
 	}
 	atomicAddFloat(&hc.sumBits, v)
-	if doSparse {
+	if doSparse && !math.IsNaN(v) {
 		var (
-			sparseKey           int
-			sparseSchema        = atomic.LoadInt32(&hc.sparseSchema)
-			sparseZeroThreshold = math.Float64frombits(atomic.LoadUint64(&hc.sparseZeroThresholdBits))
-			frac, exp           = math.Frexp(math.Abs(v))
-			bucketCreated       bool
+			sparseKey            int
+			sparseSchema         = atomic.LoadInt32(&hc.sparseSchema)
+			sparseZeroThreshold  = math.Float64frombits(atomic.LoadUint64(&hc.sparseZeroThresholdBits))
+			bucketCreated, isInf bool
 		)
-		switch {
-		case math.IsInf(v, 0):
-			sparseKey = math.MaxInt32 // Largest possible sparseKey.
-		case sparseSchema > 0:
+		if math.IsInf(v, 0) {
+			// Pretend v is MaxFloat64 but later increment sparseKey by one.
+			if math.IsInf(v, +1) {
+				v = math.MaxFloat64
+			} else {
+				v = -math.MaxFloat64
+			}
+			isInf = true
+		}
+		frac, exp := math.Frexp(math.Abs(v))
+		if sparseSchema > 0 {
 			bounds := sparseBounds[sparseSchema]
 			sparseKey = sort.SearchFloat64s(bounds, frac) + (exp-1)*len(bounds)
-		default:
+		} else {
 			sparseKey = exp
 			if frac == 0.5 {
 				sparseKey--
 			}
 			div := 1 << -sparseSchema
 			sparseKey = (sparseKey + div - 1) / div
 		}
+		if isInf {
+			sparseKey++
+		}
 		switch {
 		case v > sparseZeroThreshold:
 			bucketCreated = addToSparseBucket(&hc.sparseBucketsPositive, sparseKey, 1)
@@ -1062,7 +1071,8 @@ func (v *HistogramVec) GetMetricWith(labels Labels) (Observer, error) {
 // WithLabelValues works as GetMetricWithLabelValues, but panics where
 // GetMetricWithLabelValues would have returned an error. Not returning an
 // error allows shortcuts like
-//     myVec.WithLabelValues("404", "GET").Observe(42.21)
+//
+//	myVec.WithLabelValues("404", "GET").Observe(42.21)
 func (v *HistogramVec) WithLabelValues(lvs ...string) Observer {
 	h, err := v.GetMetricWithLabelValues(lvs...)
 	if err != nil {
@@ -1073,7 +1083,8 @@ func (v *HistogramVec) WithLabelValues(lvs ...string) Observer {
 
 // With works as GetMetricWith but panics where GetMetricWithLabels would have
 // returned an error. Not returning an error allows shortcuts like
-//     myVec.With(prometheus.Labels{"code": "404", "method": "GET"}).Observe(42.21)
+//
+//	myVec.With(prometheus.Labels{"code": "404", "method": "GET"}).Observe(42.21)
 func (v *HistogramVec) With(labels Labels) Observer {
 	h, err := v.GetMetricWith(labels)
 	if err != nil {
@@ -1219,8 +1230,8 @@ func (s buckSort) Less(i, j int) bool {
 // 2^(2^-n) is less or equal the provided bucketFactor.
 //
 // Special cases:
-//     - bucketFactor <= 1: panics.
-//     - bucketFactor < 2^(2^-8) (but > 1): still returns 8.
+//   - bucketFactor <= 1: panics.
+//   - bucketFactor < 2^(2^-8) (but > 1): still returns 8.
 func pickSparseSchema(bucketFactor float64) int32 {
 	if bucketFactor <= 1 {
 		panic(fmt.Errorf("bucketFactor %f is <=1", bucketFactor))
@@ -1346,13 +1357,55 @@ func findSmallestKey(m *sync.Map) int {
 }
 
 func getLe(key int, schema int32) float64 {
+	// Here a bit of context about the behavior for the last bucket counting
+	// regular numbers (called simply "last bucket" below) and the bucket
+	// counting observations of ±Inf (called "inf bucket" below, with a key
+	// one higher than that of the "last bucket"):
+	//
+	// If we apply the usual formula to the last bucket, its upper bound
+	// would be calculated as +Inf. The reason is that the max possible
+	// regular float64 number (math.MaxFloat64) doesn't coincide with one of
+	// the calculated bucket boundaries. So the calculated boundary has to
+	// be larger than math.MaxFloat64, and the only float64 larger than
+	// math.MaxFloat64 is +Inf. However, we want to count actual
+	// observations of ±Inf in the inf bucket. Therefore, we have to treat
+	// the upper bound of the last bucket specially and set it to
+	// math.MaxFloat64. (The upper bound of the inf bucket, with its key
+	// being one higher than that of the last bucket, naturally comes out as
+	// +Inf by the usual formula. So that's fine.)
+	//
+	// math.MaxFloat64 has a frac of 0.9999999999999999 and an exp of
+	// 1024. If there were a float64 number following math.MaxFloat64, it
+	// would have a frac of 1.0 and an exp of 1024, or equivalently a frac
+	// of 0.5 and an exp of 1025. However, since frac must be smaller than
+	// 1, and exp must be smaller than 1025, either representation overflows
+	// a float64. (Which, in turn, is the reason that math.MaxFloat64 is the
+	// largest possible float64. Q.E.D.) However, the formula for
+	// calculating the upper bound from the idx and schema of the last
+	// bucket results in precisely that. It is either frac=1.0 & exp=1024
+	// (for schema < 0) or frac=0.5 & exp=1025 (for schema >=0). (This is,
+	// by the way, a power of two where the exponent itself is a power of
+	// two, 2¹⁰ in fact, which coinicides with a bucket boundary in all
+	// schemas.) So these are the special cases we have to catch below.
 	if schema < 0 {
-		return math.Ldexp(1, key<<(-schema))
+		exp := key << -schema
+		if exp == 1024 {
+			// This is the last bucket before the overflow bucket
+			// (for ±Inf observations). Return math.MaxFloat64 as
+			// explained above.
+			return math.MaxFloat64
+		}
+		return math.Ldexp(1, exp)
 	}
 
 	fracIdx := key & ((1 << schema) - 1)
 	frac := sparseBounds[schema][fracIdx]
 	exp := (key >> schema) + 1
+	if frac == 0.5 && exp == 1025 {
+		// This is the last bucket before the overflow bucket (for ±Inf
+		// observations). Return math.MaxFloat64 as explained above.
+		return math.MaxFloat64
+	}
 	return math.Ldexp(frac, exp)
 }
 
diff --git a/prometheus/histogram_test.go b/prometheus/histogram_test.go
@@ -548,13 +548,13 @@ func TestSparseHistogram(t *testing.T) {
 			name:         "+Inf observation",
 			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(+1)},
 			factor:       1.2,
-			want:         `sample_count:7 sample_sum:inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_span:<offset:2147483642 length:1 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-1 `,
+			want:         `sample_count:7 sample_sum:inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_span:<offset:4092 length:1 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-1 `,
 		},
 		{
 			name:         "-Inf observation",
 			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(-1)},
 			factor:       1.2,
-			want:         `sample_count:7 sample_sum:-inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:2147483647 length:1 > negative_delta:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+			want:         `sample_count:7 sample_sum:-inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:4097 length:1 > negative_delta:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
 		},
 		{
 			name:         "limited buckets but nothing triggered",
@@ -782,3 +782,94 @@ func TestSparseHistogramConcurrency(t *testing.T) {
 		t.Error(err)
 	}
 }
+
+func TestGetLe(t *testing.T) {
+	scenarios := []struct {
+		key    int
+		schema int32
+		want   float64
+	}{
+		{
+			key:    -1,
+			schema: -1,
+			want:   0.25,
+		},
+		{
+			key:    0,
+			schema: -1,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: -1,
+			want:   4,
+		},
+		{
+			key:    512,
+			schema: -1,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    513,
+			schema: -1,
+			want:   math.Inf(+1),
+		},
+		{
+			key:    -1,
+			schema: 0,
+			want:   0.5,
+		},
+		{
+			key:    0,
+			schema: 0,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: 0,
+			want:   2,
+		},
+		{
+			key:    1024,
+			schema: 0,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    1025,
+			schema: 0,
+			want:   math.Inf(+1),
+		},
+		{
+			key:    -1,
+			schema: 2,
+			want:   0.8408964152537144,
+		},
+		{
+			key:    0,
+			schema: 2,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: 2,
+			want:   1.189207115002721,
+		},
+		{
+			key:    4096,
+			schema: 2,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    4097,
+			schema: 2,
+			want:   math.Inf(+1),
+		},
+	}
+
+	for i, s := range scenarios {
+		got := getLe(s.key, s.schema)
+		if s.want != got {
+			t.Errorf("%d. key %d, schema %d, want upper bound of %g, got %g", i, s.key, s.schema, s.want, got)
+		}
+	}
+}