Skip to content

Commit d55badb

Browse files
committed
Add NH count validation
Signed-off-by: SungJin1212 <[email protected]>
1 parent c505924 commit d55badb

File tree

4 files changed

+213
-1
lines changed

4 files changed

+213
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
* [FEATURE] Querier: Support for configuring query optimizers and enabling XFunctions in the Thanos engine. #6873
3131
* [FEATURE] Query Frontend: Add support /api/v1/format_query API for formatting queries. #6893
3232
* [FEATURE] Query Frontend: Add support for /api/v1/parse_query API (experimental) to parse a PromQL expression and return it as a JSON-formatted AST (abstract syntax tree). #6978
33+
* [ENHANCEMENT] Distributor: Add count validations for native histogram. #7072
3334
* [ENHANCEMENT] Upgrade the Prometheus version to 3.6.0 and add a `-name-validation-scheme` flag to support UTF-8. #7040 #7056
3435
* [ENHANCEMENT] Distributor: Emit an error with a 400 status code when empty labels are found before the relabelling or label dropping process. #7052
3536
* [ENHANCEMENT] Parquet Storage: Add support for additional sort columns during Parquet file generation #7003

pkg/util/validation/errors.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,62 @@ func (e *nativeHistogramSampleSizeBytesExceededError) Error() string {
282282
return fmt.Sprintf("native histogram sample size bytes exceeded for metric (actual: %d, limit: %d) metric: %.200q", e.nhSampleSizeBytes, e.limit, formatLabelSet(e.series))
283283
}
284284

285+
// nativeHistogramMisMatchCountError is a ValidationError implementation for native histogram
286+
// where the Count does not match the sum of observations in the buckets.
287+
type nativeHistogramMisMatchCountError struct {
288+
series []cortexpb.LabelAdapter
289+
observations uint64
290+
count uint64
291+
}
292+
293+
func newNativeHistogramMisMatchedCountError(series []cortexpb.LabelAdapter, observations, count uint64) ValidationError {
294+
return &nativeHistogramMisMatchCountError{
295+
series: series,
296+
observations: observations,
297+
count: count,
298+
}
299+
}
300+
301+
func (e *nativeHistogramMisMatchCountError) Error() string {
302+
return fmt.Sprintf("native histogram bucket count mismatch: count is %d, but observations found in buckets is %d, metric: %.200q", e.count, e.observations, formatLabelSet(e.series))
303+
}
304+
305+
// nativeHistogramNegativeCountError is a ValidationError implementation for float native histogram
306+
// where the Count field is negative.
307+
type nativeHistogramNegativeCountError struct {
308+
series []cortexpb.LabelAdapter
309+
count float64
310+
}
311+
312+
func newNativeHistogramNegativeCountError(series []cortexpb.LabelAdapter, count float64) ValidationError {
313+
return &nativeHistogramNegativeCountError{
314+
series: series,
315+
count: count,
316+
}
317+
}
318+
319+
func (e *nativeHistogramNegativeCountError) Error() string {
320+
return fmt.Sprintf("native histogram observation count %.2f is negative, metric: %.200q", e.count, formatLabelSet(e.series))
321+
}
322+
323+
// nativeHistogramNegativeBucketCountError is a ValidationError implementation for float native histogram
324+
// where the count in buckets is negative.
325+
type nativeHistogramNegativeBucketCountError struct {
326+
series []cortexpb.LabelAdapter
327+
count float64
328+
}
329+
330+
func newNativeHistogramNegativeBucketCountError(series []cortexpb.LabelAdapter, count float64) ValidationError {
331+
return &nativeHistogramNegativeBucketCountError{
332+
series: series,
333+
count: count,
334+
}
335+
}
336+
337+
func (e *nativeHistogramNegativeBucketCountError) Error() string {
338+
return fmt.Sprintf("native histogram buckets have a negative count: %.2f, metric: %.200q", e.count, formatLabelSet(e.series))
339+
}
340+
285341
// formatLabelSet formats label adapters as a metric name with labels, while preserving
286342
// label order, and keeping duplicates. If there are multiple "__name__" labels, only
287343
// first one is used as metric name, other ones will be included as regular labels.

pkg/util/validation/validate.go

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package validation
22

33
import (
44
"errors"
5+
"fmt"
6+
"math"
57
"net/http"
68
"strings"
79
"time"
@@ -60,6 +62,9 @@ const (
6062
nativeHistogramBucketCountLimitExceeded = "native_histogram_buckets_exceeded"
6163
nativeHistogramInvalidSchema = "native_histogram_invalid_schema"
6264
nativeHistogramSampleSizeBytesExceeded = "native_histogram_sample_size_bytes_exceeded"
65+
nativeHistogramNegativeCount = "native_histogram_negative_count"
66+
nativeHistogramNegativeBucketCount = "native_histogram_negative_bucket_count"
67+
nativeHistogramMisMatchCount = "native_histogram_mismatch_count"
6368

6469
// RateLimited is one of the values for the reason to discard samples.
6570
// Declared here to avoid duplication in ingester and distributor.
@@ -368,7 +373,6 @@ func ValidateMetadata(validateMetrics *ValidateMetrics, cfg *Limits, userID stri
368373
}
369374

370375
func ValidateNativeHistogram(validateMetrics *ValidateMetrics, limits *Limits, userID string, ls []cortexpb.LabelAdapter, histogramSample cortexpb.Histogram) (cortexpb.Histogram, error) {
371-
372376
// sample size validation for native histogram
373377
if limits.MaxNativeHistogramSampleSizeBytes > 0 && histogramSample.Size() > limits.MaxNativeHistogramSampleSizeBytes {
374378
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramSampleSizeBytesExceeded, userID).Inc()
@@ -381,13 +385,64 @@ func ValidateNativeHistogram(validateMetrics *ValidateMetrics, limits *Limits, u
381385
return cortexpb.Histogram{}, newNativeHistogramSchemaInvalidError(ls, int(histogramSample.Schema))
382386
}
383387

388+
var nCount, pCount uint64
389+
if histogramSample.IsFloatHistogram() {
390+
if err, c := checkHistogramBuckets(histogramSample.GetNegativeCounts(), &nCount, false); err != nil {
391+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramNegativeBucketCount, userID).Inc()
392+
return cortexpb.Histogram{}, newNativeHistogramNegativeBucketCountError(ls, c)
393+
}
394+
395+
if err, c := checkHistogramBuckets(histogramSample.GetPositiveCounts(), &pCount, false); err != nil {
396+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramNegativeBucketCount, userID).Inc()
397+
return cortexpb.Histogram{}, newNativeHistogramNegativeBucketCountError(ls, c)
398+
}
399+
400+
if histogramSample.GetZeroCountFloat() < 0 {
401+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramNegativeBucketCount, userID).Inc()
402+
return cortexpb.Histogram{}, newNativeHistogramNegativeBucketCountError(ls, histogramSample.GetZeroCountFloat())
403+
}
404+
405+
if histogramSample.GetCountFloat() < 0 {
406+
// validate if float histogram has negative count
407+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramNegativeCount, userID).Inc()
408+
return cortexpb.Histogram{}, newNativeHistogramNegativeCountError(ls, histogramSample.GetCountFloat())
409+
}
410+
} else {
411+
if err, c := checkHistogramBuckets(histogramSample.GetNegativeDeltas(), &nCount, true); err != nil {
412+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramNegativeBucketCount, userID).Inc()
413+
return cortexpb.Histogram{}, newNativeHistogramNegativeBucketCountError(ls, c)
414+
}
415+
416+
if err, c := checkHistogramBuckets(histogramSample.GetPositiveDeltas(), &pCount, true); err != nil {
417+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramNegativeBucketCount, userID).Inc()
418+
return cortexpb.Histogram{}, newNativeHistogramNegativeBucketCountError(ls, c)
419+
}
420+
421+
// validate if there is mismatch between count with observations in buckets
422+
observations := nCount + pCount + histogramSample.GetZeroCountInt()
423+
count := histogramSample.GetCountInt()
424+
425+
if math.IsNaN(histogramSample.Sum) {
426+
if observations > count {
427+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramMisMatchCount, userID).Inc()
428+
return cortexpb.Histogram{}, newNativeHistogramMisMatchedCountError(ls, observations, count)
429+
}
430+
} else {
431+
if observations != count {
432+
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramMisMatchCount, userID).Inc()
433+
return cortexpb.Histogram{}, newNativeHistogramMisMatchedCountError(ls, observations, count)
434+
}
435+
}
436+
}
437+
384438
if limits.MaxNativeHistogramBuckets == 0 {
385439
return histogramSample, nil
386440
}
387441

388442
var (
389443
exceedLimit bool
390444
)
445+
391446
if histogramSample.IsFloatHistogram() {
392447
// Initial check to see if the bucket limit is exceeded or not. If not, we can avoid type casting.
393448
exceedLimit = len(histogramSample.PositiveCounts)+len(histogramSample.NegativeCounts) > limits.MaxNativeHistogramBuckets
@@ -399,6 +454,7 @@ func ValidateNativeHistogram(validateMetrics *ValidateMetrics, limits *Limits, u
399454
validateMetrics.DiscardedSamples.WithLabelValues(nativeHistogramBucketCountLimitExceeded, userID).Inc()
400455
return cortexpb.Histogram{}, newHistogramBucketLimitExceededError(ls, limits.MaxNativeHistogramBuckets)
401456
}
457+
402458
fh := cortexpb.FloatHistogramProtoToFloatHistogram(histogramSample)
403459
oBuckets := len(fh.PositiveBuckets) + len(fh.NegativeBuckets)
404460
for len(fh.PositiveBuckets)+len(fh.NegativeBuckets) > limits.MaxNativeHistogramBuckets {
@@ -437,10 +493,35 @@ func ValidateNativeHistogram(validateMetrics *ValidateMetrics, limits *Limits, u
437493
if oBuckets != len(h.PositiveBuckets)+len(h.NegativeBuckets) {
438494
validateMetrics.HistogramSamplesReducedResolution.WithLabelValues(userID).Inc()
439495
}
496+
440497
// If resolution reduced, convert new histogram to protobuf type again.
441498
return cortexpb.HistogramToHistogramProto(histogramSample.TimestampMs, h), nil
442499
}
443500

501+
// copy from https://github.com/prometheus/prometheus/blob/v3.6.0/model/histogram/generic.go#L399-L420
502+
func checkHistogramBuckets[BC histogram.BucketCount, IBC histogram.InternalBucketCount](buckets []IBC, count *BC, deltas bool) (error, float64) {
503+
if len(buckets) == 0 {
504+
return nil, 0
505+
}
506+
507+
var last IBC
508+
for i := range buckets {
509+
var c IBC
510+
if deltas {
511+
c = last + buckets[i]
512+
} else {
513+
c = buckets[i]
514+
}
515+
if c < 0 {
516+
return fmt.Errorf("bucket number %d has observation count of %v", i+1, c), float64(c)
517+
}
518+
last = c
519+
*count += BC(c)
520+
}
521+
522+
return nil, 0
523+
}
524+
444525
func DeletePerUserValidationMetrics(validateMetrics *ValidateMetrics, userID string, log log.Logger) {
445526
filter := map[string]string{"user": userID}
446527

pkg/util/validation/validate_test.go

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package validation
22

33
import (
4+
"math"
45
"net/http"
56
"strings"
67
"testing"
@@ -412,6 +413,31 @@ func TestValidateNativeHistogram(t *testing.T) {
412413
exceedMaxRangeSchemaFloatHistogram.Schema = 20
413414
exceedMaxSampleSizeBytesLimitFloatHistogram := tsdbutil.GenerateTestFloatHistogram(100)
414415

416+
negativeBucketCountInNBucketsFH := tsdbutil.GenerateTestFloatHistogram(0)
417+
negativeBucketCountInNBucketsFH.NegativeBuckets = []float64{-1.1, -1.2, -1.3, -1.4}
418+
419+
negativeBucketCountInPBucketsFH := tsdbutil.GenerateTestFloatHistogram(0)
420+
negativeBucketCountInPBucketsFH.PositiveBuckets = []float64{-1.1, -1.2, -1.3, -1.4}
421+
422+
negativeCountFloatHistogram := tsdbutil.GenerateTestFloatHistogram(0)
423+
negativeCountFloatHistogram.Count = -1.2345
424+
425+
negativeZeroCountFloatHistogram := tsdbutil.GenerateTestFloatHistogram(0)
426+
negativeZeroCountFloatHistogram.ZeroCount = -1.2345
427+
428+
negativeBucketCountInNBucketsH := tsdbutil.GenerateTestHistogram(0)
429+
negativeBucketCountInNBucketsH.NegativeBuckets = []int64{-1, -2, -3, -4}
430+
431+
negativeBucketCountInPBucketsH := tsdbutil.GenerateTestHistogram(0)
432+
negativeBucketCountInPBucketsH.PositiveBuckets = []int64{-1, -2, -3, -4}
433+
434+
countMisMatchSumIsNaN := tsdbutil.GenerateTestHistogram(0)
435+
countMisMatchSumIsNaN.Sum = math.NaN()
436+
countMisMatchSumIsNaN.Count = 11
437+
438+
countMisMatch := tsdbutil.GenerateTestHistogram(0)
439+
countMisMatch.Count = 11
440+
415441
for _, tc := range []struct {
416442
name string
417443
bucketLimit int
@@ -525,6 +551,54 @@ func TestValidateNativeHistogram(t *testing.T) {
525551
discardedSampleLabelValue: nativeHistogramSampleSizeBytesExceeded,
526552
maxNativeHistogramSampleSizeBytesLimit: 100,
527553
},
554+
{
555+
name: "negative observations count in negative buckets for float native histogram",
556+
histogram: cortexpb.FloatHistogramToHistogramProto(0, negativeBucketCountInNBucketsFH.Copy()),
557+
expectedErr: newNativeHistogramNegativeBucketCountError(lbls, negativeBucketCountInNBucketsFH.NegativeBuckets[0]),
558+
discardedSampleLabelValue: nativeHistogramNegativeBucketCount,
559+
},
560+
{
561+
name: "negative observations count in positive buckets for float native histogram",
562+
histogram: cortexpb.FloatHistogramToHistogramProto(0, negativeBucketCountInPBucketsFH.Copy()),
563+
expectedErr: newNativeHistogramNegativeBucketCountError(lbls, negativeBucketCountInPBucketsFH.PositiveBuckets[0]),
564+
discardedSampleLabelValue: nativeHistogramNegativeBucketCount,
565+
},
566+
{
567+
name: "count is negative for float native histogram",
568+
histogram: cortexpb.FloatHistogramToHistogramProto(0, negativeCountFloatHistogram.Copy()),
569+
expectedErr: newNativeHistogramNegativeCountError(lbls, negativeCountFloatHistogram.Count),
570+
discardedSampleLabelValue: nativeHistogramNegativeCount,
571+
},
572+
{
573+
name: "zero count is negative for float native histogram",
574+
histogram: cortexpb.FloatHistogramToHistogramProto(0, negativeZeroCountFloatHistogram.Copy()),
575+
expectedErr: newNativeHistogramNegativeBucketCountError(lbls, negativeZeroCountFloatHistogram.ZeroCount),
576+
discardedSampleLabelValue: nativeHistogramNegativeBucketCount,
577+
},
578+
{
579+
name: "negative observations count in negative buckets for native histogram",
580+
histogram: cortexpb.HistogramToHistogramProto(0, negativeBucketCountInNBucketsH.Copy()),
581+
expectedErr: newNativeHistogramNegativeBucketCountError(lbls, float64(negativeBucketCountInNBucketsH.NegativeBuckets[0])),
582+
discardedSampleLabelValue: nativeHistogramNegativeBucketCount,
583+
},
584+
{
585+
name: "negative observations count in positive buckets for native histogram",
586+
histogram: cortexpb.HistogramToHistogramProto(0, negativeBucketCountInPBucketsH.Copy()),
587+
expectedErr: newNativeHistogramNegativeBucketCountError(lbls, float64(negativeBucketCountInPBucketsH.PositiveBuckets[0])),
588+
discardedSampleLabelValue: nativeHistogramNegativeBucketCount,
589+
},
590+
{
591+
name: "mismatch between observations count with count field when sum is NaN",
592+
histogram: cortexpb.HistogramToHistogramProto(0, countMisMatchSumIsNaN.Copy()),
593+
expectedErr: newNativeHistogramMisMatchedCountError(lbls, 12, 11),
594+
discardedSampleLabelValue: nativeHistogramMisMatchCount,
595+
},
596+
{
597+
name: "mismatch between observations count with count field",
598+
histogram: cortexpb.HistogramToHistogramProto(0, countMisMatch.Copy()),
599+
expectedErr: newNativeHistogramMisMatchedCountError(lbls, 12, 11),
600+
discardedSampleLabelValue: nativeHistogramMisMatchCount,
601+
},
528602
} {
529603
t.Run(tc.name, func(t *testing.T) {
530604
reg := prometheus.NewRegistry()

0 commit comments

Comments
 (0)