Commit a56a88b
Add unwrapped aggregation rate_counter() (grafana#6412)

`rate_counter()` takes an unwrapped range as input, but unlike `rate()` it treats the values extracted from the log lines as a counter metric, like Prometheus' `rate()` function does. This is a replacement for the reverted change of grafana#5013.

Signed-off-by: Christian Haudum <[email protected]>
1 parent 35cb40a commit a56a88b

File tree

10 files changed: +455 -507 lines changed


CHANGELOG.md

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,6 @@
 * [6372](https://github.com/grafana/loki/pull/6372) **splitice**: Add support for numbers in JSON fields
 * [6105](https://github.com/grafana/loki/pull/6105) **rutgerke** Export metrics for the promtail journal target
 * [6179](https://github.com/grafana/loki/pull/6179) **chaudum**: Add new HTTP endpoint to delete ingester ring token file and shutdown process gracefully
-* [6099](https://github.com/grafana/loki/pull/6099/files) **cstyan**: Drop lines with malformed JSON in Promtail JSON pipeline stage
 * [6136](https://github.com/grafana/loki/pull/6136) **periklis**: Add support for alertmanager header authorization
 * [6102](https://github.com/grafana/loki/pull/6102) **timchenko-a**: Add multi-tenancy support to lambda-promtail
 * [5971](https://github.com/grafana/loki/pull/5971) **kavirajk**: Record statistics about metadata queries such as labels and series queries in `metrics.go` as well
@@ -87,6 +86,7 @@
 #### Loki

 ##### Enhancements
+* [6361](https://github.com/grafana/loki/pull/6361) **chaudum**: Add new unwrapped range aggregation `rate_counter()` to LogQL
 * [6317](https://github.com/grafana/loki/pull/6317/files) **dannykoping**: General: add cache usage statistics

 ##### Fixes

docs/sources/logql/metric_queries.md

Lines changed: 1 addition & 0 deletions

@@ -69,6 +69,7 @@ We currently support the functions:
 Supported function for operating over unwrapped ranges are:

 - `rate(unwrapped-range)`: calculates per second rate of the sum of all values in the specified interval.
+- `rate_counter(unwrapped-range)`: calculates per second rate of the values in the specified interval and treating them as "counter metric"
 - `sum_over_time(unwrapped-range)`: the sum of all values in the specified interval.
 - `avg_over_time(unwrapped-range)`: the average value of all points in the specified interval.
 - `max_over_time(unwrapped-range)`: the maximum value of all points in the specified interval.

docs/sources/upgrading/_index.md

Lines changed: 7 additions & 1 deletion

@@ -31,9 +31,15 @@ The output is incredibly verbose as it shows the entire internal config struct u

 ## Main / Unreleased

-
 ### Loki

+#### Implementation of unwrapped `rate` aggregation changed
+
+The implementation of the `rate()` aggregation function changed back to the previous implementation prior to [#5013](https://github.com/grafana/loki/pulls/5013).
+This means that the rate per second is calculated based on the sum of the extracted values, instead of the average increase over time.
+
+If you want the extracted values to be treated as [Counter](https://prometheus.io/docs/concepts/metric_types/#counter) metric, you should use the new `rate_counter()` aggregation function, which calculates the per-second average rate of increase of the vector.
+
 #### Default value for `azure.container-name` changed

 This value now defaults to `loki`, it was previously set to `cortex`. If you are relying on this container name for your chunks or ruler storage, you will have to manually specify `-azure.container-name=cortex` or `-ruler.storage.azure.container-name=cortex` respectively.
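To make the behavioral difference concrete, here is a small standalone Go sketch with hypothetical helper names (not Loki's actual implementation; it ignores extrapolation and counter resets) contrasting the sum-based `rate()` with the counter-based `rate_counter()`:

```go
package main

import "fmt"

// sumRate mirrors the restored rate() behavior:
// sum of the extracted values divided by the range in seconds.
func sumRate(values []float64, rangeSeconds float64) float64 {
	var sum float64
	for _, v := range values {
		sum += v
	}
	return sum / rangeSeconds
}

// counterRate sketches the idea behind rate_counter():
// last minus first value divided by the range in seconds.
func counterRate(values []float64, rangeSeconds float64) float64 {
	if len(values) < 2 {
		return 0
	}
	return (values[len(values)-1] - values[0]) / rangeSeconds
}

func main() {
	// 15 samples of a counter increasing by 1 (47..61) over a 30s range,
	// as in the test cases of this commit.
	samples := make([]float64, 15)
	for i := range samples {
		samples[i] = float64(47 + i)
	}
	fmt.Println(sumRate(samples, 30))     // 810 / 30 = 27
	fmt.Println(counterRate(samples, 30)) // (61 - 47) / 30 ≈ 0.4667
}
```

The same series thus yields wildly different results under the two functions, which is why the upgrade note above matters.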

pkg/logql/engine_test.go

Lines changed: 39 additions & 1 deletion

@@ -53,13 +53,15 @@ func TestEngine_LogsRateUnwrap(t *testing.T) {
 			time.Unix(60, 0),
 			logproto.FORWARD,
 			10,
+			// create a stream {app="foo"} with 300 samples starting at 46s and ending at 345s with a constant value of 1
 			[][]logproto.Series{
 				// 30s range the lower bound of the range is not inclusive only 15 samples will make it 60 included
 				{newSeries(testSize, offset(46, constantValue(1)), `{app="foo"}`)},
 			},
 			[]SelectSampleParams{
 				{&logproto.SampleQueryRequest{Start: time.Unix(30, 0), End: time.Unix(60, 0), Selector: `rate({app="foo"} | unwrap foo[30s])`}},
 			},
+			// there are 15 samples (from 47 to 61) matched from the generated series
 			// SUM(n=47, 61, 1) = 15
 			// 15 / 30 = 0.5
 			promql.Vector{promql.Sample{Point: promql.Point{T: 60 * 1000, V: 0.5}, Metric: labels.Labels{labels.Label{Name: "app", Value: "foo"}}}},
@@ -69,17 +71,53 @@
 			time.Unix(60, 0),
 			logproto.FORWARD,
 			10,
+			// create a stream {app="foo"} with 300 samples starting at 46s and ending at 345s with an increasing value by 1
 			[][]logproto.Series{
 				// 30s range the lower bound of the range is not inclusive only 15 samples will make it 60 included
 				{newSeries(testSize, offset(46, incValue(1)), `{app="foo"}`)},
 			},
 			[]SelectSampleParams{
 				{&logproto.SampleQueryRequest{Start: time.Unix(30, 0), End: time.Unix(60, 0), Selector: `rate({app="foo"} | unwrap foo[30s])`}},
 			},
-			// SUM(n=47, 61, n) = 810
+			// there are 15 samples (from 47 to 61) matched from the generated series
+			// SUM(n=47, 61, n) = (47+48+...+61) = 810
 			// 810 / 30 = 27
 			promql.Vector{promql.Sample{Point: promql.Point{T: 60 * 1000, V: 27}, Metric: labels.Labels{labels.Label{Name: "app", Value: "foo"}}}},
 		},
+		{
+			`rate_counter({app="foo"} | unwrap foo [30s])`,
+			time.Unix(60, 0),
+			logproto.FORWARD,
+			10,
+			// create a stream {app="foo"} with 300 samples starting at 46s and ending at 345s with a constant value of 1
+			[][]logproto.Series{
+				// 30s range the lower bound of the range is not inclusive only 15 samples will make it 60 included
+				{newSeries(testSize, offset(46, constantValue(1)), `{app="foo"}`)},
+			},
+			[]SelectSampleParams{
+				{&logproto.SampleQueryRequest{Start: time.Unix(30, 0), End: time.Unix(60, 0), Selector: `rate_counter({app="foo"} | unwrap foo[30s])`}},
+			},
+			// there are 15 samples (from 47 to 61) matched from the generated series
+			// (1 - 1) / 30 = 0
+			promql.Vector{promql.Sample{Point: promql.Point{T: 60 * 1000, V: 0}, Metric: labels.Labels{labels.Label{Name: "app", Value: "foo"}}}},
+		},
+		{
+			`rate_counter({app="foo"} | unwrap foo [30s])`,
+			time.Unix(60, 0),
+			logproto.FORWARD,
+			10,
+			// create a stream {app="foo"} with 300 samples starting at 46s and ending at 345s with an increasing value by 1
+			[][]logproto.Series{
+				// 30s range the lower bound of the range is not inclusive only 15 samples will make it 60 included
+				{newSeries(testSize, offset(46, incValue(1)), `{app="foo"}`)},
+			},
+			[]SelectSampleParams{
+				{&logproto.SampleQueryRequest{Start: time.Unix(30, 0), End: time.Unix(60, 0), Selector: `rate_counter({app="foo"} | unwrap foo[30s])`}},
+			},
+			// there are 15 samples (from 47 to 61) matched from the generated series
+			// (61 - 47) / 30 = 0.4666
+			promql.Vector{promql.Sample{Point: promql.Point{T: 60 * 1000, V: 0.46666766666666665}, Metric: labels.Labels{labels.Label{Name: "app", Value: "foo"}}}},
+		},
 	} {
 		test := test
 		t.Run(fmt.Sprintf("%s %s", test.qs, test.direction), func(t *testing.T) {
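The expected values in the test comments above can be double-checked with a tiny standalone Go calculation (independent of the test harness):

```go
package main

import "fmt"

func main() {
	// SUM(n=47, 61, n): the 15 increasing sample values matched in the 30s range.
	sum := 0.0
	for n := 47; n <= 61; n++ {
		sum += float64(n)
	}
	fmt.Println(sum)      // 810
	fmt.Println(sum / 30) // 27, the expected rate() result

	// rate_counter() instead looks at the increase over the range:
	// (61 - 47) / 30, before boundary extrapolation nudges it to the
	// 0.46666766666666665 the test expects.
	fmt.Println((61.0 - 47.0) / 30.0) // ≈ 0.4667
}
```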

pkg/logql/range_vector.go

Lines changed: 88 additions & 0 deletions

@@ -189,6 +189,8 @@ func aggregator(r *syntax.RangeAggregationExpr) (RangeVectorAggregator, error) {
 	switch r.Operation {
 	case syntax.OpRangeTypeRate:
 		return rateLogs(r.Left.Interval, r.Left.Unwrap != nil), nil
+	case syntax.OpRangeTypeRateCounter:
+		return rateCounter(r.Left.Interval), nil
 	case syntax.OpRangeTypeCount:
 		return countOverTime, nil
 	case syntax.OpRangeTypeBytesRate:
@@ -233,6 +235,92 @@ func rateLogs(selRange time.Duration, computeValues bool) func(samples []promql.
 	}
 }

+// rateCounter calculates the per-second rate of values extracted from log lines
+// and treat them like a "counter" metric.
+func rateCounter(selRange time.Duration) func(samples []promql.Point) float64 {
+	return func(samples []promql.Point) float64 {
+		return extrapolatedRate(samples, selRange, true, true)
+	}
+}
+
+// extrapolatedRate function is taken from prometheus code promql/functions.go:59
+// extrapolatedRate is a utility function for rate/increase/delta.
+// It calculates the rate (allowing for counter resets if isCounter is true),
+// extrapolates if the first/last sample is close to the boundary, and returns
+// the result as either per-second (if isRate is true) or overall.
+func extrapolatedRate(samples []promql.Point, selRange time.Duration, isCounter, isRate bool) float64 {
+	// No sense in trying to compute a rate without at least two points. Drop
+	// this Vector element.
+	if len(samples) < 2 {
+		return 0
+	}
+	var (
+		rangeStart = samples[0].T - durationMilliseconds(selRange)
+		rangeEnd   = samples[len(samples)-1].T
+	)
+
+	resultValue := samples[len(samples)-1].V - samples[0].V
+	if isCounter {
+		var lastValue float64
+		for _, sample := range samples {
+			if sample.V < lastValue {
+				resultValue += lastValue
+			}
+			lastValue = sample.V
+		}
+	}
+
+	// Duration between first/last samples and boundary of range.
+	durationToStart := float64(samples[0].T-rangeStart) / 1000
+	durationToEnd := float64(rangeEnd-samples[len(samples)-1].T) / 1000
+
+	sampledInterval := float64(samples[len(samples)-1].T-samples[0].T) / 1000
+	averageDurationBetweenSamples := sampledInterval / float64(len(samples)-1)
+
+	if isCounter && resultValue > 0 && samples[0].V >= 0 {
+		// Counters cannot be negative. If we have any slope at
+		// all (i.e. resultValue went up), we can extrapolate
+		// the zero point of the counter. If the duration to the
+		// zero point is shorter than the durationToStart, we
+		// take the zero point as the start of the series,
+		// thereby avoiding extrapolation to negative counter
+		// values.
+		durationToZero := sampledInterval * (samples[0].V / resultValue)
+		if durationToZero < durationToStart {
+			durationToStart = durationToZero
+		}
+	}
+
+	// If the first/last samples are close to the boundaries of the range,
+	// extrapolate the result. This is as we expect that another sample
+	// will exist given the spacing between samples we've seen thus far,
+	// with an allowance for noise.
+	extrapolationThreshold := averageDurationBetweenSamples * 1.1
+	extrapolateToInterval := sampledInterval
+
+	if durationToStart < extrapolationThreshold {
+		extrapolateToInterval += durationToStart
+	} else {
+		extrapolateToInterval += averageDurationBetweenSamples / 2
+	}
+	if durationToEnd < extrapolationThreshold {
+		extrapolateToInterval += durationToEnd
+	} else {
+		extrapolateToInterval += averageDurationBetweenSamples / 2
+	}
+	resultValue = resultValue * (extrapolateToInterval / sampledInterval)
+	if isRate {
+		seconds := selRange.Seconds()
+		resultValue = resultValue / seconds
+	}
+
+	return resultValue
+}
+
+func durationMilliseconds(d time.Duration) int64 {
+	return int64(d / (time.Millisecond / time.Nanosecond))
+}
+
 // rateLogBytes calculates the per-second rate of log bytes.
 func rateLogBytes(selRange time.Duration) func(samples []promql.Point) float64 {
 	return func(samples []promql.Point) float64 {
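One subtlety worth calling out in `extrapolatedRate` above is the counter-reset handling: whenever a sample is smaller than its predecessor, the value observed just before the reset is added back so the computed increase survives the restart. A minimal standalone sketch of just that loop (hypothetical function name, plain `float64` slices instead of `promql.Point`):

```go
package main

import "fmt"

// counterIncrease mirrors the isCounter loop in extrapolatedRate:
// it computes last-minus-first, then compensates for counter resets
// by adding back the value observed just before each reset.
func counterIncrease(values []float64) float64 {
	if len(values) < 2 {
		return 0
	}
	result := values[len(values)-1] - values[0]
	var last float64
	for _, v := range values {
		if v < last { // counter reset detected
			result += last
		}
		last = v
	}
	return result
}

func main() {
	// 5 -> 10 (+5), reset, restart at 2 (+2), 2 -> 7 (+5): total increase 12.
	fmt.Println(counterIncrease([]float64{5, 10, 2, 7})) // 12
	// Without a reset it is simply last - first.
	fmt.Println(counterIncrease([]float64{3, 4, 9})) // 6
}
```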

pkg/logql/syntax/ast.go

Lines changed: 18 additions & 15 deletions

@@ -616,20 +616,21 @@
 	OpTypeTopK = "topk"

 	// range vector ops
-	OpRangeTypeCount     = "count_over_time"
-	OpRangeTypeRate      = "rate"
-	OpRangeTypeBytes     = "bytes_over_time"
-	OpRangeTypeBytesRate = "bytes_rate"
-	OpRangeTypeAvg       = "avg_over_time"
-	OpRangeTypeSum       = "sum_over_time"
-	OpRangeTypeMin       = "min_over_time"
-	OpRangeTypeMax       = "max_over_time"
-	OpRangeTypeStdvar    = "stdvar_over_time"
-	OpRangeTypeStddev    = "stddev_over_time"
-	OpRangeTypeQuantile  = "quantile_over_time"
-	OpRangeTypeFirst     = "first_over_time"
-	OpRangeTypeLast      = "last_over_time"
-	OpRangeTypeAbsent    = "absent_over_time"
+	OpRangeTypeCount       = "count_over_time"
+	OpRangeTypeRate        = "rate"
+	OpRangeTypeRateCounter = "rate_counter"
+	OpRangeTypeBytes       = "bytes_over_time"
+	OpRangeTypeBytesRate   = "bytes_rate"
+	OpRangeTypeAvg         = "avg_over_time"
+	OpRangeTypeSum         = "sum_over_time"
+	OpRangeTypeMin         = "min_over_time"
+	OpRangeTypeMax         = "max_over_time"
+	OpRangeTypeStdvar      = "stdvar_over_time"
+	OpRangeTypeStddev      = "stddev_over_time"
+	OpRangeTypeQuantile    = "quantile_over_time"
+	OpRangeTypeFirst       = "first_over_time"
+	OpRangeTypeLast        = "last_over_time"
+	OpRangeTypeAbsent      = "absent_over_time"

 	// binops - logical/set
 	OpTypeOr = "or"
@@ -778,7 +779,9 @@ func (e RangeAggregationExpr) validate() error {
 	}
 	if e.Left.Unwrap != nil {
 		switch e.Operation {
-		case OpRangeTypeAvg, OpRangeTypeSum, OpRangeTypeMax, OpRangeTypeMin, OpRangeTypeStddev, OpRangeTypeStdvar, OpRangeTypeQuantile, OpRangeTypeRate, OpRangeTypeAbsent, OpRangeTypeFirst, OpRangeTypeLast:
+		case OpRangeTypeAvg, OpRangeTypeSum, OpRangeTypeMax, OpRangeTypeMin, OpRangeTypeStddev,
+			OpRangeTypeStdvar, OpRangeTypeQuantile, OpRangeTypeRate, OpRangeTypeRateCounter,
+			OpRangeTypeAbsent, OpRangeTypeFirst, OpRangeTypeLast:
 			return nil
 		default:
 			return fmt.Errorf("invalid aggregation %s with unwrap", e.Operation)
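The `validate()` change above means `rate_counter` joins the set of aggregations accepted over an unwrapped range. A toy standalone sketch of that rule (hypothetical names and a map instead of the actual AST switch):

```go
package main

import (
	"errors"
	"fmt"
)

// allowedWithUnwrap mirrors the case list in the validate() switch above,
// using the string values of the Op constants.
var allowedWithUnwrap = map[string]bool{
	"avg_over_time": true, "sum_over_time": true, "max_over_time": true,
	"min_over_time": true, "stddev_over_time": true, "stdvar_over_time": true,
	"quantile_over_time": true, "rate": true, "rate_counter": true,
	"absent_over_time": true, "first_over_time": true, "last_over_time": true,
}

// validateUnwrapOp returns an error for operations that may not be
// combined with an unwrap expression, e.g. count_over_time.
func validateUnwrapOp(op string) error {
	if !allowedWithUnwrap[op] {
		return errors.New("invalid aggregation " + op + " with unwrap")
	}
	return nil
}

func main() {
	fmt.Println(validateUnwrapOp("rate_counter"))    // <nil>
	fmt.Println(validateUnwrapOp("count_over_time")) // invalid aggregation count_over_time with unwrap
}
```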

pkg/logql/syntax/expr.y

Lines changed: 2 additions & 1 deletion

@@ -107,7 +107,7 @@ import (
 %token <str> IDENTIFIER STRING NUMBER
 %token <duration> DURATION RANGE
 %token <val> MATCHERS LABELS EQ RE NRE OPEN_BRACE CLOSE_BRACE OPEN_BRACKET CLOSE_BRACKET COMMA DOT PIPE_MATCH PIPE_EXACT
-  OPEN_PARENTHESIS CLOSE_PARENTHESIS BY WITHOUT COUNT_OVER_TIME RATE SUM AVG MAX MIN COUNT STDDEV STDVAR BOTTOMK TOPK
+  OPEN_PARENTHESIS CLOSE_PARENTHESIS BY WITHOUT COUNT_OVER_TIME RATE RATE_COUNTER SUM AVG MAX MIN COUNT STDDEV STDVAR BOTTOMK TOPK
   BYTES_OVER_TIME BYTES_RATE BOOL JSON REGEXP LOGFMT PIPE LINE_FMT LABEL_FMT UNWRAP AVG_OVER_TIME SUM_OVER_TIME MIN_OVER_TIME
   MAX_OVER_TIME STDVAR_OVER_TIME STDDEV_OVER_TIME QUANTILE_OVER_TIME BYTES_CONV DURATION_CONV DURATION_SECONDS_CONV
   FIRST_OVER_TIME LAST_OVER_TIME ABSENT_OVER_TIME LABEL_REPLACE UNPACK OFFSET PATTERN IP ON IGNORING GROUP_LEFT GROUP_RIGHT
@@ -457,6 +457,7 @@ vectorOp:
 rangeOp:
       COUNT_OVER_TIME { $$ = OpRangeTypeCount }
     | RATE            { $$ = OpRangeTypeRate }
+    | RATE_COUNTER    { $$ = OpRangeTypeRateCounter }
     | BYTES_OVER_TIME { $$ = OpRangeTypeBytes }
     | BYTES_RATE      { $$ = OpRangeTypeBytesRate }
     | AVG_OVER_TIME   { $$ = OpRangeTypeAvg }

0 commit comments