Skip to content

Commit 24edba0

Browse files
authored
Setting ruler.evaluation-delay-duration to be deprecated. (#6149)
1 parent 22245aa commit 24edba0

File tree

10 files changed

+50
-360
lines changed

10 files changed

+50
-360
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* [CHANGE] Querier: Deprecate and enable by default `querier.ingester-metadata-streaming` flag. #6147
1010
* [CHANGE] QueryFrontend/QueryScheduler: Deprecate `-querier.max-outstanding-requests-per-tenant` and `-query-scheduler.max-outstanding-requests-per-tenant` flags. Use frontend.max-outstanding-requests-per-tenant instead. #6146
1111
* [CHANGE] Ingesters: Enable 'snappy-block' compression on ingester clients by default. #6148
12+
* [CHANGE] Ruler: Schedule `ruler.evaluation-delay-duration` for deprecation. Until it is removed, the ruler uses the highest value between `ruler.evaluation-delay-duration` and `ruler.query-offset`. #6149
1213
* [FEATURE] Ingester/Distributor: Experimental: Enable native histogram ingestion via `-blocks-storage.tsdb.enable-native-histograms` flag. #5986 #6010 #6020
1314
* [FEATURE] Querier: Enable querying native histogram chunks. #5944 #6031
1415
* [FEATURE] Query Frontend: Support native histogram in query frontend response. #5996 #6043

docs/configuration/config-file-reference.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3333,6 +3333,7 @@ query_rejection:
33333333
# them.
33343334
[query_attributes: <list of QueryAttribute> | default = []]
33353335

3336+
# Deprecated (use ruler.query-offset instead) and will be removed in v1.19.0:
33363337
# Duration to delay the evaluation of rules to ensure the underlying metrics
33373338
# have been pushed to Cortex.
33383339
# CLI flag: -ruler.evaluation-delay-duration

integration/configs.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,6 @@ receivers:
6363
labels: {}
6464
annotations: {}
6565
`
66-
67-
cortexRulerEvalStaleNanConfigYaml = `groups:
68-
- name: rule
69-
interval: 1s
70-
rules:
71-
- record: stale_nan_eval
72-
expr: a_sometimes_stale_nan_series * 2
73-
`
7466
)
7567

7668
var (

integration/ruler_test.go

Lines changed: 0 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"crypto/x509/pkix"
1111
"encoding/json"
1212
"fmt"
13-
"math"
1413
"math/rand"
1514
"net/http"
1615
"os"
@@ -24,7 +23,6 @@ import (
2423
"github.com/prometheus/common/model"
2524
"github.com/prometheus/prometheus/model/labels"
2625
"github.com/prometheus/prometheus/model/rulefmt"
27-
"github.com/prometheus/prometheus/model/value"
2826
"github.com/prometheus/prometheus/prompb"
2927
"github.com/stretchr/testify/assert"
3028
"github.com/stretchr/testify/require"
@@ -200,119 +198,6 @@ func TestRulerAPISingleBinary(t *testing.T) {
200198
require.NoError(t, cortexRestarted.WaitSumMetrics(e2e.Equals(1), "cortex_ruler_managers_total"))
201199
}
202200

203-
func TestRulerEvaluationDelay(t *testing.T) {
204-
s, err := e2e.NewScenario(networkName)
205-
require.NoError(t, err)
206-
defer s.Close()
207-
208-
namespace := "ns"
209-
user := "fake"
210-
211-
evaluationDelay := time.Minute * 5
212-
213-
configOverrides := map[string]string{
214-
"-ruler-storage.local.directory": filepath.Join(e2e.ContainerSharedDir, "ruler_configs"),
215-
"-ruler.poll-interval": "2s",
216-
"-ruler.rule-path": filepath.Join(e2e.ContainerSharedDir, "rule_tmp/"),
217-
"-ruler.evaluation-delay-duration": evaluationDelay.String(),
218-
}
219-
220-
// Start Cortex components.
221-
require.NoError(t, copyFileToSharedDir(s, "docs/configuration/single-process-config-blocks-local.yaml", cortexConfigFile))
222-
require.NoError(t, writeFileToSharedDir(s, filepath.Join("ruler_configs", user, namespace), []byte(cortexRulerEvalStaleNanConfigYaml)))
223-
cortex := e2ecortex.NewSingleBinaryWithConfigFile("cortex", cortexConfigFile, configOverrides, "", 9009, 9095)
224-
require.NoError(t, s.StartAndWaitReady(cortex))
225-
226-
// Create a client with the ruler address configured
227-
c, err := e2ecortex.NewClient(cortex.HTTPEndpoint(), cortex.HTTPEndpoint(), "", cortex.HTTPEndpoint(), "")
228-
require.NoError(t, err)
229-
230-
now := time.Now()
231-
232-
// Generate series that includes stale nans
233-
samplesToSend := 10
234-
series := prompb.TimeSeries{
235-
Labels: []prompb.Label{
236-
{Name: "__name__", Value: "a_sometimes_stale_nan_series"},
237-
{Name: "instance", Value: "sometimes-stale"},
238-
},
239-
}
240-
series.Samples = make([]prompb.Sample, samplesToSend)
241-
posStale := 2
242-
243-
// Create samples, that are delayed by the evaluation delay with increasing values.
244-
for pos := range series.Samples {
245-
series.Samples[pos].Timestamp = e2e.TimeToMilliseconds(now.Add(-evaluationDelay).Add(time.Duration(pos) * time.Second))
246-
series.Samples[pos].Value = float64(pos + 1)
247-
248-
// insert staleness marker at the positions marked by posStale
249-
if pos == posStale {
250-
series.Samples[pos].Value = math.Float64frombits(value.StaleNaN)
251-
}
252-
}
253-
254-
// Insert metrics
255-
res, err := c.Push([]prompb.TimeSeries{series})
256-
require.NoError(t, err)
257-
require.Equal(t, 200, res.StatusCode)
258-
259-
// Get number of rule evaluations just after push
260-
ruleEvaluationsAfterPush, err := cortex.SumMetrics([]string{"cortex_prometheus_rule_evaluations_total"})
261-
require.NoError(t, err)
262-
263-
// Wait until the rule is evaluated for the first time
264-
require.NoError(t, cortex.WaitSumMetrics(e2e.Greater(ruleEvaluationsAfterPush[0]), "cortex_prometheus_rule_evaluations_total"))
265-
266-
// Query the timestamp of the latest result to ensure the evaluation is delayed
267-
result, err := c.Query("timestamp(stale_nan_eval)", now)
268-
require.NoError(t, err)
269-
require.Equal(t, model.ValVector, result.Type())
270-
271-
vector := result.(model.Vector)
272-
require.Equal(t, 1, vector.Len(), "expect one sample returned")
273-
274-
// 290 seconds gives 10 seconds of slack between the rule evaluation and the query
275-
// to account for CI latency, but ensures the latest evaluation was in the past.
276-
var maxDiff int64 = 290_000
277-
require.GreaterOrEqual(t, e2e.TimeToMilliseconds(time.Now())-int64(vector[0].Value)*1000, maxDiff)
278-
279-
// Wait until all the pushed samples have been evaluated by the rule. This
280-
// ensures that rule results are successfully written even after a
281-
// staleness period.
282-
require.NoError(t, cortex.WaitSumMetrics(e2e.GreaterOrEqual(ruleEvaluationsAfterPush[0]+float64(samplesToSend)), "cortex_prometheus_rule_evaluations_total"))
283-
284-
// query all results to verify rules have been evaluated correctly
285-
result, err = c.QueryRange("stale_nan_eval", now.Add(-evaluationDelay), now, time.Second)
286-
require.NoError(t, err)
287-
require.Equal(t, model.ValMatrix, result.Type())
288-
289-
matrix := result.(model.Matrix)
290-
require.GreaterOrEqual(t, 1, matrix.Len(), "expect at least a series returned")
291-
292-
// Iterate through the values recorded and ensure they exist as expected.
293-
inputPos := 0
294-
for _, m := range matrix {
295-
for _, v := range m.Values {
296-
// Skip values for stale positions
297-
if inputPos == posStale {
298-
inputPos++
299-
}
300-
301-
expectedValue := model.SampleValue(2 * (inputPos + 1))
302-
require.Equal(t, expectedValue, v.Value)
303-
304-
// Look for next value
305-
inputPos++
306-
307-
// We have found all input values
308-
if inputPos >= len(series.Samples) {
309-
break
310-
}
311-
}
312-
}
313-
require.Equal(t, len(series.Samples), inputPos, "expect to have returned all evaluations")
314-
}
315-
316201
func TestRulerSharding(t *testing.T) {
317202
const numRulesGroups = 100
318203

pkg/ruler/compat.go

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"github.com/prometheus/prometheus/model/histogram"
1414
"github.com/prometheus/prometheus/model/labels"
1515
"github.com/prometheus/prometheus/model/metadata"
16-
"github.com/prometheus/prometheus/model/value"
1716
"github.com/prometheus/prometheus/notifier"
1817
"github.com/prometheus/prometheus/promql"
1918
"github.com/prometheus/prometheus/promql/parser"
@@ -46,27 +45,15 @@ type PusherAppender struct {
4645
histogramLabels []labels.Labels
4746
histograms []cortexpb.Histogram
4847
userID string
49-
evaluationDelay time.Duration
5048
}
5149

5250
func (a *PusherAppender) AppendHistogram(_ storage.SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
5351
if h == nil && fh == nil {
5452
return 0, errors.New("no histogram")
5553
}
56-
5754
if h != nil {
58-
// A histogram sample is considered stale if its sum is set to NaN.
59-
// https://github.com/prometheus/prometheus/blob/b6ef745016fa9472fdd0ae20f75a9682e01d1e5c/tsdb/head_append.go#L339-L346
60-
if a.evaluationDelay > 0 && (value.IsStaleNaN(h.Sum)) {
61-
t -= a.evaluationDelay.Milliseconds()
62-
}
6355
a.histograms = append(a.histograms, cortexpb.HistogramToHistogramProto(t, h))
6456
} else {
65-
// A histogram sample is considered stale if its sum is set to NaN.
66-
// https://github.com/prometheus/prometheus/blob/b6ef745016fa9472fdd0ae20f75a9682e01d1e5c/tsdb/head_append.go#L339-L346
67-
if a.evaluationDelay > 0 && (value.IsStaleNaN(fh.Sum)) {
68-
t -= a.evaluationDelay.Milliseconds()
69-
}
7057
a.histograms = append(a.histograms, cortexpb.FloatHistogramToHistogramProto(t, fh))
7158
}
7259
a.histogramLabels = append(a.histogramLabels, l)
@@ -75,19 +62,6 @@ func (a *PusherAppender) AppendHistogram(_ storage.SeriesRef, l labels.Labels, t
7562

7663
func (a *PusherAppender) Append(_ storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
7764
a.labels = append(a.labels, l)
78-
79-
// Adapt staleness markers for ruler evaluation delay. As the upstream code
80-
// is using the actual time, when there is a no longer available series.
81-
// This then causes 'out of order' append failures once the series is
82-
// becoming available again.
83-
// see https://github.com/prometheus/prometheus/blob/6c56a1faaaad07317ff585bda75b99bdba0517ad/rules/manager.go#L647-L660
84-
// Similar to staleness markers, the rule manager also appends actual time to the ALERTS and ALERTS_FOR_STATE series.
85-
// See: https://github.com/prometheus/prometheus/blob/ae086c73cb4d6db9e8b67d5038d3704fea6aec4a/rules/alerting.go#L414-L417
86-
metricName := l.Get(labels.MetricName)
87-
if a.evaluationDelay > 0 && (value.IsStaleNaN(v) || metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE") {
88-
t -= a.evaluationDelay.Milliseconds()
89-
}
90-
9165
a.samples = append(a.samples, cortexpb.Sample{
9266
TimestampMs: t,
9367
Value: v,
@@ -164,16 +138,14 @@ func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender {
164138
failedWrites: t.failedWrites,
165139
totalWrites: t.totalWrites,
166140

167-
ctx: ctx,
168-
pusher: t.pusher,
169-
userID: t.userID,
170-
evaluationDelay: t.rulesLimits.EvaluationDelay(t.userID),
141+
ctx: ctx,
142+
pusher: t.pusher,
143+
userID: t.userID,
171144
}
172145
}
173146

174147
// RulesLimits defines limits used by Ruler.
175148
type RulesLimits interface {
176-
EvaluationDelay(userID string) time.Duration
177149
MaxQueryLength(userID string) time.Duration
178150
RulerTenantShardSize(userID string) int
179151
RulerMaxRuleGroupsPerTenant(userID string) int
@@ -182,7 +154,7 @@ type RulesLimits interface {
182154
DisabledRuleGroups(userID string) validation.DisabledRuleGroups
183155
}
184156

185-
// EngineQueryFunc returns a new engine query function by passing an altered timestamp.
157+
// EngineQueryFunc returns a new engine query function that validates the max query length.
186158
// Modified from Prometheus rules.EngineQueryFunc
187159
// https://github.com/prometheus/prometheus/blob/v2.39.1/rules/manager.go#L189.
188160
func EngineQueryFunc(engine promql.QueryEngine, q storage.Queryable, overrides RulesLimits, userID string, lookbackDelta time.Duration) rules.QueryFunc {
@@ -202,8 +174,7 @@ func EngineQueryFunc(engine promql.QueryEngine, q storage.Queryable, overrides R
202174
}
203175
}
204176

205-
evaluationDelay := overrides.EvaluationDelay(userID)
206-
q, err := engine.NewInstantQuery(ctx, q, nil, qs, t.Add(-evaluationDelay))
177+
q, err := engine.NewInstantQuery(ctx, q, nil, qs, t)
207178
if err != nil {
208179
return nil, err
209180
}

0 commit comments

Comments
 (0)