Skip to content

Commit 2f4e1fd

Browse files
authored
Add query metrics to ruler query stats (#6173)
1 parent 36b59b4 commit 2f4e1fd

File tree

7 files changed

+82
-24
lines changed

7 files changed

+82
-24
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## master / unreleased
44

55
* [ENHANCEMENT] Ruler: Add new ruler metric `cortex_ruler_rule_groups_in_store` that is the total rule groups per tenant in store, which can be used to compare with `cortex_prometheus_rule_group_rules` to count the number of rule groups that are not loaded by a ruler. #5869
6+
* [ENHANCEMENT] Ruler: Add query statistics metrics when `-ruler.query-stats-enabled=true`. #6173
67

78
## 1.18.0 in progress
89

docs/configuration/config-file-reference.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4366,8 +4366,8 @@ ring:
43664366
# CLI flag: -ruler.disabled-tenants
43674367
[disabled_tenants: <string> | default = ""]
43684368
4369-
# Report the wall time for ruler queries to complete as a per user metric and as
4370-
# an info level log message.
4369+
# Report query statistics for ruler queries to complete as a per user metric and
4370+
# as an info level log message.
43714371
# CLI flag: -ruler.query-stats-enabled
43724372
[query_stats_enabled: <boolean> | default = false]
43734373

pkg/ruler/compat.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -229,19 +229,25 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
229229
}
230230
}
231231

232-
func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
233-
if queryTime == nil {
234-
return qf
235-
}
232+
func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, userID string, evalMetrics *RuleEvalMetrics, logger log.Logger) rules.QueryFunc {
233+
queryTime := evalMetrics.RulerQuerySeconds.WithLabelValues(userID)
234+
querySeries := evalMetrics.RulerQuerySeries.WithLabelValues(userID)
235+
querySample := evalMetrics.RulerQuerySamples.WithLabelValues(userID)
236+
queryChunkBytes := evalMetrics.RulerQueryChunkBytes.WithLabelValues(userID)
237+
queryDataBytes := evalMetrics.RulerQueryDataBytes.WithLabelValues(userID)
236238

237239
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
238240
queryStats, ctx := stats.ContextWithEmptyStats(ctx)
239241
// Record the wall time spent executing this request.
240242
timer := prometheus.NewTimer(nil)
243+
241244
defer func() {
242245
querySeconds := timer.ObserveDuration().Seconds()
243246
queryTime.Add(querySeconds)
244-
247+
querySeries.Add(float64(queryStats.FetchedSeriesCount))
248+
querySample.Add(float64(queryStats.FetchedSamplesCount))
249+
queryChunkBytes.Add(float64(queryStats.FetchedChunkBytes))
250+
queryDataBytes.Add(float64(queryStats.FetchedDataBytes))
245251
// Log ruler query stats.
246252
logMessage := []interface{}{
247253
"msg", "query stats",
@@ -303,23 +309,24 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
303309
q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors)
304310

305311
return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
306-
var queryTime prometheus.Counter
307-
if evalMetrics.RulerQuerySeconds != nil {
308-
queryTime = evalMetrics.RulerQuerySeconds.WithLabelValues(userID)
309-
}
310-
311312
failedQueries := evalMetrics.FailedQueriesVec.WithLabelValues(userID)
312313
totalQueries := evalMetrics.TotalQueriesVec.WithLabelValues(userID)
313314
totalWrites := evalMetrics.TotalWritesVec.WithLabelValues(userID)
314315
failedWrites := evalMetrics.FailedWritesVec.WithLabelValues(userID)
315316

317+
var queryFunc rules.QueryFunc
316318
engineQueryFunc := EngineQueryFunc(engine, q, overrides, userID, cfg.LookbackDelta)
317319
metricsQueryFunc := MetricsQueryFunc(engineQueryFunc, totalQueries, failedQueries)
320+
if cfg.EnableQueryStats {
321+
queryFunc = RecordAndReportRuleQueryMetrics(metricsQueryFunc, userID, evalMetrics, logger)
322+
} else {
323+
queryFunc = metricsQueryFunc
324+
}
318325

319326
return rules.NewManager(&rules.ManagerOptions{
320327
Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
321328
Queryable: q,
322-
QueryFunc: RecordAndReportRuleQueryMetrics(metricsQueryFunc, queryTime, logger),
329+
QueryFunc: queryFunc,
323330
Context: user.InjectOrgID(ctx, userID),
324331
ExternalURL: cfg.ExternalURL.URL,
325332
NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()),

pkg/ruler/compat_test.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/weaveworks/common/httpgrpc"
2424

2525
"github.com/cortexproject/cortex/pkg/cortexpb"
26+
"github.com/cortexproject/cortex/pkg/querier/stats"
2627
"github.com/cortexproject/cortex/pkg/util/validation"
2728
)
2829

@@ -392,14 +393,23 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
392393
}
393394

394395
func TestRecordAndReportRuleQueryMetrics(t *testing.T) {
395-
queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"})
396+
metrics := NewRuleEvalMetrics(Config{EnableQueryStats: true}, prometheus.DefaultRegisterer)
396397

397398
mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
399+
queryStats := stats.FromContext(ctx)
400+
queryStats.AddFetchedSeries(2)
401+
queryStats.AddFetchedSamples(2)
402+
queryStats.AddFetchedChunkBytes(10)
403+
queryStats.AddFetchedDataBytes(14)
398404
time.Sleep(1 * time.Second)
399405
return promql.Vector{}, nil
400406
}
401-
qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger())
407+
qf := RecordAndReportRuleQueryMetrics(mockFunc, "userID", metrics, log.NewNopLogger())
402408
_, _ = qf(context.Background(), "test", time.Now())
403409

404-
require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1))
410+
require.GreaterOrEqual(t, testutil.ToFloat64(metrics.RulerQuerySeconds.WithLabelValues("userID")), float64(1))
411+
require.Equal(t, testutil.ToFloat64(metrics.RulerQuerySeries.WithLabelValues("userID")), float64(2))
412+
require.Equal(t, testutil.ToFloat64(metrics.RulerQuerySamples.WithLabelValues("userID")), float64(2))
413+
require.Equal(t, testutil.ToFloat64(metrics.RulerQueryChunkBytes.WithLabelValues("userID")), float64(10))
414+
require.Equal(t, testutil.ToFloat64(metrics.RulerQueryDataBytes.WithLabelValues("userID")), float64(14))
405415
}

pkg/ruler/manager_metrics.go

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,15 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
225225
}
226226

227227
type RuleEvalMetrics struct {
228-
TotalWritesVec *prometheus.CounterVec
229-
FailedWritesVec *prometheus.CounterVec
230-
TotalQueriesVec *prometheus.CounterVec
231-
FailedQueriesVec *prometheus.CounterVec
232-
RulerQuerySeconds *prometheus.CounterVec
228+
TotalWritesVec *prometheus.CounterVec
229+
FailedWritesVec *prometheus.CounterVec
230+
TotalQueriesVec *prometheus.CounterVec
231+
FailedQueriesVec *prometheus.CounterVec
232+
RulerQuerySeconds *prometheus.CounterVec
233+
RulerQuerySeries *prometheus.CounterVec
234+
RulerQuerySamples *prometheus.CounterVec
235+
RulerQueryChunkBytes *prometheus.CounterVec
236+
RulerQueryDataBytes *prometheus.CounterVec
233237
}
234238

235239
func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics {
@@ -256,6 +260,22 @@ func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics
256260
Name: "cortex_ruler_query_seconds_total",
257261
Help: "Total amount of wall clock time spent processing queries by the ruler.",
258262
}, []string{"user"})
263+
m.RulerQuerySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
264+
Name: "cortex_ruler_fetched_series_total",
265+
Help: "Number of series fetched to execute a query by the ruler.",
266+
}, []string{"user"})
267+
m.RulerQuerySamples = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
268+
Name: "cortex_ruler_samples_total",
269+
Help: "Number of samples fetched to execute a query by the ruler.",
270+
}, []string{"user"})
271+
m.RulerQueryChunkBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
272+
Name: "cortex_ruler_fetched_chunks_bytes_total",
273+
Help: "Size of all chunks fetched to execute a query in bytes by the ruler.",
274+
}, []string{"user"})
275+
m.RulerQueryDataBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
276+
Name: "cortex_ruler_fetched_data_bytes_total",
277+
Help: "Size of all data fetched to execute a query in bytes by the ruler.",
278+
}, []string{"user"})
259279
}
260280

261281
return m
@@ -270,6 +290,18 @@ func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) {
270290
if m.RulerQuerySeconds != nil {
271291
m.RulerQuerySeconds.DeleteLabelValues(userID)
272292
}
293+
if m.RulerQuerySeries != nil {
294+
m.RulerQuerySeries.DeleteLabelValues(userID)
295+
}
296+
if m.RulerQuerySamples != nil {
297+
m.RulerQuerySamples.DeleteLabelValues(userID)
298+
}
299+
if m.RulerQueryChunkBytes != nil {
300+
m.RulerQueryChunkBytes.DeleteLabelValues(userID)
301+
}
302+
if m.RulerQueryDataBytes != nil {
303+
m.RulerQueryDataBytes.DeleteLabelValues(userID)
304+
}
273305
}
274306

275307
type RuleGroupMetrics struct {

pkg/ruler/manager_metrics_test.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -574,8 +574,16 @@ func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) {
574574
m.FailedQueriesVec.WithLabelValues("fake2").Add(10)
575575
m.RulerQuerySeconds.WithLabelValues("fake1").Add(10)
576576
m.RulerQuerySeconds.WithLabelValues("fake2").Add(10)
577-
578-
metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total"}
577+
m.RulerQuerySeries.WithLabelValues("fake1").Add(10)
578+
m.RulerQuerySeries.WithLabelValues("fake2").Add(10)
579+
m.RulerQuerySamples.WithLabelValues("fake1").Add(10)
580+
m.RulerQuerySamples.WithLabelValues("fake2").Add(10)
581+
m.RulerQueryChunkBytes.WithLabelValues("fake1").Add(10)
582+
m.RulerQueryChunkBytes.WithLabelValues("fake2").Add(10)
583+
m.RulerQueryDataBytes.WithLabelValues("fake1").Add(10)
584+
m.RulerQueryDataBytes.WithLabelValues("fake2").Add(10)
585+
586+
metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total", "cortex_ruler_fetched_series_total", "cortex_ruler_samples_total", "cortex_ruler_fetched_chunks_bytes_total", "cortex_ruler_fetched_data_bytes_total"}
579587
gm, err := reg.Gather()
580588
require.NoError(t, err)
581589
mfm, err := util.NewMetricFamilyMap(gm)

pkg/ruler/ruler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
217217
f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.")
218218
f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")
219219

220-
f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.")
220+
f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report query statistics for ruler queries to complete as a per user metric and as an info level log message.")
221221
f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics")
222222

223223
cfg.RingCheckPeriod = 5 * time.Second

0 commit comments

Comments
 (0)