Skip to content

Commit 3fa93a1

Browse files
committed
only report expensive queries
1 parent aa63600 commit 3fa93a1

File tree

4 files changed

+55
-22
lines changed

4 files changed

+55
-22
lines changed

cmd/thanos/query_frontend.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ func registerQueryFrontend(app *extkingpin.App) {
147147
cmd.Flag("query-frontend.log-queries-longer-than", "Log queries that are slower than the specified duration. "+
148148
"Set to 0 to disable. Set to < 0 to enable on all queries.").Default("0").DurationVar(&cfg.CortexHandlerConfig.LogQueriesLongerThan)
149149

150-
cmd.Flag("query-frontend.query-stats-enabled", "Enable query stats logging and metrics").Default("true").BoolVar(&cfg.CortexHandlerConfig.QueryStatsEnabled)
150+
cmd.Flag("query-frontend.log-queries-more-expensive-than", "Log queries that fetch more data than the specified bytes. "+
151+
"Set to 0 to disable. Otherwise must be a positive value.").Default("0").Uint64Var(&cfg.CortexHandlerConfig.LogQueriesMoreExpensiveThan)
151152

152153
cmd.Flag("query-frontend.log-failed-queries", "Log failed queries due to any reason").Default("true").BoolVar(&cfg.CortexHandlerConfig.LogFailedQueries)
153154

internal/cortex/frontend/transport/handler.go

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,13 @@ var (
4444

4545
// HandlerConfig Config for a Handler.
4646
type HandlerConfig struct {
47-
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
48-
MaxBodySize int64 `yaml:"max_body_size"`
49-
QueryStatsEnabled bool `yaml:"query_stats_enabled"`
50-
LogFailedQueries bool `yaml:"log_failed_queries"`
51-
FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
52-
SlowQueryLogsUserHeader string `yaml:"slow_query_logs_user_header"`
47+
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
48+
MaxBodySize int64 `yaml:"max_body_size"`
49+
QueryStatsEnabled bool `yaml:"query_stats_enabled"`
50+
LogFailedQueries bool `yaml:"log_failed_queries"`
51+
FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
52+
SlowQueryLogsUserHeader string `yaml:"slow_query_logs_user_header"`
53+
LogQueriesMoreExpensiveThan uint64 `yaml:"log_queries_more_expensive_than"`
5354
}
5455

5556
// Handler accepts queries and forwards them to RoundTripper. It can log slow queries,
@@ -61,12 +62,13 @@ type Handler struct {
6162
failedQueryCache *utils.FailedQueryCache
6263

6364
// Metrics.
64-
querySeconds *prometheus.CounterVec
65-
querySeries *prometheus.CounterVec
66-
queryBytes *prometheus.CounterVec
67-
activeUsers *util.ActiveUsersCleanupService
68-
slowQueryCount prometheus.Counter
69-
failedQueryCount prometheus.Counter
65+
querySeconds *prometheus.CounterVec
66+
querySeries *prometheus.CounterVec
67+
queryBytes *prometheus.CounterVec
68+
activeUsers *util.ActiveUsersCleanupService
69+
slowQueryCount prometheus.Counter
70+
failedQueryCount prometheus.Counter
71+
expensiveQueryCount prometheus.Counter
7072
}
7173

7274
// NewHandler creates a new frontend handler.
@@ -120,6 +122,10 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
120122
Name: "cortex_failed_query_total",
121123
Help: "Total number of failed queries detected.",
122124
})
125+
h.expensiveQueryCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
126+
Name: "cortex_expensive_query_total",
127+
Help: "Total number of expensive queries detected.",
128+
})
123129
return h
124130
}
125131

@@ -205,16 +211,19 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
205211

206212
// Check whether we should parse the query string.
207213
shouldReportSlowQuery := f.cfg.LogQueriesLongerThan != 0 && queryResponseTime > f.cfg.LogQueriesLongerThan
208-
if shouldReportSlowQuery || f.cfg.QueryStatsEnabled {
214+
queryBytesFetched := queryrange.GetQueryBytesFetchedFromHeader(resp.Header)
215+
shouldReportExpensiveQuery := f.cfg.LogQueriesMoreExpensiveThan != 0 && queryBytesFetched > f.cfg.LogQueriesMoreExpensiveThan
216+
if shouldReportSlowQuery || shouldReportExpensiveQuery || f.cfg.QueryStatsEnabled {
209217
queryString = f.parseRequestQueryString(r, buf)
210218
}
211219

212220
if shouldReportSlowQuery {
213221
f.reportSlowQuery(r, hs, queryString, queryResponseTime)
214222
}
223+
if shouldReportExpensiveQuery {
224+
f.reportExpensiveQuery(r, queryString, queryBytesFetched, queryResponseTime)
225+
}
215226
if f.cfg.QueryStatsEnabled {
216-
stats.FetchedChunkBytes = queryrange.GetQueryBytesFetchedFromHeader(resp.Header)
217-
stats.FetchedSeriesCount = queryrange.GetQuerySeriesFetchedFromHeader(resp.Header)
218227
f.reportQueryStats(r, queryString, queryResponseTime, stats)
219228
}
220229
}
@@ -248,6 +257,35 @@ func (f *Handler) reportFailedQuery(r *http.Request, queryString url.Values, err
248257
level.Error(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
249258
}
250259

260+
func (f *Handler) reportExpensiveQuery(r *http.Request, queryString url.Values, queryBytesFetched uint64, queryResponseTime time.Duration) {
261+
f.expensiveQueryCount.Inc()
262+
// NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
263+
grafanaDashboardUID := "-"
264+
if dashboardUID := r.Header.Get("X-Dashboard-Uid"); dashboardUID != "" {
265+
grafanaDashboardUID = dashboardUID
266+
}
267+
grafanaPanelID := "-"
268+
if panelID := r.Header.Get("X-Panel-Id"); panelID != "" {
269+
grafanaPanelID = panelID
270+
}
271+
remoteUser, _, _ := r.BasicAuth()
272+
273+
logMessage := append([]interface{}{
274+
"msg", "expensive query",
275+
"method", r.Method,
276+
"host", r.Host,
277+
"path", r.URL.Path,
278+
"remote_user", remoteUser,
279+
"remote_addr", r.RemoteAddr,
280+
"query_megabytes_fetched", queryBytesFetched / (1024 * 1024),
281+
"grafana_dashboard_uid", grafanaDashboardUID,
282+
"grafana_panel_id", grafanaPanelID,
283+
"query_response_time", queryResponseTime.String(),
284+
}, formatQueryString(queryString)...)
285+
286+
level.Error(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
287+
}
288+
251289
// reportSlowQuery reports slow queries.
252290
func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryResponseTime time.Duration) {
253291
f.slowQueryCount.Inc()

internal/cortex/querier/queryrange/query_bytes_fetched.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
// QueryBytesFetchedHeaderName is the http header name of number of bytes fetched by a query from m3readcoord.
1212
// This name is compatible with M3 and rule manager code
1313
const QueryBytesFetchedHeaderName = "M3-Fetched-Bytes-Estimate"
14-
const QuerySeriesFetchedHeaderName = "M3-Fetched-Series-Estimate"
1514

1615
func sumQueryBytesFetched(responses ...Response) uint64 {
1716
var result uint64
@@ -72,7 +71,3 @@ func getHeaderValue(hdr http.Header, key string) uint64 {
7271
func GetQueryBytesFetchedFromHeader(hdr http.Header) uint64 {
7372
return getHeaderValue(hdr, QueryBytesFetchedHeaderName)
7473
}
75-
76-
func GetQuerySeriesFetchedFromHeader(hdr http.Header) uint64 {
77-
return getHeaderValue(hdr, QuerySeriesFetchedHeaderName)
78-
}

internal/cortex/querier/queryrange/query_range.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,6 @@ func (prometheusCodec) EncodeResponse(ctx context.Context, res Response) (*http.
483483
} else if res.(*PrometheusResponse).Data.SeriesStatsCounter != nil {
484484
// Pantheon code path
485485
httpHeader[QueryBytesFetchedHeaderName] = []string{strconv.FormatInt(res.(*PrometheusResponse).Data.SeriesStatsCounter.Bytes, 10)}
486-
httpHeader[QuerySeriesFetchedHeaderName] = []string{strconv.FormatInt(res.(*PrometheusResponse).Data.SeriesStatsCounter.Series, 10)}
487486
}
488487
resp := http.Response{
489488
Header: httpHeader,

0 commit comments

Comments (0)