Skip to content

Commit 30e2550

Browse files
authored
Report expensive queries in query frontend (#78)
2 parents 23355a6 + 2d2e661 commit 30e2550

File tree

3 files changed

+79
-15
lines changed

3 files changed

+79
-15
lines changed

cmd/thanos/query_frontend.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,9 @@ func registerQueryFrontend(app *extkingpin.App) {
147147
cmd.Flag("query-frontend.log-queries-longer-than", "Log queries that are slower than the specified duration. "+
148148
"Set to 0 to disable. Set to < 0 to enable on all queries.").Default("0").DurationVar(&cfg.CortexHandlerConfig.LogQueriesLongerThan)
149149

150+
cmd.Flag("query-frontend.log-queries-more-expensive-than", "Log queries that fetch more data than the specified bytes. "+
151+
"Set to 0 to disable. Otherwise must be a positive value. Default is 1GB").Default("1073741824").Uint64Var(&cfg.CortexHandlerConfig.LogQueriesMoreExpensiveThan)
152+
150153
cmd.Flag("query-frontend.log-failed-queries", "Log failed queries due to any reason").Default("true").BoolVar(&cfg.CortexHandlerConfig.LogFailedQueries)
151154

152155
cmd.Flag("failed-query-cache-capacity", "Capacity of cache for failed queries. 0 means this feature is disabled.").

internal/cortex/frontend/transport/handler.go

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/prometheus/client_golang/prometheus"
2222
"github.com/prometheus/client_golang/prometheus/promauto"
2323
"github.com/thanos-io/thanos/internal/cortex/frontend/transport/utils"
24+
"github.com/thanos-io/thanos/internal/cortex/querier/queryrange"
2425
querier_stats "github.com/thanos-io/thanos/internal/cortex/querier/stats"
2526
"github.com/thanos-io/thanos/internal/cortex/tenant"
2627
"github.com/thanos-io/thanos/internal/cortex/util"
@@ -43,12 +44,13 @@ var (
4344

4445
// HandlerConfig Config for a Handler.
4546
type HandlerConfig struct {
46-
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
47-
MaxBodySize int64 `yaml:"max_body_size"`
48-
QueryStatsEnabled bool `yaml:"query_stats_enabled"`
49-
LogFailedQueries bool `yaml:"log_failed_queries"`
50-
FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
51-
SlowQueryLogsUserHeader string `yaml:"slow_query_logs_user_header"`
47+
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
48+
MaxBodySize int64 `yaml:"max_body_size"`
49+
QueryStatsEnabled bool `yaml:"query_stats_enabled"`
50+
LogFailedQueries bool `yaml:"log_failed_queries"`
51+
FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
52+
SlowQueryLogsUserHeader string `yaml:"slow_query_logs_user_header"`
53+
LogQueriesMoreExpensiveThan uint64 `yaml:"log_queries_more_expensive_than"`
5254
}
5355

5456
// Handler accepts queries and forwards them to RoundTripper. It can log slow queries,
@@ -60,12 +62,13 @@ type Handler struct {
6062
failedQueryCache *utils.FailedQueryCache
6163

6264
// Metrics.
63-
querySeconds *prometheus.CounterVec
64-
querySeries *prometheus.CounterVec
65-
queryBytes *prometheus.CounterVec
66-
activeUsers *util.ActiveUsersCleanupService
67-
slowQueryCount prometheus.Counter
68-
failedQueryCount prometheus.Counter
65+
querySeconds *prometheus.CounterVec
66+
querySeries *prometheus.CounterVec
67+
queryBytes *prometheus.CounterVec
68+
activeUsers *util.ActiveUsersCleanupService
69+
slowQueryCount prometheus.Counter
70+
failedQueryCount prometheus.Counter
71+
expensiveQueryCount prometheus.Counter
6972
}
7073

7174
// NewHandler creates a new frontend handler.
@@ -119,6 +122,10 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
119122
Name: "cortex_failed_query_total",
120123
Help: "Total number of failed queries detected.",
121124
})
125+
h.expensiveQueryCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
126+
Name: "cortex_expensive_query_total",
127+
Help: "Total number of expensive queries detected.",
128+
})
122129
return h
123130
}
124131

@@ -204,12 +211,17 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
204211

205212
// Check whether we should parse the query string.
206213
shouldReportSlowQuery := f.cfg.LogQueriesLongerThan != 0 && queryResponseTime > f.cfg.LogQueriesLongerThan
207-
if shouldReportSlowQuery || f.cfg.QueryStatsEnabled {
214+
queryBytesFetched := queryrange.GetQueryBytesFetchedFromHeader(resp.Header)
215+
shouldReportExpensiveQuery := f.cfg.LogQueriesMoreExpensiveThan != 0 && queryBytesFetched > f.cfg.LogQueriesMoreExpensiveThan
216+
if shouldReportSlowQuery || shouldReportExpensiveQuery || f.cfg.QueryStatsEnabled {
208217
queryString = f.parseRequestQueryString(r, buf)
209218
}
210219

211220
if shouldReportSlowQuery {
212-
f.reportSlowQuery(r, hs, queryString, queryResponseTime)
221+
f.reportSlowQuery(r, hs, queryString, queryBytesFetched, queryResponseTime)
222+
}
223+
if shouldReportExpensiveQuery {
224+
f.reportExpensiveQuery(r, queryString, queryBytesFetched, queryResponseTime)
213225
}
214226
if f.cfg.QueryStatsEnabled {
215227
f.reportQueryStats(r, queryString, queryResponseTime, stats)
@@ -245,8 +257,37 @@ func (f *Handler) reportFailedQuery(r *http.Request, queryString url.Values, err
245257
level.Error(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
246258
}
247259

260+
func (f *Handler) reportExpensiveQuery(r *http.Request, queryString url.Values, queryBytesFetched uint64, queryResponseTime time.Duration) {
261+
f.expensiveQueryCount.Inc()
262+
// NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
263+
grafanaDashboardUID := "-"
264+
if dashboardUID := r.Header.Get("X-Dashboard-Uid"); dashboardUID != "" {
265+
grafanaDashboardUID = dashboardUID
266+
}
267+
grafanaPanelID := "-"
268+
if panelID := r.Header.Get("X-Panel-Id"); panelID != "" {
269+
grafanaPanelID = panelID
270+
}
271+
remoteUser, _, _ := r.BasicAuth()
272+
273+
logMessage := append([]interface{}{
274+
"msg", "expensive query",
275+
"method", r.Method,
276+
"host", r.Host,
277+
"path", r.URL.Path,
278+
"remote_user", remoteUser,
279+
"remote_addr", r.RemoteAddr,
280+
"query_megabytes_fetched", queryBytesFetched / (1024 * 1024),
281+
"grafana_dashboard_uid", grafanaDashboardUID,
282+
"grafana_panel_id", grafanaPanelID,
283+
"query_response_time", queryResponseTime.String(),
284+
}, formatQueryString(queryString)...)
285+
286+
level.Error(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
287+
}
288+
248289
// reportSlowQuery reports slow queries.
249-
func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryResponseTime time.Duration) {
290+
func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryBytesFetched uint64, queryResponseTime time.Duration) {
250291
f.slowQueryCount.Inc()
251292
// NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
252293
grafanaDashboardUID := "-"
@@ -278,6 +319,7 @@ func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header,
278319
"remote_user", remoteUser,
279320
"remote_addr", r.RemoteAddr,
280321
"time_taken", queryResponseTime.String(),
322+
"query_megabytes_fetched", queryBytesFetched / (1024 * 1024),
281323
"grafana_dashboard_uid", grafanaDashboardUID,
282324
"grafana_panel_id", grafanaPanelID,
283325
"trace_id", thanosTraceID,

internal/cortex/querier/queryrange/query_bytes_fetched.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package queryrange
55

66
import (
7+
"net/http"
78
"strconv"
89
)
910

@@ -52,3 +53,21 @@ func QueryBytesFetchedHttpHeaderValue(response Response) []string {
5253
}
5354
return result
5455
}
56+
57+
func getHeaderValue(hdr http.Header, key string) uint64 {
58+
if val, ok := hdr[key]; ok {
59+
if len(val) != 1 {
60+
return 0
61+
}
62+
n, err := strconv.ParseUint(val[0], 10, 64)
63+
if err != nil {
64+
return 0
65+
}
66+
return n
67+
}
68+
return 0
69+
}
70+
71+
func GetQueryBytesFetchedFromHeader(hdr http.Header) uint64 {
72+
return getHeaderValue(hdr, QueryBytesFetchedHeaderName)
73+
}

0 commit comments

Comments
 (0)