Commit 120d328
Initial implementation of blocking expensive ad-hoc queries in the frontend (#40)
2 parents 94eb766 + aff27a7 commit 120d328

File tree: 3 files changed (+153, -12 lines)

cmd/thanos/query_frontend.go

Lines changed: 5 additions & 1 deletion
@@ -8,6 +8,8 @@ import (
     "net/http"
     "time"
 
+    "gopkg.in/yaml.v2"
+
     extflag "github.com/efficientgo/tools/extkingpin"
     "github.com/go-kit/log"
     "github.com/go-kit/log/level"
@@ -19,7 +21,6 @@ import (
     "github.com/prometheus/prometheus/promql/parser"
     "github.com/thanos-io/promql-engine/execution/parse"
     "github.com/weaveworks/common/user"
-    "gopkg.in/yaml.v2"
 
     cortexfrontend "github.com/thanos-io/thanos/internal/cortex/frontend"
     "github.com/thanos-io/thanos/internal/cortex/frontend/transport"
@@ -148,6 +149,9 @@ func registerQueryFrontend(app *extkingpin.App) {
 
     cmd.Flag("query-frontend.log-failed-queries", "Log failed queries due to any reason").Default("true").BoolVar(&cfg.CortexHandlerConfig.LogFailedQueries)
 
+    cmd.Flag("failed-query-cache-capacity", "Capacity of cache for failed queries. 0 means this feature is disabled.").
+        Default("0").IntVar(&cfg.CortexHandlerConfig.FailedQueryCacheCapacity)
+
     cmd.Flag("query-frontend.org-id-header", "Deprecation Warning - This flag will be soon deprecated in favor of query-frontend.tenant-header"+
         " and both flags cannot be used at the same time. "+
         "Request header names used to identify the source of slow queries (repeated flag). "+

go.mod

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ require (
 
 require (
     github.com/cortexproject/promqlsmith v0.0.0-20240326071418-c2a9ca1e89f5
+    github.com/hashicorp/golang-lru v0.6.0
     github.com/hashicorp/golang-lru/v2 v2.0.7
     github.com/mitchellh/go-ps v1.0.0
     github.com/onsi/gomega v1.29.0
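The failed-query cache is backed by the original, non-generic github.com/hashicorp/golang-lru module, added here alongside the v2 module that was already a dependency. A minimal standalone sketch of the small part of its API the handler relies on (keys and values below are illustrative, not from this commit):

package main

import (
	"fmt"

	lru "github.com/hashicorp/golang-lru"
)

func main() {
	// New creates a fixed-size cache that evicts the least recently used entry.
	cache, err := lru.New(128)
	if err != nil {
		panic(err)
	}

	// ContainsOrAdd adds the entry only if the key is absent and reports
	// whether the key was already present.
	existed, _ := cache.ContainsOrAdd("up == 0", 3600)
	fmt.Println(existed) // false on the first insertion

	// Values come back as interface{} and must be type-asserted,
	// as the handler does with value.(int).
	if v, ok := cache.Get("up == 0"); ok {
		fmt.Println(v.(int)) // 3600
	}
}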

internal/cortex/frontend/transport/handler.go

Lines changed: 147 additions & 11 deletions
@@ -11,22 +11,23 @@ import (
     "io"
     "net/http"
     "net/url"
+    "regexp"
     "strconv"
     "strings"
     "syscall"
     "time"
 
     "github.com/go-kit/log"
     "github.com/go-kit/log/level"
+    "github.com/hashicorp/golang-lru"
     "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/client_golang/prometheus/promauto"
-    "github.com/weaveworks/common/httpgrpc"
-    "github.com/weaveworks/common/httpgrpc/server"
-
     querier_stats "github.com/thanos-io/thanos/internal/cortex/querier/stats"
     "github.com/thanos-io/thanos/internal/cortex/tenant"
     "github.com/thanos-io/thanos/internal/cortex/util"
     util_log "github.com/thanos-io/thanos/internal/cortex/util/log"
+    "github.com/weaveworks/common/httpgrpc"
+    "github.com/weaveworks/common/httpgrpc/server"
 )
 
 const (
@@ -39,14 +40,16 @@ var (
     errCanceled              = httpgrpc.Errorf(StatusClientClosedRequest, context.Canceled.Error())
     errDeadlineExceeded      = httpgrpc.Errorf(http.StatusGatewayTimeout, context.DeadlineExceeded.Error())
     errRequestEntityTooLarge = httpgrpc.Errorf(http.StatusRequestEntityTooLarge, "http: request body too large")
+    cacheableResponseCodes   = []int{http.StatusRequestTimeout, http.StatusGatewayTimeout, http.StatusBadRequest}
 )
 
-// Config for a Handler.
+// HandlerConfig Config for a Handler.
 type HandlerConfig struct {
-    LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
-    MaxBodySize          int64         `yaml:"max_body_size"`
-    QueryStatsEnabled    bool          `yaml:"query_stats_enabled"`
-    LogFailedQueries     bool          `yaml:"log_failed_queries"`
+    LogQueriesLongerThan     time.Duration `yaml:"log_queries_longer_than"`
+    MaxBodySize              int64         `yaml:"max_body_size"`
+    QueryStatsEnabled        bool          `yaml:"query_stats_enabled"`
+    LogFailedQueries         bool          `yaml:"log_failed_queries"`
+    FailedQueryCacheCapacity int           `yaml:"failed_query_cache_capacity"`
 }
 
 // Handler accepts queries and forwards them to RoundTripper. It can log slow queries,
@@ -55,20 +58,40 @@ type Handler struct {
     cfg          HandlerConfig
     log          log.Logger
     roundTripper http.RoundTripper
+    lruCache     *lru.Cache
+    regex        *regexp.Regexp
+    errorExtract *regexp.Regexp
 
     // Metrics.
     querySeconds *prometheus.CounterVec
     querySeries  *prometheus.CounterVec
     queryBytes   *prometheus.CounterVec
+    cachedHits   prometheus.Counter
     activeUsers  *util.ActiveUsersCleanupService
 }
 
 // NewHandler creates a new frontend handler.
 func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logger, reg prometheus.Registerer) http.Handler {
+    var (
+        LruCache *lru.Cache
+        err      error
+    )
+
+    if cfg.FailedQueryCacheCapacity > 0 {
+        LruCache, err = lru.New(cfg.FailedQueryCacheCapacity)
+        if err != nil {
+            LruCache = nil
+            level.Warn(log).Log("msg", "Failed to create LruCache", "error", err)
+        }
+    }
+
     h := &Handler{
         cfg:          cfg,
         log:          log,
         roundTripper: roundTripper,
+        lruCache:     LruCache,
+        regex:        regexp.MustCompile(`[\s\n\t]+`),
+        errorExtract: regexp.MustCompile(`Code\((\d+)\)`),
     }
 
     if cfg.QueryStatsEnabled {
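The regex field initialised above collapses runs of whitespace, so the same PromQL expression submitted with different formatting normalizes to a single cache key. A standalone sketch of that normalization (the query text is an illustrative example):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern as the handler's regex field: any run of whitespace
	// is replaced by a single space.
	normalize := regexp.MustCompile(`[\s\n\t]+`)

	q := "sum(rate(http_requests_total[5m]))\n  by (job)"
	fmt.Println(normalize.ReplaceAllString(q, " "))
	// Output: sum(rate(http_requests_total[5m])) by (job)
}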
@@ -92,17 +115,25 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
             h.querySeries.DeleteLabelValues(user)
             h.queryBytes.DeleteLabelValues(user)
         })
+
         // If cleaner stops or fail, we will simply not clean the metrics for inactive users.
         _ = h.activeUsers.StartAsync(context.Background())
     }
 
+    h.cachedHits = promauto.With(reg).NewCounter(prometheus.CounterOpts{
+        Name: "cached_failed_queries_count",
+        Help: "Total number of queries that hit the failed query cache.",
+    })
+
     return h
 }
 
 func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
     var (
-        stats       *querier_stats.Stats
-        queryString url.Values
+        stats                      *querier_stats.Stats
+        queryString                url.Values
+        queryExpressionNormalized  string
+        queryExpressionRangeLength int
     )
 
     // Initialise the stats in the context and make sure it's propagated
@@ -122,14 +153,41 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
     r.Body = http.MaxBytesReader(w, r.Body, f.cfg.MaxBodySize)
     r.Body = io.NopCloser(io.TeeReader(r.Body, &buf))
 
+    // Check if caching is enabled.
+    if f.lruCache != nil {
+        // Store query expression.
+        queryExpressionNormalized = f.regex.ReplaceAllString(r.URL.Query().Get("query"), " ")
+
+        // Store query time range length.
+        queryExpressionRangeLength = getQueryRangeSeconds(r)
+
+        // Check if query in cache and whether value exceeds time range length.
+        if value, ok := f.lruCache.Get(queryExpressionNormalized); ok && value.(int) >= queryExpressionRangeLength {
+            w.WriteHeader(http.StatusForbidden)
+            level.Info(util_log.WithContext(r.Context(), f.log)).Log(
+                "msg", "Retrieved query from cache",
+                "normalized_query", queryExpressionNormalized,
+                "range_seconds", queryExpressionRangeLength,
+            )
+            f.cachedHits.Inc()
+            return
+        }
+    }
+
     startTime := time.Now()
     resp, err := f.roundTripper.RoundTrip(r)
     queryResponseTime := time.Since(startTime)
 
     if err != nil {
         writeError(w, err)
+        queryString = f.parseRequestQueryString(r, buf)
+
+        // Check if caching is enabled.
+        if f.lruCache != nil {
+            f.updateFailedQueryCache(err, queryExpressionNormalized, queryExpressionRangeLength, r)
+        }
+
         if f.cfg.LogFailedQueries {
-            queryString = f.parseRequestQueryString(r, buf)
             f.reportFailedQuery(r, queryString, err)
         }
         return
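Taken together, the two paths above mean a request is rejected with 403 Forbidden only when its normalized expression is in the cache and the cached range length (in seconds) is at least as large as the range of the incoming request; on a downstream error the query string is now parsed unconditionally and the cache is updated before the failure is optionally logged. Note that getQueryRangeSeconds (defined further down) returns 0 when either start or end is missing, so a cached instant-query failure (value 0) also blocks repeated instant queries of the same expression, since 0 >= 0. The hit-path comparison reduces to a check like this (standalone sketch):

// block reports whether a request should be rejected, given the range length
// stored for a previously failed query and the range of the incoming request.
func block(cachedRangeSeconds, requestedRangeSeconds int) bool {
	return cachedRangeSeconds >= requestedRangeSeconds
}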
@@ -165,6 +223,84 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
     }
 }
 
+func (f *Handler) updateFailedQueryCache(err error, queryExpressionNormalized string, queryExpressionRangeLength int, r *http.Request) {
+    // Extracting error code from error string.
+    codeExtract := f.errorExtract.FindStringSubmatch(err.Error())
+
+    // Checking if error code extracted successfully.
+    if codeExtract == nil || len(codeExtract) < 2 {
+        level.Error(util_log.WithContext(r.Context(), f.log)).Log(
+            "msg", "Error string regex conversion error",
+            "normalized_query", queryExpressionNormalized,
+            "range_seconds", queryExpressionRangeLength,
+            "error", err)
+        return
+    }
+
+    // Converting error code to int.
+    errCode, strConvError := strconv.Atoi(codeExtract[1])
+
+    // Checking if error code extracted properly from string.
+    if strConvError != nil {
+        level.Error(util_log.WithContext(r.Context(), f.log)).Log(
+            "msg", "String to int conversion error",
+            "normalized_query", queryExpressionNormalized,
+            "range_seconds", queryExpressionRangeLength,
+            "error", err)
+        return
+    }
+
+    // If error should be cached, store it in cache.
+    if !isCacheableError(errCode) {
+        level.Debug(util_log.WithContext(r.Context(), f.log)).Log(
+            "msg", "Query not cached due to non-cacheable error code",
+            "normalized_query", queryExpressionNormalized,
+            "range_seconds", queryExpressionRangeLength,
+            "error", err,
+        )
+        return
+    }
+
+    // Checks if queryExpression is already in cache, and updates time range length value to min of stored and new value.
+    if contains, _ := f.lruCache.ContainsOrAdd(queryExpressionNormalized, queryExpressionRangeLength); contains {
+        if oldValue, ok := f.lruCache.Get(queryExpressionNormalized); ok {
+            queryExpressionRangeLength = min(queryExpressionRangeLength, oldValue.(int))
+        }
+        f.lruCache.Add(queryExpressionNormalized, queryExpressionRangeLength)
+    }
+
+    level.Debug(util_log.WithContext(r.Context(), f.log)).Log(
+        "msg", "Cached a failed query",
+        "normalized_query", queryExpressionNormalized,
+        "range_seconds", queryExpressionRangeLength,
+        "error", err,
+    )
+
+}
+
+// isCacheableError Returns true if response code is in pre-defined cacheable errors list, else returns false.
+func isCacheableError(statusCode int) bool {
+    for _, errStatusCode := range cacheableResponseCodes {
+        if errStatusCode == statusCode {
+            return true
+        }
+    }
+    return false
+}
+
+// Time range length for queries, if either of "start" or "end" are not present, return 0.
+func getQueryRangeSeconds(r *http.Request) int {
+    start, err := strconv.Atoi(r.URL.Query().Get("start"))
+    if err != nil {
+        return 0
+    }
+    end, err := strconv.Atoi(r.URL.Query().Get("end"))
+    if err != nil {
+        return 0
+    }
+    return end - start
+}
+
 func (f *Handler) reportFailedQuery(r *http.Request, queryString url.Values, err error) {
     // NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
     grafanaDashboardUID := "-"
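The errorExtract pattern compiled in NewHandler pulls the numeric status code out of the error text returned by the round tripper, which updateFailedQueryCache then checks against cacheableResponseCodes. A standalone sketch of that extraction; the sample error string is an assumption for illustration, only the Code(NNN) fragment matters to the match:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

func main() {
	errorExtract := regexp.MustCompile(`Code\((\d+)\)`)

	// Assumed sample error text; only the "Code(504)" fragment is significant.
	msg := "rpc error: code = Code(504) desc = context deadline exceeded"

	if m := errorExtract.FindStringSubmatch(msg); len(m) == 2 {
		code, _ := strconv.Atoi(m[1])
		fmt.Println(code) // 504, a cacheable status per cacheableResponseCodes
	}
}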
