query-tee: optionally consider equivalent errors the same when comparing responses (#9143)

charleskorn · web-flow · commit e7706d50b66b · 2024-09-06T07:47:10.000+10:00
* query-tee: optionally consider equivalent errors the same when comparing responses

* Add changelog entry

* Address PR feedback: rename CLI flag

* Address PR feedback: add to docs page
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -193,6 +193,7 @@
 * [ENHANCEMENT] Don't consider responses to be different during response comparison if both backends' responses contain different series, but all samples are within the recent sample window. #8749 #8894
 * [ENHANCEMENT] When the expected and actual response for a matrix series is different, the full set of samples for that series from both backends will now be logged. #8947
 * [ENHANCEMENT] Wait up to `-server.graceful-shutdown-timeout` for inflight requests to finish when shutting down, rather than immediately terminating inflight requests on shutdown. #8985
+* [ENHANCEMENT] Optionally consider equivalent error messages the same when comparing responses. Enabled by default, disable with `-proxy.compare-exact-error-matching=true`. #9143
 * [BUGFIX] Ensure any errors encountered while forwarding a request to a backend (eg. DNS resolution failures) are logged. #8419
 * [BUGFIX] The comparison of the results should not fail when either side contains extra samples from within SkipRecentSamples duration. #8920
 
diff --git a/cmd/query-tee/main.go b/cmd/query-tee/main.go
@@ -103,9 +103,10 @@ func mimirReadRoutes(cfg Config) []querytee.Route {
 	}
 
 	samplesComparator := querytee.NewSamplesComparator(querytee.SampleComparisonOptions{
-		Tolerance:         cfg.ProxyConfig.ValueComparisonTolerance,
-		UseRelativeError:  cfg.ProxyConfig.UseRelativeError,
-		SkipRecentSamples: cfg.ProxyConfig.SkipRecentSamples,
+		Tolerance:              cfg.ProxyConfig.ValueComparisonTolerance,
+		UseRelativeError:       cfg.ProxyConfig.UseRelativeError,
+		SkipRecentSamples:      cfg.ProxyConfig.SkipRecentSamples,
+		RequireExactErrorMatch: cfg.ProxyConfig.RequireExactErrorMatch,
 	})
 
 	var instantQueryTransformers []querytee.RequestTransformer
diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md
@@ -125,8 +125,12 @@ The query results comparison can be enabled setting the flag `-proxy.compare-res
 
 When the query results comparison is enabled, the query-tee compares the response received from the two configured backends and logs a message for each query whose results don't match. Query-tee keeps track of the number of successful and failed comparison through the metric `cortex_querytee_responses_compared_total`.
 
+By default, query-tee considers equivalent error messages as matching, even if they are not exactly the same.
+This ensures that comparison does not fail for known situations where error messages are non-deterministic.
+Set `-proxy.compare-exact-error-matching=true` to require that error messages match exactly.
+
 {{< admonition type="note" >}}
-Query-tee compares Floating point sample values with a tolerance that you can configure with the `-proxy.value-comparison-tolerance` option.
+Query-tee compares floating point sample values with a tolerance that you can configure with the `-proxy.value-comparison-tolerance` option.
 
 The configured tolerance prevents false positives due to differences in floating point values rounding introduced by the non-deterministic series ordering within the Prometheus PromQL engine.
 {{< /admonition >}}
diff --git a/tools/querytee/proxy.go b/tools/querytee/proxy.go
@@ -39,6 +39,7 @@ type ProxyConfig struct {
 	UseRelativeError                    bool
 	PassThroughNonRegisteredRoutes      bool
 	SkipRecentSamples                   time.Duration
+	RequireExactErrorMatch              bool
 	BackendSkipTLSVerify                bool
 	AddMissingTimeParamToInstantQueries bool
 	SecondaryBackendsRequestProportion  float64
@@ -64,6 +65,7 @@ func (cfg *ProxyConfig) RegisterFlags(f *flag.FlagSet) {
 	f.Float64Var(&cfg.ValueComparisonTolerance, "proxy.value-comparison-tolerance", 0.000001, "The tolerance to apply when comparing floating point values in the responses. 0 to disable tolerance and require exact match (not recommended).")
 	f.BoolVar(&cfg.UseRelativeError, "proxy.compare-use-relative-error", false, "Use relative error tolerance when comparing floating point values.")
 	f.DurationVar(&cfg.SkipRecentSamples, "proxy.compare-skip-recent-samples", 2*time.Minute, "The window from now to skip comparing samples. 0 to disable.")
+	f.BoolVar(&cfg.RequireExactErrorMatch, "proxy.compare-exact-error-matching", false, "If true, errors will be considered the same only if they are exactly the same. If false, errors will be considered the same if they are considered equivalent.")
 	f.BoolVar(&cfg.PassThroughNonRegisteredRoutes, "proxy.passthrough-non-registered-routes", false, "Passthrough requests for non-registered routes to preferred backend.")
 	f.BoolVar(&cfg.AddMissingTimeParamToInstantQueries, "proxy.add-missing-time-parameter-to-instant-queries", true, "Add a 'time' parameter to proxied instant query requests if they do not have one.")
 	f.Float64Var(&cfg.SecondaryBackendsRequestProportion, "proxy.secondary-backends-request-proportion", 1.0, "Proportion of requests to send to secondary backends. Must be between 0 and 1 (inclusive), and if not 1, then -backend.preferred must be set.")
diff --git a/tools/querytee/response_comparator.go b/tools/querytee/response_comparator.go
@@ -15,6 +15,7 @@ import (
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
+	"github.com/grafana/regexp"
 	"github.com/prometheus/common/model"
 
 	util_log "github.com/grafana/mimir/pkg/util/log"
@@ -36,9 +37,10 @@ type SamplesResponse struct {
 }
 
 type SampleComparisonOptions struct {
-	Tolerance         float64
-	UseRelativeError  bool
-	SkipRecentSamples time.Duration
+	Tolerance              float64
+	UseRelativeError       bool
+	SkipRecentSamples      time.Duration
+	RequireExactErrorMatch bool
 }
 
 func NewSamplesComparator(opts SampleComparisonOptions) *SamplesComparator {
@@ -83,7 +85,7 @@ func (s *SamplesComparator) Compare(expectedResponse, actualResponse []byte) (Co
 		return ComparisonFailed, fmt.Errorf("expected error type '%s' but got '%s'", expected.ErrorType, actual.ErrorType)
 	}
 
-	if expected.Error != actual.Error {
+	if !s.errorsMatch(expected.Error, actual.Error) {
 		return ComparisonFailed, fmt.Errorf("expected error '%s' but got '%s'", expected.Error, actual.Error)
 	}
 
@@ -116,6 +118,60 @@ func (s *SamplesComparator) Compare(expectedResponse, actualResponse []byte) (Co
 	return ComparisonSuccess, nil
 }
 
+var errorEquivalenceClasses = [][]*regexp.Regexp{
+	{
+		// Invalid expression type for range query: MQE and Prometheus' engine return different error messages.
+		// Prometheus' engine:
+		regexp.MustCompile(`invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector`),
+		// MQE:
+		regexp.MustCompile(`invalid parameter "query": query expression produces a range vector, but expression for range queries must produce an instant vector or scalar`),
+	},
+	{
+		// Binary operation conflict on right (one-to-one) / many (one-to-many/many-to-one) side: MQE and Prometheus' engine return different error messages, and there's no guarantee they'll pick the same series as examples.
+		// Even comparing Prometheus' engine to another instance of Prometheus' engine can produce different results: the series selected as examples are not deterministic.
+		// Prometheus' engine:
+		regexp.MustCompile(`found duplicate series for the match group \{.*\} on the (left|right) hand-side of the operation: \[.*\];many-to-many matching not allowed: matching labels must be unique on one side`),
+		// MQE:
+		regexp.MustCompile(`found duplicate series for the match group \{.*\} on the (left|right) side of the operation at timestamp \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z: \{.*\} and \{.*\}`),
+	},
+	{
+		// Same as above, but for left (one-to-one) / one (one-to-many/many-to-one) side.
+		// Prometheus' engine:
+		regexp.MustCompile(`multiple matches for labels: many-to-one matching must be explicit \(group_left/group_right\)`),
+		// MQE:
+		regexp.MustCompile(`found duplicate series for the match group \{.*\} on the (left|right) side of the operation at timestamp \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z: \{.*\} and \{.*\}`),
+	},
+}
+
+func (s *SamplesComparator) errorsMatch(expected, actual string) bool {
+	if expected == actual {
+		return true
+	}
+
+	if s.opts.RequireExactErrorMatch {
+		// Errors didn't match exactly, and we want an exact match. We're done.
+		return false
+	}
+
+	for _, equivalenceClass := range errorEquivalenceClasses {
+		if anyMatch(expected, equivalenceClass) && anyMatch(actual, equivalenceClass) {
+			return true
+		}
+	}
+
+	return false
+}
+
+func anyMatch(s string, patterns []*regexp.Regexp) bool {
+	for _, pattern := range patterns {
+		if pattern.MatchString(s) {
+			return true
+		}
+	}
+
+	return false
+}
+
 func slicesEqualIgnoringOrder(a, b []string) bool {
 	if len(a) == 0 && len(b) == 0 {
 		return true
diff --git a/tools/querytee/response_comparator_test.go b/tools/querytee/response_comparator_test.go
@@ -1188,6 +1188,66 @@ func TestCompareSamplesResponse(t *testing.T) {
 						}`),
 			err: errors.New("expected error 'something went wrong' but got 'something else went wrong'"),
 		},
+		{
+			name: "response error is different but equivalent, and matches the same pattern in the equivalence class",
+			expected: json.RawMessage(`{
+							"status": "error",
+							"error": "found duplicate series for the match group {foo=\"bar\"} on the right hand-side of the operation: [{foo=\"bar\", env=\"test\"}, {foo=\"bar\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
+						}`),
+			actual: json.RawMessage(`{
+							"status": "error",
+							"error": "found duplicate series for the match group {foo=\"blah\"} on the right hand-side of the operation: [{foo=\"blah\", env=\"test\"}, {foo=\"blah\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
+						}`),
+			err: nil,
+		},
+		{
+			name: "response error is different but equivalent, and matches a different pattern in the equivalence class",
+			expected: json.RawMessage(`{
+							"status": "error",
+							"error": "found duplicate series for the match group {foo=\"blah\"} on the right hand-side of the operation: [{foo=\"blah\", env=\"test\"}, {foo=\"blah\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
+						}`),
+			actual: json.RawMessage(`{
+							"status": "error",
+							"error": "found duplicate series for the match group {foo=\"bar\"} on the right side of the operation at timestamp 1970-01-01T00:00:00Z: {foo=\"bar\", env=\"test\"} and {foo=\"bar\", env=\"prod\"}"
+						}`),
+			err: nil,
+		},
+		{
+			name: "response errors match equivalence classes, but errors are not from the same equivalence class",
+			expected: json.RawMessage(`{
+							"status": "error",
+							"error": "found duplicate series for the match group {foo=\"bar\"} on the right hand-side of the operation: [{foo=\"bar\", env=\"test\"}, {foo=\"bar\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
+						}`),
+			actual: json.RawMessage(`{
+							"status": "error",
+							"error": "invalid parameter \"query\": invalid expression type \"range vector\" for range query, must be Scalar or instant Vector"
+						}`),
+			err: errors.New(`expected error 'found duplicate series for the match group {foo="bar"} on the right hand-side of the operation: [{foo="bar", env="test"}, {foo="bar", env="prod"}];many-to-many matching not allowed: matching labels must be unique on one side' but got 'invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector'`),
+		},
+		{
+			name: "expected response error matches an equivalence class, but actual response error does not",
+			expected: json.RawMessage(`{
+							"status": "error",
+							"error": "invalid parameter \"query\": invalid expression type \"range vector\" for range query, must be Scalar or instant Vector"
+						}`),
+			actual: json.RawMessage(`{
+							"status": "error",
+							"error": "something went wrong"
+						}`),
+			err: errors.New(`expected error 'invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector' but got 'something went wrong'`),
+		},
+		{
+			name: "actual response error matches an equivalence class, but expected response error does not",
+			expected: json.RawMessage(`{
+							"status": "error",
+							"error": "something went wrong"
+						}`),
+			actual: json.RawMessage(`{
+							"status": "error",
+							"error": "invalid parameter \"query\": invalid expression type \"range vector\" for range query, must be Scalar or instant Vector"
+						}`),
+			err: errors.New(`expected error 'something went wrong' but got 'invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector'`),
+		},
 		{
 			name: "difference in resultType",
 			expected: json.RawMessage(`{