Skip to content

Commit e7706d5

Browse files
authored
query-tee: optionally consider equivalent errors the same when comparing responses (#9143)
* query-tee: optionally consider equivalent errors the same when comparing responses * Add changelog entry * Address PR feedback: rename CLI flag * Address PR feedback: add to docs page
1 parent d717218 commit e7706d5

File tree

6 files changed

+132
-8
lines changed

6 files changed

+132
-8
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@
193193
* [ENHANCEMENT] Don't consider responses to be different during response comparison if both backends' responses contain different series, but all samples are within the recent sample window. #8749 #8894
194194
* [ENHANCEMENT] When the expected and actual response for a matrix series is different, the full set of samples for that series from both backends will now be logged. #8947
195195
* [ENHANCEMENT] Wait up to `-server.graceful-shutdown-timeout` for inflight requests to finish when shutting down, rather than immediately terminating inflight requests on shutdown. #8985
196+
* [ENHANCEMENT] Optionally consider equivalent error messages the same when comparing responses. Enabled by default, disable with `-proxy.require-exact-error-match=true`. #9143
196197
* [BUGFIX] Ensure any errors encountered while forwarding a request to a backend (eg. DNS resolution failures) are logged. #8419
197198
* [BUGFIX] The comparison of the results should not fail when either side contains extra samples from within SkipRecentSamples duration. #8920
198199

cmd/query-tee/main.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,10 @@ func mimirReadRoutes(cfg Config) []querytee.Route {
103103
}
104104

105105
samplesComparator := querytee.NewSamplesComparator(querytee.SampleComparisonOptions{
106-
Tolerance: cfg.ProxyConfig.ValueComparisonTolerance,
107-
UseRelativeError: cfg.ProxyConfig.UseRelativeError,
108-
SkipRecentSamples: cfg.ProxyConfig.SkipRecentSamples,
106+
Tolerance: cfg.ProxyConfig.ValueComparisonTolerance,
107+
UseRelativeError: cfg.ProxyConfig.UseRelativeError,
108+
SkipRecentSamples: cfg.ProxyConfig.SkipRecentSamples,
109+
RequireExactErrorMatch: cfg.ProxyConfig.RequireExactErrorMatch,
109110
})
110111

111112
var instantQueryTransformers []querytee.RequestTransformer

docs/sources/mimir/manage/tools/query-tee.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,12 @@ The query results comparison can be enabled setting the flag `-proxy.compare-res
125125

126126
When the query results comparison is enabled, the query-tee compares the response received from the two configured backends and logs a message for each query whose results don't match. Query-tee keeps track of the number of successful and failed comparison through the metric `cortex_querytee_responses_compared_total`.
127127

128+
By default, query-tee considers equivalent error messages as matching, even if they are not exactly the same.
129+
This ensures that comparison does not fail for known situations where error messages are non-deterministic.
130+
Set `-proxy.compare-exact-error-matching=true` to require that error messages match exactly.
131+
128132
{{< admonition type="note" >}}
129-
Query-tee compares Floating point sample values with a tolerance that you can configure with the `-proxy.value-comparison-tolerance` option.
133+
Query-tee compares floating point sample values with a tolerance that you can configure with the `-proxy.value-comparison-tolerance` option.
130134

131135
The configured tolerance prevents false positives due to differences in floating point values rounding introduced by the non-deterministic series ordering within the Prometheus PromQL engine.
132136
{{< /admonition >}}

tools/querytee/proxy.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ type ProxyConfig struct {
3939
UseRelativeError bool
4040
PassThroughNonRegisteredRoutes bool
4141
SkipRecentSamples time.Duration
42+
RequireExactErrorMatch bool
4243
BackendSkipTLSVerify bool
4344
AddMissingTimeParamToInstantQueries bool
4445
SecondaryBackendsRequestProportion float64
@@ -64,6 +65,7 @@ func (cfg *ProxyConfig) RegisterFlags(f *flag.FlagSet) {
6465
f.Float64Var(&cfg.ValueComparisonTolerance, "proxy.value-comparison-tolerance", 0.000001, "The tolerance to apply when comparing floating point values in the responses. 0 to disable tolerance and require exact match (not recommended).")
6566
f.BoolVar(&cfg.UseRelativeError, "proxy.compare-use-relative-error", false, "Use relative error tolerance when comparing floating point values.")
6667
f.DurationVar(&cfg.SkipRecentSamples, "proxy.compare-skip-recent-samples", 2*time.Minute, "The window from now to skip comparing samples. 0 to disable.")
68+
f.BoolVar(&cfg.RequireExactErrorMatch, "proxy.compare-exact-error-matching", false, "If true, errors will be considered the same only if they are exactly the same. If false, errors will be considered the same if they are considered equivalent.")
6769
f.BoolVar(&cfg.PassThroughNonRegisteredRoutes, "proxy.passthrough-non-registered-routes", false, "Passthrough requests for non-registered routes to preferred backend.")
6870
f.BoolVar(&cfg.AddMissingTimeParamToInstantQueries, "proxy.add-missing-time-parameter-to-instant-queries", true, "Add a 'time' parameter to proxied instant query requests if they do not have one.")
6971
f.Float64Var(&cfg.SecondaryBackendsRequestProportion, "proxy.secondary-backends-request-proportion", 1.0, "Proportion of requests to send to secondary backends. Must be between 0 and 1 (inclusive), and if not 1, then -backend.preferred must be set.")

tools/querytee/response_comparator.go

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
"github.com/go-kit/log"
1717
"github.com/go-kit/log/level"
18+
"github.com/grafana/regexp"
1819
"github.com/prometheus/common/model"
1920

2021
util_log "github.com/grafana/mimir/pkg/util/log"
@@ -36,9 +37,10 @@ type SamplesResponse struct {
3637
}
3738

3839
type SampleComparisonOptions struct {
39-
Tolerance float64
40-
UseRelativeError bool
41-
SkipRecentSamples time.Duration
40+
Tolerance float64
41+
UseRelativeError bool
42+
SkipRecentSamples time.Duration
43+
RequireExactErrorMatch bool
4244
}
4345

4446
func NewSamplesComparator(opts SampleComparisonOptions) *SamplesComparator {
@@ -83,7 +85,7 @@ func (s *SamplesComparator) Compare(expectedResponse, actualResponse []byte) (Co
8385
return ComparisonFailed, fmt.Errorf("expected error type '%s' but got '%s'", expected.ErrorType, actual.ErrorType)
8486
}
8587

86-
if expected.Error != actual.Error {
88+
if !s.errorsMatch(expected.Error, actual.Error) {
8789
return ComparisonFailed, fmt.Errorf("expected error '%s' but got '%s'", expected.Error, actual.Error)
8890
}
8991

@@ -116,6 +118,60 @@ func (s *SamplesComparator) Compare(expectedResponse, actualResponse []byte) (Co
116118
return ComparisonSuccess, nil
117119
}
118120

121+
var errorEquivalenceClasses = [][]*regexp.Regexp{
122+
{
123+
// Invalid expression type for range query: MQE and Prometheus' engine return different error messages.
124+
// Prometheus' engine:
125+
regexp.MustCompile(`invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector`),
126+
// MQE:
127+
regexp.MustCompile(`invalid parameter "query": query expression produces a range vector, but expression for range queries must produce an instant vector or scalar`),
128+
},
129+
{
130+
// Binary operation conflict on right (one-to-one) / many (one-to-many/many-to-one) side: MQE and Prometheus' engine return different error messages, and there's no guarantee they'll pick the same series as examples.
131+
// Even comparing Prometheus' engine to another instance of Prometheus' engine can produce different results: the series selected as examples are not deterministic.
132+
// Prometheus' engine:
133+
regexp.MustCompile(`found duplicate series for the match group \{.*\} on the (left|right) hand-side of the operation: \[.*\];many-to-many matching not allowed: matching labels must be unique on one side`),
134+
// MQE:
135+
regexp.MustCompile(`found duplicate series for the match group \{.*\} on the (left|right) side of the operation at timestamp \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z: \{.*\} and \{.*\}`),
136+
},
137+
{
138+
// Same as above, but for left (one-to-one) / one (one-to-many/many-to-one) side.
139+
// Prometheus' engine:
140+
regexp.MustCompile(`multiple matches for labels: many-to-one matching must be explicit \(group_left/group_right\)`),
141+
// MQE:
142+
regexp.MustCompile(`found duplicate series for the match group \{.*\} on the (left|right) side of the operation at timestamp \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z: \{.*\} and \{.*\}`),
143+
},
144+
}
145+
146+
func (s *SamplesComparator) errorsMatch(expected, actual string) bool {
147+
if expected == actual {
148+
return true
149+
}
150+
151+
if s.opts.RequireExactErrorMatch {
152+
// Errors didn't match exactly, and we want an exact match. We're done.
153+
return false
154+
}
155+
156+
for _, equivalenceClass := range errorEquivalenceClasses {
157+
if anyMatch(expected, equivalenceClass) && anyMatch(actual, equivalenceClass) {
158+
return true
159+
}
160+
}
161+
162+
return false
163+
}
164+
165+
func anyMatch(s string, patterns []*regexp.Regexp) bool {
166+
for _, pattern := range patterns {
167+
if pattern.MatchString(s) {
168+
return true
169+
}
170+
}
171+
172+
return false
173+
}
174+
119175
func slicesEqualIgnoringOrder(a, b []string) bool {
120176
if len(a) == 0 && len(b) == 0 {
121177
return true

tools/querytee/response_comparator_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,6 +1188,66 @@ func TestCompareSamplesResponse(t *testing.T) {
11881188
}`),
11891189
err: errors.New("expected error 'something went wrong' but got 'something else went wrong'"),
11901190
},
1191+
{
1192+
name: "response error is different but equivalent, and matches the same pattern in the equivalence class",
1193+
expected: json.RawMessage(`{
1194+
"status": "error",
1195+
"error": "found duplicate series for the match group {foo=\"bar\"} on the right hand-side of the operation: [{foo=\"bar\", env=\"test\"}, {foo=\"bar\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
1196+
}`),
1197+
actual: json.RawMessage(`{
1198+
"status": "error",
1199+
"error": "found duplicate series for the match group {foo=\"blah\"} on the right hand-side of the operation: [{foo=\"blah\", env=\"test\"}, {foo=\"blah\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
1200+
}`),
1201+
err: nil,
1202+
},
1203+
{
1204+
name: "response error is different but equivalent, and matches a different pattern in the equivalence class",
1205+
expected: json.RawMessage(`{
1206+
"status": "error",
1207+
"error": "found duplicate series for the match group {foo=\"blah\"} on the right hand-side of the operation: [{foo=\"blah\", env=\"test\"}, {foo=\"blah\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
1208+
}`),
1209+
actual: json.RawMessage(`{
1210+
"status": "error",
1211+
"error": "found duplicate series for the match group {foo=\"bar\"} on the right side of the operation at timestamp 1970-01-01T00:00:00Z: {foo=\"bar\", env=\"test\"} and {foo=\"bar\", env=\"prod\"}"
1212+
}`),
1213+
err: nil,
1214+
},
1215+
{
1216+
name: "response errors match equivalence classes, but errors are not from the same equivalence class",
1217+
expected: json.RawMessage(`{
1218+
"status": "error",
1219+
"error": "found duplicate series for the match group {foo=\"bar\"} on the right hand-side of the operation: [{foo=\"bar\", env=\"test\"}, {foo=\"bar\", env=\"prod\"}];many-to-many matching not allowed: matching labels must be unique on one side"
1220+
}`),
1221+
actual: json.RawMessage(`{
1222+
"status": "error",
1223+
"error": "invalid parameter \"query\": invalid expression type \"range vector\" for range query, must be Scalar or instant Vector"
1224+
}`),
1225+
err: errors.New(`expected error 'found duplicate series for the match group {foo="bar"} on the right hand-side of the operation: [{foo="bar", env="test"}, {foo="bar", env="prod"}];many-to-many matching not allowed: matching labels must be unique on one side' but got 'invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector'`),
1226+
},
1227+
{
1228+
name: "expected response error matches an equivalence class, but actual response error does not",
1229+
expected: json.RawMessage(`{
1230+
"status": "error",
1231+
"error": "invalid parameter \"query\": invalid expression type \"range vector\" for range query, must be Scalar or instant Vector"
1232+
}`),
1233+
actual: json.RawMessage(`{
1234+
"status": "error",
1235+
"error": "something went wrong"
1236+
}`),
1237+
err: errors.New(`expected error 'invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector' but got 'something went wrong'`),
1238+
},
1239+
{
1240+
name: "actual response error matches an equivalence class, but expected response error does not",
1241+
expected: json.RawMessage(`{
1242+
"status": "error",
1243+
"error": "something went wrong"
1244+
}`),
1245+
actual: json.RawMessage(`{
1246+
"status": "error",
1247+
"error": "invalid parameter \"query\": invalid expression type \"range vector\" for range query, must be Scalar or instant Vector"
1248+
}`),
1249+
err: errors.New(`expected error 'something went wrong' but got 'invalid parameter "query": invalid expression type "range vector" for range query, must be Scalar or instant Vector'`),
1250+
},
11911251
{
11921252
name: "difference in resultType",
11931253
expected: json.RawMessage(`{

0 commit comments

Comments
 (0)