Skip to content

Commit 97011d8

Browse files
committed
cap the exemplars at the traceql engine level
1 parent 0d00f5b commit 97011d8

File tree

3 files changed

+70
-1
lines changed

3 files changed

+70
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
## main / unreleased
22

3+
* [ENHANCEMENT] Used frontend MaxExemplars config as single source of truth for exemplar limits. Added a safety cap at the traceql engine entry points. [#6515](https://github.com/grafana/tempo/pull/6515) (@zhxiaogg)
34
* [CHANGE] Set default `max_result_limit` for search to 256*1024 [#6525](https://github.com/grafana/tempo/pull/6525) (@zhxiaogg)
45
* [CHANGE] **BREAKING CHANGE** Remove Opencensus receiver [#6523](https://github.com/grafana/tempo/pull/6523) (@javiermolinar)
5-
* [ENHANCEMENT] Used frontend MaxExemplars config as single source of truth for exemplar limits. Removed hardcoded maxExemplars limit from traceql. [#6515](https://github.com/grafana/tempo/pull/6515) (@zhxiaogg)
66
* [CHANGE] Upgrade Tempo to Go 1.26.0 [#6443](https://github.com/grafana/tempo/pull/6443) (@stoewer)
77
* [CHANGE] Allow duplicate dimensions for span metrics and service graphs. This is a valid use case if using different instrumentation libraries, with spans having "deployment.environment" and others "deployment_environment", for example. [#6288](https://github.com/grafana/tempo/pull/6288) (@carles-grafana)
88
* [CHANGE] Update default max duration for traceql metrics queries up to one day [#6285](https://github.com/grafana/tempo/pull/6285) (@javiermolinar)

pkg/traceql/engine_metrics.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@ import (
1212
"sync"
1313
"time"
1414

15+
"github.com/go-kit/log/level"
1516
"github.com/grafana/tempo/pkg/tempopb"
1617
commonv1proto "github.com/grafana/tempo/pkg/tempopb/common/v1"
1718
"github.com/grafana/tempo/pkg/util"
19+
"github.com/grafana/tempo/pkg/util/log"
1820
"github.com/prometheus/prometheus/model/labels"
1921
)
2022

@@ -23,6 +25,9 @@ const (
2325
internalMetaTypeCount = "__count"
2426
internalLabelBucket = "__bucket"
2527
maxExemplarsPerBucket = 2
28+
// maxExemplars is a safety cap applied at the engine entry points to bound memory
29+
// usage regardless of what the caller requests.
30+
maxExemplars uint32 = 100000
2631
// normalNaN is a quiet NaN. This is also math.NaN().
2732
normalNaN uint64 = 0x7ff8000000000001
2833
)
@@ -930,6 +935,11 @@ func (u *UngroupedAggregator) Series() SeriesSet {
930935
}
931936

932937
func (e *Engine) CompileMetricsQueryRangeNonRaw(req *tempopb.QueryRangeRequest, mode AggregateMode) (*MetricsFrontendEvaluator, error) {
938+
if req.Exemplars > maxExemplars {
939+
level.Warn(log.Logger).Log("msg", "capping exemplars to safety limit", "requested", req.Exemplars, "cap", maxExemplars)
940+
req.Exemplars = maxExemplars
941+
}
942+
933943
if req.Start <= 0 {
934944
return nil, fmt.Errorf("start required")
935945
}
@@ -973,6 +983,11 @@ func (e *Engine) CompileMetricsQueryRangeNonRaw(req *tempopb.QueryRangeRequest,
973983
// example if the datasource is replication factor=1 or only a single block then we know there
974984
// aren't duplicates, and we can make some optimizations.
975985
func (e *Engine) CompileMetricsQueryRange(req *tempopb.QueryRangeRequest, timeOverlapCutoff float64, allowUnsafeQueryHints bool) (*MetricsEvaluator, error) {
986+
if req.Exemplars > maxExemplars {
987+
level.Warn(log.Logger).Log("msg", "capping exemplars to safety limit", "requested", req.Exemplars, "cap", maxExemplars)
988+
req.Exemplars = maxExemplars
989+
}
990+
976991
if req.Start <= 0 {
977992
return nil, fmt.Errorf("start required")
978993
}

pkg/traceql/engine_metrics_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,60 @@ func TestCompileMetricsQueryRangeExemplarsHint(t *testing.T) {
522522
}
523523
}
524524

525+
func TestCompileMetricsQueryRangeExemplarsSafetyCap(t *testing.T) {
526+
tcs := []struct {
527+
name string
528+
exemplars uint32
529+
expected int
530+
}{
531+
{"below cap", maxExemplars - 1, int(maxExemplars - 1)},
532+
{"at cap", maxExemplars, int(maxExemplars)},
533+
{"above cap", maxExemplars + 1, int(maxExemplars)},
534+
}
535+
536+
for _, tc := range tcs {
537+
t.Run(tc.name, func(t *testing.T) {
538+
req := &tempopb.QueryRangeRequest{
539+
Query: "{} | rate()",
540+
Start: 1,
541+
End: 2,
542+
Step: 1,
543+
Exemplars: tc.exemplars,
544+
}
545+
eval, err := NewEngine().CompileMetricsQueryRange(req, 0, false)
546+
require.NoError(t, err)
547+
require.Equal(t, tc.expected, eval.maxExemplars)
548+
})
549+
}
550+
}
551+
552+
func TestCompileMetricsQueryRangeNonRawExemplarsSafetyCap(t *testing.T) {
553+
tcs := []struct {
554+
name string
555+
exemplars uint32
556+
expected uint32
557+
}{
558+
{"below cap", maxExemplars - 1, maxExemplars - 1},
559+
{"at cap", maxExemplars, maxExemplars},
560+
{"above cap", maxExemplars + 1, maxExemplars},
561+
}
562+
563+
for _, tc := range tcs {
564+
t.Run(tc.name, func(t *testing.T) {
565+
req := &tempopb.QueryRangeRequest{
566+
Query: "{} | rate()",
567+
Start: 1,
568+
End: 2,
569+
Step: 1,
570+
Exemplars: tc.exemplars,
571+
}
572+
_, err := NewEngine().CompileMetricsQueryRangeNonRaw(req, AggregateModeSum)
573+
require.NoError(t, err)
574+
require.Equal(t, tc.expected, req.Exemplars)
575+
})
576+
}
577+
}
578+
525579
func TestCompileMetricsQueryRangeFetchSpansRequest(t *testing.T) {
526580
tc := map[string]struct {
527581
q string

0 commit comments

Comments
 (0)