distributor: log truncated oversized attributes with rate-limited logger (#6467)

carles-grafana · web-flow · commit 15fa5b4f5c99 · 2026-02-20T10:41:55.000+01:00
* distributor: log truncated oversized attributes with rate-limited logger

Add warn-level logging when attribute keys or values exceed the
configured max_attribute_bytes limit.

* remove benchmark, no longer useful
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@
 * [ENHANCEMENT] Add new alerts and runbooks entries [#6276](https://github.com/grafana/tempo/pull/6276) (@javiermolinar)
 * [ENHANCEMENT] Double the maximum number of dedicated string columns in vParquet5 and update tempo-cli to determine the optimum number for the data [#6282](https://github.com/grafana/tempo/pull/6282) (@mdisibio)
 * [ENHANCEMENT] Improve attribute truncating observability [#6400](https://github.com/grafana/tempo/pull/6400) (@javiermolinar)
+* [ENHANCEMENT] Log truncated oversized attributes [#6467](https://github.com/grafana/tempo/pull/6467) (@carles-grafana)
 * [ENHANCEMENT] Remove live-store partition owner from ring on shutdown to prevent stale owner entries [#6409](https://github.com/grafana/tempo/pull/6409) (@oleg-kozlyuk-grafana)
 * [ENHANCEMENT] Improved live store readiness check and added `readiness_target_lag` and `readiness_max_wait` config parameters. Live store will now - if `readiness_target_lag` is set - not report `/ready` until Kafka lag is brought under the specified value [#6238](https://github.com/grafana/tempo/pull/6238) [#6405](https://github.com/grafana/tempo/pull/6405) (@oleg-kozlyuk-grafana, @ruslan-mikhailov)
 * [ENHANCEMENT] Expose a new histogram metric to track the jobs per query distribution [#6343](https://github.com/grafana/tempo/pull/6343) (@javiermolinar)
diff --git a/modules/distributor/distributor.go b/modules/distributor/distributor.go
@@ -44,11 +44,14 @@ import (
 	v1 "github.com/grafana/tempo/pkg/tempopb/trace/v1"
 	"github.com/grafana/tempo/pkg/usagestats"
 	"github.com/grafana/tempo/pkg/util"
+	tempo_log "github.com/grafana/tempo/pkg/util/log"
 	"github.com/grafana/tempo/pkg/validation"
 )
 
 const (
 	distributorRingKey = "distributor"
+
+	truncationLogsPerSecond = 1
 )
 
 var (
@@ -167,6 +170,13 @@ func (c truncatedAttributesCount) Total() int {
 	return c.Resource + c.Scope + c.Span + c.Event + c.Link
 }
 
+type truncatedAttrInfo struct {
+	scope    string // level the attribute was found at: "resource", "scope", "span", "event" or "link"
+	name     string // attribute key; if the key itself was oversized this is its truncated prefix
+	field    string // "key" or "value"
+	origSize int    // original byte length before truncation; 0 means no example captured yet
+}
+
 // Distributor coordinates replicates and distribution of log streams.
 type Distributor struct {
 	services.Service
@@ -207,6 +217,8 @@ type Distributor struct {
 	// Middleware errors are logged but don't fail the push (fail open behavior).
 	tracePushMiddlewares []TracePushMiddleware
 
+	truncationLogger *tempo_log.RateLimitedLogger
+
 	// For testing functionality that relies on timing without having to sleep in unit tests.
 	sleep func(time.Duration)
 	now   func() time.Time
@@ -284,6 +296,7 @@ func New(
 		overrides:            o,
 		traceEncoder:         model.MustNewSegmentDecoder(model.CurrentEncoding),
 		tracePushMiddlewares: cfg.TracePushMiddlewares,
+		truncationLogger:     tempo_log.NewRateLimitedLogger(truncationLogsPerSecond, level.Warn(logger)),
 		logger:               logger,
 		sleep:                time.Sleep,
 		now:                  time.Now,
@@ -494,7 +507,7 @@ func (d *Distributor) PushTraces(ctx context.Context, traces ptrace.Traces) (*te
 
 	maxAttributeBytes := d.getMaxAttributeBytes(userID)
 
-	ringTokens, rebatchedTraces, truncatedAttributesCount, err := requestsByTraceID(batches, userID, spanCount, maxAttributeBytes)
+	ringTokens, rebatchedTraces, truncatedAttributesCount, truncationExample, err := requestsByTraceID(batches, userID, spanCount, maxAttributeBytes)
 	if err != nil {
 		logDiscardedResourceSpans(batches, userID, &d.cfg.LogDiscardedSpans, d.logger)
 		return nil, err
@@ -506,6 +519,17 @@ func (d *Distributor) PushTraces(ctx context.Context, traces ptrace.Traces) (*te
 		metricAttributesTruncated.WithLabelValues(userID, "span").Add(float64(truncatedAttributesCount.Span))
 		metricAttributesTruncated.WithLabelValues(userID, "event").Add(float64(truncatedAttributesCount.Event))
 		metricAttributesTruncated.WithLabelValues(userID, "link").Add(float64(truncatedAttributesCount.Link))
+
+		if truncationExample != nil {
+			d.truncationLogger.Log("msg", "attributes truncated",
+				"tenant", userID,
+				"total_truncated", truncatedAttributesCount.Total(),
+				"max_size_bytes", maxAttributeBytes,
+				"example_scope", truncationExample.scope,
+				"example_name", truncationExample.name,
+				"example_field", truncationExample.field,
+				"example_orig_size", truncationExample.origSize)
+		}
 	}
 
 	if d.cfg.IngesterWritePathEnabled {
@@ -727,53 +751,53 @@ func (d *Distributor) sendToKafka(ctx context.Context, userID string, keys []uin
 	}, ring.DoBatchOptions{})
 }
 
-// requestsByTraceID takes an incoming tempodb.PushRequest and creates a set of keys for the hash ring
-// and traces to pass onto the ingesters.
-func requestsByTraceID(batches []*v1.ResourceSpans, userID string, spanCount, maxSpanAttrSize int) ([]uint32, []*rebatchedTrace, truncatedAttributesCount, error) {
+// requestsByTraceID groups ResourceSpans by trace ID, producing hash-ring tokens and
+// rebatched traces for the ingesters. It truncates oversized attributes and returns
+// the first truncation example (if any) for diagnostic logging.
+func requestsByTraceID(batches []*v1.ResourceSpans, userID string, spanCount, maxSpanAttrSize int) ([]uint32, []*rebatchedTrace, truncatedAttributesCount, *truncatedAttrInfo, error) {
 	const tracesPerBatch = 20 // p50 of internal env
 	tracesByID := make(map[uint64]*rebatchedTrace, tracesPerBatch)
 	truncatedCount := truncatedAttributesCount{}
+
+	// truncationExample captures one example of a truncated attribute for rate-limited logging.
+	var truncationExample truncatedAttrInfo
+
 	currentTime := uint32(time.Now().Unix())
 	for _, b := range batches {
 		spansByILS := make(map[uint64]*v1.ScopeSpans)
 		// check resource for large attributes
 		if maxSpanAttrSize > 0 && b.Resource != nil {
-			resourceAttrTruncatedCount := processAttributes(b.Resource.Attributes, maxSpanAttrSize)
-			truncatedCount.Resource += resourceAttrTruncatedCount
+			truncatedCount.Resource += processAttributes(b.Resource.Attributes, maxSpanAttrSize, &truncationExample, "resource")
 		}
 
 		for _, ils := range b.ScopeSpans {
 
 			// check instrumentation for large attributes
 			if maxSpanAttrSize > 0 && ils.Scope != nil {
-				scopeAttrTruncatedCount := processAttributes(ils.Scope.Attributes, maxSpanAttrSize)
-				truncatedCount.Scope += scopeAttrTruncatedCount
+				truncatedCount.Scope += processAttributes(ils.Scope.Attributes, maxSpanAttrSize, &truncationExample, "scope")
 			}
 
 			for _, span := range ils.Spans {
 				// check spans for large attributes
 				if maxSpanAttrSize > 0 {
-					spanAttrTruncatedCount := processAttributes(span.Attributes, maxSpanAttrSize)
-					truncatedCount.Span += spanAttrTruncatedCount
+					truncatedCount.Span += processAttributes(span.Attributes, maxSpanAttrSize, &truncationExample, "span")
 
 					// check large attributes for events and links
 					for _, event := range span.Events {
-						eventAttrTruncatedCount := processAttributes(event.Attributes, maxSpanAttrSize)
-						truncatedCount.Event += eventAttrTruncatedCount
+						truncatedCount.Event += processAttributes(event.Attributes, maxSpanAttrSize, &truncationExample, "event")
 					}
 
 					for _, link := range span.Links {
-						linkAttrTruncatedCount := processAttributes(link.Attributes, maxSpanAttrSize)
-						truncatedCount.Link += linkAttrTruncatedCount
+						truncatedCount.Link += processAttributes(link.Attributes, maxSpanAttrSize, &truncationExample, "link")
 					}
 				}
 				traceID := span.TraceId
 				if !validation.ValidTraceID(traceID) {
-					return nil, nil, truncatedAttributesCount{}, status.Errorf(codes.InvalidArgument, "trace ids must be 128 bit, received %d bits", len(traceID)*8)
+					return nil, nil, truncatedAttributesCount{}, nil, status.Errorf(codes.InvalidArgument, "trace ids must be 128 bit, received %d bits", len(traceID)*8)
 				}
 
 				if !validation.ValidSpanID(span.SpanId) {
-					return nil, nil, truncatedAttributesCount{}, status.Errorf(codes.InvalidArgument, "span ids must be 64 bit and not all zero, received %d bits", len(span.SpanId)*8)
+					return nil, nil, truncatedAttributesCount{}, nil, status.Errorf(codes.InvalidArgument, "span ids must be 64 bit and not all zero, received %d bits", len(span.SpanId)*8)
 				}
 
 				traceKey := util.HashForTraceID(traceID)
@@ -846,21 +870,32 @@ func requestsByTraceID(batches []*v1.ResourceSpans, userID string, spanCount, ma
 		traces = append(traces, tr)
 	}
 
-	return ringTokens, traces, truncatedCount, nil
+	if truncationExample.origSize > 0 {
+		return ringTokens, traces, truncatedCount, &truncationExample, nil
+	}
+	return ringTokens, traces, truncatedCount, nil, nil
 }
 
-// find and truncate the span attributes that are too large
-func processAttributes(attributes []*v1_common.KeyValue, maxAttrSize int) int {
+// processAttributes finds and truncates attribute keys/values that exceed maxAttrSize.
+func processAttributes(attributes []*v1_common.KeyValue, maxAttrSize int, truncationExample *truncatedAttrInfo, scope string) int {
 	count := 0
 	for _, attr := range attributes {
 		if len(attr.Key) > maxAttrSize {
+			origSize := len(attr.Key)
 			attr.Key = attr.Key[:maxAttrSize]
+			if truncationExample != nil && truncationExample.origSize == 0 { // only capture the first truncation
+				// name is the truncated prefix; origSize records the full original length.
+				*truncationExample = truncatedAttrInfo{scope: scope, name: attr.Key, field: "key", origSize: origSize}
+			}
 			count++
 		}
 
 		switch value := attr.GetValue().Value.(type) {
 		case *v1_common.AnyValue_StringValue:
 			if len(value.StringValue) > maxAttrSize {
+				if truncationExample != nil && truncationExample.origSize == 0 { // only capture the first truncation
+					*truncationExample = truncatedAttrInfo{scope: scope, name: attr.Key, field: "value", origSize: len(value.StringValue)}
+				}
 				value.StringValue = value.StringValue[:maxAttrSize]
 				count++
 			}
diff --git a/modules/distributor/distributor_test.go b/modules/distributor/distributor_test.go
@@ -941,7 +941,7 @@ func TestRequestsByTraceID(t *testing.T) {
 			if tt.emptyTenant {
 				tenant = ""
 			}
-			ringTokens, rebatchedTraces, _, err := requestsByTraceID(tt.batches, tenant, 1, 1000)
+			ringTokens, rebatchedTraces, _, _, err := requestsByTraceID(tt.batches, tenant, 1, 1000)
 			require.Equal(t, len(ringTokens), len(rebatchedTraces))
 
 			for i, expectedID := range tt.expectedIDs {
@@ -1023,7 +1023,7 @@ func TestProcessAttributes(t *testing.T) {
 		},
 	}
 
-	_, rebatchedTrace, truncatedCount, _ := requestsByTraceID(trace.ResourceSpans, "test", spanCount*batchCount, maxAttrByte)
+	_, rebatchedTrace, truncatedCount, _, _ := requestsByTraceID(trace.ResourceSpans, "test", spanCount*batchCount, maxAttrByte)
 	// 2 at resource level, 2 at span level, 2 at event level, 2 at link level, 2 at scope level
 	assert.Equal(t, 10, truncatedCount.Total())
 	assert.Equal(t, 2, truncatedCount.Resource)
@@ -1092,6 +1092,105 @@ func TestProcessAttributes(t *testing.T) {
 	}
 }
 
+func TestRequestsByTraceID_TruncationDetail(t *testing.T) {
+	longString := strings.Repeat("t", 5000) // 5000 bytes, well above maxAttrByte
+	maxAttrByte := 1000
+
+	// No truncation — detail should be nil
+	trace := test.MakeTraceWithSpanCount(1, 1, []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10})
+	_, _, truncatedCount, detail, err := requestsByTraceID(trace.ResourceSpans, "test", 1, maxAttrByte)
+	require.NoError(t, err)
+	assert.Equal(t, 0, truncatedCount.Total())
+	assert.Nil(t, detail)
+
+	// With truncation — detail describes the oversized resource attribute value
+	trace = test.MakeTraceWithSpanCount(1, 1, []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10})
+	trace.ResourceSpans[0].Resource.Attributes = append(trace.ResourceSpans[0].Resource.Attributes,
+		test.MakeAttribute("oversized", longString))
+	_, _, truncatedCount, detail, err = requestsByTraceID(trace.ResourceSpans, "test", 1, maxAttrByte)
+	require.NoError(t, err)
+	assert.Greater(t, truncatedCount.Total(), 0)
+	require.NotNil(t, detail)
+	assert.Equal(t, "resource", detail.scope)
+	assert.Equal(t, "value", detail.field)
+	assert.Equal(t, "oversized", detail.name)
+	assert.Equal(t, 5000, detail.origSize)
+
+	// maxSpanAttrSize == 0 skips the size checks entirely — no truncation, no detail
+	trace = test.MakeTraceWithSpanCount(1, 1, []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10})
+	trace.ResourceSpans[0].Resource.Attributes = append(trace.ResourceSpans[0].Resource.Attributes,
+		test.MakeAttribute("oversized", longString))
+	_, _, truncatedCount, detail, err = requestsByTraceID(trace.ResourceSpans, "test", 1, 0)
+	require.NoError(t, err)
+	assert.Equal(t, 0, truncatedCount.Total())
+	assert.Nil(t, detail)
+}
+
+func TestProcessAttributesDetail(t *testing.T) {
+	// Nil detail pointer — truncation still happens, no example is recorded
+	attributes := []*v1_common.KeyValue{
+		test.MakeAttribute("key", strings.Repeat("v", 5000)),
+	}
+	count := processAttributes(attributes, 2048, nil, "span")
+	assert.Equal(t, 1, count)
+	assert.Equal(t, 2048, len(attributes[0].Value.GetStringValue()))
+
+	// Value truncation — detail captured deterministically
+	attributes = []*v1_common.KeyValue{
+		test.MakeAttribute("key", strings.Repeat("v", 5000)),
+	}
+	detail := truncatedAttrInfo{}
+	count = processAttributes(attributes, 2048, &detail, "span")
+	assert.Equal(t, 1, count)
+	assert.Equal(t, "key", detail.name)
+	assert.Equal(t, "span", detail.scope)
+	assert.Equal(t, "value", detail.field)
+	assert.Equal(t, 5000, detail.origSize)
+
+	// Key truncation — detail captured deterministically
+	attributes = []*v1_common.KeyValue{
+		test.MakeAttribute(strings.Repeat("k", 5000), "short"),
+	}
+	detail = truncatedAttrInfo{}
+	count = processAttributes(attributes, 2048, &detail, "resource")
+	assert.Equal(t, 1, count)
+	assert.Equal(t, "key", detail.field)
+	assert.Equal(t, "resource", detail.scope)
+	assert.Equal(t, 5000, detail.origSize)
+	assert.Equal(t, strings.Repeat("k", 2048), detail.name) // truncated prefix, not full original
+
+	// Only the first truncation is captured (first of two values)
+	attributes = []*v1_common.KeyValue{
+		test.MakeAttribute("key1", strings.Repeat("v", 5000)),
+		test.MakeAttribute("key2", strings.Repeat("v", 6000)),
+	}
+	detail = truncatedAttrInfo{}
+	count = processAttributes(attributes, 2048, &detail, "span")
+	assert.Equal(t, 2, count) // both values are still truncated and counted
+	assert.Equal(t, "key1", detail.name)
+	assert.Equal(t, 5000, detail.origSize)
+
+	// Both key AND value oversized — key wins (checked first)
+	attributes = []*v1_common.KeyValue{
+		test.MakeAttribute(strings.Repeat("k", 5000), strings.Repeat("v", 6000)),
+	}
+	detail = truncatedAttrInfo{}
+	count = processAttributes(attributes, 2048, &detail, "span")
+	assert.Equal(t, 2, count) // key and value each count as one truncation
+	assert.Equal(t, "key", detail.field)
+	assert.Equal(t, 5000, detail.origSize)
+
+	// Already-captured detail (origSize > 0) is not overwritten
+	detail = truncatedAttrInfo{scope: "resource", name: "first", field: "value", origSize: 3000}
+	attributes = []*v1_common.KeyValue{
+		test.MakeAttribute("key2", strings.Repeat("v", 6000)),
+	}
+	count = processAttributes(attributes, 2048, &detail, "span")
+	assert.Equal(t, 1, count)
+	assert.Equal(t, "first", detail.name)
+	assert.Equal(t, 3000, detail.origSize)
+}
+
 func BenchmarkTestsByRequestID(b *testing.B) {
 	spansPer := 5000
 	batches := 100
@@ -1114,7 +1213,7 @@ func BenchmarkTestsByRequestID(b *testing.B) {
 
 	for i := 0; i < b.N; i++ {
 		for _, blerg := range ils {
-			_, _, _, err := requestsByTraceID([]*v1.ResourceSpans{
+			_, _, _, _, err := requestsByTraceID([]*v1.ResourceSpans{
 				{
 					ScopeSpans: blerg,
 				},
@@ -2424,7 +2523,7 @@ func TestRequestsByTraceID_SpanIDValidation(t *testing.T) {
 				},
 			},
 		}
-		_, _, _, err := requestsByTraceID(batches, "test-tenant", 1, 1000)
+		_, _, _, _, err := requestsByTraceID(batches, "test-tenant", 1, 1000)
 		require.Error(t, err)
 		require.Contains(t, err.Error(), "span ids must be 64 bit")
 	}
@@ -2444,7 +2543,7 @@ func TestRequestsByTraceID_SpanIDValidation(t *testing.T) {
 			},
 		},
 	}
-	_, _, _, err := requestsByTraceID(batches, "test-tenant", 1, 1000)
+	_, _, _, _, err := requestsByTraceID(batches, "test-tenant", 1, 1000)
 	require.NoError(t, err)
 }
 
diff --git a/modules/distributor/forwarder_test.go b/modules/distributor/forwarder_test.go
@@ -30,7 +30,7 @@ func TestForwarder(t *testing.T) {
 	require.NoError(t, err)
 
 	b := test.MakeBatch(10, id)
-	keys, rebatchedTraces, _, err := requestsByTraceID([]*v1.ResourceSpans{b}, tenantID, 10, 1000)
+	keys, rebatchedTraces, _, _, err := requestsByTraceID([]*v1.ResourceSpans{b}, tenantID, 10, 1000)
 	require.NoError(t, err)
 
 	o, err := overrides.NewOverrides(oCfg, nil, prometheus.DefaultRegisterer)
@@ -81,7 +81,7 @@ func TestForwarder_shutdown(t *testing.T) {
 	require.NoError(t, err)
 
 	b := test.MakeBatch(10, id)
-	keys, rebatchedTraces, _, err := requestsByTraceID([]*v1.ResourceSpans{b}, tenantID, 10, 1000)
+	keys, rebatchedTraces, _, _, err := requestsByTraceID([]*v1.ResourceSpans{b}, tenantID, 10, 1000)
 	require.NoError(t, err)
 
 	o, err := overrides.NewOverrides(oCfg, nil, prometheus.DefaultRegisterer)