Skip to content

Commit 5f42d7c

Browse files
committed
add experimental batch span processor metrics
1 parent 5a888fc commit 5f42d7c

File tree

4 files changed

+101
-3
lines changed

4 files changed

+101
-3
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ The next release will require at least [Go 1.23].
1313

1414
### Added
1515

16+
- Add `OTEL_GO_X_SELF_OBSERVABILITY` environment variable to control whether self-observability metrics and traces are produced by SDKs. (#TODO)
17+
- Add experimental `otel.sdk.processor.span.queue.size`, `otel.sdk.processor.span.queue.capacity`, and `otel.sdk.processor.span.processed.count` metrics to the trace batch span processor in `go.opentelemetry.io/otel/sdk/trace`. (#TODO)
18+
19+
### Fixed
20+
1621
- Add `ValueFromAttribute` and `KeyValueFromAttribute` in `go.opentelemetry.io/otel/log`. (#6180)
1722
- Add `EventName` and `SetEventName` to `Record` in `go.opentelemetry.io/otel/log`. (#6187)
1823
- Add `EventName` to `RecordFactory` in `go.opentelemetry.io/otel/log/logtest`. (#6187)

sdk/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ require (
1010
github.com/google/uuid v1.6.0
1111
github.com/stretchr/testify v1.10.0
1212
go.opentelemetry.io/otel v1.34.0
13+
go.opentelemetry.io/otel/metric v1.34.0
1314
go.opentelemetry.io/otel/trace v1.34.0
1415
go.uber.org/goleak v1.3.0
1516
golang.org/x/sys v0.30.0
@@ -20,7 +21,6 @@ require (
2021
github.com/go-logr/stdr v1.2.2 // indirect
2122
github.com/pmezard/go-difflib v1.0.0 // indirect
2223
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
23-
go.opentelemetry.io/otel/metric v1.34.0 // indirect
2424
gopkg.in/yaml.v3 v3.0.1 // indirect
2525
)
2626

sdk/trace/batch_span_processor.go

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,20 @@ package trace // import "go.opentelemetry.io/otel/sdk/trace"
55

66
import (
77
"context"
8+
"errors"
9+
"fmt"
810
"sync"
911
"sync/atomic"
1012
"time"
1113

1214
"go.opentelemetry.io/otel"
15+
"go.opentelemetry.io/otel/attribute"
1316
"go.opentelemetry.io/otel/internal/global"
17+
"go.opentelemetry.io/otel/metric"
18+
"go.opentelemetry.io/otel/metric/noop"
1419
"go.opentelemetry.io/otel/sdk/internal/env"
20+
"go.opentelemetry.io/otel/sdk/internal/x"
21+
semconv "go.opentelemetry.io/otel/semconv/v1.30.0"
1522
"go.opentelemetry.io/otel/trace"
1623
)
1724

@@ -21,6 +28,19 @@ const (
2128
DefaultScheduleDelay = 5000
2229
DefaultExportTimeout = 30000
2330
DefaultMaxExportBatchSize = 512
31+
32+
queueSizeMetricName = "otel.sdk.processor.span.queue.size"
33+
queueSizeMetricDescription = "The number of spans in the queue of a given instance of an SDK span processor"
34+
queueCapacityMetricName = "otel.sdk.processor.span.queue.capacity"
35+
queueCapacityMetricDescription = "The maximum number of spans the queue of a given instance of an SDK span processor can hold"
36+
spansProcessedMetricName = "otel.sdk.processor.span.processed.count"
37+
spansProcessedMetricDescription = "The number of spans for which the processing has finished, either successful or failed"
38+
spanCountUnit = "{span}"
39+
)
40+
41+
var (
	// componentTypeKey identifies the kind of SDK component emitting the
	// metric (here always "batching_span_processor").
	componentTypeKey = attribute.Key("otel.component.type")
	// componentNameKey uniquely names a component instance within this
	// process (e.g. "batching_span_processor/0").
	componentNameKey = attribute.Key("otel.component.name")
)
2545

2646
// BatchSpanProcessorOption configures a BatchSpanProcessor.
@@ -66,6 +86,12 @@ type batchSpanProcessor struct {
6686
queue chan ReadOnlySpan
6787
dropped uint32
6888

89+
callbackRegistration metric.Registration
90+
spansProcessedCounter metric.Int64Counter
91+
successAttributes metric.MeasurementOption
92+
alreadyShutdownAttributes metric.MeasurementOption
93+
queueFullAttributes metric.MeasurementOption
94+
6995
batch []ReadOnlySpan
7096
batchMutex sync.Mutex
7197
timer *time.Timer
@@ -111,6 +137,8 @@ func NewBatchSpanProcessor(exporter SpanExporter, options ...BatchSpanProcessorO
111137
stopCh: make(chan struct{}),
112138
}
113139

140+
bsp.configureSelfObservability()
141+
114142
bsp.stopWait.Add(1)
115143
go func() {
116144
defer bsp.stopWait.Done()
@@ -121,13 +149,74 @@ func NewBatchSpanProcessor(exporter SpanExporter, options ...BatchSpanProcessorO
121149
return bsp
122150
}
123151

152+
// processorID is a process-wide counter used to derive a unique
// identifier for each batch span processor instance.
var processorID atomic.Uint64

// nextProcessorID returns an identifier for this batch span processor,
// starting with 0 and incrementing by 1 each time it is called.
func nextProcessorID() int64 {
	// Add returns the new (post-increment) value, so subtract one to
	// yield a zero-based sequence: 0, 1, 2, ...
	id := processorID.Add(1)
	return int64(id - 1)
}
159+
160+
// configureSelfObservability configures metrics for the batch span processor.
161+
func (bsp *batchSpanProcessor) configureSelfObservability() {
162+
mp := otel.GetMeterProvider()
163+
if !x.SelfObservability.Enabled() {
164+
mp = metric.MeterProvider(noop.NewMeterProvider())
165+
}
166+
meter := mp.Meter(
167+
selfObsScopeName,
168+
metric.WithInstrumentationVersion(version()),
169+
)
170+
171+
queueCapacityUpDownCounter, err := meter.Int64ObservableUpDownCounter(queueCapacityMetricName,
172+
metric.WithUnit(spanCountUnit),
173+
metric.WithDescription(queueCapacityMetricDescription),
174+
)
175+
if err != nil {
176+
otel.Handle(err)
177+
}
178+
queueSizeUpDownCounter, err := meter.Int64ObservableUpDownCounter(queueSizeMetricName,
179+
metric.WithUnit(spanCountUnit),
180+
metric.WithDescription(queueSizeMetricDescription),
181+
)
182+
if err != nil {
183+
otel.Handle(err)
184+
}
185+
bsp.spansProcessedCounter, err = meter.Int64Counter(spansProcessedMetricName,
186+
metric.WithUnit(spanCountUnit),
187+
metric.WithDescription(spansProcessedMetricDescription),
188+
)
189+
if err != nil {
190+
otel.Handle(err)
191+
}
192+
193+
componentTypeAttr := componentTypeKey.String("batching_span_processor")
194+
componentNameAttr := componentNameKey.String(fmt.Sprintf("batching_span_processor/%d", nextProcessorID()))
195+
bsp.successAttributes = metric.WithAttributes(componentNameAttr, componentTypeAttr, semconv.ErrorTypeKey.String(""))
196+
bsp.alreadyShutdownAttributes = metric.WithAttributes(componentNameAttr, componentTypeAttr, semconv.ErrorTypeKey.String("already_shutdown"))
197+
bsp.queueFullAttributes = metric.WithAttributes(componentNameAttr, componentTypeAttr, semconv.ErrorTypeKey.String("queue_full"))
198+
callabckAttributesOpt := metric.WithAttributes(componentNameAttr, componentTypeAttr)
199+
bsp.callbackRegistration, err = meter.RegisterCallback(
200+
func(ctx context.Context, o metric.Observer) error {
201+
o.ObserveInt64(queueSizeUpDownCounter, int64(len(bsp.queue)), callabckAttributesOpt)
202+
o.ObserveInt64(queueCapacityUpDownCounter, int64(bsp.o.MaxQueueSize), callabckAttributesOpt)
203+
return nil
204+
},
205+
queueSizeUpDownCounter, queueCapacityUpDownCounter)
206+
if err != nil {
207+
otel.Handle(err)
208+
}
209+
}
210+
124211
// OnStart is a no-op for the batch processor; spans are only enqueued
// for export once they end (see OnEnd).
func (bsp *batchSpanProcessor) OnStart(parent context.Context, s ReadWriteSpan) {}
126213

127214
// OnEnd method enqueues a ReadOnlySpan for later processing.
128215
func (bsp *batchSpanProcessor) OnEnd(s ReadOnlySpan) {
216+
ctx := context.Background()
129217
// Do not enqueue spans after Shutdown.
130218
if bsp.stopped.Load() {
219+
bsp.spansProcessedCounter.Add(ctx, 1, bsp.alreadyShutdownAttributes)
131220
return
132221
}
133222

@@ -162,7 +251,7 @@ func (bsp *batchSpanProcessor) Shutdown(ctx context.Context) error {
162251
err = ctx.Err()
163252
}
164253
})
165-
return err
254+
return errors.Join(err, bsp.callbackRegistration.Unregister())
166255
}
167256

168257
type forceFlushSpan struct {
@@ -273,6 +362,7 @@ func (bsp *batchSpanProcessor) exportSpans(ctx context.Context) error {
273362

274363
if l := len(bsp.batch); l > 0 {
275364
global.Debug("exporting spans", "count", len(bsp.batch), "total_dropped", atomic.LoadUint32(&bsp.dropped))
365+
bsp.spansProcessedCounter.Add(ctx, int64(len(bsp.batch)), bsp.successAttributes)
276366
err := bsp.e.ExportSpans(ctx, bsp.batch)
277367

278368
// A new batch is always created after exporting, even if the batch failed to be exported.
@@ -381,11 +471,12 @@ func (bsp *batchSpanProcessor) enqueueBlockOnQueueFull(ctx context.Context, sd R
381471
case bsp.queue <- sd:
382472
return true
383473
case <-ctx.Done():
474+
bsp.spansProcessedCounter.Add(ctx, 1, bsp.queueFullAttributes)
384475
return false
385476
}
386477
}
387478

388-
func (bsp *batchSpanProcessor) enqueueDrop(_ context.Context, sd ReadOnlySpan) bool {
479+
func (bsp *batchSpanProcessor) enqueueDrop(ctx context.Context, sd ReadOnlySpan) bool {
389480
if !sd.SpanContext().IsSampled() {
390481
return false
391482
}
@@ -395,6 +486,7 @@ func (bsp *batchSpanProcessor) enqueueDrop(_ context.Context, sd ReadOnlySpan) b
395486
return true
396487
default:
397488
atomic.AddUint32(&bsp.dropped, 1)
489+
bsp.spansProcessedCounter.Add(ctx, 1, bsp.queueFullAttributes)
398490
}
399491
return false
400492
}

sdk/trace/provider.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020

2121
const (
2222
defaultTracerName = "go.opentelemetry.io/otel/sdk/tracer"
23+
selfObsScopeName = "go.opentelemetry.io/otel/sdk/trace"
2324
)
2425

2526
// tracerProviderConfig.

0 commit comments

Comments
 (0)