Optimize Observability return types in Prometheus exporter (#7410)

MrAlias · web-flow · commit ac8d8e92e39c · 2025-09-25T11:39:53.000-07:00
Do not allocate a return function from `ExportMetrics`,
`RecordCollectionDuration`, or `RecordOperationDuration` to the heap.
Use the added `ExportOp` or `Timer` type instead.

### Benchmarks

#### `prometheus`

```
goos: linux
goarch: amd64
pkg: go.opentelemetry.io/otel/exporters/prometheus
cpu: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
                                     │ main.bmark.result │  prom-optimize-observ.bmark.result  │
                                     │      sec/op       │    sec/op     vs base               │
Collect1/ObservabilityDisabled-8            27.59µ ±  7%   27.55µ ±  2%       ~ (p=0.631 n=10)
Collect1/ObservabilityEnabled-8             29.23µ ±  1%   27.25µ ±  7%  -6.78% (p=0.004 n=10)
Collect10/ObservabilityDisabled-8           70.75µ ±  3%   66.81µ ±  4%  -5.57% (p=0.003 n=10)
Collect10/ObservabilityEnabled-8            75.41µ ±  5%   71.13µ ±  5%  -5.68% (p=0.002 n=10)
Collect100/ObservabilityDisabled-8          420.7µ ±  4%   425.4µ ±  6%       ~ (p=0.912 n=10)
Collect100/ObservabilityEnabled-8           432.3µ ±  3%   422.2µ ±  5%       ~ (p=0.105 n=10)
Collect1000/ObservabilityDisabled-8         3.929m ± 31%   3.808m ±  2%  -3.09% (p=0.001 n=10)
Collect1000/ObservabilityEnabled-8          4.150m ±  1%   3.964m ±  4%  -4.48% (p=0.003 n=10)
Collect10000/ObservabilityDisabled-8        37.64m ±  6%   37.52m ±  2%       ~ (p=0.739 n=10)
Collect10000/ObservabilityEnabled-8         39.46m ±  2%   39.81m ± 19%       ~ (p=0.436 n=10)
geomean                                     672.6µ         654.6µ        -2.68%

                                     │ main.bmark.result │  prom-optimize-observ.bmark.result  │
                                     │       B/op        │     B/op      vs base               │
Collect1/ObservabilityDisabled-8            34.40Ki ± 0%   34.40Ki ± 0%       ~ (p=0.075 n=10)
Collect1/ObservabilityEnabled-8             34.64Ki ± 0%   34.43Ki ± 0%  -0.60% (p=0.000 n=10)
Collect10/ObservabilityDisabled-8           47.26Ki ± 0%   47.25Ki ± 0%       ~ (p=0.093 n=10)
Collect10/ObservabilityEnabled-8            47.93Ki ± 0%   47.30Ki ± 0%  -1.33% (p=0.000 n=10)
Collect100/ObservabilityDisabled-8          191.3Ki ± 0%   191.0Ki ± 0%       ~ (p=0.218 n=10)
Collect100/ObservabilityEnabled-8           197.0Ki ± 0%   191.6Ki ± 0%  -2.74% (p=0.000 n=10)
Collect1000/ObservabilityDisabled-8         1.902Mi ± 1%   1.891Mi ± 1%       ~ (p=0.353 n=10)
Collect1000/ObservabilityEnabled-8          1.935Mi ± 2%   1.889Mi ± 1%  -2.38% (p=0.000 n=10)
Collect10000/ObservabilityDisabled-8        17.67Mi ± 4%   18.17Mi ± 5%       ~ (p=0.190 n=10)
Collect10000/ObservabilityEnabled-8         18.62Mi ± 4%   17.98Mi ± 6%  -3.42% (p=0.035 n=10)
geomean                                     410.4Ki        406.9Ki       -0.85%

                                     │ main.bmark.result │  prom-optimize-observ.bmark.result   │
                                     │     allocs/op     │  allocs/op   vs base                 │
Collect1/ObservabilityDisabled-8              61.00 ± 0%    61.00 ± 0%       ~ (p=1.000 n=10) ¹
Collect1/ObservabilityEnabled-8               64.00 ± 0%    61.00 ± 0%  -4.69% (p=0.000 n=10)
Collect10/ObservabilityDisabled-8             410.0 ± 0%    410.0 ± 0%       ~ (p=1.000 n=10) ¹
Collect10/ObservabilityEnabled-8              423.0 ± 0%    411.0 ± 0%  -2.84% (p=0.000 n=10)
Collect100/ObservabilityDisabled-8           3.874k ± 0%   3.874k ± 0%       ~ (p=0.272 n=10)
Collect100/ObservabilityEnabled-8            3.979k ± 0%   3.876k ± 0%  -2.59% (p=0.000 n=10)
Collect1000/ObservabilityDisabled-8          38.90k ± 0%   38.88k ± 0%       ~ (p=0.306 n=10)
Collect1000/ObservabilityEnabled-8           39.88k ± 0%   38.88k ± 0%  -2.51% (p=0.000 n=10)
Collect10000/ObservabilityDisabled-8         387.9k ± 0%   388.9k ± 0%       ~ (p=0.138 n=10)
Collect10000/ObservabilityEnabled-8          399.0k ± 0%   388.6k ± 1%  -2.60% (p=0.000 n=10)
geomean                                      4.364k        4.298k       -1.52%
¹ all samples are equal
```

#### `prometheus/internal/observ`

```terminal
goos: linux
goarch: amd64
pkg: go.opentelemetry.io/otel/exporters/prometheus/internal/observ
cpu: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
                                                  │ main.bmark.result │  prom-optimize-observ.bmark.result  │
                                                  │      sec/op       │   sec/op     vs base                │
InstrumentationExportMetrics/NoError-8                   92.64n ±  7%   56.07n ± 5%  -39.48% (p=0.000 n=10)
InstrumentationExportMetrics/AllError-8                  664.6n ±  4%   579.9n ± 4%  -12.74% (p=0.000 n=10)
InstrumentationExportMetrics/PartialError-8              637.5n ± 10%   579.1n ± 6%   -9.15% (p=0.000 n=10)
InstrumentationRecordOperationDuration/NoError-8         148.3n ±  5%   109.9n ± 3%  -25.89% (p=0.000 n=10)
InstrumentationRecordOperationDuration/Error-8           709.9n ±  8%   613.5n ± 3%  -13.58% (p=0.000 n=10)
InstrumentationRecordCollectionDuration/NoError-8        150.9n ± 11%   114.1n ± 5%  -24.35% (p=0.000 n=10)
InstrumentationRecordCollectionDuration/Error-8          723.5n ± 12%   629.2n ± 2%  -13.04% (p=0.000 n=10)
geomean                                                  332.7n         264.8n       -20.42%

                                                  │ main.bmark.result │    prom-optimize-observ.bmark.result    │
                                                  │       B/op        │    B/op     vs base                     │
InstrumentationExportMetrics/NoError-8                     48.00 ± 0%    0.00 ± 0%  -100.00% (p=0.000 n=10)
InstrumentationExportMetrics/AllError-8                    264.0 ± 0%   216.0 ± 0%   -18.18% (p=0.000 n=10)
InstrumentationExportMetrics/PartialError-8                264.0 ± 0%   216.0 ± 0%   -18.18% (p=0.000 n=10)
InstrumentationRecordOperationDuration/NoError-8           80.00 ± 0%    0.00 ± 0%  -100.00% (p=0.000 n=10)
InstrumentationRecordOperationDuration/Error-8             296.0 ± 0%   216.0 ± 0%   -27.03% (p=0.000 n=10)
InstrumentationRecordCollectionDuration/NoError-8          80.00 ± 0%    0.00 ± 0%  -100.00% (p=0.000 n=10)
InstrumentationRecordCollectionDuration/Error-8            296.0 ± 0%   216.0 ± 0%   -27.03% (p=0.000 n=10)
geomean                                                    152.0                    ?                       ¹ ²
¹ summaries must be >0 to compute geomean
² ratios must be >0 to compute geomean

                                                  │ main.bmark.result │    prom-optimize-observ.bmark.result    │
                                                  │     allocs/op     │ allocs/op   vs base                     │
InstrumentationExportMetrics/NoError-8                     1.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=10)
InstrumentationExportMetrics/AllError-8                    3.000 ± 0%   2.000 ± 0%   -33.33% (p=0.000 n=10)
InstrumentationExportMetrics/PartialError-8                3.000 ± 0%   2.000 ± 0%   -33.33% (p=0.000 n=10)
InstrumentationRecordOperationDuration/NoError-8           1.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=10)
InstrumentationRecordOperationDuration/Error-8             3.000 ± 0%   2.000 ± 0%   -33.33% (p=0.000 n=10)
InstrumentationRecordCollectionDuration/NoError-8          1.000 ± 0%   0.000 ± 0%  -100.00% (p=0.000 n=10)
InstrumentationRecordCollectionDuration/Error-8            3.000 ± 0%   2.000 ± 0%   -33.33% (p=0.000 n=10)
geomean                                                    1.873                    ?                       ¹ ²
¹ summaries must be >0 to compute geomean
² ratios must be >0 to compute geomean
```
diff --git a/exporters/prometheus/benchmark_test.go b/exporters/prometheus/benchmark_test.go
@@ -13,26 +13,37 @@ import (
 	"go.opentelemetry.io/otel/sdk/metric"
 )
 
-func benchmarkCollect(b *testing.B, n int) {
-	ctx := b.Context()
-	registry := prometheus.NewRegistry()
-	exporter, err := New(WithRegisterer(registry))
-	require.NoError(b, err)
-	provider := metric.NewMeterProvider(metric.WithReader(exporter))
-	meter := provider.Meter("testmeter")
-
-	for i := range n {
-		counter, err := meter.Float64Counter(fmt.Sprintf("foo_%d", i))
+func run(n int) func(b *testing.B) {
+	return func(b *testing.B) {
+		ctx := b.Context()
+		registry := prometheus.NewRegistry()
+		exporter, err := New(WithRegisterer(registry))
 		require.NoError(b, err)
-		counter.Add(ctx, float64(i))
+		provider := metric.NewMeterProvider(metric.WithReader(exporter))
+		meter := provider.Meter("testmeter")
+
+		for i := range n {
+			counter, err := meter.Float64Counter(fmt.Sprintf("foo_%d", i))
+			require.NoError(b, err)
+			counter.Add(ctx, float64(i))
+		}
+
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			_, err := registry.Gather()
+			require.NoError(b, err)
+		}
 	}
+}
 
-	b.ReportAllocs()
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		_, err := registry.Gather()
-		require.NoError(b, err)
-	}
+func benchmarkCollect(b *testing.B, n int) {
+	b.Run("ObservabilityDisabled", run(n))
+	b.Run("ObservabilityEnabled", func(b *testing.B) {
+		b.Setenv("OTEL_GO_X_OBSERVABILITY", "true")
+		bmark := run(n)
+		bmark(b)
+	})
 }
 
 func BenchmarkCollect1(b *testing.B)     { benchmarkCollect(b, 1) }
diff --git a/exporters/prometheus/exporter.go b/exporters/prometheus/exporter.go
@@ -166,16 +166,16 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
 	ctx := context.TODO()
 
 	if c.inst != nil {
-		endOp := c.inst.RecordOperationDuration(ctx)
-		defer func() { endOp(err) }()
+		timer := c.inst.RecordOperationDuration(ctx)
+		defer func() { timer.Stop(err) }()
 	}
 
 	metrics := metricsPool.Get().(*metricdata.ResourceMetrics)
 	defer metricsPool.Put(metrics)
 
 	endCollection := func(error) {}
 	if c.inst != nil {
-		endCollection = c.inst.RecordCollectionDuration(ctx)
+		endCollection = c.inst.RecordCollectionDuration(ctx).Stop
 	}
 	err = c.reader.Collect(ctx, metrics)
 	endCollection(err)
@@ -366,8 +366,8 @@ func addExponentialHistogramMetric[N int64 | float64](
 	var err error
 	var success int64
 	if inst != nil {
-		end := inst.ExportMetrics(ctx, int64(len(histogram.DataPoints)))
-		defer func() { end(success, err) }()
+		op := inst.ExportMetrics(ctx, int64(len(histogram.DataPoints)))
+		defer func() { op.End(success, err) }()
 	}
 
 	for j, dp := range histogram.DataPoints {
@@ -467,8 +467,8 @@ func addHistogramMetric[N int64 | float64](
 	var err error
 	var success int64
 	if inst != nil {
-		end := inst.ExportMetrics(ctx, int64(len(histogram.DataPoints)))
-		defer func() { end(success, err) }()
+		op := inst.ExportMetrics(ctx, int64(len(histogram.DataPoints)))
+		defer func() { op.End(success, err) }()
 	}
 
 	for j, dp := range histogram.DataPoints {
@@ -515,8 +515,8 @@ func addSumMetric[N int64 | float64](
 	var err error
 	var success int64
 	if inst != nil {
-		end := inst.ExportMetrics(ctx, int64(len(sum.DataPoints)))
-		defer func() { end(success, err) }()
+		op := inst.ExportMetrics(ctx, int64(len(sum.DataPoints)))
+		defer func() { op.End(success, err) }()
 	}
 
 	valueType := prometheus.CounterValue
@@ -565,8 +565,8 @@ func addGaugeMetric[N int64 | float64](
 	var err error
 	var success int64
 	if inst != nil {
-		end := inst.ExportMetrics(ctx, int64(len(gauge.DataPoints)))
-		defer func() { end(success, err) }()
+		op := inst.ExportMetrics(ctx, int64(len(gauge.DataPoints)))
+		defer func() { op.End(success, err) }()
 	}
 
 	for i, dp := range gauge.DataPoints {
diff --git a/exporters/prometheus/internal/observ/instrumentation.go b/exporters/prometheus/internal/observ/instrumentation.go
@@ -144,81 +144,106 @@ func NewInstrumentation(id int64) (*Instrumentation, error) {
 	return i, err
 }
 
-// RecordDurationDone is a function that is called when a call to an Exporters'
-// RecordOperationDuration or RecordCollectionDuration completes.
+// RecordOperationDuration starts the timing of an operation.
 //
-// Any error that is encountered is provided as err.
-type RecordDurationDone func(error)
-
-func (i *Instrumentation) RecordOperationDuration(ctx context.Context) RecordDurationDone {
-	return i.recordDuration(ctx, i.operationDuration)
+// It returns a [Timer] that tracks the operation duration. The [Timer.Stop]
+// method must be called when the operation completes.
+func (i *Instrumentation) RecordOperationDuration(ctx context.Context) Timer {
+	return Timer{
+		ctx:   ctx,
+		start: time.Now(),
+		inst:  i,
+		hist:  i.operationDuration,
+	}
 }
 
-func (i *Instrumentation) RecordCollectionDuration(ctx context.Context) RecordDurationDone {
-	return i.recordDuration(ctx, i.collectionDuration)
+// RecordCollectionDuration starts the timing of a collection operation.
+//
+// It returns a [Timer] that tracks the collection duration. The [Timer.Stop]
+// method must be called when the collection completes.
+func (i *Instrumentation) RecordCollectionDuration(ctx context.Context) Timer {
+	return Timer{
+		ctx:   ctx,
+		start: time.Now(),
+		inst:  i,
+		hist:  i.collectionDuration,
+	}
 }
 
-func (i *Instrumentation) recordDuration(
-	ctx context.Context,
-	h metric.Float64Histogram,
-) RecordDurationDone {
-	start := time.Now()
+// Timer tracks the duration of an operation.
+type Timer struct {
+	ctx   context.Context
+	start time.Time
 
-	return func(err error) {
-		recordOpt := get[metric.RecordOption](recordOptPool)
-		defer put(recordOptPool, recordOpt)
-		*recordOpt = append(*recordOpt, i.setOpt)
-
-		if err != nil {
-			attrs := get[attribute.KeyValue](measureAttrsPool)
-			defer put(measureAttrsPool, attrs)
-			*attrs = append(*attrs, i.attrs...)
-			*attrs = append(*attrs, semconv.ErrorType(err))
-
-			set := attribute.NewSet(*attrs...)
-			*recordOpt = append((*recordOpt)[:0], metric.WithAttributeSet(set))
-		}
+	inst *Instrumentation
+	hist metric.Float64Histogram
+}
 
-		h.Record(ctx, time.Since(start).Seconds(), *recordOpt...)
+// Stop ends the timing operation and records the elapsed duration.
+//
+// If err is non-nil, an appropriate error type attribute will be included.
+func (t Timer) Stop(err error) {
+	recordOpt := get[metric.RecordOption](recordOptPool)
+	defer put(recordOptPool, recordOpt)
+	*recordOpt = append(*recordOpt, t.inst.setOpt)
+
+	if err != nil {
+		attrs := get[attribute.KeyValue](measureAttrsPool)
+		defer put(measureAttrsPool, attrs)
+		*attrs = append(*attrs, t.inst.attrs...)
+		*attrs = append(*attrs, semconv.ErrorType(err))
+
+		set := attribute.NewSet(*attrs...)
+		*recordOpt = append((*recordOpt)[:0], metric.WithAttributeSet(set))
 	}
+
+	t.hist.Record(t.ctx, time.Since(t.start).Seconds(), *recordOpt...)
 }
 
-// ExportMetricsDone is a function that is called when a call to an Exporter's
-// export methods completes.
+// ExportMetrics starts the observation of a metric export operation.
 //
-// The number of successful exports is provided as success. Any error that is
-// encountered is provided as err.
-type ExportMetricsDone func(success int64, err error)
-
-func (i *Instrumentation) ExportMetrics(ctx context.Context, n int64) ExportMetricsDone {
+// It returns an [ExportOp] that tracks the export operation. The
+// [ExportOp.End] method must be called when the export completes.
+func (i *Instrumentation) ExportMetrics(ctx context.Context, n int64) ExportOp {
 	addOpt := get[metric.AddOption](addOptPool)
 	defer put(addOptPool, addOpt)
 	*addOpt = append(*addOpt, i.setOpt)
 
 	i.inflightMetric.Add(ctx, n, *addOpt...)
 
-	return i.end(ctx, n)
+	return ExportOp{ctx: ctx, nMetrics: n, inst: i}
+}
+
+// ExportOp tracks a metric export operation.
+type ExportOp struct {
+	ctx      context.Context
+	nMetrics int64
+
+	inst *Instrumentation
 }
 
-func (i *Instrumentation) end(ctx context.Context, n int64) ExportMetricsDone {
-	return func(success int64, err error) {
-		addOpt := get[metric.AddOption](addOptPool)
-		defer put(addOptPool, addOpt)
-		*addOpt = append(*addOpt, i.setOpt)
+// End ends the observation of a metric export operation.
+//
+// The success parameter is the number of metrics that were successfully
+// exported. If a non-nil error is provided, the number of failed metrics will
+// be recorded with the error type attribute.
+func (e ExportOp) End(success int64, err error) {
+	addOpt := get[metric.AddOption](addOptPool)
+	defer put(addOptPool, addOpt)
+	*addOpt = append(*addOpt, e.inst.setOpt)
 
-		i.inflightMetric.Add(ctx, -n, *addOpt...)
-		i.exportedMetric.Add(ctx, success, *addOpt...)
+	e.inst.inflightMetric.Add(e.ctx, -e.nMetrics, *addOpt...)
+	e.inst.exportedMetric.Add(e.ctx, success, *addOpt...)
 
-		if err != nil {
-			attrs := get[attribute.KeyValue](measureAttrsPool)
-			defer put(measureAttrsPool, attrs)
-			*attrs = append(*attrs, i.attrs...)
-			*attrs = append(*attrs, semconv.ErrorType(err))
+	if err != nil {
+		attrs := get[attribute.KeyValue](measureAttrsPool)
+		defer put(measureAttrsPool, attrs)
+		*attrs = append(*attrs, e.inst.attrs...)
+		*attrs = append(*attrs, semconv.ErrorType(err))
 
-			set := attribute.NewSet(*attrs...)
+		set := attribute.NewSet(*attrs...)
 
-			*addOpt = append((*addOpt)[:0], metric.WithAttributeSet(set))
-			i.exportedMetric.Add(ctx, n-success, *addOpt...)
-		}
+		*addOpt = append((*addOpt)[:0], metric.WithAttributeSet(set))
+		e.inst.exportedMetric.Add(e.ctx, e.nMetrics-success, *addOpt...)
 	}
 }
diff --git a/exporters/prometheus/internal/observ/instrumentation_test.go b/exporters/prometheus/internal/observ/instrumentation_test.go