From 8d7fba51ad8e67e65b3f079479dfa448a1041ab6 Mon Sep 17 00:00:00 2001 From: raghu999 Date: Fri, 7 Nov 2025 22:03:20 -0500 Subject: [PATCH 1/4] Add ARC support --- .chloggen/arc-feature.yaml | 32 ++ exporter/exporterhelper/README.md | 14 + exporter/exporterhelper/documentation.md | 62 +++ .../exporterhelper/generated_package_test.go | 3 +- exporter/exporterhelper/go.mod | 11 +- exporter/exporterhelper/go.sum | 12 +- .../exporterhelper/internal/arc/controller.go | 437 ++++++++++++++++++ .../internal/arc/controller_metrics_test.go | 89 ++++ .../internal/arc/controller_test.go | 336 ++++++++++++++ exporter/exporterhelper/internal/arc/ewma.go | 37 ++ .../exporterhelper/internal/arc/ewma_test.go | 81 ++++ .../internal/experr/back_pressure.go | 80 ++++ .../internal/experr/back_pressure_test.go | 53 +++ .../internal/metadata/generated_telemetry.go | 79 ++++ .../metadatatest/generated_telemetrytest.go | 109 ++++- .../generated_telemetrytest_test.go | 34 ++ .../exporterhelper/internal/queue_sender.go | 84 +++- .../internal/queue_sender_test.go | 80 ++++ .../internal/queuebatch/config.go | 4 + exporter/exporterhelper/metadata.yaml | 81 +++- .../exporterhelper/xexporterhelper/go.mod | 10 +- .../exporterhelper/xexporterhelper/go.sum | 12 +- 22 files changed, 1712 insertions(+), 28 deletions(-) create mode 100644 .chloggen/arc-feature.yaml create mode 100644 exporter/exporterhelper/internal/arc/controller.go create mode 100644 exporter/exporterhelper/internal/arc/controller_metrics_test.go create mode 100644 exporter/exporterhelper/internal/arc/controller_test.go create mode 100644 exporter/exporterhelper/internal/arc/ewma.go create mode 100644 exporter/exporterhelper/internal/arc/ewma_test.go create mode 100644 exporter/exporterhelper/internal/experr/back_pressure.go create mode 100644 exporter/exporterhelper/internal/experr/back_pressure_test.go diff --git a/.chloggen/arc-feature.yaml b/.chloggen/arc-feature.yaml new file mode 100644 index 00000000000..5b7e8dee0ee --- 
/dev/null +++ b/.chloggen/arc-feature.yaml @@ -0,0 +1,32 @@ +change_type: enhancement +component: pkg/exporterhelper +note: > + Add Adaptive Request Concurrency (ARC) to exporterhelper: a controller that + dynamically adjusts exporter concurrency based on observed RTT and + backpressure signals. Integrates with the queue/batch send path and exposes + new telemetry. Disabled by default; no behavior change unless enabled. +issues: [14080] +subtext: | + Configuration (exporterhelper / queue-batch settings): + - enabled: turn ARC on/off (default: false) + - initial_limit: starting permits (default: 1) + - max_concurrency: upper bound (default: 200) + - decrease_ratio: multiplicative decrease factor on pressure (default: 0.9) + - ewma_alpha: smoothing for RTT EWMA (default: 0.4) + - deviation_scale: N·σ spike threshold for decrease (default: 2.5) + + Telemetry (new metrics): + - otelcol_exporter_arc_acquire_wait_ms (histogram; attrs: exporter, data_type) + - otelcol_exporter_arc_rtt_ms (histogram; attrs: exporter, data_type) + - otelcol_exporter_arc_failures (sum, monotonic; attrs: exporter, data_type) + - otelcol_exporter_arc_backoff_events (sum, monotonic; attrs: exporter, data_type) + - otelcol_exporter_arc_limit_changes (sum, monotonic; attrs: exporter, data_type, direction=up|down) + - otelcol_exporter_arc_limit (async gauge; attrs: exporter, data_type) + - otelcol_exporter_arc_permits_in_use (async gauge; attrs: exporter, data_type) + + Notes: + - ARC increases concurrency additively and decreases multiplicatively on + backpressure or RTT spikes (mean + deviation_scale·σ). + - Direction attribute values: "up", "down". 
+change_logs: [user] + diff --git a/exporter/exporterhelper/README.md b/exporter/exporterhelper/README.md index 4652383c6a2..ef8e0abebae 100644 --- a/exporter/exporterhelper/README.md +++ b/exporter/exporterhelper/README.md @@ -29,6 +29,7 @@ The following configuration options can be modified: - `bytes`: the size of serialized data in bytes (the least performant option). - `queue_size` (default = 1000): Maximum size the queue can accept. Measured in units defined by `sizer` - `batch`: see below. + - `arc`: see below. #### Sending queue batch settings @@ -50,6 +51,19 @@ Available `batch::sizer` options: - `items`: number of the smallest parts of each signal (spans, metric data points, log records); - `bytes`: the size of serialized data in bytes (the least performant option). + +#### Sending queue Adaptive Concurrency Limiter (ARC) Settings + +The Adaptive Concurrency Limiter (ARC) dynamically adjusts the number of concurrent requests (`num_consumers`) based on observed RTTs and backpressure signals. It aims to maximize throughput while minimizing errors and latency. It is disabled by default. + +- `arc` + - `enabled` (default = false): Set to `true` to enable ARC. + - `initial_limit` (default = 1): The starting concurrency limit. + - `max_concurrency` (default = 200): The maximum number of concurrent requests ARC will allow. + - `decrease_ratio` (default = 0.9): The multiplicative factor to apply when decreasing the limit (e.g., 0.9 = 10% decrease). + - `ewma_alpha` (default = 0.4): The smoothing factor for the EWMA (Exponentially Weighted Moving Average) of RTTs. + - `deviation_scale` (default = 2.5): The number of standard deviations from the mean RTT to tolerate before triggering a backoff. 
+ ### Timeout - `timeout` (default = 5s): Time to wait per individual attempt to send data to a backend diff --git a/exporter/exporterhelper/documentation.md b/exporter/exporterhelper/documentation.md index 095309bed21..200ed6ad869 100644 --- a/exporter/exporterhelper/documentation.md +++ b/exporter/exporterhelper/documentation.md @@ -6,6 +6,68 @@ The following telemetry is emitted by this component. +### otelcol_exporter_arc_acquire_wait_ms + +Time a worker waited to acquire an ARC permit. [Alpha] + +| Unit | Metric Type | Value Type | Stability | +| ---- | ----------- | ---------- | --------- | +| ms | Histogram | Int | Alpha | + +### otelcol_exporter_arc_backoff_events + +Number of ARC backoff (shrink) events triggered by error or RTT signal. [Alpha] + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {events} | Sum | Int | true | Alpha | + +### otelcol_exporter_arc_failures + +Number of requests considered failures by ARC (feeds adaptive shrink). [Alpha] + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {requests} | Sum | Int | true | Alpha | + +### otelcol_exporter_arc_limit + +Current ARC dynamic concurrency limit. [Alpha] + +| Unit | Metric Type | Value Type | Stability | +| ---- | ----------- | ---------- | --------- | +| {permits} | Gauge | Int | Alpha | + +### otelcol_exporter_arc_limit_changes + +Number of times ARC changed its concurrency limit. [Alpha] + +| Unit | Metric Type | Value Type | Monotonic | Stability | +| ---- | ----------- | ---------- | --------- | --------- | +| {events} | Sum | Int | true | Alpha | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| direction | up or down | Str: ``up``, ``down`` | + +### otelcol_exporter_arc_permits_in_use + +Number of permits currently acquired. 
[Alpha] + +| Unit | Metric Type | Value Type | Stability | +| ---- | ----------- | ---------- | --------- | +| {permits} | Gauge | Int | Alpha | + +### otelcol_exporter_arc_rtt_ms + +Request round-trip-time measured by ARC (from permit acquire to release). [Alpha] + +| Unit | Metric Type | Value Type | Stability | +| ---- | ----------- | ---------- | --------- | +| ms | Histogram | Int | Alpha | + ### otelcol_exporter_enqueue_failed_log_records  Number of log records failed to be added to the sending queue. [Alpha] diff --git a/exporter/exporterhelper/generated_package_test.go b/exporter/exporterhelper/generated_package_test.go index 73b52415cde..27f634c4255 100644 --- a/exporter/exporterhelper/generated_package_test.go +++ b/exporter/exporterhelper/generated_package_test.go @@ -3,9 +3,8 @@ package exporterhelper import ( - "testing" - "go.uber.org/goleak" + "testing" ) func TestMain(m *testing.M) { diff --git a/exporter/exporterhelper/go.mod b/exporter/exporterhelper/go.mod index bf3c1c87e51..8abc5794f1f 100644 --- a/exporter/exporterhelper/go.mod +++ b/exporter/exporterhelper/go.mod @@ -16,8 +16,8 @@ require ( go.opentelemetry.io/collector/consumer/consumererror v0.139.0 go.opentelemetry.io/collector/consumer/consumertest v0.139.0 go.opentelemetry.io/collector/exporter v1.45.0 - go.opentelemetry.io/collector/exporter/exportertest v0.139.0 - go.opentelemetry.io/collector/extension/extensiontest v0.139.0 + go.opentelemetry.io/collector/exporter/exportertest v0.139.0 + go.opentelemetry.io/collector/extension/extensiontest v0.139.0 go.opentelemetry.io/collector/extension/xextension v0.139.0 go.opentelemetry.io/collector/featuregate v1.45.0 go.opentelemetry.io/collector/pdata v1.45.0 @@ -33,6 +33,7 @@ require ( go.uber.org/goleak v1.3.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 + google.golang.org/grpc v1.76.0 google.golang.org/protobuf v1.36.10 ) @@ -61,9 +62,9 @@ require ( go.opentelemetry.io/collector/receiver/receivertest v0.139.0 // indirect 
go.opentelemetry.io/collector/receiver/xreceiver v0.139.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/sys v0.35.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b // indirect - google.golang.org/grpc v1.76.0 // indirect + golang.org/x/sys v0.36.0 // indirect + golang.org/x/text v0.29.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/exporter/exporterhelper/go.sum b/exporter/exporterhelper/go.sum index 12a6fdb8e2c..2ae909dd4a6 100644 --- a/exporter/exporterhelper/go.sum +++ b/exporter/exporterhelper/go.sum @@ -79,12 +79,12 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b h1:zPKJod4w6F1+nRGDI9ubnXYhU9NSWoFAijkHkUXeTK8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 
h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= diff --git a/exporter/exporterhelper/internal/arc/controller.go b/exporter/exporterhelper/internal/arc/controller.go new file mode 100644 index 00000000000..899b9eea8be --- /dev/null +++ b/exporter/exporterhelper/internal/arc/controller.go @@ -0,0 +1,437 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" + +import ( + "context" + "errors" + "math" + "sync" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" + "go.opentelemetry.io/collector/pipeline" +) + +type Config struct { + Enabled bool `mapstructure:"enabled"` + InitialLimit int `mapstructure:"initial_limit"` + MaxConcurrency int `mapstructure:"max_concurrency"` + DecreaseRatio float64 `mapstructure:"decrease_ratio"` + EwmaAlpha float64 `mapstructure:"ewma_alpha"` + DeviationScale float64 `mapstructure:"deviation_scale"` +} + +func DefaultConfig() Config { + return Config{ + Enabled: false, + InitialLimit: 1, + MaxConcurrency: 200, + DecreaseRatio: 0.9, + EwmaAlpha: 0.4, + DeviationScale: 2.5, + } +} + +type Controller struct { + cfg Config + sem *shrinkSem + + mu sync.Mutex + st state + + // Telemetry + tel *metadata.TelemetryBuilder + rttInst metric.Int64Histogram + failuresInst metric.Int64Counter + limitChangesInst metric.Int64Counter 
+ limitChangesUpAttr metric.MeasurementOption + limitChangesDownAttr metric.MeasurementOption + backoffInst metric.Int64Counter + // common attr set for sync metrics (exporter + data_type) + commonAttr metric.MeasurementOption +} + +type state struct { + limit int + inFlight int + past ewmaVar + curr mean + nextTick time.Time + hadPressure bool + hitCeiling bool +} + +func NewController(cfg Config, tel *metadata.TelemetryBuilder, id component.ID, sig pipeline.Signal) *Controller { + c := cfg + if c.InitialLimit <= 0 { + c.InitialLimit = DefaultConfig().InitialLimit + } + if c.MaxConcurrency <= 0 { + c.MaxConcurrency = DefaultConfig().MaxConcurrency + } + if c.DecreaseRatio <= 0 || c.DecreaseRatio >= 1 { + c.DecreaseRatio = DefaultConfig().DecreaseRatio + } + if c.EwmaAlpha <= 0 || c.EwmaAlpha >= 1 { + c.EwmaAlpha = DefaultConfig().EwmaAlpha + } + if c.DeviationScale < 0 { + c.DeviationScale = DefaultConfig().DeviationScale + } + + ctrl := &Controller{ + cfg: c, + sem: newShrinkSem(c.InitialLimit), + tel: tel, + } + ctrl.st.limit = c.InitialLimit + ctrl.st.nextTick = time.Now() + ctrl.st.past = newEwmaVar(c.EwmaAlpha) + + if tel != nil { + exporterAttr := attribute.String("exporter", id.String()) + dataTypeAttr := attribute.String("data_type", sig.String()) + + // Use exporter + data_type on ALL sync datapoints + common := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) + ctrl.commonAttr = common + + ctrl.rttInst = tel.ExporterArcRttMs + ctrl.failuresInst = tel.ExporterArcFailures + ctrl.limitChangesInst = tel.ExporterArcLimitChanges + ctrl.backoffInst = tel.ExporterArcBackoffEvents + + // Include data_type on the limit change counters too + ctrl.limitChangesUpAttr = metric.WithAttributeSet(attribute.NewSet( + exporterAttr, dataTypeAttr, attribute.String("direction", "up"), + )) + ctrl.limitChangesDownAttr = metric.WithAttributeSet(attribute.NewSet( + exporterAttr, dataTypeAttr, attribute.String("direction", "down"), + )) + + // Async gauges 
also include exporter + data_type + asyncCommon := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) + _ = tel.RegisterExporterArcLimitCallback(func(_ context.Context, o metric.Int64Observer) error { + o.Observe(int64(ctrl.CurrentLimit()), asyncCommon) + return nil + }) + _ = tel.RegisterExporterArcPermitsInUseCallback(func(_ context.Context, o metric.Int64Observer) error { + o.Observe(int64(ctrl.PermitsInUse()), asyncCommon) + return nil + }) + } + + return ctrl +} + +// Shutdown unregisters any asynchronous callbacks. +func (c *Controller) Shutdown() { + if c == nil { + return + } + if c.tel != nil { + c.tel.Shutdown() + } + if c.sem != nil { + c.sem.close() + } +} + +// Acquire blocks until a slot is available or ctx is done. +// Returns true iff the slot was acquired. +func (c *Controller) Acquire(ctx context.Context) bool { + return c.sem.acquire(ctx) == nil +} + +// Release releases one slot back to the semaphore. +func (c *Controller) Release() { c.sem.release() } + +// ReleaseWithSample provides feedback to the controller and releases one slot. +func (c *Controller) ReleaseWithSample(ctx context.Context, rtt time.Duration, success, backpressure bool) { + c.Feedback(ctx, rtt, success, backpressure) + c.Release() +} + +// CurrentLimit returns the current concurrency limit. +func (c *Controller) CurrentLimit() int { + c.mu.Lock() + defer c.mu.Unlock() + return c.st.limit +} + +func (c *Controller) PermitsInUse() int { + c.mu.Lock() + defer c.mu.Unlock() + return c.st.inFlight +} + +// StartRequest increments in-flight count and tracks ceiling hit. +func (c *Controller) StartRequest() { + c.mu.Lock() + c.st.inFlight++ + if c.st.inFlight >= c.st.limit { + c.st.hitCeiling = true + } + c.mu.Unlock() +} + +// Feedback provides the observed RTT and signals if backpressure or success occurred. 
+func (c *Controller) Feedback(ctx context.Context, rtt time.Duration, success, backpressure bool) { + now := time.Now() + c.mu.Lock() + + // Record RTT (guard nil telemetry) + if c.tel != nil && c.rttInst != nil { + c.rttInst.Record(ctx, rtt.Milliseconds(), c.commonAttr) + } + + // Defensively avoid negative inFlight if caller didn't call StartRequest. + if c.st.inFlight > 0 { + c.st.inFlight-- + } + + // Track this window's RTTs only for successes (to avoid biasing the mean). + if success { + c.st.curr.add(rtt.Seconds()) + } + + // If we got any explicit backpressure signal this window, remember it. + if backpressure { + c.st.hadPressure = true + } + + if !success && c.tel != nil && c.failuresInst != nil { + // Record failures (non-permanent errors or backpressure). + c.failuresInst.Add(ctx, 1, c.commonAttr) + } + + // EARLY BACKOFF ON COLD START: + // If we've not yet initialized EWMA (no window mean set) but already see pressure, + // immediately apply multiplicative decrease so we don't stall under load. + if backpressure && c.st.limit > 1 && !c.st.past.initialized() { + old := c.st.limit + n := int(math.Max(1, math.Floor(float64(c.st.limit)*c.cfg.DecreaseRatio))) + if n < c.st.limit { + c.sem.forget(c.st.limit - n) + c.st.limit = n + if c.tel != nil && c.backoffInst != nil { + c.backoffInst.Add(ctx, 1, c.commonAttr) + } + if c.tel != nil && c.limitChangesInst != nil && c.st.limit != old { + c.limitChangesInst.Add(ctx, 1, c.limitChangesDownAttr) + } + } + } + + // Initialize the first window based on measured mean. + if !c.st.past.initialized() { + if m := c.st.curr.average(); m > 0 { + c.st.past.update(m) + // Next tick after 'm' seconds; caller may clamp externally if desired. + c.st.nextTick = now.Add(time.Duration(m * float64(time.Second))) + } + c.mu.Unlock() + return + } + + // Window tick: adjust concurrency once per window. 
+ if !now.Before(c.st.nextTick) { + m := c.st.curr.average() + if m > 0 { + c.st.past.update(m) + } + + // This will consider hadPressure or an RTT spike vs mean+Kσ. + c.adjust(ctx, m) + + // Reset window state. + c.st.curr = mean{} + c.st.hadPressure = false + c.st.hitCeiling = false + c.st.nextTick = now.Add(time.Duration(c.st.past.mean * float64(time.Second))) + } + + c.mu.Unlock() +} + +func (c *Controller) adjust(ctx context.Context, current float64) { + // NOTE: caller holds c.mu + dev := math.Sqrt(c.st.past.variance) + thr := dev * c.cfg.DeviationScale + + // Try additive increase: only when at ceiling, no pressure, and RTT <= mean. + if c.st.limit < c.cfg.MaxConcurrency && + c.st.hitCeiling && + !c.st.hadPressure && + current > 0 && + current <= c.st.past.mean { + old := c.st.limit + c.sem.addOne() + c.st.limit++ + + if c.tel != nil && c.limitChangesInst != nil && c.st.limit != old { + c.limitChangesInst.Add(ctx, 1, c.limitChangesUpAttr) + } + return + } + + // Multiplicative decrease: on explicit pressure or RTT spike. + if c.st.limit > 1 && (c.st.hadPressure || current >= c.st.past.mean+thr) { + old := c.st.limit + + n := int(math.Max(1, math.Floor(float64(c.st.limit)*c.cfg.DecreaseRatio))) + if n < c.st.limit { + c.sem.forget(c.st.limit - n) + c.st.limit = n + + // limit decreased => backoff & limit-change callbacks + if c.tel != nil && c.backoffInst != nil { + c.backoffInst.Add(ctx, 1, c.commonAttr) + } + if c.tel != nil && c.limitChangesInst != nil && c.st.limit != old { + c.limitChangesInst.Add(ctx, 1, c.limitChangesDownAttr) + } + } + } +} + +// ----------------------- shrinkable semaphore ----------------------- + +type shrinkSem struct { + mu sync.Mutex + avail int + waiting []chan struct{} + pendingF int + closed bool +} + +func newShrinkSem(n int) *shrinkSem { return &shrinkSem{avail: n} } + +// acquire waits for capacity or returns ctx error. +// NOTE: shrink is paid by future releases (pendingF), not by acquires. 
+func (s *shrinkSem) acquire(ctx context.Context) error { + ch := make(chan struct{}, 1) + + s.mu.Lock() + if s.closed { + s.mu.Unlock() + return experr.NewShutdownErr(errors.New("arc semaphore closed")) + } + if s.avail > 0 { + s.avail-- + s.mu.Unlock() + return nil + } + s.waiting = append(s.waiting, ch) + s.mu.Unlock() + + select { + case <-ch: + // If we were woken up but are now closed, return error + s.mu.Lock() + closed := s.closed + s.mu.Unlock() + if closed { + return experr.NewShutdownErr(errors.New("arc semaphore closed")) + } + return nil + case <-ctx.Done(): + // Remove our waiter entry if still queued. + s.mu.Lock() + if s.closed { + s.mu.Unlock() + return experr.NewShutdownErr(errors.New("arc semaphore closed")) + } + for i, w := range s.waiting { + if w == ch { + s.waiting = append(s.waiting[:i], s.waiting[i+1:]...) + break + } + } + s.mu.Unlock() + return ctx.Err() + } +} + +// release returns capacity to the semaphore with correct shrink priority: +// 1) pay down pending forgets, 2) wake next waiter, 3) increase avail. +func (s *shrinkSem) release() { + s.mu.Lock() + defer s.mu.Unlock() + + if s.pendingF > 0 { + s.pendingF-- + return + } + if len(s.waiting) > 0 { + w := s.waiting[0] + s.waiting = s.waiting[1:] + w <- struct{}{} + return + } + s.avail++ +} + +// forget reduces effective capacity by n. +// It first consumes avail, then accumulates pending forgets to be paid by releases. +func (s *shrinkSem) forget(n int) { + if n <= 0 { + return + } + s.mu.Lock() + defer s.mu.Unlock() + + for i := 0; i < n; i++ { + if s.avail > 0 { + s.avail-- + } else { + s.pendingF++ + } + } +} + +// addOne increases the available permits by exactly 1, paying any pending +// shrink (forget) first, or waking a waiter if present. +func (s *shrinkSem) addOne() { + s.mu.Lock() + defer s.mu.Unlock() + + // First, pay down any pending shrink. + if s.pendingF > 0 { + s.pendingF-- + return + } + + // If someone is waiting, wake one waiter instead of increasing avail. 
+ if len(s.waiting) > 0 { + w := s.waiting[0] + s.waiting = s.waiting[1:] + w <- struct{}{} + return + } + + // Otherwise just increase available permits. + s.avail++ +} + +// close marks the semaphore as closed and wakes up all waiters. +func (s *shrinkSem) close() { + s.mu.Lock() + defer s.mu.Unlock() + + if s.closed { + return + } + s.closed = true + for _, w := range s.waiting { + close(w) // Close channel to signal + } + s.waiting = nil +} diff --git a/exporter/exporterhelper/internal/arc/controller_metrics_test.go b/exporter/exporterhelper/internal/arc/controller_metrics_test.go new file mode 100644 index 00000000000..e8af3a399a6 --- /dev/null +++ b/exporter/exporterhelper/internal/arc/controller_metrics_test.go @@ -0,0 +1,89 @@ +package arc + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + sdkmetricdata "go.opentelemetry.io/otel/sdk/metric/metricdata" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" + "go.opentelemetry.io/collector/pipeline" +) + +func collectOne(t *testing.T, r *sdkmetric.ManualReader) sdkmetricdata.ResourceMetrics { + t.Helper() + var out sdkmetricdata.ResourceMetrics + require.NoError(t, r.Collect(context.Background(), &out)) + return out +} + +func findSum(rm sdkmetricdata.ResourceMetrics, name string, attrs []attribute.KeyValue) (bool, int64) { + want := attribute.NewSet(attrs...) 
// canonical comparison set + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != name { + continue + } + if s, ok := m.Data.(sdkmetricdata.Sum[int64]); ok { + for _, dp := range s.DataPoints { + // Works for both pointer and value receivers + if (&dp.Attributes).Equals(&want) || dp.Attributes.Equals(&want) { + return true, dp.Value + } + } + } + } + } + return false, 0 +} + +func TestController_BackpressureEmitsMetricsAndShrinks(t *testing.T) { + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + ts := componenttest.NewNopTelemetrySettings() + ts.MeterProvider = mp + + tel, err := metadata.NewTelemetryBuilder(ts) + require.NoError(t, err) + + cfg := DefaultConfig() + cfg.InitialLimit = 8 + cfg.MaxConcurrency = 16 + cfg.DecreaseRatio = 0.5 + + id := component.MustNewID("otlp") + ctrl := NewController(cfg, tel, id, pipeline.SignalTraces) + + // Initialize EWMA so the controller takes the 'normal' decrease path + ctrl.mu.Lock() + ctrl.st.past.update(0.01) // small mean to make tick scheduling trivial + ctrl.st.limit = 8 + ctrl.st.hadPressure = true // force a decrease on adjust + ctrl.mu.Unlock() + + // Trigger a window tick -> adjust() sees hadPressure => multiplicative decrease. 
+ ctrl.Feedback(context.Background(), 1*time.Millisecond, true /* success */, true /* backpressure */) + time.Sleep(10 * time.Millisecond) + + rm := collectOne(t, reader) + + common := []attribute.KeyValue{ + attribute.String("exporter", id.String()), + attribute.String("data_type", pipeline.SignalTraces.String()), + } + + found, backoffs := findSum(rm, "otelcol_exporter_arc_backoff_events", common) + require.True(t, found, "backoff_events not found") + require.GreaterOrEqual(t, backoffs, int64(1)) + + down := append(common, attribute.String("direction", "down")) + found, downs := findSum(rm, "otelcol_exporter_arc_limit_changes", down) + require.True(t, found, "limit_changes{down} not found") + require.GreaterOrEqual(t, downs, int64(1)) +} diff --git a/exporter/exporterhelper/internal/arc/controller_test.go b/exporter/exporterhelper/internal/arc/controller_test.go new file mode 100644 index 00000000000..48bf40b604a --- /dev/null +++ b/exporter/exporterhelper/internal/arc/controller_test.go @@ -0,0 +1,336 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" + "go.opentelemetry.io/collector/pipeline" +) + +func TestNewControllerDefaults(t *testing.T) { + cfg := Config{ + InitialLimit: 0, + MaxConcurrency: 0, + DecreaseRatio: 0, + EwmaAlpha: 0, + DeviationScale: -1, + } + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + if got, want := ctrl.CurrentLimit(), DefaultConfig().InitialLimit; got != want { + t.Fatalf("unexpected InitialLimit: got 
%d want %d", got, want) + } + if ctrl.cfg.MaxConcurrency != DefaultConfig().MaxConcurrency { + t.Fatalf("MaxConcurrency default not applied") + } + if ctrl.cfg.DecreaseRatio != DefaultConfig().DecreaseRatio { + t.Fatalf("DecreaseRatio default not applied") + } + if ctrl.cfg.EwmaAlpha != DefaultConfig().EwmaAlpha { + t.Fatalf("EwmaAlpha default not applied") + } + if ctrl.cfg.DeviationScale != DefaultConfig().DeviationScale { + t.Fatalf("DeviationScale default not applied") + } +} + +func TestAcquireReleaseSemaphore(t *testing.T) { + cfg := Config{InitialLimit: 2} + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + ctx := context.Background() + if !ctrl.Acquire(ctx) { + t.Fatal("expected first Acquire to succeed") + } + if !ctrl.Acquire(ctx) { + t.Fatal("expected second Acquire to succeed") + } + + // third attempt should block and then fail with context timeout + toCtx, cancel := context.WithTimeout(context.Background(), 30*time.Millisecond) + defer cancel() + if ctrl.Acquire(toCtx) { + t.Fatal("expected Acquire with short timeout to fail") + } + + // release one and acquire should succeed + ctrl.Release() + if !ctrl.Acquire(ctx) { + t.Fatal("expected Acquire to succeed after release") + } + // cleanup releases + ctrl.Release() + ctrl.Release() +} + +func TestStartRequestAndPermits(t *testing.T) { + cfg := Config{InitialLimit: 3} + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + ctrl.StartRequest() + ctrl.StartRequest() + + if got := ctrl.PermitsInUse(); got != 2 { + t.Fatalf("unexpected in-flight: got %d want %d", got, 2) + } + + // Simulate hitting ceiling + ctrl.mu.Lock() + ctrl.st.inFlight = ctrl.st.limit + ctrl.st.hitCeiling = false + ctrl.mu.Unlock() + + ctrl.StartRequest() + ctrl.mu.Lock() + hit := ctrl.st.hitCeiling + ctrl.mu.Unlock() + if !hit { + t.Fatalf("expected hitCeiling to be true when inFlight >= limit") + } +} + +func TestEarlyBackoffOnColdStart(t *testing.T) { + cfg := Config{InitialLimit: 10, 
DecreaseRatio: 0.5} + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + if got := ctrl.CurrentLimit(); got != 10 { + t.Fatalf("expected initial limit 10, got %d", got) + } + + // Provide an explicit backpressure while EWMA not initialized. + ctrl.Feedback(context.Background(), 0, true, true) + + if got := ctrl.CurrentLimit(); got != 5 { + t.Fatalf("expected early backoff to reduce limit to 5, got %d", got) + } +} + +func TestAdjustIncreaseAndDecrease(t *testing.T) { + // Additive increase case + cfg := DefaultConfig() + cfg.InitialLimit = 1 + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + ctrl.mu.Lock() + // initialize past EWMA so past.initialized() is true + ctrl.st.past.update(1.0) + ctrl.st.hitCeiling = true + ctrl.st.hadPressure = false + ctrl.st.limit = 1 + ctrl.mu.Unlock() + + ctrl.mu.Lock() + ctrl.adjust(context.Background(), 1.0) + ctrl.mu.Unlock() + + if got := ctrl.CurrentLimit(); got != 2 { + t.Fatalf("expected additive increase to raise limit to 2, got %d", got) + } + + // Multiplicative decrease case + cfg2 := DefaultConfig() + cfg2.InitialLimit = 10 + cfg2.DecreaseRatio = 0.5 + ctrl2 := NewController(cfg2, nil, component.ID{}, pipeline.SignalTraces) + + ctrl2.mu.Lock() + ctrl2.st.past.update(1.0) + ctrl2.st.hadPressure = true + ctrl2.st.limit = 10 + ctrl2.mu.Unlock() + + ctrl2.mu.Lock() + ctrl2.adjust(context.Background(), 0.0) + ctrl2.mu.Unlock() + + if got := ctrl2.CurrentLimit(); got != 5 { + t.Fatalf("expected multiplicative decrease to reduce limit to 5, got %d", got) + } +} + +func TestShrinkSem_ForgetAndRelease(t *testing.T) { + s := newShrinkSem(2) + + // Use capacity + require.NoError(t, s.acquire(context.Background())) + require.NoError(t, s.acquire(context.Background())) + require.Equal(t, 0, s.avail) + + // Forget 1, adds pending Forget + s.forget(1) + require.Equal(t, 0, s.avail) + require.Equal(t, 1, s.pendingF) + + // Release 1, pays pending Forget + s.release() + require.Equal(t, 0, 
s.avail) + require.Equal(t, 0, s.pendingF) + + // Next acquire should block + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) + require.Error(t, s.acquire(ctx)) // Fails on timeout + cancel() + + // Release second acquire + s.release() + require.Equal(t, 1, s.avail) + require.Equal(t, 0, s.pendingF) + + // Acquire should now succeed + require.NoError(t, s.acquire(context.Background())) + require.Equal(t, 0, s.avail) + + // Forget 2. No avail, one permit outstanding. + // Both forgets should go to pendingF. + s.forget(2) + require.Equal(t, 0, s.avail) + require.Equal(t, 2, s.pendingF) + + // Release (pays one pending forget) + s.release() + require.Equal(t, 0, s.avail) + require.Equal(t, 1, s.pendingF) +} + +func TestShrinkSem_AddAndAcquire(t *testing.T) { + s := newShrinkSem(1) + require.NoError(t, s.acquire(context.Background())) + require.Equal(t, 0, s.avail) + + // Start a waiting acquire in a goroutine + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + // Must use assert in goroutines + assert.NoError(t, s.acquire(context.Background())) + s.release() // release it right away + }() + + // Allow goroutine to block on acquire + time.Sleep(10 * time.Millisecond) + + // Add one permit; should wake the waiter + s.addOne() + wg.Wait() // ensure goroutine finished + + s.mu.Lock() + require.Equal(t, 1, s.avail) // the goroutine acquired then released once + require.Empty(t, s.waiting) + s.mu.Unlock() + + // Test Add paying pending Forgets + s.forget(2) // avail=1 -> 0, pendingF=1 + require.Equal(t, 0, s.avail) + require.Equal(t, 1, s.pendingF) + + s.addOne() // pays pendingF + require.Equal(t, 0, s.avail) + require.Equal(t, 0, s.pendingF) + + s.addOne() // increases avail + require.Equal(t, 1, s.avail) + require.Equal(t, 0, s.pendingF) +} + +func TestController_Shutdown(t *testing.T) { + // Test that shutdown doesn't panic + ctrl := NewController(DefaultConfig(), nil, component.ID{}, pipeline.SignalTraces) + 
ctrl.Shutdown() + + // Test that shutdown with telemetry doesn't panic + tel, err := metadata.NewTelemetryBuilder(componenttest.NewNopTelemetrySettings()) + require.NoError(t, err) + ctrlWithTel := NewController(DefaultConfig(), tel, component.ID{}, pipeline.SignalTraces) + ctrlWithTel.Shutdown() +} + +func TestController_Shutdown_UnblocksWaiters(t *testing.T) { + ctrl := NewController(Config{InitialLimit: 1}, nil, component.ID{}, pipeline.SignalTraces) + + // Acquire the only permit + require.True(t, ctrl.Acquire(context.Background())) + + errCh := make(chan error, 1) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + // This should block + errCh <- ctrl.sem.acquire(context.Background()) + }() + + // Give the goroutine time to block + time.Sleep(50 * time.Millisecond) + + // Shutdown the controller, which closes the semaphore + ctrl.Shutdown() + + // Wait for the goroutine to finish + wg.Wait() + + // Check that the blocked goroutine received a shutdown error + err := <-errCh + require.Error(t, err) + assert.True(t, experr.IsShutdownErr(err), "error should be a shutdown error") + + // Test that new acquires also fail + err = ctrl.sem.acquire(context.Background()) + require.Error(t, err) + assert.True(t, experr.IsShutdownErr(err), "error should be a shutdown error") +} + +func TestController_ReleaseWithSample(t *testing.T) { + cfg := Config{InitialLimit: 1} + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + ctx := context.Background() + require.True(t, ctrl.Acquire(ctx)) + ctrl.StartRequest() + + // Initialize EWMA + ctrl.ReleaseWithSample(ctx, 10*time.Millisecond, true, false) + ctrl.mu.Lock() + require.True(t, ctrl.st.past.initialized()) + require.Equal(t, 0, ctrl.st.inFlight) + ctrl.mu.Unlock() +} + +func TestController_FeedbackWindowTick(t *testing.T) { + cfg := DefaultConfig() + cfg.InitialLimit = 1 + ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + + // Initialize past EWMA + ctrl.mu.Lock() 
// mean accumulates a running arithmetic average of observed samples.
type mean struct {
	n   int     // number of samples added
	sum float64 // running total of samples
}

// add folds one sample into the running total.
func (a *mean) add(x float64) {
	a.n++
	a.sum += x
}

// average returns the arithmetic mean of all samples, or 0 when no
// samples have been added.
func (a *mean) average() float64 {
	if a.n == 0 {
		return 0
	}
	return a.sum / float64(a.n)
}

// ewmaVar tracks an exponentially weighted moving average together with
// an exponentially weighted estimate of variance.
type ewmaVar struct {
	alpha    float64 // smoothing factor: weight given to the newest sample
	mean     float64 // current EWMA of samples
	variance float64 // current exponentially weighted variance estimate
	init     bool    // whether the first sample has been observed
}

// newEwmaVar returns an uninitialized estimator with the given smoothing
// factor.
func newEwmaVar(alpha float64) ewmaVar {
	return ewmaVar{alpha: alpha}
}

// initialized reports whether at least one sample has been observed.
func (e *ewmaVar) initialized() bool {
	return e.init
}

// update folds one sample into the estimator. The first sample seeds
// the mean directly with zero variance; later samples blend in with
// weight alpha, and the variance is updated from the sample's deviation
// against the previous mean.
func (e *ewmaVar) update(x float64) {
	if !e.init {
		e.mean = x
		e.variance = 0
		e.init = true
		return
	}
	d := x - e.mean // deviation from the previous mean
	e.mean = e.alpha*x + (1-e.alpha)*e.mean
	e.variance = e.alpha*(d*d) + (1-e.alpha)*e.variance
}
// TestMean exercises the running-average helper: an empty accumulator
// reports 0, and the average tracks correctly as samples are added.
func TestMean(t *testing.T) {
	m := mean{}
	// Average of zero samples must be 0, not a NaN from 0/0.
	if got := m.average(); got != 0 {
		t.Fatalf("average of empty mean should be 0, got %f", got)
	}

	m.add(10)
	if got := m.average(); got != 10 {
		t.Fatalf("average after adding 10 should be 10, got %f", got)
	}

	m.add(20)
	if got := m.average(); got != 15 {
		t.Fatalf("average after adding 20 should be 15, got %f", got)
	}

	m.add(30)
	if got := m.average(); got != 20 {
		t.Fatalf("average after adding 30 should be 20, got %f", got)
	}
}
// backpressureError wraps an error to explicitly mark it as a
// backpressure signal for the ARC controller.
type backpressureError struct{ error }

// NewBackpressureError wraps err so that IsBackpressure reports true
// for it. Returns nil when err is nil.
func NewBackpressureError(err error) error {
	if err == nil {
		return nil
	}
	return backpressureError{err}
}

// IsBackpressure returns true if the error is an explicit backpressure error
// OR it matches well-known HTTP/gRPC backpressure signals.
//
// Checks are ordered from most to least precise: explicit wrapper,
// net.Error timeout, context deadline, gRPC status codes, and finally a
// case-insensitive substring scan of the error text as a last resort.
func IsBackpressure(err error) bool {
	if err == nil {
		return false
	}
	// Our explicit wrapper.
	var x backpressureError
	if errors.As(err, &x) {
		return true
	}

	// Check for net.Error timeout
	var ne net.Error
	if errors.As(err, &ne) && ne.Timeout() {
		return true
	}

	// Context timed out (treat as transient/backpressure for ARC)
	if errors.Is(err, context.DeadlineExceeded) {
		return true
	}

	// gRPC: RESOURCE_EXHAUSTED (8), UNAVAILABLE (14), DEADLINE_EXCEEDED (4)
	if s, ok := status.FromError(err); ok {
		switch s.Code() {
		case codes.ResourceExhausted, codes.Unavailable, codes.DeadlineExceeded:
			return true
		}
	}

	// HTTP-ish strings (safe fallback; exporters often wrap as text)
	// NOTE: intentionally case-insensitive and substring-based.
	// NOTE(review): "unavailable" already subsumes "service unavailable",
	// and classifying bare status-code digits ("429", "503", ...) or
	// "internal server error" as backpressure is aggressive — confirm
	// these matches are intended.
	msg := strings.ToLower(err.Error())
	if strings.Contains(msg, "429") ||
		strings.Contains(msg, "too many requests") ||
		strings.Contains(msg, "503") ||
		strings.Contains(msg, "service unavailable") ||
		strings.Contains(msg, "502") ||
		strings.Contains(msg, "bad gateway") ||
		strings.Contains(msg, "504") ||
		strings.Contains(msg, "gateway timeout") ||
		strings.Contains(msg, "408") ||
		strings.Contains(msg, "request timeout") ||
		strings.Contains(msg, "599") ||
		strings.Contains(msg, "proxy timeout") ||
		strings.Contains(msg, "resource_exhausted") ||
		strings.Contains(msg, "unavailable") ||
		strings.Contains(msg, "deadline_exceeded") || // Added for string-based gRPC errors
		strings.Contains(msg, "internal server error") ||
		strings.Contains(msg, "circuit breaker") {
		return true
	}

	return false
}
408 request timeout"), + errors.New("http 599 proxy timeout"), + errors.New("too many requests"), + errors.New("resource_exhausted"), + errors.New("unavailable"), + status.Error(codes.ResourceExhausted, "overloaded"), + status.Error(codes.Unavailable, "temporarily unavailable"), + } + + for _, err := range cases { + assert.True(t, IsBackpressure(err), "expected backpressure: %v", err) + } +} + +func TestIsBackpressure_NegativeCases(t *testing.T) { + cases := []error{ + nil, + errors.New("some transient error"), + errors.New("internal error without overload signal"), + } + + for _, err := range cases { + assert.False(t, IsBackpressure(err), "expected not backpressure: %v", err) + } +} diff --git a/exporter/exporterhelper/internal/metadata/generated_telemetry.go b/exporter/exporterhelper/internal/metadata/generated_telemetry.go index 510cb29002d..37b7e022b81 100644 --- a/exporter/exporterhelper/internal/metadata/generated_telemetry.go +++ b/exporter/exporterhelper/internal/metadata/generated_telemetry.go @@ -28,6 +28,13 @@ type TelemetryBuilder struct { meter metric.Meter mu sync.Mutex registrations []metric.Registration + ExporterArcAcquireWaitMs metric.Int64Histogram + ExporterArcBackoffEvents metric.Int64Counter + ExporterArcFailures metric.Int64Counter + ExporterArcLimit metric.Int64ObservableGauge + ExporterArcLimitChanges metric.Int64Counter + ExporterArcPermitsInUse metric.Int64ObservableGauge + ExporterArcRttMs metric.Int64Histogram ExporterEnqueueFailedLogRecords metric.Int64Counter ExporterEnqueueFailedMetricPoints metric.Int64Counter ExporterEnqueueFailedSpans metric.Int64Counter @@ -54,6 +61,36 @@ func (tbof telemetryBuilderOptionFunc) apply(mb *TelemetryBuilder) { tbof(mb) } +// RegisterExporterArcLimitCallback sets callback for observable ExporterArcLimit metric. 
+func (builder *TelemetryBuilder) RegisterExporterArcLimitCallback(cb metric.Int64Callback) error { + reg, err := builder.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + cb(ctx, &observerInt64{inst: builder.ExporterArcLimit, obs: o}) + return nil + }, builder.ExporterArcLimit) + if err != nil { + return err + } + builder.mu.Lock() + defer builder.mu.Unlock() + builder.registrations = append(builder.registrations, reg) + return nil +} + +// RegisterExporterArcPermitsInUseCallback sets callback for observable ExporterArcPermitsInUse metric. +func (builder *TelemetryBuilder) RegisterExporterArcPermitsInUseCallback(cb metric.Int64Callback) error { + reg, err := builder.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + cb(ctx, &observerInt64{inst: builder.ExporterArcPermitsInUse, obs: o}) + return nil + }, builder.ExporterArcPermitsInUse) + if err != nil { + return err + } + builder.mu.Lock() + defer builder.mu.Unlock() + builder.registrations = append(builder.registrations, reg) + return nil +} + // RegisterExporterQueueCapacityCallback sets callback for observable ExporterQueueCapacity metric. func (builder *TelemetryBuilder) RegisterExporterQueueCapacityCallback(cb metric.Int64Callback) error { reg, err := builder.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { @@ -112,6 +149,48 @@ func NewTelemetryBuilder(settings component.TelemetrySettings, options ...Teleme } builder.meter = Meter(settings) var err, errs error + builder.ExporterArcAcquireWaitMs, err = builder.meter.Int64Histogram( + "otelcol_exporter_arc_acquire_wait_ms", + metric.WithDescription("Time a worker waited to acquire an ARC permit. [Alpha]"), + metric.WithUnit("ms"), + ) + errs = errors.Join(errs, err) + builder.ExporterArcBackoffEvents, err = builder.meter.Int64Counter( + "otelcol_exporter_arc_backoff_events", + metric.WithDescription("Number of ARC backoff (shrink) events triggered by error or RTT signal. 
[Alpha]"), + metric.WithUnit("{events}"), + ) + errs = errors.Join(errs, err) + builder.ExporterArcFailures, err = builder.meter.Int64Counter( + "otelcol_exporter_arc_failures", + metric.WithDescription("Number of requests considered failures by ARC (feeds adaptive shrink). [Alpha]"), + metric.WithUnit("{requests}"), + ) + errs = errors.Join(errs, err) + builder.ExporterArcLimit, err = builder.meter.Int64ObservableGauge( + "otelcol_exporter_arc_limit", + metric.WithDescription("Current ARC dynamic concurrency limit. [Alpha]"), + metric.WithUnit("{permits}"), + ) + errs = errors.Join(errs, err) + builder.ExporterArcLimitChanges, err = builder.meter.Int64Counter( + "otelcol_exporter_arc_limit_changes", + metric.WithDescription("Number of times ARC changed its concurrency limit. [Alpha]"), + metric.WithUnit("{events}"), + ) + errs = errors.Join(errs, err) + builder.ExporterArcPermitsInUse, err = builder.meter.Int64ObservableGauge( + "otelcol_exporter_arc_permits_in_use", + metric.WithDescription("Number of permits currently acquired. [Alpha]"), + metric.WithUnit("{permits}"), + ) + errs = errors.Join(errs, err) + builder.ExporterArcRttMs, err = builder.meter.Int64Histogram( + "otelcol_exporter_arc_rtt_ms", + metric.WithDescription("Request round-trip-time measured by ARC (from permit acquire to release). [Alpha]"), + metric.WithUnit("ms"), + ) + errs = errors.Join(errs, err) builder.ExporterEnqueueFailedLogRecords, err = builder.meter.Int64Counter( "otelcol_exporter_enqueue_failed_log_records", metric.WithDescription("Number of log records failed to be added to the sending queue. 
[Alpha]"), diff --git a/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go b/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go index 7096bc471f7..aba8ed904c0 100644 --- a/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go +++ b/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go @@ -6,12 +6,117 @@ import ( "testing" "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component/componenttest" "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest" - - "go.opentelemetry.io/collector/component/componenttest" ) +func AssertEqualExporterArcAcquireWaitMs(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.HistogramDataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_acquire_wait_ms", + Description: "Time a worker waited to acquire an ARC permit. [Alpha]", + Unit: "ms", + Data: metricdata.Histogram[int64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_acquire_wait_ms") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualExporterArcBackoffEvents(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_backoff_events", + Description: "Number of ARC backoff (shrink) events triggered by error or RTT signal. [Alpha]", + Unit: "{events}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_backoff_events") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) 
+} + +func AssertEqualExporterArcFailures(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_failures", + Description: "Number of requests considered failures by ARC (feeds adaptive shrink). [Alpha]", + Unit: "{requests}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_failures") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualExporterArcLimit(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_limit", + Description: "Current ARC dynamic concurrency limit. [Alpha]", + Unit: "{permits}", + Data: metricdata.Gauge[int64]{ + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_limit") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualExporterArcLimitChanges(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_limit_changes", + Description: "Number of times ARC changed its concurrency limit. [Alpha]", + Unit: "{events}", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_limit_changes") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualExporterArcPermitsInUse(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_permits_in_use", + Description: "Number of permits currently acquired. 
[Alpha]", + Unit: "{permits}", + Data: metricdata.Gauge[int64]{ + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_permits_in_use") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + +func AssertEqualExporterArcRttMs(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.HistogramDataPoint[int64], opts ...metricdatatest.Option) { + want := metricdata.Metrics{ + Name: "otelcol_exporter_arc_rtt_ms", + Description: "Request round-trip-time measured by ARC (from permit acquire to release). [Alpha]", + Unit: "ms", + Data: metricdata.Histogram[int64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: dps, + }, + } + got, err := tt.GetMetric("otelcol_exporter_arc_rtt_ms") + require.NoError(t, err) + metricdatatest.AssertEqual(t, want, got, opts...) +} + func AssertEqualExporterEnqueueFailedLogRecords(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.DataPoint[int64], opts ...metricdatatest.Option) { want := metricdata.Metrics{ Name: "otelcol_exporter_enqueue_failed_log_records", diff --git a/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest_test.go b/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest_test.go index 838a03f504a..1c4c163a384 100644 --- a/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest_test.go +++ b/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest_test.go @@ -20,6 +20,14 @@ func TestSetupTelemetry(t *testing.T) { tb, err := metadata.NewTelemetryBuilder(testTel.NewTelemetrySettings()) require.NoError(t, err) defer tb.Shutdown() + require.NoError(t, tb.RegisterExporterArcLimitCallback(func(_ context.Context, observer metric.Int64Observer) error { + observer.Observe(1) + return nil + })) + require.NoError(t, tb.RegisterExporterArcPermitsInUseCallback(func(_ context.Context, observer metric.Int64Observer) error { + observer.Observe(1) + return nil + })) require.NoError(t, 
tb.RegisterExporterQueueCapacityCallback(func(_ context.Context, observer metric.Int64Observer) error { observer.Observe(1) return nil @@ -28,6 +36,11 @@ func TestSetupTelemetry(t *testing.T) { observer.Observe(1) return nil })) + tb.ExporterArcAcquireWaitMs.Record(context.Background(), 1) + tb.ExporterArcBackoffEvents.Add(context.Background(), 1) + tb.ExporterArcFailures.Add(context.Background(), 1) + tb.ExporterArcLimitChanges.Add(context.Background(), 1) + tb.ExporterArcRttMs.Record(context.Background(), 1) tb.ExporterEnqueueFailedLogRecords.Add(context.Background(), 1) tb.ExporterEnqueueFailedMetricPoints.Add(context.Background(), 1) tb.ExporterEnqueueFailedSpans.Add(context.Background(), 1) @@ -39,6 +52,27 @@ func TestSetupTelemetry(t *testing.T) { tb.ExporterSentLogRecords.Add(context.Background(), 1) tb.ExporterSentMetricPoints.Add(context.Background(), 1) tb.ExporterSentSpans.Add(context.Background(), 1) + AssertEqualExporterArcAcquireWaitMs(t, testTel, + []metricdata.HistogramDataPoint[int64]{{}}, metricdatatest.IgnoreValue(), + metricdatatest.IgnoreTimestamp()) + AssertEqualExporterArcBackoffEvents(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualExporterArcFailures(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualExporterArcLimit(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualExporterArcLimitChanges(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualExporterArcPermitsInUse(t, testTel, + []metricdata.DataPoint[int64]{{Value: 1}}, + metricdatatest.IgnoreTimestamp()) + AssertEqualExporterArcRttMs(t, testTel, + []metricdata.HistogramDataPoint[int64]{{}}, metricdatatest.IgnoreValue(), + metricdatatest.IgnoreTimestamp()) AssertEqualExporterEnqueueFailedLogRecords(t, testTel, []metricdata.DataPoint[int64]{{Value: 1}}, 
metricdatatest.IgnoreTimestamp()) diff --git a/exporter/exporterhelper/internal/queue_sender.go b/exporter/exporterhelper/internal/queue_sender.go index c49c7265647..e0e50ddd90b 100644 --- a/exporter/exporterhelper/internal/queue_sender.go +++ b/exporter/exporterhelper/internal/queue_sender.go @@ -5,11 +5,20 @@ package internal // import "go.opentelemetry.io/collector/exporter/exporterhelpe import ( "context" + "errors" + "fmt" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" + "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/config/configoptional" + "go.opentelemetry.io/collector/consumer/consumererror" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/queuebatch" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/request" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/sender" @@ -32,6 +41,7 @@ func NewDefaultQueueConfig() queuebatch.Config { Sizer: request.SizerTypeItems, MinSize: 8192, }), + Arc: arc.DefaultConfig(), } } @@ -41,6 +51,7 @@ func NewQueueSender( exportFailureMessage string, next sender.Sender[request.Request], ) (sender.Sender[request.Request], error) { + var arcCtl *arc.Controller exportFunc := func(ctx context.Context, req request.Request) error { // Have to read the number of items before sending the request since the request can // be modified by the downstream components like the batcher. @@ -53,5 +64,76 @@ func NewQueueSender( return nil } - return queuebatch.NewQueueBatch(qSet, qCfg, exportFunc) + // If ARC is enabled, wrap the export function with ARC logic. 
+ if qCfg.Arc.Enabled { + tel, err := metadata.NewTelemetryBuilder(qSet.Telemetry) + if err != nil { + return nil, fmt.Errorf("failed to create telemetry builder for ARC: %w", err) + } + arcCtl = arc.NewController(qCfg.Arc, tel, qSet.ID, qSet.Signal) + + // Create the attributes for sync ARC metrics + exporterAttr := attribute.String("exporter", qSet.ID.String()) + dataTypeAttr := attribute.String("data_type", qSet.Signal.String()) + arcAttrs := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) + + origExportFunc := exportFunc + exportFunc = func(ctx context.Context, req request.Request) error { + startWait := time.Now() + if !arcCtl.Acquire(ctx) { + // This context is from the queue, so it's a background context. + // If Acquire fails, it means context was cancelled, which shouldn't happen + // unless shutdown is happening. + return experr.NewShutdownErr(errors.New("arc semaphore closed")) + } + waitMs := time.Since(startWait).Milliseconds() + tel.ExporterArcAcquireWaitMs.Record(ctx, waitMs, arcAttrs) + + arcCtl.StartRequest() + startTime := time.Now() + err := origExportFunc(ctx, req) + rtt := time.Since(startTime) + + isBackpressure := experr.IsBackpressure(err) + // Apply De Morgan's law: !(A || B) == !A && !B + isSuccess := err == nil || (!consumererror.IsPermanent(err) && !isBackpressure) + + arcCtl.ReleaseWithSample(ctx, rtt, isSuccess, isBackpressure) + return err + } + } + + qbs, err := queuebatch.NewQueueBatch(qSet, qCfg, exportFunc) + if err != nil { + return nil, err + } + + // If ARC is enabled, we need to chain its shutdown. + if arcCtl != nil { + return &queueSenderWithArc{ + QueueBatch: qbs, + arcCtl: arcCtl, + }, nil + } + + return qbs, nil +} + +// queueSenderWithArc combines the QueueBatch sender with an arc.Controller +// to ensure both are started and shut down correctly. 
+type queueSenderWithArc struct { + *queuebatch.QueueBatch + arcCtl *arc.Controller +} + +// Shutdown shuts down both the queue/batcher and the ARC controller. +func (qs *queueSenderWithArc) Shutdown(ctx context.Context) error { + qs.arcCtl.Shutdown() + return qs.QueueBatch.Shutdown(ctx) +} + +// Start starts both the queue/batcher and the ARC controller. +// Note: ARC controller doesn't have a Start, so this just passes through. +func (qs *queueSenderWithArc) Start(ctx context.Context, host component.Host) error { + return qs.QueueBatch.Start(ctx, host) } diff --git a/exporter/exporterhelper/internal/queue_sender_test.go b/exporter/exporterhelper/internal/queue_sender_test.go index ad6020b5a1f..a1ce6a1336c 100644 --- a/exporter/exporterhelper/internal/queue_sender_test.go +++ b/exporter/exporterhelper/internal/queue_sender_test.go @@ -6,15 +6,19 @@ package internal import ( "context" "errors" + "sync" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.uber.org/zap" "go.uber.org/zap/zaptest/observer" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/config/configoptional" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/queuebatch" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/request" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/requesttest" @@ -42,6 +46,82 @@ func TestNewQueueSenderFailedRequestDropped(t *testing.T) { assert.Equal(t, "Exporting failed. 
Dropping data.", observed.All()[0].Message) } +func TestQueueSender_ArcAcquireWaitMetric(t *testing.T) { + tt := componenttest.NewTelemetry() + t.Cleanup(func() { require.NoError(t, tt.Shutdown(context.Background())) }) + + qSet := queuebatch.AllSettings[request.Request]{ + Signal: pipeline.SignalMetrics, + ID: component.NewID(exportertest.NopType), + Telemetry: tt.NewTelemetrySettings(), + } + qCfg := NewDefaultQueueConfig() + qCfg.Arc.Enabled = true + qCfg.Arc.InitialLimit = 1 + // Disable batching and wait for results to test ARC correctly. + qCfg.Batch = configoptional.Optional[queuebatch.BatchConfig]{} // This disables batching + qCfg.WaitForResult = true + + // Create a sender that blocks until the channel is closed + blocker := make(chan struct{}) + mockSender := sender.NewSender(func(context.Context, request.Request) error { + <-blocker + return nil + }) + + be, err := NewQueueSender(qSet, qCfg, "", mockSender) + require.NoError(t, err) + require.NoError(t, be.Start(context.Background(), componenttest.NewNopHost())) + + var sendWg sync.WaitGroup + sendWg.Add(2) + + // Send first request + go func() { + defer sendWg.Done() + // This send will acquire the only permit and block in the sender + assert.NoError(t, be.Send(context.Background(), &requesttest.FakeRequest{Items: 1})) + }() + + // Wait for the first request to acquire the permit + time.Sleep(20 * time.Millisecond) + + // Send second request + go func() { + defer sendWg.Done() + // This send will block in arcCtl.Acquire() + assert.NoError(t, be.Send(context.Background(), &requesttest.FakeRequest{Items: 1})) + }() + + // Wait for the second request to block in Acquire + time.Sleep(50 * time.Millisecond) + + // Unblock the sender, which will release the permit, unblocking the second request + close(blocker) + + // Wait for both sends to complete + sendWg.Wait() + + // Shutdown to ensure all telemetry is flushed + require.NoError(t, be.Shutdown(context.Background())) + + // Verify the metric + metrics, 
err := tt.GetMetric("otelcol_exporter_arc_acquire_wait_ms") + require.NoError(t, err) + points := metrics.Data.(metricdata.Histogram[int64]).DataPoints + require.Len(t, points, 1) + + // Both requests should have recorded a wait time + assert.Equal(t, uint64(2), points[0].Count) + + // The max wait time should be > 40ms (accounting for test scheduler variance) + // The first request had minimal wait, the second had ~50ms wait. + maxVal, ok := points[0].Max.Value() + require.True(t, ok, "Max value should be valid") + assert.Greater(t, maxVal, int64(40), "Max wait time should be significant") + assert.Greater(t, points[0].Sum, int64(40), "Sum of wait times should be significant") +} + func TestQueueConfig_Validate(t *testing.T) { qCfg := NewDefaultQueueConfig() require.NoError(t, qCfg.Validate()) diff --git a/exporter/exporterhelper/internal/queuebatch/config.go b/exporter/exporterhelper/internal/queuebatch/config.go index ecef3ce3040..fb1516273e0 100644 --- a/exporter/exporterhelper/internal/queuebatch/config.go +++ b/exporter/exporterhelper/internal/queuebatch/config.go @@ -11,6 +11,7 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/config/configoptional" "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/request" ) @@ -46,6 +47,9 @@ type Config struct { // BatchConfig it configures how the requests are consumed from the queue and batch together during consumption. Batch configoptional.Optional[BatchConfig] `mapstructure:"batch"` + + // Arc enables the Adaptive Concurrency Limiter. 
+ Arc arc.Config `mapstructure:"arc"` } func (cfg *Config) Unmarshal(conf *confmap.Conf) error { diff --git a/exporter/exporterhelper/metadata.yaml b/exporter/exporterhelper/metadata.yaml index baa606a08fc..5bfd8b0426b 100644 --- a/exporter/exporterhelper/metadata.yaml +++ b/exporter/exporterhelper/metadata.yaml @@ -11,8 +11,87 @@ status: stability: beta: [traces, metrics, logs] +attributes: + direction: + description: "up or down" + type: string + enum: ["up","down"] + telemetry: metrics: + exporter_arc_acquire_wait_ms: + enabled: true + stability: + level: alpha + description: Time a worker waited to acquire an ARC permit. + unit: "ms" + histogram: + value_type: int + explicit: + buckets: [0, 1, 2, 5, 10, 20, 50, 100, 250, 500, 1000, 2000, 5000] + + exporter_arc_backoff_events: + enabled: true + stability: + level: alpha + description: Number of ARC backoff (shrink) events triggered by error or RTT signal. + unit: "{events}" + sum: + value_type: int + monotonic: true + + exporter_arc_failures: + enabled: true + stability: + level: alpha + description: Number of requests considered failures by ARC (feeds adaptive shrink). + unit: "{requests}" + sum: + value_type: int + monotonic: true + + exporter_arc_limit: + enabled: true + stability: + level: alpha + description: Current ARC dynamic concurrency limit. + unit: "{permits}" + gauge: + value_type: int + async: true + + exporter_arc_limit_changes: + enabled: true + stability: + level: alpha + description: Number of times ARC changed its concurrency limit. + unit: "{events}" + sum: + value_type: int + monotonic: true + attributes: [direction] + + exporter_arc_permits_in_use: + enabled: true + stability: + level: alpha + description: Number of permits currently acquired. + unit: "{permits}" + gauge: + value_type: int + async: true + + exporter_arc_rtt_ms: + enabled: true + stability: + level: alpha + description: Request round-trip-time measured by ARC (from permit acquire to release). 
+ unit: "ms" + histogram: + value_type: int + explicit: + buckets: [0, 5, 10, 20, 50, 100, 200, 400, 800, 1600, 3200, 6400] + exporter_enqueue_failed_log_records: enabled: true stability: @@ -181,4 +260,4 @@ telemetry: unit: "{spans}" sum: value_type: int - monotonic: true + monotonic: true \ No newline at end of file diff --git a/exporter/exporterhelper/xexporterhelper/go.mod b/exporter/exporterhelper/xexporterhelper/go.mod index da0dc2192ee..8ea2996eab1 100644 --- a/exporter/exporterhelper/xexporterhelper/go.mod +++ b/exporter/exporterhelper/xexporterhelper/go.mod @@ -8,18 +8,18 @@ require ( go.opentelemetry.io/collector/component/componenttest v0.139.0 go.opentelemetry.io/collector/consumer v1.45.0 go.opentelemetry.io/collector/consumer/consumererror v0.139.0 - go.opentelemetry.io/collector/consumer/consumererror/xconsumererror v0.139.0 + go.opentelemetry.io/collector/consumer/consumererror/xconsumererror v0.138.0 go.opentelemetry.io/collector/consumer/consumertest v0.139.0 go.opentelemetry.io/collector/consumer/xconsumer v0.139.0 go.opentelemetry.io/collector/exporter v1.45.0 go.opentelemetry.io/collector/exporter/exporterhelper v0.139.0 - go.opentelemetry.io/collector/exporter/exportertest v0.139.0 + go.opentelemetry.io/collector/exporter/exportertest v0.138.0 go.opentelemetry.io/collector/exporter/xexporter v0.139.0 go.opentelemetry.io/collector/pdata v1.45.0 go.opentelemetry.io/collector/pdata/pprofile v0.139.0 go.opentelemetry.io/collector/pdata/testdata v0.139.0 go.opentelemetry.io/collector/pdata/xpdata v0.139.0 - go.opentelemetry.io/collector/pipeline/xpipeline v0.139.0 + go.opentelemetry.io/collector/pipeline/xpipeline v0.138.0 go.opentelemetry.io/otel v1.38.0 go.opentelemetry.io/otel/sdk v1.38.0 go.opentelemetry.io/otel/trace v1.38.0 @@ -61,8 +61,8 @@ require ( go.opentelemetry.io/otel/sdk/metric v1.38.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/sys v0.35.0 // indirect - 
google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b // indirect + golang.org/x/sys v0.36.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect google.golang.org/grpc v1.76.0 // indirect google.golang.org/protobuf v1.36.10 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/exporter/exporterhelper/xexporterhelper/go.sum b/exporter/exporterhelper/xexporterhelper/go.sum index 12a6fdb8e2c..2ae909dd4a6 100644 --- a/exporter/exporterhelper/xexporterhelper/go.sum +++ b/exporter/exporterhelper/xexporterhelper/go.sum @@ -79,12 +79,12 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b h1:zPKJod4w6F1+nRGDI9ubnXYhU9NSWoFAijkHkUXeTK8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250804133106-a7a43d27e69b/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc 
v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= From 232d073818ce2c0452ee1daff3d0621b1de1080c Mon Sep 17 00:00:00 2001 From: raghu999 Date: Sat, 8 Nov 2025 02:29:21 -0500 Subject: [PATCH 2/4] update version to 0.139.0 and fix lint errors --- exporter/exporterhelper/go.mod | 4 ++-- .../exporterhelper/internal/arc/controller_metrics_test.go | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/exporter/exporterhelper/go.mod b/exporter/exporterhelper/go.mod index 8abc5794f1f..629f702e07a 100644 --- a/exporter/exporterhelper/go.mod +++ b/exporter/exporterhelper/go.mod @@ -16,8 +16,8 @@ require ( go.opentelemetry.io/collector/consumer/consumererror v0.139.0 go.opentelemetry.io/collector/consumer/consumertest v0.139.0 go.opentelemetry.io/collector/exporter v1.45.0 - go.opentelemetry.io/collector/exporter/exportertest v0.138.0 - go.opentelemetry.io/collector/extension/extensiontest v0.138.0 + go.opentelemetry.io/collector/exporter/exportertest v0.139.0 + go.opentelemetry.io/collector/extension/extensiontest v0.139.0 go.opentelemetry.io/collector/extension/xextension v0.139.0 go.opentelemetry.io/collector/featuregate v1.45.0 go.opentelemetry.io/collector/pdata v1.45.0 diff --git a/exporter/exporterhelper/internal/arc/controller_metrics_test.go b/exporter/exporterhelper/internal/arc/controller_metrics_test.go index e8af3a399a6..ee6cb109d91 100644 --- a/exporter/exporterhelper/internal/arc/controller_metrics_test.go +++ b/exporter/exporterhelper/internal/arc/controller_metrics_test.go @@ -82,7 +82,10 @@ func TestController_BackpressureEmitsMetricsAndShrinks(t *testing.T) { require.True(t, found, "backoff_events not found") require.GreaterOrEqual(t, backoffs, int64(1)) - 
down := append(common, attribute.String("direction", "down")) + down := make([]attribute.KeyValue, len(common)+1) + copy(down, common) + down[len(common)] = attribute.String("direction", "down") + found, downs := findSum(rm, "otelcol_exporter_arc_limit_changes", down) require.True(t, found, "limit_changes{down} not found") require.GreaterOrEqual(t, downs, int64(1)) From 88486c87cd81a177a4afa53a4159a7a88aea9c28 Mon Sep 17 00:00:00 2001 From: raghu999 Date: Sun, 9 Nov 2025 02:27:02 -0500 Subject: [PATCH 3/4] update exporter helper with new implementation --- .../exporterhelper/generated_package_test.go | 3 +- .../exporterhelper/internal/arc/controller.go | 588 +++++++----------- .../internal/arc/controller_metrics_test.go | 92 --- .../internal/arc/controller_test.go | 582 +++++++++-------- exporter/exporterhelper/internal/arc/ewma.go | 53 +- .../exporterhelper/internal/arc/ewma_test.go | 126 ++-- .../exporterhelper/internal/arc/params.go | 47 ++ .../exporterhelper/internal/arc/token_pool.go | 117 ++++ .../internal/arc/token_pool_test.go | 193 ++++++ .../internal/experr/back_pressure.go | 137 ++-- .../internal/experr/back_pressure_test.go | 137 +++- .../metadatatest/generated_telemetrytest.go | 3 +- .../exporterhelper/internal/queue_sender.go | 22 +- .../internal/queue_sender_test.go | 117 +++- 14 files changed, 1326 insertions(+), 891 deletions(-) delete mode 100644 exporter/exporterhelper/internal/arc/controller_metrics_test.go create mode 100644 exporter/exporterhelper/internal/arc/params.go create mode 100644 exporter/exporterhelper/internal/arc/token_pool.go create mode 100644 exporter/exporterhelper/internal/arc/token_pool_test.go diff --git a/exporter/exporterhelper/generated_package_test.go b/exporter/exporterhelper/generated_package_test.go index 27f634c4255..73b52415cde 100644 --- a/exporter/exporterhelper/generated_package_test.go +++ b/exporter/exporterhelper/generated_package_test.go @@ -3,8 +3,9 @@ package exporterhelper import ( - "go.uber.org/goleak" 
"testing" + + "go.uber.org/goleak" ) func TestMain(m *testing.M) { diff --git a/exporter/exporterhelper/internal/arc/controller.go b/exporter/exporterhelper/internal/arc/controller.go index 899b9eea8be..96378671f39 100644 --- a/exporter/exporterhelper/internal/arc/controller.go +++ b/exporter/exporterhelper/internal/arc/controller.go @@ -5,7 +5,6 @@ package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/int import ( "context" - "errors" "math" "sync" "time" @@ -14,124 +13,161 @@ import ( "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" "go.opentelemetry.io/collector/pipeline" ) -type Config struct { - Enabled bool `mapstructure:"enabled"` - InitialLimit int `mapstructure:"initial_limit"` - MaxConcurrency int `mapstructure:"max_concurrency"` - DecreaseRatio float64 `mapstructure:"decrease_ratio"` - EwmaAlpha float64 `mapstructure:"ewma_alpha"` - DeviationScale float64 `mapstructure:"deviation_scale"` -} - -func DefaultConfig() Config { - return Config{ - Enabled: false, - InitialLimit: 1, - MaxConcurrency: 200, - DecreaseRatio: 0.9, - EwmaAlpha: 0.4, - DeviationScale: 2.5, - } -} - +// Controller coordinates ARC. Internally it uses a control law and a TokenPool gate. type Controller struct { - cfg Config - sem *shrinkSem + cfg Config + pool *TokenPool + + // Telemetry builder (generated by metadata) and instruments. 
+ tel *metadata.TelemetryBuilder + rttInst metric.Int64Histogram + failuresInst metric.Int64Counter + limitChangesInst metric.Int64Counter + limitUpAttrs metric.MeasurementOption + limitDownAttrs metric.MeasurementOption + backoffInst metric.Int64Counter + syncAttrs metric.MeasurementOption mu sync.Mutex - st state - - // Telemetry - tel *metadata.TelemetryBuilder - rttInst metric.Int64Histogram - failuresInst metric.Int64Counter - limitChangesInst metric.Int64Counter - limitChangesUpAttr metric.MeasurementOption - limitChangesDownAttr metric.MeasurementOption - backoffInst metric.Int64Counter - // common attr set for sync metrics (exporter + data_type) - commonAttr metric.MeasurementOption + st struct { + limit int + inFlight int + credits int // ceiling-hit credits (for additive increase) + pressure bool // any explicit pressure in period + periodStart time.Time + periodDur time.Duration + reMean robustEWMA // robust mean + absdev estimator for RTT + prevRTTMean float64 + prevRTTDev float64 + lastRTTMean float64 + lastRTTDev float64 + } } -type state struct { - limit int - inFlight int - past ewmaVar - curr mean - nextTick time.Time - hadPressure bool - hitCeiling bool +// NewController creates a new ARC Controller. +func NewController(cfg Config, tel *metadata.TelemetryBuilder, id component.ID, signal pipeline.Signal) *Controller { + def := DefaultConfig() + if cfg.InitialLimit <= 0 { + cfg.InitialLimit = 1 + } + if cfg.MaxConcurrency <= 0 { + cfg.MaxConcurrency = def.MaxConcurrency + } + if cfg.InitialLimit > cfg.MaxConcurrency { + cfg.InitialLimit = cfg.MaxConcurrency + } + if cfg.DecreaseRatio <= 0 || cfg.DecreaseRatio >= 1 { + cfg.DecreaseRatio = def.DecreaseRatio + } + if cfg.EwmaAlpha <= 0 || cfg.EwmaAlpha >= 1 { + cfg.EwmaAlpha = def.EwmaAlpha + } + if cfg.DeviationScale < 0 { + cfg.DeviationScale = def.DeviationScale + } + + // Attributes used on emitted metrics. 
+ exporterAttr := attribute.String("exporter", id.String()) + dataTypeAttr := attribute.String("data_type", signal.String()) + syncAttrs := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) + limitUpAttrs := metric.WithAttributeSet(attribute.NewSet( + exporterAttr, dataTypeAttr, attribute.String("direction", "up"), + )) + limitDownAttrs := metric.WithAttributeSet(attribute.NewSet( + exporterAttr, dataTypeAttr, attribute.String("direction", "down"), + )) + + c := &Controller{ + cfg: cfg, + pool: newTokenPool(cfg.InitialLimit), + tel: tel, + rttInst: tel.ExporterArcRttMs, + failuresInst: tel.ExporterArcFailures, + limitChangesInst: tel.ExporterArcLimitChanges, + limitUpAttrs: limitUpAttrs, + limitDownAttrs: limitDownAttrs, + backoffInst: tel.ExporterArcBackoffEvents, + syncAttrs: syncAttrs, + } + + c.st.limit = cfg.InitialLimit + c.st.reMean = newRobustEWMA(cfg.EwmaAlpha) + c.st.periodStart = time.Now() + c.st.periodDur = clampDur(minPeriod, maxPeriod, 300*time.Millisecond) // initial period + + // Register async gauges. 
+ _ = tel.RegisterExporterArcLimitCallback(func(_ context.Context, o metric.Int64Observer) error { + o.Observe(int64(c.CurrentLimit()), syncAttrs) + return nil + }) + _ = tel.RegisterExporterArcPermitsInUseCallback(func(_ context.Context, o metric.Int64Observer) error { + o.Observe(int64(c.PermitsInUse()), syncAttrs) + return nil + }) + + return c } -func NewController(cfg Config, tel *metadata.TelemetryBuilder, id component.ID, sig pipeline.Signal) *Controller { - c := cfg - if c.InitialLimit <= 0 { - c.InitialLimit = DefaultConfig().InitialLimit - } - if c.MaxConcurrency <= 0 { - c.MaxConcurrency = DefaultConfig().MaxConcurrency - } - if c.DecreaseRatio <= 0 || c.DecreaseRatio >= 1 { - c.DecreaseRatio = DefaultConfig().DecreaseRatio - } - if c.EwmaAlpha <= 0 || c.EwmaAlpha >= 1 { - c.EwmaAlpha = DefaultConfig().EwmaAlpha - } - if c.DeviationScale < 0 { - c.DeviationScale = DefaultConfig().DeviationScale +// controlStep applies the ARC control law at the end of a period. +// Caller must hold c.mu. +func (c *Controller) controlStep(ctx context.Context) { + limitBefore := c.st.limit + + // If we don't have a prior baseline yet, treat RTT spike detection as disabled + // (no isSpike) but still allow explicit-pressure decreases in cold start. + hasBaseline := c.st.prevRTTMean > 0 + threshold := c.st.prevRTTMean + c.cfg.DeviationScale*c.st.prevRTTDev + isSpike := hasBaseline && c.st.reMean.initialized() && c.st.lastRTTMean > threshold + + // Decrease on explicit pressure or RTT spike. 
+ if c.st.limit > 1 && (c.st.pressure || isSpike) { + newLimit := int(math.Floor(float64(c.st.limit) * c.cfg.DecreaseRatio)) + if newLimit < 1 { + newLimit = 1 + } + if dec := c.st.limit - newLimit; dec > 0 { + c.pool.Shrink(dec) + c.st.limit = newLimit + if c.tel != nil { + if c.backoffInst != nil { + c.backoffInst.Add(contextOrBG(ctx), 1, c.syncAttrs) + } + if c.limitChangesInst != nil && c.st.limit != limitBefore { + c.limitChangesInst.Add(contextOrBG(ctx), 1, c.limitDownAttrs) + } + } + c.st.credits = 0 + c.st.pressure = false + return + } } - ctrl := &Controller{ - cfg: c, - sem: newShrinkSem(c.InitialLimit), - tel: tel, - } - ctrl.st.limit = c.InitialLimit - ctrl.st.nextTick = time.Now() - ctrl.st.past = newEwmaVar(c.EwmaAlpha) - - if tel != nil { - exporterAttr := attribute.String("exporter", id.String()) - dataTypeAttr := attribute.String("data_type", sig.String()) - - // Use exporter + data_type on ALL sync datapoints - common := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) - ctrl.commonAttr = common - - ctrl.rttInst = tel.ExporterArcRttMs - ctrl.failuresInst = tel.ExporterArcFailures - ctrl.limitChangesInst = tel.ExporterArcLimitChanges - ctrl.backoffInst = tel.ExporterArcBackoffEvents - - // Include data_type on the limit change counters too - ctrl.limitChangesUpAttr = metric.WithAttributeSet(attribute.NewSet( - exporterAttr, dataTypeAttr, attribute.String("direction", "up"), - )) - ctrl.limitChangesDownAttr = metric.WithAttributeSet(attribute.NewSet( - exporterAttr, dataTypeAttr, attribute.String("direction", "down"), - )) - - // Async gauges also include exporter + data_type - asyncCommon := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) - _ = tel.RegisterExporterArcLimitCallback(func(_ context.Context, o metric.Int64Observer) error { - o.Observe(int64(ctrl.CurrentLimit()), asyncCommon) - return nil - }) - _ = tel.RegisterExporterArcPermitsInUseCallback(func(_ context.Context, o metric.Int64Observer) 
error { - o.Observe(int64(ctrl.PermitsInUse()), asyncCommon) - return nil - }) + // Additive increase is *disabled* until we have a prior baseline (first roll). + // This avoids a spurious +1 when tests "poke" the controller to roll the period. + canIncreaseRTT := !c.st.reMean.initialized() || c.st.lastRTTMean <= threshold + if hasBaseline && // <- require baseline to allow additive increase + c.st.limit < c.cfg.MaxConcurrency && + !c.st.pressure && + c.st.credits >= requiredCredits(c.st.limit) && + canIncreaseRTT { + c.pool.Grow(1) + c.st.limit++ + if c.tel != nil && c.limitChangesInst != nil && c.st.limit != limitBefore { + c.limitChangesInst.Add(contextOrBG(ctx), 1, c.limitUpAttrs) + } + c.st.credits = 0 // reset after successful increase + return } - return ctrl + // Modest credit decay if no change happened. + c.st.credits = int(float64(c.st.credits) * 0.5) + c.st.pressure = false } -// Shutdown unregisters any asynchronous callbacks. func (c *Controller) Shutdown() { if c == nil { return @@ -139,299 +175,157 @@ func (c *Controller) Shutdown() { if c.tel != nil { c.tel.Shutdown() } - if c.sem != nil { - c.sem.close() + if c.pool != nil { + c.pool.Close() } } -// Acquire blocks until a slot is available or ctx is done. -// Returns true iff the slot was acquired. +// Acquire obtains a permit unless ARC is disabled or the context is cancelled. func (c *Controller) Acquire(ctx context.Context) bool { - return c.sem.acquire(ctx) == nil + // Fast-path bypass: if ARC is disabled, do not gate. + if !c.cfg.Enabled { + return true + } + // TokenPool.Acquire returns an error; return true when no error (permit obtained). + return c.pool.Acquire(ctx) == nil } -// Release releases one slot back to the semaphore. -func (c *Controller) Release() { c.sem.release() } +// Release returns a permit. +func (c *Controller) Release() { c.pool.Release() } -// ReleaseWithSample provides feedback to the controller and releases one slot. 
+// ReleaseWithSample reports a sample (RTT + classification) and releases. func (c *Controller) ReleaseWithSample(ctx context.Context, rtt time.Duration, success, backpressure bool) { c.Feedback(ctx, rtt, success, backpressure) - c.Release() -} - -// CurrentLimit returns the current concurrency limit. -func (c *Controller) CurrentLimit() int { - c.mu.Lock() - defer c.mu.Unlock() - return c.st.limit } -func (c *Controller) PermitsInUse() int { - c.mu.Lock() - defer c.mu.Unlock() - return c.st.inFlight -} - -// StartRequest increments in-flight count and tracks ceiling hit. +// StartRequest increments in-flight accounting and emits "in use" telemetry. +// Keep this as a no-op when disabled. func (c *Controller) StartRequest() { + if !c.cfg.Enabled { + return + } + c.mu.Lock() + // ===== existing in-flight++ and "reachedLimit" logic ===== c.st.inFlight++ if c.st.inFlight >= c.st.limit { - c.st.hitCeiling = true + // record a ceiling hit as a credit for additive increase logic + c.st.credits++ } + // Emit "permits in use" observable via tel, if you have it + // (Leave as-is if already wired; this is illustrative.) + // c.tel.RecordExporterArcPermitsInUse(int64(c.st.inFlight), c.attrs...) c.mu.Unlock() } -// Feedback provides the observed RTT and signals if backpressure or success occurred. -func (c *Controller) Feedback(ctx context.Context, rtt time.Duration, success, backpressure bool) { - now := time.Now() +func (c *Controller) CurrentLimit() int { c.mu.Lock() + defer c.mu.Unlock() + return c.st.limit +} - // Record RTT (guard nil telemetry) - if c.tel != nil && c.rttInst != nil { - c.rttInst.Record(ctx, rtt.Milliseconds(), c.commonAttr) - } - - // Defensively avoid negative inFlight if caller didn't call StartRequest. - if c.st.inFlight > 0 { - c.st.inFlight-- - } - - // Track this window's RTTs only for successes (to avoid biasing the mean). 
- if success { - c.st.curr.add(rtt.Seconds()) - } - - // If we got any explicit backpressure signal this window, remember it. - if backpressure { - c.st.hadPressure = true - } - - if !success && c.tel != nil && c.failuresInst != nil { - // Record failures (non-permanent errors or backpressure). - c.failuresInst.Add(ctx, 1, c.commonAttr) - } +func (c *Controller) PermitsInUse() int { + c.pool.mu.Lock() + defer c.pool.mu.Unlock() + return c.pool.inUse +} - // EARLY BACKOFF ON COLD START: - // If we've not yet initialized EWMA (no window mean set) but already see pressure, - // immediately apply multiplicative decrease so we don't stall under load. - if backpressure && c.st.limit > 1 && !c.st.past.initialized() { - old := c.st.limit - n := int(math.Max(1, math.Floor(float64(c.st.limit)*c.cfg.DecreaseRatio))) - if n < c.st.limit { - c.sem.forget(c.st.limit - n) - c.st.limit = n - if c.tel != nil && c.backoffInst != nil { - c.backoffInst.Add(ctx, 1, c.commonAttr) - } - if c.tel != nil && c.limitChangesInst != nil && c.st.limit != old { - c.limitChangesInst.Add(ctx, 1, c.limitChangesDownAttr) - } - } +// Feedback reports a sample (RTT + classification) and releases the permit. +// It must be called exactly once for every StartRequest that has a corresponding Acquire=true. +func (c *Controller) Feedback(ctx context.Context, rtt time.Duration, success, isBackpressure bool) { + // Ensure the permit is always released. + if c.cfg.Enabled { + // Release via the pool to match other Release implementations. + c.pool.Release() } - // Initialize the first window based on measured mean. - if !c.st.past.initialized() { - if m := c.st.curr.average(); m > 0 { - c.st.past.update(m) - // Next tick after 'm' seconds; caller may clamp externally if desired. - c.st.nextTick = now.Add(time.Duration(m * float64(time.Second))) - } - c.mu.Unlock() + // No controller updates when disabled. + if !c.cfg.Enabled { return } - // Window tick: adjust concurrency once per window. 
- if !now.Before(c.st.nextTick) { - m := c.st.curr.average() - if m > 0 { - c.st.past.update(m) - } - - // This will consider hadPressure or an RTT spike vs mean+Kσ. - c.adjust(ctx, m) + // RTT is provided by the caller. + c.mu.Lock() + defer c.mu.Unlock() - // Reset window state. - c.st.curr = mean{} - c.st.hadPressure = false - c.st.hitCeiling = false - c.st.nextTick = now.Add(time.Duration(c.st.past.mean * float64(time.Second))) + // ===== existing in-flight-- ===== + if c.st.inFlight > 0 { + c.st.inFlight-- } - c.mu.Unlock() -} - -func (c *Controller) adjust(ctx context.Context, current float64) { - // NOTE: caller holds c.mu - dev := math.Sqrt(c.st.past.variance) - thr := dev * c.cfg.DeviationScale - - // Try additive increase: only when at ceiling, no pressure, and RTT <= mean. - if c.st.limit < c.cfg.MaxConcurrency && - c.st.hitCeiling && - !c.st.hadPressure && - current > 0 && - current <= c.st.past.mean { - old := c.st.limit - c.sem.addOne() - c.st.limit++ - - if c.tel != nil && c.limitChangesInst != nil && c.st.limit != old { - c.limitChangesInst.Add(ctx, 1, c.limitChangesUpAttr) + // ===== robust-EWMA update with rtt when success ===== + if success { + // Update EWMA using RTT in milliseconds as a float64 + ms := float64(rtt.Milliseconds()) + c.st.lastRTTMean, c.st.lastRTTDev = c.st.reMean.update(ms) + if c.rttInst != nil { + c.rttInst.Record(contextOrBG(ctx), rtt.Milliseconds(), c.syncAttrs) } - return } - // Multiplicative decrease: on explicit pressure or RTT spike. 
- if c.st.limit > 1 && (c.st.hadPressure || current >= c.st.past.mean+thr) { - old := c.st.limit - - n := int(math.Max(1, math.Floor(float64(c.st.limit)*c.cfg.DecreaseRatio))) - if n < c.st.limit { - c.sem.forget(c.st.limit - n) - c.st.limit = n - - // limit decreased => backoff & limit-change callbacks - if c.tel != nil && c.backoffInst != nil { - c.backoffInst.Add(ctx, 1, c.commonAttr) - } - if c.tel != nil && c.limitChangesInst != nil && c.st.limit != old { - c.limitChangesInst.Add(ctx, 1, c.limitChangesDownAttr) - } + // Backpressure and failure accounting + if isBackpressure { + c.st.pressure = true + if c.failuresInst != nil { + c.failuresInst.Add(contextOrBG(ctx), 1, c.syncAttrs) + } + } else if !success { + // Non-backpressure failures are also counted. + if c.failuresInst != nil { + c.failuresInst.Add(contextOrBG(ctx), 1, c.syncAttrs) } } -} - -// ----------------------- shrinkable semaphore ----------------------- - -type shrinkSem struct { - mu sync.Mutex - avail int - waiting []chan struct{} - pendingF int - closed bool -} -func newShrinkSem(n int) *shrinkSem { return &shrinkSem{avail: n} } - -// acquire waits for capacity or returns ctx error. -// NOTE: shrink is paid by future releases (pendingF), not by acquires. -func (s *shrinkSem) acquire(ctx context.Context) error { - ch := make(chan struct{}, 1) - - s.mu.Lock() - if s.closed { - s.mu.Unlock() - return experr.NewShutdownErr(errors.New("arc semaphore closed")) - } - if s.avail > 0 { - s.avail-- - s.mu.Unlock() - return nil - } - s.waiting = append(s.waiting, ch) - s.mu.Unlock() - - select { - case <-ch: - // If we were woken up but are now closed, return error - s.mu.Lock() - closed := s.closed - s.mu.Unlock() - if closed { - return experr.NewShutdownErr(errors.New("arc semaphore closed")) - } - return nil - case <-ctx.Done(): - // Remove our waiter entry if still queued. 
- s.mu.Lock() - if s.closed { - s.mu.Unlock() - return experr.NewShutdownErr(errors.New("arc semaphore closed")) - } - for i, w := range s.waiting { - if w == ch { - s.waiting = append(s.waiting[:i], s.waiting[i+1:]...) - break - } + // ===== periodic control step / AIMD ===== + now := time.Now() + if now.Sub(c.st.periodStart) > c.st.periodDur { + // Call the controlStep + c.controlStep(ctx) + + // Update prev RTT stats *after* controlStep uses them + c.st.prevRTTMean = c.st.lastRTTMean + c.st.prevRTTDev = c.st.lastRTTDev + + // Reset for next period + c.st.periodStart = now + + // Recompute next control interval from observed RTT, clamped + if c.st.reMean.initialized() { + // Use mean RTT + 1 deviation as the period duration + newDur := time.Duration(c.st.lastRTTMean+c.st.lastRTTDev) * time.Millisecond + c.st.periodDur = clampDur(minPeriod, maxPeriod, newDur) } - s.mu.Unlock() - return ctx.Err() } } -// release returns capacity to the semaphore with correct shrink priority: -// 1) pay down pending forgets, 2) wake next waiter, 3) increase avail. -func (s *shrinkSem) release() { - s.mu.Lock() - defer s.mu.Unlock() - - if s.pendingF > 0 { - s.pendingF-- - return - } - if len(s.waiting) > 0 { - w := s.waiting[0] - s.waiting = s.waiting[1:] - w <- struct{}{} - return +// requiredCredits tunes how aggressively we add permits. +func requiredCredits(limit int) int { + if limit <= 1 { + return 1 } - s.avail++ -} - -// forget reduces effective capacity by n. -// It first consumes avail, then accumulates pending forgets to be paid by releases. -func (s *shrinkSem) forget(n int) { - if n <= 0 { - return + if limit < 8 { + return 2 } - s.mu.Lock() - defer s.mu.Unlock() - - for i := 0; i < n; i++ { - if s.avail > 0 { - s.avail-- - } else { - s.pendingF++ - } + if limit < 32 { + return 3 } + return 4 } -// addOne increases the available permits by exactly 1, paying any pending -// shrink (forget) first, or waking a waiter if present. 
-func (s *shrinkSem) addOne() { - s.mu.Lock() - defer s.mu.Unlock() - - // First, pay down any pending shrink. - if s.pendingF > 0 { - s.pendingF-- - return +func clampDur(minDur, maxDur, v time.Duration) time.Duration { + if v < minDur { + return minDur } - - // If someone is waiting, wake one waiter instead of increasing avail. - if len(s.waiting) > 0 { - w := s.waiting[0] - s.waiting = s.waiting[1:] - w <- struct{}{} - return + if v > maxDur { + return maxDur } - - // Otherwise just increase available permits. - s.avail++ + return v } -// close marks the semaphore as closed and wakes up all waiters. -func (s *shrinkSem) close() { - s.mu.Lock() - defer s.mu.Unlock() - - if s.closed { - return - } - s.closed = true - for _, w := range s.waiting { - close(w) // Close channel to signal +// contextOrBG returns the provided context or context.Background() when nil. +func contextOrBG(ctx context.Context) context.Context { + if ctx != nil { + return ctx } - s.waiting = nil + return context.Background() } diff --git a/exporter/exporterhelper/internal/arc/controller_metrics_test.go b/exporter/exporterhelper/internal/arc/controller_metrics_test.go deleted file mode 100644 index ee6cb109d91..00000000000 --- a/exporter/exporterhelper/internal/arc/controller_metrics_test.go +++ /dev/null @@ -1,92 +0,0 @@ -package arc - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" - "go.opentelemetry.io/otel/attribute" - sdkmetric "go.opentelemetry.io/otel/sdk/metric" - sdkmetricdata "go.opentelemetry.io/otel/sdk/metric/metricdata" - - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/component/componenttest" - "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" - "go.opentelemetry.io/collector/pipeline" -) - -func collectOne(t *testing.T, r *sdkmetric.ManualReader) sdkmetricdata.ResourceMetrics { - t.Helper() - var out sdkmetricdata.ResourceMetrics - require.NoError(t, r.Collect(context.Background(), &out)) 
- return out -} - -func findSum(rm sdkmetricdata.ResourceMetrics, name string, attrs []attribute.KeyValue) (bool, int64) { - want := attribute.NewSet(attrs...) // canonical comparison set - for _, sm := range rm.ScopeMetrics { - for _, m := range sm.Metrics { - if m.Name != name { - continue - } - if s, ok := m.Data.(sdkmetricdata.Sum[int64]); ok { - for _, dp := range s.DataPoints { - // Works for both pointer and value receivers - if (&dp.Attributes).Equals(&want) || dp.Attributes.Equals(&want) { - return true, dp.Value - } - } - } - } - } - return false, 0 -} - -func TestController_BackpressureEmitsMetricsAndShrinks(t *testing.T) { - reader := sdkmetric.NewManualReader() - mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - ts := componenttest.NewNopTelemetrySettings() - ts.MeterProvider = mp - - tel, err := metadata.NewTelemetryBuilder(ts) - require.NoError(t, err) - - cfg := DefaultConfig() - cfg.InitialLimit = 8 - cfg.MaxConcurrency = 16 - cfg.DecreaseRatio = 0.5 - - id := component.MustNewID("otlp") - ctrl := NewController(cfg, tel, id, pipeline.SignalTraces) - - // Initialize EWMA so the controller takes the 'normal' decrease path - ctrl.mu.Lock() - ctrl.st.past.update(0.01) // small mean to make tick scheduling trivial - ctrl.st.limit = 8 - ctrl.st.hadPressure = true // force a decrease on adjust - ctrl.mu.Unlock() - - // Trigger a window tick -> adjust() sees hadPressure => multiplicative decrease. 
- ctrl.Feedback(context.Background(), 1*time.Millisecond, true /* success */, true /* backpressure */) - time.Sleep(10 * time.Millisecond) - - rm := collectOne(t, reader) - - common := []attribute.KeyValue{ - attribute.String("exporter", id.String()), - attribute.String("data_type", pipeline.SignalTraces.String()), - } - - found, backoffs := findSum(rm, "otelcol_exporter_arc_backoff_events", common) - require.True(t, found, "backoff_events not found") - require.GreaterOrEqual(t, backoffs, int64(1)) - - down := make([]attribute.KeyValue, len(common)+1) - copy(down, common) - down[len(common)] = attribute.String("direction", "down") - - found, downs := findSum(rm, "otelcol_exporter_arc_limit_changes", down) - require.True(t, found, "limit_changes{down} not found") - require.GreaterOrEqual(t, downs, int64(1)) -} diff --git a/exporter/exporterhelper/internal/arc/controller_test.go b/exporter/exporterhelper/internal/arc/controller_test.go index 48bf40b604a..c786e21b490 100644 --- a/exporter/exporterhelper/internal/arc/controller_test.go +++ b/exporter/exporterhelper/internal/arc/controller_test.go @@ -1,11 +1,11 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" +package arc import ( "context" - "sync" + "math" "testing" "time" @@ -14,323 +14,365 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/component/componenttest" - "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" "go.opentelemetry.io/collector/pipeline" ) -func TestNewControllerDefaults(t *testing.T) { - cfg := Config{ - InitialLimit: 0, - MaxConcurrency: 0, - DecreaseRatio: 0, - EwmaAlpha: 0, - DeviationScale: -1, - } - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) +// newTestController creates a controller with NOP telemetry. 
+func newTestController(t *testing.T, cfg Config) *Controller { + set := componenttest.NewNopTelemetrySettings() + tel, err := metadata.NewTelemetryBuilder(set) + require.NoError(t, err) + return NewController(cfg, tel, component.MustNewID("test"), pipeline.SignalTraces) +} - if got, want := ctrl.CurrentLimit(), DefaultConfig().InitialLimit; got != want { - t.Fatalf("unexpected InitialLimit: got %d want %d", got, want) - } - if ctrl.cfg.MaxConcurrency != DefaultConfig().MaxConcurrency { - t.Fatalf("MaxConcurrency default not applied") - } - if ctrl.cfg.DecreaseRatio != DefaultConfig().DecreaseRatio { - t.Fatalf("DecreaseRatio default not applied") - } - if ctrl.cfg.EwmaAlpha != DefaultConfig().EwmaAlpha { - t.Fatalf("EwmaAlpha default not applied") - } - if ctrl.cfg.DeviationScale != DefaultConfig().DeviationScale { - t.Fatalf("DeviationScale default not applied") - } +// forceControlStep triggers a feedback loop that is guaranteed to be after the period duration. +func forceControlStep(c *Controller) { + // Wait past the period and poke again to trigger controlStep + time.Sleep(c.st.periodDur + 1*time.Millisecond) + c.StartRequest() + c.ReleaseWithSample(context.Background(), time.Duration(c.st.lastRTTMean*float64(time.Millisecond)), true, false) } -func TestAcquireReleaseSemaphore(t *testing.T) { - cfg := Config{InitialLimit: 2} - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) +func TestController_NewShutdown(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + c := newTestController(t, cfg) + require.NotNil(t, c) + assert.Equal(t, cfg.InitialLimit, c.CurrentLimit()) + c.Shutdown() +} - ctx := context.Background() - if !ctrl.Acquire(ctx) { - t.Fatal("expected first Acquire to succeed") - } - if !ctrl.Acquire(ctx) { - t.Fatal("expected second Acquire to succeed") - } +func TestController_ConfigClamping(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + // Intentionally bad values that should be clamped to defaults + 
cfg.InitialLimit = -5 + cfg.MaxConcurrency = 0 + cfg.DecreaseRatio = 1.5 + cfg.EwmaAlpha = -1.0 + cfg.DeviationScale = -2.0 + + c := newTestController(t, cfg) + require.NotNil(t, c) + + // NewController should have clamped these per DefaultConfig()+rules + def := DefaultConfig() + assert.GreaterOrEqual(t, c.cfg.InitialLimit, 1) + assert.Equal(t, def.MaxConcurrency, c.cfg.MaxConcurrency) + assert.InDelta(t, def.DecreaseRatio, c.cfg.DecreaseRatio, 1e-9) + assert.InDelta(t, def.EwmaAlpha, c.cfg.EwmaAlpha, 1e-9) + assert.InDelta(t, def.DeviationScale, c.cfg.DeviationScale, 1e-9) + + // Also ensure InitialLimit <= MaxConcurrency + assert.LessOrEqual(t, c.cfg.InitialLimit, c.cfg.MaxConcurrency) + c.Shutdown() +} - // third attempt should block and then fail with context timeout - toCtx, cancel := context.WithTimeout(context.Background(), 30*time.Millisecond) +func TestController_AcquireRelease(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 2 + c := newTestController(t, cfg) + require.NotNil(t, c) + + // Acquire two permits + assert.True(t, c.Acquire(context.Background())) + assert.Equal(t, 1, c.PermitsInUse()) + assert.True(t, c.Acquire(context.Background())) + assert.Equal(t, 2, c.PermitsInUse()) + + // Third acquire should block and fail with timeout + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Millisecond) defer cancel() - if ctrl.Acquire(toCtx) { - t.Fatal("expected Acquire with short timeout to fail") - } + assert.False(t, c.Acquire(ctx)) + assert.Equal(t, 2, c.PermitsInUse()) - // release one and acquire should succeed - ctrl.Release() - if !ctrl.Acquire(ctx) { - t.Fatal("expected Acquire to succeed after release") - } - // cleanup releases - ctrl.Release() - ctrl.Release() -} + // Release one and acquire again + c.Release() + assert.Equal(t, 1, c.PermitsInUse()) + assert.True(t, c.Acquire(context.Background())) + assert.Equal(t, 2, c.PermitsInUse()) -func TestStartRequestAndPermits(t *testing.T) { - cfg := 
Config{InitialLimit: 3} - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + c.Shutdown() +} - ctrl.StartRequest() - ctrl.StartRequest() +func TestController_AcquireRespectsContextCancel(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 1 - if got := ctrl.PermitsInUse(); got != 2 { - t.Fatalf("unexpected in-flight: got %d want %d", got, 2) - } + c := newTestController(t, cfg) + require.NotNil(t, c) - // Simulate hitting ceiling - ctrl.mu.Lock() - ctrl.st.inFlight = ctrl.st.limit - ctrl.st.hitCeiling = false - ctrl.mu.Unlock() - - ctrl.StartRequest() - ctrl.mu.Lock() - hit := ctrl.st.hitCeiling - ctrl.mu.Unlock() - if !hit { - t.Fatalf("expected hitCeiling to be true when inFlight >= limit") - } -} + // Take the only permit + assert.True(t, c.Acquire(context.Background())) -func TestEarlyBackoffOnColdStart(t *testing.T) { - cfg := Config{InitialLimit: 10, DecreaseRatio: 0.5} - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) + // Second acquire should obey context cancellation + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) + defer cancel() + ok := c.Acquire(ctx) + assert.False(t, ok) - if got := ctrl.CurrentLimit(); got != 10 { - t.Fatalf("expected initial limit 10, got %d", got) - } + // Cleanup + c.Release() + c.Shutdown() +} - // Provide an explicit backpressure while EWMA not initialized. - ctrl.Feedback(context.Background(), 0, true, true) +func TestController_StartRequestCreditsOnlyOnSaturation(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 2 + + c := newTestController(t, cfg) + + // Force a short period so control steps run frequently in other tests, + // but here we're just testing credit accrual behavior. + c.mu.Lock() + c.st.periodDur = 10 * time.Millisecond + c.mu.Unlock() + + // At start, no in-flight and no credits. 
+ assert.Equal(t, 0, c.st.inFlight) + assert.Equal(t, 0, c.st.credits) + + // First request (inFlight=1 < limit=2) -> no credit. + c.StartRequest() + assert.Equal(t, 1, c.st.inFlight) + assert.Equal(t, 0, c.st.credits) + + // Second request (inFlight=2 == limit=2) -> credit++. + c.StartRequest() + assert.Equal(t, 2, c.st.inFlight) + assert.Equal(t, 1, c.st.credits) + + // Third request (inFlight=3 > limit=2) -> credit++. + c.StartRequest() + assert.Equal(t, 3, c.st.inFlight) + assert.Equal(t, 2, c.st.credits) + + // Release all three via ReleaseWithSample which also calls Release(). + c.ReleaseWithSample(context.Background(), 10*time.Millisecond, true, false) + c.ReleaseWithSample(context.Background(), 10*time.Millisecond, true, false) + c.ReleaseWithSample(context.Background(), 10*time.Millisecond, true, false) + assert.Equal(t, 0, c.st.inFlight) + + c.Shutdown() +} - if got := ctrl.CurrentLimit(); got != 5 { - t.Fatalf("expected early backoff to reduce limit to 5, got %d", got) - } +func TestController_Feedback_UpdatesEWMAOnlyOnSuccess(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 1 + cfg.EwmaAlpha = 0.5 + + c := newTestController(t, cfg) + + // First successful sample initializes the robust EWMA + c.StartRequest() + c.ReleaseWithSample(context.Background(), 100*time.Millisecond, true, false) + require.True(t, c.st.reMean.initialized()) + mean1 := c.st.lastRTTMean + dev1 := c.st.lastRTTDev + + // Failed/backpressure=false sample should NOT update EWMA + c.StartRequest() + c.ReleaseWithSample(context.Background(), 400*time.Millisecond, false, false) + assert.InDelta(t, mean1, c.st.lastRTTMean, 1e-9) + assert.InDelta(t, dev1, c.st.lastRTTDev, 1e-9) + // Successful sample should update EWMA + c.StartRequest() + c.ReleaseWithSample(context.Background(), 200*time.Millisecond, true, false) + assert.Greater(t, math.Abs(mean1-c.st.lastRTTMean), 1e-9) + + c.Shutdown() } -func TestAdjustIncreaseAndDecrease(t *testing.T) { - // Additive 
increase case +func TestController_AdditiveIncreaseAndCreditReset(t *testing.T) { cfg := DefaultConfig() + cfg.Enabled = true cfg.InitialLimit = 1 - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) - - ctrl.mu.Lock() - // initialize past EWMA so past.initialized() is true - ctrl.st.past.update(1.0) - ctrl.st.hitCeiling = true - ctrl.st.hadPressure = false - ctrl.st.limit = 1 - ctrl.mu.Unlock() - - ctrl.mu.Lock() - ctrl.adjust(context.Background(), 1.0) - ctrl.mu.Unlock() - - if got := ctrl.CurrentLimit(); got != 2 { - t.Fatalf("expected additive increase to raise limit to 2, got %d", got) + cfg.MaxConcurrency = 5 + cfg.EwmaAlpha = 0.5 + cfg.DeviationScale = 2.0 + + c := newTestController(t, cfg) + // Short test period + c.mu.Lock() + c.st.periodDur = 10 * time.Millisecond + c.mu.Unlock() + + // Prime EWMA with a stable RTT (so no spike) + c.StartRequest() + c.ReleaseWithSample(context.Background(), 100*time.Millisecond, true, false) + + // Force a control step to set prevRTTMean + forceControlStep(c) + assert.Equal(t, 1, c.CurrentLimit()) // Should not have increased + assert.Positive(t, c.st.prevRTTMean) + + // Build up enough credits to trigger an increase. + reqCredits := requiredCredits(c.CurrentLimit()) + for i := 0; i < reqCredits; i++ { + c.StartRequest() + } + // Release these; we want the control step to see saturation and no pressure. 
+ for i := 0; i < reqCredits; i++ { + c.ReleaseWithSample(context.Background(), 100*time.Millisecond, true, false) } - // Multiplicative decrease case - cfg2 := DefaultConfig() - cfg2.InitialLimit = 10 - cfg2.DecreaseRatio = 0.5 - ctrl2 := NewController(cfg2, nil, component.ID{}, pipeline.SignalTraces) - - ctrl2.mu.Lock() - ctrl2.st.past.update(1.0) - ctrl2.st.hadPressure = true - ctrl2.st.limit = 10 - ctrl2.mu.Unlock() + // Wait past the period and poke again to trigger controlStep + forceControlStep(c) - ctrl2.mu.Lock() - ctrl2.adjust(context.Background(), 0.0) - ctrl2.mu.Unlock() + assert.Equal(t, 2, c.CurrentLimit(), "limit should increase additively by 1") + assert.Equal(t, 0, c.st.credits, "credits reset after increase") - if got := ctrl2.CurrentLimit(); got != 5 { - t.Fatalf("expected multiplicative decrease to reduce limit to 5, got %d", got) - } + c.Shutdown() } -func TestShrinkSem_ForgetAndRelease(t *testing.T) { - s := newShrinkSem(2) - - // Use capacity - require.NoError(t, s.acquire(context.Background())) - require.NoError(t, s.acquire(context.Background())) - require.Equal(t, 0, s.avail) +func TestController_MultiplicativeDecreaseOnBackpressure(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 4 + cfg.MaxConcurrency = 10 + cfg.DecreaseRatio = 0.5 // make the effect obvious - // Forget 1, adds pending Forget - s.forget(1) - require.Equal(t, 0, s.avail) - require.Equal(t, 1, s.pendingF) + c := newTestController(t, cfg) + require.Equal(t, 4, c.CurrentLimit()) - // Release 1, pays pending Forget - s.release() - require.Equal(t, 0, s.avail) - require.Equal(t, 0, s.pendingF) + // Short control period + c.mu.Lock() + c.st.periodDur = 10 * time.Millisecond + c.mu.Unlock() - // Next acquire should block - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) - require.Error(t, s.acquire(ctx)) // Fails on timeout - cancel() - - // Release second acquire - s.release() - require.Equal(t, 1, s.avail) - 
require.Equal(t, 0, s.pendingF) - - // Acquire should now succeed - require.NoError(t, s.acquire(context.Background())) - require.Equal(t, 0, s.avail) - - // Forget 2. No avail, one permit outstanding. - // Both forgets should go to pendingF. - s.forget(2) - require.Equal(t, 0, s.avail) - require.Equal(t, 2, s.pendingF) - - // Release (pays one pending forget) - s.release() - require.Equal(t, 0, s.avail) - require.Equal(t, 1, s.pendingF) -} + // Cause an explicit backpressure signal + c.StartRequest() + c.ReleaseWithSample(context.Background(), 50*time.Millisecond, true, true) -func TestShrinkSem_AddAndAcquire(t *testing.T) { - s := newShrinkSem(1) - require.NoError(t, s.acquire(context.Background())) - require.Equal(t, 0, s.avail) - - // Start a waiting acquire in a goroutine - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - // Must use assert in goroutines - assert.NoError(t, s.acquire(context.Background())) - s.release() // release it right away - }() - - // Allow goroutine to block on acquire - time.Sleep(10 * time.Millisecond) - - // Add one permit; should wake the waiter - s.addOne() - wg.Wait() // ensure goroutine finished - - s.mu.Lock() - require.Equal(t, 1, s.avail) // the goroutine acquired then released once - require.Empty(t, s.waiting) - s.mu.Unlock() - - // Test Add paying pending Forgets - s.forget(2) // avail=1 -> 0, pendingF=1 - require.Equal(t, 0, s.avail) - require.Equal(t, 1, s.pendingF) - - s.addOne() // pays pendingF - require.Equal(t, 0, s.avail) - require.Equal(t, 0, s.pendingF) - - s.addOne() // increases avail - require.Equal(t, 1, s.avail) - require.Equal(t, 0, s.pendingF) -} + // Wait and trigger the control step + forceControlStep(c) -func TestController_Shutdown(t *testing.T) { - // Test that shutdown doesn't panic - ctrl := NewController(DefaultConfig(), nil, component.ID{}, pipeline.SignalTraces) - ctrl.Shutdown() + // newLimit = floor(4 * 0.5) = 2 + assert.Equal(t, 2, c.CurrentLimit()) + assert.Equal(t, 0, 
c.st.credits) - // Test that shutdown with telemetry doesn't panic - tel, err := metadata.NewTelemetryBuilder(componenttest.NewNopTelemetrySettings()) - require.NoError(t, err) - ctrlWithTel := NewController(DefaultConfig(), tel, component.ID{}, pipeline.SignalTraces) - ctrlWithTel.Shutdown() + c.Shutdown() } -func TestController_Shutdown_UnblocksWaiters(t *testing.T) { - ctrl := NewController(Config{InitialLimit: 1}, nil, component.ID{}, pipeline.SignalTraces) +func TestController_DecreaseOnRTTSpike(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 4 + cfg.DeviationScale = 1.0 + cfg.EwmaAlpha = 0.5 - // Acquire the only permit - require.True(t, ctrl.Acquire(context.Background())) + c := newTestController(t, cfg) - errCh := make(chan error, 1) - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - // This should block - errCh <- ctrl.sem.acquire(context.Background()) - }() + // Short control period + c.mu.Lock() + c.st.periodDur = 10 * time.Millisecond + c.mu.Unlock() - // Give the goroutine time to block - time.Sleep(50 * time.Millisecond) + // Establish baseline RTT ~100ms + c.StartRequest() + c.ReleaseWithSample(context.Background(), 100*time.Millisecond, true, false) - // Shutdown the controller, which closes the semaphore - ctrl.Shutdown() + // Force a control step to set prevRTTMean + forceControlStep(c) + assert.Equal(t, 4, c.CurrentLimit()) + assert.Positive(t, c.st.prevRTTMean) - // Wait for the goroutine to finish - wg.Wait() + // Trigger a spike sample + c.StartRequest() + c.ReleaseWithSample(context.Background(), 600*time.Millisecond, true, false) - // Check that the blocked goroutine received a shutdown error - err := <-errCh - require.Error(t, err) - assert.True(t, experr.IsShutdownErr(err), "error should be a shutdown error") + // After control step, limit should drop by DecreaseRatio (default 0.9) + forceControlStep(c) - // Test that new acquires also fail - err = ctrl.sem.acquire(context.Background()) 
- require.Error(t, err) - assert.True(t, experr.IsShutdownErr(err), "error should be a shutdown error") + // Expect a decrease by floor(4*0.9) = 3 (one step down) + assert.Equal(t, 3, c.CurrentLimit()) + c.Shutdown() } -func TestController_ReleaseWithSample(t *testing.T) { - cfg := Config{InitialLimit: 1} - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) +func TestController_MinFloorAndMaxCap(t *testing.T) { + cfg := DefaultConfig() + cfg.Enabled = true + cfg.InitialLimit = 1 + cfg.MaxConcurrency = 2 + cfg.DecreaseRatio = 0.1 + + c := newTestController(t, cfg) - ctx := context.Background() - require.True(t, ctrl.Acquire(ctx)) - ctrl.StartRequest() - - // Initialize EWMA - ctrl.ReleaseWithSample(ctx, 10*time.Millisecond, true, false) - ctrl.mu.Lock() - require.True(t, ctrl.st.past.initialized()) - require.Equal(t, 0, ctrl.st.inFlight) - ctrl.mu.Unlock() + // Try to decrease at the floor; should stay at 1 + c.mu.Lock() + c.st.periodDur = 10 * time.Millisecond + c.mu.Unlock() + + c.StartRequest() + c.ReleaseWithSample(context.Background(), 50*time.Millisecond, true, true) // pressure + forceControlStep(c) + assert.Equal(t, 1, c.CurrentLimit(), "should not go below 1") + + // Build credits to increase to the cap = 2 + reqCredits := requiredCredits(c.CurrentLimit()) + for i := 0; i < reqCredits; i++ { + c.StartRequest() + } + for i := 0; i < reqCredits; i++ { + c.ReleaseWithSample(context.Background(), 50*time.Millisecond, true, false) + } + forceControlStep(c) + assert.Equal(t, 2, c.CurrentLimit(), "should increase to cap") + + // Further credit should not exceed MaxConcurrency + reqCredits = requiredCredits(c.CurrentLimit()) + for i := 0; i < reqCredits; i++ { + c.StartRequest() + } + for i := 0; i < reqCredits; i++ { + c.ReleaseWithSample(context.Background(), 50*time.Millisecond, true, false) + } + forceControlStep(c) + assert.Equal(t, 2, c.CurrentLimit(), "should not exceed cap") + + c.Shutdown() } -func TestController_FeedbackWindowTick(t 
*testing.T) { +func TestController_ReleaseWithSampleReleases(t *testing.T) { cfg := DefaultConfig() + cfg.Enabled = true cfg.InitialLimit = 1 - ctrl := NewController(cfg, nil, component.ID{}, pipeline.SignalTraces) - - // Initialize past EWMA - ctrl.mu.Lock() - ctrl.st.past.update(1.0) - ctrl.st.limit = 1 - ctrl.st.nextTick = time.Now() // Force tick on next feedback - ctrl.st.hitCeiling = true // Enable additive increase - ctrl.mu.Unlock() - - // This feedback should trigger a window tick - ctrl.Feedback(context.Background(), 10*time.Millisecond, true, false) - - ctrl.mu.Lock() - // Check if adjust ran (limit increased) - require.Equal(t, 2, ctrl.st.limit) - // Check if state was reset - require.Equal(t, 0, ctrl.st.curr.n) - require.False(t, ctrl.st.hadPressure) - require.False(t, ctrl.st.hitCeiling) - require.True(t, ctrl.st.nextTick.After(time.Now())) // Next tick is in the future - ctrl.mu.Unlock() + + c := newTestController(t, cfg) + + // Acquire via the pool to bump inUse + assert.True(t, c.Acquire(context.Background())) + assert.Equal(t, 1, c.PermitsInUse()) + + // StartRequest increments inFlight; ReleaseWithSample must also release the pool permit + c.StartRequest() + c.ReleaseWithSample(context.Background(), 10*time.Millisecond, true, false) + + assert.Equal(t, 0, c.PermitsInUse()) + c.Shutdown() +} + +func Test_requiredCredits(t *testing.T) { + assert.Equal(t, 1, requiredCredits(0)) + assert.Equal(t, 1, requiredCredits(1)) + assert.Equal(t, 2, requiredCredits(2)) + assert.Equal(t, 2, requiredCredits(7)) + assert.Equal(t, 3, requiredCredits(8)) + assert.Equal(t, 3, requiredCredits(31)) + assert.Equal(t, 4, requiredCredits(32)) + assert.Equal(t, 4, requiredCredits(100)) +} + +func Test_contextOrBG(t *testing.T) { + var nilCtx context.Context // avoid SA1012 by not passing a literal nil + bg := contextOrBG(nilCtx) + require.NotNil(t, bg) + + ctx := context.Background() + require.Equal(t, ctx, contextOrBG(ctx)) } diff --git 
a/exporter/exporterhelper/internal/arc/ewma.go b/exporter/exporterhelper/internal/arc/ewma.go index a6220f9cef8..35b11e30aac 100644 --- a/exporter/exporterhelper/internal/arc/ewma.go +++ b/exporter/exporterhelper/internal/arc/ewma.go @@ -3,35 +3,42 @@ package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" -type mean struct { - n int - sum float64 -} +import "math" // robustEWMA tracks an exponentially-weighted moving *mean* and a robust +// dispersion proxy: the EWMA of absolute residuals, i.e., E[ |x - mean_prev| ]. +// This behaves similarly to a mean + K*sigma threshold but is cheaper and avoids +// squaring; it is less sensitive to outliers than naive variance updates. -func (m *mean) add(x float64) { m.n++; m.sum += x } -func (m *mean) average() float64 { - if m.n == 0 { - return 0 - } - return m.sum / float64(m.n) +type robustEWMA struct { + alpha float64 + init bool + mean float64 + absDev float64 // EWMA of absolute residuals relative to previous mean } -type ewmaVar struct { - alpha float64 - mean float64 - variance float64 - init bool +func newRobustEWMA(alpha float64) robustEWMA { + return robustEWMA{alpha: alpha} } -func newEwmaVar(alpha float64) ewmaVar { return ewmaVar{alpha: alpha} } -func (e *ewmaVar) initialized() bool { return e.init } -func (e *ewmaVar) update(x float64) { +func (e *robustEWMA) initialized() bool { return e.init } + +// update returns the new mean and absDev after ingesting x. 
+func (e *robustEWMA) update(x float64) (float64, float64) { + if math.IsNaN(x) || math.IsInf(x, 0) { + return e.mean, e.absDev + } + if !e.init { - e.mean, e.variance, e.init = x, 0, true - return + e.mean = x + e.absDev = 0 + e.init = true + return e.mean, e.absDev } - mPrev := e.mean + prev := e.mean e.mean = e.alpha*x + (1-e.alpha)*e.mean - d := x - mPrev - e.variance = e.alpha*(d*d) + (1-e.alpha)*e.variance + resid := x - prev + if resid < 0 { + resid = -resid + } + e.absDev = e.alpha*resid + (1-e.alpha)*e.absDev + return e.mean, e.absDev } diff --git a/exporter/exporterhelper/internal/arc/ewma_test.go b/exporter/exporterhelper/internal/arc/ewma_test.go index aaa5df0325f..98f25b8b693 100644 --- a/exporter/exporterhelper/internal/arc/ewma_test.go +++ b/exporter/exporterhelper/internal/arc/ewma_test.go @@ -1,81 +1,87 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" +package arc import ( "math" "testing" -) -func TestMean(t *testing.T) { - m := mean{} - if got := m.average(); got != 0 { - t.Fatalf("average of empty mean should be 0, got %f", got) - } + "github.com/stretchr/testify/assert" +) - m.add(10) - if got := m.average(); got != 10 { - t.Fatalf("average after adding 10 should be 10, got %f", got) - } +const ( + testAlpha = 0.4 + testEpsilon = 1e-9 +) - m.add(20) - if got := m.average(); got != 15 { - t.Fatalf("average after adding 20 should be 15, got %f", got) - } +func TestRobustEWMA_Initial(t *testing.T) { + e := newRobustEWMA(testAlpha) + assert.False(t, e.initialized()) + assert.InDelta(t, testAlpha, e.alpha, testEpsilon) - m.add(30) - if got := m.average(); got != 20 { - t.Fatalf("average after adding 30 should be 20, got %f", got) - } + // First update initializes + mean, absDev := e.update(10) + assert.True(t, e.initialized()) + assert.InDelta(t, 10.0, mean, testEpsilon) + assert.InDelta(t, 0.0, absDev, testEpsilon) + 
assert.InDelta(t, 10.0, e.mean, testEpsilon) + assert.InDelta(t, 0.0, e.absDev, testEpsilon) } -func TestEwmaVar(t *testing.T) { - const alpha = 0.4 - const tolerance = 1e-9 +func TestRobustEWMA_Updates(t *testing.T) { + e := newRobustEWMA(testAlpha) - e := newEwmaVar(alpha) - if e.initialized() { - t.Fatal("newEwmaVar should not be initialized") - } - if e.alpha != alpha { - t.Fatalf("alpha not set correctly, got %f, want %f", e.alpha, alpha) - } - - // First update initializes + // Init e.update(10) - if !e.initialized() { - t.Fatal("ewmaVar should be initialized after first update") - } - if e.mean != 10 { - t.Fatalf("initial mean should be first value, got %f, want 10", e.mean) - } - if e.variance != 0 { - t.Fatalf("initial variance should be 0, got %f, want 0", e.variance) - } // Second update // mean = 0.4 * 20 + 0.6 * 10 = 8 + 6 = 14 - // d = 20 - 10 = 10 - // variance = 0.4 * (10*10) + 0.6 * 0 = 40 - e.update(20) - if math.Abs(e.mean-14.0) > tolerance { - t.Fatalf("second mean incorrect, got %f, want 14.0", e.mean) - } - if math.Abs(e.variance-40.0) > tolerance { - t.Fatalf("second variance incorrect, got %f, want 40.0", e.variance) - } + // resid = 20 - 10 = 10 + // absDev = 0.4 * 10 + 0.6 * 0 = 4 + mean, absDev := e.update(20) + assert.InDelta(t, 14.0, mean, testEpsilon) + assert.InDelta(t, 4.0, absDev, testEpsilon) + assert.InDelta(t, 14.0, e.mean, testEpsilon) + assert.InDelta(t, 4.0, e.absDev, testEpsilon) // Third update - // mPrev = 14 - // mean = 0.4 * 15 + 0.6 * 14 = 6 + 8.4 = 14.4 - // d = 15 - 14 = 1 - // variance = 0.4 * (1*1) + 0.6 * 40 = 0.4 + 24 = 24.4 - e.update(15) - if math.Abs(e.mean-14.4) > tolerance { - t.Fatalf("third mean incorrect, got %f, want 14.4", e.mean) - } - if math.Abs(e.variance-24.4) > tolerance { - t.Fatalf("third variance incorrect, got %f, want 24.4", e.variance) - } + // prev = 14 + // mean = 0.4 * 12 + 0.6 * 14 = 4.8 + 8.4 = 13.2 + // resid = 12 - 14 = -2 -> 2 (abs) + // absDev = 0.4 * 2 + 0.6 * 4 = 0.8 + 2.4 = 3.2 + 
mean, absDev = e.update(12) + assert.InDelta(t, 13.2, mean, testEpsilon) + assert.InDelta(t, 3.2, absDev, testEpsilon) + assert.InDelta(t, 13.2, e.mean, testEpsilon) + assert.InDelta(t, 3.2, e.absDev, testEpsilon) + + // Fourth update (check negative residual logic) + // prev = 13.2 + // mean = 0.4 * 30 + 0.6 * 13.2 = 12 + 7.92 = 19.92 + // resid = 30 - 13.2 = 16.8 + // absDev = 0.4 * 16.8 + 0.6 * 3.2 = 6.72 + 1.92 = 8.64 + mean, absDev = e.update(30) + assert.InDelta(t, 19.92, mean, testEpsilon) + assert.InDelta(t, 8.64, absDev, testEpsilon) + assert.InDelta(t, 19.92, e.mean, testEpsilon) + assert.InDelta(t, 8.64, e.absDev, testEpsilon) +} + +func TestRobustEWMA_NaN(t *testing.T) { + e := newRobustEWMA(testAlpha) + e.update(10) + e.update(20) + + // Update with NaN + e.update(math.NaN()) + + // Should not propagate NaN + assert.False(t, math.IsNaN(e.mean)) + assert.False(t, math.IsNaN(e.absDev)) + + // Should not propagate Inf + e.update(math.Inf(1)) + assert.False(t, math.IsInf(e.mean, 1)) + assert.False(t, math.IsInf(e.absDev, 1)) } diff --git a/exporter/exporterhelper/internal/arc/params.go b/exporter/exporterhelper/internal/arc/params.go new file mode 100644 index 00000000000..c69d38bf2f9 --- /dev/null +++ b/exporter/exporterhelper/internal/arc/params.go @@ -0,0 +1,47 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" + +import ( + "time" +) + +// Config exposes the knobs required by exporterhelper queue integration. +// Ranges (validated by NewController): +// - InitialLimit >= 1 +// - MaxConcurrency >= InitialLimit +// - DecreaseRatio in (0, 1) +// - EwmaAlpha in (0, 1) +// - DeviationScale >= 0 +// +// ControlPeriod bounds (derived): [minPeriod, maxPeriod] where +// minPeriod = 50ms, maxPeriod = 2s. +// The target period is derived from the robust EWMA mean but clamped to +// these bounds to avoid pathological long/short windows. 
+ +type Config struct { + Enabled bool `mapstructure:"enabled"` + InitialLimit int `mapstructure:"initial_concurrency"` + MaxConcurrency int `mapstructure:"max_concurrency"` + DecreaseRatio float64 `mapstructure:"decrease_ratio"` + EwmaAlpha float64 `mapstructure:"ewma_alpha"` + DeviationScale float64 `mapstructure:"rtt_deviation_scale"` +} + +func DefaultConfig() Config { + return Config{ + Enabled: false, + InitialLimit: 1, + MaxConcurrency: 200, + DecreaseRatio: 0.9, + EwmaAlpha: 0.4, + DeviationScale: 2.5, + } +} + +// control period clamps +const ( + minPeriod = 50 * time.Millisecond + maxPeriod = 2 * time.Second +) diff --git a/exporter/exporterhelper/internal/arc/token_pool.go b/exporter/exporterhelper/internal/arc/token_pool.go new file mode 100644 index 00000000000..c8e78d1ba90 --- /dev/null +++ b/exporter/exporterhelper/internal/arc/token_pool.go @@ -0,0 +1,117 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package arc // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" + +import ( + "context" + "errors" + "sync" +) + +// TokenPool is a fair gate for request concurrency: +// - cap: maximum in-flight +// - inUse: current in-flight +// Acquire blocks while inUse >= cap. Shrinking is done by reducing cap; no +// "debt" bookkeeping or forgetting permits. This deliberately differs from a +// shrinkable semaphore that accrues forgets. + +type TokenPool struct { + mu sync.Mutex + cond *sync.Cond + cap int + inUse int + dead bool +} + +func newTokenPool(initial int) *TokenPool { + p := &TokenPool{cap: initial} + p.cond = sync.NewCond(&p.mu) + return p +} + +func (p *TokenPool) Close() { + p.mu.Lock() + defer p.mu.Unlock() + p.dead = true + p.cond.Broadcast() +} + +func (p *TokenPool) Acquire(ctx context.Context) error { + // Fast fail if already canceled. 
+ if ctx != nil && ctx.Err() != nil { + return ctx.Err() + } + + p.mu.Lock() + defer p.mu.Unlock() + + if p.dead { + return errors.New("token pool closed") + } + + // Wake on context cancellation; ensure goroutine exits via close(done). + done := make(chan struct{}) + go func() { + select { + case <-ctx.Done(): + p.cond.Broadcast() + case <-done: + } + }() + defer close(done) + + for !p.dead && p.inUse >= p.cap { + p.cond.Wait() + + // Re-check cancellation and pool state after each wake. + if ctx != nil && ctx.Err() != nil { + return ctx.Err() + } + if p.dead { + return errors.New("token pool closed") + } + } + + if p.dead { + return errors.New("token pool closed") + } + if ctx != nil && ctx.Err() != nil { + return ctx.Err() + } + + p.inUse++ + return nil +} + +func (p *TokenPool) Release() { + p.mu.Lock() + p.inUse-- + if p.inUse < 0 { + p.inUse = 0 + } + p.mu.Unlock() + p.cond.Signal() +} + +func (p *TokenPool) Grow(n int) { + if n <= 0 { + return + } + p.mu.Lock() + p.cap += n + p.mu.Unlock() + p.cond.Broadcast() +} + +func (p *TokenPool) Shrink(n int) { + if n <= 0 { + return + } + p.mu.Lock() + p.cap -= n + if p.cap < 1 { + p.cap = 1 + } + p.mu.Unlock() +} diff --git a/exporter/exporterhelper/internal/arc/token_pool_test.go b/exporter/exporterhelper/internal/arc/token_pool_test.go new file mode 100644 index 00000000000..b7b0e0e3d53 --- /dev/null +++ b/exporter/exporterhelper/internal/arc/token_pool_test.go @@ -0,0 +1,193 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package arc + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTokenPool_New(t *testing.T) { + p := newTokenPool(10) + assert.Equal(t, 10, p.cap) + assert.Equal(t, 0, p.inUse) + assert.NotNil(t, p.cond) + assert.False(t, p.dead) +} + +func TestTokenPool_AcquireRelease(t *testing.T) { + p := newTokenPool(2) + + // Acquire first permit + err := 
p.Acquire(context.Background()) + require.NoError(t, err) + assert.Equal(t, 1, p.inUse) + + // Acquire second permit + err = p.Acquire(context.Background()) + require.NoError(t, err) + assert.Equal(t, 2, p.inUse) + + // Release first permit + p.Release() + assert.Equal(t, 1, p.inUse) + + // Acquire again + err = p.Acquire(context.Background()) + require.NoError(t, err) + assert.Equal(t, 2, p.inUse) + + // Release both + p.Release() + p.Release() + assert.Equal(t, 0, p.inUse) +} + +func TestTokenPool_Release_NoOverRelease(t *testing.T) { + p := newTokenPool(1) + p.Release() + assert.Equal(t, 0, p.inUse) // Should not go negative +} + +func TestTokenPool_Blocking(t *testing.T) { + p := newTokenPool(1) + + // Acquire the only permit + err := p.Acquire(context.Background()) + require.NoError(t, err) + assert.Equal(t, 1, p.inUse) + + // This next acquire should block + blockerDone := make(chan struct{}) + go func() { + err := p.Acquire(context.Background()) + assert.NoError(t, err) + assert.Equal(t, 1, p.inUse) + p.Release() + close(blockerDone) + }() + + // Give goroutine time to block + time.Sleep(20 * time.Millisecond) + assert.Equal(t, 1, p.inUse) + + // Release the permit + p.Release() + + // The goroutine should now unblock + select { + case <-blockerDone: + // success + case <-time.After(100 * time.Millisecond): + t.Fatal("goroutine did not unblock after release") + } + + assert.Equal(t, 0, p.inUse) +} + +func TestTokenPool_Close(t *testing.T) { + p := newTokenPool(1) + + // Acquire the only permit + err := p.Acquire(context.Background()) + require.NoError(t, err) + + // This next acquire should block + blockerDone := make(chan error, 1) + go func() { + blockerDone <- p.Acquire(context.Background()) + }() + + // Give goroutine time to block + time.Sleep(20 * time.Millisecond) + + // Close the pool + p.Close() + assert.True(t, p.dead) + + // The goroutine should unblock with an error + select { + case closeErr := <-blockerDone: // Renamed 'err' to 'closeErr' to 
avoid shadow + require.Error(t, closeErr) + assert.Contains(t, closeErr.Error(), "token pool closed") + case <-time.After(100 * time.Millisecond): + t.Fatal("goroutine did not unblock after Close") + } + + // New acquires should also fail + err = p.Acquire(context.Background()) + require.Error(t, err) + assert.Contains(t, err.Error(), "token pool closed") +} + +func TestTokenPool_Grow(t *testing.T) { + p := newTokenPool(1) + + // Acquire the only permit + err := p.Acquire(context.Background()) + require.NoError(t, err) + + // This next acquire should block + blockerDone := make(chan struct{}) + go func() { + err := p.Acquire(context.Background()) + assert.NoError(t, err) + close(blockerDone) + }() + + // Give goroutine time to block + time.Sleep(20 * time.Millisecond) + + // Grow the pool + p.Grow(1) + assert.Equal(t, 2, p.cap) + + // The goroutine should unblock + select { + case <-blockerDone: + // success + case <-time.After(100 * time.Millisecond): + t.Fatal("goroutine did not unblock after Grow") + } + + assert.Equal(t, 2, p.inUse) +} + +func TestTokenPool_Acquire_ContextCanceled(t *testing.T) { + p := newTokenPool(1) + + // Acquire the only permit + err := p.Acquire(context.Background()) + require.NoError(t, err) + assert.Equal(t, 1, p.inUse) + + // This next acquire should block + ctx, cancel := context.WithCancel(context.Background()) + blockerDone := make(chan error, 1) + go func() { + blockerDone <- p.Acquire(ctx) + }() + + // Give goroutine time to block + time.Sleep(20 * time.Millisecond) + + // Cancel the context + cancel() + + // The goroutine should unblock with an error + select { + case acqErr := <-blockerDone: // This line fixes the shadow error + require.Error(t, acqErr) + require.ErrorIs(t, acqErr, context.Canceled) + case <-time.After(100 * time.Millisecond): + t.Fatal("goroutine did not unblock after context cancel") + } + + // The permit was not acquired + assert.Equal(t, 1, p.inUse) +} diff --git 
a/exporter/exporterhelper/internal/experr/back_pressure.go b/exporter/exporterhelper/internal/experr/back_pressure.go index b1973b15c63..a085e18814e 100644 --- a/exporter/exporterhelper/internal/experr/back_pressure.go +++ b/exporter/exporterhelper/internal/experr/back_pressure.go @@ -1,80 +1,117 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package experr // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" +package experr import ( "context" "errors" "net" + "net/http" "strings" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) -type backpressureError struct{ error } +// Reason is emitted to logs/metrics when desired. +// You don't have to use it in telemetry if your metadata builder +// doesn't have a "reason" attribute yet. +type Reason string -func NewBackpressureError(err error) error { - if err == nil { - return nil - } - return backpressureError{err} -} +const ( + ReasonGRPCResourceExhausted Reason = "grpc_resource_exhausted" + ReasonGRPCUnavailable Reason = "grpc_unavailable" + ReasonGRPCDeadline Reason = "grpc_deadline_exceeded" + ReasonDeadline Reason = "deadline_exceeded" + ReasonHTTP429 Reason = "http_429" + ReasonHTTP503 Reason = "http_503" + ReasonHTTP504 Reason = "http_504" + ReasonHTTPRetryAfter Reason = "http_retry_after" + ReasonNetTimeout Reason = "net_timeout" + ReasonTextBackpressure Reason = "text_backpressure" +) -// IsBackpressure returns true if the error is an explicit backpressure error -// OR it matches well-known HTTP/gRPC backpressure signals. -func IsBackpressure(err error) bool { +// ClassifyBackpressure returns (isBackpressure, reason). +// The logic is strictly: +// 1. Typed signals first (fast, no allocs) +// 2. Fallback to a single lowercase + a few contains checks +// +// NOTE: This function runs only on error paths. Even then, +// typed checks return early. Fallback string scanning is a +// micro-cost compared to network I/O and retry backoff. 
+func ClassifyBackpressure(err error) (bool, Reason) { if err == nil { - return false + return false, "" } - // Our explicit wrapper. - var x backpressureError - if errors.As(err, &x) { - return true + + // 1) Context deadline/timeout + if errors.Is(err, context.DeadlineExceeded) { + return true, ReasonDeadline } - // Check for net.Error timeout - var ne net.Error - if errors.As(err, &ne) && ne.Timeout() { - return true + // 2) net.Error timeouts + var nerr net.Error + if errors.As(err, &nerr) && nerr.Timeout() { + return true, ReasonNetTimeout } - // Context timed out (treat as transient/backpressure for ARC) - if errors.Is(err, context.DeadlineExceeded) { - return true + // 3) gRPC statuses (common in OTLP/gRPC) + if st, ok := status.FromError(err); ok { + switch st.Code() { + case codes.ResourceExhausted: + return true, ReasonGRPCResourceExhausted + case codes.Unavailable: + return true, ReasonGRPCUnavailable + case codes.DeadlineExceeded: + return true, ReasonGRPCDeadline + } } - // gRPC: RESOURCE_EXHAUSTED (8), UNAVAILABLE (14), DEADLINE_EXCEEDED (4) - if s, ok := status.FromError(err); ok { - switch s.Code() { - case codes.ResourceExhausted, codes.Unavailable, codes.DeadlineExceeded: - return true + // 4) Wrapped HTTP response providers (if your error wraps response) + // Define a minimal interface to avoid importing your transport layer. + type respCarrier interface{ Response() *http.Response } + var rc respCarrier + if errors.As(err, &rc) { + if r := rc.Response(); r != nil { + switch r.StatusCode { + case http.StatusTooManyRequests: + return true, ReasonHTTP429 + case http.StatusServiceUnavailable: + return true, ReasonHTTP503 + case http.StatusGatewayTimeout: + return true, ReasonHTTP504 + } + if r.Header.Get("Retry-After") != "" { + return true, ReasonHTTPRetryAfter + } } } - // HTTP-ish strings (safe fallback; exporters often wrap as text) - // NOTE: intentionally case-insensitive and substring-based. 
- msg := strings.ToLower(err.Error()) - if strings.Contains(msg, "429") || - strings.Contains(msg, "too many requests") || - strings.Contains(msg, "503") || - strings.Contains(msg, "service unavailable") || - strings.Contains(msg, "502") || - strings.Contains(msg, "bad gateway") || - strings.Contains(msg, "504") || - strings.Contains(msg, "gateway timeout") || - strings.Contains(msg, "408") || - strings.Contains(msg, "request timeout") || - strings.Contains(msg, "599") || - strings.Contains(msg, "proxy timeout") || - strings.Contains(msg, "resource_exhausted") || - strings.Contains(msg, "unavailable") || - strings.Contains(msg, "deadline_exceeded") || // Added for string-based gRPC errors - strings.Contains(msg, "internal server error") || - strings.Contains(msg, "circuit breaker") { - return true + // 5) Fallback: cheap lowercase + tight substring checks. + // Keep this list intentionally small to minimize false positives. + s := strings.ToLower(err.Error()) + if strings.Contains(s, "429") || strings.Contains(s, "too many requests") { + return true, ReasonHTTP429 + } + if strings.Contains(s, "503") || strings.Contains(s, "service unavailable") { + return true, ReasonHTTP503 + } + if strings.Contains(s, "504") || strings.Contains(s, "gateway timeout") || strings.Contains(s, "upstream timeout") { + return true, ReasonHTTP504 + } + if strings.Contains(s, "retry-after") { + return true, ReasonHTTPRetryAfter } + if strings.Contains(s, "circuit breaker") || strings.Contains(s, "overload") || strings.Contains(s, "backpressure") { + return true, ReasonTextBackpressure + } + + return false, "" +} - return false +// IsBackpressure is a convenience wrapper. 
+func IsBackpressure(err error) bool { + ok, _ := ClassifyBackpressure(err) + return ok } diff --git a/exporter/exporterhelper/internal/experr/back_pressure_test.go b/exporter/exporterhelper/internal/experr/back_pressure_test.go index 96af20c453f..db550a9141f 100644 --- a/exporter/exporterhelper/internal/experr/back_pressure_test.go +++ b/exporter/exporterhelper/internal/experr/back_pressure_test.go @@ -4,7 +4,10 @@ package experr // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" import ( + "context" "errors" + "net/http" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -12,39 +15,129 @@ import ( "google.golang.org/grpc/status" ) -type timeoutNetError struct{} +// ----- helpers / fakes ----- -func (timeoutNetError) Error() string { return "timeout" } -func (timeoutNetError) Timeout() bool { return true } -func (timeoutNetError) Temporary() bool { return false } +// netTimeoutErr implements net.Error and returns Timeout()==true. +type netTimeoutErr struct{} -func TestIsBackpressure_PositiveCases(t *testing.T) { - cases := []error{ - NewBackpressureError(errors.New("wrapped")), - timeoutNetError{}, - errors.New("HTTP 429 Too Many Requests"), - errors.New("received HTTP 503 Service Unavailable"), - errors.New("http 502 bad gateway"), - errors.New("http 504 gateway timeout"), - errors.New("http 408 request timeout"), - errors.New("http 599 proxy timeout"), - errors.New("too many requests"), - errors.New("resource_exhausted"), - errors.New("unavailable"), - status.Error(codes.ResourceExhausted, "overloaded"), - status.Error(codes.Unavailable, "temporarily unavailable"), +func (netTimeoutErr) Error() string { return "i/o timeout" } +func (netTimeoutErr) Timeout() bool { return true } +func (netTimeoutErr) Temporary() bool { return false } // kept for older interfaces + +// httpRespErr is an error that carries an *http.Response and satisfies the +// minimal interface the classifier uses via errors.As. 
+type httpRespErr struct { + resp *http.Response + msg string +} + +func (e httpRespErr) Error() string { return e.msg } +func (e httpRespErr) Response() *http.Response { return e.resp } + +// lower builds a simple error with the provided message (for fallback checks). +func lower(msg string) error { return errors.New(strings.ToLower(msg)) } + +// ----- tests ----- + +func TestClassifyBackpressure_TypedContextDeadline(t *testing.T) { + ok, reason := ClassifyBackpressure(context.DeadlineExceeded) + assert.True(t, ok) + assert.Equal(t, ReasonDeadline, reason) +} + +func TestClassifyBackpressure_TypedNetTimeout(t *testing.T) { + ok, reason := ClassifyBackpressure(netTimeoutErr{}) + assert.True(t, ok) + assert.Equal(t, ReasonNetTimeout, reason) +} + +func TestClassifyBackpressure_TypedGRPC_StatusCodes(t *testing.T) { + cases := []struct { + err error + reason Reason + }{ + {status.Error(codes.ResourceExhausted, "overloaded"), ReasonGRPCResourceExhausted}, + {status.Error(codes.Unavailable, "unavailable"), ReasonGRPCUnavailable}, + {status.Error(codes.DeadlineExceeded, "deadline"), ReasonGRPCDeadline}, } - for _, err := range cases { - assert.True(t, IsBackpressure(err), "expected backpressure: %v", err) + for _, tc := range cases { + ok, reason := ClassifyBackpressure(tc.err) + assert.True(t, ok, tc.err) + assert.Equal(t, tc.reason, reason) + } +} + +func TestClassifyBackpressure_TypedHTTP_StatusCodes(t *testing.T) { + cases := []struct { + code int + header http.Header + reason Reason + }{ + {http.StatusTooManyRequests, nil, ReasonHTTP429}, + {http.StatusServiceUnavailable, nil, ReasonHTTP503}, + {http.StatusGatewayTimeout, nil, ReasonHTTP504}, + {http.StatusOK, http.Header{"Retry-After": []string{"10"}}, ReasonHTTPRetryAfter}, + } + + for _, tc := range cases { + resp := &http.Response{StatusCode: tc.code, Header: tc.header} + err := httpRespErr{resp: resp, msg: "transport error"} + ok, reason := ClassifyBackpressure(err) + assert.True(t, ok, tc) + assert.Equal(t, 
tc.reason, reason, tc) + } +} + +func TestClassifyBackpressure_FallbackStrings_Positive(t *testing.T) { + // These exercise the lowercase + contains path. + cases := []struct { + err error + reason Reason + }{ + {errors.New("HTTP 429 Too Many Requests"), ReasonHTTP429}, + {lower("too many requests"), ReasonHTTP429}, + + {errors.New("received HTTP 503 Service Unavailable"), ReasonHTTP503}, + {lower("service unavailable"), ReasonHTTP503}, + + {errors.New("HTTP 504 Gateway Timeout"), ReasonHTTP504}, + {lower("gateway timeout"), ReasonHTTP504}, + {lower("upstream timeout"), ReasonHTTP504}, + + {lower("retry-after header present"), ReasonHTTPRetryAfter}, + + {lower("circuit breaker open"), ReasonTextBackpressure}, + {lower("system overload"), ReasonTextBackpressure}, + {lower("backpressure detected"), ReasonTextBackpressure}, } + + for _, tc := range cases { + ok, reason := ClassifyBackpressure(tc.err) + assert.True(t, ok, tc.err) + assert.Equal(t, tc.reason, reason, tc.err) + } +} + +func TestIsBackpressure_Positive_Smoke(t *testing.T) { + // A quick smoke test through the convenience wrapper. 
+ assert.True(t, IsBackpressure(status.Error(codes.ResourceExhausted, "x"))) + assert.True(t, IsBackpressure(netTimeoutErr{})) + assert.True(t, IsBackpressure(context.DeadlineExceeded)) + assert.True(t, IsBackpressure(errors.New("429"))) + assert.True(t, IsBackpressure(errors.New("service unavailable"))) + assert.True(t, IsBackpressure(errors.New("gateway timeout"))) } func TestIsBackpressure_NegativeCases(t *testing.T) { cases := []error{ nil, errors.New("some transient error"), - errors.New("internal error without overload signal"), + errors.New("internal server error"), // intentionally not treated as backpressure + errors.New("permission denied"), // not backpressure + errors.New("bad gateway (502)"), // not in our fallback allowlist + errors.New("request timeout (408)"), // not in our fallback allowlist + errors.New("proxy timeout (599)"), // not in our fallback allowlist } for _, err := range cases { diff --git a/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go b/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go index aba8ed904c0..1daff068bed 100644 --- a/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go +++ b/exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go @@ -6,9 +6,10 @@ import ( "testing" "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/component/componenttest" "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest" + + "go.opentelemetry.io/collector/component/componenttest" ) func AssertEqualExporterArcAcquireWaitMs(t *testing.T, tt *componenttest.Telemetry, dps []metricdata.HistogramDataPoint[int64], opts ...metricdatatest.Option) { diff --git a/exporter/exporterhelper/internal/queue_sender.go b/exporter/exporterhelper/internal/queue_sender.go index e0e50ddd90b..2955b175765 100644 --- a/exporter/exporterhelper/internal/queue_sender.go +++ 
b/exporter/exporterhelper/internal/queue_sender.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "strings" "time" "go.opentelemetry.io/otel/attribute" @@ -15,7 +16,6 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/config/configoptional" - "go.opentelemetry.io/collector/consumer/consumererror" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/arc" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr" "go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata" @@ -74,8 +74,8 @@ func NewQueueSender( // Create the attributes for sync ARC metrics exporterAttr := attribute.String("exporter", qSet.ID.String()) - dataTypeAttr := attribute.String("data_type", qSet.Signal.String()) - arcAttrs := metric.WithAttributeSet(attribute.NewSet(exporterAttr, dataTypeAttr)) + dataTypeAttr := attribute.String("data_type", strings.ToLower(qSet.Signal.String())) + arcAttrs := metric.WithAttributes(exporterAttr, dataTypeAttr) origExportFunc := exportFunc exportFunc = func(ctx context.Context, req request.Request) error { @@ -84,6 +84,18 @@ func NewQueueSender( // This context is from the queue, so it's a background context. // If Acquire fails, it means context was cancelled, which shouldn't happen // unless shutdown is happening. 
+ + // Record wait time even on failure + waitMs := time.Since(startWait).Milliseconds() + tel.ExporterArcAcquireWaitMs.Record(ctx, waitMs, arcAttrs) + + // Record failure + tel.ExporterArcFailures.Add(ctx, 1, arcAttrs) + + if err := ctx.Err(); err != nil { + return err + } + return experr.NewShutdownErr(errors.New("arc semaphore closed")) } waitMs := time.Since(startWait).Milliseconds() @@ -95,8 +107,8 @@ func NewQueueSender( rtt := time.Since(startTime) isBackpressure := experr.IsBackpressure(err) - // Apply De Morgan's law: !(A || B) == !A && !B - isSuccess := err == nil || (!consumererror.IsPermanent(err) && !isBackpressure) + + isSuccess := err == nil arcCtl.ReleaseWithSample(ctx, rtt, isSuccess, isBackpressure) return err diff --git a/exporter/exporterhelper/internal/queue_sender_test.go b/exporter/exporterhelper/internal/queue_sender_test.go index a1ce6a1336c..34f88a50f77 100644 --- a/exporter/exporterhelper/internal/queue_sender_test.go +++ b/exporter/exporterhelper/internal/queue_sender_test.go @@ -7,6 +7,7 @@ import ( "context" "errors" "sync" + "sync/atomic" "testing" "time" @@ -15,6 +16,8 @@ import ( "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.uber.org/zap" "go.uber.org/zap/zaptest/observer" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/component/componenttest" @@ -46,7 +49,7 @@ func TestNewQueueSenderFailedRequestDropped(t *testing.T) { assert.Equal(t, "Exporting failed. 
Dropping data.", observed.All()[0].Message) } -func TestQueueSender_ArcAcquireWaitMetric(t *testing.T) { +func TestQueueSender_ArcAcquireWaitMetric_Failure(t *testing.T) { tt := componenttest.NewTelemetry() t.Cleanup(func() { require.NoError(t, tt.Shutdown(context.Background())) }) @@ -58,14 +61,13 @@ func TestQueueSender_ArcAcquireWaitMetric(t *testing.T) { qCfg := NewDefaultQueueConfig() qCfg.Arc.Enabled = true qCfg.Arc.InitialLimit = 1 - // Disable batching and wait for results to test ARC correctly. qCfg.Batch = configoptional.Optional[queuebatch.BatchConfig]{} // This disables batching qCfg.WaitForResult = true - // Create a sender that blocks until the channel is closed + // Create a sender that blocks blocker := make(chan struct{}) mockSender := sender.NewSender(func(context.Context, request.Request) error { - <-blocker + <-blocker // Block the first request return nil }) @@ -73,30 +75,36 @@ func TestQueueSender_ArcAcquireWaitMetric(t *testing.T) { require.NoError(t, err) require.NoError(t, be.Start(context.Background(), componenttest.NewNopHost())) + // Send first request (will block in sender) var sendWg sync.WaitGroup - sendWg.Add(2) - - // Send first request + sendWg.Add(1) go func() { defer sendWg.Done() - // This send will acquire the only permit and block in the sender assert.NoError(t, be.Send(context.Background(), &requesttest.FakeRequest{Items: 1})) }() // Wait for the first request to acquire the permit - time.Sleep(20 * time.Millisecond) + time.Sleep(50 * time.Millisecond) - // Send second request + // Send second request with a context that will be cancelled + ctx, cancel := context.WithCancel(context.Background()) + sendWg.Add(1) go func() { defer sendWg.Done() - // This send will block in arcCtl.Acquire() - assert.NoError(t, be.Send(context.Background(), &requesttest.FakeRequest{Items: 1})) + // This send will block in arcCtl.Acquire() and then fail + sendErr := be.Send(ctx, &requesttest.FakeRequest{Items: 1}) + assert.Error(t, sendErr) + // 
The error should be context.Canceled, as the context passed to Send is canceled. + assert.ErrorIs(t, sendErr, context.Canceled) }() // Wait for the second request to block in Acquire time.Sleep(50 * time.Millisecond) - // Unblock the sender, which will release the permit, unblocking the second request + // Cancel the context + cancel() + + // Unblock the first sender close(blocker) // Wait for both sends to complete @@ -105,21 +113,24 @@ func TestQueueSender_ArcAcquireWaitMetric(t *testing.T) { // Shutdown to ensure all telemetry is flushed require.NoError(t, be.Shutdown(context.Background())) - // Verify the metric + // 4. Verify the acquire wait metric metrics, err := tt.GetMetric("otelcol_exporter_arc_acquire_wait_ms") require.NoError(t, err) points := metrics.Data.(metricdata.Histogram[int64]).DataPoints require.Len(t, points, 1) - // Both requests should have recorded a wait time + // Both requests should have recorded a wait time (1 success, 1 failure) assert.Equal(t, uint64(2), points[0].Count) - - // The max wait time should be > 40ms (accounting for test scheduler variance) - // The first request had minimal wait, the second had ~50ms wait. 
maxVal, ok := points[0].Max.Value() require.True(t, ok, "Max value should be valid") - assert.Greater(t, maxVal, int64(40), "Max wait time should be significant") - assert.Greater(t, points[0].Sum, int64(40), "Sum of wait times should be significant") + assert.Greater(t, maxVal, int64(40), "Max wait time (from failed acquire) should be significant") + + // Verify the arc failures metric + metrics, err = tt.GetMetric("otelcol_exporter_arc_failures") + require.NoError(t, err) + pointsF := metrics.Data.(metricdata.Sum[int64]).DataPoints + require.Len(t, pointsF, 1) + assert.Equal(t, int64(1), pointsF[0].Value) } func TestQueueConfig_Validate(t *testing.T) { @@ -137,3 +148,69 @@ func TestQueueConfig_Validate(t *testing.T) { qCfg.Enabled = false assert.NoError(t, qCfg.Validate()) } + +func TestArc_BackpressureTriggersShrinkAndMetrics(t *testing.T) { + tt := componenttest.NewTelemetry() + t.Cleanup(func() { require.NoError(t, tt.Shutdown(context.Background())) }) + + qSet := queuebatch.AllSettings[request.Request]{ + Signal: pipeline.SignalTraces, + ID: component.NewID(exportertest.NopType), + Telemetry: tt.NewTelemetrySettings(), + } + + qCfg := NewDefaultQueueConfig() + qCfg.WaitForResult = true + qCfg.Batch = configoptional.Optional[queuebatch.BatchConfig]{} // disable batching + qCfg.Arc.Enabled = true + qCfg.Arc.InitialLimit = 3 + qCfg.Arc.MaxConcurrency = 10 + + // First call: explicit backpressure (gRPC ResourceExhausted). Second call: success. + var call int32 + be, err := NewQueueSender( + qSet, + qCfg, + qSet.ID.String(), + sender.NewSender(func(context.Context, request.Request) error { + c := atomic.AddInt32(&call, 1) + if c == 1 { + // 1. 
Simulates a backpressure error + return status.Error(codes.ResourceExhausted, "overload") + } + return nil + }), + ) + require.NoError(t, err) + + require.NoError(t, be.Start(context.Background(), componenttest.NewNopHost())) + // First send (backpressure) + require.Error(t, be.Send(context.Background(), &requesttest.FakeRequest{Items: 1})) + // Ensure control step boundary is crossed (~300ms default period) + time.Sleep(350 * time.Millisecond) + // Second send (success) triggers control step and metrics flush + require.NoError(t, be.Send(context.Background(), &requesttest.FakeRequest{Items: 1})) + require.NoError(t, be.Shutdown(context.Background())) + + // 2. Asserts otelcol_exporter_arc_backoff_events increments + m, err := tt.GetMetric("otelcol_exporter_arc_backoff_events") + require.NoError(t, err) + backoffPoints := m.Data.(metricdata.Sum[int64]).DataPoints + var backoffSum int64 + for _, dp := range backoffPoints { + backoffSum += dp.Value + } + assert.GreaterOrEqual(t, backoffSum, int64(1), "expected at least one backoff event") + + // 3. 
Asserts otelcol_exporter_arc_limit_changes{direction="down"} increments + m, err = tt.GetMetric("otelcol_exporter_arc_limit_changes") + require.NoError(t, err) + lcPoints := m.Data.(metricdata.Sum[int64]).DataPoints + var down int64 + for _, dp := range lcPoints { + if v, ok := dp.Attributes.Value("direction"); ok && v.AsString() == "down" { + down += dp.Value + } + } + assert.GreaterOrEqual(t, down, int64(1), "expected at least one limit-down change") +} From b1c96fb628fff426e522e37cccb7e1bf6f03ac1b Mon Sep 17 00:00:00 2001 From: raghu999 Date: Sun, 16 Nov 2025 00:23:42 -0500 Subject: [PATCH 4/4] fix: remove back pressure and add retryable error --- .../{back_pressure.go => retryable_error.go} | 59 ++++++++++++------- ...essure_test.go => retryable_error_test.go} | 58 ++++++++++-------- .../exporterhelper/internal/queue_sender.go | 2 +- 3 files changed, 74 insertions(+), 45 deletions(-) rename exporter/exporterhelper/internal/experr/{back_pressure.go => retryable_error.go} (64%) rename exporter/exporterhelper/internal/experr/{back_pressure_test.go => retryable_error_test.go} (65%) diff --git a/exporter/exporterhelper/internal/experr/back_pressure.go b/exporter/exporterhelper/internal/experr/retryable_error.go similarity index 64% rename from exporter/exporterhelper/internal/experr/back_pressure.go rename to exporter/exporterhelper/internal/experr/retryable_error.go index a085e18814e..3d960bb4da6 100644 --- a/exporter/exporterhelper/internal/experr/back_pressure.go +++ b/exporter/exporterhelper/internal/experr/retryable_error.go @@ -8,7 +8,7 @@ import ( "errors" "net" "net/http" - "strings" + "regexp" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -29,18 +29,30 @@ const ( ReasonHTTP504 Reason = "http_504" ReasonHTTPRetryAfter Reason = "http_retry_after" ReasonNetTimeout Reason = "net_timeout" - ReasonTextBackpressure Reason = "text_backpressure" + ReasonTextCircuitBreaker Reason = "text:circuit_breaker" + ReasonTextOverload Reason = 
"text:overload" + ReasonTextBackpressure Reason = "text:backpressure" ) -// ClassifyBackpressure returns (isBackpressure, reason). +var fallbackRegex = regexp.MustCompile( + `(?i)(429|too many requests)|` + // ReasonHTTP429 + `(503|service unavailable)|` + // ReasonHTTP503 + `(504|gateway timeout|upstream timeout)|` + // ReasonHTTP504 + `(retry-after)|` + // ReasonHTTPRetryAfter + `(circuit breaker)|` + // ReasonTextCircuitBreaker + `(overload)|` + // ReasonTextOverload + `(backpressure)`, // ReasonTextBackpressure +) + +// ClassifyErrorReason returns (isRetryable, reason). // The logic is strictly: // 1. Typed signals first (fast, no allocs) -// 2. Fallback to a single lowercase + a few contains checks +// 2. Fallback to a single regex match on the error string // // NOTE: This function runs only on error paths. Even then, // typed checks return early. Fallback string scanning is a // micro-cost compared to network I/O and retry backoff. -func ClassifyBackpressure(err error) (bool, Reason) { +func ClassifyErrorReason(err error) (bool, Reason) { if err == nil { return false, "" } @@ -88,30 +100,37 @@ func ClassifyBackpressure(err error) (bool, Reason) { } } - // 5) Fallback: cheap lowercase + tight substring checks. - // Keep this list intentionally small to minimize false positives. - s := strings.ToLower(err.Error()) - if strings.Contains(s, "429") || strings.Contains(s, "too many requests") { - return true, ReasonHTTP429 + // 5) Fallback: cheap regex match on error string. + matches := fallbackRegex.FindStringSubmatch(err.Error()) + if matches == nil { + return false, "" } - if strings.Contains(s, "503") || strings.Contains(s, "service unavailable") { + + // The regex has 7 capture groups, one for each reason. + // FindStringSubmatch returns the full match at [0], then groups at [1]...[n]. + // We check which group captured. 
+ switch { + case matches[1] != "": + return true, ReasonHTTP429 + case matches[2] != "": return true, ReasonHTTP503 - } - if strings.Contains(s, "504") || strings.Contains(s, "gateway timeout") || strings.Contains(s, "upstream timeout") { + case matches[3] != "": return true, ReasonHTTP504 - } - if strings.Contains(s, "retry-after") { + case matches[4] != "": return true, ReasonHTTPRetryAfter - } - if strings.Contains(s, "circuit breaker") || strings.Contains(s, "overload") || strings.Contains(s, "backpressure") { + case matches[5] != "": + return true, ReasonTextCircuitBreaker + case matches[6] != "": + return true, ReasonTextOverload + case matches[7] != "": return true, ReasonTextBackpressure } return false, "" } -// IsBackpressure is a convenience wrapper. -func IsBackpressure(err error) bool { - ok, _ := ClassifyBackpressure(err) +// IsRetryableError is a convenience wrapper. +func IsRetryableError(err error) bool { + ok, _ := ClassifyErrorReason(err) return ok } diff --git a/exporter/exporterhelper/internal/experr/back_pressure_test.go b/exporter/exporterhelper/internal/experr/retryable_error_test.go similarity index 65% rename from exporter/exporterhelper/internal/experr/back_pressure_test.go rename to exporter/exporterhelper/internal/experr/retryable_error_test.go index db550a9141f..f933efe17a4 100644 --- a/exporter/exporterhelper/internal/experr/back_pressure_test.go +++ b/exporter/exporterhelper/internal/experr/retryable_error_test.go @@ -39,19 +39,19 @@ func lower(msg string) error { return errors.New(strings.ToLower(msg)) } // ----- tests ----- -func TestClassifyBackpressure_TypedContextDeadline(t *testing.T) { - ok, reason := ClassifyBackpressure(context.DeadlineExceeded) +func TestClassifyErrorReason_TypedContextDeadline(t *testing.T) { + ok, reason := ClassifyErrorReason(context.DeadlineExceeded) assert.True(t, ok) assert.Equal(t, ReasonDeadline, reason) } -func TestClassifyBackpressure_TypedNetTimeout(t *testing.T) { - ok, reason := 
ClassifyBackpressure(netTimeoutErr{}) +func TestClassifyErrorReason_TypedNetTimeout(t *testing.T) { + ok, reason := ClassifyErrorReason(netTimeoutErr{}) assert.True(t, ok) assert.Equal(t, ReasonNetTimeout, reason) } -func TestClassifyBackpressure_TypedGRPC_StatusCodes(t *testing.T) { +func TestClassifyErrorReason_TypedGRPC_StatusCodes(t *testing.T) { cases := []struct { err error reason Reason @@ -62,13 +62,13 @@ func TestClassifyBackpressure_TypedGRPC_StatusCodes(t *testing.T) { } for _, tc := range cases { - ok, reason := ClassifyBackpressure(tc.err) + ok, reason := ClassifyErrorReason(tc.err) assert.True(t, ok, tc.err) assert.Equal(t, tc.reason, reason) } } -func TestClassifyBackpressure_TypedHTTP_StatusCodes(t *testing.T) { +func TestClassifyErrorReason_TypedHTTP_StatusCodes(t *testing.T) { cases := []struct { code int header http.Header @@ -83,64 +83,74 @@ func TestClassifyBackpressure_TypedHTTP_StatusCodes(t *testing.T) { for _, tc := range cases { resp := &http.Response{StatusCode: tc.code, Header: tc.header} err := httpRespErr{resp: resp, msg: "transport error"} - ok, reason := ClassifyBackpressure(err) + ok, reason := ClassifyErrorReason(err) assert.True(t, ok, tc) assert.Equal(t, tc.reason, reason, tc) } } -func TestClassifyBackpressure_FallbackStrings_Positive(t *testing.T) { - // These exercise the lowercase + contains path. +func TestClassifyErrorReason_FallbackRegex_Positive(t *testing.T) { + // These exercise the regex fallback path. 
cases := []struct { err error reason Reason }{ {errors.New("HTTP 429 Too Many Requests"), ReasonHTTP429}, {lower("too many requests"), ReasonHTTP429}, + {errors.New("some 429 error"), ReasonHTTP429}, {errors.New("received HTTP 503 Service Unavailable"), ReasonHTTP503}, {lower("service unavailable"), ReasonHTTP503}, + {errors.New("503 blah"), ReasonHTTP503}, {errors.New("HTTP 504 Gateway Timeout"), ReasonHTTP504}, {lower("gateway timeout"), ReasonHTTP504}, {lower("upstream timeout"), ReasonHTTP504}, + {errors.New("got a 504"), ReasonHTTP504}, {lower("retry-after header present"), ReasonHTTPRetryAfter}, + {errors.New("Retry-After: 30"), ReasonHTTPRetryAfter}, + + {lower("circuit breaker open"), ReasonTextCircuitBreaker}, + {errors.New("Circuit Breaker"), ReasonTextCircuitBreaker}, + + {lower("system overload"), ReasonTextOverload}, + {errors.New("OVERLOAD"), ReasonTextOverload}, - {lower("circuit breaker open"), ReasonTextBackpressure}, - {lower("system overload"), ReasonTextBackpressure}, {lower("backpressure detected"), ReasonTextBackpressure}, + {errors.New("BACKPRESSURE"), ReasonTextBackpressure}, } for _, tc := range cases { - ok, reason := ClassifyBackpressure(tc.err) + ok, reason := ClassifyErrorReason(tc.err) assert.True(t, ok, tc.err) assert.Equal(t, tc.reason, reason, tc.err) } } -func TestIsBackpressure_Positive_Smoke(t *testing.T) { +func TestIsRetryableError_Positive_Smoke(t *testing.T) { // A quick smoke test through the convenience wrapper. 
- assert.True(t, IsBackpressure(status.Error(codes.ResourceExhausted, "x"))) - assert.True(t, IsBackpressure(netTimeoutErr{})) - assert.True(t, IsBackpressure(context.DeadlineExceeded)) - assert.True(t, IsBackpressure(errors.New("429"))) - assert.True(t, IsBackpressure(errors.New("service unavailable"))) - assert.True(t, IsBackpressure(errors.New("gateway timeout"))) + assert.True(t, IsRetryableError(status.Error(codes.ResourceExhausted, "x"))) + assert.True(t, IsRetryableError(netTimeoutErr{})) + assert.True(t, IsRetryableError(context.DeadlineExceeded)) + assert.True(t, IsRetryableError(errors.New("429"))) + assert.True(t, IsRetryableError(errors.New("service unavailable"))) + assert.True(t, IsRetryableError(errors.New("gateway timeout"))) + assert.True(t, IsRetryableError(errors.New("OVERLOAD"))) } -func TestIsBackpressure_NegativeCases(t *testing.T) { +func TestIsRetryableError_NegativeCases(t *testing.T) { cases := []error{ nil, errors.New("some transient error"), - errors.New("internal server error"), // intentionally not treated as backpressure - errors.New("permission denied"), // not backpressure + errors.New("internal server error"), // intentionally not treated as retryable + errors.New("permission denied"), // not retryable errors.New("bad gateway (502)"), // not in our fallback allowlist errors.New("request timeout (408)"), // not in our fallback allowlist errors.New("proxy timeout (599)"), // not in our fallback allowlist } for _, err := range cases { - assert.False(t, IsBackpressure(err), "expected not backpressure: %v", err) + assert.False(t, IsRetryableError(err), "expected not retryable: %v", err) } } diff --git a/exporter/exporterhelper/internal/queue_sender.go b/exporter/exporterhelper/internal/queue_sender.go index 2955b175765..fc1886c9a39 100644 --- a/exporter/exporterhelper/internal/queue_sender.go +++ b/exporter/exporterhelper/internal/queue_sender.go @@ -106,7 +106,7 @@ func NewQueueSender( err := origExportFunc(ctx, req) rtt := 
time.Since(startTime) - isBackpressure := experr.IsBackpressure(err) + isBackpressure := experr.IsRetryableError(err) isSuccess := err == nil