|
| 1 | +use std::cell::Cell; |
1 | 2 | use std::io::IsTerminal;
|
| 3 | +use std::time::Duration; |
| 4 | +use std::time::Instant; |
2 | 5 |
|
3 | 6 | use env::otel_logs_enabled;
|
4 | 7 | use env::otel_metrics_enabled;
|
@@ -98,12 +101,34 @@ pub fn init(spin_version: String) -> anyhow::Result<ShutdownGuard> {
|
98 | 101 | }
|
99 | 102 |
|
100 | 103 | fn otel_error_handler(err: opentelemetry::global::Error) {
|
| 104 | + // Track the error count |
| 105 | + let signal = match err { |
| 106 | + opentelemetry::global::Error::Metric(_) => "metrics", |
| 107 | + opentelemetry::global::Error::Trace(_) => "traces", |
| 108 | + opentelemetry::global::Error::Log(_) => "logs", |
| 109 | + _ => "unknown", |
| 110 | + }; |
| 111 | + metrics::monotonic_counter!(spin.otel_error_count = 1, signal = signal); |
| 112 | + |
| 113 | + // Only log the first error at ERROR level, subsequent errors will be logged at higher levels and rate limited |
101 | 114 | static FIRST_OTEL_ERROR: std::sync::Once = std::sync::Once::new();
|
102 | 115 | FIRST_OTEL_ERROR.call_once(|| {
|
103 |
| - tracing::error!("There has been an error with the OpenTelemetry system, traces and metrics are likely failing to export"); |
104 |
| - tracing::error!("Further OpenTelemetry errors will be logged at DEBUG level") |
| 116 | + tracing::error!(?err, "OpenTelemetry error"); |
| 117 | + tracing::error!("There has been an error with the OpenTelemetry system. Traces, logs or metrics are likely failing to export."); |
| 118 | + tracing::error!("Further OpenTelemetry errors will be available at WARN level (rate limited) or at TRACE level."); |
105 | 119 | });
|
106 |
| - tracing::debug!(?err, "OpenTelemetry error"); |
| 120 | + |
| 121 | + // Rate limit the logging of the OTel errors to not occur more frequently on each thread than OTEL_ERROR_INTERVAL |
| 122 | + const OTEL_ERROR_INTERVAL: Duration = Duration::from_millis(5000); |
| 123 | + thread_local! { |
| 124 | + static LAST_OTEL_ERROR: Cell<Instant> = Cell::new(Instant::now() - OTEL_ERROR_INTERVAL); |
| 125 | + } |
| 126 | + if LAST_OTEL_ERROR.get().elapsed() > OTEL_ERROR_INTERVAL { |
| 127 | + LAST_OTEL_ERROR.set(Instant::now()); |
| 128 | + tracing::warn!(?err, "OpenTelemetry error"); |
| 129 | + } else { |
| 130 | + tracing::trace!(?err, "OpenTelemetry error"); |
| 131 | + } |
107 | 132 | }
|
108 | 133 |
|
109 | 134 | /// An RAII implementation for connection to open telemetry services.
|
|
0 commit comments