Skip to content

Commit f6d6549

Browse files
committed
Improve OTel error logging
Signed-off-by: Caleb Schoepp <[email protected]>
1 parent 22f9f98 commit f6d6549

File tree

2 files changed

+29
-4
lines changed

2 files changed

+29
-4
lines changed

crates/telemetry/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ edition = { workspace = true }
88
anyhow = { workspace = true }
99
http0 = { version = "0.2.9", package = "http" }
1010
http1 = { version = "1.0.0", package = "http" }
11-
opentelemetry = { version = "0.22.0", features = [ "metrics", "trace"] }
11+
opentelemetry = { version = "0.22.0", features = [ "metrics", "trace", "logs"] }
1212
opentelemetry_sdk = { version = "0.22.1", features = ["rt-tokio", "logs_level_enabled"] }
1313
opentelemetry-otlp = { version = "0.15.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client", "metrics", "grpc-tonic", "logs"] }
1414
opentelemetry-semantic-conventions = "0.14.0"

crates/telemetry/src/lib.rs

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
use std::cell::Cell;
12
use std::io::IsTerminal;
3+
use std::time::Duration;
4+
use std::time::Instant;
25

36
use env::otel_logs_enabled;
47
use env::otel_metrics_enabled;
@@ -98,12 +101,34 @@ pub fn init(spin_version: String) -> anyhow::Result<ShutdownGuard> {
98101
}
99102

100103
fn otel_error_handler(err: opentelemetry::global::Error) {
104+
// Track the error count
105+
let signal = match err {
106+
opentelemetry::global::Error::Metric(_) => "metrics",
107+
opentelemetry::global::Error::Trace(_) => "traces",
108+
opentelemetry::global::Error::Log(_) => "logs",
109+
_ => "unknown",
110+
};
111+
metrics::monotonic_counter!(spin.otel_error_count = 1, signal = signal);
112+
113+
// Only the first error is logged at ERROR level; subsequent errors are logged at WARN level (rate limited) or at TRACE level
101114
static FIRST_OTEL_ERROR: std::sync::Once = std::sync::Once::new();
102115
FIRST_OTEL_ERROR.call_once(|| {
103-
tracing::error!("There has been an error with the OpenTelemetry system, traces and metrics are likely failing to export");
104-
tracing::error!("Further OpenTelemetry errors will be logged at DEBUG level")
116+
tracing::error!(?err, "OpenTelemetry error");
117+
tracing::error!("There has been an error with the OpenTelemetry system. Traces, logs or metrics are likely failing to export.");
118+
tracing::error!("Further OpenTelemetry errors will be available at WARN level (rate limited) or at TRACE level.");
105119
});
106-
tracing::debug!(?err, "OpenTelemetry error");
120+
121+
// Rate limit WARN-level logging of OTel errors to at most once per OTEL_ERROR_INTERVAL on each thread; errors in between are logged at TRACE level
122+
const OTEL_ERROR_INTERVAL: Duration = Duration::from_millis(5000);
123+
thread_local! {
124+
static LAST_OTEL_ERROR: Cell<Instant> = Cell::new(Instant::now() - OTEL_ERROR_INTERVAL);
125+
}
126+
if LAST_OTEL_ERROR.get().elapsed() > OTEL_ERROR_INTERVAL {
127+
LAST_OTEL_ERROR.set(Instant::now());
128+
tracing::warn!(?err, "OpenTelemetry error");
129+
} else {
130+
tracing::trace!(?err, "OpenTelemetry error");
131+
}
107132
}
108133

109134
/// An RAII implementation for connection to open telemetry services.

0 commit comments

Comments
 (0)