Skip to content

Commit de4b1f2

Browse files
BrynCookebryn
andauthored
Rate limit otel errors (#2954)
When a batch span exporter is unable to accept a span because the buffer is full it will emit an error. These errors can be very frequent and could potentially impact performance. Otel errors are now rate limited to one every ten seconds per error type. Fixes #2953 <!-- start metadata --> **Checklist** Complete the checklist (and note appropriate exceptions) before a final PR is raised. - [ ] Changes are compatible[^1] - [ ] Documentation[^2] completed - [ ] Performance impact assessed and acceptable - Tests added and passing[^3] - [x] Unit Tests - [ ] Integration Tests - [ ] Manual Tests **Exceptions** *Note any exceptions here* **Notes** [^1]. It may be appropriate to bring upcoming changes to the attention of other (impacted) groups. Please endeavour to do this before seeking PR approval. The mechanism for doing this will vary considerably, so use your judgement as to how and when to do this. [^2]. Configuration is an important part of many changes. Where applicable please try to document configuration examples. [^3]. Tick whichever testing boxes are applicable. If you are adding Manual Tests: - please document the manual testing (extensively) in the Exceptions. - please raise a separate issue to automate the test and label it (or ask for it to be labeled) as `manual test` --------- Co-authored-by: bryn <[email protected]>
1 parent 1e6bd0d commit de4b1f2

File tree

2 files changed

+141
-12
lines changed

2 files changed

+141
-12
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
### Rate limit errors emitted from open telemetry ([Issue #2953](https://github.com/apollographql/router/issues/2953))
2+
3+
When a batch span exporter is unable to accept a span because the buffer is full, it will emit an error.
4+
These errors can be very frequent and could potentially impact performance.
5+
6+
Otel errors are now rate limited to one every ten seconds per error type.
7+
8+
By [@bryncooke](https://github.com/bryncooke) in https://github.com/apollographql/router/pull/2954

apollo-router/src/plugins/telemetry/mod.rs

Lines changed: 133 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use ::tracing::field;
1212
use ::tracing::info_span;
1313
use ::tracing::Span;
1414
use axum::headers::HeaderName;
15+
use dashmap::DashMap;
1516
use futures::future::ready;
1617
use futures::future::BoxFuture;
1718
use futures::stream::once;
@@ -22,6 +23,7 @@ use http::HeaderMap;
2223
use http::HeaderValue;
2324
use http::StatusCode;
2425
use multimap::MultiMap;
26+
use once_cell::sync::OnceCell;
2527
use opentelemetry::propagation::text_map_propagator::FieldIter;
2628
use opentelemetry::propagation::Extractor;
2729
use opentelemetry::propagation::Injector;
@@ -1366,19 +1368,56 @@ fn convert(
13661368
}
13671369
}
13681370

1371+
#[derive(Eq, PartialEq, Hash)]
1372+
enum ErrorType {
1373+
Trace,
1374+
Metric,
1375+
Other,
1376+
}
1377+
static OTEL_ERROR_LAST_LOGGED: OnceCell<DashMap<ErrorType, Instant>> = OnceCell::new();
1378+
13691379
fn handle_error<T: Into<opentelemetry::global::Error>>(err: T) {
1370-
match err.into() {
1371-
opentelemetry::global::Error::Trace(err) => {
1372-
::tracing::error!("OpenTelemetry trace error occurred: {}", err)
1373-
}
1374-
opentelemetry::global::Error::Metric(err_msg) => {
1375-
::tracing::error!("OpenTelemetry metric error occurred: {}", err_msg)
1376-
}
1377-
opentelemetry::global::Error::Other(err_msg) => {
1378-
::tracing::error!("OpenTelemetry error occurred: {}", err_msg)
1379-
}
1380-
other => {
1381-
::tracing::error!("OpenTelemetry error occurred: {:?}", other)
1380+
// We have to rate limit these errors because when they happen they are very frequent.
1381+
// Use a dashmap to store the message type with the last time it was logged.
1382+
let last_logged_map = OTEL_ERROR_LAST_LOGGED.get_or_init(DashMap::new);
1383+
let err = err.into();
1384+
1385+
// We don't want the dashmap to get big, so we key the error messages by type.
1386+
let error_type = match err {
1387+
opentelemetry::global::Error::Trace(_) => ErrorType::Trace,
1388+
opentelemetry::global::Error::Metric(_) => ErrorType::Metric,
1389+
_ => ErrorType::Other,
1390+
};
1391+
#[cfg(not(test))]
1392+
let threshold = Duration::from_secs(10);
1393+
#[cfg(test)]
1394+
let threshold = Duration::from_millis(100);
1395+
1396+
// Copy here so that we don't retain a mutable reference into the dashmap and lock the shard
1397+
let now = Instant::now();
1398+
let last_logged = *last_logged_map
1399+
.entry(error_type)
1400+
.and_modify(|last_logged| {
1401+
if last_logged.elapsed() > threshold {
1402+
*last_logged = now;
1403+
}
1404+
})
1405+
.or_insert_with(|| now);
1406+
1407+
if last_logged == now {
1408+
match err {
1409+
opentelemetry::global::Error::Trace(err) => {
1410+
::tracing::error!("OpenTelemetry trace error occurred: {}", err)
1411+
}
1412+
opentelemetry::global::Error::Metric(err) => {
1413+
::tracing::error!("OpenTelemetry metric error occurred: {}", err)
1414+
}
1415+
opentelemetry::global::Error::Other(err) => {
1416+
::tracing::error!("OpenTelemetry error occurred: {}", err)
1417+
}
1418+
other => {
1419+
::tracing::error!("OpenTelemetry error occurred: {:?}", other)
1420+
}
13821421
}
13831422
}
13841423
}
@@ -1491,7 +1530,12 @@ impl TextMapPropagator for CustomTraceIdPropagator {
14911530
//
14921531
#[cfg(test)]
14931532
mod tests {
1533+
use std::fmt::Debug;
1534+
use std::ops::DerefMut;
14941535
use std::str::FromStr;
1536+
use std::sync::Arc;
1537+
use std::sync::Mutex;
1538+
use std::time::Duration;
14951539

14961540
use axum::headers::HeaderName;
14971541
use http::HeaderMap;
@@ -1505,6 +1549,14 @@ mod tests {
15051549
use tower::util::BoxService;
15061550
use tower::Service;
15071551
use tower::ServiceExt;
1552+
use tracing_core::field::Visit;
1553+
use tracing_core::Event;
1554+
use tracing_core::Field;
1555+
use tracing_core::Subscriber;
1556+
use tracing_futures::WithSubscriber;
1557+
use tracing_subscriber::layer::Context;
1558+
use tracing_subscriber::layer::SubscriberExt;
1559+
use tracing_subscriber::Layer;
15081560

15091561
use super::apollo::ForwardHeaders;
15101562
use crate::error::FetchError;
@@ -1515,6 +1567,7 @@ mod tests {
15151567
use crate::plugin::test::MockSubgraphService;
15161568
use crate::plugin::test::MockSupergraphService;
15171569
use crate::plugin::DynPlugin;
1570+
use crate::plugins::telemetry::handle_error;
15181571
use crate::services::SubgraphRequest;
15191572
use crate::services::SubgraphResponse;
15201573
use crate::services::SupergraphRequest;
@@ -1996,4 +2049,72 @@ mod tests {
19962049
let filtered_headers = super::filter_headers(&headers, &ForwardHeaders::None);
19972050
assert_eq!(filtered_headers.as_str(), "{}");
19982051
}
2052+
2053+
#[tokio::test]
2054+
async fn test_handle_error_throttling() {
2055+
// Set up a fake subscriber so we can check log events. If this is useful then maybe it can be factored out into something reusable
2056+
#[derive(Default)]
2057+
struct TestVisitor {
2058+
log_entries: Vec<String>,
2059+
}
2060+
2061+
#[derive(Default, Clone)]
2062+
struct TestLayer {
2063+
visitor: Arc<Mutex<TestVisitor>>,
2064+
}
2065+
impl TestLayer {
2066+
fn assert_log_entry_count(&self, message: &str, expected: usize) {
2067+
let log_entries = self.visitor.lock().unwrap().log_entries.clone();
2068+
let actual = log_entries.iter().filter(|e| e.contains(message)).count();
2069+
assert_eq!(actual, expected);
2070+
}
2071+
}
2072+
impl Visit for TestVisitor {
2073+
fn record_debug(&mut self, field: &Field, value: &dyn Debug) {
2074+
self.log_entries
2075+
.push(format!("{}={:?}", field.name(), value));
2076+
}
2077+
}
2078+
2079+
impl<S> Layer<S> for TestLayer
2080+
where
2081+
S: Subscriber,
2082+
Self: 'static,
2083+
{
2084+
fn on_event(&self, event: &Event<'_>, _ctx: Context<'_, S>) {
2085+
event.record(self.visitor.lock().unwrap().deref_mut())
2086+
}
2087+
}
2088+
2089+
let test_layer = TestLayer::default();
2090+
2091+
async {
2092+
// Log twice rapidly, they should get deduped
2093+
handle_error(opentelemetry::global::Error::Other(
2094+
"other error".to_string(),
2095+
));
2096+
handle_error(opentelemetry::global::Error::Other(
2097+
"other error".to_string(),
2098+
));
2099+
handle_error(opentelemetry::global::Error::Trace(
2100+
"trace error".to_string().into(),
2101+
));
2102+
}
2103+
.with_subscriber(tracing_subscriber::registry().with(test_layer.clone()))
2104+
.await;
2105+
2106+
test_layer.assert_log_entry_count("other error", 1);
2107+
test_layer.assert_log_entry_count("trace error", 1);
2108+
2109+
// Sleep a bit and then log again, it should get logged
2110+
tokio::time::sleep(Duration::from_millis(200)).await;
2111+
async {
2112+
handle_error(opentelemetry::global::Error::Other(
2113+
"other error".to_string(),
2114+
));
2115+
}
2116+
.with_subscriber(tracing_subscriber::registry().with(test_layer.clone()))
2117+
.await;
2118+
test_layer.assert_log_entry_count("other error", 2);
2119+
}
19992120
}

0 commit comments

Comments
 (0)