Skip to content

Commit 7bc8489

Browse files
feat: More granular instrumentation and remove long running spans (#811)
* chore(stackable-telemetry): use default sampler and id generators Defaults are: ``` sampler: Box::new(Sampler::ParentBased(Box::new(Sampler::AlwaysOn))), id_generator: Box::<RandomIdGenerator>::default(), ``` * chore: remove instrumentation from trivial methods, set events to trace level * chore: remove long running spans * fix: carry over otel context (incl parent span) between low level tls handling, and high level route handling This fixes the trace hierarchy which used to be disjoint because the parent span/trace id was missing and a new one was being generated in the axum layers * feat: granular instrumentation of futures * chore: add reminder about naming of special layer * chore: tidy up tracing events * chore: update changelogs * chore(stackable-telemetry): remove commented span attributes * chore(stackable-telemetry): remove seemingly unnecessary context guard * fix(stackable-telemetry): cast numeric span attributes as i64 * chore(stackable-telemetry): add missing span attributes for client * chore(stackable-telemetry): add comment explaining that we need to pass tracing context * chore(stackable-telemetry): demote event levels for low level errors that can easily be triggered by clients * chore(stackable-telemetry): upgrade opentelemetry crates * fix(stackable-telemetry): only set parent context if the context is different. Add links between traces * chore(stackable-telemetry): add important developer notes * chore(stackable-telemetry): emit a tracing event when the otlp providers are about to shut down * chore(stackable-telemetry): rename event fields because the providers are defined in opentelemetry, not opentelemetry_otlp. Even though we happen to be using otlp providers. * chore(stackable-telemetry): improve field names in events * fix(stackable-telemetry): only shutdown the global TracerProvider if we set it * chore(stackable-operator): update changelog * chore(stackable-telemetry): update changelog * Apply suggestions from code review Co-authored-by: Techassi <[email protected]> * Apply suggestions from code review Co-authored-by: Techassi <[email protected]> * chore(stackable-telemetry): remove indentation on otherwise empty lines * chore(stackable-telemetry): add an important doc-comment note to the Tracing::init method --------- Co-authored-by: Techassi <[email protected]>
1 parent d0ab773 commit 7bc8489

File tree

12 files changed

+309
-424
lines changed

12 files changed

+309
-424
lines changed

Cargo.lock

Lines changed: 102 additions & 343 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ k8s-openapi = { version = "0.22.0", default-features = false, features = ["schem
3232
# We use rustls instead of openssl for easier portablitly, e.g. so that we can build stackablectl without the need to vendor (build from source) openssl
3333
kube = { version = "0.92.0", default-features = false, features = ["client", "jsonpatch", "runtime", "derive", "rustls-tls"] }
3434
lazy_static = "1.4.0"
35-
opentelemetry = "0.22.0"
36-
opentelemetry_sdk = { version = "0.22.1", features = ["rt-tokio"] }
37-
opentelemetry-appender-tracing = "0.3.0"
38-
opentelemetry-jaeger = { version = "0.21.0", features = ["rt-tokio"] }
39-
opentelemetry-otlp = "0.15.0"
40-
opentelemetry-semantic-conventions = "0.14.0"
35+
opentelemetry = "0.23.0"
36+
opentelemetry_sdk = { version = "0.23.0", features = ["rt-tokio"] }
37+
opentelemetry-appender-tracing = "0.4.0"
38+
opentelemetry-jaeger = { version = "0.22.0", features = ["rt-tokio"] }
39+
opentelemetry-otlp = "0.16.0"
40+
opentelemetry-semantic-conventions = "0.15.0"
4141
p256 = { version = "0.13.2", features = ["ecdsa"] }
4242
pin-project = "1.1.5"
4343
proc-macro2 = "1.0.83"
@@ -67,7 +67,7 @@ tokio-test = "0.4.4"
6767
tower = "0.4.13"
6868
tower-http = { version = "0.5.2", features = ["trace"] }
6969
tracing = "0.1.40"
70-
tracing-opentelemetry = "0.23.0"
70+
tracing-opentelemetry = "0.24.0"
7171
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
7272
url = { version = "2.5.0", features = ["serde"] }
7373
x509-cert = { version = "0.2.5", features = ["builder"] }

crates/stackable-operator/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,14 @@ All notable changes to this project will be documented in this file.
1212
- [Blog Post - Breaking Change](https://kube.rs/blog/2024/06/11/watcher-memory-improvements/#breaking-change)
1313
- [kube#1494](https://github.com/kube-rs/kube/pull/1494)
1414
- [kube#1504](https://github.com/kube-rs/kube/pull/1504)
15+
- Upgrade opentelemetry crates ([#811]).
1516

1617
### Fixed
1718

1819
- Product image selection pull request version override now only applies to pull requests ([#812]).
1920

2021
[#804]: https://github.com/stackabletech/operator-rs/pull/804
22+
[#811]: https://github.com/stackabletech/operator-rs/pull/811
2123
[#812]: https://github.com/stackabletech/operator-rs/pull/812
2224

2325
## [0.69.3] - 2024-06-12

crates/stackable-telemetry/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,12 @@ All notable changes to this project will be documented in this file.
77
### Changed
88

99
- Add support for setting the environment variable for each configured tracing subscriber ([#801]).
10+
- Use OpenTelemetry Context in Axum instrumentation layer, adjust log and span level, simplify trace config ([#811]).
11+
- tracing: Upgrade opentelemetry crates, simplify trace config, fix shutdown conditions, use new way to shutdown LoggerProvider.
12+
- instrumentation/axum: demote event severity for errors easily caused by clients, replace parent span context if given in http header and link to previous trace contexts.
1013

1114
[#801]: https://github.com/stackabletech/operator-rs/pull/801
15+
[#811]: https://github.com/stackabletech/operator-rs/pull/811
1216

1317
## [0.1.0] - 2024-05-08
1418

crates/stackable-telemetry/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ opentelemetry = { workspace = true, features = ["logs"] }
1313
opentelemetry-appender-tracing.workspace = true
1414
opentelemetry-otlp = { workspace = true, features = ["gzip-tonic", "logs"] }
1515
opentelemetry-semantic-conventions.workspace = true
16-
opentelemetry_sdk = { workspace = true, features = ["logs", "rt-tokio"] }
16+
opentelemetry_sdk = { workspace = true, features = ["logs", "rt-tokio", "logs_level_enabled"] }
1717
pin-project.workspace = true
1818
snafu.workspace = true
1919
tokio.workspace = true

crates/stackable-telemetry/src/instrumentation/axum/mod.rs

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@ use axum::{
2121
response::Response,
2222
};
2323
use futures_util::ready;
24-
use opentelemetry::trace::SpanKind;
24+
use opentelemetry::{
25+
trace::{SpanKind, TraceContextExt},
26+
Context,
27+
};
2528
use pin_project::pin_project;
2629
use snafu::{ResultExt, Snafu};
2730
use tower::{Layer, Service};
28-
use tracing::{debug, field::Empty, instrument, trace_span, Span};
31+
use tracing::{field::Empty, Span};
2932
use tracing_opentelemetry::OpenTelemetrySpanExt;
3033

3134
mod extractor;
@@ -91,7 +94,6 @@ impl<S> Layer<S> for TraceLayer {
9194

9295
impl TraceLayer {
9396
/// Creates a new default trace layer.
94-
#[instrument(name = "create_trace_layer")]
9597
pub fn new() -> Self {
9698
Self::default()
9799
}
@@ -367,7 +369,7 @@ impl SpanExt for Span {
367369
let span_name = req.span_name();
368370
let url = req.uri();
369371

370-
debug!(
372+
tracing::trace!(
371373
http_method,
372374
span_name,
373375
?url,
@@ -396,8 +398,7 @@ impl SpanExt for Span {
396398
// Setting common fields first
397399
// See https://opentelemetry.io/docs/specs/semconv/http/http-spans/#common-attributes
398400

399-
debug!("create http span");
400-
let span = trace_span!(
401+
let span = tracing::trace_span!(
401402
"HTTP request",
402403
"otel.name" = span_name,
403404
"otel.kind" = ?SpanKind::Server,
@@ -418,9 +419,44 @@ impl SpanExt for Span {
418419
);
419420

420421
// Set the parent span based on the extracted context
421-
debug!("set parent span based on extracted context");
422-
let parent = HeaderExtractor::new(req.headers()).extract_context();
423-
span.set_parent(parent);
422+
//
423+
// The OpenTelemetry spec does not allow trace ids to be updated after
424+
// a span has been created. Since the (optional) new trace id given by
425+
// a client is only knowable after handling the request, it is not
426+
// available to the existing parent spans for the lower layers (tcp/tls
427+
// handling).
428+
//
429+
// Therefore, we have to made a decision about linking the two traces.
430+
// These are the options:
431+
// 1. Link to the trace id supplied in the incoming request, or
432+
// 2. Link to the current trace id, then set the parent contex based on
433+
// trace information supplied in the incoming request.
434+
//
435+
// Neither is ideal, as it means there are (at least) two traces to look
436+
// at to get a complete picture of what happened over the request.
437+
//
438+
// Option 1 is not viable, as the trace id in the response headers will
439+
// not be the same as what the client sent. Yet we are supposed to pass
440+
// their trace id in any further requests.
441+
//
442+
// We will go with option 2 as it at least keeps the higher layer spans
443+
// in one trace, which is likely going to be more useful to the person
444+
// visualising the traces.
445+
let new_parent = HeaderExtractor::new(req.headers()).extract_context();
446+
let new_span_context = new_parent.span().span_context().clone();
447+
let current_span_context = Context::current().span().span_context().clone();
448+
449+
if new_span_context != current_span_context {
450+
tracing::trace!(
451+
opentelemetry.trace_id.from = ?current_span_context.trace_id(),
452+
opentelemetry.trace_id.to = ?new_span_context.trace_id(),
453+
"set parent span context based on context extracted from request headers"
454+
);
455+
456+
Span::current().add_link(new_parent.span().span_context().clone());
457+
span.add_link(Context::current().span().span_context().to_owned());
458+
span.set_parent(new_parent);
459+
}
424460

425461
if let Some(user_agent) = req.user_agent() {
426462
span.record("user_agent.original", user_agent);

crates/stackable-telemetry/src/tracing.rs

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,9 @@
99
use opentelemetry::KeyValue;
1010
use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge;
1111
use opentelemetry_sdk::{
12-
logs,
12+
logs::{self, LoggerProvider},
1313
propagation::TraceContextPropagator,
14-
trace::{self, RandomIdGenerator, Sampler},
15-
Resource,
14+
trace, Resource,
1615
};
1716
use opentelemetry_semantic_conventions::resource;
1817
use snafu::{ResultExt as _, Snafu};
@@ -46,6 +45,8 @@ pub enum Error {
4645
///
4746
/// #[tokio::main]
4847
/// async fn main() -> Result<(), Error> {
48+
/// // IMPORTANT: Name the guard variable appropriately, do not just use
49+
/// // `let _ =`, as that will drop immediately.
4950
/// let _tracing_guard = Tracing::builder()
5051
/// .service_name("test")
5152
/// .with_console_output("TEST_CONSOLE", LevelFilter::INFO)
@@ -125,6 +126,7 @@ pub struct Tracing {
125126
console_log_config: SubscriberConfig,
126127
otlp_log_config: SubscriberConfig,
127128
otlp_trace_config: SubscriberConfig,
129+
logger_provider: Option<LoggerProvider>,
128130
}
129131

130132
impl Tracing {
@@ -134,7 +136,10 @@ impl Tracing {
134136

135137
/// Initialise the configured tracing subscribers, returning a guard that
136138
/// will shutdown the subscribers when dropped.
137-
pub fn init(self) -> Result<Tracing> {
139+
///
140+
/// IMPORTANT: Name the guard variable appropriately, do not just use
141+
/// `let _ =`, as that will drop immediately.
142+
pub fn init(mut self) -> Result<Tracing> {
138143
let mut layers: Vec<Box<dyn Layer<Registry> + Sync + Send>> = Vec::new();
139144

140145
if self.console_log_config.enabled {
@@ -168,10 +173,11 @@ impl Tracing {
168173

169174
// Convert `tracing::Event` to OpenTelemetry logs
170175
layers.push(
171-
OpenTelemetryTracingBridge::new(otel_log.provider())
176+
OpenTelemetryTracingBridge::new(&otel_log)
172177
.with_filter(env_filter_layer)
173178
.boxed(),
174179
);
180+
self.logger_provider = Some(otel_log);
175181
}
176182

177183
if self.otlp_trace_config.enabled {
@@ -186,15 +192,9 @@ impl Tracing {
186192
let otel_tracer = opentelemetry_otlp::new_pipeline()
187193
.tracing()
188194
.with_exporter(trace_exporter)
189-
.with_trace_config(
190-
trace::config()
191-
.with_sampler(Sampler::AlwaysOn) // TODO (@NickLarsenNZ): Make this configurable. See also Sampler::ParentBased
192-
.with_id_generator(RandomIdGenerator::default())
193-
.with_resource(Resource::new(vec![KeyValue::new(
194-
resource::SERVICE_NAME,
195-
self.service_name,
196-
)])),
197-
)
195+
.with_trace_config(trace::config().with_resource(Resource::new(vec![
196+
KeyValue::new(resource::SERVICE_NAME, self.service_name),
197+
])))
198198
.install_batch(opentelemetry_sdk::runtime::Tokio)
199199
.context(InstallOtelTraceExporterSnafu)?;
200200

@@ -223,16 +223,31 @@ impl Tracing {
223223
.context(SetGlobalDefaultSubscriberSnafu)?;
224224
}
225225

226+
// IMPORTANT: we must return self, otherwise Drop will be called and uninitialise tracing
226227
Ok(self)
227228
}
228229
}
229230

230231
impl Drop for Tracing {
231232
fn drop(&mut self) {
232-
// NOTE (@NickLarsenNZ): These might eventually be replaced with something like SdkMeterProvider::shutdown(&self)
233-
// see: https://github.com/open-telemetry/opentelemetry-rust/pull/1412/files#r1409608679
234-
opentelemetry::global::shutdown_tracer_provider();
235-
opentelemetry::global::shutdown_logger_provider();
233+
tracing::debug!(
234+
opentelemetry.tracing.enabled = self.otlp_trace_config.enabled,
235+
opentelemetry.logger.enabled = self.otlp_log_config.enabled,
236+
"shutting down opentelemetry OTLP providers"
237+
);
238+
239+
if self.otlp_trace_config.enabled {
240+
// NOTE (@NickLarsenNZ): This might eventually be replaced with something like SdkMeterProvider::shutdown(&self)
241+
// as has been done with the LoggerProvider (further below)
242+
// see: https://github.com/open-telemetry/opentelemetry-rust/pull/1412/files#r1409608679
243+
opentelemetry::global::shutdown_tracer_provider();
244+
}
245+
246+
if let Some(logger_provider) = &self.logger_provider {
247+
if let Err(error) = logger_provider.shutdown() {
248+
tracing::error!(%error, "unable to shutdown LoggerProvider");
249+
}
250+
}
236251
}
237252
}
238253

@@ -417,6 +432,7 @@ impl TracingBuilder<builder_state::Config> {
417432
console_log_config: self.console_log_config,
418433
otlp_log_config: self.otlp_log_config,
419434
otlp_trace_config: self.otlp_trace_config,
435+
logger_provider: None,
420436
}
421437
}
422438
}

crates/stackable-webhook/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
44

55
## [Unreleased]
66

7+
## Changed
8+
9+
- Remove instrumentation of long running functions, add more granular instrumentation of futures. Adjust span and event levels ([#811]).
10+
711
### Fixed
812

913
- Fix the extraction of `ConnectInfo` (data about the connection client) and
@@ -12,6 +16,7 @@ All notable changes to this project will be documented in this file.
1216
compatible traces ([#806]).
1317

1418
[#806]: https://github.com/stackabletech/operator-rs/pull/806
19+
[#811]: https://github.com/stackabletech/operator-rs/pull/811
1520

1621
## [0.3.0] - 2024-05-08
1722

crates/stackable-webhook/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ hyper-util.workspace = true
1717
hyper.workspace = true
1818
k8s-openapi.workspace = true
1919
kube.workspace = true
20+
opentelemetry.workspace = true
2021
serde_json.workspace = true
2122
snafu.workspace = true
2223
tokio-rustls.workspace = true
2324
tokio.workspace = true
2425
tower-http.workspace = true
2526
tower.workspace = true
2627
tracing.workspace = true
28+
tracing-opentelemetry.workspace = true

crates/stackable-webhook/src/lib.rs

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ use stackable_telemetry::AxumTraceLayer;
3030
use tokio::signal::unix::{signal, SignalKind};
3131
use tower::ServiceBuilder;
3232
// use tower_http::trace::TraceLayer;
33-
use tracing::{debug, instrument};
3433

3534
use crate::tls::TlsServer;
3635

@@ -127,9 +126,8 @@ impl WebhookServer {
127126
/// let router = Router::new();
128127
/// let server = WebhookServer::new(router, options);
129128
/// ```
130-
#[instrument(name = "create_webhook_server", skip(router))]
131129
pub fn new(router: Router, options: Options) -> Self {
132-
debug!("create new webhook server");
130+
tracing::trace!("create new webhook server");
133131
Self { options, router }
134132
}
135133

@@ -142,17 +140,17 @@ impl WebhookServer {
142140
let mut sigint = signal(SignalKind::interrupt()).expect("create SIGINT listener");
143141
let mut sigterm = signal(SignalKind::terminate()).expect("create SIGTERM listener");
144142

145-
debug!("created unix signal handlers");
143+
tracing::debug!("created unix signal handlers");
146144

147145
select! {
148146
signal = sigint.recv().fuse() => {
149147
if signal.is_some() {
150-
debug!( "received SIGINT");
148+
tracing::debug!( "received SIGINT");
151149
}
152150
},
153151
signal = sigterm.recv().fuse() => {
154152
if signal.is_some() {
155-
debug!( "received SIGTERM");
153+
tracing::debug!( "received SIGTERM");
156154
}
157155
},
158156
};
@@ -169,36 +167,36 @@ impl WebhookServer {
169167

170168
/// Runs the webhook server by creating a TCP listener and binding it to
171169
/// the specified socket address.
172-
#[instrument(name = "run_webhook_server", skip(self), fields(self.options))]
173170
async fn run_server(self) -> Result<()> {
174-
debug!("run webhook server");
171+
tracing::debug!("run webhook server");
175172

176173
// TODO (@Techassi): Make opt-in configurable from the outside
177174
// Create an OpenTelemetry tracing layer
178-
debug!("create tracing service (layer)");
175+
tracing::trace!("create tracing service (layer)");
179176
let trace_layer = AxumTraceLayer::new().with_opt_in();
180177

181178
// Use a service builder to provide multiple layers at once. Recommended
182179
// by the Axum project.
183180
//
184181
// See https://docs.rs/axum/latest/axum/middleware/index.html#applying-multiple-middleware
182+
// TODO (@NickLarsenNZ): rename this server_builder and keep it specific to tracing, since it's placement in the chain is important
185183
let service_builder = ServiceBuilder::new().layer(trace_layer);
186184

187185
// Create the root router and merge the provided router into it.
188-
debug!("create core router and merge provided router");
186+
tracing::debug!("create core router and merge provided router");
189187
let router = self
190188
.router
191189
.layer(service_builder)
192190
// The health route is below the AxumTraceLayer so as not to be instrumented
193191
.route("/health", get(|| async { "ok" }));
194192

195193
// Create server for TLS termination
196-
debug!("create TLS server");
194+
tracing::debug!("create TLS server");
197195
let tls_server = TlsServer::new(self.options.socket_addr, router)
198196
.await
199197
.context(CreateTlsServerSnafu)?;
200198

201-
debug!("running TLS server");
199+
tracing::info!("running TLS server");
202200
tls_server.run().await.context(RunTlsServerSnafu)
203201
}
204202
}

0 commit comments

Comments
 (0)