diff --git a/opentelemetry-sdk/CHANGELOG.md b/opentelemetry-sdk/CHANGELOG.md index e5e4ca3295..845d21354b 100644 --- a/opentelemetry-sdk/CHANGELOG.md +++ b/opentelemetry-sdk/CHANGELOG.md @@ -5,8 +5,14 @@ - Added `Resource::get_ref(&self, key: &Key) -> Option<&Value>` to allow retrieving a reference to a resource value without cloning. - **Breaking** Removed the following public hidden methods from the `SdkTracer` [#3227][3227]: - `id_generator`, `should_sample` +- **Fix**: Fixed panic when `SpanProcessor::on_end` calls `Context::current()` ([#2871][2871]). + - Updated `SpanProcessor::on_end` documentation to clarify that `Context::current()` should not be relied upon in `on_end` + - `Context::current()` returns whatever context happens to be active, which may be unrelated to the span being ended + - Added best practice guidance: extract context information in `on_start` and store as span attributes + - Documented that the panic fix in `opentelemetry` allows safe calls to `Context::current()` from `on_end` [3227]: https://github.com/open-telemetry/opentelemetry-rust/pull/3227 +[3262]: https://github.com/open-telemetry/opentelemetry-rust/pull/3262 ## 0.31.0 diff --git a/opentelemetry-sdk/src/trace/mod.rs b/opentelemetry-sdk/src/trace/mod.rs index 585f65f27d..b49e9cd71a 100644 --- a/opentelemetry-sdk/src/trace/mod.rs +++ b/opentelemetry-sdk/src/trace/mod.rs @@ -137,10 +137,22 @@ mod tests { } } - fn on_end(&self, _span: SpanData) { - // TODO: Accessing Context::current() will panic today and hence commented out. + fn on_end(&self, span: SpanData) { + // Fixed: Context::current() no longer panics from Drop // See https://github.com/open-telemetry/opentelemetry-rust/issues/2871 - // let _c = Context::current(); + Context::current(); + + // IMPORTANT NOTE: The context returned here is whatever context happens + // to be current at this moment, which may be unrelated to the span being ended. + // The span's baggage was extracted in on_start and stored as attributes. + + // Verify: on_start stored the baggage as an attribute + assert!( + span.attributes + .iter() + .any(|kv| kv.key.as_str() == "bag-key"), + "Baggage should have been stored as span attribute in on_start" + ); } fn force_flush(&self) -> crate::error::OTelSdkResult { diff --git a/opentelemetry-sdk/src/trace/span_processor.rs b/opentelemetry-sdk/src/trace/span_processor.rs index 0ebe80f885..4e9162e8aa 100644 --- a/opentelemetry-sdk/src/trace/span_processor.rs +++ b/opentelemetry-sdk/src/trace/span_processor.rs @@ -79,9 +79,45 @@ pub trait SpanProcessor: Send + Sync + std::fmt::Debug { /// synchronously on the thread that started the span, therefore it should /// not block or throw exceptions. fn on_start(&self, span: &mut Span, cx: &Context); + /// `on_end` is called after a `Span` is ended (i.e., the end timestamp is /// already set). This method is called synchronously within the `Span::end` /// API, therefore it should not block or throw an exception. + /// + /// # Accessing Context + /// + /// **Important**: Do not rely on [`Context::current()`] in `on_end`. When `on_end` + /// is called during span cleanup, `Context::current()` returns whatever context + /// happens to be active at that moment, which is typically unrelated to the span + /// being ended. Contexts can be activated in any order and are not necessarily + /// hierarchical. + /// + /// **Best Practice**: Extract any needed context information in [`on_start`] + /// and store it as span attributes. This ensures the information is available + /// in the [`SpanData`] passed to `on_end`. + /// + /// # Example + /// + /// ```rust,ignore + /// impl SpanProcessor for MyProcessor { + /// fn on_start(&self, span: &mut Span, cx: &Context) { + /// // Extract baggage and store as span attribute + /// if let Some(value) = cx.baggage().get("my-key") { + /// span.set_attribute(KeyValue::new("my-key", value.to_string())); + /// } + /// } + /// + /// fn on_end(&self, span: SpanData) { + /// // Access the attribute stored in on_start + /// let my_value = span.attributes.iter() + /// .find(|kv| kv.key.as_str() == "my-key"); + /// } + /// } + /// ``` + /// + /// [`on_start`]: SpanProcessor::on_start + /// [`Context::current()`]: opentelemetry::Context::current + /// /// TODO - This method should take reference to `SpanData` fn on_end(&self, span: SpanData); /// Force the spans lying in the cache to be exported. diff --git a/opentelemetry/CHANGELOG.md b/opentelemetry/CHANGELOG.md index a3686a9169..6dbcc547a3 100644 --- a/opentelemetry/CHANGELOG.md +++ b/opentelemetry/CHANGELOG.md @@ -7,8 +7,13 @@ - `trace_id`, `span_id`, `end_time`, `status`, `sampling_result` - `with_trace_id`, `with_span_id`, `with_end_time`, `with_status`, `with_sampling_result` - **Added** `#[must_use]` attribute to `opentelemetry::metrics::AsyncInstrumentBuilder` to add compile time warning when `.build()` is not called on observable instrument builders, preventing silent failures where callbacks are never registered and metrics are never reported. +- **Fix**: Fixed panic when calling `Context::current()` from `Drop` implementations triggered by `ContextGuard` cleanup ([#2871][2871]). + - Modified `ContextGuard::drop` to extract only the span (not the entire context) outside of `borrow_mut()` scope before dropping it + - This prevents "RefCell already mutably borrowed" panics when Span's `Drop` implementation calls `Context::current()` during context cleanup + - **Important**: `Context::current()` called from within a Drop triggered by context cleanup returns whatever context happens to be current at that moment, which may be unrelated to the context being dropped [3227]: https://github.com/open-telemetry/opentelemetry-rust/pull/3227 +[2871]: https://github.com/open-telemetry/opentelemetry-rust/issues/2871 ## v0.31.0 diff --git a/opentelemetry/src/context.rs b/opentelemetry/src/context.rs index ecad94d8f2..a25fef450e 100644 --- a/opentelemetry/src/context.rs +++ b/opentelemetry/src/context.rs @@ -115,6 +115,22 @@ impl Context { /// Returns an immutable snapshot of the current thread's context. /// + /// # Behavior During Context Drop + /// + /// When called from within a [`Drop`] implementation that is triggered by + /// a [`ContextGuard`] being dropped (e.g., when a [`Span`] is dropped as part + /// of context cleanup), this function returns **whatever context happens to be + /// current after the guard is popped**, not the context being dropped. + /// + /// **Important**: The returned context may be completely unrelated to the + /// context being dropped, as contexts can be activated in any order and are + /// not necessarily hierarchical. Do not rely on any relationship between them. + /// + /// This behavior is by design and prevents panics that would otherwise occur + /// from attempting to borrow the context while it's being mutably borrowed + /// for cleanup. See [issue #2871](https://github.com/open-telemetry/opentelemetry-rust/issues/2871) + /// for details. + /// /// # Examples /// /// ``` @@ -130,6 +146,8 @@ impl Context { /// let _guard = Context::new().with_value(ValueA("a")).attach(); /// do_work() /// ``` + /// + /// [`Span`]: crate::trace::Span pub fn current() -> Self { Self::map_current(|cx| cx.clone()) } @@ -464,7 +482,14 @@ impl Drop for ContextGuard { fn drop(&mut self) { let id = self.cx_pos; if id > ContextStack::BASE_POS && id < ContextStack::MAX_POS { + // Extract only the span to drop outside of borrow_mut to avoid panic + // when the span's drop implementation calls Context::current() + #[cfg(feature = "trace")] + let _to_drop = + CURRENT_CONTEXT.with(|context_stack| context_stack.borrow_mut().pop_id(id)); + #[cfg(not(feature = "trace"))] CURRENT_CONTEXT.with(|context_stack| context_stack.borrow_mut().pop_id(id)); + // Span (if any) is automatically dropped here, outside of borrow_mut scope } } } @@ -515,6 +540,12 @@ struct ContextStack { _marker: PhantomData<*const ()>, } +// Type alias for what pop_id returns - only return the span when trace feature is enabled +#[cfg(feature = "trace")] +type PopIdReturn = Option>; +#[cfg(not(feature = "trace"))] +type PopIdReturn = (); + impl ContextStack { const BASE_POS: u16 = 0; const MAX_POS: u16 = u16::MAX; @@ -543,7 +574,7 @@ impl ContextStack { } #[inline(always)] - fn pop_id(&mut self, pos: u16) { + fn pop_id(&mut self, pos: u16) -> PopIdReturn { if pos == ContextStack::BASE_POS || pos == ContextStack::MAX_POS { // The empty context is always at the bottom of the [`ContextStack`] // and cannot be popped, and the overflow position is invalid, so do @@ -557,7 +588,8 @@ impl ContextStack { "Attempted to pop the overflow position which is not allowed" } ); - return; + #[cfg(feature = "trace")] + return None; } let len: u16 = self.stack.len() as u16; // Are we at the top of the [`ContextStack`]? @@ -570,8 +602,19 @@ impl ContextStack { // empty context is always at the bottom of the stack if the // [`ContextStack`] is not empty. if let Some(Some(next_cx)) = self.stack.pop() { - self.current_cx = next_cx; + // Extract and return only the span to avoid cloning the entire Context + #[cfg(feature = "trace")] + { + let old_cx = std::mem::replace(&mut self.current_cx, next_cx); + return old_cx.span; + } + #[cfg(not(feature = "trace"))] + { + self.current_cx = next_cx; + } } + #[cfg(feature = "trace")] + return None; } else { // This is an out of order pop. if pos >= len { @@ -582,10 +625,16 @@ impl ContextStack { stack_length = len, message = "Attempted to pop beyond the end of the context stack" ); - return; + #[cfg(feature = "trace")] + return None; + } + // Clear out the entry at the given id and extract its span + #[cfg(feature = "trace")] + return self.stack[pos as usize].take().and_then(|cx| cx.span); + #[cfg(not(feature = "trace"))] + { + self.stack[pos as usize] = None; } - // Clear out the entry at the given id. - _ = self.stack[pos as usize].take(); } }