Skip to content

Commit dbeceb8

Browse files
authored
fix(langchain): langgraph application crash due to context detach (#3256)
1 parent f706a9f commit dbeceb8

File tree

2 files changed

+88
-27
lines changed

2 files changed

+88
-27
lines changed

packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,13 @@ def __call__(
242242

243243
# In legacy chains like LLMChain, suppressing model instrumentations
244244
# within create_llm_span doesn't work, so this should help as a fallback
245-
context_api.attach(
246-
context_api.set_value(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, True)
247-
)
245+
try:
246+
context_api.attach(
247+
context_api.set_value(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, True)
248+
)
249+
except Exception:
250+
# If context setting fails, continue without suppression
251+
# This is not critical for core functionality
252+
pass
248253

249254
return wrapped(*args, **kwargs)

packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/callback_handler.py

Lines changed: 80 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -187,18 +187,57 @@ def _end_span(self, span: Span, run_id: UUID) -> None:
187187
span.end()
188188
token = self.spans[run_id].token
189189
if token:
190-
try:
191-
# Use the runtime context directly to avoid logging from context_api.detach()
192-
from opentelemetry.context import _RUNTIME_CONTEXT
193-
_RUNTIME_CONTEXT.detach(token)
194-
except (ValueError, RuntimeError, Exception):
195-
# Context detach can fail in async scenarios when tokens are created in different contexts
196-
# This includes ValueError, RuntimeError, and other context-related exceptions
197-
# This is expected behavior and doesn't affect the correct span hierarchy
198-
pass
190+
self._safe_detach_context(token)
199191

200192
del self.spans[run_id]
201193

194+
def _safe_attach_context(self, span: Span):
195+
"""
196+
Safely attach span to context, handling potential failures in async scenarios.
197+
198+
Returns the context token for later detachment, or None if attachment fails.
199+
"""
200+
try:
201+
return context_api.attach(set_span_in_context(span))
202+
except Exception:
203+
# Context attachment can fail in some edge cases, particularly in
204+
# complex async scenarios or when context is corrupted.
205+
# Return None to indicate no token needs to be detached later.
206+
return None
207+
208+
def _safe_detach_context(self, token):
209+
"""
210+
Safely detach context token without causing application crashes.
211+
212+
This method implements a fail-safe approach to context detachment that handles
213+
all known edge cases in async/concurrent scenarios where context tokens may
214+
become invalid or be detached in different execution contexts.
215+
216+
We use the runtime context directly to avoid logging errors from context_api.detach()
217+
"""
218+
if not token:
219+
return
220+
221+
try:
222+
# Use the runtime context directly to avoid error logging from context_api.detach()
223+
from opentelemetry.context import _RUNTIME_CONTEXT
224+
225+
_RUNTIME_CONTEXT.detach(token)
226+
except Exception:
227+
# Context detach can fail in async scenarios when tokens are created in different contexts
228+
# This includes ValueError, RuntimeError, and other context-related exceptions
229+
# This is expected behavior and doesn't affect the correct span hierarchy
230+
#
231+
# Common scenarios where this happens:
232+
# 1. Token created in one async task/thread, detached in another
233+
# 2. Context was already detached by another process
234+
# 3. Token became invalid due to context switching
235+
# 4. Race conditions in highly concurrent scenarios
236+
#
237+
# This is safe to ignore as the span itself was properly ended
238+
# and the tracing data is correctly captured.
239+
pass
240+
202241
def _create_span(
203242
self,
204243
run_id: UUID,
@@ -220,12 +259,17 @@ def _create_span(
220259
for k, v in metadata.items()
221260
if v is not None
222261
}
223-
context_api.attach(
224-
context_api.set_value(
225-
"association_properties",
226-
{**current_association_properties, **sanitized_metadata},
262+
try:
263+
context_api.attach(
264+
context_api.set_value(
265+
"association_properties",
266+
{**current_association_properties, **sanitized_metadata},
267+
)
227268
)
228-
)
269+
except Exception:
270+
# If setting association properties fails, continue without them
271+
# This doesn't affect the core span functionality
272+
pass
229273

230274
if parent_run_id is not None and parent_run_id in self.spans:
231275
span = self.tracer.start_span(
@@ -236,7 +280,7 @@ def _create_span(
236280
else:
237281
span = self.tracer.start_span(span_name, kind=kind)
238282

239-
token = context_api.attach(set_span_in_context(span))
283+
token = self._safe_attach_context(span)
240284

241285
_set_span_attribute(span, SpanAttributes.TRACELOOP_WORKFLOW_NAME, workflow_name)
242286
_set_span_attribute(span, SpanAttributes.TRACELOOP_ENTITY_PATH, entity_path)
@@ -317,9 +361,13 @@ def _create_llm_span(
317361

318362
# we already have an LLM span by this point,
319363
# so skip any downstream instrumentation from here
320-
token = context_api.attach(
321-
context_api.set_value(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, True)
322-
)
364+
try:
365+
token = context_api.attach(
366+
context_api.set_value(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, True)
367+
)
368+
except Exception:
369+
# If context setting fails, continue without suppression token
370+
token = None
323371

324372
self.spans[run_id] = SpanHolder(
325373
span, token, None, [], workflow_name, None, entity_path
@@ -411,11 +459,15 @@ def on_chain_end(
411459

412460
self._end_span(span, run_id)
413461
if parent_run_id is None:
414-
context_api.attach(
415-
context_api.set_value(
416-
SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, False
462+
try:
463+
context_api.attach(
464+
context_api.set_value(
465+
SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY, False
466+
)
417467
)
418-
)
468+
except Exception:
469+
# If context reset fails, it's not critical for functionality
470+
pass
419471

420472
@dont_throw
421473
def on_chat_model_start(
@@ -513,8 +565,12 @@ def on_llm_end(
513565
if model_name is None:
514566
model_name = extract_model_name_from_response_metadata(response)
515567
if model_name is None and hasattr(context_api, "get_value"):
516-
association_properties = context_api.get_value("association_properties") or {}
517-
model_name = _extract_model_name_from_association_metadata(association_properties)
568+
association_properties = (
569+
context_api.get_value("association_properties") or {}
570+
)
571+
model_name = _extract_model_name_from_association_metadata(
572+
association_properties
573+
)
518574
token_usage = (response.llm_output or {}).get("token_usage") or (
519575
response.llm_output or {}
520576
).get("usage")

0 commit comments

Comments
 (0)