1313# limitations under the License.
1414
1515from dataclasses import dataclass , field
16+ from threading import RLock
1617from typing import Dict , List , Optional
1718from uuid import UUID
1819
20+ from cachetools import TTLCache
1921from opentelemetry .semconv ._incubating .attributes import (
2022 gen_ai_attributes as GenAI ,
2123)
@@ -40,10 +42,11 @@ def __init__(
4042 tracer : Tracer ,
4143 ) -> None :
4244 self ._tracer = tracer
45+ self ._lock = RLock ()
4346
4447 # Map from run_id -> _SpanState, to keep track of spans and parent/child relationships
45- # TODO: Use weak references or a TTL cache to avoid memory leaks in long-running processes. See #3735
46- self .spans : Dict [UUID , _SpanState ] = {}
48+ # Using a TTL cache to avoid memory leaks in long-running processes where end_span might not be called.
49+ self .spans : TTLCache [UUID , _SpanState ] = TTLCache ( maxsize = 1024 , ttl = 3600 )
4750
4851 def _create_span (
4952 self ,
@@ -52,23 +55,24 @@ def _create_span(
5255 span_name : str ,
5356 kind : SpanKind = SpanKind .INTERNAL ,
5457 ) -> Span :
55- if parent_run_id is not None and parent_run_id in self .spans :
56- parent_state = self .spans [parent_run_id ]
57- parent_span = parent_state .span
58- ctx = set_span_in_context (parent_span )
59- span = self ._tracer .start_span (
60- name = span_name , kind = kind , context = ctx
61- )
62- parent_state .children .append (run_id )
63- else :
64- # top-level or missing parent
65- span = self ._tracer .start_span (name = span_name , kind = kind )
66- set_span_in_context (span )
67-
68- span_state = _SpanState (span = span )
69- self .spans [run_id ] = span_state
70-
71- return span
58+ with self ._lock :
59+ if parent_run_id is not None and parent_run_id in self .spans :
60+ parent_state = self .spans [parent_run_id ]
61+ parent_span = parent_state .span
62+ ctx = set_span_in_context (parent_span )
63+ span = self ._tracer .start_span (
64+ name = span_name , kind = kind , context = ctx
65+ )
66+ parent_state .children .append (run_id )
67+ else :
68+ # top-level or missing parent
69+ span = self ._tracer .start_span (name = span_name , kind = kind )
70+ set_span_in_context (span )
71+
72+ span_state = _SpanState (span = span )
73+ self .spans [run_id ] = span_state
74+
75+ return span
7276
7377 def create_chat_span (
7478 self ,
@@ -92,18 +96,25 @@ def create_chat_span(
9296 return span
9397
def end_span(self, run_id: UUID) -> None:
    """End the span registered under ``run_id`` together with all its descendants.

    Every span reachable through the ``children`` lists is ended before its
    parent, and each corresponding entry is removed from ``self.spans``.
    Unknown ``run_id`` values (never registered, or already dropped from the
    cache) are ignored.

    Args:
        run_id: Identifier of the run whose span should be ended.
    """
    with self._lock:
        root_state = self.spans.pop(run_id, None)
        if root_state is None:
            # Nothing tracked under this id (e.g. the entry already expired).
            return

        # Walk the subtree parent-first, removing each entry as it is
        # visited. Popping up front means an id can never be processed
        # twice, so no span is ended twice and the walk always terminates.
        ordered = [root_state]
        pending = list(root_state.children)
        while pending:
            child_state = self.spans.pop(pending.pop(), None)
            if child_state is None:
                # Child already ended on its own or dropped from the cache.
                continue
            ordered.append(child_state)
            pending.extend(child_state.children)

        # Parents were recorded before their children, so the reversed
        # order ends every descendant before its ancestor.
        for state in reversed(ordered):
            state.span.end()
103113
def get_span(self, run_id: UUID) -> Optional[Span]:
    """Look up the span currently tracked for ``run_id``.

    Args:
        run_id: Identifier of the run to look up.

    Returns:
        The span stored for ``run_id``, or ``None`` when no entry is found
        in ``self.spans``.
    """
    with self._lock:
        entry = self.spans.get(run_id)
        if entry:
            return entry.span
        return None
107118
108119 def handle_error (self , error : BaseException , run_id : UUID ):
109120 span = self .get_span (run_id )
0 commit comments