@@ -260,6 +260,7 @@ def _accumulate_streaming_response(
260260 llm_request_type ,
261261 response ,
262262 streaming_time_to_first_token = None ,
263+ streaming_time_to_generate = None ,
263264 start_time = None
264265):
265266 if llm_request_type == LLMRequestTypeValues .CHAT :
@@ -268,10 +269,16 @@ def _accumulate_streaming_response(
268269 accumulated_response = {"response" : "" }
269270
270271 first_token = True
272+ first_token_time = None
273+ last_response = None
274+
271275 for res in response :
276+ last_response = res # Track the last response explicitly
277+
272278 if first_token and streaming_time_to_first_token and start_time is not None :
279+ first_token_time = time .perf_counter ()
273280 streaming_time_to_first_token .record (
274- time . perf_counter () - start_time ,
281+ first_token_time - start_time ,
275282 attributes = {SpanAttributes .LLM_SYSTEM : "Ollama" },
276283 )
277284 first_token = False
@@ -284,8 +291,24 @@ def _accumulate_streaming_response(
284291 text = res .get ("response" , "" )
285292 accumulated_response ["response" ] += text
286293
287- response_data = res .model_dump () if hasattr (res , 'model_dump' ) else res
288- _set_response_attributes (span , token_histogram , llm_request_type , response_data | accumulated_response )
294+ # Record streaming time to generate after the response is complete
295+ if streaming_time_to_generate and first_token_time is not None :
296+ model_name = last_response .get ("model" ) if last_response else None
297+ streaming_time_to_generate .record (
298+ time .perf_counter () - first_token_time ,
299+ attributes = {
300+ SpanAttributes .LLM_SYSTEM : "Ollama" ,
301+ SpanAttributes .LLM_RESPONSE_MODEL : model_name ,
302+ },
303+ )
304+
305+ response_data = (
306+ last_response .model_dump ()
307+ if last_response and hasattr (last_response , 'model_dump' )
308+ else last_response
309+ )
310+ if response_data :
311+ _set_response_attributes (span , token_histogram , llm_request_type , response_data | accumulated_response )
289312 span .end ()
290313
291314
@@ -295,6 +318,7 @@ async def _aaccumulate_streaming_response(
295318 llm_request_type ,
296319 response ,
297320 streaming_time_to_first_token = None ,
321+ streaming_time_to_generate = None ,
298322 start_time = None ,
299323):
300324 if llm_request_type == LLMRequestTypeValues .CHAT :
@@ -303,11 +327,16 @@ async def _aaccumulate_streaming_response(
303327 accumulated_response = {"response" : "" }
304328
305329 first_token = True
330+ first_token_time = None
331+ last_response = None
306332
307333 async for res in response :
334+ last_response = res
335+
308336 if first_token and streaming_time_to_first_token and start_time is not None :
337+ first_token_time = time .perf_counter ()
309338 streaming_time_to_first_token .record (
310- time . perf_counter () - start_time ,
339+ first_token_time - start_time ,
311340 attributes = {SpanAttributes .LLM_SYSTEM : "Ollama" },
312341 )
313342 first_token = False
@@ -320,21 +349,45 @@ async def _aaccumulate_streaming_response(
320349 text = res .get ("response" , "" )
321350 accumulated_response ["response" ] += text
322351
323- response_data = res .model_dump () if hasattr (res , 'model_dump' ) else res
324- _set_response_attributes (span , token_histogram , llm_request_type , response_data | accumulated_response )
352+ # Record streaming time to generate after the response is complete
353+ if streaming_time_to_generate and first_token_time is not None :
354+ model_name = last_response .get ("model" ) if last_response else None
355+ streaming_time_to_generate .record (
356+ time .perf_counter () - first_token_time ,
357+ attributes = {
358+ SpanAttributes .LLM_SYSTEM : "Ollama" ,
359+ SpanAttributes .LLM_RESPONSE_MODEL : model_name ,
360+ },
361+ )
362+
363+ response_data = (
364+ last_response .model_dump ()
365+ if last_response and hasattr (last_response , 'model_dump' )
366+ else last_response
367+ )
368+ if response_data :
369+ _set_response_attributes (span , token_histogram , llm_request_type , response_data | accumulated_response )
325370 span .end ()
326371
327372
328373def _with_tracer_wrapper (func ):
329374 """Helper for providing tracer for wrapper functions."""
330375
331- def _with_tracer (tracer , token_histogram , duration_histogram , streaming_time_to_first_token , to_wrap ):
376+ def _with_tracer (
377+ tracer ,
378+ token_histogram ,
379+ duration_histogram ,
380+ streaming_time_to_first_token ,
381+ streaming_time_to_generate ,
382+ to_wrap
383+ ):
332384 def wrapper (wrapped , instance , args , kwargs ):
333385 return func (
334386 tracer ,
335387 token_histogram ,
336388 duration_histogram ,
337389 streaming_time_to_first_token ,
390+ streaming_time_to_generate ,
338391 to_wrap ,
339392 wrapped ,
340393 instance ,
@@ -364,6 +417,7 @@ def _wrap(
364417 token_histogram : Histogram ,
365418 duration_histogram : Histogram ,
366419 streaming_time_to_first_token : Histogram ,
420+ streaming_time_to_generate : Histogram ,
367421 to_wrap ,
368422 wrapped ,
369423 instance ,
@@ -410,6 +464,7 @@ def _wrap(
410464 llm_request_type ,
411465 response ,
412466 streaming_time_to_first_token ,
467+ streaming_time_to_generate ,
413468 start_time ,
414469 )
415470
@@ -426,6 +481,7 @@ async def _awrap(
426481 token_histogram : Histogram ,
427482 duration_histogram : Histogram ,
428483 streaming_time_to_first_token : Histogram ,
484+ streaming_time_to_generate : Histogram ,
429485 to_wrap ,
430486 wrapped ,
431487 instance ,
@@ -472,6 +528,7 @@ async def _awrap(
472528 llm_request_type ,
473529 response ,
474530 streaming_time_to_first_token ,
531+ streaming_time_to_generate ,
475532 start_time ,
476533 )
477534
@@ -501,7 +558,13 @@ def _build_metrics(meter: Meter):
501558 description = "Time to first token in streaming chat completions" ,
502559 )
503560
504- return token_histogram , duration_histogram , streaming_time_to_first_token
561+ streaming_time_to_generate = meter .create_histogram (
562+ name = Meters .LLM_STREAMING_TIME_TO_GENERATE ,
563+ unit = "s" ,
564+ description = "Time from first token to completion in streaming responses" ,
565+ )
566+
567+ return token_histogram , duration_histogram , streaming_time_to_first_token , streaming_time_to_generate
505568
506569
507570def is_metrics_collection_enabled () -> bool :
@@ -530,13 +593,15 @@ def _instrument(self, **kwargs):
530593 token_histogram ,
531594 duration_histogram ,
532595 streaming_time_to_first_token ,
596+ streaming_time_to_generate ,
533597 ) = _build_metrics (meter )
534598 else :
535599 (
536600 token_histogram ,
537601 duration_histogram ,
538602 streaming_time_to_first_token ,
539- ) = (None , None , None )
603+ streaming_time_to_generate ,
604+ ) = (None , None , None , None )
540605
541606 # Patch _copy_messages to sanitize tool_calls arguments before Pydantic validation
542607 wrap_function_wrapper (
@@ -548,12 +613,24 @@ def _instrument(self, **kwargs):
548613 wrap_function_wrapper (
549614 "ollama._client" ,
550615 "Client._request" ,
551- _dispatch_wrap (tracer , token_histogram , duration_histogram , streaming_time_to_first_token ),
616+ _dispatch_wrap (
617+ tracer ,
618+ token_histogram ,
619+ duration_histogram ,
620+ streaming_time_to_first_token ,
621+ streaming_time_to_generate
622+ ),
552623 )
553624 wrap_function_wrapper (
554625 "ollama._client" ,
555626 "AsyncClient._request" ,
556- _dispatch_awrap (tracer , token_histogram , duration_histogram , streaming_time_to_first_token ),
627+ _dispatch_awrap (
628+ tracer ,
629+ token_histogram ,
630+ duration_histogram ,
631+ streaming_time_to_first_token ,
632+ streaming_time_to_generate
633+ ),
557634 )
558635
559636 def _uninstrument (self , ** kwargs ):
@@ -572,30 +649,56 @@ def _uninstrument(self, **kwargs):
572649 )
573650
574651
575- def _dispatch_wrap (tracer , token_histogram , duration_histogram , streaming_time_to_first_token ):
652+ def _dispatch_wrap (
653+ tracer ,
654+ token_histogram ,
655+ duration_histogram ,
656+ streaming_time_to_first_token ,
657+ streaming_time_to_generate
658+ ):
576659 def wrapper (wrapped , instance , args , kwargs ):
577660 to_wrap = None
578661 if len (args ) > 2 and isinstance (args [2 ], str ):
579662 path = args [2 ]
580663 op = path .rstrip ('/' ).split ('/' )[- 1 ]
581664 to_wrap = next ((m for m in WRAPPED_METHODS if m .get ("method" ) == op ), None )
582665 if to_wrap :
583- return _wrap (tracer , token_histogram , duration_histogram , streaming_time_to_first_token , to_wrap )(
666+ return _wrap (
667+ tracer ,
668+ token_histogram ,
669+ duration_histogram ,
670+ streaming_time_to_first_token ,
671+ streaming_time_to_generate ,
672+ to_wrap
673+ )(
584674 wrapped , instance , args , kwargs
585675 )
586676 return wrapped (* args , ** kwargs )
587677 return wrapper
588678
589679
590- def _dispatch_awrap (tracer , token_histogram , duration_histogram , streaming_time_to_first_token ):
680+ def _dispatch_awrap (
681+ tracer ,
682+ token_histogram ,
683+ duration_histogram ,
684+ streaming_time_to_first_token ,
685+ streaming_time_to_generate
686+ ):
591687 async def wrapper (wrapped , instance , args , kwargs ):
592688 to_wrap = None
593689 if len (args ) > 2 and isinstance (args [2 ], str ):
594690 path = args [2 ]
595691 op = path .rstrip ('/' ).split ('/' )[- 1 ]
596692 to_wrap = next ((m for m in WRAPPED_METHODS if m .get ("method" ) == op ), None )
597693 if to_wrap :
598- return await _awrap (tracer , token_histogram , duration_histogram , streaming_time_to_first_token , to_wrap )(
694+ return await _awrap (
695+ tracer ,
696+ token_histogram ,
697+ duration_histogram ,
698+ streaming_time_to_first_token ,
699+ streaming_time_to_generate ,
700+ to_wrap
701+ )(
599702 wrapped , instance , args , kwargs
600703 )
601704 return await wrapped (* args , ** kwargs )
0 commit comments