Skip to content

Commit 622d1e4

Browse files
feat(ollama): add streaming time-to-generate (STTG) meter to ollama instrumentation (#3053)
Co-authored-by: Nir Gazit <nirga@users.noreply.github.com>
1 parent 41917d6 commit 622d1e4

File tree

4 files changed

+215
-17
lines changed

4 files changed

+215
-17
lines changed

packages/opentelemetry-instrumentation-ollama/opentelemetry/instrumentation/ollama/__init__.py

Lines changed: 118 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ def _accumulate_streaming_response(
260260
llm_request_type,
261261
response,
262262
streaming_time_to_first_token=None,
263+
streaming_time_to_generate=None,
263264
start_time=None
264265
):
265266
if llm_request_type == LLMRequestTypeValues.CHAT:
@@ -268,10 +269,16 @@ def _accumulate_streaming_response(
268269
accumulated_response = {"response": ""}
269270

270271
first_token = True
272+
first_token_time = None
273+
last_response = None
274+
271275
for res in response:
276+
last_response = res # Track the last response explicitly
277+
272278
if first_token and streaming_time_to_first_token and start_time is not None:
279+
first_token_time = time.perf_counter()
273280
streaming_time_to_first_token.record(
274-
time.perf_counter() - start_time,
281+
first_token_time - start_time,
275282
attributes={SpanAttributes.LLM_SYSTEM: "Ollama"},
276283
)
277284
first_token = False
@@ -284,8 +291,24 @@ def _accumulate_streaming_response(
284291
text = res.get("response", "")
285292
accumulated_response["response"] += text
286293

287-
response_data = res.model_dump() if hasattr(res, 'model_dump') else res
288-
_set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
294+
# Record streaming time to generate after the response is complete
295+
if streaming_time_to_generate and first_token_time is not None:
296+
model_name = last_response.get("model") if last_response else None
297+
streaming_time_to_generate.record(
298+
time.perf_counter() - first_token_time,
299+
attributes={
300+
SpanAttributes.LLM_SYSTEM: "Ollama",
301+
SpanAttributes.LLM_RESPONSE_MODEL: model_name,
302+
},
303+
)
304+
305+
response_data = (
306+
last_response.model_dump()
307+
if last_response and hasattr(last_response, 'model_dump')
308+
else last_response
309+
)
310+
if response_data:
311+
_set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
289312
span.end()
290313

291314

@@ -295,6 +318,7 @@ async def _aaccumulate_streaming_response(
295318
llm_request_type,
296319
response,
297320
streaming_time_to_first_token=None,
321+
streaming_time_to_generate=None,
298322
start_time=None,
299323
):
300324
if llm_request_type == LLMRequestTypeValues.CHAT:
@@ -303,11 +327,16 @@ async def _aaccumulate_streaming_response(
303327
accumulated_response = {"response": ""}
304328

305329
first_token = True
330+
first_token_time = None
331+
last_response = None
306332

307333
async for res in response:
334+
last_response = res
335+
308336
if first_token and streaming_time_to_first_token and start_time is not None:
337+
first_token_time = time.perf_counter()
309338
streaming_time_to_first_token.record(
310-
time.perf_counter() - start_time,
339+
first_token_time - start_time,
311340
attributes={SpanAttributes.LLM_SYSTEM: "Ollama"},
312341
)
313342
first_token = False
@@ -320,21 +349,45 @@ async def _aaccumulate_streaming_response(
320349
text = res.get("response", "")
321350
accumulated_response["response"] += text
322351

323-
response_data = res.model_dump() if hasattr(res, 'model_dump') else res
324-
_set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
352+
# Record streaming time to generate after the response is complete
353+
if streaming_time_to_generate and first_token_time is not None:
354+
model_name = last_response.get("model") if last_response else None
355+
streaming_time_to_generate.record(
356+
time.perf_counter() - first_token_time,
357+
attributes={
358+
SpanAttributes.LLM_SYSTEM: "Ollama",
359+
SpanAttributes.LLM_RESPONSE_MODEL: model_name,
360+
},
361+
)
362+
363+
response_data = (
364+
last_response.model_dump()
365+
if last_response and hasattr(last_response, 'model_dump')
366+
else last_response
367+
)
368+
if response_data:
369+
_set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
325370
span.end()
326371

327372

328373
def _with_tracer_wrapper(func):
329374
"""Helper for providing tracer for wrapper functions."""
330375

331-
def _with_tracer(tracer, token_histogram, duration_histogram, streaming_time_to_first_token, to_wrap):
376+
def _with_tracer(
377+
tracer,
378+
token_histogram,
379+
duration_histogram,
380+
streaming_time_to_first_token,
381+
streaming_time_to_generate,
382+
to_wrap
383+
):
332384
def wrapper(wrapped, instance, args, kwargs):
333385
return func(
334386
tracer,
335387
token_histogram,
336388
duration_histogram,
337389
streaming_time_to_first_token,
390+
streaming_time_to_generate,
338391
to_wrap,
339392
wrapped,
340393
instance,
@@ -364,6 +417,7 @@ def _wrap(
364417
token_histogram: Histogram,
365418
duration_histogram: Histogram,
366419
streaming_time_to_first_token: Histogram,
420+
streaming_time_to_generate: Histogram,
367421
to_wrap,
368422
wrapped,
369423
instance,
@@ -410,6 +464,7 @@ def _wrap(
410464
llm_request_type,
411465
response,
412466
streaming_time_to_first_token,
467+
streaming_time_to_generate,
413468
start_time,
414469
)
415470

@@ -426,6 +481,7 @@ async def _awrap(
426481
token_histogram: Histogram,
427482
duration_histogram: Histogram,
428483
streaming_time_to_first_token: Histogram,
484+
streaming_time_to_generate: Histogram,
429485
to_wrap,
430486
wrapped,
431487
instance,
@@ -472,6 +528,7 @@ async def _awrap(
472528
llm_request_type,
473529
response,
474530
streaming_time_to_first_token,
531+
streaming_time_to_generate,
475532
start_time,
476533
)
477534

@@ -501,7 +558,13 @@ def _build_metrics(meter: Meter):
501558
description="Time to first token in streaming chat completions",
502559
)
503560

504-
return token_histogram, duration_histogram, streaming_time_to_first_token
561+
streaming_time_to_generate = meter.create_histogram(
562+
name=Meters.LLM_STREAMING_TIME_TO_GENERATE,
563+
unit="s",
564+
description="Time from first token to completion in streaming responses",
565+
)
566+
567+
return token_histogram, duration_histogram, streaming_time_to_first_token, streaming_time_to_generate
505568

506569

507570
def is_metrics_collection_enabled() -> bool:
@@ -530,13 +593,15 @@ def _instrument(self, **kwargs):
530593
token_histogram,
531594
duration_histogram,
532595
streaming_time_to_first_token,
596+
streaming_time_to_generate,
533597
) = _build_metrics(meter)
534598
else:
535599
(
536600
token_histogram,
537601
duration_histogram,
538602
streaming_time_to_first_token,
539-
) = (None, None, None)
603+
streaming_time_to_generate,
604+
) = (None, None, None, None)
540605

541606
# Patch _copy_messages to sanitize tool_calls arguments before Pydantic validation
542607
wrap_function_wrapper(
@@ -548,12 +613,24 @@ def _instrument(self, **kwargs):
548613
wrap_function_wrapper(
549614
"ollama._client",
550615
"Client._request",
551-
_dispatch_wrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token),
616+
_dispatch_wrap(
617+
tracer,
618+
token_histogram,
619+
duration_histogram,
620+
streaming_time_to_first_token,
621+
streaming_time_to_generate
622+
),
552623
)
553624
wrap_function_wrapper(
554625
"ollama._client",
555626
"AsyncClient._request",
556-
_dispatch_awrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token),
627+
_dispatch_awrap(
628+
tracer,
629+
token_histogram,
630+
duration_histogram,
631+
streaming_time_to_first_token,
632+
streaming_time_to_generate
633+
),
557634
)
558635

559636
def _uninstrument(self, **kwargs):
@@ -572,30 +649,56 @@ def _uninstrument(self, **kwargs):
572649
)
573650

574651

575-
def _dispatch_wrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token):
652+
def _dispatch_wrap(
653+
tracer,
654+
token_histogram,
655+
duration_histogram,
656+
streaming_time_to_first_token,
657+
streaming_time_to_generate
658+
):
576659
def wrapper(wrapped, instance, args, kwargs):
577660
to_wrap = None
578661
if len(args) > 2 and isinstance(args[2], str):
579662
path = args[2]
580663
op = path.rstrip('/').split('/')[-1]
581664
to_wrap = next((m for m in WRAPPED_METHODS if m.get("method") == op), None)
582665
if to_wrap:
583-
return _wrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token, to_wrap)(
666+
return _wrap(
667+
tracer,
668+
token_histogram,
669+
duration_histogram,
670+
streaming_time_to_first_token,
671+
streaming_time_to_generate,
672+
to_wrap
673+
)(
584674
wrapped, instance, args, kwargs
585675
)
586676
return wrapped(*args, **kwargs)
587677
return wrapper
588678

589679

590-
def _dispatch_awrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token):
680+
def _dispatch_awrap(
681+
tracer,
682+
token_histogram,
683+
duration_histogram,
684+
streaming_time_to_first_token,
685+
streaming_time_to_generate
686+
):
591687
async def wrapper(wrapped, instance, args, kwargs):
592688
to_wrap = None
593689
if len(args) > 2 and isinstance(args[2], str):
594690
path = args[2]
595691
op = path.rstrip('/').split('/')[-1]
596692
to_wrap = next((m for m in WRAPPED_METHODS if m.get("method") == op), None)
597693
if to_wrap:
598-
return await _awrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token, to_wrap)(
694+
return await _awrap(
695+
tracer,
696+
token_histogram,
697+
duration_histogram,
698+
streaming_time_to_first_token,
699+
streaming_time_to_generate,
700+
to_wrap
701+
)(
599702
wrapped, instance, args, kwargs
600703
)
601704
return await wrapped(*args, **kwargs)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# VCR cassette for test_ollama_streaming_time_to_generate_metrics
2+
# Use `pytest --vcr-record=once` to generate and record HTTP interactions
3+
4+
interactions:
5+
- request:
6+
body: '{"model": "gemma3:1b", "stream": true, "prompt": "Tell me a joke about
7+
OpenTelemetry"}'
8+
headers:
9+
accept:
10+
- application/json
11+
accept-encoding:
12+
- gzip, deflate, br
13+
connection:
14+
- keep-alive
15+
content-length:
16+
- '86'
17+
content-type:
18+
- application/json
19+
host:
20+
- 127.0.0.1:11434
21+
user-agent:
22+
- ollama-python/0.4.7 (arm64 darwin) Python/3.12.1
23+
method: POST
24+
uri: http://127.0.0.1:11434/api/generate
25+
response:
26+
body:
27+
string: "{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.678947Z\",\"response\":\"Okay\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.689977Z\",\"response\":\",\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.698682Z\",\"response\":\"
28+
here\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.705542Z\",\"response\":\"'\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.719443Z\",\"response\":\"s\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.727065Z\",\"response\":\"
29+
a\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.734378Z\",\"response\":\"
30+
joke\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.741338Z\",\"response\":\"
31+
about\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.748548Z\",\"response\":\"
32+
Open\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.755651Z\",\"response\":\"Telemetry\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.762923Z\",\"response\":\":\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.770238Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.777345Z\",\"response\":\"Why\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.784452Z\",\"response\":\"
33+
did\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.791322Z\",\"response\":\"
34+
the\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.799592Z\",\"response\":\"
35+
Open\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.807707Z\",\"response\":\"Telemetry\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.815026Z\",\"response\":\"
36+
team\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.82255Z\",\"response\":\"
37+
break\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.829539Z\",\"response\":\"
38+
up\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.837091Z\",\"response\":\"?\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.844886Z\",\"response\":\"
39+
\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.851832Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.858934Z\",\"response\":\"Because\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.86602Z\",\"response\":\"
40+
they\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.87291Z\",\"response\":\"
41+
couldn\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.880172Z\",\"response\":\"'\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.887737Z\",\"response\":\"t\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.895174Z\",\"response\":\"
42+
stop\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.902823Z\",\"response\":\"
43+
arguing\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.917946Z\",\"response\":\"
44+
about\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.92544Z\",\"response\":\"
45+
the\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.932778Z\",\"response\":\"
46+
*\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.939594Z\",\"response\":\"trace\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.946936Z\",\"response\":\"
47+
path\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.953976Z\",\"response\":\"*\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.96117Z\",\"response\":\"!\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.968478Z\",\"response\":\"
48+
\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.975841Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.982851Z\",\"response\":\"---\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.989474Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.996748Z\",\"response\":\"Would\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.003849Z\",\"response\":\"
49+
you\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.010896Z\",\"response\":\"
50+
like\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.018201Z\",\"response\":\"
51+
to\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.025795Z\",\"response\":\"
52+
hear\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.033527Z\",\"response\":\"
53+
another\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.040887Z\",\"response\":\"
54+
one\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.048069Z\",\"response\":\"?\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.055597Z\",\"response\":\"
55+
\U0001F60A\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.062985Z\",\"response\":\"\",\"done\":true,\"done_reason\":\"stop\",\"context\":[105,2364,107,54593,786,496,31481,1003,7607,236654,106,107,105,4368,107,19058,236764,1590,236789,236751,496,31481,1003,7607,236654,236787,108,11355,1602,506,7607,236654,2434,2541,872,236881,236743,108,17574,901,9225,236789,236745,4721,46256,1003,506,808,27807,2479,236829,236888,236743,108,7243,108,38786,611,1133,531,6899,2264,886,236881,103453],\"total_duration\":482007750,\"load_duration\":52719542,\"prompt_eval_count\":16,\"prompt_eval_duration\":44457208,\"eval_count\":51,\"eval_duration\":384438167}\n"
56+
headers:
57+
Content-Type:
58+
- application/x-ndjson
59+
Date:
60+
- Mon, 19 May 2025 09:26:59 GMT
61+
Transfer-Encoding:
62+
- chunked
63+
status:
64+
code: 200
65+
message: OK
66+
version: 1

packages/opentelemetry-instrumentation-ollama/tests/test_ollama_metrics.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,44 @@ def test_ollama_streaming_metrics(metrics_test_context):
2929
pass
3030

3131
points = _collect_metrics(reader)
32-
# Assert metrics for token usage, operation duration, and time to first token are present
32+
# Assert metrics for token usage, operation duration, time to first token,
33+
# and streaming time to generate are present
3334
assert any(name == Meters.LLM_TOKEN_USAGE for name, _ in points), "Token usage metric not found"
3435
assert any(name == Meters.LLM_OPERATION_DURATION for name, _ in points), "Operation duration metric not found"
3536
assert any(name == GenAIMetrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN for name, _ in points), \
3637
"Time to first token metric not found"
38+
assert any(name == Meters.LLM_STREAMING_TIME_TO_GENERATE for name, _ in points), \
39+
"Streaming time to generate metric not found"
3740

3841
# Further assert that time-to-first-token is greater than 0 and has the system attribute
3942
for name, dp in points:
4043
if name == GenAIMetrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN:
4144
assert dp.sum > 0, "Time to first token should be greater than 0"
4245
assert dp.attributes.get(SpanAttributes.LLM_SYSTEM) == "Ollama"
4346
break
47+
48+
49+
@pytest.mark.vcr
50+
def test_ollama_streaming_time_to_generate_metrics(metrics_test_context):
51+
_, reader = metrics_test_context
52+
53+
gen = ollama.generate(
54+
model="gemma3:1b",
55+
prompt="Tell me a joke about OpenTelemetry",
56+
stream=True,
57+
)
58+
for _ in gen:
59+
pass
60+
61+
points = _collect_metrics(reader)
62+
# Assert metrics for streaming time to generate is present
63+
assert any(name == Meters.LLM_STREAMING_TIME_TO_GENERATE for name, _ in points), \
64+
"Streaming time to generate metric not found"
65+
66+
# Further assert that streaming-time-to-generate is greater than 0 and has the system attribute
67+
for name, dp in points:
68+
if name == Meters.LLM_STREAMING_TIME_TO_GENERATE:
69+
assert dp.sum > 0, "Streaming time to generate should be greater than 0"
70+
assert dp.attributes.get(SpanAttributes.LLM_SYSTEM) == "Ollama"
71+
assert dp.attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) is not None
72+
break

0 commit comments

Comments
 (0)