feat(ollama): add streaming time-to-generate (STTG) meter to ollama instrumentation (#3053)

minimAluminiumalism · nirga · web-flow · commit 622d1e4344d6 · 2025-06-28T22:36:07.000+03:00
Co-authored-by: Nir Gazit <nirga@users.noreply.github.com>
diff --git a/packages/opentelemetry-instrumentation-ollama/opentelemetry/instrumentation/ollama/__init__.py b/packages/opentelemetry-instrumentation-ollama/opentelemetry/instrumentation/ollama/__init__.py
@@ -260,6 +260,7 @@ def _accumulate_streaming_response(
     llm_request_type,
     response,
     streaming_time_to_first_token=None,
+    streaming_time_to_generate=None,
     start_time=None
 ):
     if llm_request_type == LLMRequestTypeValues.CHAT:
@@ -268,10 +269,16 @@ def _accumulate_streaming_response(
         accumulated_response = {"response": ""}
 
     first_token = True
+    first_token_time = None
+    last_response = None
+
     for res in response:
+        last_response = res  # Track the last response explicitly
+
         if first_token and streaming_time_to_first_token and start_time is not None:
+            first_token_time = time.perf_counter()
             streaming_time_to_first_token.record(
-                time.perf_counter() - start_time,
+                first_token_time - start_time,
                 attributes={SpanAttributes.LLM_SYSTEM: "Ollama"},
             )
             first_token = False
@@ -284,8 +291,24 @@ def _accumulate_streaming_response(
             text = res.get("response", "")
             accumulated_response["response"] += text
 
-    response_data = res.model_dump() if hasattr(res, 'model_dump') else res
-    _set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
+    # Record streaming time to generate after the response is complete
+    if streaming_time_to_generate and first_token_time is not None:
+        model_name = last_response.get("model") if last_response else None
+        streaming_time_to_generate.record(
+            time.perf_counter() - first_token_time,
+            attributes={
+                SpanAttributes.LLM_SYSTEM: "Ollama",
+                SpanAttributes.LLM_RESPONSE_MODEL: model_name,
+            },
+        )
+
+    response_data = (
+        last_response.model_dump()
+        if last_response and hasattr(last_response, 'model_dump')
+        else last_response
+    )
+    if response_data:
+        _set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
     span.end()
 
 
@@ -295,6 +318,7 @@ async def _aaccumulate_streaming_response(
     llm_request_type,
     response,
     streaming_time_to_first_token=None,
+    streaming_time_to_generate=None,
     start_time=None,
 ):
     if llm_request_type == LLMRequestTypeValues.CHAT:
@@ -303,11 +327,16 @@ async def _aaccumulate_streaming_response(
         accumulated_response = {"response": ""}
 
     first_token = True
+    first_token_time = None
+    last_response = None
 
     async for res in response:
+        last_response = res
+
         if first_token and streaming_time_to_first_token and start_time is not None:
+            first_token_time = time.perf_counter()
             streaming_time_to_first_token.record(
-                time.perf_counter() - start_time,
+                first_token_time - start_time,
                 attributes={SpanAttributes.LLM_SYSTEM: "Ollama"},
             )
             first_token = False
@@ -320,21 +349,45 @@ async def _aaccumulate_streaming_response(
             text = res.get("response", "")
             accumulated_response["response"] += text
 
-    response_data = res.model_dump() if hasattr(res, 'model_dump') else res
-    _set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
+    # Record streaming time to generate after the response is complete
+    if streaming_time_to_generate and first_token_time is not None:
+        model_name = last_response.get("model") if last_response else None
+        streaming_time_to_generate.record(
+            time.perf_counter() - first_token_time,
+            attributes={
+                SpanAttributes.LLM_SYSTEM: "Ollama",
+                SpanAttributes.LLM_RESPONSE_MODEL: model_name,
+            },
+        )
+
+    response_data = (
+        last_response.model_dump()
+        if last_response and hasattr(last_response, 'model_dump')
+        else last_response
+    )
+    if response_data:
+        _set_response_attributes(span, token_histogram, llm_request_type, response_data | accumulated_response)
     span.end()
 
 
 def _with_tracer_wrapper(func):
     """Helper for providing tracer for wrapper functions."""
 
-    def _with_tracer(tracer, token_histogram, duration_histogram, streaming_time_to_first_token, to_wrap):
+    def _with_tracer(
+        tracer,
+        token_histogram,
+        duration_histogram,
+        streaming_time_to_first_token,
+        streaming_time_to_generate,
+        to_wrap
+    ):
         def wrapper(wrapped, instance, args, kwargs):
             return func(
                 tracer,
                 token_histogram,
                 duration_histogram,
                 streaming_time_to_first_token,
+                streaming_time_to_generate,
                 to_wrap,
                 wrapped,
                 instance,
@@ -364,6 +417,7 @@ def _wrap(
     token_histogram: Histogram,
     duration_histogram: Histogram,
     streaming_time_to_first_token: Histogram,
+    streaming_time_to_generate: Histogram,
     to_wrap,
     wrapped,
     instance,
@@ -410,6 +464,7 @@ def _wrap(
                     llm_request_type,
                     response,
                     streaming_time_to_first_token,
+                    streaming_time_to_generate,
                     start_time,
                 )
 
@@ -426,6 +481,7 @@ async def _awrap(
     token_histogram: Histogram,
     duration_histogram: Histogram,
     streaming_time_to_first_token: Histogram,
+    streaming_time_to_generate: Histogram,
     to_wrap,
     wrapped,
     instance,
@@ -472,6 +528,7 @@ async def _awrap(
                     llm_request_type,
                     response,
                     streaming_time_to_first_token,
+                    streaming_time_to_generate,
                     start_time,
                 )
 
@@ -501,7 +558,13 @@ def _build_metrics(meter: Meter):
         description="Time to first token in streaming chat completions",
     )
 
-    return token_histogram, duration_histogram, streaming_time_to_first_token
+    streaming_time_to_generate = meter.create_histogram(
+        name=Meters.LLM_STREAMING_TIME_TO_GENERATE,
+        unit="s",
+        description="Time from first token to completion in streaming responses",
+    )
+
+    return token_histogram, duration_histogram, streaming_time_to_first_token, streaming_time_to_generate
 
 
 def is_metrics_collection_enabled() -> bool:
@@ -530,13 +593,15 @@ def _instrument(self, **kwargs):
                 token_histogram,
                 duration_histogram,
                 streaming_time_to_first_token,
+                streaming_time_to_generate,
             ) = _build_metrics(meter)
         else:
             (
                 token_histogram,
                 duration_histogram,
                 streaming_time_to_first_token,
-            ) = (None, None, None)
+                streaming_time_to_generate,
+            ) = (None, None, None, None)
 
         # Patch _copy_messages to sanitize tool_calls arguments before Pydantic validation
         wrap_function_wrapper(
@@ -548,12 +613,24 @@ def _instrument(self, **kwargs):
         wrap_function_wrapper(
             "ollama._client",
             "Client._request",
-            _dispatch_wrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token),
+            _dispatch_wrap(
+                tracer,
+                token_histogram,
+                duration_histogram,
+                streaming_time_to_first_token,
+                streaming_time_to_generate
+            ),
         )
         wrap_function_wrapper(
             "ollama._client",
             "AsyncClient._request",
-            _dispatch_awrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token),
+            _dispatch_awrap(
+                tracer,
+                token_histogram,
+                duration_histogram,
+                streaming_time_to_first_token,
+                streaming_time_to_generate
+            ),
         )
 
     def _uninstrument(self, **kwargs):
@@ -572,30 +649,56 @@ def _uninstrument(self, **kwargs):
             )
 
 
-def _dispatch_wrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token):
+def _dispatch_wrap(
+    tracer,
+    token_histogram,
+    duration_histogram,
+    streaming_time_to_first_token,
+    streaming_time_to_generate
+):
     def wrapper(wrapped, instance, args, kwargs):
         to_wrap = None
         if len(args) > 2 and isinstance(args[2], str):
             path = args[2]
             op = path.rstrip('/').split('/')[-1]
             to_wrap = next((m for m in WRAPPED_METHODS if m.get("method") == op), None)
         if to_wrap:
-            return _wrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token, to_wrap)(
+            return _wrap(
+                tracer,
+                token_histogram,
+                duration_histogram,
+                streaming_time_to_first_token,
+                streaming_time_to_generate,
+                to_wrap
+            )(
                 wrapped, instance, args, kwargs
             )
         return wrapped(*args, **kwargs)
     return wrapper
 
 
-def _dispatch_awrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token):
+def _dispatch_awrap(
+    tracer,
+    token_histogram,
+    duration_histogram,
+    streaming_time_to_first_token,
+    streaming_time_to_generate
+):
     async def wrapper(wrapped, instance, args, kwargs):
         to_wrap = None
         if len(args) > 2 and isinstance(args[2], str):
             path = args[2]
             op = path.rstrip('/').split('/')[-1]
             to_wrap = next((m for m in WRAPPED_METHODS if m.get("method") == op), None)
         if to_wrap:
-            return await _awrap(tracer, token_histogram, duration_histogram, streaming_time_to_first_token, to_wrap)(
+            return await _awrap(
+                tracer,
+                token_histogram,
+                duration_histogram,
+                streaming_time_to_first_token,
+                streaming_time_to_generate,
+                to_wrap
+            )(
                 wrapped, instance, args, kwargs
             )
         return await wrapped(*args, **kwargs)
diff --git a/packages/opentelemetry-instrumentation-ollama/tests/cassettes/test_ollama_metrics/test_ollama_streaming_time_to_generate_metrics.yaml b/packages/opentelemetry-instrumentation-ollama/tests/cassettes/test_ollama_metrics/test_ollama_streaming_time_to_generate_metrics.yaml
@@ -0,0 +1,66 @@
+# VCR cassette for test_ollama_streaming_time_to_generate_metrics
+# Use `pytest --vcr-record=once` to generate and record HTTP interactions 
+
+interactions:
+- request:
+    body: '{"model": "gemma3:1b", "stream": true, "prompt": "Tell me a joke about
+      OpenTelemetry"}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate, br
+      connection:
+      - keep-alive
+      content-length:
+      - '86'
+      content-type:
+      - application/json
+      host:
+      - 127.0.0.1:11434
+      user-agent:
+      - ollama-python/0.4.7 (arm64 darwin) Python/3.12.1
+    method: POST
+    uri: http://127.0.0.1:11434/api/generate
+  response:
+    body:
+      string: "{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.678947Z\",\"response\":\"Okay\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.689977Z\",\"response\":\",\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.698682Z\",\"response\":\"
+        here\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.705542Z\",\"response\":\"'\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.719443Z\",\"response\":\"s\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.727065Z\",\"response\":\"
+        a\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.734378Z\",\"response\":\"
+        joke\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.741338Z\",\"response\":\"
+        about\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.748548Z\",\"response\":\"
+        Open\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.755651Z\",\"response\":\"Telemetry\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.762923Z\",\"response\":\":\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.770238Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.777345Z\",\"response\":\"Why\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.784452Z\",\"response\":\"
+        did\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.791322Z\",\"response\":\"
+        the\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.799592Z\",\"response\":\"
+        Open\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.807707Z\",\"response\":\"Telemetry\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.815026Z\",\"response\":\"
+        team\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.82255Z\",\"response\":\"
+        break\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.829539Z\",\"response\":\"
+        up\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.837091Z\",\"response\":\"?\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.844886Z\",\"response\":\"
+        \",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.851832Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.858934Z\",\"response\":\"Because\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.86602Z\",\"response\":\"
+        they\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.87291Z\",\"response\":\"
+        couldn\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.880172Z\",\"response\":\"'\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.887737Z\",\"response\":\"t\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.895174Z\",\"response\":\"
+        stop\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.902823Z\",\"response\":\"
+        arguing\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.917946Z\",\"response\":\"
+        about\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.92544Z\",\"response\":\"
+        the\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.932778Z\",\"response\":\"
+        *\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.939594Z\",\"response\":\"trace\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.946936Z\",\"response\":\"
+        path\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.953976Z\",\"response\":\"*\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.96117Z\",\"response\":\"!\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.968478Z\",\"response\":\"
+        \",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.975841Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.982851Z\",\"response\":\"---\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.989474Z\",\"response\":\"\\n\\n\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:26:59.996748Z\",\"response\":\"Would\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.003849Z\",\"response\":\"
+        you\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.010896Z\",\"response\":\"
+        like\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.018201Z\",\"response\":\"
+        to\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.025795Z\",\"response\":\"
+        hear\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.033527Z\",\"response\":\"
+        another\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.040887Z\",\"response\":\"
+        one\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.048069Z\",\"response\":\"?\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.055597Z\",\"response\":\"
+        \U0001F60A\",\"done\":false}\n{\"model\":\"gemma3:1b\",\"created_at\":\"2025-05-19T09:27:00.062985Z\",\"response\":\"\",\"done\":true,\"done_reason\":\"stop\",\"context\":[105,2364,107,54593,786,496,31481,1003,7607,236654,106,107,105,4368,107,19058,236764,1590,236789,236751,496,31481,1003,7607,236654,236787,108,11355,1602,506,7607,236654,2434,2541,872,236881,236743,108,17574,901,9225,236789,236745,4721,46256,1003,506,808,27807,2479,236829,236888,236743,108,7243,108,38786,611,1133,531,6899,2264,886,236881,103453],\"total_duration\":482007750,\"load_duration\":52719542,\"prompt_eval_count\":16,\"prompt_eval_duration\":44457208,\"eval_count\":51,\"eval_duration\":384438167}\n"
+    headers:
+      Content-Type:
+      - application/x-ndjson
+      Date:
+      - Mon, 19 May 2025 09:26:59 GMT
+      Transfer-Encoding:
+      - chunked
+    status:
+      code: 200
+      message: OK
+version: 1 
diff --git a/packages/opentelemetry-instrumentation-ollama/tests/test_ollama_metrics.py b/packages/opentelemetry-instrumentation-ollama/tests/test_ollama_metrics.py
@@ -29,15 +29,44 @@ def test_ollama_streaming_metrics(metrics_test_context):
         pass
 
     points = _collect_metrics(reader)
-    # Assert metrics for token usage, operation duration, and time to first token are present
+    # Assert metrics for token usage, operation duration, time to first token,
+    # and streaming time to generate are present
     assert any(name == Meters.LLM_TOKEN_USAGE for name, _ in points), "Token usage metric not found"
     assert any(name == Meters.LLM_OPERATION_DURATION for name, _ in points), "Operation duration metric not found"
     assert any(name == GenAIMetrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN for name, _ in points), \
         "Time to first token metric not found"
+    assert any(name == Meters.LLM_STREAMING_TIME_TO_GENERATE for name, _ in points), \
+        "Streaming time to generate metric not found"
 
     # Further assert that time-to-first-token is greater than 0 and has the system attribute
     for name, dp in points:
         if name == GenAIMetrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN:
             assert dp.sum > 0, "Time to first token should be greater than 0"
             assert dp.attributes.get(SpanAttributes.LLM_SYSTEM) == "Ollama"
             break
+
+
+@pytest.mark.vcr
+def test_ollama_streaming_time_to_generate_metrics(metrics_test_context):
+    _, reader = metrics_test_context
+
+    gen = ollama.generate(
+        model="gemma3:1b",
+        prompt="Tell me a joke about OpenTelemetry",
+        stream=True,
+    )
+    for _ in gen:
+        pass
+
+    points = _collect_metrics(reader)
+    # Assert that the streaming time to generate metric is present
+    assert any(name == Meters.LLM_STREAMING_TIME_TO_GENERATE for name, _ in points), \
+        "Streaming time to generate metric not found"
+
+    # Further assert that streaming-time-to-generate is greater than 0 and has system and model attributes
+    for name, dp in points:
+        if name == Meters.LLM_STREAMING_TIME_TO_GENERATE:
+            assert dp.sum > 0, "Streaming time to generate should be greater than 0"
+            assert dp.attributes.get(SpanAttributes.LLM_SYSTEM) == "Ollama"
+            assert dp.attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) is not None
+            break
diff --git a/packages/opentelemetry-semantic-conventions-ai/opentelemetry/semconv_ai/__init__.py b/packages/opentelemetry-semantic-conventions-ai/opentelemetry/semconv_ai/__init__.py