Commit 60c60f7

VertexAI handle streaming requests
WIP using shared context manager
Properly implement uninstrument
Shared code with a contextmanager
1 parent 4397344 commit 60c60f7

File tree: 4 files changed, +324 -33 lines

instrumentation-genai/opentelemetry-instrumentation-vertexai/src/opentelemetry/instrumentation/vertexai/__init__.py

Lines changed: 19 additions & 3 deletions

@@ -46,12 +46,11 @@
 )
 
 from opentelemetry._events import get_event_logger
+from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.instrumentation.vertexai.package import _instruments
-from opentelemetry.instrumentation.vertexai.patch import (
-    generate_content_create,
-)
+from opentelemetry.instrumentation.vertexai.patch import PatchedMethods
 from opentelemetry.instrumentation.vertexai.utils import is_content_enabled
 from opentelemetry.semconv.schemas import Schemas
 from opentelemetry.trace import get_tracer
@@ -104,6 +103,23 @@ def _instrument(self, **kwargs: Any):
             ),
         )
 
+        for module in (
+            "google.cloud.aiplatform_v1.services.prediction_service.client",
+            "google.cloud.aiplatform_v1beta1.services.prediction_service.client",
+        ):
+            # non streaming
+            wrap_function_wrapper(
+                module=module,
+                name="PredictionServiceClient.generate_content",
+                wrapper=patched_methods.generate_content,
+            )
+            # streaming
+            wrap_function_wrapper(
+                module=module,
+                name="PredictionServiceClient.stream_generate_content",
+                wrapper=patched_methods.stream_generate_content,
+            )
+
     def _uninstrument(self, **kwargs: Any) -> None:
         for client_class in _client_classes():
             unwrap(client_class, "generate_content")
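
The wrappers registered above follow wrapt's calling convention: wrapt invokes a wrapper as `wrapper(wrapped, instance, args, kwargs)`, which is why bound methods of a `PatchedMethods` instance can be passed directly as `wrapper=`. A minimal, runnable sketch of that contract; the `Client` and `Wrappers` classes here are hypothetical stand-ins, not part of this commit:

```python
# Hypothetical demo of the wrapt contract used above: wrapt calls the
# wrapper as wrapper(wrapped, instance, args, kwargs), so a bound method
# with that signature works as a wrapper.
from wrapt import wrap_function_wrapper


class Client:  # stand-in for PredictionServiceClient
    def generate_content(self, prompt: str) -> str:
        return f"response to {prompt!r}"


class Wrappers:  # stand-in for PatchedMethods
    def generate_content(self, wrapped, instance, args, kwargs):
        # `wrapped` is the original bound method, `instance` is the Client.
        print("span would open here")
        try:
            return wrapped(*args, **kwargs)
        finally:
            print("span would close here")


wrap_function_wrapper(
    module=__name__,
    name="Client.generate_content",
    wrapper=Wrappers().generate_content,
)
print(Client().generate_content("Say this is a test"))
```
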

instrumentation-genai/opentelemetry-instrumentation-vertexai/src/opentelemetry/instrumentation/vertexai/patch.py

Lines changed: 83 additions & 30 deletions

@@ -14,13 +14,22 @@
 
 from __future__ import annotations
 
+from contextlib import contextmanager
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
+    Generator,
+    Iterable,
     MutableSequence,
 )
 
+from google.cloud.aiplatform_v1.types.prediction_service import (
+    GenerateContentResponse,
+)
+from google.cloud.aiplatform_v1beta1.types.prediction_service import (
+    GenerateContentResponse,
+)
 from opentelemetry._events import EventLogger
 from opentelemetry.instrumentation.vertexai.utils import (
     GenerateContentParams,
@@ -87,17 +96,17 @@ def _extract_params(
     )
 
 
-def generate_content_create(
-    tracer: Tracer, event_logger: EventLogger, capture_content: bool
-):
-    """Wrap the `generate_content` method of the `GenerativeModel` class to trace it."""
+class PatchedMethods:
+    def __init__(
+        self, tracer: Tracer, event_logger: EventLogger, capture_content: bool
+    ) -> None:
+        self.tracer = tracer
+        self.event_logger = event_logger
+        self.capture_content = capture_content
 
-    def traced_method(
-        wrapped: Callable[
-            ...,
-            prediction_service.GenerateContentResponse
-            | prediction_service_v1beta1.GenerateContentResponse,
-        ],
+    @contextmanager
+    def _start_as_current_span(
+        self,
         instance: client.PredictionServiceClient
         | client_v1beta1.PredictionServiceClient,
         args: Any,
@@ -111,32 +120,76 @@ def traced_method(
         }
 
         span_name = get_span_name(span_attributes)
-        with tracer.start_as_current_span(
-            name=span_name,
-            kind=SpanKind.CLIENT,
-            attributes=span_attributes,
+
+        with self.tracer.start_as_current_span(
+            name=span_name, kind=SpanKind.CLIENT, attributes=span_attributes
         ) as span:
             for event in request_to_events(
-                params=params, capture_content=capture_content
+                params=params, capture_content=self.capture_content
            ):
-                event_logger.emit(event)
+                self.event_logger.emit(event)
 
             # TODO: set error.type attribute
             # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
-            response = wrapped(*args, **kwargs)
-            # TODO: handle streaming
-            # if is_streaming(kwargs):
-            #     return StreamWrapper(
-            #         result, span, event_logger, capture_content
-            #     )
-
-            if span.is_recording():
-                span.set_attributes(get_genai_response_attributes(response))
-            for event in response_to_events(
-                response=response, capture_content=capture_content
-            ):
-                event_logger.emit(event)
 
+            final_response = None
+
+            def handle_response(
+                response: prediction_service.GenerateContentResponse
+                | prediction_service_v1beta1.GenerateContentResponse,
+            ) -> None:
+                nonlocal final_response
+                final_response = response
+                for event in response_to_events(
+                    response=response, capture_content=self.capture_content
+                ):
+                    self.event_logger.emit(event)
+
+            yield handle_response
+
+            # These attributes are only set on the final response in the case of streaming
+            if final_response and span.is_recording():
+                span.set_attributes(
+                    get_genai_response_attributes(final_response)
+                )
+
+    def generate_content(
+        self,
+        wrapped: Callable[
+            ...,
+            prediction_service.GenerateContentResponse
+            | prediction_service_v1beta1.GenerateContentResponse,
+        ],
+        instance: client.PredictionServiceClient
+        | client_v1beta1.PredictionServiceClient,
+        args: Any,
+        kwargs: Any,
+    ) -> GenerateContentResponse | GenerateContentResponse:
+        with self._start_as_current_span(
+            instance, args, kwargs
+        ) as handle_response:
+            response = wrapped(*args, **kwargs)
+            handle_response(response)
             return response
 
-    return traced_method
+    def stream_generate_content(
+        self,
+        wrapped: Callable[
+            ...,
+            Iterable[prediction_service.GenerateContentResponse]
+            | Iterable[prediction_service_v1beta1.GenerateContentResponse],
+        ],
+        instance: client.PredictionServiceClient
+        | client_v1beta1.PredictionServiceClient,
+        args: Any,
+        kwargs: Any,
+    ) -> Generator[
+        GenerateContentResponse | GenerateContentResponse, Any, None
+    ]:
+        print("stream_generate_content() starting ctxmanager")
+        with self._start_as_current_span(
+            instance, args, kwargs
+        ) as handle_response:
+            for response in wrapped(*args, **kwargs):
+                handle_response(response)
+                yield response
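
The shared helper `_start_as_current_span` is the core of this change: it opens the CLIENT span, emits the request events, yields a `handle_response` callback, and on exit records attributes from the last response seen, so the unary and streaming wrappers share all span bookkeeping. A stripped-down, runnable sketch of the same pattern, with hypothetical names:

```python
# Standalone sketch (hypothetical names) of the shared-contextmanager pattern:
# one context manager yields a callback, and both the unary and streaming code
# paths drive it while the "span" stays open.
from contextlib import contextmanager
from typing import Callable, Generator, Iterable, Iterator


@contextmanager
def _record(label: str) -> Iterator[Callable[[str], None]]:
    final = None

    def handle(chunk: str) -> None:
        nonlocal final
        final = chunk  # like final_response above, keeps the last chunk

    print(f"open {label}")
    try:
        yield handle
    finally:
        # runs once the unary call returns or the stream is exhausted
        print(f"close {label}, final={final!r}")


def unary(call: Callable[[], str]) -> str:
    with _record("unary") as handle:
        result = call()
        handle(result)
        return result


def streaming(chunks: Iterable[str]) -> Generator[str, None, None]:
    with _record("stream") as handle:
        for chunk in chunks:
            handle(chunk)
            yield chunk


unary(lambda: "one-shot")                           # open/close bracket one call
list(streaming(["Okay", ", I understand", ".\n"]))  # close fires at exhaustion
```

Because `stream_generate_content` is itself a generator, the `with` block, and therefore the span, stays open until the caller exhausts or closes the returned iterator, which is what lets the final chunk's metadata land on the span.
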
Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+interactions:
+- request:
+    body: |-
+      {
+        "contents": [
+          {
+            "role": "user",
+            "parts": [
+              {
+                "text": "Say this is a test"
+              }
+            ]
+          }
+        ]
+      }
+    headers:
+      Accept:
+      - '*/*'
+      Accept-Encoding:
+      - gzip, deflate
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '141'
+      Content-Type:
+      - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://us-central1-aiplatform.googleapis.com/v1/projects/fake-project/locations/us-central1/publishers/google/models/gemini-1.5-flash-002:streamGenerateContent?%24alt=json%3Benum-encoding%3Dint
+  response:
+    body:
+      string: |-
+        [
+          {
+            "candidates": [
+              {
+                "content": {
+                  "role": "model",
+                  "parts": [
+                    {
+                      "text": "Okay"
+                    }
+                  ]
+                }
+              }
+            ],
+            "usageMetadata": {},
+            "modelVersion": "gemini-1.5-flash-002",
+            "createTime": "2025-03-03T22:23:47.310622Z",
+            "responseId": "8yvGZ976Eu6knvgPpOnW2Q4"
+          },
+          {
+            "candidates": [
+              {
+                "content": {
+                  "role": "model",
+                  "parts": [
+                    {
+                      "text": ", I understand. I'm ready for your test. Please proceed"
+                    }
+                  ]
+                }
+              }
+            ],
+            "modelVersion": "gemini-1.5-flash-002",
+            "createTime": "2025-03-03T22:23:47.310622Z",
+            "responseId": "8yvGZ976Eu6knvgPpOnW2Q4"
+          },
+          {
+            "candidates": [
+              {
+                "content": {
+                  "role": "model",
+                  "parts": [
+                    {
+                      "text": ".\n"
+                    }
+                  ]
+                },
+                "finishReason": 1
+              }
+            ],
+            "usageMetadata": {
+              "promptTokenCount": 5,
+              "candidatesTokenCount": 19,
+              "totalTokenCount": 24,
+              "promptTokensDetails": [
+                {
+                  "modality": 1,
+                  "tokenCount": 5
+                }
+              ],
+              "candidatesTokensDetails": [
+                {
+                  "modality": 1,
+                  "tokenCount": 19
+                }
+              ]
+            },
+            "modelVersion": "gemini-1.5-flash-002",
+            "createTime": "2025-03-03T22:23:47.310622Z",
+            "responseId": "8yvGZ976Eu6knvgPpOnW2Q4"
+          }
+        ]
+    headers:
+      Content-Type:
+      - application/json; charset=UTF-8
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      content-length:
+      - '1328'
+  status:
+    code: 200
+    message: OK
+version: 1
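
This cassette records a single `streamGenerateContent` call that returns three chunks. A hedged sketch of how such a cassette might be replayed in a streaming test with pytest-recording; the `span_exporter` fixture and the exact assertions are assumptions, not taken from this commit:

```python
# Hypothetical test sketch replaying a cassette like the one above with
# pytest-recording; fixture names are assumptions.
import pytest
from vertexai.generative_models import GenerativeModel


@pytest.mark.vcr
def test_stream_generate_content(span_exporter):  # assumed fixture
    model = GenerativeModel("gemini-1.5-flash-002")
    chunks = list(model.generate_content("Say this is a test", stream=True))
    assert len(chunks) == 3  # the cassette holds three streamed responses
    spans = span_exporter.get_finished_spans()
    assert len(spans) == 1  # one CLIENT span covers the entire stream
```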
