
Commit f9e090f

fix(openai): ensure streamed spans with error are manually finished [backport 1.19] (#6911)
Backport f3beaf4 from #6891 to 1.19. Resolves #6769.

This PR fixes an unhandled case in our OpenAI integration for streamed `Chat/Completion` requests that error and produce an empty response. In this case, streamed spans were never finished (we avoid finishing streamed-response spans until the underlying generator is exhausted) because the empty-response path was not handled. The fix adds manual span finishing for streamed-response spans with an error, as we already do for non-streamed spans (the new finishing rule is sketched after the checklists below). The only risk is a traced request that produces a non-empty response together with an error; this is highly unlikely, since a faulty request should not produce any response at all.

Note: this PR also moves tagging of prompt token usage information into the `EndpointHook.process_response()` handler instead of `EndpointHook.process_request()`, since keeping it in the latter caused prompt token information to be recorded even when no actual prompt/completion operation happened because of the erroneous request.

## Checklist

- [x] Change(s) are motivated and described in the PR description.
- [x] Testing strategy is described if automated tests are not included in the PR.
- [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc.).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`.
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)).
- [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)).

## Reviewer Checklist

- [x] Title is accurate.
- [x] No unnecessary changes are introduced.
- [x] Description motivates each change.
- [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary.
- [x] Testing strategy adequately addresses listed risk(s).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] Release note makes sense to a user of the library.
- [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment.
- [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting).
- [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`.
- [x] This PR doesn't touch any of that.

Co-authored-by: Yun Kim <[email protected]>
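For a quick picture of the behavior change, here is a minimal sketch of the new finishing rule. The standalone helper `finish_span_if_needed` is illustrative only; the actual change is made inline in `_traced_endpoint` in `ddtrace/contrib/openai/patch.py`, shown in the diff below.

```python
def finish_span_if_needed(span, kwargs, err, integration):
    """Illustrative sketch: finish the request span in the handler itself when
    it will not be finished later by the streaming generator."""
    # Non-streamed spans are finished here, as before. Streamed spans are
    # normally finished once the response generator is exhausted, but an
    # errored streamed request produces no generator to exhaust, so it must
    # be finished manually as well.
    if not kwargs.get("stream") or err is not None:
        span.finish()
        integration.metric(span, "dist", "request.duration", span.duration_ns)
```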
1 parent 907289b commit f9e090f

File tree: 6 files changed, +139 −19 lines


ddtrace/contrib/openai/_endpoint_hooks.py

Lines changed: 14 additions & 13 deletions
```diff
@@ -91,7 +91,6 @@ def shared_gen():
             try:
                 num_prompt_tokens = span.get_metric("openai.response.usage.prompt_tokens") or 0
                 num_completion_tokens = yield
-
                 span.set_metric("openai.response.usage.completion_tokens", num_completion_tokens)
                 total_tokens = num_prompt_tokens + num_completion_tokens
                 span.set_metric("openai.response.usage.total_tokens", total_tokens)
@@ -180,8 +179,15 @@ def _record_request(self, pin, integration, span, args, kwargs):
         elif prompt:
             for idx, p in enumerate(prompt):
                 span.set_tag_str("openai.request.prompt.%d" % idx, integration.trunc(str(p)))
+        return
+
+    def _record_response(self, pin, integration, span, args, kwargs, resp, error):
+        if not resp:
+            return self._handle_response(pin, span, integration, resp)
+        prompt = kwargs.get("prompt", "")
         if kwargs.get("stream"):
             num_prompt_tokens = 0
+            estimated = False
             if isinstance(prompt, str) or isinstance(prompt, list) and isinstance(prompt[0], int):
                 estimated, prompt_tokens = _compute_prompt_token_count(prompt, kwargs.get("model"))
                 num_prompt_tokens += prompt_tokens
@@ -191,10 +197,6 @@ def _record_request(self, pin, integration, span, args, kwargs):
                 num_prompt_tokens += prompt_tokens
             span.set_metric("openai.request.prompt_tokens_estimated", int(estimated))
             span.set_metric("openai.response.usage.prompt_tokens", num_prompt_tokens)
-        return
-
-    def _record_response(self, pin, integration, span, args, kwargs, resp, error):
-        if not resp or kwargs.get("stream"):
             return self._handle_response(pin, span, integration, resp)
         if "choices" in resp:
             choices = resp["choices"]
@@ -212,7 +214,6 @@ def _record_response(self, pin, integration, span, args, kwargs, resp, error):
             span.set_tag_str("openai.response.choices.%d.text" % idx, integration.trunc(choice.get("text")))
         integration.record_usage(span, resp.get("usage"))
         if integration.is_pc_sampled_log(span):
-            prompt = kwargs.get("prompt", "")
             integration.log(
                 span,
                 "info" if error is None else "error",
@@ -254,19 +255,20 @@ def _record_request(self, pin, integration, span, args, kwargs):
             span.set_tag_str("openai.request.messages.%d.content" % idx, content)
             span.set_tag_str("openai.request.messages.%d.role" % idx, role)
             span.set_tag_str("openai.request.messages.%d.name" % idx, name)
+        return
+
+    def _record_response(self, pin, integration, span, args, kwargs, resp, error):
+        if not resp:
+            return self._handle_response(pin, span, integration, resp)
+        messages = kwargs.get("messages")
         if kwargs.get("stream"):
-            # streamed responses do not have a usage field, so we have to
-            # estimate the number of tokens returned.
             est_num_message_tokens = 0
+            estimated = False
             for m in messages:
                 estimated, prompt_tokens = _compute_prompt_token_count(m.get("content", ""), kwargs.get("model"))
                 est_num_message_tokens += prompt_tokens
             span.set_metric("openai.request.prompt_tokens_estimated", int(estimated))
             span.set_metric("openai.response.usage.prompt_tokens", est_num_message_tokens)
-        return
-
-    def _record_response(self, pin, integration, span, args, kwargs, resp, error):
-        if not resp or kwargs.get("stream"):
             return self._handle_response(pin, span, integration, resp)
         choices = resp.get("choices", [])
         span.set_metric("openai.response.choices_count", len(choices))
@@ -291,7 +293,6 @@ def _record_response(self, pin, integration, span, args, kwargs, resp, error):
             )
         integration.record_usage(span, resp.get("usage"))
         if integration.is_pc_sampled_log(span):
-            messages = kwargs.get("messages")
             integration.log(
                 span,
                 "info" if error is None else "error",
```

ddtrace/contrib/openai/patch.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -349,6 +349,7 @@ def _patched_make_session(func, args, kwargs):
 def _traced_endpoint(endpoint_hook, integration, pin, args, kwargs):
     span = integration.trace(pin, endpoint_hook.OPERATION_ID)
     openai_api_key = _format_openai_api_key(kwargs.get("api_key"))
+    err = None
     if openai_api_key:
         # API key can either be set on the import or per request
         span.set_tag_str("openai.user.api_key", openai_api_key)
@@ -357,22 +358,23 @@ def _traced_endpoint(endpoint_hook, integration, pin, args, kwargs):
     hook = endpoint_hook().handle_request(pin, integration, span, args, kwargs)
     hook.send(None)

-    resp, error = yield
+    resp, err = yield

     # Record any error information
-    if error is not None:
+    if err is not None:
         span.set_exc_info(*sys.exc_info())
         integration.metric(span, "incr", "request.error", 1)

     # Pass the response and the error to the hook
     try:
-        hook.send((resp, error))
+        hook.send((resp, err))
     except StopIteration as e:
-        if error is None:
+        if err is None:
             return e.value
     finally:
-        # Streamed responses will be finished when the generator exits.
-        if not kwargs.get("stream"):
+        # Streamed responses will be finished when the generator exits, so finish non-streamed spans here.
+        # Streamed responses with error will need to be finished manually as well.
+        if not kwargs.get("stream") or err is not None:
             span.finish()
             integration.metric(span, "dist", "request.duration", span.duration_ns)
```
Lines changed: 4 additions & 0 deletions
```diff
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    openai: This fix resolves an issue where errors during streamed requests resulted in unfinished spans.
```
Lines changed: 57 additions & 0 deletions
```diff
@@ -0,0 +1,57 @@
+interactions:
+- request:
+    body: '{"model": "text-curie-001", "prompt": "how does openai tokenize prompts?",
+      "temperature": 0.8, "n": 1, "max_tokens": 150, "stream": true}'
+    headers:
+      Accept:
+      - '*/*'
+      Accept-Encoding:
+      - gzip, deflate
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '137'
+      Content-Type:
+      - application/json
+      User-Agent:
+      - OpenAI/v1 PythonBindings/0.27.2
+      X-OpenAI-Client-User-Agent:
+      - '{"bindings_version": "0.27.2", "httplib": "requests", "lang": "python", "lang_version":
+        "3.10.5", "platform": "macOS-13.5.1-arm64-arm-64bit", "publisher": "openai",
+        "uname": "Darwin 22.6.0 Darwin Kernel Version 22.6.0: Wed Jul 5 22:22:05
+        PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T6000 arm64"}'
+    method: POST
+    uri: https://api.openai.com/v1/completions
+  response:
+    body:
+      string: "{\n  \"error\": {\n    \"message\": \"Incorrect API key provided:
+        sk-wrong****-key. You can find your API key at https://platform.openai.com/account/api-keys.\",\n
+        \   \"type\": \"invalid_request_error\",\n    \"param\": null,\n    \"code\":
+        \"invalid_api_key\"\n  }\n}\n"
+    headers:
+      CF-Cache-Status:
+      - DYNAMIC
+      CF-RAY:
+      - 80599159bdd94288-EWR
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '266'
+      Content-Type:
+      - application/json; charset=utf-8
+      Date:
+      - Tue, 12 Sep 2023 16:36:09 GMT
+      Server:
+      - cloudflare
+      alt-svc:
+      - h3=":443"; ma=86400
+      strict-transport-security:
+      - max-age=15724800; includeSubDomains
+      vary:
+      - Origin
+      x-request-id:
+      - 912bc0d688b018590ad4644213b9c72f
+    status:
+      code: 401
+      message: Unauthorized
+version: 1
```

tests/contrib/openai/test_openai.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -1645,6 +1645,21 @@ def test_misuse(openai, snapshot_tracer):
         openai.Completion.create(input="wrong arg")


+@pytest.mark.snapshot(ignores=["meta.http.useragent", "meta.error.stack"])
+def test_span_finish_on_stream_error(openai, openai_vcr, snapshot_tracer):
+    with openai_vcr.use_cassette("completion_stream_wrong_api_key.yaml"):
+        with pytest.raises(openai.error.AuthenticationError):
+            openai.Completion.create(
+                api_key="sk-wrong-api-key",
+                model="text-curie-001",
+                prompt="how does openai tokenize prompts?",
+                temperature=0.8,
+                n=1,
+                max_tokens=150,
+                stream=True,
+            )
+
+
 def test_completion_stream(openai, openai_vcr, mock_metrics, mock_tracer):
     with openai_vcr.use_cassette("completion_streamed.yaml"):
         with mock.patch("ddtrace.contrib.openai.utils.encoding_for_model", create=True) as mock_encoding:
```
Lines changed: 41 additions & 0 deletions
```diff
@@ -0,0 +1,41 @@
+[[
+  {
+    "name": "openai.request",
+    "service": "",
+    "resource": "createCompletion",
+    "trace_id": 0,
+    "span_id": 1,
+    "parent_id": 0,
+    "type": "",
+    "error": 1,
+    "meta": {
+      "_dd.p.dm": "-0",
+      "component": "openai",
+      "error.message": "Incorrect API key provided: sk-wrong****-key. You can find your API key at https://platform.openai.com/account/api-keys.",
+      "error.stack": "Traceback (most recent call last):\n openai.error.AuthenticationError: Incorrect API key provided: sk-wrong****-key. You can find your API key at https://platform.openai.com/account/api-keys.\n",
+      "error.type": "openai.error.AuthenticationError",
+      "language": "python",
+      "openai.api_base": "https://api.openai.com/v1",
+      "openai.api_type": "open_ai",
+      "openai.request.endpoint": "/v1/completions",
+      "openai.request.max_tokens": "150",
+      "openai.request.method": "POST",
+      "openai.request.model": "text-curie-001",
+      "openai.request.n": "1",
+      "openai.request.prompt": "how does openai tokenize prompts?",
+      "openai.request.stream": "True",
+      "openai.request.temperature": "0.8",
+      "openai.user.api_key": "sk-...-key",
+      "runtime-id": "0a0a92d644714949b7544ee81c6d1bf1"
+    },
+    "metrics": {
+      "_dd.measured": 1,
+      "_dd.top_level": 1,
+      "_dd.tracer_kr": 1.0,
+      "_sample_rate": 1.0,
+      "_sampling_priority_v1": 1,
+      "process_id": 91222
+    },
+    "duration": 291271000,
+    "start": 1694536282656608000
+  }]]
```
