fix(openai): ensure embeddings input gets prompt-completion sampled correctly [backport 1.15] (#6073)

github-actions[bot] · Yun-Kim · web-flow · commit cba55055cf9c · 2023-06-08T17:43:35.000Z
Backport 93d700e from #6062 to 1.15. Fixes #5963. The traced embeddings endpoint always tags the `openai.request.input` argument regardless of the configured prompt-completion sample rate. This PR ensures that the input tag is set only when the span is prompt-completion sampled, i.e. in accordance with the configured prompt-completion sample rate. ## Checklist - [x] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines) are followed. - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
diff --git a/ddtrace/contrib/openai/_endpoint_hooks.py b/ddtrace/contrib/openai/_endpoint_hooks.py
@@ -241,19 +241,17 @@ def _post_response(self, pin, integration, span, args, kwargs, resp, error):
 
 
 class _EmbeddingHook(_EndpointHook):
+    _request_tag_attrs = ["model", "user"]
     _default_name = "embeddings"
 
     def _pre_response(self, pin, integration, span, args, kwargs):
-        for kw_attr in ["model", "input", "user"]:
-            if kw_attr in kwargs:
-                if kw_attr == "input" and integration.is_pc_sampled_span(span):
-                    if isinstance(kwargs["input"], list):
-                        for idx, inp in enumerate(kwargs["input"]):
-                            span.set_tag_str("openai.request.input.%d" % idx, integration.trunc(str(inp)))
-                    else:
-                        span.set_tag("openai.request.%s" % kw_attr, kwargs[kw_attr])
-                else:
-                    span.set_tag("openai.request.%s" % kw_attr, kwargs[kw_attr])
+        embedding_input = kwargs.get("input", "")
+        if integration.is_pc_sampled_span(span):
+            if isinstance(embedding_input, list):
+                for idx, inp in enumerate(embedding_input):
+                    span.set_tag_str("openai.request.input.%d" % idx, integration.trunc(str(inp)))
+            else:
+                span.set_tag("openai.request.input", embedding_input)
         return
 
     def _post_response(self, pin, integration, span, args, kwargs, resp, error):
diff --git a/releasenotes/notes/fix-openai-embeddings-pc-sample-rate-52a8433eb22833e1.yaml b/releasenotes/notes/fix-openai-embeddings-pc-sample-rate-52a8433eb22833e1.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    openai: This fix resolves an issue where embeddings inputs were always tagged regardless of the
+    configured prompt-completion sample rate.
diff --git a/tests/contrib/openai/test_openai.py b/tests/contrib/openai/test_openai.py
@@ -951,6 +951,27 @@ def test_completion_truncation(openai, openai_vcr, mock_tracer):
                     assert len(completion.replace("...", "")) == limit
 
 
+@pytest.mark.parametrize(
+    "ddtrace_config_openai",
+    [
+        dict(
+            _api_key="<not-real-but-it's-something>",
+            span_prompt_completion_sample_rate=0,
+        )
+    ],
+)
+def test_embedding_unsampled_prompt_completion(openai, openai_vcr, ddtrace_config_openai, mock_logs, mock_tracer):
+    if not hasattr(openai, "Embedding"):
+        pytest.skip("embedding not supported for this version of openai")
+    with openai_vcr.use_cassette("embedding.yaml"):
+        openai.Embedding.create(input="hello world", model="text-embedding-ada-002")
+    logs = mock_logs.enqueue.call_count
+    traces = mock_tracer.pop_traces()
+    assert len(traces) == 1
+    assert traces[0][0].get_tag("openai.request.input") is None
+    assert logs == 0
+
+
 @pytest.mark.parametrize(
     "ddtrace_config_openai",
     [

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +fixes:
 +  - |
 +    openai: This fix resolves an issue where embeddings inputs were always tagged regardless of the
 +    configured prompt-completion sample rate.