Skip to content

Commit 59a81a8

Browse files
authored
fix(openai): allow token inputs for Embeddings endpoint [backport #5890 to 1.13] (#5909)
Backports #5890 to 1.13. Fixes #5884.

This PR adds handling for non-string request input params for the `Embeddings.create()` method. This API method accepts a string, an array of strings, a token array (list of ints), or an array of token arrays (list of lists of ints), but the previous implementation only accounted for strings and arrays of strings. Passing a token array or an array of token arrays as the argument would therefore cause errors, because the traced request handler for the Embeddings endpoint did not account for non-string types.

## Checklist
- [x] Change(s) are motivated and described in the PR description.
- [x] Testing strategy is described if automated tests are not included in the PR.
- [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines) are followed.
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)).
- [x] OPTIONAL: PR description includes explicit acknowledgement of the performance implications of the change as reported in the benchmarks PR comment.

## Reviewer Checklist
- [x] Title is accurate.
- [x] No unnecessary changes are introduced.
- [x] Description motivates each change.
- [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary.
- [x] Testing strategy adequately addresses listed risk(s).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] Release note makes sense to a user of the library.
- [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment.
1 parent c49973d commit 59a81a8

7 files changed

+155
-1
lines changed

ddtrace/contrib/openai/patch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ def handle_request(self, pin, integration, span, args, kwargs):
630630
if kw_attr == "input" and integration.is_pc_sampled_span(span):
631631
if isinstance(kwargs["input"], list):
632632
for idx, inp in enumerate(kwargs["input"]):
633-
span.set_tag_str("openai.request.input.%d" % idx, integration.trunc(inp))
633+
span.set_tag_str("openai.request.input.%d" % idx, integration.trunc(str(inp)))
634634
else:
635635
span.set_tag("openai.request.%s" % kw_attr, kwargs[kw_attr])
636636
else:

docs/spelling_wordlist.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ mysqlclient
133133
mysqldb
134134
namespace
135135
obfuscator
136+
openai
136137
opensearch
137138
opentracer
138139
opentracing
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
fixes:
3+
- |
4+
openai: Resolves an issue where using an array of tokens or an array of token arrays
5+
for the Embeddings endpoint caused an AttributeError.

tests/contrib/openai/test_openai.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,32 @@ def test_embedding(api_key_in_env, request_api_key, openai, openai_vcr, snapshot
546546
openai.Embedding.create(api_key=request_api_key, input="hello world", model="text-embedding-ada-002")
547547

548548

549+
@pytest.mark.snapshot(ignores=["meta.http.useragent"])
550+
def test_embedding_string_array(openai, openai_vcr, snapshot_tracer):
551+
if not hasattr(openai, "Embedding"):
552+
pytest.skip("embedding not supported for this version of openai")
553+
with openai_vcr.use_cassette("embedding.yaml"):
554+
openai.Embedding.create(input=["hello world", "hello again"], model="text-embedding-ada-002")
555+
556+
557+
@pytest.mark.snapshot(ignores=["meta.http.useragent"])
558+
def test_embedding_token_array(openai, openai_vcr, snapshot_tracer):
559+
if not hasattr(openai, "Embedding"):
560+
pytest.skip("embedding not supported for this version of openai")
561+
with openai_vcr.use_cassette("embedding.yaml"):
562+
openai.Embedding.create(input=[1111, 2222, 3333], model="text-embedding-ada-002")
563+
564+
565+
@pytest.mark.snapshot(ignores=["meta.http.useragent"])
566+
def test_embedding_array_of_token_arrays(openai, openai_vcr, snapshot_tracer):
567+
if not hasattr(openai, "Embedding"):
568+
pytest.skip("embedding not supported for this version of openai")
569+
with openai_vcr.use_cassette("embedding.yaml"):
570+
openai.Embedding.create(
571+
input=[[1111, 2222, 3333], [4444, 5555, 6666], [7777, 8888, 9999]], model="text-embedding-ada-002"
572+
)
573+
574+
549575
@pytest.mark.asyncio
550576
@pytest.mark.snapshot(ignores=["meta.http.useragent"])
551577
@pytest.mark.parametrize("api_key_in_env", [True, False])
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[[
2+
{
3+
"name": "openai.request",
4+
"service": "",
5+
"resource": "embeddings/text-embedding-ada-002",
6+
"trace_id": 0,
7+
"span_id": 1,
8+
"parent_id": 0,
9+
"type": "",
10+
"error": 0,
11+
"meta": {
12+
"_dd.p.dm": "-0",
13+
"api_base": "https://api.openai.com/v1",
14+
"component": "openai",
15+
"language": "python",
16+
"openai.endpoint": "embeddings",
17+
"openai.model": "text-embedding-ada-002",
18+
"openai.organization.name": "datadog-4",
19+
"openai.organization.ratelimit.requests.remaining": "2999",
20+
"openai.request.input.0": "[1111, 2222, 3333]",
21+
"openai.request.input.1": "[4444, 5555, 6666]",
22+
"openai.request.input.2": "[7777, 8888, 9999]",
23+
"openai.request.model": "text-embedding-ada-002",
24+
"openai.user.api_key": "sk-...key>",
25+
"runtime-id": "b168eb19ef14414ca786ac99826ef9e0"
26+
},
27+
"metrics": {
28+
"_dd.agent_psr": 1.0,
29+
"_dd.measured": 1,
30+
"_dd.top_level": 1,
31+
"_dd.tracer_kr": 1.0,
32+
"_sampling_priority_v1": 1,
33+
"openai.response.data.embedding-length": 1536,
34+
"openai.response.data.num-embeddings": 1,
35+
"openai.response.usage.prompt_tokens": 2,
36+
"openai.response.usage.total_tokens": 2,
37+
"process_id": 85925
38+
},
39+
"duration": 3655000,
40+
"start": 1684349256345286000
41+
}]]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
[[
2+
{
3+
"name": "openai.request",
4+
"service": "",
5+
"resource": "embeddings/text-embedding-ada-002",
6+
"trace_id": 0,
7+
"span_id": 1,
8+
"parent_id": 0,
9+
"type": "",
10+
"error": 0,
11+
"meta": {
12+
"_dd.p.dm": "-0",
13+
"api_base": "https://api.openai.com/v1",
14+
"component": "openai",
15+
"language": "python",
16+
"openai.endpoint": "embeddings",
17+
"openai.model": "text-embedding-ada-002",
18+
"openai.organization.name": "datadog-4",
19+
"openai.organization.ratelimit.requests.remaining": "2999",
20+
"openai.request.input.0": "hello world",
21+
"openai.request.input.1": "hello again",
22+
"openai.request.model": "text-embedding-ada-002",
23+
"openai.user.api_key": "sk-...key>",
24+
"runtime-id": "b168eb19ef14414ca786ac99826ef9e0"
25+
},
26+
"metrics": {
27+
"_dd.agent_psr": 1.0,
28+
"_dd.measured": 1,
29+
"_dd.top_level": 1,
30+
"_dd.tracer_kr": 1.0,
31+
"_sampling_priority_v1": 1,
32+
"openai.response.data.embedding-length": 1536,
33+
"openai.response.data.num-embeddings": 1,
34+
"openai.response.usage.prompt_tokens": 2,
35+
"openai.response.usage.total_tokens": 2,
36+
"process_id": 85925
37+
},
38+
"duration": 3517000,
39+
"start": 1684349256264173000
40+
}]]
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[[
2+
{
3+
"name": "openai.request",
4+
"service": "",
5+
"resource": "embeddings/text-embedding-ada-002",
6+
"trace_id": 0,
7+
"span_id": 1,
8+
"parent_id": 0,
9+
"type": "",
10+
"error": 0,
11+
"meta": {
12+
"_dd.p.dm": "-0",
13+
"api_base": "https://api.openai.com/v1",
14+
"component": "openai",
15+
"language": "python",
16+
"openai.endpoint": "embeddings",
17+
"openai.model": "text-embedding-ada-002",
18+
"openai.organization.name": "datadog-4",
19+
"openai.organization.ratelimit.requests.remaining": "2999",
20+
"openai.request.input.0": "1111",
21+
"openai.request.input.1": "2222",
22+
"openai.request.input.2": "3333",
23+
"openai.request.model": "text-embedding-ada-002",
24+
"openai.user.api_key": "sk-...key>",
25+
"runtime-id": "b168eb19ef14414ca786ac99826ef9e0"
26+
},
27+
"metrics": {
28+
"_dd.agent_psr": 1.0,
29+
"_dd.measured": 1,
30+
"_dd.top_level": 1,
31+
"_dd.tracer_kr": 1.0,
32+
"_sampling_priority_v1": 1,
33+
"openai.response.data.embedding-length": 1536,
34+
"openai.response.data.num-embeddings": 1,
35+
"openai.response.usage.prompt_tokens": 2,
36+
"openai.response.usage.total_tokens": 2,
37+
"process_id": 85925
38+
},
39+
"duration": 3522000,
40+
"start": 1684349256312922000
41+
}]]

0 commit comments

Comments
 (0)