Skip to content

Commit 20eefc1

Browse files
authored
ci(llmobs): start to replace some api calls in tests with testagent vcr proxy calls instead (#14507)
Remediates flakiness caused by tests hitting our real API (even staging). Test expectations should only need updating in response to intentional changes, not because of a bug in the staging API or similar. We'll need to do this for more tests too, but that will require a change to the test agent. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
1 parent 01e267e commit 20eefc1

11 files changed

+349
-42
lines changed

ddtrace/llmobs/_writer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import Union
1212
from typing import cast
1313
from urllib.parse import quote
14+
from urllib.parse import urlparse
1415

1516

1617
# TypedDict was added to typing in python 3.8
@@ -156,6 +157,14 @@ def __init__(
156157
f"{self.AGENTLESS_BASE_URL}.{self._site}" if is_agentless else agent_config.trace_agent_url
157158
)
158159
self._endpoint: str = self.ENDPOINT if is_agentless else f"{EVP_PROXY_AGENT_BASE_PATH}{self.ENDPOINT}"
160+
override_url_parsed = urlparse(self._override_url)
161+
if self._override_url and override_url_parsed.scheme != "unix" and override_url_parsed.path not in ("/", ""):
162+
# handles cases where the override url includes a base path, e.g.
163+
# http://localhost:8080/foo/bar and endpoint /buz/baz
164+
# we need to strip the base path from the endpoint so the eventual urljoin works properly
165+
# to form http://localhost:8080/foo/bar/buz/baz
166+
self._endpoint = self.ENDPOINT.lstrip("/")
167+
159168
self._headers: Dict[str, str] = {"Content-Type": "application/json"}
160169
if is_agentless:
161170
self._headers["DD-API-KEY"] = self._api_key

tests/llmobs/conftest.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,11 @@ def llmobs_enable_opts():
253253
yield {"project_name": "test-project"}
254254

255255

256+
@pytest.fixture
257+
def llmobs_api_proxy_url():
258+
return "http://localhost:9126/vcr/datadog"
259+
260+
256261
@pytest.fixture
257262
def llmobs(
258263
ddtrace_global_config,
@@ -261,6 +266,7 @@ def llmobs(
261266
llmobs_enable_opts,
262267
llmobs_env,
263268
llmobs_span_writer,
269+
llmobs_api_proxy_url,
264270
mock_llmobs_eval_metric_writer,
265271
mock_llmobs_evaluator_runner,
266272
):
@@ -274,7 +280,7 @@ def llmobs(
274280
llmobs_service.enable(_tracer=tracer, **llmobs_enable_opts)
275281
llmobs_service._instance._llmobs_span_writer = llmobs_span_writer
276282
llmobs_service._instance._llmobs_span_writer.start()
277-
llmobs_service._instance._dne_client._intake = "http://localhost:9126/vcr/datadog"
283+
llmobs_service._instance._dne_client._intake = llmobs_api_proxy_url
278284
yield llmobs_service
279285
tracer.shutdown()
280286
llmobs_service.disable()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
4+
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
5+
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
6+
"timestamp_ms": 1756910127022}]}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '283'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.4
22+
method: POST
23+
uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
24+
response:
25+
body:
26+
string: '{"data":{"id":"1ef94721-392d-4612-ad63-5f3b289c1cd5","type":"evaluation_metric","attributes":{"metrics":[{"id":"-Xbd-WStY2","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1756910127022,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
27+
headers:
28+
content-length:
29+
- '325'
30+
content-security-policy:
31+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
32+
content-type:
33+
- application/vnd.api+json
34+
date:
35+
- Wed, 03 Sep 2025 14:41:13 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
vary:
39+
- Accept-Encoding
40+
x-content-type-options:
41+
- nosniff
42+
x-frame-options:
43+
- SAMEORIGIN
44+
status:
45+
code: 202
46+
message: Accepted
47+
version: 1
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
4+
{"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type":
5+
"score", "timestamp_ms": 1757074814754, "score_value": 1.0, "ml_app": "unnamed-ml-app",
6+
"tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '340'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.4
22+
method: POST
23+
uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
24+
response:
25+
body:
26+
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
27+
headers:
28+
connection:
29+
- close
30+
content-length:
31+
- '169'
32+
content-type:
33+
- application/json
34+
date:
35+
- Fri, 05 Sep 2025 12:20:14 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
x-content-type-options:
39+
- nosniff
40+
status:
41+
code: 403
42+
message: Forbidden
43+
version: 1
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
4+
{"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type":
5+
"score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app",
6+
"timestamp_ms": 1756910127022}]}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '269'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.4
22+
method: POST
23+
uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
24+
response:
25+
body:
26+
string: '{"data":{"id":"c7ca5837-c593-4973-aefc-fe9ccbca1e74","type":"evaluation_metric","attributes":{"metrics":[{"id":"BKrS9Vc9nU","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1756910127022,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
27+
headers:
28+
content-length:
29+
- '311'
30+
content-security-policy:
31+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
32+
content-type:
33+
- application/vnd.api+json
34+
date:
35+
- Wed, 03 Sep 2025 14:45:25 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
vary:
39+
- Accept-Encoding
40+
x-content-type-options:
41+
- nosniff
42+
x-frame-options:
43+
- SAMEORIGIN
44+
status:
45+
code: 202
46+
message: Accepted
47+
version: 1
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
4+
{"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type":
5+
"score", "timestamp_ms": 1756911917780, "score_value": 1.0, "ml_app": "unnamed-ml-app",
6+
"tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '340'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.4
22+
method: POST
23+
uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
24+
response:
25+
body:
26+
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
27+
headers:
28+
connection:
29+
- close
30+
content-length:
31+
- '169'
32+
content-type:
33+
- application/json
34+
date:
35+
- Wed, 03 Sep 2025 15:05:17 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
x-content-type-options:
39+
- nosniff
40+
status:
41+
code: 403
42+
message: Forbidden
43+
version: 1
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
4+
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
5+
"categorical", "categorical_value": "wrong-api-key", "label": "api-key", "ml_app":
6+
"dummy-ml-app", "timestamp_ms": 1756910127022}]}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '291'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.4
22+
method: POST
23+
uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
24+
response:
25+
body:
26+
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
27+
headers:
28+
connection:
29+
- close
30+
content-length:
31+
- '169'
32+
content-type:
33+
- application/json
34+
date:
35+
- Wed, 03 Sep 2025 14:39:21 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
x-content-type-options:
39+
- nosniff
40+
status:
41+
code: 403
42+
message: Forbidden
43+
version: 1
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
4+
{"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type":
5+
"score", "timestamp_ms": 1757074518879, "score_value": 1.0, "ml_app": "unnamed-ml-app",
6+
"tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '340'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.4
22+
method: POST
23+
uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
24+
response:
25+
body:
26+
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
27+
headers:
28+
connection:
29+
- close
30+
content-length:
31+
- '169'
32+
content-type:
33+
- application/json
34+
date:
35+
- Fri, 05 Sep 2025 12:15:18 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
x-content-type-options:
39+
- nosniff
40+
status:
41+
code: 403
42+
message: Forbidden
43+
version: 1

tests/llmobs/test_llmobs_eval_metric_agent_writer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,15 @@ def test_buffer_limit(mock_writer_logs):
4444
@mock.patch("ddtrace.llmobs._writer.LLMObsEvalMetricWriter._send_payload")
4545
def test_send_categorical_metrics(mock_send_payload, mock_writer_logs):
4646
llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False)
47-
llmobs_eval_metric_writer.enqueue(_categorical_metric_event())
47+
llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very"))
4848
llmobs_eval_metric_writer.periodic()
4949
mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
5050

5151

5252
@mock.patch("ddtrace.llmobs._writer.LLMObsEvalMetricWriter._send_payload")
5353
def test_send_score_metric(mock_send_payload, mock_writer_logs):
5454
llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False)
55-
llmobs_eval_metric_writer.enqueue(_score_metric_event())
55+
llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9))
5656
llmobs_eval_metric_writer.periodic()
5757
mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
5858

@@ -63,11 +63,11 @@ def test_send_timed_events(mock_send_payload, mock_writer_logs):
6363
llmobs_eval_metric_writer.start()
6464
mock_writer_logs.reset_mock()
6565

66-
llmobs_eval_metric_writer.enqueue(_score_metric_event())
66+
llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9))
6767
time.sleep(0.1)
6868
mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
6969
mock_writer_logs.reset_mock()
70-
llmobs_eval_metric_writer.enqueue(_categorical_metric_event())
70+
llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very"))
7171
time.sleep(0.1)
7272
mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
7373
llmobs_eval_metric_writer.stop()
@@ -77,7 +77,7 @@ def test_send_timed_events(mock_send_payload, mock_writer_logs):
7777
def test_send_multiple_events(mock_send_payload, mock_writer_logs):
7878
llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False)
7979
mock_writer_logs.reset_mock()
80-
llmobs_eval_metric_writer.enqueue(_score_metric_event())
81-
llmobs_eval_metric_writer.enqueue(_categorical_metric_event())
80+
llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9))
81+
llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very"))
8282
llmobs_eval_metric_writer.periodic()
8383
mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 2, "evaluation_metric")

0 commit comments

Comments
 (0)