ci(llmobs): start to replace some api calls in tests with testagent vcr proxy calls instead (#14507)

sabrenner · web-flow · commit 20eefc15463d · 2025-09-07T03:00:43.000Z
Remediates flakiness caused by tests hitting our API (even staging) directly. Test behavior should only need updating in response to an intentional change, and not because there's a bug in the staging API, etc. We'll need to do this for some more tests too, but that will require a change to the test agent.

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py
@@ -11,6 +11,7 @@
 from typing import Union
 from typing import cast
 from urllib.parse import quote
+from urllib.parse import urlparse
 
 
 # TypedDict was added to typing in python 3.8
@@ -156,6 +157,14 @@ def __init__(
             f"{self.AGENTLESS_BASE_URL}.{self._site}" if is_agentless else agent_config.trace_agent_url
         )
         self._endpoint: str = self.ENDPOINT if is_agentless else f"{EVP_PROXY_AGENT_BASE_PATH}{self.ENDPOINT}"
+        override_url_parsed = urlparse(self._override_url)
+        if self._override_url and override_url_parsed.scheme != "unix" and override_url_parsed.path not in ("/", ""):
+            # Handles the case where the override URL includes a base path, e.g.
+            # http://localhost:8080/foo/bar with endpoint /buz/baz.
+            # We use the bare endpoint without its leading slash so that the eventual urljoin
+            # works properly to form http://localhost:8080/foo/bar/buz/baz.
+            self._endpoint = self.ENDPOINT.lstrip("/")
+
         self._headers: Dict[str, str] = {"Content-Type": "application/json"}
         if is_agentless:
             self._headers["DD-API-KEY"] = self._api_key
diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py
@@ -253,6 +253,11 @@ def llmobs_enable_opts():
     yield {"project_name": "test-project"}
 
 
+@pytest.fixture
+def llmobs_api_proxy_url():
+    return "http://localhost:9126/vcr/datadog"
+
+
 @pytest.fixture
 def llmobs(
     ddtrace_global_config,
@@ -261,6 +266,7 @@ def llmobs(
     llmobs_enable_opts,
     llmobs_env,
     llmobs_span_writer,
+    llmobs_api_proxy_url,
     mock_llmobs_eval_metric_writer,
     mock_llmobs_evaluator_runner,
 ):
@@ -274,7 +280,7 @@ def llmobs(
         llmobs_service.enable(_tracer=tracer, **llmobs_enable_opts)
         llmobs_service._instance._llmobs_span_writer = llmobs_span_writer
         llmobs_service._instance._llmobs_span_writer.start()
-        llmobs_service._instance._dne_client._intake = "http://localhost:9126/vcr/datadog"
+        llmobs_service._instance._dne_client._intake = llmobs_api_proxy_url
         yield llmobs_service
     tracer.shutdown()
     llmobs_service.disable()
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_1218a393.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_1218a393.yaml
@@ -0,0 +1,47 @@
+interactions:
+- request:
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
+      "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
+      "timestamp_ms": 1756910127022}]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '283'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.4
+    method: POST
+    uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
+  response:
+    body:
+      string: '{"data":{"id":"1ef94721-392d-4612-ad63-5f3b289c1cd5","type":"evaluation_metric","attributes":{"metrics":[{"id":"-Xbd-WStY2","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1756910127022,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
+    headers:
+      content-length:
+      - '325'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 03 Sep 2025 14:41:13 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 202
+      message: Accepted
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_2d529580.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_2d529580.yaml
@@ -0,0 +1,43 @@
+interactions:
+- request:
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type":
+      "score", "timestamp_ms": 1757074814754, "score_value": 1.0, "ml_app": "unnamed-ml-app",
+      "tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '340'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.4
+    method: POST
+    uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
+  response:
+    body:
+      string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
+    headers:
+      connection:
+      - close
+      content-length:
+      - '169'
+      content-type:
+      - application/json
+      date:
+      - Fri, 05 Sep 2025 12:20:14 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-content-type-options:
+      - nosniff
+    status:
+      code: 403
+      message: Forbidden
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_3ef3a86e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_3ef3a86e.yaml
@@ -0,0 +1,47 @@
+interactions:
+- request:
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type":
+      "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app",
+      "timestamp_ms": 1756910127022}]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '269'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.4
+    method: POST
+    uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
+  response:
+    body:
+      string: '{"data":{"id":"c7ca5837-c593-4973-aefc-fe9ccbca1e74","type":"evaluation_metric","attributes":{"metrics":[{"id":"BKrS9Vc9nU","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1756910127022,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
+    headers:
+      content-length:
+      - '311'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 03 Sep 2025 14:45:25 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 202
+      message: Accepted
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_42090a9a.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_42090a9a.yaml
@@ -0,0 +1,43 @@
+interactions:
+- request:
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type":
+      "score", "timestamp_ms": 1756911917780, "score_value": 1.0, "ml_app": "unnamed-ml-app",
+      "tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '340'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.4
+    method: POST
+    uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
+  response:
+    body:
+      string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
+    headers:
+      connection:
+      - close
+      content-length:
+      - '169'
+      content-type:
+      - application/json
+      date:
+      - Wed, 03 Sep 2025 15:05:17 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-content-type-options:
+      - nosniff
+    status:
+      code: 403
+      message: Forbidden
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_9ef24d1e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_9ef24d1e.yaml
@@ -0,0 +1,43 @@
+interactions:
+- request:
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
+      "categorical", "categorical_value": "wrong-api-key", "label": "api-key", "ml_app":
+      "dummy-ml-app", "timestamp_ms": 1756910127022}]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '291'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.4
+    method: POST
+    uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
+  response:
+    body:
+      string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
+    headers:
+      connection:
+      - close
+      content-length:
+      - '169'
+      content-type:
+      - application/json
+      date:
+      - Wed, 03 Sep 2025 14:39:21 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-content-type-options:
+      - nosniff
+    status:
+      code: 403
+      message: Forbidden
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_ca2bfa88.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_ca2bfa88.yaml
@@ -0,0 +1,43 @@
+interactions:
+- request:
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type":
+      "score", "timestamp_ms": 1757074518879, "score_value": 1.0, "ml_app": "unnamed-ml-app",
+      "tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '340'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.4
+    method: POST
+    uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric
+  response:
+    body:
+      string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}'
+    headers:
+      connection:
+      - close
+      content-length:
+      - '169'
+      content-type:
+      - application/json
+      date:
+      - Fri, 05 Sep 2025 12:15:18 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-content-type-options:
+      - nosniff
+    status:
+      code: 403
+      message: Forbidden
+version: 1
diff --git a/tests/llmobs/test_llmobs_eval_metric_agent_writer.py b/tests/llmobs/test_llmobs_eval_metric_agent_writer.py
@@ -44,15 +44,15 @@ def test_buffer_limit(mock_writer_logs):
 @mock.patch("ddtrace.llmobs._writer.LLMObsEvalMetricWriter._send_payload")
 def test_send_categorical_metrics(mock_send_payload, mock_writer_logs):
     llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False)
-    llmobs_eval_metric_writer.enqueue(_categorical_metric_event())
+    llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very"))
     llmobs_eval_metric_writer.periodic()
     mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
 
 
 @mock.patch("ddtrace.llmobs._writer.LLMObsEvalMetricWriter._send_payload")
 def test_send_score_metric(mock_send_payload, mock_writer_logs):
     llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False)
-    llmobs_eval_metric_writer.enqueue(_score_metric_event())
+    llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9))
     llmobs_eval_metric_writer.periodic()
     mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
 
@@ -63,11 +63,11 @@ def test_send_timed_events(mock_send_payload, mock_writer_logs):
     llmobs_eval_metric_writer.start()
     mock_writer_logs.reset_mock()
 
-    llmobs_eval_metric_writer.enqueue(_score_metric_event())
+    llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9))
     time.sleep(0.1)
     mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
     mock_writer_logs.reset_mock()
-    llmobs_eval_metric_writer.enqueue(_categorical_metric_event())
+    llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very"))
     time.sleep(0.1)
     mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")
     llmobs_eval_metric_writer.stop()
@@ -77,7 +77,7 @@ def test_send_timed_events(mock_send_payload, mock_writer_logs):
 def test_send_multiple_events(mock_send_payload, mock_writer_logs):
     llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False)
     mock_writer_logs.reset_mock()
-    llmobs_eval_metric_writer.enqueue(_score_metric_event())
-    llmobs_eval_metric_writer.enqueue(_categorical_metric_event())
+    llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9))
+    llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very"))
     llmobs_eval_metric_writer.periodic()
     mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 2, "evaluation_metric")
diff --git a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py
diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py