chore(llmobs): loosen restrictions on experiment spans IO (#14266)

gary-huang · web-flow · commit fe23980e56c5 · 2025-08-11T14:17:08.000-04:00
Experiment spans' IO should be free-form, accepting anything from structured to unstructured data. Before: https://dddev.datadoghq.com/llm/experiments/0a623ad2-5a4f-4713-9bea-8b614ec640d9 <img width="1741" height="336" alt="image" src="https://github.com/user-attachments/assets/24ff5bfb-be4b-4df7-94f4-a80a938fa67a" /> After: https://dddev.datadoghq.com/llm/experiments/0b7eb94d-b3de-49e8-8c9b-90fa167152b9 <img width="1748" height="273" alt="image" src="https://github.com/user-attachments/assets/92a7fd31-e8ce-40fb-8d7c-40ca7b03ffb7" /> ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py
@@ -103,4 +103,6 @@
 
 EXPERIMENT_ID_KEY = "_ml_obs.experiment_id"
 EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"
+EXPERIMENTS_INPUT = "_ml_obs.meta.input"
+EXPERIMENTS_OUTPUT = "_ml_obs.meta.output"
 DEFAULT_PROJECT_NAME = "default-project"
diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
@@ -27,6 +27,7 @@
 from ddtrace.llmobs._constants import DD_SITES_NEEDING_APP_SUBDOMAIN
 from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
 from ddtrace.llmobs._utils import convert_tags_dict_to_list
+from ddtrace.llmobs._utils import safe_json
 
 
 if TYPE_CHECKING:
@@ -365,7 +366,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas
             except Exception:
                 span.set_exc_info(*sys.exc_info())
             self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags)
-            span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"])
+            span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, safe_json(record["expected_output"]))
             return {
                 "idx": idx,
                 "span_id": span_id,
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
@@ -54,6 +54,8 @@
 from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE
 from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
 from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
+from ddtrace.llmobs._constants import EXPERIMENTS_INPUT
+from ddtrace.llmobs._constants import EXPERIMENTS_OUTPUT
 from ddtrace.llmobs._constants import INPUT_DOCUMENTS
 from ddtrace.llmobs._constants import INPUT_MESSAGES
 from ddtrace.llmobs._constants import INPUT_PROMPT
@@ -278,9 +280,18 @@ def _llmobs_span_event(self, span: Span) -> Optional[LLMObsSpanEvent]:
 
         if span.context.get_baggage_item(EXPERIMENT_ID_KEY):
             _dd_attrs["scope"] = "experiments"
-            expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT)
-            if span_kind == "experiment" and expected_output:
-                meta["expected_output"] = expected_output
+            if span_kind == "experiment":
+                expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT)
+                if expected_output:
+                    meta["expected_output"] = expected_output
+
+                input_data = span._get_ctx_item(EXPERIMENTS_INPUT)
+                if input_data:
+                    meta["input"] = input_data
+
+                output_data = span._get_ctx_item(EXPERIMENTS_OUTPUT)
+                if output_data:
+                    meta["output"] = output_data
 
         input_messages = span._get_ctx_item(INPUT_MESSAGES)
         if span_kind == "llm" and input_messages is not None:
@@ -1366,6 +1377,8 @@ def annotate(
                     error = cls._tag_embedding_io(span, input_documents=input_data, output_text=output_data)
                 elif span_kind == "retrieval":
                     error = cls._tag_retrieval_io(span, input_text=input_data, output_documents=output_data)
+                elif span_kind == "experiment":
+                    cls._tag_freeform_io(span, input_value=input_data, output_value=output_data)
                 else:
                     cls._tag_text_io(span, input_value=input_data, output_value=output_data)
         finally:
@@ -1447,6 +1460,18 @@ def _tag_text_io(cls, span, input_value=None, output_value=None):
         if output_value is not None:
             span._set_ctx_item(OUTPUT_VALUE, safe_json(output_value))
 
+    @classmethod
+    def _tag_freeform_io(cls, span, input_value=None, output_value=None):
+        """Tags input/output values for experiment spans.
+        Will be mapped to the span's `meta.{input,output}` fields.
+        This is meant to be non-restrictive on the user's data: experiments allow
+        arbitrary structured or unstructured IO values in their spans.
+        """
+        if input_value is not None:
+            span._set_ctx_item(EXPERIMENTS_INPUT, safe_json(input_value))
+        if output_value is not None:
+            span._set_ctx_item(EXPERIMENTS_OUTPUT, safe_json(output_value))
+
     @staticmethod
     def _set_dict_attribute(span: Span, key, value: Dict[str, Any]) -> None:
         """Sets a given LLM Obs span attribute with a dictionary key/values.
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
@@ -1125,9 +1125,9 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test
     for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"):
         assert event[key] == mock.ANY
     assert event["status"] == "ok"
-    assert event["meta"]["input"] == {"value": '{"prompt": "What is the capital of France?"}'}
-    assert event["meta"]["output"] == {"value": '{"prompt": "What is the capital of France?"}'}
-    assert event["meta"]["expected_output"] == {"answer": "Paris"}
+    assert event["meta"]["input"] == '{"prompt": "What is the capital of France?"}'
+    assert event["meta"]["output"] == '{"prompt": "What is the capital of France?"}'
+    assert event["meta"]["expected_output"] == '{"answer": "Paris"}'
     assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"]
     assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"]
     assert "experiment_id:1234567890" in event["tags"]