feat(llmobs): add toolcall and toolresults to messages (#14385)

ZStriker19 · web-flow · commit 189052f50136 · 2025-08-26T16:46:51.000Z
This introduces support for tool calls and tool results in LLMObs message annotations when using custom instrumentation. The ``LLMObs.annotate()`` method now accepts input and output data with optional ``tool_calls`` and ``tool_results`` fields for function calling scenarios. [ Docs update to be merged once this is merged ](DataDog/documentation#31231) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
@@ -1317,14 +1317,20 @@ def annotate(
                             `rag_query_variables` - a list of variable key names that contains query
                                                         information for an LLM call
         :param input_data: A single input string, dictionary, or a list of dictionaries based on the span kind:
-                           - llm spans: accepts a string, or a dictionary of form {"content": "...", "role": "..."},
-                                        or a list of dictionaries with the same signature.
+                           - llm spans: accepts a string, or a dictionary of form {"content": "...", "role": "...",
+                                        "tool_calls": ..., "tool_results": ...}, where "tool_calls" is an optional
+                                        list of tool call dictionaries (required keys: "name", "arguments";
+                                        optional keys: "tool_id", "type") and "tool_results" is an optional list of
+                                        tool result dictionaries (required key: "result"; optional keys:
+                                        "name", "tool_id", "type") for function calling scenarios.
                            - embedding spans: accepts a string, list of strings, or a dictionary of form
                                               {"text": "...", ...} or a list of dictionaries with the same signature.
                            - other: any JSON serializable type.
         :param output_data: A single output string, dictionary, or a list of dictionaries based on the span kind:
-                           - llm spans: accepts a string, or a dictionary of form {"content": "...", "role": "..."},
-                                        or a list of dictionaries with the same signature.
+                           - llm spans: accepts a string, or a dictionary of form {"content": "...", "role": "...",
+                                        "tool_calls": ...}, where "tool_calls" is an optional list of tool call
+                                        dictionaries (required keys: "name", "arguments"; optional keys:
+                                        "tool_id", "type") for function calling scenarios.
                            - retrieval spans: a dictionary containing any of the key value pairs
                                               {"name": str, "id": str, "text": str, "score": float},
                                               or a list of dictionaries with the same signature.
diff --git a/ddtrace/llmobs/utils.py b/ddtrace/llmobs/utils.py
@@ -17,9 +17,70 @@
 
 DocumentType = Dict[str, Union[str, int, float]]
 
+
+def _extract_tool_call(tool_call: Dict[str, Any]) -> "ToolCall":
+    """Extract and validate a tool call dictionary."""
+    if not isinstance(tool_call, dict):
+        raise TypeError("Each tool_call must be a dictionary.")
+
+    # name and arguments are required
+    name = tool_call.get("name")
+    arguments = tool_call.get("arguments")
+
+    if not name or not isinstance(name, str):
+        raise TypeError("ToolCall name must be a non-empty string.")
+    if arguments is None or not isinstance(arguments, dict):
+        raise TypeError("ToolCall arguments must be a dictionary.")
+
+    formatted_tool_call = ToolCall(name=name, arguments=arguments)
+
+    # Add optional fields if present
+    tool_id = tool_call.get("tool_id")
+    if tool_id and isinstance(tool_id, str):
+        formatted_tool_call["tool_id"] = tool_id
+
+    tool_type = tool_call.get("type")
+    if tool_type and isinstance(tool_type, str):
+        formatted_tool_call["type"] = tool_type
+
+    return formatted_tool_call
+
+
+def _extract_tool_result(tool_result: Dict[str, Any]) -> "ToolResult":
+    """Extract and validate a tool result dictionary."""
+    if not isinstance(tool_result, dict):
+        raise TypeError("Each tool_result must be a dictionary.")
+
+    # result is required
+    result = tool_result.get("result")
+    if result is None or not isinstance(result, str):
+        raise TypeError("ToolResult result must be a string.")
+
+    formatted_tool_result = ToolResult(result=result)
+
+    # Add optional fields if present
+    name = tool_result.get("name")
+    if name and isinstance(name, str):
+        formatted_tool_result["name"] = name
+
+    tool_id = tool_result.get("tool_id")
+    if tool_id and isinstance(tool_id, str):
+        formatted_tool_result["tool_id"] = tool_id
+
+    tool_type = tool_result.get("type")
+    if tool_type and isinstance(tool_type, str):
+        formatted_tool_result["type"] = tool_type
+
+    return formatted_tool_result
+
+
 ExportedLLMObsSpan = TypedDict("ExportedLLMObsSpan", {"span_id": str, "trace_id": str})
 Document = TypedDict("Document", {"name": str, "id": str, "text": str, "score": float}, total=False)
-Message = TypedDict("Message", {"content": str, "role": str}, total=False)
+Message = TypedDict(
+    "Message",
+    {"content": str, "role": str, "tool_calls": List["ToolCall"], "tool_results": List["ToolResult"]},
+    total=False,
+)
 Prompt = TypedDict(
     "Prompt",
     {
@@ -66,7 +127,7 @@
 
 
 class Messages:
-    def __init__(self, messages: Union[List[Dict[str, str]], Dict[str, str], str]):
+    def __init__(self, messages: Union[List[Dict[str, Any]], Dict[str, Any], str]):
         self.messages = []
         if not isinstance(messages, list):
             messages = [messages]  # type: ignore[list-item]
@@ -76,16 +137,33 @@ def __init__(self, messages: Union[List[Dict[str, str]], Dict[str, str], str]):
                 continue
             elif not isinstance(message, dict):
                 raise TypeError("messages must be a string, dictionary, or list of dictionaries.")
+
             content = message.get("content", "")
             role = message.get("role")
             if not isinstance(content, str):
                 raise TypeError("Message content must be a string.")
-            if not role:
-                self.messages.append(Message(content=content))
-                continue
-            if not isinstance(role, str):
-                raise TypeError("Message role must be a string, and one of .")
-            self.messages.append(Message(content=content, role=role))
+
+            msg_dict = Message(content=content)
+            if role:
+                if not isinstance(role, str):
+                    raise TypeError("Message role must be a string.")
+                msg_dict["role"] = role
+
+            tool_calls = message.get("tool_calls")
+            if tool_calls is not None:
+                if not isinstance(tool_calls, list):
+                    raise TypeError("tool_calls must be a list.")
+                formatted_tool_calls = [_extract_tool_call(tool_call) for tool_call in tool_calls]
+                msg_dict["tool_calls"] = formatted_tool_calls
+
+            tool_results = message.get("tool_results")
+            if tool_results is not None:
+                if not isinstance(tool_results, list):
+                    raise TypeError("tool_results must be a list.")
+                formatted_tool_results = [_extract_tool_result(tool_result) for tool_result in tool_results]
+                msg_dict["tool_results"] = formatted_tool_results
+
+            self.messages.append(msg_dict)
 
 
 class Documents:
diff --git a/releasenotes/notes/llmobs_toolcall_toolresult_custom_instr-4fcc7708bebc6f5c.yaml b/releasenotes/notes/llmobs_toolcall_toolresult_custom_instr-4fcc7708bebc6f5c.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    llmobs: This introduces support for tool calls and tool results in LLMObs message annotations when using custom instrumentation.
+    The ``LLMObs.annotate()`` method now accepts input and output data with optional
+    ``tool_calls`` and ``tool_results`` fields for function calling scenarios.
diff --git a/tests/llmobs/test_utils.py b/tests/llmobs/test_utils.py
@@ -56,6 +56,145 @@ def test_messages_with_no_role_is_ok():
     assert messages.messages == [{"content": "hello"}, {"content": "world"}]
 
 
+def test_messages_with_tool_calls():
+    """Test that messages can include tool calls."""
+    messages = Messages(
+        [
+            {
+                "content": "I'll help you with that calculation.",
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "name": "calculator",
+                        "arguments": {"operation": "add", "a": 5, "b": 3},
+                        "tool_id": "call_123",
+                        "type": "function",
+                    }
+                ],
+            }
+        ]
+    )
+    expected = [
+        {
+            "content": "I'll help you with that calculation.",
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "name": "calculator",
+                    "arguments": {"operation": "add", "a": 5, "b": 3},
+                    "tool_id": "call_123",
+                    "type": "function",
+                }
+            ],
+        }
+    ]
+    assert messages.messages == expected
+
+
+def test_messages_with_tool_results():
+    """Test that messages can include tool results."""
+    messages = Messages(
+        [
+            {
+                "content": "",
+                "role": "tool",
+                "tool_results": [
+                    {"name": "calculator", "result": "8", "tool_id": "call_123", "type": "function_result"}
+                ],
+            }
+        ]
+    )
+    expected = [
+        {
+            "content": "",
+            "role": "tool",
+            "tool_results": [{"name": "calculator", "result": "8", "tool_id": "call_123", "type": "function_result"}],
+        }
+    ]
+    assert messages.messages == expected
+
+
+def test_messages_with_tool_calls_minimal():
+    """Test tool calls with only required fields."""
+    messages = Messages(
+        [
+            {
+                "content": "Using calculator",
+                "role": "assistant",
+                "tool_calls": [{"name": "calculator", "arguments": {"x": 10}}],
+            }
+        ]
+    )
+    expected = [
+        {
+            "content": "Using calculator",
+            "role": "assistant",
+            "tool_calls": [{"name": "calculator", "arguments": {"x": 10}}],
+        }
+    ]
+    assert messages.messages == expected
+
+
+def test_messages_with_tool_results_minimal():
+    """Test tool results with only required fields."""
+    messages = Messages([{"content": "", "role": "tool", "tool_results": [{"result": "Success"}]}])
+    expected = [{"content": "", "role": "tool", "tool_results": [{"result": "Success"}]}]
+    assert messages.messages == expected
+
+
+def test_messages_with_both_tool_calls_and_results():
+    """Test that a message can have both tool calls and tool results"""
+    messages = Messages(
+        [
+            {
+                "content": "Processing...",
+                "role": "assistant",
+                "tool_calls": [{"name": "calculator", "arguments": {"x": 5}}],
+                "tool_results": [{"result": "10"}],
+            }
+        ]
+    )
+    expected = [
+        {
+            "content": "Processing...",
+            "role": "assistant",
+            "tool_calls": [{"name": "calculator", "arguments": {"x": 5}}],
+            "tool_results": [{"result": "10"}],
+        }
+    ]
+    assert messages.messages == expected
+
+
+def test_messages_tool_calls_missing_required_fields():
+    """Test that tool_calls raise errors when required fields are missing."""
+    # Missing name field
+    with pytest.raises(TypeError, match="ToolCall name must be a non-empty string"):
+        Messages([{"content": "test", "tool_calls": [{"arguments": {"x": 5}}]}])
+
+    # Missing arguments field
+    with pytest.raises(TypeError, match="ToolCall arguments must be a dictionary"):
+        Messages([{"content": "test", "tool_calls": [{"name": "calculator"}]}])
+
+    # Empty name field
+    with pytest.raises(TypeError, match="ToolCall name must be a non-empty string"):
+        Messages([{"content": "test", "tool_calls": [{"name": "", "arguments": {"x": 5}}]}])
+
+    # Invalid arguments type
+    with pytest.raises(TypeError, match="ToolCall arguments must be a dictionary"):
+        Messages([{"content": "test", "tool_calls": [{"name": "calculator", "arguments": "invalid"}]}])
+
+
+def test_messages_tool_results_missing_required_fields():
+    """Test that tool_results raise errors when required fields are missing."""
+    # Missing result field
+    with pytest.raises(TypeError, match="ToolResult result must be a string"):
+        Messages([{"content": "test", "tool_results": [{"name": "calculator"}]}])
+
+    # Invalid result type
+    with pytest.raises(TypeError, match="ToolResult result must be a string"):
+        Messages([{"content": "test", "tool_results": [{"result": 123}]}])
+
+
 def test_documents_with_string():
     documents = Documents("hello")
     assert documents.documents == [{"text": "hello"}]