
Commit 1bd9fe9

maxzhangdd and ncybul authored
feat(anthropic): added support for tool results, definitions (#14164)
This PR adds improved tracking of tool results and tool definitions to the LLMObs Anthropic integration.

Testing: traces showing up (frontend is still WIP and not deployed yet).

![Screenshot 2025-07-31 at 2 11 17 PM](https://github.com/user-attachments/assets/76fd224b-58fa-43da-aee5-b1f7c3a204bf)

## Checklist

- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist

- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---

Co-authored-by: ncybul <[email protected]>
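For context, a minimal sketch of the kind of Anthropic tool-use exchange this change instruments. It assumes LLM Observability is enabled via `LLMObs.enable()`; the `ml_app` name, model name, and prompt text are illustrative, and the tool mirrors the `get_weather` definition used in the tests below.

```python
import anthropic
from ddtrace.llmobs import LLMObs

# Assumed setup (illustrative ml_app name): enable LLM Observability so the
# Anthropic integration annotates spans for the calls below.
LLMObs.enable(ml_app="weather-bot")

client = anthropic.Anthropic()

# Passed via `tools=` -- now recorded as tool definitions on the LLMObs span.
tools = [
    {
        "name": "get_weather",
        "description": "Get the weather for a specific location",
        "input_schema": {"type": "object", "properties": {"location": {"type": "string"}}},
    }
]

first = client.messages.create(
    model="claude-3-5-sonnet-20240620",  # illustrative model name
    max_tokens=200,
    tools=tools,
    messages=[{"role": "user", "content": "What is the weather in San Francisco, CA?"}],
)

# If the model emits a `tool_use` block it is recorded as a tool call; the
# follow-up `tool_result` block is now recorded as a tool result.
tool_use = next(block for block in first.content if block.type == "tool_use")
followup = client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=500,
    tools=tools,
    messages=[
        {"role": "user", "content": "What is the weather in San Francisco, CA?"},
        {"role": "assistant", "content": first.content},
        {
            "role": "user",
            "content": [
                {"type": "tool_result", "tool_use_id": tool_use.id, "content": "The weather is 73f"}
            ],
        },
    ],
)
```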
1 parent fe23980 commit 1bd9fe9

File tree

3 files changed: +85 −26 lines


ddtrace/llmobs/_integrations/anthropic.py

Lines changed: 52 additions & 22 deletions
```diff
@@ -19,10 +19,14 @@
 from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import PROXY_REQUEST
 from ddtrace.llmobs._constants import SPAN_KIND
+from ddtrace.llmobs._constants import TOOL_DEFINITIONS
 from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
 from ddtrace.llmobs._integrations.utils import update_proxy_workflow_input_output_value
 from ddtrace.llmobs._utils import _get_attr
+from ddtrace.llmobs.utils import ToolCall
+from ddtrace.llmobs.utils import ToolDefinition
+from ddtrace.llmobs.utils import ToolResult
 from ddtrace.trace import Span
 
 
```
```diff
@@ -60,6 +64,9 @@ def _llmobs_set_tags(
             parameters["temperature"] = kwargs.get("temperature")
         if kwargs.get("max_tokens"):
             parameters["max_tokens"] = kwargs.get("max_tokens")
+        if kwargs.get("tools"):
+            tools = self._extract_tools(kwargs.get("tools"))
+            span._set_ctx_item(TOOL_DEFINITIONS, tools)
         messages = kwargs.get("messages")
         system_prompt = kwargs.get("system")
         input_messages = self._extract_input_message(messages, system_prompt)
```
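The `ToolCall`, `ToolDefinition`, and `ToolResult` records constructed in the hunks below come from `ddtrace.llmobs.utils` (imported above). As a rough, non-authoritative sketch of their shape, inferred only from how this diff builds and asserts them, they can be pictured as TypedDict-style records; the real definitions may differ.

```python
from typing import Any, Dict, TypedDict


# Stand-ins inferred from field usage in this diff; the actual classes in
# ddtrace.llmobs.utils may be implemented differently.
class ToolCall(TypedDict, total=False):
    name: str                   # tool name from the tool_use block
    arguments: Dict[str, Any]   # parsed "input" payload
    tool_id: str                # tool_use block id
    type: str                   # e.g. "tool_use"


class ToolDefinition(TypedDict, total=False):
    name: str
    description: str
    schema: Dict[str, Any]      # Anthropic "input_schema"


class ToolResult(TypedDict, total=False):
    result: str                 # flattened tool output text
    tool_id: str                # matching tool_use_id
    type: str                   # "tool_result"
```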
```diff
@@ -124,33 +131,44 @@ def _extract_input_message(self, messages, system_prompt: Optional[Union[str, Li
                         input_data = _get_attr(block, "input", "")
                         if isinstance(input_data, str):
                             input_data = json.loads(input_data)
-                        tool_call_info = {
-                            "name": _get_attr(block, "name", ""),
-                            "arguments": input_data,
-                            "tool_id": _get_attr(block, "id", ""),
-                            "type": _get_attr(block, "type", ""),
-                        }
+                        tool_call_info = ToolCall(
+                            name=_get_attr(block, "name", ""),
+                            arguments=input_data,
+                            tool_id=_get_attr(block, "id", ""),
+                            type=_get_attr(block, "type", ""),
+                        )
                         if text is None:
                             text = ""
                         input_messages.append({"content": text, "role": role, "tool_calls": [tool_call_info]})
 
                     elif _get_attr(block, "type", None) == "tool_result":
                         content = _get_attr(block, "content", None)
-                        if isinstance(content, str):
-                            input_messages.append({"content": "[tool result: {}]".format(content), "role": role})
-                        elif isinstance(content, list):
-                            input_messages.append({"content": [], "role": role})
-                            for tool_result_block in content:
-                                if _get_attr(tool_result_block, "text", "") != "":
-                                    input_messages[-1]["content"].append(_get_attr(tool_result_block, "text", ""))
-                                elif _get_attr(tool_result_block, "type", None) == "image":
-                                    # Store a placeholder for potentially enormous binary image data.
-                                    input_messages[-1]["content"].append("([IMAGE DETECTED])")
+                        formatted_content = self._format_tool_result_content(content)
+                        tool_result_info = ToolResult(
+                            result=formatted_content,
+                            tool_id=_get_attr(block, "tool_use_id", ""),
+                            type="tool_result",
+                        )
+                        input_messages.append({"content": "", "role": role, "tool_results": [tool_result_info]})
                     else:
                         input_messages.append({"content": str(block), "role": role})
 
         return input_messages
 
+    def _format_tool_result_content(self, content) -> str:
+        if isinstance(content, str):
+            return content
+        elif isinstance(content, Iterable):
+            formatted_content = []
+            for tool_result_block in content:
+                if _get_attr(tool_result_block, "text", "") != "":
+                    formatted_content.append(_get_attr(tool_result_block, "text", ""))
+                elif _get_attr(tool_result_block, "type", None) == "image":
+                    # Store a placeholder for potentially enormous binary image data.
+                    formatted_content.append("([IMAGE DETECTED])")
+            return ",".join(formatted_content)
+        return str(content)
+
     def _extract_output_message(self, response):
         """Extract output messages from the stored response."""
         output_messages = []
```
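To illustrate the helper added above: a standalone sketch of the same flattening behavior, using plain dicts in place of the integration's `_get_attr` wrapper so it runs outside ddtrace. String content passes through, block lists are joined with commas, and image blocks become a placeholder.

```python
from collections.abc import Iterable
from typing import Any


def format_tool_result_content(content: Any) -> str:
    """Standalone sketch of the flattening done by _format_tool_result_content."""
    if isinstance(content, str):
        return content
    elif isinstance(content, Iterable):
        formatted = []
        for block in content:
            if block.get("text", ""):
                formatted.append(block.get("text", ""))
            elif block.get("type") == "image":
                # Placeholder instead of potentially enormous binary image data.
                formatted.append("([IMAGE DETECTED])")
        return ",".join(formatted)
    return str(content)


# String content is returned as-is; block lists are flattened to comma-joined text.
print(format_tool_result_content("The weather is 73f"))  # -> "The weather is 73f"
print(format_tool_result_content(
    [{"type": "text", "text": "73f"}, {"type": "image", "source": {}}]
))  # -> "73f,([IMAGE DETECTED])"
```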
```diff
@@ -170,12 +188,12 @@ def _extract_output_message(self, response):
                     input_data = _get_attr(completion, "input", "")
                     if isinstance(input_data, str):
                         input_data = json.loads(input_data)
-                    tool_call_info = {
-                        "name": _get_attr(completion, "name", ""),
-                        "arguments": input_data,
-                        "tool_id": _get_attr(completion, "id", ""),
-                        "type": _get_attr(completion, "type", ""),
-                    }
+                    tool_call_info = ToolCall(
+                        name=_get_attr(completion, "name", ""),
+                        arguments=input_data,
+                        tool_id=_get_attr(completion, "id", ""),
+                        type=_get_attr(completion, "type", ""),
+                    )
                     if text is None:
                         text = ""
                     output_messages.append({"content": text, "role": role, "tool_calls": [tool_call_info]})
@@ -211,3 +229,15 @@ def _get_base_url(self, **kwargs: Dict[str, Any]) -> Optional[str]:
         client = getattr(instance, "_client", None)
         base_url = getattr(client, "_base_url", None) if client else None
         return str(base_url) if base_url else None
+
+    def _extract_tools(self, tools: Optional[Any]) -> List[ToolDefinition]:
+        if not tools:
+            return []
+
+        tool_definitions = []
+        for tool in tools:
+            tool_def = ToolDefinition(
+                name=tool.get("name", ""), description=tool.get("description", ""), schema=tool.get("input_schema", {})
+            )
+            tool_definitions.append(tool_def)
+        return tool_definitions
```
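The `_extract_tools` helper added here maps the caller's Anthropic `tools` argument onto tool-definition records. Below is a minimal standalone sketch of that mapping, with plain dicts standing in for `ToolDefinition`; the sample tool is the `get_weather` definition used in the tests.

```python
from typing import Any, Dict, List, Optional


def extract_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Standalone sketch: map Anthropic `tools` entries to tool-definition records."""
    if not tools:
        return []
    return [
        {
            "name": tool.get("name", ""),
            "description": tool.get("description", ""),
            # Anthropic names the JSON schema "input_schema"; the span field is "schema".
            "schema": tool.get("input_schema", {}),
        }
        for tool in tools
    ]


anthropic_tools = [
    {
        "name": "get_weather",
        "description": "Get the weather for a specific location",
        "input_schema": {"type": "object", "properties": {"location": {"type": "string"}}},
    }
]
print(extract_tools(anthropic_tools))
# -> [{"name": "get_weather", "description": "Get the weather for a specific location",
#      "schema": {"type": "object", "properties": {"location": {"type": "string"}}}}]
```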
Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+features:
+  - |
+    LLM Observability: Adds support for collecting tool definitions, tool calls and tool results in the Anthropic integration.
```

tests/contrib/anthropic/test_anthropic_llmobs.py

Lines changed: 30 additions & 4 deletions
```diff
@@ -28,6 +28,20 @@
 ]
 WEATHER_OUTPUT_MESSAGE_3 = "Based on the result from the get_weather tool, the current weather in San \
 Francisco, CA is 73°F."
+WEATHER_TOOL_RESULT = [
+    {"result": "The weather is 73f", "tool_id": "toolu_01DYJo37oETVsCdLTTcCWcdq", "type": "tool_result"}
+]
+
+EXPECTED_TOOL_DEFINITIONS = [
+    {
+        "name": "get_weather",
+        "description": "Get the weather for a specific location",
+        "schema": {
+            "type": "object",
+            "properties": {"location": {"type": "string"}},
+        },
+    }
+]
 
 
 @pytest.mark.parametrize(
```
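For orientation, the request-side `tool_result` block these fixtures correspond to would look roughly as follows; the `tool_use_id` and result text are copied from `WEATHER_TOOL_RESULT`, while the surrounding message shape is the standard Anthropic tool-result turn rather than something asserted in this file.

```python
# Request-side user turn carrying the tool result in these tests.
tool_result_turn = {
    "role": "user",
    "content": [
        {
            "type": "tool_result",
            "tool_use_id": "toolu_01DYJo37oETVsCdLTTcCWcdq",  # matches WEATHER_TOOL_RESULT[0]["tool_id"]
            "content": "The weather is 73f",
        }
    ],
}

# The integration flattens that block into the fixture asserted on the span event:
# [{"result": "The weather is 73f", "tool_id": "toolu_01DYJo37oETVsCdLTTcCWcdq", "type": "tool_result"}]
```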
```diff
@@ -453,6 +467,7 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 152, "total_tokens": 751},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -495,7 +510,11 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
                         "role": "assistant",
                     },
                     {"content": "", "role": "assistant", "tool_calls": WEATHER_OUTPUT_MESSAGE_2_TOOL_CALL},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {
+                        "content": "",
+                        "role": "user",
+                        "tool_results": WEATHER_TOOL_RESULT,
+                    },
                 ],
                 output_messages=[
                     {
@@ -506,6 +525,7 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 768, "output_tokens": 29, "total_tokens": 797},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -549,6 +569,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 152, "total_tokens": 751},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -591,7 +612,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
                         "role": "assistant",
                     },
                     {"content": "", "role": "assistant", "tool_calls": WEATHER_OUTPUT_MESSAGE_2_TOOL_CALL},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {"content": "", "role": "user", "tool_results": WEATHER_TOOL_RESULT},
                 ],
                 output_messages=[
                     {
@@ -602,6 +623,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 768, "output_tokens": 29, "total_tokens": 797},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -662,6 +684,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 135, "total_tokens": 734},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -701,7 +724,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
                     {"content": WEATHER_PROMPT, "role": "user"},
                     {"content": message[0]["text"], "role": "assistant"},
                     {"content": message[1]["text"], "role": "assistant"},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {"content": "", "role": "user", "tool_results": WEATHER_TOOL_RESULT},
                 ],
                 output_messages=[
                     {
@@ -712,6 +735,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 762, "output_tokens": 33, "total_tokens": 795},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -767,6 +791,7 @@ async def test_tools_async_stream_helper(
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 146, "total_tokens": 745},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -811,7 +836,7 @@ async def test_tools_async_stream_helper(
                     {"content": WEATHER_PROMPT, "role": "user"},
                     {"content": message.content[0].text, "role": "assistant"},
                     {"content": "", "role": "assistant", "tool_calls": WEATHER_OUTPUT_MESSAGE_2_TOOL_CALL},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {"content": "", "role": "user", "tool_results": WEATHER_TOOL_RESULT},
                 ],
                 output_messages=[
                     {
@@ -822,6 +847,7 @@ async def test_tools_async_stream_helper(
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 762, "output_tokens": 18, "total_tokens": 780},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
```