
Commit 1bd9fe9

maxzhangdd and ncybul authored
feat(anthropic): added support for tool results, definitions (#14164)
This PR adds improved tracking of tool results and tool definitions to the LLMObs Anthropic integration.

Testing: traces showing up (frontend is still WIP and not deployed yet).

![Screenshot 2025-07-31 at 2 11 17 PM](https://github.com/user-attachments/assets/76fd224b-58fa-43da-aee5-b1f7c3a204bf)

## Checklist

- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist

- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---

Co-authored-by: ncybul <[email protected]>
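For context, a minimal sketch of the kind of Anthropic tool-use exchange this change instruments. It assumes LLM Observability is enabled via `LLMObs.enable()`; the `ml_app` name, model name, and prompt text are illustrative, and the tool mirrors the `get_weather` definition used in the tests below.

```python
import anthropic
from ddtrace.llmobs import LLMObs

# Assumed setup (illustrative ml_app name): enable LLM Observability so the
# Anthropic integration annotates spans for the calls below.
LLMObs.enable(ml_app="weather-bot")

client = anthropic.Anthropic()

# Passed via `tools=` -- now recorded as tool definitions on the LLMObs span.
tools = [
    {
        "name": "get_weather",
        "description": "Get the weather for a specific location",
        "input_schema": {"type": "object", "properties": {"location": {"type": "string"}}},
    }
]

first = client.messages.create(
    model="claude-3-5-sonnet-20240620",  # illustrative model name
    max_tokens=200,
    tools=tools,
    messages=[{"role": "user", "content": "What is the weather in San Francisco, CA?"}],
)

# If the model emits a `tool_use` block it is recorded as a tool call; the
# follow-up `tool_result` block is now recorded as a tool result.
tool_use = next(block for block in first.content if block.type == "tool_use")
followup = client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=500,
    tools=tools,
    messages=[
        {"role": "user", "content": "What is the weather in San Francisco, CA?"},
        {"role": "assistant", "content": first.content},
        {
            "role": "user",
            "content": [
                {"type": "tool_result", "tool_use_id": tool_use.id, "content": "The weather is 73f"}
            ],
        },
    ],
)
```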
1 parent fe23980 commit 1bd9fe9

File tree

3 files changed: +85 −26 lines


ddtrace/llmobs/_integrations/anthropic.py

Lines changed: 52 additions & 22 deletions
```diff
@@ -19,10 +19,14 @@
 from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import PROXY_REQUEST
 from ddtrace.llmobs._constants import SPAN_KIND
+from ddtrace.llmobs._constants import TOOL_DEFINITIONS
 from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
 from ddtrace.llmobs._integrations.utils import update_proxy_workflow_input_output_value
 from ddtrace.llmobs._utils import _get_attr
+from ddtrace.llmobs.utils import ToolCall
+from ddtrace.llmobs.utils import ToolDefinition
+from ddtrace.llmobs.utils import ToolResult
 from ddtrace.trace import Span
 
 
```
```diff
@@ -60,6 +64,9 @@ def _llmobs_set_tags(
             parameters["temperature"] = kwargs.get("temperature")
         if kwargs.get("max_tokens"):
             parameters["max_tokens"] = kwargs.get("max_tokens")
+        if kwargs.get("tools"):
+            tools = self._extract_tools(kwargs.get("tools"))
+            span._set_ctx_item(TOOL_DEFINITIONS, tools)
         messages = kwargs.get("messages")
         system_prompt = kwargs.get("system")
         input_messages = self._extract_input_message(messages, system_prompt)
```
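The `ToolCall`, `ToolDefinition`, and `ToolResult` records constructed in the hunks below come from `ddtrace.llmobs.utils` (imported above). As a rough, non-authoritative sketch of their shape, inferred only from how this diff builds and asserts them, they can be pictured as TypedDict-style records; the real definitions may differ.

```python
from typing import Any, Dict, TypedDict


# Stand-ins inferred from field usage in this diff; the actual classes in
# ddtrace.llmobs.utils may be implemented differently.
class ToolCall(TypedDict, total=False):
    name: str                   # tool name from the tool_use block
    arguments: Dict[str, Any]   # parsed "input" payload
    tool_id: str                # tool_use block id
    type: str                   # e.g. "tool_use"


class ToolDefinition(TypedDict, total=False):
    name: str
    description: str
    schema: Dict[str, Any]      # Anthropic "input_schema"


class ToolResult(TypedDict, total=False):
    result: str                 # flattened tool output text
    tool_id: str                # matching tool_use_id
    type: str                   # "tool_result"
```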
```diff
@@ -124,33 +131,44 @@ def _extract_input_message(self, messages, system_prompt: Optional[Union[str, Li
                         input_data = _get_attr(block, "input", "")
                         if isinstance(input_data, str):
                             input_data = json.loads(input_data)
-                        tool_call_info = {
-                            "name": _get_attr(block, "name", ""),
-                            "arguments": input_data,
-                            "tool_id": _get_attr(block, "id", ""),
-                            "type": _get_attr(block, "type", ""),
-                        }
+                        tool_call_info = ToolCall(
+                            name=_get_attr(block, "name", ""),
+                            arguments=input_data,
+                            tool_id=_get_attr(block, "id", ""),
+                            type=_get_attr(block, "type", ""),
+                        )
                         if text is None:
                             text = ""
                         input_messages.append({"content": text, "role": role, "tool_calls": [tool_call_info]})
 
                     elif _get_attr(block, "type", None) == "tool_result":
                         content = _get_attr(block, "content", None)
-                        if isinstance(content, str):
-                            input_messages.append({"content": "[tool result: {}]".format(content), "role": role})
-                        elif isinstance(content, list):
-                            input_messages.append({"content": [], "role": role})
-                            for tool_result_block in content:
-                                if _get_attr(tool_result_block, "text", "") != "":
-                                    input_messages[-1]["content"].append(_get_attr(tool_result_block, "text", ""))
-                                elif _get_attr(tool_result_block, "type", None) == "image":
-                                    # Store a placeholder for potentially enormous binary image data.
-                                    input_messages[-1]["content"].append("([IMAGE DETECTED])")
+                        formatted_content = self._format_tool_result_content(content)
+                        tool_result_info = ToolResult(
+                            result=formatted_content,
+                            tool_id=_get_attr(block, "tool_use_id", ""),
+                            type="tool_result",
+                        )
+                        input_messages.append({"content": "", "role": role, "tool_results": [tool_result_info]})
                     else:
                         input_messages.append({"content": str(block), "role": role})
 
         return input_messages
 
+    def _format_tool_result_content(self, content) -> str:
+        if isinstance(content, str):
+            return content
+        elif isinstance(content, Iterable):
+            formatted_content = []
+            for tool_result_block in content:
+                if _get_attr(tool_result_block, "text", "") != "":
+                    formatted_content.append(_get_attr(tool_result_block, "text", ""))
+                elif _get_attr(tool_result_block, "type", None) == "image":
+                    # Store a placeholder for potentially enormous binary image data.
+                    formatted_content.append("([IMAGE DETECTED])")
+            return ",".join(formatted_content)
+        return str(content)
+
     def _extract_output_message(self, response):
         """Extract output messages from the stored response."""
         output_messages = []
```
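To illustrate the helper added above: a standalone sketch of the same flattening behavior, using plain dicts in place of the integration's `_get_attr` wrapper so it runs outside ddtrace. String content passes through, block lists are joined with commas, and image blocks become a placeholder.

```python
from collections.abc import Iterable
from typing import Any


def format_tool_result_content(content: Any) -> str:
    """Standalone sketch of the flattening done by _format_tool_result_content."""
    if isinstance(content, str):
        return content
    elif isinstance(content, Iterable):
        formatted = []
        for block in content:
            if block.get("text", ""):
                formatted.append(block.get("text", ""))
            elif block.get("type") == "image":
                # Placeholder instead of potentially enormous binary image data.
                formatted.append("([IMAGE DETECTED])")
        return ",".join(formatted)
    return str(content)


# String content is returned as-is; block lists are flattened to comma-joined text.
print(format_tool_result_content("The weather is 73f"))  # -> "The weather is 73f"
print(format_tool_result_content(
    [{"type": "text", "text": "73f"}, {"type": "image", "source": {}}]
))  # -> "73f,([IMAGE DETECTED])"
```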
```diff
@@ -170,12 +188,12 @@ def _extract_output_message(self, response):
                     input_data = _get_attr(completion, "input", "")
                     if isinstance(input_data, str):
                         input_data = json.loads(input_data)
-                    tool_call_info = {
-                        "name": _get_attr(completion, "name", ""),
-                        "arguments": input_data,
-                        "tool_id": _get_attr(completion, "id", ""),
-                        "type": _get_attr(completion, "type", ""),
-                    }
+                    tool_call_info = ToolCall(
+                        name=_get_attr(completion, "name", ""),
+                        arguments=input_data,
+                        tool_id=_get_attr(completion, "id", ""),
+                        type=_get_attr(completion, "type", ""),
+                    )
                     if text is None:
                         text = ""
                     output_messages.append({"content": text, "role": role, "tool_calls": [tool_call_info]})
@@ -211,3 +229,15 @@ def _get_base_url(self, **kwargs: Dict[str, Any]) -> Optional[str]:
         client = getattr(instance, "_client", None)
         base_url = getattr(client, "_base_url", None) if client else None
         return str(base_url) if base_url else None
+
+    def _extract_tools(self, tools: Optional[Any]) -> List[ToolDefinition]:
+        if not tools:
+            return []
+
+        tool_definitions = []
+        for tool in tools:
+            tool_def = ToolDefinition(
+                name=tool.get("name", ""), description=tool.get("description", ""), schema=tool.get("input_schema", {})
+            )
+            tool_definitions.append(tool_def)
+        return tool_definitions
```
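The `_extract_tools` helper added here maps the caller's Anthropic `tools` argument onto tool-definition records. Below is a minimal standalone sketch of that mapping, with plain dicts standing in for `ToolDefinition`; the sample tool is the `get_weather` definition used in the tests.

```python
from typing import Any, Dict, List, Optional


def extract_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Standalone sketch: map Anthropic `tools` entries to tool-definition records."""
    if not tools:
        return []
    return [
        {
            "name": tool.get("name", ""),
            "description": tool.get("description", ""),
            # Anthropic names the JSON schema "input_schema"; the span field is "schema".
            "schema": tool.get("input_schema", {}),
        }
        for tool in tools
    ]


anthropic_tools = [
    {
        "name": "get_weather",
        "description": "Get the weather for a specific location",
        "input_schema": {"type": "object", "properties": {"location": {"type": "string"}}},
    }
]
print(extract_tools(anthropic_tools))
# -> [{"name": "get_weather", "description": "Get the weather for a specific location",
#      "schema": {"type": "object", "properties": {"location": {"type": "string"}}}}]
```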
Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+features:
+  - |
+    LLM Observability: Adds support for collecting tool definitions, tool calls and tool results in the Anthropic integration.
```

tests/contrib/anthropic/test_anthropic_llmobs.py

Lines changed: 30 additions & 4 deletions
```diff
@@ -28,6 +28,20 @@
 ]
 WEATHER_OUTPUT_MESSAGE_3 = "Based on the result from the get_weather tool, the current weather in San \
 Francisco, CA is 73°F."
+WEATHER_TOOL_RESULT = [
+    {"result": "The weather is 73f", "tool_id": "toolu_01DYJo37oETVsCdLTTcCWcdq", "type": "tool_result"}
+]
+
+EXPECTED_TOOL_DEFINITIONS = [
+    {
+        "name": "get_weather",
+        "description": "Get the weather for a specific location",
+        "schema": {
+            "type": "object",
+            "properties": {"location": {"type": "string"}},
+        },
+    }
+]
 
 
 @pytest.mark.parametrize(
```
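For orientation, the request-side `tool_result` block these fixtures correspond to would look roughly as follows; the `tool_use_id` and result text are copied from `WEATHER_TOOL_RESULT`, while the surrounding message shape is the standard Anthropic tool-result turn rather than something asserted in this file.

```python
# Request-side user turn carrying the tool result in these tests.
tool_result_turn = {
    "role": "user",
    "content": [
        {
            "type": "tool_result",
            "tool_use_id": "toolu_01DYJo37oETVsCdLTTcCWcdq",  # matches WEATHER_TOOL_RESULT[0]["tool_id"]
            "content": "The weather is 73f",
        }
    ],
}

# The integration flattens that block into the fixture asserted on the span event:
# [{"result": "The weather is 73f", "tool_id": "toolu_01DYJo37oETVsCdLTTcCWcdq", "type": "tool_result"}]
```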
```diff
@@ -453,6 +467,7 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 152, "total_tokens": 751},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -495,7 +510,11 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
                         "role": "assistant",
                     },
                     {"content": "", "role": "assistant", "tool_calls": WEATHER_OUTPUT_MESSAGE_2_TOOL_CALL},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {
+                        "content": "",
+                        "role": "user",
+                        "tool_results": WEATHER_TOOL_RESULT,
+                    },
                 ],
                 output_messages=[
                     {
@@ -506,6 +525,7 @@ def test_tools_sync(self, anthropic, ddtrace_global_config, mock_llmobs_writer,
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 768, "output_tokens": 29, "total_tokens": 797},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -549,6 +569,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 152, "total_tokens": 751},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -591,7 +612,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
                         "role": "assistant",
                     },
                     {"content": "", "role": "assistant", "tool_calls": WEATHER_OUTPUT_MESSAGE_2_TOOL_CALL},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {"content": "", "role": "user", "tool_results": WEATHER_TOOL_RESULT},
                 ],
                 output_messages=[
                     {
@@ -602,6 +623,7 @@ async def test_tools_async(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 768, "output_tokens": 29, "total_tokens": 797},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -662,6 +684,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 135, "total_tokens": 734},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -701,7 +724,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
                     {"content": WEATHER_PROMPT, "role": "user"},
                     {"content": message[0]["text"], "role": "assistant"},
                     {"content": message[1]["text"], "role": "assistant"},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {"content": "", "role": "user", "tool_results": WEATHER_TOOL_RESULT},
                 ],
                 output_messages=[
                     {
@@ -712,6 +735,7 @@ def test_tools_sync_stream(self, anthropic, ddtrace_global_config, mock_llmobs_w
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 762, "output_tokens": 33, "total_tokens": 795},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -767,6 +791,7 @@ async def test_tools_async_stream_helper(
                 metadata={"max_tokens": 200.0},
                 token_metrics={"input_tokens": 599, "output_tokens": 146, "total_tokens": 745},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
@@ -811,7 +836,7 @@ async def test_tools_async_stream_helper(
                     {"content": WEATHER_PROMPT, "role": "user"},
                     {"content": message.content[0].text, "role": "assistant"},
                     {"content": "", "role": "assistant", "tool_calls": WEATHER_OUTPUT_MESSAGE_2_TOOL_CALL},
-                    {"content": ["The weather is 73f"], "role": "user"},
+                    {"content": "", "role": "user", "tool_results": WEATHER_TOOL_RESULT},
                 ],
                 output_messages=[
                     {
@@ -822,6 +847,7 @@ async def test_tools_async_stream_helper(
                 metadata={"max_tokens": 500.0},
                 token_metrics={"input_tokens": 762, "output_tokens": 18, "total_tokens": 780},
                 tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.anthropic"},
+                tool_definitions=EXPECTED_TOOL_DEFINITIONS,
             )
         )
 
```