From 25f64773278bd2db0707fb3448db47a4e19454e5 Mon Sep 17 00:00:00 2001
From: Simon Hellmayr
Date: Fri, 26 Sep 2025 14:20:47 +0200
Subject: [PATCH 1/2] add executed tools to invoke_agent spans in openai agents

---
 .../openai_agents/spans/invoke_agent.py       |   5 +-
 .../integrations/openai_agents/utils.py       |  64 +++-
 .../openai_agents/test_openai_agents.py       | 360 +++++++++++++++---
 3 files changed, 367 insertions(+), 62 deletions(-)

diff --git a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
index cf06120625..24d9a78629 100644
--- a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
+++ b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
@@ -5,7 +5,7 @@
 from sentry_sdk.utils import safe_serialize
 
 from ..consts import SPAN_ORIGIN
-from ..utils import _set_agent_data
+from ..utils import _set_agent_data, _set_output_data
 
 from typing import TYPE_CHECKING
 
@@ -75,4 +75,7 @@ def update_invoke_agent_span(context, agent, output):
             span, SPANDATA.GEN_AI_RESPONSE_TEXT, output, unpack=False
         )
 
+    # Capture tool calls from the output if available
+    _set_output_data(span, output)
+
     span.__exit__(None, None, None)
diff --git a/sentry_sdk/integrations/openai_agents/utils.py b/sentry_sdk/integrations/openai_agents/utils.py
index a0487e0e3a..b2f1d117f5 100644
--- a/sentry_sdk/integrations/openai_agents/utils.py
+++ b/sentry_sdk/integrations/openai_agents/utils.py
@@ -3,7 +3,7 @@
 from sentry_sdk.consts import SPANDATA
 from sentry_sdk.integrations import DidNotEnable
 from sentry_sdk.scope import should_send_default_pii
-from sentry_sdk.utils import event_from_exception, safe_serialize
+from sentry_sdk.utils import event_from_exception
 
 from typing import TYPE_CHECKING
 
@@ -28,6 +28,35 @@ def _capture_exception(exc):
     sentry_sdk.capture_event(event, hint=hint)
 
 
+def _simplify_openai_agent_tools(tools):
+    # type: (Any) -> list[dict[str, Any]] | None
+    """Parse and simplify OpenAI agent tools into a cleaner format."""
+    if not tools:
+        return None
+
+    if not isinstance(tools, (list, tuple)):
+        return None
+
+    simplified_tools = []
+    for tool in tools:
+        try:
+            simplified_tool = {
+                "name": getattr(tool, "name", None),
+                "description": getattr(tool, "description", None),
+            }
+
+            tool_type = getattr(tool, "__class__", None)
+            if tool_type:
+                simplified_tool["type"] = tool_type.__name__.lower().replace("tool", "")
+
+            if simplified_tool["name"]:
+                simplified_tools.append(simplified_tool)
+        except Exception:
+            continue
+
+    return simplified_tools if simplified_tools else None
+
+
 def _set_agent_data(span, agent):
     # type: (sentry_sdk.tracing.Span, agents.Agent) -> None
     span.set_data(
@@ -66,10 +95,10 @@ def _set_agent_data(span, agent):
         )
 
     if len(agent.tools) > 0:
-        span.set_data(
-            SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS,
-            safe_serialize([vars(tool) for tool in agent.tools]),
-        )
+        simplified_tools = _simplify_openai_agent_tools(agent.tools)
+        if simplified_tools:
+            # Use span.set_data directly to preserve list type instead of JSON string
+            span.set_data(SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS, simplified_tools)
 
 
 def _set_usage_data(span, usage):
@@ -128,6 +157,14 @@ def _set_output_data(span, result):
     if not should_send_default_pii():
         return
 
+    # Handle case where result is a string directly
+    if isinstance(result, str):
+        return
+
+    # Handle case where result doesn't have an output attribute
+    if not hasattr(result, "output"):
+        return
+
     output_messages = {
         "response": [],
         "tool": [],
@@ -135,19 +172,26 @@ def _set_output_data(span, result):
 
     for output in result.output:
         if output.type == "function_call":
-            output_messages["tool"].append(output.dict())
+            # Use model_dump() if available (Pydantic v2), fall back to dict() for compatibility
+            if hasattr(output, "model_dump"):
+                output_messages["tool"].append(output.model_dump())
+            else:
+                output_messages["tool"].append(output.dict())
         elif output.type == "message":
             for output_message in output.content:
                 try:
                     output_messages["response"].append(output_message.text)
                 except AttributeError:
                     # Unknown output message type, just return the json
-                    output_messages["response"].append(output_message.dict())
+                    # Use model_dump() if available (Pydantic v2), fall back to dict() for compatibility
+                    if hasattr(output_message, "model_dump"):
+                        output_messages["response"].append(output_message.model_dump())
+                    else:
+                        output_messages["response"].append(output_message.dict())
 
     if len(output_messages["tool"]) > 0:
-        span.set_data(
-            SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, safe_serialize(output_messages["tool"])
-        )
+        # Use span.set_data directly to preserve list type instead of JSON string
+        span.set_data(SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, output_messages["tool"])
 
     if len(output_messages["response"]) > 0:
         set_data_normalized(
diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 047b919213..e1f6e17ad9 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -437,24 +437,10 @@ def simple_test_tool(message: str) -> str:
         ai_client_span2,
     ) = spans
 
-    available_tools = safe_serialize(
-        [
-            {
-                "name": "simple_test_tool",
-                "description": "A simple tool",
-                "params_json_schema": {
-                    "properties": {"message": {"title": "Message", "type": "string"}},
-                    "required": ["message"],
-                    "title": "simple_test_tool_args",
-                    "type": "object",
-                    "additionalProperties": False,
-                },
-                "on_invoke_tool": "<function function_tool.<locals>._create_function_tool.<locals>._on_invoke_tool>",
-                "strict_json_schema": True,
-                "is_enabled": True,
-            }
-        ]
-    )
+    # Expect simplified tool format, not raw tool data
+    available_tools = [
+        {"name": "simple_test_tool", "description": "A simple tool", "type": "function"}
+    ]
 
     assert transaction["transaction"] == "test_agent workflow"
     assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents"
@@ -500,35 +486,22 @@ def simple_test_tool(message: str) -> str:
     assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5
     assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0
     assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15
-    assert re.sub(
-        r"SerializationIterator\(.*\)",
-        "NOT_CHECKED",
-        ai_client_span1["data"]["gen_ai.response.tool_calls"],
-    ) == safe_serialize(
-        [
-            {
-                "arguments": '{"message": "hello"}',
-                "call_id": "call_123",
-                "name": "simple_test_tool",
-                "type": "function_call",
-                "id": "call_123",
-                "status": None,
-                "function": "NOT_CHECKED",
-            }
-        ]
-    )
+    # Tool calls are now stored as a list, not a JSON string
+    tool_calls = ai_client_span1["data"]["gen_ai.response.tool_calls"]
+    assert len(tool_calls) == 1
+    tool_call = tool_calls[0]
+    assert tool_call["arguments"] == '{"message": "hello"}'
+    assert tool_call["call_id"] == "call_123"
+    assert tool_call["name"] == "simple_test_tool"
+    assert tool_call["type"] == "function_call"
+    assert tool_call["id"] == "call_123"
+    assert tool_call["status"] is None
+    # Don't check the function field as it contains mock objects
 
     assert tool_span["description"] == "execute_tool simple_test_tool"
     assert tool_span["data"]["gen_ai.agent.name"] == "test_agent"
     assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool"
-    assert (
-        re.sub(
-            "<.*>(,)",
-            r"'NOT_CHECKED'\1",
-            agent_span["data"]["gen_ai.request.available_tools"],
-        )
-        == available_tools
-    )
+    assert agent_span["data"]["gen_ai.request.available_tools"] == available_tools
     assert tool_span["data"]["gen_ai.request.max_tokens"] == 100
     assert tool_span["data"]["gen_ai.request.model"] == "gpt-4"
     assert tool_span["data"]["gen_ai.request.temperature"] == 0.7
@@ -543,14 +516,10 @@ def simple_test_tool(message: str) -> str:
     assert ai_client_span2["description"] == "chat gpt-4"
     assert ai_client_span2["data"]["gen_ai.agent.name"] == "test_agent"
    assert ai_client_span2["data"]["gen_ai.operation.name"] == "chat"
-    assert (
-        re.sub(
-            "<.*>(,)",
-            r"'NOT_CHECKED'\1",
-            agent_span["data"]["gen_ai.request.available_tools"],
-        )
-        == available_tools
-    )
+    # available_tools is now a list, not a JSON string, so we can compare directly
+    assert agent_span["data"]["gen_ai.request.available_tools"] == [
+        {"name": "simple_test_tool", "description": "A simple tool", "type": "function"}
+    ]
     assert ai_client_span2["data"]["gen_ai.request.max_tokens"] == 100
     assert re.sub(
         r"SerializationIterator\(.*\)",
@@ -697,3 +666,292 @@ async def run():
     assert txn2["transaction"] == "test_agent workflow"
     assert txn3["type"] == "transaction"
     assert txn3["transaction"] == "test_agent workflow"
+
+
+@pytest.mark.asyncio
+async def test_available_tools_simplified_format(
+    sentry_init, capture_events, test_agent, mock_model_response
+):
+    """
+    Test that available tools are recorded in simplified format on invoke_agent spans.
+    """
+
+    @agents.function_tool
+    def search_tool(query: str) -> str:
+        """Search for information using the given query."""
+        return f"Search results for: {query}"
+
+    @agents.function_tool
+    def calculator_tool(expression: str) -> str:
+        """Calculate mathematical expressions."""
+        return f"Result: {expression}"
+
+    # Create agent with multiple tools
+    agent_with_tools = test_agent.clone(tools=[search_tool, calculator_tool])
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            mock_get_response.return_value = mock_model_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent_with_tools, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+
+    # Verify simplified tools format
+    available_tools = invoke_agent_span["data"]["gen_ai.request.available_tools"]
+    assert isinstance(available_tools, list)
+    assert len(available_tools) == 2
+
+    # Check first tool
+    search_tool_data = next(
+        (t for t in available_tools if t["name"] == "search_tool"), None
+    )
+    assert search_tool_data is not None
+    assert search_tool_data["name"] == "search_tool"
+    assert (
+        search_tool_data["description"]
+        == "Search for information using the given query."
+    )
+    assert search_tool_data["type"] == "function"
+
+    # Check second tool
+    calculator_tool_data = next(
+        (t for t in available_tools if t["name"] == "calculator_tool"), None
+    )
+    assert calculator_tool_data is not None
+    assert calculator_tool_data["name"] == "calculator_tool"
+    assert calculator_tool_data["description"] == "Calculate mathematical expressions."
+    assert calculator_tool_data["type"] == "function"
+
+    # Verify no extra fields are included (simplified format)
+    for tool_data in available_tools:
+        expected_keys = {"name", "description", "type"}
+        assert set(tool_data.keys()) == expected_keys
+
+
+@pytest.mark.asyncio
+async def test_tool_calls_captured_in_invoke_agent_span(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that available tools and executed tool calls are captured on the agent's spans.
+    """
+
+    @agents.function_tool
+    def test_function(input_text: str) -> str:
+        """A test function."""
+        return f"Processed: {input_text}"
+
+    agent_with_tool = test_agent.clone(tools=[test_function])
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+
+            # Mock response that includes a tool call
+            tool_call_response = ModelResponse(
+                output=[
+                    ResponseFunctionToolCall(
+                        id="call_test_123",
+                        call_id="call_test_123",
+                        name="test_function",
+                        type="function_call",
+                        arguments='{"input_text": "hello world"}',
+                        function=MagicMock(
+                            name="test_function",
+                            arguments='{"input_text": "hello world"}',
+                        ),
+                    )
+                ],
+                usage=Usage(
+                    requests=1, input_tokens=10, output_tokens=5, total_tokens=15
+                ),
+                response_id="resp_tool_123",
+            )
+
+            # Final response after tool execution
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Tool execution completed successfully",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1, input_tokens=15, output_tokens=10, total_tokens=25
+                ),
+                response_id="resp_final_123",
+            )
+
+            mock_get_response.side_effect = [tool_call_response, final_response]
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent_with_tool,
+                "Please use the test function",
+                run_config=test_run_config,
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+
+    # Verify that available tools are recorded
+    assert "gen_ai.request.available_tools" in invoke_agent_span["data"]
+    available_tools = invoke_agent_span["data"]["gen_ai.request.available_tools"]
+    assert len(available_tools) == 1
+    assert available_tools[0]["name"] == "test_function"
+    assert available_tools[0]["type"] == "function"
+
+    # Find the AI client span that contains the tool call (first response)
+    # The tool calls should be captured in the AI client span, not the invoke agent span
+    tool_call_span = None
+    for span in spans:
+        if span.get("description", "").startswith(
+            "chat"
+        ) and "gen_ai.response.tool_calls" in span.get("data", {}):
+            tool_call_span = span
+            break
+
+    assert tool_call_span is not None, "Tool call span not found"
+    tool_calls = tool_call_span["data"]["gen_ai.response.tool_calls"]
+    assert len(tool_calls) == 1
+
+    tool_call = tool_calls[0]
+    assert tool_call["name"] == "test_function"
+    assert tool_call["type"] == "function_call"
+    assert tool_call["call_id"] == "call_test_123"
+    assert tool_call["arguments"] == '{"input_text": "hello world"}'
+
+
+@pytest.mark.asyncio
+async def test_agent_without_tools(
+    sentry_init, capture_events, test_agent, mock_model_response
+):
+    """
+    Test that agents without tools don't cause issues and don't include tools data.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            mock_get_response.return_value = mock_model_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+
+    # Agent has no tools, so available_tools should not be present
+    assert "gen_ai.request.available_tools" not in invoke_agent_span["data"]
+
+    # And no tool calls should be present since no tools were used
+    assert "gen_ai.response.tool_calls" not in invoke_agent_span["data"]
+
+
+def test_simplify_openai_agent_tools_edge_cases():
+    """
+    Test edge cases for the _simplify_openai_agent_tools function.
+    """
+    from sentry_sdk.integrations.openai_agents.utils import _simplify_openai_agent_tools
+
+    # Test with None
+    assert _simplify_openai_agent_tools(None) is None
+
+    # Test with empty list
+    assert _simplify_openai_agent_tools([]) is None
+
+    # Test with non-list/tuple
+    assert _simplify_openai_agent_tools("invalid") is None
+    assert _simplify_openai_agent_tools(42) is None
+
+    # Test with mock tool objects
+    class FunctionTool:
+        def __init__(self, name, description):
+            self.name = name
+            self.description = description
+
+    class CustomTool:
+        def __init__(self, name, description):
+            self.name = name
+            self.description = description
+
+    # Test with valid tools
+    mock_tools = [
+        FunctionTool("tool1", "Description 1"),
+        CustomTool("tool2", "Description 2"),
+    ]
+
+    result = _simplify_openai_agent_tools(mock_tools)
+    assert result is not None
+    assert len(result) == 2
+    assert result[0]["name"] == "tool1"
+    assert result[0]["description"] == "Description 1"
+    assert result[0]["type"] == "function"
+    assert result[1]["name"] == "tool2"
+    assert result[1]["description"] == "Description 2"
+    assert result[1]["type"] == "custom"
+
+    # Test with tool missing name (should be filtered out)
+    class MockToolNoName:
+        def __init__(self):
+            self.description = "Has description but no name"
+
+    mock_tools_with_invalid = [
+        FunctionTool("valid_tool", "Valid description"),
+        MockToolNoName(),
+    ]
+
+    result = _simplify_openai_agent_tools(mock_tools_with_invalid)
+    assert result is not None
+    assert len(result) == 1
+    assert result[0]["name"] == "valid_tool"

From dc6fc84dec764659a24c7dcde373804457417b80 Mon Sep 17 00:00:00 2001
From: Simon Hellmayr
Date: Tue, 30 Sep 2025 10:46:25 +0200
Subject: [PATCH 2/2] cleanup

---
 sentry_sdk/integrations/openai_agents/utils.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/sentry_sdk/integrations/openai_agents/utils.py b/sentry_sdk/integrations/openai_agents/utils.py
index b2f1d117f5..3427dbe8aa 100644
--- a/sentry_sdk/integrations/openai_agents/utils.py
+++ b/sentry_sdk/integrations/openai_agents/utils.py
@@ -97,7 +97,6 @@ def _set_agent_data(span, agent):
     if len(agent.tools) > 0:
         simplified_tools = _simplify_openai_agent_tools(agent.tools)
         if simplified_tools:
-            # Use span.set_data directly to preserve list type instead of JSON string
             span.set_data(SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS, simplified_tools)
 
 
@@ -157,14 +156,6 @@ def _set_output_data(span, result):
     if not should_send_default_pii():
         return
 
-    # Handle case where result is a string directly
-    if isinstance(result, str):
-        return
-
-    # Handle case where result doesn't have an output attribute
-    if not hasattr(result, "output"):
-        return
-
     output_messages = {
         "response": [],
         "tool": [],
@@ -172,7 +163,6 @@ def _set_output_data(span, result):
 
     for output in result.output:
         if output.type == "function_call":
-            # Use model_dump() if available (Pydantic v2), fall back to dict() for compatibility
            if hasattr(output, "model_dump"):
                 output_messages["tool"].append(output.model_dump())
             else:
@@ -182,15 +172,12 @@ def _set_output_data(span, result):
             for output_message in output.content:
                 try:
                     output_messages["response"].append(output_message.text)
                 except AttributeError:
-                    # Unknown output message type, just return the json
-                    # Use model_dump() if available (Pydantic v2), fall back to dict() for compatibility
                     if hasattr(output_message, "model_dump"):
                         output_messages["response"].append(output_message.model_dump())
                     else:
                         output_messages["response"].append(output_message.dict())
 
     if len(output_messages["tool"]) > 0:
-        # Use span.set_data directly to preserve list type instead of JSON string
         span.set_data(SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, output_messages["tool"])
 
     if len(output_messages["response"]) > 0:
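
For reference, a minimal sketch of the simplified tool shape this series records on
spans (it assumes the module path from the diff; the SearchTool stand-in and the
printed result are illustrative, not part of the patch):

    # Illustrative stand-in: mimics an agents tool just enough for the helper,
    # which reads .name / .description and derives "type" from the class name,
    # lowercased with "tool" stripped (so SearchTool yields type "search").
    from sentry_sdk.integrations.openai_agents.utils import (
        _simplify_openai_agent_tools,
    )

    class SearchTool:
        name = "search_tool"
        description = "Search for information."

    print(_simplify_openai_agent_tools([SearchTool()]))
    # [{'name': 'search_tool', 'description': 'Search for information.', 'type': 'search'}]

After this series, gen_ai.request.available_tools and gen_ai.response.tool_calls carry
plain lists of dicts directly instead of safe_serialize'd JSON strings.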