| 
 | 1 | +"""  | 
 | 2 | +Tests to ensure that tool call arguments are properly populated in streaming events.  | 
 | 3 | +
  | 
 | 4 | +This test specifically guards against the regression where tool_called events  | 
 | 5 | +were emitted with empty arguments during streaming (Issue #1629).  | 
 | 6 | +"""  | 
 | 7 | + | 
 | 8 | +import json  | 
 | 9 | +from collections.abc import AsyncIterator  | 
 | 10 | +from typing import Any, Optional, Union, cast  | 
 | 11 | + | 
 | 12 | +import pytest  | 
 | 13 | +from openai.types.responses import (  | 
 | 14 | +    ResponseCompletedEvent,  | 
 | 15 | +    ResponseFunctionToolCall,  | 
 | 16 | +    ResponseOutputItemAddedEvent,  | 
 | 17 | +    ResponseOutputItemDoneEvent,  | 
 | 18 | +)  | 
 | 19 | + | 
 | 20 | +from agents import Agent, Runner, function_tool  | 
 | 21 | +from agents.agent_output import AgentOutputSchemaBase  | 
 | 22 | +from agents.handoffs import Handoff  | 
 | 23 | +from agents.items import TResponseInputItem, TResponseOutputItem, TResponseStreamEvent  | 
 | 24 | +from agents.model_settings import ModelSettings  | 
 | 25 | +from agents.models.interface import Model, ModelTracing  | 
 | 26 | +from agents.stream_events import RunItemStreamEvent  | 
 | 27 | +from agents.tool import Tool  | 
 | 28 | +from agents.tracing import generation_span  | 
 | 29 | + | 
 | 30 | +from .fake_model import get_response_obj  | 
 | 31 | +from .test_responses import get_function_tool_call  | 
 | 32 | + | 
 | 33 | + | 
 | 34 | +class StreamingFakeModel(Model):  | 
 | 35 | +    """A fake model that actually emits streaming events to test our streaming fix."""  | 
 | 36 | + | 
 | 37 | +    def __init__(self):  | 
 | 38 | +        self.turn_outputs: list[list[TResponseOutputItem]] = []  | 
 | 39 | +        self.last_turn_args: dict[str, Any] = {}  | 
 | 40 | + | 
 | 41 | +    def set_next_output(self, output: list[TResponseOutputItem]):  | 
 | 42 | +        self.turn_outputs.append(output)  | 
 | 43 | + | 
 | 44 | +    def get_next_output(self) -> list[TResponseOutputItem]:  | 
 | 45 | +        if not self.turn_outputs:  | 
 | 46 | +            return []  | 
 | 47 | +        return self.turn_outputs.pop(0)  | 
 | 48 | + | 
 | 49 | +    async def get_response(  | 
 | 50 | +        self,  | 
 | 51 | +        system_instructions: Optional[str],  | 
 | 52 | +        input: Union[str, list[TResponseInputItem]],  | 
 | 53 | +        model_settings: ModelSettings,  | 
 | 54 | +        tools: list[Tool],  | 
 | 55 | +        output_schema: Optional[AgentOutputSchemaBase],  | 
 | 56 | +        handoffs: list[Handoff],  | 
 | 57 | +        tracing: ModelTracing,  | 
 | 58 | +        *,  | 
 | 59 | +        previous_response_id: Optional[str],  | 
 | 60 | +        conversation_id: Optional[str],  | 
 | 61 | +        prompt: Optional[Any],  | 
 | 62 | +    ):  | 
 | 63 | +        raise NotImplementedError("Use stream_response instead")  | 
 | 64 | + | 
 | 65 | +    async def stream_response(  | 
 | 66 | +        self,  | 
 | 67 | +        system_instructions: Optional[str],  | 
 | 68 | +        input: Union[str, list[TResponseInputItem]],  | 
 | 69 | +        model_settings: ModelSettings,  | 
 | 70 | +        tools: list[Tool],  | 
 | 71 | +        output_schema: Optional[AgentOutputSchemaBase],  | 
 | 72 | +        handoffs: list[Handoff],  | 
 | 73 | +        tracing: ModelTracing,  | 
 | 74 | +        *,  | 
 | 75 | +        previous_response_id: Optional[str] = None,  | 
 | 76 | +        conversation_id: Optional[str] = None,  | 
 | 77 | +        prompt: Optional[Any] = None,  | 
 | 78 | +    ) -> AsyncIterator[TResponseStreamEvent]:  | 
 | 79 | +        """Stream events that simulate real OpenAI streaming behavior for tool calls."""  | 
 | 80 | +        self.last_turn_args = {  | 
 | 81 | +            "system_instructions": system_instructions,  | 
 | 82 | +            "input": input,  | 
 | 83 | +            "model_settings": model_settings,  | 
 | 84 | +            "tools": tools,  | 
 | 85 | +            "output_schema": output_schema,  | 
 | 86 | +            "previous_response_id": previous_response_id,  | 
 | 87 | +            "conversation_id": conversation_id,  | 
 | 88 | +        }  | 
 | 89 | + | 
 | 90 | +        with generation_span(disabled=True) as _:  | 
 | 91 | +            output = self.get_next_output()  | 
 | 92 | + | 
 | 93 | +            sequence_number = 0  | 
 | 94 | + | 
 | 95 | +            # Emit each output item with proper streaming events  | 
 | 96 | +            for item in output:  | 
 | 97 | +                if isinstance(item, ResponseFunctionToolCall):  | 
 | 98 | +                    # First: emit ResponseOutputItemAddedEvent with EMPTY arguments  | 
 | 99 | +                    # (this simulates the real streaming behavior that was causing the bug)  | 
 | 100 | +                    empty_args_item = ResponseFunctionToolCall(  | 
 | 101 | +                        id=item.id,  | 
 | 102 | +                        call_id=item.call_id,  | 
 | 103 | +                        type=item.type,  | 
 | 104 | +                        name=item.name,  | 
 | 105 | +                        arguments="",  # EMPTY - this is the bug condition!  | 
 | 106 | +                    )  | 
 | 107 | + | 
 | 108 | +                    yield ResponseOutputItemAddedEvent(  | 
 | 109 | +                        item=empty_args_item,  | 
 | 110 | +                        output_index=0,  | 
 | 111 | +                        type="response.output_item.added",  | 
 | 112 | +                        sequence_number=sequence_number,  | 
 | 113 | +                    )  | 
 | 114 | +                    sequence_number += 1  | 
 | 115 | + | 
 | 116 | +                    # Then: emit ResponseOutputItemDoneEvent with COMPLETE arguments  | 
 | 117 | +                    yield ResponseOutputItemDoneEvent(  | 
 | 118 | +                        item=item,  # This has the complete arguments  | 
 | 119 | +                        output_index=0,  | 
 | 120 | +                        type="response.output_item.done",  | 
 | 121 | +                        sequence_number=sequence_number,  | 
 | 122 | +                    )  | 
 | 123 | +                    sequence_number += 1  | 
 | 124 | + | 
 | 125 | +            # Finally: emit completion  | 
 | 126 | +            yield ResponseCompletedEvent(  | 
 | 127 | +                type="response.completed",  | 
 | 128 | +                response=get_response_obj(output),  | 
 | 129 | +                sequence_number=sequence_number,  | 
 | 130 | +            )  | 
 | 131 | + | 
 | 132 | + | 
 | 133 | +@function_tool  | 
 | 134 | +def calculate_sum(a: int, b: int) -> str:  | 
 | 135 | +    """Add two numbers together."""  | 
 | 136 | +    return str(a + b)  | 
 | 137 | + | 
 | 138 | + | 
 | 139 | +@function_tool  | 
 | 140 | +def format_message(name: str, message: str, urgent: bool = False) -> str:  | 
 | 141 | +    """Format a message with name and urgency."""  | 
 | 142 | +    prefix = "URGENT: " if urgent else ""  | 
 | 143 | +    return f"{prefix}Hello {name}, {message}"  | 
 | 144 | + | 
 | 145 | + | 
 | 146 | +@pytest.mark.asyncio  | 
 | 147 | +async def test_streaming_tool_call_arguments_not_empty():  | 
 | 148 | +    """Test that tool_called events contain non-empty arguments during streaming."""  | 
 | 149 | +    model = StreamingFakeModel()  | 
 | 150 | +    agent = Agent(  | 
 | 151 | +        name="TestAgent",  | 
 | 152 | +        model=model,  | 
 | 153 | +        tools=[calculate_sum],  | 
 | 154 | +    )  | 
 | 155 | + | 
 | 156 | +    # Set up a tool call with arguments  | 
 | 157 | +    expected_arguments = '{"a": 5, "b": 3}'  | 
 | 158 | +    model.set_next_output(  | 
 | 159 | +        [  | 
 | 160 | +            get_function_tool_call("calculate_sum", expected_arguments, "call_123"),  | 
 | 161 | +        ]  | 
 | 162 | +    )  | 
 | 163 | + | 
 | 164 | +    result = Runner.run_streamed(agent, input="Add 5 and 3")  | 
 | 165 | + | 
 | 166 | +    tool_called_events = []  | 
 | 167 | +    async for event in result.stream_events():  | 
 | 168 | +        if (  | 
 | 169 | +            event.type == "run_item_stream_event"  | 
 | 170 | +            and isinstance(event, RunItemStreamEvent)  | 
 | 171 | +            and event.name == "tool_called"  | 
 | 172 | +        ):  | 
 | 173 | +            tool_called_events.append(event)  | 
 | 174 | + | 
 | 175 | +    # Verify we got exactly one tool_called event  | 
 | 176 | +    assert len(tool_called_events) == 1, (  | 
 | 177 | +        f"Expected 1 tool_called event, got {len(tool_called_events)}"  | 
 | 178 | +    )  | 
 | 179 | + | 
 | 180 | +    tool_event = tool_called_events[0]  | 
 | 181 | + | 
 | 182 | +    # Verify the event has the expected structure  | 
 | 183 | +    assert hasattr(tool_event.item, "raw_item"), "tool_called event should have raw_item"  | 
 | 184 | +    assert hasattr(tool_event.item.raw_item, "arguments"), "raw_item should have arguments field"  | 
 | 185 | + | 
 | 186 | +    # The critical test: arguments should NOT be empty  | 
 | 187 | +    # Cast to ResponseFunctionToolCall since we know that's what it is in our test  | 
 | 188 | +    raw_item = cast(ResponseFunctionToolCall, tool_event.item.raw_item)  | 
 | 189 | +    actual_arguments = raw_item.arguments  | 
 | 190 | +    assert actual_arguments != "", (  | 
 | 191 | +        f"Tool call arguments should not be empty, got: '{actual_arguments}'"  | 
 | 192 | +    )  | 
 | 193 | +    assert actual_arguments is not None, "Tool call arguments should not be None"  | 
 | 194 | + | 
 | 195 | +    # Verify arguments contain the expected data  | 
 | 196 | +    assert actual_arguments == expected_arguments, (  | 
 | 197 | +        f"Expected arguments '{expected_arguments}', got '{actual_arguments}'"  | 
 | 198 | +    )  | 
 | 199 | + | 
 | 200 | +    # Verify arguments are valid JSON that can be parsed  | 
 | 201 | +    try:  | 
 | 202 | +        parsed_args = json.loads(actual_arguments)  | 
 | 203 | +        assert parsed_args == {"a": 5, "b": 3}, (  | 
 | 204 | +            f"Parsed arguments should match expected values, got {parsed_args}"  | 
 | 205 | +        )  | 
 | 206 | +    except json.JSONDecodeError as e:  | 
 | 207 | +        pytest.fail(  | 
 | 208 | +            f"Tool call arguments should be valid JSON, but got: '{actual_arguments}' with error: {e}"  # noqa: E501  | 
 | 209 | +        )  | 
 | 210 | + | 
 | 211 | + | 
 | 212 | +@pytest.mark.asyncio  | 
 | 213 | +async def test_streaming_tool_call_arguments_complex():  | 
 | 214 | +    """Test streaming tool calls with complex arguments including strings and booleans."""  | 
 | 215 | +    model = StreamingFakeModel()  | 
 | 216 | +    agent = Agent(  | 
 | 217 | +        name="TestAgent",  | 
 | 218 | +        model=model,  | 
 | 219 | +        tools=[format_message],  | 
 | 220 | +    )  | 
 | 221 | + | 
 | 222 | +    # Set up a tool call with complex arguments  | 
 | 223 | +    expected_arguments = (  | 
 | 224 | +        '{"name": "Alice", "message": "Your meeting is starting soon", "urgent": true}'  | 
 | 225 | +    )  | 
 | 226 | +    model.set_next_output(  | 
 | 227 | +        [  | 
 | 228 | +            get_function_tool_call("format_message", expected_arguments, "call_456"),  | 
 | 229 | +        ]  | 
 | 230 | +    )  | 
 | 231 | + | 
 | 232 | +    result = Runner.run_streamed(agent, input="Format a message for Alice")  | 
 | 233 | + | 
 | 234 | +    tool_called_events = []  | 
 | 235 | +    async for event in result.stream_events():  | 
 | 236 | +        if (  | 
 | 237 | +            event.type == "run_item_stream_event"  | 
 | 238 | +            and isinstance(event, RunItemStreamEvent)  | 
 | 239 | +            and event.name == "tool_called"  | 
 | 240 | +        ):  | 
 | 241 | +            tool_called_events.append(event)  | 
 | 242 | + | 
 | 243 | +    assert len(tool_called_events) == 1, (  | 
 | 244 | +        f"Expected 1 tool_called event, got {len(tool_called_events)}"  | 
 | 245 | +    )  | 
 | 246 | + | 
 | 247 | +    tool_event = tool_called_events[0]  | 
 | 248 | +    # Cast to ResponseFunctionToolCall since we know that's what it is in our test  | 
 | 249 | +    raw_item = cast(ResponseFunctionToolCall, tool_event.item.raw_item)  | 
 | 250 | +    actual_arguments = raw_item.arguments  | 
 | 251 | + | 
 | 252 | +    # Critical checks for the regression  | 
 | 253 | +    assert actual_arguments != "", "Tool call arguments should not be empty"  | 
 | 254 | +    assert actual_arguments is not None, "Tool call arguments should not be None"  | 
 | 255 | +    assert actual_arguments == expected_arguments, (  | 
 | 256 | +        f"Expected '{expected_arguments}', got '{actual_arguments}'"  | 
 | 257 | +    )  | 
 | 258 | + | 
 | 259 | +    # Verify the complex arguments parse correctly  | 
 | 260 | +    parsed_args = json.loads(actual_arguments)  | 
 | 261 | +    expected_parsed = {"name": "Alice", "message": "Your meeting is starting soon", "urgent": True}  | 
 | 262 | +    assert parsed_args == expected_parsed, f"Parsed arguments should match, got {parsed_args}"  | 
 | 263 | + | 
 | 264 | + | 
 | 265 | +@pytest.mark.asyncio  | 
 | 266 | +async def test_streaming_multiple_tool_calls_arguments():  | 
 | 267 | +    """Test that multiple tool calls in streaming all have proper arguments."""  | 
 | 268 | +    model = StreamingFakeModel()  | 
 | 269 | +    agent = Agent(  | 
 | 270 | +        name="TestAgent",  | 
 | 271 | +        model=model,  | 
 | 272 | +        tools=[calculate_sum, format_message],  | 
 | 273 | +    )  | 
 | 274 | + | 
 | 275 | +    # Set up multiple tool calls  | 
 | 276 | +    model.set_next_output(  | 
 | 277 | +        [  | 
 | 278 | +            get_function_tool_call("calculate_sum", '{"a": 10, "b": 20}', "call_1"),  | 
 | 279 | +            get_function_tool_call(  | 
 | 280 | +                "format_message", '{"name": "Bob", "message": "Test"}', "call_2"  | 
 | 281 | +            ),  | 
 | 282 | +        ]  | 
 | 283 | +    )  | 
 | 284 | + | 
 | 285 | +    result = Runner.run_streamed(agent, input="Do some calculations")  | 
 | 286 | + | 
 | 287 | +    tool_called_events = []  | 
 | 288 | +    async for event in result.stream_events():  | 
 | 289 | +        if (  | 
 | 290 | +            event.type == "run_item_stream_event"  | 
 | 291 | +            and isinstance(event, RunItemStreamEvent)  | 
 | 292 | +            and event.name == "tool_called"  | 
 | 293 | +        ):  | 
 | 294 | +            tool_called_events.append(event)  | 
 | 295 | + | 
 | 296 | +    # Should have exactly 2 tool_called events  | 
 | 297 | +    assert len(tool_called_events) == 2, (  | 
 | 298 | +        f"Expected 2 tool_called events, got {len(tool_called_events)}"  | 
 | 299 | +    )  | 
 | 300 | + | 
 | 301 | +    # Check first tool call  | 
 | 302 | +    event1 = tool_called_events[0]  | 
 | 303 | +    # Cast to ResponseFunctionToolCall since we know that's what it is in our test  | 
 | 304 | +    raw_item1 = cast(ResponseFunctionToolCall, event1.item.raw_item)  | 
 | 305 | +    args1 = raw_item1.arguments  | 
 | 306 | +    assert args1 != "", "First tool call arguments should not be empty"  | 
 | 307 | +    expected_args1 = '{"a": 10, "b": 20}'  | 
 | 308 | +    assert args1 == expected_args1, (  | 
 | 309 | +        f"First tool call args: expected '{expected_args1}', got '{args1}'"  | 
 | 310 | +    )  | 
 | 311 | + | 
 | 312 | +    # Check second tool call  | 
 | 313 | +    event2 = tool_called_events[1]  | 
 | 314 | +    # Cast to ResponseFunctionToolCall since we know that's what it is in our test  | 
 | 315 | +    raw_item2 = cast(ResponseFunctionToolCall, event2.item.raw_item)  | 
 | 316 | +    args2 = raw_item2.arguments  | 
 | 317 | +    assert args2 != "", "Second tool call arguments should not be empty"  | 
 | 318 | +    expected_args2 = '{"name": "Bob", "message": "Test"}'  | 
 | 319 | +    assert args2 == expected_args2, (  | 
 | 320 | +        f"Second tool call args: expected '{expected_args2}', got '{args2}'"  | 
 | 321 | +    )  | 
 | 322 | + | 
 | 323 | + | 
 | 324 | +@pytest.mark.asyncio  | 
 | 325 | +async def test_streaming_tool_call_with_empty_arguments():  | 
 | 326 | +    """Test that tool calls with legitimately empty arguments still work correctly."""  | 
 | 327 | +    model = StreamingFakeModel()  | 
 | 328 | + | 
 | 329 | +    @function_tool  | 
 | 330 | +    def get_current_time() -> str:  | 
 | 331 | +        """Get the current time (no arguments needed)."""  | 
 | 332 | +        return "2024-01-15 10:30:00"  | 
 | 333 | + | 
 | 334 | +    agent = Agent(  | 
 | 335 | +        name="TestAgent",  | 
 | 336 | +        model=model,  | 
 | 337 | +        tools=[get_current_time],  | 
 | 338 | +    )  | 
 | 339 | + | 
 | 340 | +    # Tool call with empty arguments (legitimate case)  | 
 | 341 | +    model.set_next_output(  | 
 | 342 | +        [  | 
 | 343 | +            get_function_tool_call("get_current_time", "{}", "call_time"),  | 
 | 344 | +        ]  | 
 | 345 | +    )  | 
 | 346 | + | 
 | 347 | +    result = Runner.run_streamed(agent, input="What time is it?")  | 
 | 348 | + | 
 | 349 | +    tool_called_events = []  | 
 | 350 | +    async for event in result.stream_events():  | 
 | 351 | +        if (  | 
 | 352 | +            event.type == "run_item_stream_event"  | 
 | 353 | +            and isinstance(event, RunItemStreamEvent)  | 
 | 354 | +            and event.name == "tool_called"  | 
 | 355 | +        ):  | 
 | 356 | +            tool_called_events.append(event)  | 
 | 357 | + | 
 | 358 | +    assert len(tool_called_events) == 1, (  | 
 | 359 | +        f"Expected 1 tool_called event, got {len(tool_called_events)}"  | 
 | 360 | +    )  | 
 | 361 | + | 
 | 362 | +    tool_event = tool_called_events[0]  | 
 | 363 | +    # Cast to ResponseFunctionToolCall since we know that's what it is in our test  | 
 | 364 | +    raw_item = cast(ResponseFunctionToolCall, tool_event.item.raw_item)  | 
 | 365 | +    actual_arguments = raw_item.arguments  | 
 | 366 | + | 
 | 367 | +    # Even "empty" arguments should be "{}", not literally empty string  | 
 | 368 | +    assert actual_arguments is not None, "Arguments should not be None"  | 
 | 369 | +    assert actual_arguments == "{}", f"Expected empty JSON object '{{}}', got '{actual_arguments}'"  | 
 | 370 | + | 
 | 371 | +    # Should parse as valid empty JSON  | 
 | 372 | +    parsed_args = json.loads(actual_arguments)  | 
 | 373 | +    assert parsed_args == {}, f"Should parse to empty dict, got {parsed_args}"  | 
0 commit comments