Prevent preamble messages from being treated as final output when tool calls are pending (openai#1689)

ihower · seratch · web-flow · commit 5a9cab876b4d · 2025-09-09T16:00:38.000+09:00
Co-authored-by: Kazuhiro Sera <seratch@openai.com>
diff --git a/src/agents/_run_impl.py b/src/agents/_run_impl.py
@@ -330,43 +330,40 @@ async def execute_tools_and_side_effects(
             ItemHelpers.extract_last_text(message_items[-1].raw_item) if message_items else None
         )
 
-        # There are two possibilities that lead to a final output:
-        # 1. Structured output schema => always leads to a final output
-        # 2. Plain text output schema => only leads to a final output if there are no tool calls
-        if output_schema and not output_schema.is_plain_text() and potential_final_output_text:
-            final_output = output_schema.validate_json(potential_final_output_text)
-            return await cls.execute_final_output(
-                agent=agent,
-                original_input=original_input,
-                new_response=new_response,
-                pre_step_items=pre_step_items,
-                new_step_items=new_step_items,
-                final_output=final_output,
-                hooks=hooks,
-                context_wrapper=context_wrapper,
-            )
-        elif (
-            not output_schema or output_schema.is_plain_text()
-        ) and not processed_response.has_tools_or_approvals_to_run():
-            return await cls.execute_final_output(
-                agent=agent,
-                original_input=original_input,
-                new_response=new_response,
-                pre_step_items=pre_step_items,
-                new_step_items=new_step_items,
-                final_output=potential_final_output_text or "",
-                hooks=hooks,
-                context_wrapper=context_wrapper,
-            )
-        else:
-            # If there's no final output, we can just run again
-            return SingleStepResult(
-                original_input=original_input,
-                model_response=new_response,
-                pre_step_items=pre_step_items,
-                new_step_items=new_step_items,
-                next_step=NextStepRunAgain(),
-            )
+        # Generate final output only when there are no pending tool calls or approval requests.
+        if not processed_response.has_tools_or_approvals_to_run():
+            if output_schema and not output_schema.is_plain_text() and potential_final_output_text:
+                final_output = output_schema.validate_json(potential_final_output_text)
+                return await cls.execute_final_output(
+                    agent=agent,
+                    original_input=original_input,
+                    new_response=new_response,
+                    pre_step_items=pre_step_items,
+                    new_step_items=new_step_items,
+                    final_output=final_output,
+                    hooks=hooks,
+                    context_wrapper=context_wrapper,
+                )
+            elif not output_schema or output_schema.is_plain_text():
+                return await cls.execute_final_output(
+                    agent=agent,
+                    original_input=original_input,
+                    new_response=new_response,
+                    pre_step_items=pre_step_items,
+                    new_step_items=new_step_items,
+                    final_output=potential_final_output_text or "",
+                    hooks=hooks,
+                    context_wrapper=context_wrapper,
+                )
+
+        # If there's no final output, we can just run again
+        return SingleStepResult(
+            original_input=original_input,
+            model_response=new_response,
+            pre_step_items=pre_step_items,
+            new_step_items=new_step_items,
+            next_step=NextStepRunAgain(),
+        )
 
     @classmethod
     def maybe_reset_tool_choice(
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
@@ -196,11 +196,13 @@ async def test_structured_output():
             [get_function_tool_call("foo", json.dumps({"bar": "baz"}))],
             # Second turn: a message and a handoff
             [get_text_message("a_message"), get_handoff_tool_call(agent_1)],
-            # Third turn: tool call and structured output
+            # Third turn: tool call with preamble message
             [
+                get_text_message(json.dumps(Foo(bar="preamble"))),
                 get_function_tool_call("bar", json.dumps({"bar": "baz"})),
-                get_final_output_message(json.dumps(Foo(bar="baz"))),
             ],
+            # Fourth turn: structured output
+            [get_final_output_message(json.dumps(Foo(bar="baz")))],
         ]
     )
 
@@ -213,10 +215,10 @@ async def test_structured_output():
     )
 
     assert result.final_output == Foo(bar="baz")
-    assert len(result.raw_responses) == 3, "should have three model responses"
-    assert len(result.to_input_list()) == 10, (
+    assert len(result.raw_responses) == 4, "should have four model responses"
+    assert len(result.to_input_list()) == 11, (
         "should have input: 2 orig inputs, function call, function call result, message, handoff, "
-        "handoff output, tool call, tool call result, final output message"
+        "handoff output, preamble message, tool call, tool call result, final output"
     )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
diff --git a/tests/test_agent_runner_streamed.py b/tests/test_agent_runner_streamed.py
@@ -207,11 +207,13 @@ async def test_structured_output():
             [get_function_tool_call("foo", json.dumps({"bar": "baz"}))],
             # Second turn: a message and a handoff
             [get_text_message("a_message"), get_handoff_tool_call(agent_1)],
-            # Third turn: tool call and structured output
+            # Third turn: tool call with preamble message
             [
+                get_text_message(json.dumps(Foo(bar="preamble"))),
                 get_function_tool_call("bar", json.dumps({"bar": "baz"})),
-                get_final_output_message(json.dumps(Foo(bar="baz"))),
             ],
+            # Fourth turn: structured output
+            [get_final_output_message(json.dumps(Foo(bar="baz")))],
         ]
     )
 
@@ -226,10 +228,10 @@ async def test_structured_output():
         pass
 
     assert result.final_output == Foo(bar="baz")
-    assert len(result.raw_responses) == 3, "should have three model responses"
-    assert len(result.to_input_list()) == 10, (
+    assert len(result.raw_responses) == 4, "should have four model responses"
+    assert len(result.to_input_list()) == 11, (
         "should have input: 2 orig inputs, function call, function call result, message, handoff, "
-        "handoff output, tool call, tool call result, final output"
+        "handoff output, preamble message, tool call, tool call result, final output"
     )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
@@ -624,11 +626,10 @@ async def test_streaming_events():
             [get_function_tool_call("foo", json.dumps({"bar": "baz"}))],
             # Second turn: a message and a handoff
             [get_text_message("a_message"), get_handoff_tool_call(agent_1)],
-            # Third turn: tool call and structured output
-            [
-                get_function_tool_call("bar", json.dumps({"bar": "baz"})),
-                get_final_output_message(json.dumps(Foo(bar="baz"))),
-            ],
+            # Third turn: tool call
+            [get_function_tool_call("bar", json.dumps({"bar": "baz"}))],
+            # Fourth turn: structured output
+            [get_final_output_message(json.dumps(Foo(bar="baz")))],
         ]
     )
 
@@ -652,7 +653,7 @@ async def test_streaming_events():
             agent_data.append(event)
 
     assert result.final_output == Foo(bar="baz")
-    assert len(result.raw_responses) == 3, "should have three model responses"
+    assert len(result.raw_responses) == 4, "should have four model responses"
     assert len(result.to_input_list()) == 10, (
         "should have input: 2 orig inputs, function call, function call result, message, handoff, "
         "handoff output, tool call, tool call result, final output"

Original file line number	Diff line number	Diff line change
`@@ -196,11 +196,13 @@ async def test_structured_output():`
`196`	`196`	`[get_function_tool_call("foo", json.dumps({"bar": "baz"}))],`
`197`	`197`	`# Second turn: a message and a handoff`
`198`	`198`	`[get_text_message("a_message"), get_handoff_tool_call(agent_1)],`
`199`		`- # Third turn: tool call and structured output`
	`199`	`+ # Third turn: tool call with preamble message`
`200`	`200`	`[`
	`201`	`+ get_text_message(json.dumps(Foo(bar="preamble"))),`
`201`	`202`	`get_function_tool_call("bar", json.dumps({"bar": "baz"})),`
`202`		`- get_final_output_message(json.dumps(Foo(bar="baz"))),`
`203`	`203`	`],`
	`204`	`+ # Fourth turn: structured output`
	`205`	`+ [get_final_output_message(json.dumps(Foo(bar="baz")))],`
`204`	`206`	`]`
`205`	`207`	`)`
`206`	`208`
`@@ -213,10 +215,10 @@ async def test_structured_output():`
`213`	`215`	`)`
`214`	`216`
`215`	`217`	`assert result.final_output == Foo(bar="baz")`
`216`		`- assert len(result.raw_responses) == 3, "should have three model responses"`
`217`		`- assert len(result.to_input_list()) == 10, (`
	`218`	`+ assert len(result.raw_responses) == 4, "should have four model responses"`
	`219`	`+ assert len(result.to_input_list()) == 11, (`
`218`	`220`	`"should have input: 2 orig inputs, function call, function call result, message, handoff, "`
`219`		`- "handoff output, tool call, tool call result, final output message"`
	`221`	`+ "handoff output, preamble message, tool call, tool call result, final output"`
`220`	`222`	`)`
`221`	`223`
`222`	`224`	`assert result.last_agent == agent_1, "should have handed off to agent_1"`