|
5 | 5 | from typing import Any, Callable, Optional, cast |
6 | 6 |
|
7 | 7 | import pytest |
| 8 | +from openai.types.responses import ResponseComputerToolCall |
| 9 | +from openai.types.responses.response_computer_tool_call import ActionScreenshot |
8 | 10 | from openai.types.responses.response_input_param import ( |
9 | 11 | ComputerCallOutput, |
10 | 12 | LocalShellCallOutput, |
|
14 | 16 | from agents import ( |
15 | 17 | Agent, |
16 | 18 | ApplyPatchTool, |
| 19 | + ComputerTool, |
17 | 20 | LocalShellTool, |
18 | 21 | Runner, |
19 | 22 | RunResult, |
|
22 | 25 | ToolApprovalItem, |
23 | 26 | function_tool, |
24 | 27 | ) |
| 28 | +from agents.computer import Computer, Environment |
25 | 29 | from agents.exceptions import ModelBehaviorError, UserError |
26 | 30 | from agents.items import ( |
27 | 31 | MCPApprovalResponseItem, |
|
38 | 42 | NextStepInterruption, |
39 | 43 | NextStepRunAgain, |
40 | 44 | ProcessedResponse, |
| 45 | + ToolRunComputerAction, |
41 | 46 | ToolRunFunction, |
42 | 47 | ToolRunMCPApprovalRequest, |
43 | 48 | ToolRunShellCall, |
|
76 | 81 | ) |
77 | 82 |
|
78 | 83 |
|
class TrackingComputer(Computer):
    """Test double for ``Computer`` that logs the name of every action invoked on it.

    Each action method appends its own name to ``calls`` and otherwise does nothing,
    so a test can inspect ``calls`` to see whether (and which) actions were run.
    """

    def __init__(self) -> None:
        # Names of the action methods invoked so far, in call order.
        self.calls: list[str] = []

    def _log_call(self, action_name: str) -> None:
        """Append ``action_name`` to the call log."""
        self.calls.append(action_name)

    @property
    def environment(self) -> Environment:
        return "mac"

    @property
    def dimensions(self) -> tuple[int, int]:
        return (1, 1)

    def screenshot(self) -> str:
        self._log_call("screenshot")
        # Fixed placeholder payload; the arguments and screen state are never inspected.
        return "img"

    def click(self, _x: int, _y: int, _button: str) -> None:
        self._log_call("click")

    def double_click(self, _x: int, _y: int) -> None:
        self._log_call("double_click")

    def scroll(self, _x: int, _y: int, _scroll_x: int, _scroll_y: int) -> None:
        self._log_call("scroll")

    def type(self, _text: str) -> None:
        self._log_call("type")

    def wait(self) -> None:
        self._log_call("wait")

    def move(self, _x: int, _y: int) -> None:
        self._log_call("move")

    def keypress(self, _keys: list[str]) -> None:
        self._log_call("keypress")

    def drag(self, _path: list[tuple[int, int]]) -> None:
        self._log_call("drag")
| 125 | + |
| 126 | + |
79 | 127 | def _shell_approval_setup() -> ApprovalScenario: |
80 | 128 | tool = ShellTool(executor=lambda request: "shell_output", needs_approval=require_approval) |
81 | 129 | shell_call = make_shell_call("call_shell_1", id_value="shell_1", commands=["echo test"]) |
@@ -889,6 +937,123 @@ async def test_resume_skips_shell_calls_with_existing_output() -> None: |
889 | 937 | assert not result.new_step_items, "Shell call should not run when output already exists" |
890 | 938 |
|
891 | 939 |
|
@pytest.mark.asyncio
async def test_resume_executes_pending_computer_actions() -> None:
    """Pending computer actions should execute when resuming an interrupted turn."""

    computer = TrackingComputer()
    computer_tool = ComputerTool(computer=computer)
    # Only the agent is used below; the underscore marks the model as intentionally unused.
    _model, agent = make_model_and_agent(tools=[computer_tool])

    # An in-progress screenshot call; no output for it exists in the pre-step items below.
    computer_call = ResponseComputerToolCall(
        type="computer_call",
        id="comp_pending",
        call_id="comp_pending",
        status="in_progress",
        action=ActionScreenshot(type="screenshot"),
        pending_safety_checks=[],
    )

    processed_response = ProcessedResponse(
        new_items=[],
        handoffs=[],
        functions=[],
        computer_actions=[
            ToolRunComputerAction(tool_call=computer_call, computer_tool=computer_tool)
        ],
        local_shell_calls=[],
        shell_calls=[],
        apply_patch_calls=[],
        tools_used=[computer_tool.name],
        mcp_approval_requests=[],
        interruptions=[],
    )

    result = await run_loop.resolve_interrupted_turn(
        agent=agent,
        original_input="resume computer",
        original_pre_step_items=[],
        new_response=ModelResponse(output=[], usage=Usage(), response_id="resp"),
        processed_response=processed_response,
        hooks=RunHooks(),
        context_wrapper=make_context_wrapper(),
        run_config=RunConfig(),
        run_state=None,
    )

    # Keep only the computer-call outputs emitted by this resume step.
    outputs = [
        item
        for item in result.new_step_items
        if isinstance(item, ToolCallOutputItem)
        and isinstance(item.raw_item, dict)
        and item.raw_item.get("type") == "computer_call_output"
    ]
    assert outputs, "Computer action should run when resuming without prior output"
    assert computer.calls, "Computer should have been invoked"
    assert isinstance(result.next_step, NextStepRunAgain)
| 994 | + |
| 995 | + |
@pytest.mark.asyncio
async def test_resume_skips_computer_actions_with_existing_output() -> None:
    """Computer actions with persisted output should not execute again when resuming."""

    computer = TrackingComputer()
    computer_tool = ComputerTool(computer=computer)
    # Only the agent is used below; the underscore marks the model as intentionally unused.
    _model, agent = make_model_and_agent(tools=[computer_tool])

    computer_call = ResponseComputerToolCall(
        type="computer_call",
        id="comp_skip",
        call_id="comp_skip",
        status="completed",
        action=ActionScreenshot(type="screenshot"),
        pending_safety_checks=[],
    )

    processed_response = ProcessedResponse(
        new_items=[],
        handoffs=[],
        functions=[],
        computer_actions=[
            ToolRunComputerAction(tool_call=computer_call, computer_tool=computer_tool)
        ],
        local_shell_calls=[],
        shell_calls=[],
        apply_patch_calls=[],
        tools_used=[computer_tool.name],
        mcp_approval_requests=[],
        interruptions=[],
    )

    # Pre-step items already contain an output whose call_id matches the computer call above.
    original_pre_step_items = [
        ToolCallOutputItem(
            agent=agent,
            raw_item={
                "type": "computer_call_output",
                "call_id": "comp_skip",
                "output": {"type": "computer_screenshot", "image_url": "data:image/png;base64,ok"},
            },
            output="image_url",
        )
    ]

    result = await run_loop.resolve_interrupted_turn(
        agent=agent,
        original_input="resume computer existing",
        original_pre_step_items=cast(list[RunItem], original_pre_step_items),
        new_response=ModelResponse(output=[], usage=Usage(), response_id="resp"),
        processed_response=processed_response,
        hooks=RunHooks(),
        context_wrapper=make_context_wrapper(),
        run_config=RunConfig(),
        run_state=None,
    )

    assert not computer.calls, "Computer action should not run when output already exists"
    assert not result.new_step_items, "No new items should be emitted when output exists"
    assert isinstance(result.next_step, NextStepRunAgain)
| 1055 | + |
| 1056 | + |
892 | 1057 | @pytest.mark.asyncio |
893 | 1058 | async def test_rebuild_function_runs_handles_pending_and_rejections() -> None: |
894 | 1059 | """Rebuilt function runs should surface pending approvals and emit rejections.""" |
@@ -1017,6 +1182,86 @@ async def test_rejected_shell_calls_emit_rejection_output() -> None: |
1017 | 1182 | assert isinstance(result.next_step, NextStepRunAgain) |
1018 | 1183 |
|
1019 | 1184 |
|
@pytest.mark.asyncio
async def test_rejected_shell_calls_with_existing_output_are_not_duplicated() -> None:
    """A rejected shell call whose rejection output is already persisted is not re-emitted."""

    tool = ShellTool(executor=lambda _req: "should_not_run", needs_approval=True)
    _model, agent = make_model_and_agent(tools=[tool])
    wrapper = make_context_wrapper()

    pending_call = make_shell_call(
        "call_reject_shell_dup",
        id_value="shell_reject_dup",
        commands=["echo test"],
        status="in_progress",
    )

    # Record the rejection on the context wrapper before the turn is resumed.
    wrapper.reject_tool(
        ToolApprovalItem(
            agent=agent,
            raw_item=cast(dict[str, Any], pending_call),
            tool_name=tool.name,
        )
    )

    # Pre-step history already holds a rejection output for the same call_id.
    persisted_rejection = ToolCallOutputItem(
        agent=agent,
        raw_item=cast(
            dict[str, Any],
            {
                "type": "shell_call_output",
                "call_id": "call_reject_shell_dup",
                "output": [
                    {
                        "stdout": "",
                        "stderr": HITL_REJECTION_MSG,
                        "outcome": {"type": "exit", "exit_code": 1},
                    }
                ],
            },
        ),
        output=HITL_REJECTION_MSG,
    )
    original_pre_step_items = [persisted_rejection]

    processed = ProcessedResponse(
        new_items=[],
        handoffs=[],
        functions=[],
        computer_actions=[],
        local_shell_calls=[],
        shell_calls=[ToolRunShellCall(tool_call=pending_call, shell_tool=tool)],
        apply_patch_calls=[],
        tools_used=[],
        mcp_approval_requests=[],
        interruptions=[],
    )

    result = await run_loop.resolve_interrupted_turn(
        agent=agent,
        original_input="resume shell rejection existing",
        original_pre_step_items=cast(list[RunItem], original_pre_step_items),
        new_response=ModelResponse(output=[], usage=Usage(), response_id="resp"),
        processed_response=processed,
        hooks=RunHooks(),
        context_wrapper=wrapper,
        run_config=RunConfig(),
        run_state=None,
    )

    def _is_shell_rejection(candidate: Any) -> bool:
        """Return True for a shell-call output item that carries the rejection message."""
        if not isinstance(candidate, ToolCallOutputItem):
            return False
        raw = candidate.raw_item
        if not isinstance(raw, dict):
            return False
        if raw.get("type") != "shell_call_output":
            return False
        return HITL_REJECTION_MSG in str(candidate.output)

    duplicate_rejections = [item for item in result.new_step_items if _is_shell_rejection(item)]

    assert not duplicate_rejections, "No duplicate rejection outputs should be emitted"
    assert isinstance(result.next_step, NextStepRunAgain)
| 1263 | + |
| 1264 | + |
1020 | 1265 | @pytest.mark.asyncio |
1021 | 1266 | async def test_mcp_callback_approvals_are_processed() -> None: |
1022 | 1267 | """MCP approval requests with callbacks should emit approval responses.""" |
|
0 commit comments