Skip to content

Commit 6411cc2

Browse files
committed
handling navigation for new tabs - cua
1 parent cb57004 commit 6411cc2

File tree

2 files changed

+107
-9
lines changed

2 files changed

+107
-9
lines changed

stagehand/agent/anthropic_cua.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,16 @@ def __init__(
8686
"required": ["url"],
8787
},
8888
},
89+
{
90+
"name": "navigate_back",
91+
"description": "Navigate back to the previous page",
92+
"input_schema": {
93+
"type": "object",
94+
"properties": {},
95+
},
96+
},
8997
]
90-
self.max_tokens = kwargs.get("max_tokens", 4096)
98+
self.max_tokens = kwargs.get("max_tokens", 1024)
9199
self.last_tool_use_ids = None
92100
self.logger.info(
93101
f"AnthropicCUAClient initialized for model: {model}",
@@ -143,7 +151,8 @@ async def run_task(
143151
response = self.anthropic_sdk_client.beta.messages.create(
144152
model=self.model,
145153
max_tokens=self.max_tokens,
146-
system=self.instructions, # System prompt
154+
system=self.instructions
155+
+ "Remember to call the computer tools, and only goto or navigate_back if you need to. Screenshots, clicks, etc, will be parsed from computer tool calls", # System prompt
147156
messages=current_messages,
148157
tools=self.tools,
149158
betas=["computer-use-2025-01-24"],
@@ -300,9 +309,6 @@ def _process_provider_response(
300309
task_completed = not bool(
301310
agent_action
302311
) # Task is complete if no tool_use blocks
303-
self.logger.info(
304-
f"{agent_action}, {model_message_text}, {task_completed}, {raw_assistant_content_blocks}"
305-
)
306312

307313
return (
308314
agent_action,
@@ -314,14 +320,18 @@ def _process_provider_response(
314320
def _convert_tool_use_to_agent_action(
315321
self, tool_name: str, tool_input: dict[str, Any]
316322
) -> Optional[AgentAction]:
317-
if tool_name != "computer" and tool_name != "goto":
323+
if (
324+
tool_name != "computer"
325+
and tool_name != "goto"
326+
and tool_name != "navigate_back"
327+
):
318328
self.logger.warning(
319329
f"Unsupported tool name from Anthropic: {tool_name}",
320330
category=StagehandFunctionName.AGENT,
321331
)
322332
return None
323333

324-
if tool_name == "goto":
334+
if tool_name == "goto" or tool_name == "navigate_back":
325335
action_type_str = "function"
326336
else:
327337
action_type_str = tool_input.get("action")
@@ -542,6 +552,13 @@ def _convert_tool_use_to_agent_action(
542552
category=StagehandFunctionName.AGENT,
543553
)
544554
return None
555+
elif tool_name == "navigate_back":
556+
action_model_payload = AgentActionType(
557+
type="function",
558+
name="navigate_back",
559+
arguments=FunctionArguments(),
560+
)
561+
action_type_str = "function"
545562
else:
546563
self.logger.warning(
547564
f"Unsupported action type '{action_type_str}' from Anthropic computer tool.",
@@ -573,7 +590,6 @@ def _format_action_feedback(
573590
new_screenshot_base64: str,
574591
current_url: Optional[str],
575592
) -> list[dict[str, Any]]:
576-
self.logger.info(f"Action result: {action_result}")
577593
content_for_tool_result: list[dict[str, Any]] = []
578594
is_error_result = not action_result.get("success", False)
579595

stagehand/handlers/cua_handler.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
5353
}
5454

5555
try:
56+
# Store initial URL to detect navigation
57+
initial_url = self.page.url
58+
5659
if action_type == "click":
5760
# specific_action_model is already an instance of ClickAction
5861
x, y = specific_action_model.x, specific_action_model.y
@@ -66,7 +69,9 @@ async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
6669
await self._animate_click(x, y)
6770
await asyncio.sleep(0.1) # Ensure animation is visible
6871
await self.page.mouse.click(x, y, button=button)
69-
# Consider new tab/page handling logic here if needed
72+
73+
# Check for page navigation
74+
await self.handle_page_navigation("click", initial_url)
7075
return {"success": True}
7176

7277
elif action_type == "double_click":
@@ -78,6 +83,9 @@ async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
7883
await self._animate_click(x, y)
7984
await asyncio.sleep(0.1)
8085
await self.page.mouse.dblclick(x, y)
86+
87+
# Check for page navigation
88+
await self.handle_page_navigation("double_click", initial_url)
8189
return {"success": True}
8290

8391
elif action_type == "type":
@@ -92,6 +100,9 @@ async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
92100
for key_str in specific_action_model.keys:
93101
playwright_key = self._convert_key_name(key_str)
94102
await self.page.keyboard.press(playwright_key) # Press each key
103+
104+
# Check for page navigation - keys like Enter can cause navigation
105+
await self.handle_page_navigation("keypress", initial_url)
95106
return {"success": True}
96107

97108
elif action_type == "scroll":
@@ -110,6 +121,9 @@ async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
110121
if name == "goto" and args.url:
111122
await self.page.goto(args.url)
112123
return {"success": True}
124+
elif name == "navigate_back":
125+
await self.page.go_back()
126+
return {"success": True}
113127
# Add other function calls like forward, reload if needed, similar to TS version
114128
self.logger.warning(
115129
f"Unsupported function call: {name}",
@@ -131,6 +145,9 @@ async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
131145
else:
132146
# Use _convert_key_name for consistency if possible, or press directly
133147
await self.page.keyboard.press(self._convert_key_name(text))
148+
149+
# Check for page navigation - Enter and other keys may navigate
150+
await self.handle_page_navigation("key", initial_url)
134151
return {"success": True}
135152

136153
elif action_type == "wait":
@@ -248,3 +265,68 @@ def _convert_key_name(self, key: str) -> str:
248265
# Convert to uppercase for case-insensitive matching then check map,
249266
# default to original key if not found.
250267
return key_map.get(key.upper(), key)
268+
269+
# NOTE(review): placeholder — the body is only `pass`. The call sites visible in
# this diff invoke `handle_page_navigation` (no leading underscore) instead;
# confirm whether this stub is still needed.
async def _handle_page_navigation(self) -> None:
270+
"""Handle page navigation actions."""
271+
pass
272+
273+
async def handle_page_navigation(
    self,
    action_description: str,
    initial_url: str,
    dom_settle_timeout_ms: int = 1000,
    new_tab_timeout_ms: int = 1500,
) -> None:
    """Handle a possible navigation or new tab after an action.

    Steps, in order:
      1. Wait up to ``new_tab_timeout_ms`` for the browser context to report
         a newly opened page.
      2. If one appears, give it a bounded chance to reach
         ``domcontentloaded``, read its URL, close it, and load that URL in
         ``self.page`` so the agent stays on a single tab.
      3. Wait (best effort) for ``self.page`` to reach ``domcontentloaded``
         and then ``networkidle``.
      4. Log when the URL differs from ``initial_url``.

    Args:
        action_description: Short label for the triggering action; only used
            in log messages.
        initial_url: URL of ``self.page`` captured before the action ran.
        dom_settle_timeout_ms: Per-wait timeout (ms) for the load-state waits.
        new_tab_timeout_ms: How long (ms) to listen for a new page. This
            delay is paid in full whenever no tab opens.

    Raises:
        Whatever ``self.page.goto`` or the tab's ``close`` raise; those calls
        are intentionally not wrapped so the caller can report the failure.

    NOTE(review): the listener is armed only once this method is called,
    i.e. after the triggering action has been issued by the caller. A tab
    whose page event fired before that point is not observed here. Wrapping
    the action itself in ``expect_page`` would need a change in the caller.
    """
    self.logger.debug(
        f"{action_description} - checking for page navigation",
        category=StagehandFunctionName.AGENT,
    )

    # Check for new tab/window. The body is empty on purpose: we only listen
    # for a page event during the timeout window.
    new_opened_tab = None
    try:
        async with self.page.context.expect_page(
            timeout=new_tab_timeout_ms
        ) as new_page_info:
            pass
        new_opened_tab = await new_page_info.value
    except Exception:
        # A timeout is the normal "no new tab" outcome; treat any failure
        # here as "nothing opened" rather than failing the action.
        new_opened_tab = None

    # Fold a newly opened tab back into the main page.
    if new_opened_tab:
        # A freshly created page can still report a blank URL; wait (bounded)
        # for its first navigation to commit before reading it.
        try:
            await new_opened_tab.wait_for_load_state(
                "domcontentloaded", timeout=dom_settle_timeout_ms
            )
        except Exception as e:
            self.logger.debug(
                f"New tab did not reach domcontentloaded: {str(e)}",
                category=StagehandFunctionName.AGENT,
            )

        new_tab_url = new_opened_tab.url
        self.logger.info(
            f"New tab detected with URL: {new_tab_url}",
            category=StagehandFunctionName.AGENT,
        )
        await new_opened_tab.close()

        if new_tab_url and new_tab_url != "about:blank":
            # goto() resolves only after the page has loaded, so no separate
            # untimed wait is needed before the settle step below.
            await self.page.goto(new_tab_url)
        else:
            # Navigating the main page to a blank URL would discard the
            # current page for nothing.
            self.logger.warning(
                "New tab had no usable URL; staying on the current page",
                category=StagehandFunctionName.AGENT,
            )

    # Wait for DOM to settle (best effort; a timeout here is not fatal).
    try:
        await self.page.wait_for_load_state(
            "domcontentloaded", timeout=dom_settle_timeout_ms
        )
        # Additional optional wait for network idle
        await self.page.wait_for_load_state(
            "networkidle", timeout=dom_settle_timeout_ms
        )
    except Exception as e:
        self.logger.warning(
            f"Wait for DOM settle timed out: {str(e)}",
            category=StagehandFunctionName.AGENT,
        )

    # Check if URL changed
    current_url = self.page.url
    if current_url != initial_url:
        self.logger.debug(
            f"Page navigation detected: {initial_url} -> {current_url}",
            category=StagehandFunctionName.AGENT,
        )

    self.logger.debug(
        "Finished checking for page navigation",
        category=StagehandFunctionName.AGENT,
    )

0 commit comments

Comments
 (0)