Commit 4d6c4e4

remove all images except the latest two in AnthropicCuaClient
1 parent 04cd8da

3 files changed: +108 -6 lines changed

stagehand/agent/agent.py

Lines changed: 3 additions & 0 deletions

@@ -19,11 +19,13 @@
     "computer-use-preview": OpenAICUAClient,
     "claude-3-5-sonnet-latest": AnthropicCUAClient,
     "claude-3-7-sonnet-latest": AnthropicCUAClient,
+    "claude-sonnet-4-20250514": AnthropicCUAClient,
 }
 MODEL_TO_PROVIDER_MAP: dict[str, AgentProvider] = {
     "computer-use-preview": AgentProvider.OPENAI,
     "claude-3-5-sonnet-20240620": AgentProvider.ANTHROPIC,
     "claude-3-7-sonnet-20250219": AgentProvider.ANTHROPIC,
+    "claude-sonnet-4-20250514": AgentProvider.ANTHROPIC,
     # Add more mappings as needed
 }

@@ -84,6 +86,7 @@ def _get_client(self) -> AgentClient:
             logger=self.logger,
             handler=self.cua_handler,
             viewport=self.viewport,
+            experimental=self.stagehand.experimental,
         )

     async def execute(
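
For quick orientation, a minimal sketch of what the new mapping entries resolve to; the module path follows the file shown above, and importing AgentProvider from that same module is an assumption:

# Sketch only: assumes stagehand.agent.agent exposes both names at module level.
from stagehand.agent.agent import MODEL_TO_PROVIDER_MAP, AgentProvider

# The newly added Claude Sonnet 4 id routes to the Anthropic provider,
# alongside the existing claude-3-5 / claude-3-7 entries.
assert MODEL_TO_PROVIDER_MAP["claude-sonnet-4-20250514"] is AgentProvider.ANTHROPIC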

stagehand/agent/anthropic_cua.py

Lines changed: 12 additions & 6 deletions

@@ -18,6 +18,7 @@
     Point,
 )
 from .client import AgentClient
+from .image_compression_utils import compress_conversation_images

 load_dotenv()

@@ -51,9 +52,11 @@ def __init__(
         logger: Optional[Any] = None,
         handler: Optional[CUAHandler] = None,
         viewport: Optional[dict[str, int]] = None,
+        experimental: bool = False,
         **kwargs,
     ):
         super().__init__(model, instructions, config, logger, handler)
+        self.experimental = experimental
         self.anthropic_sdk_client = Anthropic(
             api_key=config.options.get("apiKey") or os.getenv("ANTHROPIC_API_KEY")
         )
@@ -67,14 +70,14 @@ def __init__(
         if hasattr(self.config, "display_height") and self.config.display_height is not None:  # type: ignore
             dimensions[1] = self.config.display_height  # type: ignore
         computer_tool_type = (
-            "computer_20250124"
-            if model == "claude-3-7-sonnet-latest"
-            else "computer_20241022"
+            "computer_20241022"
+            if model == "claude-3-5-sonnet-latest"
+            else "computer_20250124"
         )
         self.beta_flag = (
-            ["computer-use-2025-01-24"]
-            if model == "claude-3-7-sonnet-latest"
-            else ["computer-use-2024-10-22"]
+            ["computer-use-2024-10-22"]
+            if model == "claude-3-5-sonnet-latest"
+            else ["computer-use-2025-01-24"]
         )
         self.tools = [
             {
@@ -162,6 +165,9 @@ async def run_task(

         start_time = asyncio.get_event_loop().time()
         try:
+            if self.experimental:
+                compress_conversation_images(current_messages)
+
             response = self.anthropic_sdk_client.beta.messages.create(
                 model=self.model,
                 max_tokens=self.max_tokens,
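
Note the inverted conditionals above: after this change, only claude-3-5-sonnet-latest keeps the 2024-10-22 computer-use tool, and every other model id, including the newly mapped claude-sonnet-4-20250514, falls through to the 2025-01-24 tool and beta flag. A small sketch of that selection, mirroring the diff (the helper name is illustrative, not part of the codebase):

def resolve_computer_tool(model: str) -> tuple[str, list[str]]:
    # Only the 3.5 Sonnet alias stays on the older tool version.
    if model == "claude-3-5-sonnet-latest":
        return "computer_20241022", ["computer-use-2024-10-22"]
    # claude-3-7-sonnet-latest, claude-sonnet-4-20250514, and any future ids.
    return "computer_20250124", ["computer-use-2025-01-24"]

assert resolve_computer_tool("claude-sonnet-4-20250514") == (
    "computer_20250124",
    ["computer-use-2025-01-24"],
)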
stagehand/agent/image_compression_utils.py (new file)

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+from typing import List, Dict, Any, Union
+
+
+def find_items_with_images(items: List[Dict[str, Any]]) -> List[int]:
+    """
+    Finds all items in the conversation history that contain images
+
+    Args:
+        items: Array of conversation items to check
+
+    Returns:
+        Array of indices where images were found
+    """
+    items_with_images = []
+
+    for index, item in enumerate(items):
+        has_image = False
+
+        if isinstance(item.get("content"), list):
+            has_image = any(
+                content_item.get("type") == "tool_result"
+                and "content" in content_item
+                and isinstance(content_item["content"], list)
+                and any(
+                    nested_item.get("type") == "image"
+                    for nested_item in content_item["content"]
+                    if isinstance(nested_item, dict)
+                )
+                for content_item in item["content"]
+                if isinstance(content_item, dict)
+            )
+
+        if has_image:
+            items_with_images.append(index)
+
+    return items_with_images
+
+
+def compress_conversation_images(
+    items: List[Dict[str, Any]],
+    keep_most_recent_count: int = 2
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Compresses conversation history by removing images from older items
+    while keeping the most recent images intact
+
+    Args:
+        items: Array of conversation items to process
+        keep_most_recent_count: Number of most recent image-containing items to preserve (default: 2)
+
+    Returns:
+        Dictionary with processed items
+    """
+    items_with_images = find_items_with_images(items)
+
+    for index, item in enumerate(items):
+        image_index = -1
+        if index in items_with_images:
+            image_index = items_with_images.index(index)
+
+        should_compress = (
+            image_index >= 0
+            and image_index < len(items_with_images) - keep_most_recent_count
+        )
+
+        if should_compress:
+            if isinstance(item.get("content"), list):
+                new_content = []
+                for content_item in item["content"]:
+                    if isinstance(content_item, dict):
+                        if (
+                            content_item.get("type") == "tool_result"
+                            and "content" in content_item
+                            and isinstance(content_item["content"], list)
+                            and any(
+                                nested_item.get("type") == "image"
+                                for nested_item in content_item["content"]
+                                if isinstance(nested_item, dict)
+                            )
+                        ):
+                            # Replace the content with a text placeholder
+                            new_content.append({
+                                **content_item,
+                                "content": "screenshot taken"
+                            })
+                        else:
+                            new_content.append(content_item)
+                    else:
+                        new_content.append(content_item)
+
+                item["content"] = new_content
+
+    return {"items": items}
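
A quick usage sketch of the new helper: it mutates the list in place, finds tool_result blocks that carry image content, and rewrites every image-bearing item except the most recent keep_most_recent_count of them to the "screenshot taken" placeholder. The message shapes below are illustrative Anthropic-style blocks, and the import path is inferred from the relative import in anthropic_cua.py:

from stagehand.agent.image_compression_utils import compress_conversation_images

def screenshot_item(tool_use_id: str) -> dict:
    # Illustrative tool_result message carrying one screenshot image block.
    return {
        "role": "user",
        "content": [
            {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": "<base64>"}}
                ],
            }
        ],
    }

messages = [screenshot_item("toolu_1"), screenshot_item("toolu_2"), screenshot_item("toolu_3")]
compress_conversation_images(messages)  # default keep_most_recent_count=2

# The oldest screenshot collapses to a text placeholder; the two newest keep their image blocks.
assert messages[0]["content"][0]["content"] == "screenshot taken"
assert isinstance(messages[1]["content"][0]["content"], list)
assert isinstance(messages[2]["content"][0]["content"], list)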
