From 4d6c4e4101c80f159650490551ceba34090bf8c8 Mon Sep 17 00:00:00 2001
From: tkattkat <tkat@tkat.net>
Date: Tue, 22 Jul 2025 14:48:11 -0700
Subject: [PATCH 1/4] remove all images except the latest two in
 AnthropicCuaClient

---
 stagehand/agent/agent.py                   |  3 +
 stagehand/agent/anthropic_cua.py           | 18 +++--
 stagehand/agent/image_compression_utils.py | 93 ++++++++++++++++++++++
 3 files changed, 108 insertions(+), 6 deletions(-)
 create mode 100644 stagehand/agent/image_compression_utils.py

diff --git a/stagehand/agent/agent.py b/stagehand/agent/agent.py
index c3f0e97b..c5bd429e 100644
--- a/stagehand/agent/agent.py
+++ b/stagehand/agent/agent.py
@@ -19,11 +19,13 @@
     "computer-use-preview": OpenAICUAClient,
     "claude-3-5-sonnet-latest": AnthropicCUAClient,
     "claude-3-7-sonnet-latest": AnthropicCUAClient,
+    "claude-sonnet-4-20250514": AnthropicCUAClient,
 }
 MODEL_TO_PROVIDER_MAP: dict[str, AgentProvider] = {
     "computer-use-preview": AgentProvider.OPENAI,
     "claude-3-5-sonnet-20240620": AgentProvider.ANTHROPIC,
     "claude-3-7-sonnet-20250219": AgentProvider.ANTHROPIC,
+    "claude-sonnet-4-20250514": AgentProvider.ANTHROPIC,
     # Add more mappings as needed
 }
 
@@ -84,6 +86,7 @@ def _get_client(self) -> AgentClient:
             logger=self.logger,
             handler=self.cua_handler,
             viewport=self.viewport,
+            experimental=self.stagehand.experimental,
         )
 
     async def execute(
diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py
index edcdd39e..a77d5d8d 100644
--- a/stagehand/agent/anthropic_cua.py
+++ b/stagehand/agent/anthropic_cua.py
@@ -18,6 +18,7 @@
     Point,
 )
 from .client import AgentClient
+from .image_compression_utils import compress_conversation_images
 
 load_dotenv()
 
@@ -51,9 +52,11 @@ def __init__(
         logger: Optional[Any] = None,
         handler: Optional[CUAHandler] = None,
         viewport: Optional[dict[str, int]] = None,
+        experimental: bool = False,
         **kwargs,
     ):
         super().__init__(model, instructions, config, logger, handler)
+        self.experimental = experimental
         self.anthropic_sdk_client = Anthropic(
             api_key=config.options.get("apiKey") or os.getenv("ANTHROPIC_API_KEY")
         )
@@ -67,14 +70,14 @@ def __init__(
             if hasattr(self.config, "display_height") and self.config.display_height is not None:  # type: ignore
                 dimensions[1] = self.config.display_height  # type: ignore
         computer_tool_type = (
-            "computer_20250124"
-            if model == "claude-3-7-sonnet-latest"
-            else "computer_20241022"
+            "computer_20241022"
+            if model == "claude-3-5-sonnet-latest"
+            else "computer_20250124"
         )
         self.beta_flag = (
-            ["computer-use-2025-01-24"]
-            if model == "claude-3-7-sonnet-latest"
-            else ["computer-use-2024-10-22"]
+            ["computer-use-2024-10-22"]
+            if model == "claude-3-5-sonnet-latest"
+            else ["computer-use-2025-01-24"]
         )
         self.tools = [
             {
@@ -162,6 +165,9 @@ async def run_task(
 
             start_time = asyncio.get_event_loop().time()
             try:
+                if self.experimental:
+                    compress_conversation_images(current_messages)
+                    
                 response = self.anthropic_sdk_client.beta.messages.create(
                     model=self.model,
                     max_tokens=self.max_tokens,
diff --git a/stagehand/agent/image_compression_utils.py b/stagehand/agent/image_compression_utils.py
new file mode 100644
index 00000000..a339b5ab
--- /dev/null
+++ b/stagehand/agent/image_compression_utils.py
@@ -0,0 +1,93 @@
+from typing import List, Dict, Any, Union
+
+
+def find_items_with_images(items: List[Dict[str, Any]]) -> List[int]:
+    """
+    Finds all items in the conversation history that contain images
+    
+    Args:
+        items: Array of conversation items to check
+        
+    Returns:
+        Array of indices where images were found
+    """
+    items_with_images = []
+    
+    for index, item in enumerate(items):
+        has_image = False
+        
+        if isinstance(item.get("content"), list):
+            has_image = any(
+                content_item.get("type") == "tool_result"
+                and "content" in content_item
+                and isinstance(content_item["content"], list)
+                and any(
+                    nested_item.get("type") == "image"
+                    for nested_item in content_item["content"]
+                    if isinstance(nested_item, dict)
+                )
+                for content_item in item["content"]
+                if isinstance(content_item, dict)
+            )
+        
+        if has_image:
+            items_with_images.append(index)
+    
+    return items_with_images
+
+
+def compress_conversation_images(
+    items: List[Dict[str, Any]], 
+    keep_most_recent_count: int = 2
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Compresses conversation history by removing images from older items
+    while keeping the most recent images intact
+    
+    Args:
+        items: Array of conversation items to process
+        keep_most_recent_count: Number of most recent image-containing items to preserve (default: 2)
+        
+    Returns:
+        Dictionary with processed items
+    """
+    items_with_images = find_items_with_images(items)
+    
+    for index, item in enumerate(items):
+        image_index = -1
+        if index in items_with_images:
+            image_index = items_with_images.index(index)
+        
+        should_compress = (
+            image_index >= 0 
+            and image_index < len(items_with_images) - keep_most_recent_count
+        )
+        
+        if should_compress:
+            if isinstance(item.get("content"), list):
+                new_content = []
+                for content_item in item["content"]:
+                    if isinstance(content_item, dict):
+                        if (
+                            content_item.get("type") == "tool_result"
+                            and "content" in content_item
+                            and isinstance(content_item["content"], list)
+                            and any(
+                                nested_item.get("type") == "image"
+                                for nested_item in content_item["content"]
+                                if isinstance(nested_item, dict)
+                            )
+                        ):
+                            # Replace the content with a text placeholder
+                            new_content.append({
+                                **content_item,
+                                "content": "screenshot taken"
+                            })
+                        else:
+                            new_content.append(content_item)
+                    else:
+                        new_content.append(content_item)
+                
+                item["content"] = new_content
+    
+    return {"items": items}
\ No newline at end of file

From db8c833890c5e5c744c4d8ef1c523a24a8425063 Mon Sep 17 00:00:00 2001
From: tkattkat <tkat@tkat.net>
Date: Tue, 22 Jul 2025 14:52:07 -0700
Subject: [PATCH 2/4] format code

---
 stagehand/agent/anthropic_cua.py           |  2 +-
 stagehand/agent/image_compression_utils.py | 40 ++++++++++------------
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py
index a77d5d8d..8a896f8a 100644
--- a/stagehand/agent/anthropic_cua.py
+++ b/stagehand/agent/anthropic_cua.py
@@ -167,7 +167,7 @@ async def run_task(
             try:
                 if self.experimental:
                     compress_conversation_images(current_messages)
-                    
+
                 response = self.anthropic_sdk_client.beta.messages.create(
                     model=self.model,
                     max_tokens=self.max_tokens,
diff --git a/stagehand/agent/image_compression_utils.py b/stagehand/agent/image_compression_utils.py
index a339b5ab..7bbf030b 100644
--- a/stagehand/agent/image_compression_utils.py
+++ b/stagehand/agent/image_compression_utils.py
@@ -4,18 +4,18 @@
 def find_items_with_images(items: List[Dict[str, Any]]) -> List[int]:
     """
     Finds all items in the conversation history that contain images
-    
+
     Args:
         items: Array of conversation items to check
-        
+
     Returns:
         Array of indices where images were found
     """
     items_with_images = []
-    
+
     for index, item in enumerate(items):
         has_image = False
-        
+
         if isinstance(item.get("content"), list):
             has_image = any(
                 content_item.get("type") == "tool_result"
@@ -29,40 +29,39 @@ def find_items_with_images(items: List[Dict[str, Any]]) -> List[int]:
                 for content_item in item["content"]
                 if isinstance(content_item, dict)
             )
-        
+
         if has_image:
             items_with_images.append(index)
-    
+
     return items_with_images
 
 
 def compress_conversation_images(
-    items: List[Dict[str, Any]], 
-    keep_most_recent_count: int = 2
+    items: List[Dict[str, Any]], keep_most_recent_count: int = 2
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
     Compresses conversation history by removing images from older items
     while keeping the most recent images intact
-    
+
     Args:
         items: Array of conversation items to process
         keep_most_recent_count: Number of most recent image-containing items to preserve (default: 2)
-        
+
     Returns:
         Dictionary with processed items
     """
     items_with_images = find_items_with_images(items)
-    
+
     for index, item in enumerate(items):
         image_index = -1
         if index in items_with_images:
             image_index = items_with_images.index(index)
-        
+
         should_compress = (
-            image_index >= 0 
+            image_index >= 0
             and image_index < len(items_with_images) - keep_most_recent_count
         )
-        
+
         if should_compress:
             if isinstance(item.get("content"), list):
                 new_content = []
@@ -79,15 +78,14 @@ def compress_conversation_images(
                             )
                         ):
                             # Replace the content with a text placeholder
-                            new_content.append({
-                                **content_item,
-                                "content": "screenshot taken"
-                            })
+                            new_content.append(
+                                {**content_item, "content": "screenshot taken"}
+                            )
                         else:
                             new_content.append(content_item)
                     else:
                         new_content.append(content_item)
-                
+
                 item["content"] = new_content
-    
-    return {"items": items}
\ No newline at end of file
+
+    return {"items": items}

From f86ddcb7d6a9d1eea6e6050056ca9168c0af8b24 Mon Sep 17 00:00:00 2001
From: tkattkat <tkat@tkat.net>
Date: Tue, 22 Jul 2025 14:59:12 -0700
Subject: [PATCH 3/4] add changeset

---
 .changeset/masterful-amiable-leopard.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/masterful-amiable-leopard.md

diff --git a/.changeset/masterful-amiable-leopard.md b/.changeset/masterful-amiable-leopard.md
new file mode 100644
index 00000000..07065b4f
--- /dev/null
+++ b/.changeset/masterful-amiable-leopard.md
@@ -0,0 +1,5 @@
+---
+"stagehand": patch
+---
+
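+Add support for Claude Sonnet 4 (`claude-sonnet-4-20250514`) in the agent and, when `experimental` is enabled, replace all but the two most recent screenshots in the Anthropic CUA client's conversation history with a text placeholder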

From 5b0522343598a2be37a17dcdff9b08fc859ae210 Mon Sep 17 00:00:00 2001
From: tkattkat <tkat@tkat.net>
Date: Tue, 22 Jul 2025 16:21:45 -0700
Subject: [PATCH 4/4] lint

---
 stagehand/agent/image_compression_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/stagehand/agent/image_compression_utils.py b/stagehand/agent/image_compression_utils.py
index 7bbf030b..8f929f0c 100644
--- a/stagehand/agent/image_compression_utils.py
+++ b/stagehand/agent/image_compression_utils.py
@@ -1,7 +1,7 @@
-from typing import List, Dict, Any, Union
+from typing import Any
 
 
-def find_items_with_images(items: List[Dict[str, Any]]) -> List[int]:
+def find_items_with_images(items: list[dict[str, Any]]) -> list[int]:
     """
     Finds all items in the conversation history that contain images
 
@@ -37,8 +37,8 @@ def find_items_with_images(items: List[Dict[str, Any]]) -> List[int]:
 
 
 def compress_conversation_images(
-    items: List[Dict[str, Any]], keep_most_recent_count: int = 2
-) -> Dict[str, List[Dict[str, Any]]]:
+    items: list[dict[str, Any]], keep_most_recent_count: int = 2
+) -> dict[str, list[dict[str, Any]]]:
     """
     Compresses conversation history by removing images from older items
     while keeping the most recent images intact