feat: run OpenAdapt inside Anthropic Computer Use (#934)

abrichr · web-flow · commit 8a79c83aabe3 · 2025-01-02T01:04:47.000-05:00
* import sounddevice inside record_audio()

* utils.get_scaling_factor

* describe_actions.py with PIL

* show text in top left

* max_width = image.width

* dim_outside_window

* add module docstring

* add browser to visualize.py
diff --git a/experiments/describe_actions.py b/experiments/describe_actions.py
@@ -1,38 +1,40 @@
-"""Generate action descriptions."""
+"""Generate natural language descriptions from actions."""
 
 from pprint import pformat
-
 from loguru import logger
-import cv2
+from PIL import Image, ImageDraw
 import numpy as np
 
 from openadapt.db import crud
+from openadapt.plotting import get_font
+from openadapt.utils import get_scaling_factor
+
+scaling_factor = get_scaling_factor()
 
 
 def embed_description(
-    image: np.ndarray,
+    image: Image.Image,
     description: str,
-    x: int = None,
-    y: int = None,
-) -> np.ndarray:
+    x: int = 0,
+    y: int = 0,
+) -> Image.Image:
     """Embed a description into an image at the specified location.
 
     Args:
-        image (np.ndarray): The image to annotate.
+        image (Image.Image): The image to annotate.
         description (str): The text to embed.
-        x (int, optional): The x-coordinate. Defaults to None (centered).
-        y (int, optional): The y-coordinate. Defaults to None (centered).
+        x (int, optional): The x-coordinate. Defaults to 0.
+        y (int, optional): The y-coordinate. Defaults to 0.
 
     Returns:
-        np.ndarray: The annotated image.
+        Image.Image: The annotated image.
     """
-    font = cv2.FONT_HERSHEY_SIMPLEX
-    font_scale = 1
-    font_color = (255, 255, 255)  # White
-    line_type = 1
+    draw = ImageDraw.Draw(image)
+    font_size = 30  # Set font size (2x the default size)
+    font = get_font("Arial.ttf", font_size)
 
     # Split description into multiple lines
-    max_width = 60  # Maximum characters per line
+    max_width = image.width
     words = description.split()
     lines = []
     current_line = []
@@ -45,36 +47,28 @@ def embed_description(
     if current_line:
         lines.append(" ".join(current_line))
 
-    # Default to center if coordinates are not provided
-    if x is None or y is None:
-        x = image.shape[1] // 2
-        y = image.shape[0] // 2
+    # Adjust coordinates for scaling factor
+    x = int(x * scaling_factor)
+    y = int(y * scaling_factor)
 
-    # Draw semi-transparent background and text
+    # Calculate text dimensions and draw semi-transparent background and text
     for i, line in enumerate(lines):
-        text_size, _ = cv2.getTextSize(line, font, font_scale, line_type)
-        text_x = max(0, min(x - text_size[0] // 2, image.shape[1] - text_size[0]))
-        text_y = y + i * 20
+        bbox = draw.textbbox((0, 0), line, font=font)
+        text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
+        text_x = max(0, min(x - text_width // 2, image.width - text_width))
+        text_y = y + i * text_height
 
         # Draw background
-        cv2.rectangle(
-            image,
-            (text_x - 15, text_y - 25),
-            (text_x + text_size[0] + 15, text_y + 15),
-            (0, 0, 0),
-            -1,
+        background_box = (
+            text_x - 15,
+            text_y - 5,
+            text_x + text_width + 15,
+            text_y + text_height + 5,
         )
+        draw.rectangle(background_box, fill=(0, 0, 0, 128))
 
         # Draw text
-        cv2.putText(
-            image,
-            line,
-            (text_x, text_y),
-            font,
-            font_scale,
-            font_color,
-            line_type,
-        )
+        draw.text((text_x, text_y), line, fill=(255, 255, 255), font=font)
 
     return image
 
@@ -88,25 +82,22 @@ def main() -> None:
         for action in action_events:
             description, image = action.prompt_for_description(return_image=True)
 
-            # Convert image to numpy array for OpenCV compatibility
-            image = np.array(image)
+            # Convert image to PIL.Image for compatibility
+            image = Image.fromarray(np.array(image))
 
             if action.mouse_x is not None and action.mouse_y is not None:
                 # Use the mouse coordinates for mouse events
                 annotated_image = embed_description(
                     image,
                     description,
-                    x=int(action.mouse_x) * 2,
-                    y=int(action.mouse_y) * 2,
                 )
             else:
                 # Center the text for other events
                 annotated_image = embed_description(image, description)
 
             logger.info(f"{action=}")
             logger.info(f"{description=}")
-            cv2.imshow("Annotated Image", annotated_image)
-            cv2.waitKey(0)
+            annotated_image.show()  # Opens the annotated image using the default viewer
             descriptions.append(description)
 
         logger.info(f"descriptions=\n{pformat(descriptions)}")
diff --git a/openadapt/models.py b/openadapt/models.py
@@ -573,6 +573,7 @@ def prompt_for_description(self, return_image: bool = False) -> str:
             darken_outside=0.7,
             display_text=False,
             marker_fill_transparency=0,
+            dim_outside_window=False,
         )
 
         if self.text:
diff --git a/openadapt/plotting.py b/openadapt/plotting.py
@@ -228,6 +228,7 @@ def display_event(
     diff: bool = False,
     darken_outside: float | None = None,
     display_text: bool = True,
+    dim_outside_window: bool = True,
 ) -> Image.Image:
     """Display an action event on the image.
 
@@ -247,6 +248,7 @@ def display_event(
           the ellipse for mouse events. Range 0-1, where 1 is completely black.
           Defaults to None (no darkening).
         display_text (bool): Whether to display action text. Defaults to True.
+        dim_outside_window (bool): Whether to dim outside the WindowEvent area.
 
     Returns:
         PIL.Image.Image: The image with the action event displayed on it.
@@ -267,14 +269,15 @@ def display_event(
     width_ratio, height_ratio = utils.get_scale_ratios(action_event)
 
     # dim area outside window event
-    if not window_event:
-        logger.error(f"{window_event=}")
-    else:
-        x0 = window_event.left * width_ratio
-        y0 = window_event.top * height_ratio
-        x1 = x0 + window_event.width * width_ratio
-        y1 = y0 + window_event.height * height_ratio
-        image = draw_rectangle(x0, y0, x1, y1, image, outline_width=5)
+    if dim_outside_window:
+        if not window_event:
+            logger.error(f"{window_event=}")
+        else:
+            x0 = window_event.left * width_ratio
+            y0 = window_event.top * height_ratio
+            x1 = x0 + window_event.width * width_ratio
+            y1 = y0 + window_event.height * height_ratio
+            image = draw_rectangle(x0, y0, x1, y1, image, outline_width=5)
 
     # display diff bbox
     if diff:
diff --git a/openadapt/record.py b/openadapt/record.py
@@ -35,7 +35,6 @@
 
 import numpy as np
 import psutil
-import sounddevice
 import soundfile
 import websockets.sync.server
 import whisper
@@ -1082,6 +1081,8 @@ def record_audio(
 
     audio_frames = []  # to store audio frames
 
+    import sounddevice
+
     def audio_callback(
         indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
     ) -> None:
diff --git a/openadapt/utils.py b/openadapt/utils.py
@@ -1087,6 +1087,18 @@ def get_html_prompt(html: str, convert_to_markdown: bool = False) -> str:
     return str(soup)
 
 
+def get_scaling_factor() -> int:
+    """Return the display scaling factor: AppKit's value on macOS, else 1."""
+    if sys.platform == "darwin":
+        from AppKit import NSScreen
+
+        main_screen = NSScreen.mainScreen()
+        backing_scale = main_screen.backingScaleFactor()
+        logger.info(f"Backing Scale Factor: {backing_scale}")
+        return int(backing_scale)
+    return 1  # Default for Windows/Linux
+
+
 class WrapStdout:
     """Class to be used a target for multiprocessing.Process."""
 
diff --git a/openadapt/visualize.py b/openadapt/visualize.py
@@ -159,6 +159,7 @@ def main(
     recording_id: int = None,
     diff_video: bool = False,
     cleanup: bool = True,
+    browser: str = None,
 ) -> bool:
     """Visualize a recording.
 
@@ -167,6 +168,7 @@ def main(
         recording_id (int, optional): The ID of the recording to visualize.
         diff_video (bool): Whether to diff Screenshots against video frames.
         cleanup (bool): Whether to remove the HTML file after it is displayed.
+        browser (str, optional): Browser executable used to open the output file.
 
     Returns:
         bool: True if visualization was successful, None otherwise.
@@ -442,11 +444,13 @@ def main(
     os.makedirs(RECORDING_DIR_PATH, exist_ok=True)
     output_file(fname_out, title=title)
 
-    result = show(  # noqa: F841
-        layout(
-            rows,
-        )
-    )
+    result = show(layout(rows))  # noqa: F841
+
+    if browser:
+        import subprocess
+
+        logger.info(f"Opening browser with command: {browser}")
+        subprocess.run([browser, f"file://{fname_out}"], check=True)
 
     def _cleanup() -> None:
         os.remove(fname_out)

Original file line number	Diff line number	Diff line change
`@@ -573,6 +573,7 @@ def prompt_for_description(self, return_image: bool = False) -> str:`
`573`	`573`	`darken_outside=0.7,`
`574`	`574`	`display_text=False,`
`575`	`575`	`marker_fill_transparency=0,`
	`576`	`+ dim_outside_window=False,`
`576`	`577`	`)`
`577`	`578`
`578`	`579`	`if self.text:`