diff --git a/experiments/describe_action.py b/experiments/describe_action.py
new file mode 100644
index 000000000..81a79f67c
--- /dev/null
+++ b/experiments/describe_action.py
@@ -0,0 +1,116 @@
+"""Generate action descriptions."""
+
+from pprint import pformat
+
+from loguru import logger
+import cv2
+import numpy as np
+
+from openadapt.db import crud
+
+
+def embed_description(
+    image: np.ndarray,
+    description: str,
+    x: int | None = None,
+    y: int | None = None,
+) -> np.ndarray:
+    """Embed a description into an image at the specified location.
+
+    Args:
+        image (np.ndarray): The image to annotate.
+        description (str): The text to embed.
+        x (int | None, optional): The x-coordinate. Defaults to None (centered).
+        y (int | None, optional): The y-coordinate. Defaults to None (centered).
+
+    Returns:
+        np.ndarray: The annotated image.
+    """
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 1
+    font_color = (255, 255, 255)  # White
+    thickness = 1
+
+    # Split description into multiple lines
+    max_width = 60  # Maximum characters per line
+    words = description.split()
+    lines = []
+    current_line = []
+    for word in words:
+        if len(" ".join(current_line + [word])) <= max_width:
+            current_line.append(word)
+        else:
+            lines.append(" ".join(current_line))
+            current_line = [word]
+    if current_line:
+        lines.append(" ".join(current_line))
+
+    # Default to center if coordinates are not provided
+    if x is None or y is None:
+        x = image.shape[1] // 2
+        y = image.shape[0] // 2
+
+    # Draw semi-transparent background and text
+    for i, line in enumerate(lines):
+        text_size, _ = cv2.getTextSize(line, font, font_scale, thickness)
+        text_x = max(0, min(x - text_size[0] // 2, image.shape[1] - text_size[0]))
+        text_y = y + i * 20
+
+        # Draw background
+        cv2.rectangle(
+            image,
+            (text_x - 15, text_y - 25),
+            (text_x + text_size[0] + 15, text_y + 15),
+            (0, 0, 0),
+            -1,
+        )
+
+        # Draw text
+        cv2.putText(
+            image,
+            line,
+            (text_x, text_y),
+            font,
+            font_scale,
+            font_color,
+            thickness,
+        )
+
+    return image
+
+
+def main() -> None:
+    """Main function."""
+    with crud.get_new_session(read_only=True) as session:
+        recording = crud.get_latest_recording(session)
+        action_events = recording.processed_action_events
+        descriptions = []
+        for action in action_events:
+            description, image = action.prompt_for_description(return_image=True)
+
+            # Convert image to numpy array for OpenCV compatibility
+            image = np.array(image)
+
+            if action.mouse_x is not None and action.mouse_y is not None:
+                # Use the mouse coordinates (2x assumes a retina display scale)
+                annotated_image = embed_description(
+                    image,
+                    description,
+                    x=int(action.mouse_x) * 2,
+                    y=int(action.mouse_y) * 2,
+                )
+            else:
+                # Center the text for other events
+                annotated_image = embed_description(image, description)
+
+            logger.info(f"{action=}")
+            logger.info(f"{description=}")
+            cv2.imshow("Annotated Image", annotated_image)
+            cv2.waitKey(0)
+            descriptions.append(description)
+
+        logger.info(f"descriptions=\n{pformat(descriptions)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/openadapt/config.defaults.json b/openadapt/config.defaults.json
index ef1d15608..1f935ca3f 100644
--- a/openadapt/config.defaults.json
+++ b/openadapt/config.defaults.json
@@ -17,7 +17,7 @@
     "RECORD_READ_ACTIVE_ELEMENT_STATE": false,
     "REPLAY_STRIP_ELEMENT_STATE": true,
     "RECORD_VIDEO": true,
-    "RECORD_AUDIO": true,
+    "RECORD_AUDIO": false,
     "RECORD_BROWSER_EVENTS": false,
     "RECORD_FULL_VIDEO": false,
     "RECORD_IMAGES": false,
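Note: embed_description above is self-contained OpenCV code, so it can be smoke-tested without a recording. A minimal sketch, assuming the OpenAdapt environment is installed and run from the repository root (the output filename is illustrative):

    import cv2
    import numpy as np

    from experiments.describe_action import embed_description

    canvas = np.zeros((400, 600, 3), dtype=np.uint8)  # blank black canvas
    annotated = embed_description(canvas, "Left click 'OK button'", x=300, y=200)
    cv2.imwrite("annotated_example.png", annotated)  # illustrative output path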
diff --git a/openadapt/db/crud.py b/openadapt/db/crud.py
index b70ec98da..39a4f7677 100644
--- a/openadapt/db/crud.py
+++ b/openadapt/db/crud.py
@@ -337,16 +337,18 @@ def get_all_scrubbed_recordings(
 
 
 def get_latest_recording(session: SaSession) -> Recording:
-    """Get the latest recording.
-
-    Args:
-        session (sa.orm.Session): The database session.
-
-    Returns:
-        Recording: The latest recording object.
-    """
+    """Get the latest recording with preloaded relationships."""
     return (
-        session.query(Recording).order_by(sa.desc(Recording.timestamp)).limit(1).first()
+        session.query(Recording)
+        .options(
+            sa.orm.joinedload(Recording.screenshots),
+            sa.orm.joinedload(Recording.action_events)
+            .joinedload(ActionEvent.screenshot)
+            .joinedload(Screenshot.recording),
+            sa.orm.joinedload(Recording.window_events),
+        )
+        .order_by(sa.desc(Recording.timestamp))
+        .first()
     )
diff --git a/openadapt/drivers/anthropic.py b/openadapt/drivers/anthropic.py
index 7799f07ae..b863fc435 100644
--- a/openadapt/drivers/anthropic.py
+++ b/openadapt/drivers/anthropic.py
@@ -5,14 +5,14 @@
 from PIL import Image
 import anthropic
 
-from openadapt import cache, utils
+from openadapt import cache
 from openadapt.config import config
 from openadapt.custom_logger import logger
 
 MAX_TOKENS = 4096  # from https://docs.anthropic.com/claude/docs/vision
 MAX_IMAGES = 20
-MODEL_NAME = "claude-3-opus-20240229"
+MODEL_NAME = "claude-3-5-sonnet-20241022"
 
 
 @cache.cache()
@@ -24,6 +24,8 @@ def create_payload(
     max_tokens: int | None = None,
 ) -> dict:
     """Creates the payload for the Anthropic API request with image support."""
+    from openadapt import utils
+
     messages = []
 
     user_message_content = []
@@ -36,7 +38,7 @@ def create_payload(
     # Add base64 encoded images to the user message content
     if images:
         for image in images:
-            image_base64 = utils.image2utf8(image)
+            image_base64 = utils.image2utf8(image, "PNG")
             # Extract media type and base64 data
             # TODO: don't add it to begin with
             media_type, image_base64_data = image_base64.split(";base64,", 1)
@@ -90,7 +92,7 @@ def get_completion(
     """Sends a request to the Anthropic API and returns the response."""
     client = anthropic.Anthropic(api_key=api_key)
     try:
-        response = client.messages.create(**payload)
+        response = client.beta.messages.create(**payload)
     except Exception as exc:
         logger.exception(exc)
         if dev_mode:
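Note: the joinedload options above fetch the recording and its relationships eagerly in one query, so the returned object should remain usable after the session closes. A minimal sketch of the intent (detached access is an assumption based on the preloading):

    from openadapt.db import crud

    with crud.get_new_session(read_only=True) as session:
        recording = crud.get_latest_recording(session)

    # These relationships were eagerly loaded, so no live session is required here
    print(len(recording.screenshots))
    print(len(recording.action_events))
    print(len(recording.window_events))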
diff --git a/openadapt/models.py b/openadapt/models.py
index 2f42f17fb..b2e9a4224 100644
--- a/openadapt/models.py
+++ b/openadapt/models.py
@@ -7,6 +7,7 @@
 import copy
 import io
 import sys
+import textwrap
 
 from bs4 import BeautifulSoup
 from pynput import keyboard
@@ -16,6 +17,7 @@
 
 from openadapt.config import config
 from openadapt.custom_logger import logger
+from openadapt.drivers import anthropic
 from openadapt.db import db
 from openadapt.privacy.base import ScrubbingProvider, TextScrubbingMixin
 from openadapt.privacy.providers import ScrubProvider
@@ -110,6 +112,9 @@ def processed_action_events(self) -> list:
         if not self._processed_action_events:
             session = crud.get_new_session(read_only=True)
             self._processed_action_events = events.get_events(session, self)
+            # Preload screenshots to avoid lazy loading later
+            for event in self._processed_action_events:
+                event.screenshot
         return self._processed_action_events
 
     def scrub(self, scrubber: ScrubbingProvider) -> None:
@@ -125,6 +130,7 @@ class ActionEvent(db.Base):
     """Class representing an action event in the database."""
 
     __tablename__ = "action_event"
+    _repr_ignore_attrs = ["reducer_names"]
 
     _segment_description_separator = ";"
 
@@ -333,6 +339,11 @@ def canonical_text(self, value: str) -> None:
         if not value == self.canonical_text:
             logger.warning(f"{value=} did not match {self.canonical_text=}")
 
+    @property
+    def raw_text(self) -> str:
+        """Return a string containing the raw action text (without separators)."""
+        return "".join(self.text.split(config.ACTION_TEXT_SEP))
+
     def __str__(self) -> str:
         """Return a string representation of the action event."""
         attr_names = [
@@ -544,6 +555,75 @@ def next_event(self) -> Union["ActionEvent", None]:
 
         return None
 
+    def prompt_for_description(self, return_image: bool = False) -> Union[str, tuple]:
+        """Use the Anthropic API to describe what is happening in the action event.
+
+        Args:
+            return_image (bool): Whether to return the image sent to the model.
+
+        Returns:
+            Union[str, tuple]: The description, or (description, image) if requested.
+        """
+        from openadapt.plotting import display_event
+
+        image = display_event(
+            self,
+            marker_width_pct=0.05,
+            marker_height_pct=0.05,
+            darken_outside=0.7,
+            display_text=False,
+            marker_fill_transparency=0,
+        )
+
+        if self.text:
+            description = f"Type '{self.raw_text}'"
+        else:
+            prompt = (
+                "What user interface element is contained in the highlighted circle "
+                "of the image?"
+            )
+            # TODO: disambiguate
+            system_prompt = textwrap.dedent(
+                """
+                Briefly describe the user interface element in the screenshot at the
+                highlighted location.
+                For example:
+                - "OK button"
+                - "URL bar"
+                - "Down arrow"
+                DO NOT DESCRIBE ANYTHING OUTSIDE THE HIGHLIGHTED AREA.
+                Do not append anything like "is contained within the highlighted circle
+                in the calculator interface." Just name the user interface element.
+                """
+            )
+
+            logger.info(f"system_prompt=\n{system_prompt}")
+            logger.info(f"prompt=\n{prompt}")
+
+            # Call the Anthropic API
+            element = anthropic.prompt(
+                prompt=prompt,
+                system_prompt=system_prompt,
+                images=[image],
+            )
+
+            if self.name == "move":
+                description = f"Move mouse to '{element}'"
+            elif self.name == "scroll":
+                # TODO: "scroll to", dx/dy
+                description = f"Scroll mouse on '{element}'"
+            elif "click" in self.name:
+                description = (
+                    f"{self.mouse_button_name.capitalize()} {self.name} '{element}'"
+                )
+            else:
+                raise ValueError(f"Unhandled {self.name=} {self}")
+
+        if return_image:
+            return description, image
+        else:
+            return description
+
 
 class WindowEvent(db.Base):
     """Class representing a window event in the database."""
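Note: prompt_for_description yields strings like "Type 'hello'" for keyboard events (no API call needed) and "Left click 'OK button'" for clicks. A sketch of the intended call pattern (requires a local recording and a configured Anthropic API key; `anthropic.prompt` is the driver's high-level helper, defined outside this diff):

    from openadapt.db import crud

    with crud.get_new_session(read_only=True) as session:
        recording = crud.get_latest_recording(session)
        for action in recording.processed_action_events:
            # Returns a (description, PIL.Image) pair when return_image=True
            description, image = action.prompt_for_description(return_image=True)
            print(description)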
diff --git a/openadapt/plotting.py b/openadapt/plotting.py
index 06eacb612..b0bc0b932 100644
--- a/openadapt/plotting.py
+++ b/openadapt/plotting.py
@@ -12,7 +12,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
-from openadapt import common, contrib, models, utils
+from openadapt import common, models, utils
 from openadapt.config import PERFORMANCE_PLOTS_DIR_PATH, config
 from openadapt.custom_logger import logger
 from openadapt.models import ActionEvent
@@ -226,6 +226,8 @@ def display_event(
     marker_fill_transparency: float = 0.25,
     marker_outline_transparency: float = 0.5,
     diff: bool = False,
+    darken_outside: float | None = None,
+    display_text: bool = True,
 ) -> Image.Image:
     """Display an action event on the image.
 
@@ -241,6 +243,10 @@ def display_event(
             marker outline. Defaults to 0.5.
         diff (bool): Flag indicating whether to display the diff image.
             Defaults to False.
+        darken_outside (float | None): How much to darken the areas outside
+            the ellipse for mouse events. Range 0-1, where 1 is completely black.
+            Defaults to None (no darkening).
+        display_text (bool): Whether to display action text. Defaults to True.
 
     Returns:
         PIL.Image.Image: The image with the action event displayed on it.
@@ -292,14 +298,50 @@ def display_event(
     if action_event.name in common.MOUSE_EVENTS:
         x = action_event.mouse_x * width_ratio
         y = action_event.mouse_y * height_ratio
-        image, ellipse_width, ellipse_height = draw_ellipse(x, y, image)
+        image, ellipse_width, ellipse_height = draw_ellipse(
+            x,
+            y,
+            image,
+            width_pct=marker_width_pct,
+            height_pct=marker_height_pct,
+            fill_transparency=marker_fill_transparency,
+            outline_transparency=marker_outline_transparency,
+        )
+
+        # Apply darkening outside the ellipse if darken_outside is set
+        if darken_outside is not None:
+            darken_outside = max(0, min(darken_outside, 1))  # Ensure between 0 and 1
+            mask = Image.new("L", image.size, 255)  # Start with a white mask
+            draw = ImageDraw.Draw(mask)
+            # Black ellipse
+            draw.ellipse(
+                (
+                    x - ellipse_width / 2,
+                    y - ellipse_height / 2,
+                    x + ellipse_width / 2,
+                    y + ellipse_height / 2,
+                ),
+                fill=0,
+            )
+            # Create a transparent dark overlay
+            overlay = Image.new(
+                "RGBA", image.size, (0, 0, 0, int(255 * darken_outside))
+            )
+            # Use the mask to apply the overlay only outside the ellipse
+            image = Image.alpha_composite(
+                image,
+                Image.composite(
+                    overlay, Image.new("RGBA", image.size, (0, 0, 0, 0)), mask
+                ),
+            )
 
         # draw text
         dx = action_event.mouse_dx or 0
         dy = action_event.mouse_dy or 0
         d_text = f" {dx=} {dy=}" if dx or dy else ""
         text = f"{action_event.name}{d_text}"
-        image = draw_text(x, y + ellipse_height / 2, text, image)
+        if display_text:
+            image = draw_text(x, y + ellipse_height / 2, text, image)
     elif action_event.name in common.KEY_EVENTS:
         x = recording.monitor_width * width_ratio / 2
         y = recording.monitor_height * height_ratio / 2
@@ -322,7 +364,8 @@ def display_event(
             " original text."
         )
 
-        image = draw_text(x, y, text, image, outline=True)
+        if display_text:
+            image = draw_text(x, y, text, image, outline=True)
     else:
         raise Exception("unhandled {action_event.name=}")
 
@@ -792,6 +835,8 @@ def get_marked_image(
         Image.Image: The marked image, where marks and/or masks are applied
         based on the specified confidence and IoU thresholds and the include flags.
     """
+    from openadapt import contrib
+
     image_arr = np.asarray(original_image)
 
     # The rest of this function is copied from
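Note: the darkening logic composites a semi-transparent black overlay everywhere the mask is white (outside the ellipse) and leaves the ellipse interior untouched. It can be sanity-checked in isolation with pure PIL; the synthetic image and output path below are illustrative:

    from PIL import Image, ImageDraw

    image = Image.new("RGBA", (200, 200), (120, 180, 240, 255))
    x, y, ellipse_width, ellipse_height = 100, 100, 80, 60
    darken_outside = 0.7

    mask = Image.new("L", image.size, 255)  # white: overlay applies
    draw = ImageDraw.Draw(mask)
    draw.ellipse(
        (
            x - ellipse_width / 2,
            y - ellipse_height / 2,
            x + ellipse_width / 2,
            y + ellipse_height / 2,
        ),
        fill=0,  # black: ellipse interior keeps original pixels
    )
    overlay = Image.new("RGBA", image.size, (0, 0, 0, int(255 * darken_outside)))
    image = Image.alpha_composite(
        image,
        Image.composite(overlay, Image.new("RGBA", image.size, (0, 0, 0, 0)), mask),
    )
    image.save("darken_demo.png")  # illustrative output path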
""" + KNOWN_FORMATS = ("JPEG", "PNG") + assert image_format in KNOWN_FORMATS, (image_format, KNOWN_FORMATS) if not image: return "" image = image.convert("RGB") buffered = BytesIO() - image.save(buffered, format="JPEG") + image.save(buffered, format=image_format.upper()) image_str = base64.b64encode(buffered.getvalue()) - base64_prefix = bytes("data:image/jpeg;base64,", encoding="utf-8") + fmt = image_format.lower() + base64_prefix = bytes(f"data:image/{fmt};base64,", encoding="utf-8") image_base64 = base64_prefix + image_str image_utf8 = image_base64.decode("utf-8") return image_utf8 diff --git a/poetry.lock b/poetry.lock index 4a204ac3c..bcf9a6433 100644 --- a/poetry.lock +++ b/poetry.lock @@ -190,13 +190,13 @@ files = [ [[package]] name = "anthropic" -version = "0.34.2" +version = "0.42.0" description = "The official Python library for the anthropic API" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "anthropic-0.34.2-py3-none-any.whl", hash = "sha256:f50a628eb71e2c76858b106c8cbea278c45c6bd2077cb3aff716a112abddc9fc"}, - {file = "anthropic-0.34.2.tar.gz", hash = "sha256:808ea19276f26646bfde9ee535669735519376e4eeb301a2974fc69892be1d6e"}, + {file = "anthropic-0.42.0-py3-none-any.whl", hash = "sha256:46775f65b723c078a2ac9e9de44a46db5c6a4fabeacfd165e5ea78e6817f4eff"}, + {file = "anthropic-0.42.0.tar.gz", hash = "sha256:bf8b0ed8c8cb2c2118038f29c58099d2f99f7847296cafdaa853910bfff4edf4"}, ] [package.dependencies] @@ -206,8 +206,7 @@ httpx = ">=0.23.0,<1" jiter = ">=0.4.0,<1" pydantic = ">=1.9.0,<3" sniffio = "*" -tokenizers = ">=0.13.0" -typing-extensions = ">=4.7,<5" +typing-extensions = ">=4.10,<5" [package.extras] bedrock = ["boto3 (>=1.28.57)", "botocore (>=1.31.57)"] @@ -2065,10 +2064,7 @@ grpcio-status = [ {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, ] -proto-plus = [ - {version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""}, - {version = ">=1.22.3,<2.0.0dev", markers = "python_version < \"3.13\""}, -] +proto-plus = ">=1.22.3,<2.0.0dev" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" requests = ">=2.18.0,<3.0.0.dev0" @@ -4294,8 +4290,7 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, ] @@ -4318,8 +4313,7 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, ] @@ -5298,10 +5292,7 
diff --git a/poetry.lock b/poetry.lock
index 4a204ac3c..bcf9a6433 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -190,13 +190,13 @@ files = [
 
 [[package]]
 name = "anthropic"
-version = "0.34.2"
+version = "0.42.0"
 description = "The official Python library for the anthropic API"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "anthropic-0.34.2-py3-none-any.whl", hash = "sha256:f50a628eb71e2c76858b106c8cbea278c45c6bd2077cb3aff716a112abddc9fc"},
-    {file = "anthropic-0.34.2.tar.gz", hash = "sha256:808ea19276f26646bfde9ee535669735519376e4eeb301a2974fc69892be1d6e"},
+    {file = "anthropic-0.42.0-py3-none-any.whl", hash = "sha256:46775f65b723c078a2ac9e9de44a46db5c6a4fabeacfd165e5ea78e6817f4eff"},
+    {file = "anthropic-0.42.0.tar.gz", hash = "sha256:bf8b0ed8c8cb2c2118038f29c58099d2f99f7847296cafdaa853910bfff4edf4"},
 ]
 
 [package.dependencies]
@@ -206,8 +206,7 @@ httpx = ">=0.23.0,<1"
 jiter = ">=0.4.0,<1"
 pydantic = ">=1.9.0,<3"
 sniffio = "*"
-tokenizers = ">=0.13.0"
-typing-extensions = ">=4.7,<5"
+typing-extensions = ">=4.10,<5"
 
 [package.extras]
 bedrock = ["boto3 (>=1.28.57)", "botocore (>=1.31.57)"]
@@ -2065,10 +2064,7 @@ grpcio-status = [
     {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
     {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
 ]
-proto-plus = [
-    {version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""},
-    {version = ">=1.22.3,<2.0.0dev", markers = "python_version < \"3.13\""},
-]
+proto-plus = ">=1.22.3,<2.0.0dev"
 protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0
 requests = ">=2.18.0,<3.0.0.dev0"
@@ -4294,8 +4290,7 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
     {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
 ]
@@ -4318,8 +4313,7 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
     {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
 ]
@@ -5298,10 +5292,7 @@ files = [
 
 [package.dependencies]
 annotated-types = ">=0.6.0"
 pydantic-core = "2.23.4"
-typing-extensions = [
-    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
-    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
-]
+typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""}
 
 [package.extras]
 email = ["email-validator (>=2.0.0)"]
@@ -5561,8 +5552,7 @@ files = [
 astroid = ">=3.3.4,<=3.4.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.3.7", markers = "python_version >= \"3.12\""},
-    {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
     {version = ">=0.2", markers = "python_version < \"3.11\""},
 ]
 isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
@@ -9228,5 +9218,5 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 
 [metadata]
 lock-version = "2.0"
-python-versions = ">=3.10,<3.14"
-content-hash = "9b98c96a50451de3edfedc5627e895a74c84f6af3af535fee09a343806e3fbdf"
+python-versions = ">=3.10,<3.12"
+content-hash = "62daa68ec7dec0a4bba61359258025611ce0e48cc3c16a195d9804336e5dbbe9"
diff --git a/pyproject.toml b/pyproject.toml
index ce1c617b8..161a60e75 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "openadapt"
 version = "0.43.1"
-description = "GUI Process Automation with Transformers"
+description = "Generative Process Automation"
 authors = [
-    'OpenAdapt.AI Team ',
+    'Richard Abrich ',
 ]
 classifiers = [
     "Programming Language :: Python :: 3",
@@ -12,14 +12,14 @@ classifiers = [
 
 readme = "README.md"
 
-repository = "https://github.com/mldsai/openadapt"
+repository = "https://github.com/OpenAdaptAI/OpenAdapt"
 homepage = "https://openadapt.ai/"
 
 [tool.poetry.urls]
-"Bug Tracker" = "https://github.com/MLDSAI/OpenAdapt/issues"
+"Bug Tracker" = "https://github.com/OpenAdaptAI/OpenAdapt/issues"
 
 [tool.poetry.dependencies]
-python = ">=3.10,<3.14"
+python = ">=3.10,<3.12"
 alembic = "1.8.1"
 black = "^24.8.0"
 pygetwindow = { version = "<0.0.5", markers = "sys_platform == 'win32'" }
@@ -85,7 +85,7 @@ boto3 = "^1.28.30"
 botocore = "^1.31.30"
 easyocr = "^1.7.0"
 spacy-curated-transformers = "^0.2.0"
-anthropic = "^0.34.2"
+anthropic = "0.42.0"
 orjson = "^3.9.15"
 replicate = "^0.25.0"
 gradio-client = "0.15.0"
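Note: with anthropic now pinned exactly, a one-line sanity check after poetry install (this assumes the SDK exposes __version__, which recent releases do):

    import anthropic

    print(anthropic.__version__)  # expected: 0.42.0 per the lock file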