77import copy
88import io
99import sys
10+ import textwrap
1011
1112from bs4 import BeautifulSoup
1213from pynput import keyboard
1617
1718from openadapt .config import config
1819from openadapt .custom_logger import logger
20+ from openadapt .drivers import anthropic
1921from openadapt .db import db
2022from openadapt .privacy .base import ScrubbingProvider , TextScrubbingMixin
2123from openadapt .privacy .providers import ScrubProvider
@@ -110,6 +112,9 @@ def processed_action_events(self) -> list:
110112 if not self ._processed_action_events :
111113 session = crud .get_new_session (read_only = True )
112114 self ._processed_action_events = events .get_events (session , self )
115+ # Preload screenshots to avoid lazy loading later
116+ for event in self ._processed_action_events :
117+ event .screenshot
113118 return self ._processed_action_events
114119
115120 def scrub (self , scrubber : ScrubbingProvider ) -> None :
@@ -125,6 +130,7 @@ class ActionEvent(db.Base):
125130 """Class representing an action event in the database."""
126131
127132 __tablename__ = "action_event"
133+ _repr_ignore_attrs = ["reducer_names" ]
128134
129135 _segment_description_separator = ";"
130136
@@ -333,6 +339,10 @@ def canonical_text(self, value: str) -> None:
333339 if not value == self .canonical_text :
334340 logger .warning (f"{ value = } did not match { self .canonical_text = } " )
335341
@property
def raw_text(self) -> str:
    """Return the event's text with every action-text separator removed.

    Returns:
        str: ``self.text`` with all occurrences of
            ``config.ACTION_TEXT_SEP`` stripped out, or an empty string
            when the event has no text.
    """
    text = self.text
    # Callers test ``self.text`` for truthiness, so it may be None or
    # empty; return an empty string instead of failing on ``None.split``.
    if not text:
        return ""
    return "".join(text.split(config.ACTION_TEXT_SEP))
345+
336346 def __str__ (self ) -> str :
337347 """Return a string representation of the action event."""
338348 attr_names = [
@@ -544,6 +554,75 @@ def next_event(self) -> Union["ActionEvent", None]:
544554
545555 return None
546556
def prompt_for_description(
    self, return_image: bool = False
) -> Union[str, tuple]:
    """Use the Anthropic API to describe what is happening in the action event.

    Events that carry text are described directly as ``Type '<raw_text>'``
    without querying the model. For all other events the model is asked to
    name the user interface element at the highlighted location of the
    rendered event image, and that name is combined with a phrase derived
    from the event name ("move", "scroll", or any name containing "click").

    Args:
        return_image (bool): Whether to return the image sent to the model.

    Returns:
        str: The description of the action event, when ``return_image`` is
            False.
        tuple: ``(description, image)`` when ``return_image`` is True, where
            ``image`` is the value returned by ``display_event``.

    Raises:
        ValueError: If the event has no text and its name is not "move",
            "scroll", or a name containing "click". This is raised before
            any request is sent to the model.
    """
    # Function-scope import kept from the original implementation.
    from openadapt.plotting import display_event

    image = display_event(
        self,
        marker_width_pct=0.05,
        marker_height_pct=0.05,
        darken_outside=0.7,
        display_text=False,
        marker_fill_transparency=0,
    )

    if self.text:
        description = f"Type '{self.raw_text}'"
    else:
        # Resolve the action phrase first so that an unsupported event name
        # fails fast instead of after a wasted remote model call.
        if self.name == "move":
            action = "Move mouse to"
        elif self.name == "scroll":
            # TODO: "scroll to", dx/dy
            action = "Scroll mouse on"
        elif "click" in self.name:
            action = f"{self.mouse_button_name.capitalize()} {self.name}"
        else:
            raise ValueError(f"Unhandled {self.name=} {self}")

        prompt = (
            "What user interface element is contained in the highlighted circle "
            "of the image?"
        )
        # TODO: disambiguate
        system_prompt = textwrap.dedent(
            """
            Briefly describe the user interface element in the screenshot at the
            highlighted location.
            For example:
            - "OK button"
            - "URL bar"
            - "Down arrow"
            DO NOT DESCRIBE ANYTHING OUTSIDE THE HIGHLIGHTED AREA.
            Do not append anything like "is contained within the highlighted circle
            in the calculator interface." Just name the user interface element.
            """
        )

        logger.info(f"system_prompt=\n{system_prompt}")
        logger.info(f"prompt=\n{prompt}")

        # Call the Anthropic API
        element = anthropic.prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            images=[image],
        )

        description = f"{action} '{element}'"

    if return_image:
        return description, image
    return description
625+
547626
548627class WindowEvent (db .Base ):
549628 """Class representing a window event in the database."""
0 commit comments