77import copy
88import io
99import sys
10+ import textwrap
1011
1112from bs4 import BeautifulSoup
1213from pynput import keyboard
1617
1718from openadapt .config import config
1819from openadapt .custom_logger import logger
20+ from openadapt .drivers import anthropic
1921from openadapt .db import db
2022from openadapt .privacy .base import ScrubbingProvider , TextScrubbingMixin
2123from openadapt .privacy .providers import ScrubProvider
@@ -110,6 +112,9 @@ def processed_action_events(self) -> list:
110112 if not self ._processed_action_events :
111113 session = crud .get_new_session (read_only = True )
112114 self ._processed_action_events = events .get_events (session , self )
115+ # Preload screenshots to avoid lazy loading later
116+ for event in self ._processed_action_events :
117+ event .screenshot
113118 return self ._processed_action_events
114119
115120 def scrub (self , scrubber : ScrubbingProvider ) -> None :
@@ -125,6 +130,7 @@ class ActionEvent(db.Base):
125130 """Class representing an action event in the database."""
126131
127132 __tablename__ = "action_event"
133+ _repr_ignore_attrs = ["reducer_names" ]
128134
129135 _segment_description_separator = ";"
130136
@@ -333,6 +339,11 @@ def canonical_text(self, value: str) -> None:
333339 if not value == self .canonical_text :
334340 logger .warning (f"{ value = } did not match { self .canonical_text = } " )
335341
@property
def raw_text(self) -> str:
    """Return the raw action text with the separators removed.

    The event's ``text`` is split on ``config.ACTION_TEXT_SEP`` and the
    pieces are concatenated back together without the separator.

    Returns:
        str: The text with every separator occurrence stripped, or an empty
            string when the event has no text.
    """
    text = self.text
    if not text:
        # Other code in this class guards on `if self.text:`, i.e. text may be
        # falsy (presumably None for non-keyboard events -- TODO confirm).
        # Return an empty string rather than failing on `.split`.
        return ""
    return "".join(text.split(config.ACTION_TEXT_SEP))
346+
336347 def __str__ (self ) -> str :
337348 """Return a string representation of the action event."""
338349 attr_names = [
@@ -544,6 +555,75 @@ def next_event(self) -> Union["ActionEvent", None]:
544555
545556 return None
546557
def prompt_for_description(
    self, return_image: bool = False
) -> Union[str, tuple]:
    """Describe what is happening in the action event.

    Events that carry text are described locally as ``Type '<raw text>'``.
    For all other events, an image of the event with the action location
    highlighted is sent to the Anthropic API, which names the user interface
    element; that name is combined with the event's name ("move", "scroll",
    or any name containing "click") to form the description.

    Args:
        return_image (bool): Whether to also return the image rendered for
            the event.

    Returns:
        str: The description of the action event, when ``return_image`` is
            False.
        tuple: ``(description, image)`` when ``return_image`` is True, where
            ``image`` is whatever ``display_event`` returns.

    Raises:
        ValueError: If the event has no text and its name is not one of the
            handled mouse actions.
    """

    def render_image():
        """Render the event with its action location highlighted."""
        # Kept as a function-scope import, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        from openadapt.plotting import display_event

        return display_event(
            self,
            marker_width_pct=0.05,
            marker_height_pct=0.05,
            darken_outside=0.7,
            display_text=False,
            marker_fill_transparency=0,
        )

    if self.text:
        description = f"Type '{self.raw_text}'"
        # The image is not sent anywhere for typed events, so only render it
        # when the caller asked to get it back.
        image = render_image() if return_image else None
    else:
        # Resolve the action phrase first so that unsupported events fail
        # before any rendering or API request is made.
        if self.name == "move":
            action = "Move mouse to"
        elif self.name == "scroll":
            # TODO: "scroll to", dx/dy
            action = "Scroll mouse on"
        elif "click" in self.name:
            action = f"{self.mouse_button_name.capitalize()} {self.name}"
        else:
            raise ValueError(f"Unhandled {self.name=} {self}")

        image = render_image()
        prompt = (
            "What user interface element is contained in the highlighted circle "
            "of the image?"
        )
        # TODO: disambiguate
        system_prompt = textwrap.dedent(
            """
            Briefly describe the user interface element in the screenshot at the
            highlighted location.
            For example:
            - "OK button"
            - "URL bar"
            - "Down arrow"
            DO NOT DESCRIBE ANYTHING OUTSIDE THE HIGHLIGHTED AREA.
            Do not append anything like "is contained within the highlighted circle
            in the calculator interface." Just name the user interface element.
            """
        )

        logger.info(f"system_prompt=\n{system_prompt}")
        logger.info(f"prompt=\n{prompt}")

        # Call the Anthropic API
        element = anthropic.prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            images=[image],
        )
        description = f"{action} '{element}'"

    if return_image:
        return description, image
    return description
626+
547627
548628class WindowEvent (db .Base ):
549629 """Class representing a window event in the database."""
0 commit comments