
Commit 81074c4

feat(vanilla): Implement vanilla baseline (#702)

* add vanilla.py; update README.md
* update module docstring
* add TODO; remove unused code
* tabs -> spaces
* modify prompt adapter API to accept images instead of base64 images
* fix typo
* add ActionEvent/WindowEvent.to_prompt_dict
* add prompts/describe_recording.j2; prompts/generate_action_event.j2
* fixes; log action history
* fixes; black; flake8
* add tests/openadapt/adapters
* add missing test assets
* DATA_DIR_PATH -> PARENT_DIR_PATH
* get_completion dev_mode
* anthropic dev_mode
* fix tests
* black
* flake8
* ignore .cache in flake8
* vanilla.INCLUDE_WINDOW_DATA; utils.clean_data/filter_keys
* flake8

1 parent 53f40fb commit 81074c4

File tree

23 files changed: +643 −158 lines

.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion

@@ -63,4 +63,4 @@ jobs:
       run: poetry run black --preview --check . --exclude '/(alembic|\.venv)/'

     - name: Run Flake8
-      run: poetry run flake8 --exclude=alembic,.venv
+      run: poetry run flake8 --exclude=alembic,.venv,*/.cache

README.md

Lines changed: 1 addition & 0 deletions

@@ -191,6 +191,7 @@ python -m openadapt.replay NaiveReplayStrategy
 Other replay strategies include:

 - [`StatefulReplayStrategy`](https://github.com/OpenAdaptAI/OpenAdapt/blob/main/openadapt/strategies/stateful.py): Proof-of-concept which uses the OpenAI GPT-4 API with prompts constructed via OS-level window data.
+- [`VanillaReplayStrategy`](https://github.com/OpenAdaptAI/OpenAdapt/blob/main/openadapt/strategies/vanilla.py): If AGI or GPT6 happens, this script should be able to suddenly do the work. --LunjunZhang
 - [`VisualReplayStrategy`](https://github.com/OpenAdaptAI/OpenAdapt/blob/main/openadapt/strategies/visual.py): Uses [Fast Segment Anything Model (FastSAM)](https://github.com/CASIA-IVA-Lab/FastSAM) to segment active window. Accepts an "instructions" parameter that is used to modify the recording, e.g.:

 ```
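Judging from the `python -m openadapt.replay NaiveReplayStrategy` invocation shown in the hunk header above, the new strategy would presumably be run the same way; this exact command is an assumption, since the diff does not show it:

```
python -m openadapt.replay VanillaReplayStrategy
```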

openadapt/adapters/anthropic.py

Lines changed: 22 additions & 18 deletions

@@ -3,9 +3,10 @@
 from pprint import pprint

 from loguru import logger
+from PIL import Image
 import anthropic

-from openadapt import cache
+from openadapt import cache, utils
 from openadapt.config import config

 MAX_TOKENS = 4096

@@ -18,7 +19,7 @@
 def create_payload(
     prompt: str,
     system_prompt: str | None = None,
-    base64_images: list[tuple[str, str]] | None = None,
+    images: list[Image.Image] | None = None,
     model: str = MODEL_NAME,
     max_tokens: int | None = None,
 ) -> dict:

@@ -33,10 +34,12 @@ def create_payload(
         max_tokens = MAX_TOKENS

     # Add base64 encoded images to the user message content
-    if base64_images:
-        for image_data in base64_images:
+    if images:
+        for image in images:
+            image_base64 = utils.image2utf8(image)
             # Extract media type and base64 data
-            media_type, base64_str = image_data.split(";base64,", 1)
+            # TODO: don't add it to begin with
+            media_type, image_base64_data = image_base64.split(";base64,", 1)
             media_type = media_type.split(":")[-1]  # Remove 'data:' prefix

             user_message_content.append(

@@ -45,7 +48,7 @@ def create_payload(
                     "source": {
                         "type": "base64",
                         "media_type": media_type,
-                        "data": base64_str,
+                        "data": image_base64_data,
                     },
                 }
             )

@@ -80,19 +83,22 @@ def create_payload(
     return payload


-client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
-
-
 @cache.cache()
-def get_completion(payload: dict) -> str:
+def get_completion(
+    payload: dict, dev_mode: bool = False, api_key: str = config.ANTHROPIC_API_KEY
+) -> str:
     """Sends a request to the Anthropic API and returns the response."""
+    client = anthropic.Anthropic(api_key=api_key)
     try:
         response = client.messages.create(**payload)
     except Exception as exc:
         logger.exception(exc)
-        import ipdb
+        if dev_mode:
+            import ipdb

-        ipdb.set_trace()
+            ipdb.set_trace()
+        else:
+            raise
     """
     Message(
         id='msg_01L55ai2A9q92687mmjMSch3',

@@ -125,19 +131,17 @@ def get_completion(payload: dict) -> str:
 def prompt(
     prompt: str,
     system_prompt: str | None = None,
-    base64_images: list[str] | None = None,
+    images: list[Image.Image] | None = None,
     max_tokens: int | None = None,
 ) -> str:
     """Public method to get a response from the Anthropic API with image support."""
-    if len(base64_images) > MAX_IMAGES:
+    if len(images) > MAX_IMAGES:
         # XXX TODO handle this
-        raise Exception(
-            f"{len(base64_images)=} > {MAX_IMAGES=}. Use a different adapter."
-        )
+        raise Exception(f"{len(images)=} > {MAX_IMAGES=}. Use a different adapter.")
     payload = create_payload(
         prompt,
         system_prompt,
-        base64_images,
+        images,
         max_tokens=max_tokens,
     )
     # pprint(f"payload=\n{payload}")  # Log payload for debugging
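To illustrate the API change this diff makes (PIL images in, base64 handled internally by the adapter), here is a minimal caller sketch; the screenshot path and prompt text are hypothetical, and the new `dev_mode`/`api_key` parameters are left at their defaults:

```python
from PIL import Image

from openadapt.adapters import anthropic

# The adapter now accepts PIL images and base64-encodes them itself
# via utils.image2utf8, so callers no longer pre-encode anything.
screenshot = Image.open("screenshot.png")  # hypothetical path
text = anthropic.prompt(
    "Describe the action shown in this screenshot.",  # hypothetical prompt
    system_prompt="You are a GUI automation assistant.",
    images=[screenshot],
)
print(text)
```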

openadapt/adapters/google.py

Lines changed: 19 additions & 23 deletions

@@ -3,13 +3,14 @@
 See https://ai.google.dev/tutorials/python_quickstart for documentation.
 """

-from pprint import pprint
+from pprint import pformat

+from loguru import logger
 from PIL import Image
 import fire
 import google.generativeai as genai

-from openadapt import cache, utils
+from openadapt import cache
 from openadapt.config import config

 MAX_TOKENS = 2**20  # 1048576

@@ -28,46 +29,41 @@
 def prompt(
     prompt: str,
     system_prompt: str | None = None,
-    base64_images: list[str] | None = None,
+    images: list[Image.Image] | None = None,
     # max_tokens: int | None = None,
     model_name: str = MODEL_NAME,
+    timeout: int = 10,
 ) -> str:
     """Public method to get a response from the Google API with image support."""
     full_prompt = "\n\n###\n\n".join([s for s in (system_prompt, prompt) if s])
     # HACK
     full_prompt += "\nWhen responding in JSON, you MUST use double quotes around keys."

-    # TODO: modify API across all adapters to accept PIL.Image
-    images = (
-        [utils.utf82image(base64_image) for base64_image in base64_images]
-        if base64_images
-        else []
-    )
-
     genai.configure(api_key=config.GOOGLE_API_KEY)
     model = genai.GenerativeModel(model_name)
-    response = model.generate_content([full_prompt] + images)
+    response = model.generate_content(
+        [full_prompt] + images, request_options={"timeout": timeout}
+    )
     response.resolve()
-    pprint(f"response=\n{response}")  # Log response for debugging
+    logger.info(f"response=\n{pformat(response)}")
     return response.text


 def main(text: str, image_path: str | None = None) -> None:
     """Prompt Google Gemini with text and a path to an image."""
     if image_path:
-        with Image.open(image_path) as img:
-            # Convert image to RGB if it's RGBA (to remove alpha channel)
-            if img.mode in ("RGBA", "LA") or (
-                img.mode == "P" and "transparency" in img.info
-            ):
-                img = img.convert("RGB")
-            base64_image = utils.image2utf8(img)
+        image = Image.open(image_path)
+        # Convert image to RGB if it's RGBA (to remove alpha channel)
+        if image.mode in ("RGBA", "LA") or (
+            image.mode == "P" and "transparency" in image.info
+        ):
+            image = image.convert("RGB")
     else:
-        base64_image = None
+        image = None

-    base64_images = [base64_image] if base64_image else None
-    output = prompt(text, base64_images=base64_images)
-    print(output)
+    images = [image] if image else None
+    output = prompt(text, images=images)
+    logger.info(output)


 if __name__ == "__main__":
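The Google adapter gets the same `images:` signature plus a new `timeout` parameter (default 10) that is forwarded to `generate_content` as `request_options={"timeout": ...}`. A minimal sketch of a call, with a hypothetical image path and prompt:

```python
from PIL import Image

from openadapt.adapters import google

image = Image.open("screenshot.png").convert("RGB")  # hypothetical path
text = google.prompt(
    "Summarize the visible window contents as JSON.",  # hypothetical prompt
    images=[image],
    timeout=30,  # forwarded to generate_content via request_options
)
print(text)
```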

openadapt/adapters/openai.py

Lines changed: 41 additions & 15 deletions

@@ -3,18 +3,23 @@
 https://platform.openai.com/docs/guides/vision
 """

+from copy import deepcopy
 from pprint import pformat
+from typing import Any

 from loguru import logger
+from PIL import Image
 import requests

-from openadapt import cache
+from openadapt import cache, utils
 from openadapt.config import config

 MODEL_NAME = [
     "gpt-4-vision-preview",
     "gpt-4-turbo-2024-04-09",
+    "gpt-4o",
 ][-1]
+# TODO XXX: per model
 MAX_TOKENS = 4096
 # TODO XXX undocumented
 MAX_IMAGES = None

@@ -23,7 +28,7 @@
 def create_payload(
     prompt: str,
     system_prompt: str | None = None,
-    base64_images: list[str] | None = None,
+    images: list[Image.Image] | None = None,
     model: str = MODEL_NAME,
     detail: str = "high",  # "low" or "high"
     max_tokens: int | None = None,

@@ -33,7 +38,7 @@ def create_payload(
     Args:
         prompt: the prompt
         system_prompt: the system prompt
-        base64_images: list of base64 encoded images
+        images: list of images
         model: name of OpenAI model
         detail: detail level of images, "low" or "high"
         max_tokens: maximum number of tokens

@@ -59,8 +64,9 @@ def create_payload(
         },
     ]

-    base64_images = base64_images or []
-    for base64_image in base64_images:
+    images = images or []
+    for image in images:
+        base64_image = utils.image2utf8(image)
         messages[0]["content"].append(
             {
                 "type": "image_url",

@@ -94,18 +100,22 @@
 @cache.cache()
-def get_response(payload: dict) -> requests.Response:
+def get_response(
+    payload: dict,
+    api_key: str = config.OPENAI_API_KEY,
+) -> requests.Response:
     """Sends a request to the OpenAI API and returns the response.

     Args:
         payload: dictionary returned by create_payload
+        api_key (str): api key

     Returns:
         response from OpenAI API
     """
     headers = {
         "Content-Type": "application/json",
-        "Authorization": f"Bearer {config.OPENAI_API_KEY}",
+        "Authorization": f"Bearer {api_key}",
     }
     response = requests.post(
         "https://api.openai.com/v1/chat/completions",

@@ -115,14 +125,15 @@ def get_response(payload: dict) -> requests.Response:
     return response


-def get_completion(payload: dict) -> str:
+def get_completion(payload: dict, dev_mode: bool = False) -> str:
     """Sends a request to the OpenAI API and returns the first message.

     Args:
-        pyalod: dictionary returned by create_payload
+        payload (dict): dictionary returned by create_payload
+        dev_mode (bool): whether to launch a debugger on error

     Returns:
-        string containing the first message from the response
+        (str) first message from the response
     """
     response = get_response(payload)
     result = response.json()

@@ -133,22 +144,37 @@ def get_completion(payload: dict) -> str:
     # TODO: fail after maximum number of attempts
     if "retry your request" in message:
         return get_completion(payload)
-    else:
+    elif dev_mode:
         import ipdb

         ipdb.set_trace()
         # TODO: handle more errors
+    else:
+        raise ValueError(result["error"]["message"])
     choices = result["choices"]
     choice = choices[0]
     message = choice["message"]
     content = message["content"]
     return content


+def log_payload(payload: dict[Any, Any]) -> None:
+    """Logs a payload after removing base-64 encoded values recursively."""
+    # TODO: detect base64 encoded strings dynamically
+    # messages["content"][{"image_url": ...
+    # payload["messages"][1]["content"][9]["image_url"]
+    payload_copy = deepcopy(payload)
+    for message in payload_copy["messages"]:
+        for content in message["content"]:
+            if "image_url" in content:
+                content["image_url"]["url"] = "[REDACTED]"
+    logger.info(f"payload=\n{pformat(payload_copy)}")
+
+
 def prompt(
     prompt: str,
     system_prompt: str | None = None,
-    base64_images: list[str] | None = None,
+    images: list[Image.Image] | None = None,
     max_tokens: int | None = None,
     detail: str = "high",
 ) -> str:

@@ -157,7 +183,7 @@ def prompt(
     Args:
         prompt: the prompt
         system_prompt: the system prompt
-        base64_images: list of base64 encoded images
+        images: list of images
         model: name of OpenAI model
         detail: detail level of images, "low" or "high"
         max_tokens: maximum number of tokens

@@ -168,11 +194,11 @@
     payload = create_payload(
         prompt,
         system_prompt,
-        base64_images,
+        images,
         max_tokens=max_tokens,
         detail=detail,
     )
-    logger.info(f"payload=\n{pformat(payload)}")
+    log_payload(payload)
     result = get_completion(payload)
     logger.info(f"result=\n{pformat(result)}")
     return result
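All three adapters now delegate encoding to `utils.image2utf8`, whose implementation is not part of this diff. The anthropic hunk splits its return value on `";base64,"` and strips a `data:` prefix, so the helper presumably returns a data URI; here is a rough sketch under that assumption, not the actual implementation:

```python
import base64
import io

from PIL import Image


def image2utf8(image: Image.Image) -> str:
    """Encode a PIL image as a data URI string (assumed behavior)."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"
```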

openadapt/capture/_macos.py

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,7 @@
 import os

 from Foundation import NSURL, NSObject  # type: ignore # noqa
+from loguru import logger
 from Quartz import CGMainDisplayID  # type: ignore # noqa
 import AVFoundation as AVF  # type: ignore # noqa
 import objc  # type: ignore # noqa

@@ -56,6 +57,7 @@ def start(self, audio: bool = False, camera: bool = False) -> None:
                 datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".mov",
             )
         )
+        logger.info(f"{self.file_url=}")
         if audio and self.session.canAddInput_(self.audio_input[0]):
             self.session.addInput_(self.audio_input[0])
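The added log line uses Python's f-string `=` specifier, which prints both the expression and its repr. An illustrative standalone snippet (the `.mov` path is a made-up value, not from the capture module):

```python
from loguru import logger

file_url = "/Users/me/captures/2024-05-27-12-00-00.mov"  # made-up value
logger.info(f"{file_url=}")
# message emitted: file_url='/Users/me/captures/2024-05-27-12-00-00.mov'
```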

openadapt/config.py

Lines changed: 3 additions & 2 deletions

@@ -25,6 +25,7 @@
 ).absolute()

 ROOT_DIR_PATH = get_root_dir_path()
+PARENT_DIR_PATH = ROOT_DIR_PATH.parent
 DATA_DIR_PATH = (ROOT_DIR_PATH / "data").absolute()
 CONFIG_FILE_PATH = (DATA_DIR_PATH / "config.json").absolute()
 RECORDING_DIR_PATH = (DATA_DIR_PATH / "recordings").absolute()

@@ -136,7 +137,7 @@ class SegmentationAdapter(str, Enum):
     OPENAI_MODEL_NAME: str = "gpt-3.5-turbo"

     # Record and replay
-    RECORD_WINDOW_DATA: bool = False
+    RECORD_WINDOW_DATA: bool = True
     RECORD_READ_ACTIVE_ELEMENT_STATE: bool = False
     RECORD_VIDEO: bool
     RECORD_AUDIO: bool

@@ -407,7 +408,7 @@ def print_config() -> None:
     if is_running_from_executable():
         is_reporting_branch = True
     else:
-        active_branch_name = git.Repo(ROOT_DIR_PATH.parent).active_branch.name
+        active_branch_name = git.Repo(PARENT_DIR_PATH).active_branch.name
         logger.info(f"{active_branch_name=}")
         is_reporting_branch = (
             active_branch_name == config.ERROR_REPORTING_BRANCH
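The new `PARENT_DIR_PATH` constant simply names the existing `ROOT_DIR_PATH.parent` expression, i.e. the directory that contains the package and the git checkout. A small sketch of the relationship, with a hypothetical checkout path:

```python
from pathlib import Path

import git  # GitPython, as already used in config.py

ROOT_DIR_PATH = Path("/home/me/OpenAdapt/openadapt")  # hypothetical checkout
PARENT_DIR_PATH = ROOT_DIR_PATH.parent  # /home/me/OpenAdapt

# print_config() now resolves the branch via the named constant:
branch_name = git.Repo(PARENT_DIR_PATH).active_branch.name
```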
