diff --git a/README.md b/README.md index 652fe3c..f4d42e6 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ create-kernel-app [app-name] [options] - `stagehand`: Template with Stagehand SDK (Typescript only) - `advanced-sample`: Implements sample apps using advanced Kernel configs - `computer-use`: Implements a prompt loop using Anthropic Computer Use + - `cua`: Implements a Computer Use Agent (OpenAI CUA) sample ### Examples @@ -121,6 +122,12 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google # Python + Browser Use kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}' + +# Typescript + CUA Sample +kernel invoke ts-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' + +# Python + CUA Sample +kernel invoke python-cua cua-task --payload '{"task": "Get current market price range for an unboxed Dreamcast"}' ``` ## Sample apps reference @@ -134,6 +141,7 @@ These are the sample apps currently available when you run `npx @onkernel/create | **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` | | **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a | | **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` | +| **cua** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ task }` | ## Documentation diff --git a/index.ts b/index.ts index 8111e91..1eaed85 100644 --- a/index.ts +++ b/index.ts @@ -18,7 +18,8 @@ type TemplateKey = | "browser-use" | "stagehand" | "advanced-sample" - | "computer-use"; + | "computer-use" + | "cua"; type LanguageInfo = { name: string; shorthand: string }; type TemplateInfo = { name: string; @@ -34,6 +35,7 @@ const TEMPLATE_BROWSER_USE = "browser-use"; const TEMPLATE_STAGEHAND = "stagehand"; const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample"; const TEMPLATE_COMPUTER_USE = "computer-use"; +const 
TEMPLATE_CUA = "cua"; const LANGUAGE_SHORTHAND_TS = "ts"; const LANGUAGE_SHORTHAND_PY = "py"; @@ -73,6 +75,11 @@ const TEMPLATES: Record = { description: "Implements the Anthropic Computer Use SDK", languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON], }, + [TEMPLATE_CUA]: { + name: "CUA Sample", + description: "Implements a Computer Use Agent (OpenAI CUA) sample", + languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON], + }, }; const INVOKE_SAMPLES: Record< @@ -88,6 +95,8 @@ const INVOKE_SAMPLES: Record< 'kernel invoke ts-advanced test-captcha-solver', [TEMPLATE_COMPUTER_USE]: 'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', + [TEMPLATE_CUA]: + 'kernel invoke ts-cua cua-task --payload \'{"query": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -98,6 +107,8 @@ const INVOKE_SAMPLES: Record< 'kernel invoke python-advanced test-captcha-solver', [TEMPLATE_COMPUTER_USE]: 'kernel invoke python-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', + [TEMPLATE_CUA]: + 'kernel invoke python-cua cua-task --payload \'{"query": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', }, }; @@ -114,6 +125,8 @@ const REGISTERED_APP_NAMES: Record< 'ts-advanced', [TEMPLATE_COMPUTER_USE]: 'ts-cu', + [TEMPLATE_CUA]: + 'ts-cua', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -124,6 +137,8 @@ const REGISTERED_APP_NAMES: Record< 'python-advanced', [TEMPLATE_COMPUTER_USE]: 'python-cu', + [TEMPLATE_CUA]: + 'python-cua', }, }; @@ -354,12 +369,16 @@ function printNextSteps( ? "kernel deploy index.ts --env OPENAI_API_KEY=XXX" : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE ? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX" + : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA + ? 
"kernel deploy index.ts --env OPENAI_API_KEY=XXX" : language === LANGUAGE_PYTHON && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_ADVANCED_SAMPLE) ? "kernel deploy main.py" : language === LANGUAGE_PYTHON && template === TEMPLATE_BROWSER_USE ? "kernel deploy main.py --env OPENAI_API_KEY=XXX" : language === LANGUAGE_PYTHON && template === TEMPLATE_COMPUTER_USE ? "kernel deploy main.py --env ANTHROPIC_API_KEY=XXX" + : language === LANGUAGE_PYTHON && template === TEMPLATE_CUA + ? "kernel deploy main.py --env OPENAI_API_KEY=XXX" : ""; console.log( diff --git a/templates/python/browser-use/main.py b/templates/python/browser-use/main.py index 5dfc7cc..ad1e034 100644 --- a/templates/python/browser-use/main.py +++ b/templates/python/browser-use/main.py @@ -1,8 +1,9 @@ from langchain_openai import ChatOpenAI -from browser_use import Agent, BrowserSession +from browser_use import Agent import kernel from kernel import Kernel from typing import TypedDict +from session import BrowserSessionCustomResize client = Kernel() @@ -13,7 +14,7 @@ class TaskInput(TypedDict): # LLM API Keys are set in the environment during `kernel deploy -e OPENAI_API_KEY=XXX` # See https://docs.onkernel.com/launch/deploy#environment-variables -llm = ChatOpenAI(model="gpt-4o") +llm = ChatOpenAI(model="gpt-4o-mini") @app.action("bu-task") async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput): @@ -37,7 +38,7 @@ async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput): #task="Compare the price of gpt-4o and DeepSeek-V3", task=input_data["task"], llm=llm, - browser_session=BrowserSession(cdp_url=kernel_browser.cdp_ws_url) + browser_session=BrowserSessionCustomResize(cdp_url=kernel_browser.cdp_ws_url) ) result = await agent.run() if result.final_result() is not None: diff --git a/templates/python/browser-use/session.py b/templates/python/browser-use/session.py new file mode 100644 index 0000000..b351a13 --- /dev/null +++ b/templates/python/browser-use/session.py @@ -0,0 
class BrowserSessionCustomResize(BrowserSession):
    """BrowserSession variant that pins every page to a fixed viewport.

    Overrides ``_setup_viewports`` because the stock implementation
    mishandles resizing when the session is attached over CDP.
    """

    async def _setup_viewports(self) -> None:
        """Force a fixed viewport on existing pages and apply profile settings.

        Overwrites the profile's window size, viewport, screen size and device
        scale factor, then applies permissions, timeouts, extra HTTP headers
        and geolocation from the profile to the browser context, loads the
        storage state and resizes every page already open in the context.
        A failure while applying one setting is printed and does not stop the
        remaining setup.
        """
        assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'

        profile = self.browser_profile
        context = self.browser_context

        # NOTE(review): 786 may be a transposition of 768 (the CUA template
        # defaults to 1024x768) — confirm before changing the value.
        profile.window_size = {"width": 1024, "height": 786}
        profile.viewport = {"width": 1024, "height": 786}
        profile.screen = {"width": 1024, "height": 786}
        profile.device_scale_factor = 1.0

        # Read the viewport back from the profile so any normalisation the
        # profile applies on assignment is what the pages receive.
        viewport = profile.viewport

        if profile.permissions:
            try:
                await context.grant_permissions(profile.permissions)
            except Exception as e:
                print(e)
        try:
            if profile.default_timeout:
                context.set_default_timeout(profile.default_timeout)
            if profile.default_navigation_timeout:
                context.set_default_navigation_timeout(profile.default_navigation_timeout)
        except Exception as e:
            print(e)
        try:
            if profile.extra_http_headers:
                # Coroutine in Playwright's async API: without the await the
                # headers were never actually applied.
                await context.set_extra_http_headers(profile.extra_http_headers)
        except Exception as e:
            print(e)

        try:
            if profile.geolocation:
                await context.set_geolocation(profile.geolocation)
        except Exception as e:
            print(e)

        await self.load_storage_state()

        page = None

        for page in context.pages:
            # apply viewport size settings to any existing pages
            if viewport:
                await page.set_viewport_size(viewport)

            # show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
            if page.url == 'about:blank':
                await self._show_dvd_screensaver_loading_animation(page)

        page = page or (await context.new_page())

        # Only reached when no viewport is configured: resize the OS window instead.
        if (not viewport) and (profile.window_size is not None) and not profile.headless:
            # cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
            try:
                cdp_session = await page.context.new_cdp_session(page)
                window_id_result = await cdp_session.send('Browser.getWindowForTarget')
                await cdp_session.send(
                    'Browser.setWindowBounds',
                    {
                        'windowId': window_id_result['windowId'],
                        'bounds': {
                            **profile.window_size,
                            'windowState': 'normal',  # Ensure window is not minimized/maximized
                        },
                    },
                )
                await cdp_session.detach()
            except Exception as e:
                print(e)
                try:
                    # Fallback to a JavaScript resize if CDP setWindowBounds fails.
                    # page.evaluate accepts a single argument, so the size is
                    # passed as one object and destructured in the page.
                    await page.evaluate(
                        "({width, height}) => { window.resizeTo(width, height) }",
                        dict(profile.window_size),
                    )
                except Exception as fallback_error:
                    print(fallback_error)
class Agent:
    """Drive a ``Computer`` with the OpenAI computer-use model.

    The agent sends the conversation to the model, executes every
    ``function_call`` / ``computer_call`` item the model returns on the
    attached computer, and feeds the results back until the model answers
    with an assistant message.
    """

    def __init__(
        self,
        model="computer-use-preview",
        computer: "Computer | None" = None,
        tools: "list[dict] | None" = None,
        acknowledge_safety_check_callback: Callable = lambda message: False,
    ):
        """Create an agent.

        Args:
            model: Model name sent with every request.
            computer: Object implementing the ``Computer`` protocol. When
                given, the computer tool and the ``back`` / ``goto`` /
                ``forward`` function tools are appended to ``tools``.
            tools: Extra tool definitions. The list is copied, so neither the
                caller's list nor a shared default is mutated.
            acknowledge_safety_check_callback: Called with each pending safety
                check message; must return truthy to continue.
        """
        self.model = model
        self.computer = computer
        # Copy: the original extended the argument in place, so the built-in
        # tools accumulated across Agent instances sharing the default list.
        self.tools = list(tools) if tools is not None else []
        self.print_steps = True
        self.debug = False
        self.show_images = False
        self.acknowledge_safety_check_callback = acknowledge_safety_check_callback

        if computer:
            dimensions = computer.get_dimensions()
            self.tools += [
                {
                    "type": "computer-preview",
                    "display_width": dimensions[0],
                    "display_height": dimensions[1],
                    "environment": computer.get_environment(),
                },
                {
                    "type": "function",
                    "name": "back",
                    "description": "Go back to the previous page.",
                    "parameters": {},
                },
                {
                    "type": "function",
                    "name": "goto",
                    "description": "Go to a specific URL.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "url": {
                                "type": "string",
                                "description": "Fully qualified URL to navigate to.",
                            },
                        },
                        "additionalProperties": False,
                        "required": ["url"],
                    },
                },
                {
                    "type": "function",
                    "name": "forward",
                    "description": "Go forward to the next page.",
                    "parameters": {},
                },
            ]

    def debug_print(self, *args):
        """Pretty-print ``args`` when debug mode is enabled."""
        if self.debug:
            pp(*args)

    def handle_item(self, item):
        """Handle one model output item and return the items to send back.

        ``message`` items are printed, ``function_call`` items are dispatched
        to a same-named method on the computer (if it exists), and
        ``computer_call`` items run the action, take a screenshot and return
        a ``computer_call_output``. Any other item type yields ``[]``.

        Raises:
            ValueError: If a pending safety check is not acknowledged.
        """
        if item["type"] == "message":
            if self.print_steps:
                print(item["content"][0]["text"])

        if item["type"] == "function_call":
            name, args = item["name"], json.loads(item["arguments"])
            if self.print_steps:
                print(f"{name}({args})")

            if hasattr(self.computer, name):  # if function exists on computer, call it
                method = getattr(self.computer, name)
                method(**args)
            return [
                {
                    "type": "function_call_output",
                    "call_id": item["call_id"],
                    "output": "success",  # hard-coded output for demo
                }
            ]

        if item["type"] == "computer_call":
            action = item["action"]
            action_type = action["type"]
            action_args = {k: v for k, v in action.items() if k != "type"}
            if self.print_steps:
                print(f"{action_type}({action_args})")

            method = getattr(self.computer, action_type)
            method(**action_args)

            screenshot_base64 = self.computer.screenshot()
            if self.show_images:
                show_image(screenshot_base64)

            # if user doesn't ack all safety checks exit with error
            pending_checks = item.get("pending_safety_checks", [])
            for check in pending_checks:
                message = check["message"]
                if not self.acknowledge_safety_check_callback(message):
                    raise ValueError(
                        f"Safety check failed: {message}. Cannot continue with unacknowledged safety checks."
                    )

            call_output = {
                "type": "computer_call_output",
                "call_id": item["call_id"],
                "acknowledged_safety_checks": pending_checks,
                "output": {
                    "type": "input_image",
                    "image_url": f"data:image/png;base64,{screenshot_base64}",
                },
            }

            # additional URL safety checks for browser environments
            if self.computer.get_environment() == "browser":
                current_url = self.computer.get_current_url()
                check_blocklisted_url(current_url)
                call_output["output"]["current_url"] = current_url

            return [call_output]
        return []

    def run_full_turn(
        self, input_items, print_steps=True, debug=False, show_images=False
    ):
        """Loop model calls and tool execution until an assistant message arrives.

        Args:
            input_items: Conversation items to start from (not mutated).
            print_steps: Print messages and actions as they happen.
            debug: Pretty-print requests and responses.
            show_images: Display each screenshot.

        Returns:
            The list of new items produced during the turn.

        Raises:
            ValueError: If the model response has no ``output`` field.
        """
        self.print_steps = print_steps
        self.debug = debug
        self.show_images = show_images
        new_items = []

        # keep looping until we get a final response
        while not new_items or new_items[-1].get("role") != "assistant":
            self.debug_print([sanitize_message(msg) for msg in input_items + new_items])

            response = create_response(
                model=self.model,
                input=input_items + new_items,
                tools=self.tools,
                truncation="auto",
            )
            self.debug_print(response)

            # Fail with a clear error whether or not debug is on; previously a
            # response without "output" raised KeyError when debug was off.
            if "output" not in response:
                if self.debug:
                    print(response)
                raise ValueError("No output from model")

            new_items += response["output"]
            for item in response["output"]:
                new_items += self.handle_item(item)

        return new_items
class Computer(Protocol):
    """Structural interface the agent loop expects from a computer backend."""

    # Environment name reported to the model in the computer tool definition.
    def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

    # (width, height) of the display in pixels.
    def get_dimensions(self) -> tuple[int, int]: ...

    # Base64-encoded screenshot of the current display.
    def screenshot(self) -> str: ...

    def click(self, x: int, y: int, button: str = "left") -> None: ...

    def double_click(self, x: int, y: int) -> None: ...

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ...

    def type(self, text: str) -> None: ...

    def wait(self, ms: int = 1000) -> None: ...

    def move(self, x: int, y: int) -> None: ...

    def keypress(self, keys: List[str]) -> None: ...

    def drag(self, path: List[Dict[str, int]]) -> None: ...

    # Instance method like the rest of the protocol; the original declaration
    # omitted ``self`` and so did not match how the agent calls it.
    def get_current_url(self) -> str: ...
class KernelPlaywrightBrowser(BasePlaywrightComputer):
    """
    Playwright computer attached to a remote Chromium over CDP.

    Configuration is a dict: {'cdp_ws_url': ..., 'width': ..., 'height': ...}.
    'cdp_ws_url' is mandatory; 'width' and 'height' fall back to 1024x768.
    """

    def __init__(self, config: dict):
        super().__init__()
        self.cdp_ws_url = config.get("cdp_ws_url")
        if not self.cdp_ws_url:
            raise ValueError("cdp_ws_url must be provided in config dict")
        self.width = config.get("width", 1024)
        self.height = config.get("height", 768)
        self.dimensions = (self.width, self.height)

    def get_dimensions(self):
        """Return the (width, height) tuple taken from the config."""
        return self.dimensions

    def _get_browser_and_page(self) -> tuple[Browser, Page]:
        """Attach to the remote browser and return it with a sized page."""
        browser = self._playwright.chromium.connect_over_cdp(self.cdp_ws_url)

        # Reuse the browser's first context when there is one.
        context = browser.contexts[0] if browser.contexts else browser.new_context()

        # Track pages opened later so the active page follows new tabs.
        context.on("page", self._handle_new_page)

        # Reuse the first open page, otherwise open a fresh one.
        open_pages = context.pages
        page = open_pages[0] if open_pages else context.new_page()
        page.set_viewport_size({"width": self.width, "height": self.height})
        page.on("close", self._handle_page_close)
        return browser, page

    def _handle_new_page(self, page: Page):
        """Make a newly opened page the active one and watch for its closure."""
        print("New page created")
        self._page = page
        page.on("close", self._handle_page_close)

    def _handle_page_close(self, page: Page):
        """Switch to the last remaining page when the active page closes."""
        print("Page closed")
        if not hasattr(self, "_browser") or self._page != page:
            return

        remaining = self._browser.contexts[0].pages
        if remaining:
            self._page = remaining[-1]
            return

        print("Warning: All pages have been closed.")
        self._page = None
class LocalPlaywrightBrowser(BasePlaywrightComputer):
    """Playwright computer backed by a locally launched Chromium."""

    def __init__(self, headless: bool = False):
        super().__init__()
        self.headless = headless

    def _get_browser_and_page(self) -> tuple[Browser, Page]:
        """Launch Chromium sized to the computer's dimensions and open one page."""
        width, height = self.get_dimensions()

        browser = self._playwright.chromium.launch(
            chromium_sandbox=True,
            headless=self.headless,
            args=[
                f"--window-size={width},{height}",
                "--disable-extensions",
                "--disable-file-system",
            ],
            env={"DISPLAY": ":0"},
        )
        context = browser.new_context()

        # Track pages opened later so the active page follows new tabs.
        context.on("page", self._handle_new_page)

        page = context.new_page()
        page.set_viewport_size({"width": width, "height": height})
        page.on("close", self._handle_page_close)
        return browser, page

    def _handle_new_page(self, page: Page):
        """Make a newly opened page the active one and watch for its closure."""
        print("New page created")
        self._page = page
        page.on("close", self._handle_page_close)

    def _handle_page_close(self, page: Page):
        """Switch to the last remaining page when the active page closes."""
        print("Page closed")
        if self._page != page:
            return

        remaining = self._browser.contexts[0].pages
        if remaining:
            self._page = remaining[-1]
            return

        print("Warning: All pages have been closed.")
        self._page = None
check_blocklisted_url + +# Optional: key mapping if your model uses "CUA" style keys +CUA_KEY_TO_PLAYWRIGHT_KEY = { + "/": "Divide", + "\\": "Backslash", + "alt": "Alt", + "arrowdown": "ArrowDown", + "arrowleft": "ArrowLeft", + "arrowright": "ArrowRight", + "arrowup": "ArrowUp", + "backspace": "Backspace", + "capslock": "CapsLock", + "cmd": "Meta", + "ctrl": "Control", + "delete": "Delete", + "end": "End", + "enter": "Enter", + "esc": "Escape", + "home": "Home", + "insert": "Insert", + "option": "Alt", + "pagedown": "PageDown", + "pageup": "PageUp", + "shift": "Shift", + "space": " ", + "super": "Meta", + "tab": "Tab", + "win": "Meta", +} + + +class BasePlaywrightComputer: + """ + Abstract base for Playwright-based computers: + + - Subclasses override `_get_browser_and_page()` to do local or remote connection, + returning (Browser, Page). + - This base class handles context creation (`__enter__`/`__exit__`), + plus standard "Computer" actions like click, scroll, etc. + - We also have extra browser actions: `goto(url)` and `back()`. 
+ """ + + def get_environment(self): + return "browser" + + def get_dimensions(self): + return (1024, 768) + + def __init__(self): + self._playwright = None + self._browser: Browser | None = None + self._page: Page | None = None + + def __enter__(self): + # Start Playwright and call the subclass hook for getting browser/page + self._playwright = sync_playwright().start() + self._browser, self._page = self._get_browser_and_page() + + # Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS + def handle_route(route, request): + + url = request.url + if check_blocklisted_url(url): + print(f"Flagging blocked domain: {url}") + route.abort() + else: + route.continue_() + + self._page.route("**/*", handle_route) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._browser: + self._browser.close() + if self._playwright: + self._playwright.stop() + + def get_current_url(self) -> str: + return self._page.url + + # --- Common "Computer" actions --- + def screenshot(self) -> str: + """Capture only the viewport (not full_page).""" + png_bytes = self._page.screenshot(full_page=False) + return base64.b64encode(png_bytes).decode("utf-8") + + def click(self, x: int, y: int, button: str = "left") -> None: + match button: + case "back": + self.back() + case "forward": + self.forward() + case "wheel": + self._page.mouse.wheel(x, y) + case _: + button_mapping = {"left": "left", "right": "right"} + button_type = button_mapping.get(button, "left") + self._page.mouse.click(x, y, button=button_type) + + def double_click(self, x: int, y: int) -> None: + self._page.mouse.dblclick(x, y) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + self._page.mouse.move(x, y) + self._page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})") + + def type(self, text: str) -> None: + self._page.keyboard.type(text) + + def wait(self, ms: int = 1000) -> None: + time.sleep(ms / 1000) + + def move(self, x: int, y: int) -> None: + 
self._page.mouse.move(x, y) + + def keypress(self, keys: List[str]) -> None: + mapped_keys = [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys] + for key in mapped_keys: + self._page.keyboard.down(key) + for key in reversed(mapped_keys): + self._page.keyboard.up(key) + + def drag(self, path: List[Dict[str, int]]) -> None: + if not path: + return + self._page.mouse.move(path[0]["x"], path[0]["y"]) + self._page.mouse.down() + for point in path[1:]: + self._page.mouse.move(point["x"], point["y"]) + self._page.mouse.up() + + # --- Extra browser-oriented actions --- + def goto(self, url: str) -> None: + try: + return self._page.goto(url) + except Exception as e: + print(f"Error navigating to {url}: {e}") + + def back(self) -> None: + return self._page.go_back() + + def forward(self) -> None: + return self._page.go_forward() + + # --- Subclass hook --- + def _get_browser_and_page(self) -> tuple[Browser, Page]: + """Subclasses must implement, returning (Browser, Page).""" + raise NotImplementedError diff --git a/templates/python/cua/main.py b/templates/python/cua/main.py new file mode 100644 index 0000000..6ec301a --- /dev/null +++ b/templates/python/cua/main.py @@ -0,0 +1,94 @@ +import os +from typing import TypedDict +import kernel +from kernel import Kernel +from computers.default import KernelPlaywrightBrowser +from agent import Agent +import datetime +import asyncio + +""" +Example app that runs an agent using openai CUA +Args: + ctx: Kernel context containing invocation information + payload: An object with a `task` property +Returns: + An answer to the task, elapsed time and optionally the messages stack +Invoke this via CLI: + export KERNEL_API_KEY= + kernel deploy main.py -e OPENAI_API_KEY=XXXXX --force + kernel invoke python-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles"}' + kernel logs python-cua -f # Open in separate tab +""" + +class CuaInput(TypedDict): + task: str + +class CuaOutput(TypedDict): + result: 
@app.action("cua-task")
async def cua_task(
    ctx: kernel.KernelContext,
    payload: CuaInput,
) -> CuaOutput:
    """Run the CUA agent on ``payload["task"]`` in a new Kernel browser.

    The browser is created for this invocation, the blocking agent loop runs
    in a worker thread, and the text of the agent's last item is returned as
    ``{"result": ...}``.

    Raises:
        ValueError: If no task is supplied or the agent returns no content.
    """
    if not payload or not payload.get("task"):
        raise ValueError("task is required")

    kernel_browser = await asyncio.to_thread(
        client.browsers.create, invocation_id=ctx.invocation_id, stealth=True
    )
    print("Kernel browser live view url: ", kernel_browser.browser_live_view_url)
    cdp_ws_url = kernel_browser.cdp_ws_url

    def acknowledge_safety_check(message):
        # Safety checks are logged and always acknowledged in this sample.
        print(f"> agent : safety check message (skipping): {message}")
        return True

    def run_agent():
        with KernelPlaywrightBrowser({"cdp_ws_url": cdp_ws_url}) as computer:
            # Conversation handed to the agent: date context, then the task.
            system_message = {
                "role": "system",
                "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})",
            }
            user_message = {
                "role": "user",
                "content": payload["task"]
            }

            agent = Agent(
                computer=computer,
                tools=[],  # can provide additional tools to the agent
                acknowledge_safety_check_callback=acknowledge_safety_check,
            )

            response_items = agent.run_full_turn(
                [system_message, user_message],
                debug=True,
                show_images=False,
            )

            if not response_items or "content" not in response_items[-1]:
                raise ValueError("No response from agent")

            # Content is either a plain string or a list of blocks; use the
            # first text block when there is one.
            content = response_items[-1]["content"]
            if isinstance(content, str):
                result = content
            elif isinstance(content, list) and content and isinstance(content[0], dict) and "text" in content[0]:
                result = content[0]["text"]
            else:
                result = str(content)
            return {"result": result}

    return await asyncio.to_thread(run_agent)
BLOCKED_DOMAINS = [
    "maliciousbook.com",
    "evilvideos.com",
    "darkwebforum.com",
    "shadytok.com",
    "suspiciouspins.com",
    "ilanbigio.com",
]

# Upper bound for one Responses API call; without a timeout a stalled
# connection would block the agent loop forever.
REQUEST_TIMEOUT_SECONDS = 600


def pp(obj):
    """Pretty-print *obj* as indented JSON."""
    print(json.dumps(obj, indent=4))


def show_image(base_64_image):
    """Decode a base64 image and open it in the default image viewer."""
    image_data = base64.b64decode(base_64_image)
    image = Image.open(BytesIO(image_data))
    image.show()


def calculate_image_dimensions(base_64_image):
    """Return the ``(width, height)`` of a base64-encoded image."""
    image_data = base64.b64decode(base_64_image)
    image = Image.open(BytesIO(image_data))
    return image.size


def sanitize_message(msg: dict) -> dict:
    """Return a copy of the message with image_url omitted for computer_call_output messages.

    Any other message is returned unchanged (same object).
    """
    if msg.get("type") == "computer_call_output":
        output = msg.get("output", {})
        if isinstance(output, dict):
            sanitized = msg.copy()
            sanitized["output"] = {**output, "image_url": "[omitted]"}
            return sanitized
    return msg


def create_response(**kwargs):
    """POST *kwargs* as the JSON body to the OpenAI Responses API.

    The API key is read from ``OPENAI_API_KEY`` and, when set, the
    organization from ``OPENAI_ORG``. A non-200 status is printed and the
    decoded JSON body is returned either way.

    Returns:
        The decoded JSON response.
    """
    url = "https://api.openai.com/v1/responses"
    headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "Content-Type": "application/json"
    }

    openai_org = os.getenv("OPENAI_ORG")
    if openai_org:
        headers["Openai-Organization"] = openai_org

    response = requests.post(
        url, headers=headers, json=kwargs, timeout=REQUEST_TIMEOUT_SECONDS
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code} {response.text}")

    return response.json()


def check_blocklisted_url(url: str) -> None:
    """Raise ValueError if the given URL (including subdomains) is in the blocklist."""
    hostname = urlparse(url).hostname or ""
    if any(
        hostname == blocked or hostname.endswith(f".{blocked}")
        for blocked in BLOCKED_DOMAINS
    ):
        raise ValueError(f"Blocked URL: {url}")
of file diff --git a/templates/typescript/cua/.prettierrc b/templates/typescript/cua/.prettierrc new file mode 100644 index 0000000..3ee282f --- /dev/null +++ b/templates/typescript/cua/.prettierrc @@ -0,0 +1,7 @@ +{ + "semi": true, + "trailingComma": "all", + "singleQuote": true, + "printWidth": 100, + "tabWidth": 2 +} \ No newline at end of file diff --git a/templates/typescript/cua/README.md b/templates/typescript/cua/README.md new file mode 100644 index 0000000..0cb2dfe --- /dev/null +++ b/templates/typescript/cua/README.md @@ -0,0 +1,8 @@ +# Kernel TypeScript Sample App - CUA + +This is a Kernel application that demonstrates how to use the Computer Using Agent (CUA) from OpenAI. + +It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. +It also uses the latest OpenAI SDK format and includes local equivalents of the Kernel methods, so you can test locally before deploying on Kernel. + +See the [docs](https://docs.onkernel.com/quickstart) for more information. 
import "dotenv/config";
import { Kernel, type KernelContext } from "@onkernel/sdk";
import { Agent } from "./lib/agent";
import computers from "./lib/computers";

const kernel = new Kernel();
const app = kernel.app("ts-cua");

// LLM API keys are set in the environment during `kernel deploy -e OPENAI_API_KEY=XXX`
// See https://docs.onkernel.com/launch/deploy#environment-variables
if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set');

/**
 * Example app that runs an agent using the OpenAI CUA model.
 * Args:
 *     ctx: Kernel context containing invocation information
 *     payload: An object with a `task` property
 * Returns:
 *     An answer to the task, the elapsed time and optionally the messages stack
 * Invoke this via CLI:
 *  export KERNEL_API_KEY=
 *  kernel deploy index.ts -e OPENAI_API_KEY=XXXXX --force
 *  kernel invoke ts-cua cua-task -p "{\"task\":\"current market price range for a used dreamcast\"}"
 *  kernel logs ts-cua -f # Open in separate tab
 */

interface CuaInput {
  task: string;
}

interface CuaOutput {
  /** Wall-clock duration of the run, in seconds (two decimals). */
  elapsed: number;
  /** Full messages stack; only populated when uncommented in the return below. */
  response?: Array<object>;
  /** Text of the agent's final message, or null when it produced none. */
  answer: string | null;
}

app.action(
  "cua-task",
  async (ctx: KernelContext, payload?: CuaInput): Promise<CuaOutput> => {
    const startTime = Date.now();

    // Validate the payload before provisioning anything, so a bad request
    // does not leave a browser session behind.
    if (!payload?.task) {
      throw new Error('task is required');
    }

    const kernelBrowser = await kernel.browsers.create({
      invocation_id: ctx.invocation_id,
    });
    console.log(
      "> Kernel browser live view url: ",
      kernelBrowser.browser_live_view_url,
    );

    // kernel browser
    const { computer } = await computers.create({
      type: "kernel", // for local testing before deploying to Kernel, you can use type: "local"
      cdp_ws_url: kernelBrowser.cdp_ws_url,
    });

    try {
      // setup agent
      const agent = new Agent({
        model: "computer-use-preview",
        computer,
        tools: [], // additional function_call tools to provide to the llm
        acknowledge_safety_check_callback: (message: string) => {
          console.log(`> safety check: ${message}`);
          return true; // Auto-acknowledge all safety checks for testing
        },
      });

      // start agent run
      const now = new Date();
      const response = await agent.runFullTurn({
        messages: [
          {
            role: "system",
            content: `- Current date and time: ${now.toISOString()} (${now.toLocaleDateString("en-US", { weekday: "long" })})`,
          },
          {
            type: "message",
            role: "user",
            content: [
              {
                type: "input_text",
                text: payload.task,
              },
            ],
          },
        ],
        print_steps: true, // log function_call and computer_call actions
        debug: true, // show agent debug logs (llm messages and responses)
        show_images: false, // if true, the returned messages keep base64 screenshots (webp); if false they are replaced with "[omitted]"
      });

      console.log("> agent run done");

      const timeElapsed = (Date.now() - startTime) / 1000; // Convert to seconds

      return {
        // response, // full messages stack trace
        elapsed: parseFloat(timeElapsed.toFixed(2)),
        // The last item of the run is the assistant's final message.
        answer: response.at(-1)?.content?.[0]?.text ?? null,
      };
    } finally {
      // Release the CDP connection opened by computers.create(); a cleanup
      // failure must not mask the result or the original error.
      try {
        await computer.exit();
      } catch (error) {
        console.log(`> failed to close browser connection: ${error}`);
      }
    }
  },
);
/**
 * A sample agent that drives a computer through the OpenAI CUA loop.
 *
 * Each iteration sends the conversation to the model, executes every
 * `function_call` / `computer_call` item it returns, and feeds the results
 * back, until the model answers with an assistant message.
 */
export class Agent {
  /**
   * Positional argument order of the computer methods, keyed by CUA action
   * type. Arguments are looked up by name so the call does not depend on the
   * key order of the JSON the model happens to return.
   */
  private static readonly ACTION_ARGUMENTS: Record<string, readonly string[]> = {
    click: ["button", "x", "y"],
    double_click: ["x", "y"],
    scroll: ["x", "y", "scroll_x", "scroll_y"],
    type: ["text"],
    keypress: ["keys"],
    wait: [],
    move: ["x", "y"],
    drag: ["path"],
    screenshot: [],
  };

  private model: string;
  private computer: BasePlaywrightComputer | null;
  private tools: Tool[];
  private print_steps: boolean;
  private debug: boolean;
  private show_images: boolean;
  private acknowledge_safety_check_callback: AcknowledgeSafetyCheckCallback;

  /**
   * @param model Model name sent to the Responses API.
   * @param computer Computer to act on; when given, its dimensions and
   *   environment are advertised to the model as the computer tool.
   * @param tools Extra function tools, appended to the shared toolset.
   * @param acknowledge_safety_check_callback Called with each pending safety
   *   check message; returning false aborts the run.
   */
  constructor({
    model = "computer-use-preview",
    computer = null,
    tools = [],
    acknowledge_safety_check_callback = () => true,
  }: {
    model?: string;
    computer?: BasePlaywrightComputer | null;
    tools?: Tool[];
    acknowledge_safety_check_callback?: AcknowledgeSafetyCheckCallback;
  }) {
    this.model = model;
    this.computer = computer;
    this.tools = [...toolset.shared, ...tools];
    this.print_steps = true;
    this.debug = false;
    this.show_images = false;
    this.acknowledge_safety_check_callback = acknowledge_safety_check_callback;

    if (computer) {
      const [display_width, display_height] = computer.getDimensions();
      this.tools.push({
        type: "computer-preview",
        display_width,
        display_height,
        environment: computer.getEnvironment(),
      });
    }
  }

  /** Dump each argument in full depth when debug logging is enabled. */
  private debugPrint(...args: any[]): void {
    if (!this.debug) return;
    console.warn("--- debug:agent:debugPrint");
    // console.dir takes (value, options); handle one value at a time so the
    // options object is never mistaken for a value.
    for (const arg of args) {
      console.dir(arg, { depth: null });
    }
  }

  /** Convert a CUA action type such as `double_click` to the method name `doubleClick`. */
  private static toMethodName(actionType: string): string {
    return actionType.replace(/_([a-z])/g, (_match, letter: string) => letter.toUpperCase());
  }

  /** Handle one model output item; may cause a computer action + screenshot. */
  private async handleItem(item: Item): Promise<Item[]> {
    switch (item.type) {
      case "message":
        if (this.print_steps && item.content?.[0]?.text) {
          console.log(item.content[0].text);
        }
        return [];
      case "function_call":
        return this.handleFunctionCall(item);
      case "computer_call":
        return this.handleComputerCall(item);
      default:
        return [];
    }
  }

  /** Run a function tool (goto/back/forward, ...) on the computer when it implements it. */
  private async handleFunctionCall(item: Item): Promise<Item[]> {
    const name: string = item.name;
    const args = JSON.parse(item.arguments || "{}");
    if (this.print_steps) {
      console.log(`${name}(${JSON.stringify(args)})`);
    }

    const method = this.computer ? (this.computer as any)[name] : undefined;
    if (typeof method === "function") {
      await method.call(this.computer, ...Object.values(args));
    }
    return [
      {
        type: "function_call_output",
        call_id: item.call_id,
        output: "success", // hard-coded output for demo
      },
    ];
  }

  /**
   * Execute a computer action and answer with a screenshot of the result.
   *
   * @throws Error when a safety check is not acknowledged, the action is not
   *   supported by the computer, or the browser ended up on a blocked URL.
   */
  private async handleComputerCall(item: Item): Promise<Item[]> {
    const { type: action_type, ...action_args } = item.action;
    if (this.print_steps) {
      console.log(`${action_type}(${JSON.stringify(action_args)})`);
    }
    if (!this.computer) return [];

    // Ask for acknowledgement BEFORE acting: a refused safety check must
    // prevent the action, not merely report on it afterwards.
    const pending_checks: SafetyCheck[] = item.pending_safety_checks || [];
    for (const check of pending_checks) {
      const message = check.message;
      if (!this.acknowledge_safety_check_callback(message)) {
        throw new Error(
          `Safety check failed: ${message}. Cannot continue with unacknowledged safety checks.`,
        );
      }
    }

    // Action types are snake_case (e.g. "double_click"), methods are camelCase.
    const target = this.computer as any;
    const method = target[Agent.toMethodName(action_type)] ?? target[action_type];
    if (typeof method !== "function") {
      throw new Error(`Unsupported computer action: ${action_type}`);
    }
    const argument_names = Agent.ACTION_ARGUMENTS[action_type];
    const positional = argument_names
      ? argument_names.map((key) => action_args[key])
      : Object.values(action_args); // unknown action: keep the model's order
    await method.call(this.computer, ...positional);

    const screenshot_base64 = await this.computer.screenshot();

    const call_output: ComputerCallOutput = {
      type: "computer_call_output",
      call_id: item.call_id,
      acknowledged_safety_checks: pending_checks,
      output: {
        type: "input_image",
        image_url: `data:image/webp;base64,${screenshot_base64}`,
      },
    };

    // additional URL safety checks for browser environments
    if (this.computer.getEnvironment() === "browser") {
      const current_url = this.computer.getCurrentUrl();
      // checkBlocklistedUrl only reports; enforcing the result is up to us.
      if (utils.checkBlocklistedUrl(current_url)) {
        throw new Error(`Blocked URL: ${current_url}`);
      }
      call_output.output.current_url = current_url;
    }

    return [call_output];
  }

  /**
   * Run the model/tool loop until the model produces an assistant message.
   *
   * @param messages Conversation so far (system/user messages).
   * @param print_steps Log messages, function calls and computer actions.
   * @param debug Log the sanitized request and the raw response of every call.
   * @param show_images Keep base64 screenshots in the returned items; when
   *   false they are replaced with "[omitted]".
   * @returns Every item produced during the turn, in order.
   * @throws Error when the model returns no output.
   */
  async runFullTurn({
    messages,
    print_steps = true,
    debug = false,
    show_images = false,
  }: {
    messages: Item[];
    print_steps?: boolean;
    debug?: boolean;
    show_images?: boolean;
  }): Promise<Item[]> {
    this.print_steps = print_steps;
    this.debug = debug;
    this.show_images = show_images;
    const new_items: Item[] = [];

    // keep looping until we get a final response
    while (
      new_items.length === 0 ||
      new_items[new_items.length - 1]?.role !== "assistant"
    ) {
      const conversation = messages.concat(new_items);
      this.debugPrint(conversation.map((msg) => utils.sanitizeMessage(msg)));

      const response = await utils.createResponse({
        model: this.model,
        input: conversation,
        tools: this.tools,
        truncation: "auto",
      });
      this.debugPrint(response);

      // Without new items the loop condition can never change, so a missing
      // or empty output must stop the run regardless of the debug flag.
      if (!response.output?.length) {
        if (this.debug) {
          console.log(response);
        }
        throw new Error("No output from model");
      }

      new_items.push(...response.output);
      for (const item of response.output) {
        const handled_items = await this.handleItem(item);
        new_items.push(...handled_items);
      }
    }

    // Return sanitized messages if show_images is false
    if (!show_images) {
      return new_items.map((msg) => utils.sanitizeMessage(msg));
    }

    return new_items;
  }
}

export default { Agent };
import utils from "../utils.ts";
import sharp from "sharp";
import type { Browser, Page, Route, Request } from "playwright";

// Optional: key mapping if your model uses "CUA" style keys
const CUA_KEY_TO_PLAYWRIGHT_KEY: Record<string, string> = {
  "/": "/",
  "\\": "\\",
  alt: "Alt",
  arrowdown: "ArrowDown",
  arrowleft: "ArrowLeft",
  arrowright: "ArrowRight",
  arrowup: "ArrowUp",
  backspace: "Backspace",
  capslock: "CapsLock",
  cmd: "Meta",
  ctrl: "Control",
  delete: "Delete",
  end: "End",
  enter: "Enter",
  esc: "Escape",
  home: "Home",
  insert: "Insert",
  option: "Alt",
  pagedown: "PageDown",
  pageup: "PageUp",
  shift: "Shift",
  space: " ",
  super: "Meta",
  tab: "Tab",
  win: "Meta",
};

interface Point {
  x: number;
  y: number;
}

/**
 * Base for Playwright-based computers:
 *
 * - Subclasses override `_getBrowserAndPage()` to do local or remote connection,
 *   returning [Browser, Page].
 * - This base class handles setup/teardown (`enter()`/`exit()`),
 *   plus standard "Computer" actions like click, scroll, etc.
 * - We also have extra browser actions: `goto(url)`, `back()` and `forward()`.
 */
export class BasePlaywrightComputer {
  protected _browser: Browser | null = null;
  protected _page: Page | null = null;

  /**
   * Type guard to assert that this._page is present and is a Playwright Page.
   * Throws an error if not present.
   */
  protected _assertPage(): asserts this is { _page: Page } {
    if (!this._page) {
      throw new Error("Playwright Page is not initialized. Did you forget to call enter()?");
    }
  }

  /** Environment name advertised to the model. */
  getEnvironment(): string {
    return "browser";
  }

  /** Viewport size as [width, height] in pixels. */
  getDimensions(): [number, number] {
    return [1024, 768];
  }

  /** Connect to the browser and install the blocked-domain request filter. */
  async enter(): Promise<this> {
    // Call the subclass hook for getting browser/page
    [this._browser, this._page] = await this._getBrowserAndPage();
    this._assertPage();

    // Abort requests to domains in BLOCKED_DOMAINS, let everything else through.
    const handleRoute = async (route: Route, request: Request): Promise<void> => {
      const url = request.url();
      let blocked = false;
      try {
        blocked = utils.checkBlocklistedUrl(url);
      } catch {
        // An URL the checker cannot parse is treated as not blocked, so the
        // request is still resolved instead of hanging.
      }
      try {
        if (blocked) {
          console.log(`Flagging blocked domain: ${url}`);
          await route.abort();
        } else {
          await route.continue();
        }
      } catch (e) {
        // Typically the page was closed while the request was in flight.
        console.log(`Error handling route for ${url}: ${e}`);
      }
    };

    // Route on the context rather than the page so that tabs opened later
    // are filtered as well.
    await this._page.context().route("**/*", handleRoute);
    return this;
  }

  /** Close the browser (for a remote browser: the connection to it). */
  async exit(): Promise<void> {
    if (this._browser) {
      await this._browser.close();
    }
  }

  /** URL of the page currently being driven. */
  getCurrentUrl(): string {
    this._assertPage();
    return this._page.url();
  }

  // Common "Computer" actions

  /** Capture the viewport (not the full page) as base64-encoded WebP. */
  async screenshot(): Promise<string> {
    this._assertPage();
    const screenshotBuffer = await this._page.screenshot({ fullPage: false });
    const webpBuffer = await sharp(screenshotBuffer).webp().toBuffer();
    return webpBuffer.toString("base64");
  }

  /**
   * Click at (x, y). The `back`/`forward` buttons navigate the history,
   * `wheel` scrolls, anything unrecognised falls back to a left click.
   */
  async click(button: string = "left", x: number, y: number): Promise<void> {
    this._assertPage();
    switch (button) {
      case "back":
        await this.back();
        break;
      case "forward":
        await this.forward();
        break;
      case "wheel":
        await this._page.mouse.wheel(x, y);
        break;
      default: {
        const buttonType: "left" | "right" = button === "right" ? "right" : "left";
        await this._page.mouse.click(x, y, { button: buttonType });
      }
    }
  }

  /** Double-click at (x, y). */
  async doubleClick(x: number, y: number): Promise<void> {
    this._assertPage();
    await this._page.mouse.dblclick(x, y);
  }

  /** Move the mouse to (x, y), then scroll the window by (scrollX, scrollY) pixels. */
  async scroll(
    x: number,
    y: number,
    scrollX: number,
    scrollY: number,
  ): Promise<void> {
    this._assertPage();
    await this._page.mouse.move(x, y);
    // Pass the deltas as data instead of splicing model-provided values into
    // a script string.
    await this._page.evaluate(
      ([deltaX, deltaY]) => window.scrollBy(deltaX, deltaY),
      [scrollX, scrollY] as const,
    );
  }

  /** Type text with the keyboard into the focused element. */
  async type(text: string): Promise<void> {
    this._assertPage();
    await this._page.keyboard.type(text);
  }

  /** Press a key combination: all keys down in order, then released in reverse order. */
  async keypress(keys: string[]): Promise<void> {
    this._assertPage();
    const mappedKeys = keys.map(
      (key) => CUA_KEY_TO_PLAYWRIGHT_KEY[key.toLowerCase()] || key,
    );
    for (const key of mappedKeys) {
      await this._page.keyboard.down(key);
    }
    for (const key of [...mappedKeys].reverse()) {
      await this._page.keyboard.up(key);
    }
  }

  /** Pause for `ms` milliseconds. */
  async wait(ms: number = 1000): Promise<void> {
    await new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Move the mouse to (x, y). */
  async move(x: number, y: number): Promise<void> {
    this._assertPage();
    await this._page.mouse.move(x, y);
  }

  /** Drag with the mouse button held along `path`; an empty path is a no-op. */
  async drag(path: Point[]): Promise<void> {
    this._assertPage();
    const [first, ...rest] = path;
    if (!first) return;
    await this._page.mouse.move(first.x, first.y);
    await this._page.mouse.down();
    for (const point of rest) {
      await this._page.mouse.move(point.x, point.y);
    }
    await this._page.mouse.up();
  }

  // Extra browser-oriented actions

  /** Navigate to `url`. Navigation errors are logged, not thrown. */
  async goto(url: string): Promise<any> {
    this._assertPage();
    try {
      return await this._page.goto(url);
    } catch (e) {
      console.log(`Error navigating to ${url}: ${e}`);
    }
  }

  /** Navigate back in the browser history. */
  async back(): Promise<any> {
    this._assertPage();
    return await this._page.goBack();
  }

  /** Navigate forward in the browser history. */
  async forward(): Promise<any> {
    this._assertPage();
    return await this._page.goForward();
  }

  // Subclass hook
  async _getBrowserAndPage(): Promise<[Browser, Page]> {
    // Subclasses must implement, returning [Browser, Page]
    throw new Error("Subclasses must implement _getBrowserAndPage()");
  }
}
import { chromium, type Browser, type Page } from "playwright";
import { BasePlaywrightComputer } from "./base";

/**
 * KernelPlaywrightComputer connects to a remote browser instance via CDP WebSocket URL.
 * Similar to LocalPlaywrightComputer but uses an existing browser instance instead of launching one.
 */
export class KernelPlaywrightComputer extends BasePlaywrightComputer {
  private cdp_ws_url: string;

  /** @param cdp_ws_url CDP WebSocket URL of the browser to attach to. */
  constructor(cdp_ws_url: string) {
    super();
    this.cdp_ws_url = cdp_ws_url;
  }

  /** Attach to the remote browser, reusing its first context and page when they exist. */
  async _getBrowserAndPage(): Promise<[Browser, Page]> {
    const [width, height] = this.getDimensions();

    // Connect to existing browser instance via CDP
    const browser = await chromium.connectOverCDP(this.cdp_ws_url);

    // Get existing context or create new one
    const context = browser.contexts()[0] ?? (await browser.newContext());

    // Follow pages (tabs) opened later
    context.on("page", this._handleNewPage.bind(this));

    // Get existing page or create new one
    const page = context.pages()[0] ?? (await context.newPage());

    // Set viewport size
    await page.setViewportSize({ width, height });
    page.on("close", this._handlePageClose.bind(this));

    return [browser, page];
  }

  /** Switch to a newly created page and watch for its closure. */
  private _handleNewPage(page: Page): void {
    console.log("New page created");
    this._page = page;
    page.on("close", this._handlePageClose.bind(this));
  }

  /** When the page being driven closes, fall back to the last remaining page, if any. */
  private _handlePageClose(page: Page): void {
    console.log("Page closed");
    // Only the closure of the page currently in use needs handling.
    if (!this._page || this._page !== page) return;

    const browser = this._browser;
    if (!browser) {
      console.log("Warning: Browser or context not available.");
      this._page = null;
      return;
    }

    const context = browser.contexts()[0];
    if (!context) {
      console.log("Warning: No browser contexts available.");
      this._page = null;
      return;
    }

    const pages = context.pages();
    const fallback = pages[pages.length - 1];
    if (fallback) {
      this._page = fallback;
    } else {
      console.log("Warning: All pages have been closed.");
      this._page = null;
    }
  }
}
/**
 * Launches a local Chromium instance using Playwright.
 */
export class LocalPlaywrightComputer extends BasePlaywrightComputer {
  private headless: boolean;

  /** @param headless Launch Chromium without a visible window. */
  constructor(headless: boolean = false) {
    super();
    this.headless = headless;
  }

  /** Launch Chromium, open a page sized to the computer's dimensions and load a start page. */
  async _getBrowserAndPage(): Promise<[Browser, Page]> {
    const [width, height] = this.getDimensions();
    const launchArgs = [
      `--window-size=${width},${height}`,
      "--disable-extensions",
      "--disable-file-system",
    ];

    // Playwright's `env` option REPLACES the browser's environment, so start
    // from the current one (PATH, HOME, ...) and only default DISPLAY when it
    // is not already set.
    const env: Record<string, string> = { DISPLAY: ":0" };
    for (const [name, value] of Object.entries(process.env)) {
      if (value !== undefined) env[name] = value;
    }

    const browser = await chromium.launch({
      headless: this.headless,
      args: launchArgs,
      env,
    });

    const context = await browser.newContext();

    // Follow pages (tabs) opened later
    context.on("page", this._handleNewPage.bind(this));

    const page = await context.newPage();
    await page.setViewportSize({ width, height });
    page.on("close", this._handlePageClose.bind(this));

    await page.goto("https://bing.com");

    return [browser, page];
  }

  /** Switch to a newly created page and watch for its closure. */
  private _handleNewPage(page: Page): void {
    console.log("New page created");
    this._page = page;
    page.on("close", this._handlePageClose.bind(this));
  }

  /** When the page being driven closes, fall back to the last remaining page, if any. */
  private _handlePageClose(page: Page): void {
    console.log("Page closed");
    // Only the closure of the page currently in use needs handling.
    if (!this._page || this._page !== page) return;

    const browser = this._browser;
    if (!browser) {
      console.log("Warning: Browser or context not available.");
      this._page = null;
      return;
    }

    const context = browser.contexts()[0];
    if (!context) {
      console.log("Warning: No browser contexts available.");
      this._page = null;
      return;
    }

    const pages = context.pages();
    const fallback = pages[pages.length - 1];
    if (fallback) {
      this._page = fallback;
    } else {
      console.log("Warning: All pages have been closed.");
      this._page = null;
    }
  }
}
import "dotenv/config";
import sharp from "sharp";
import OpenAI from "openai";

// Domains (and all of their subdomains) the agent must not load.
const BLOCKED_DOMAINS: readonly string[] = [
  "maliciousbook.com",
  "evilvideos.com",
  "darkwebforum.com",
  "shadytok.com",
  "suspiciouspins.com",
  "ilanbigio.com",
];

interface ImageDimensions {
  width: number;
  height: number;
}

interface Message {
  [key: string]: any;
}

// Created on first use, so importing this module does not require the API key.
let openaiClient: OpenAI | undefined;

/**
 * Read the pixel dimensions of a base64-encoded image.
 *
 * @throws Error when the image metadata carries no width/height.
 */
async function calculateImageDimensions(
  base64Image: string,
): Promise<ImageDimensions> {
  const imageBuffer = Buffer.from(base64Image, "base64");
  const { width, height } = await sharp(imageBuffer).metadata();
  if (width === undefined || height === undefined) {
    throw new Error("Unable to determine image dimensions");
  }
  return { width, height };
}

/** Return a copy of the message with image_url omitted for computer_call_output messages. */
function sanitizeMessage(msg: Message): Message {
  if (msg.type !== "computer_call_output") return msg;

  const output = msg.output || {};
  if (typeof output !== "object") return msg;

  return { ...msg, output: { ...output, image_url: "[omitted]" } };
}

/**
 * Create a response with the OpenAI Responses API.
 *
 * @param kwargs Request parameters, passed through unchanged.
 * @throws Rethrows any API error after logging its status and message.
 */
async function createResponse(kwargs: any): Promise<any> {
  openaiClient ??= new OpenAI();
  try {
    return await openaiClient.responses.create(kwargs);
  } catch (error: any) {
    console.error(`Error: ${error?.status} ${error?.message}`);
    throw error;
  }
}

/**
 * Return true if the given URL (including subdomains) is in the blocklist.
 * A string that is not a parsable absolute URL is reported as not blocked.
 */
function checkBlocklistedUrl(url: string): boolean {
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    // Callers use this from request handlers; never throw on odd input.
    return false;
  }

  // Drop the trailing dot of a fully-qualified name ("evil.com.") so it
  // cannot bypass the comparison.
  hostname = hostname.toLowerCase().replace(/\.+$/, "");
  if (!hostname) return false;

  return BLOCKED_DOMAINS.some(
    (blocked) => hostname === blocked || hostname.endsWith(`.${blocked}`),
  );
}

export default {
  calculateImageDimensions,
  sanitizeMessage,
  createResponse,
  checkBlocklistedUrl,
};
"@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "playwright": "^1.52.0" } } diff --git a/templates/typescript/stagehand/package.json b/templates/typescript/stagehand/package.json index 92fa508..81a3af5 100644 --- a/templates/typescript/stagehand/package.json +++ b/templates/typescript/stagehand/package.json @@ -8,7 +8,7 @@ }, "dependencies": { "@browserbasehq/stagehand": "^2.2.1", - "@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "zod": "^3.25.7" } }