Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ create-kernel-app [app-name] [options]
- `stagehand`: Template with Stagehand SDK (Typescript only)
- `advanced-sample`: Implements sample apps using advanced Kernel configs
- `computer-use`: Implements a prompt loop using Anthropic Computer Use
- `cua-sample`: Implements a Computer Use Agent (OpenAI CUA) sample (Typescript only)

### Examples

Expand Down Expand Up @@ -121,6 +122,9 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google

# Python + Browser Use
kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}'

# Typescript + CUA Sample
kernel invoke ts-cua agent-run --payload '{"query": "open hackernews and get the top 5 articles"}'
```

## Sample apps reference
Expand All @@ -134,6 +138,7 @@ These are the sample apps currently available when you run `npx @onkernel/create
| **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` |
| **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a |
| **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` |
| **cua-sample** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ query }` |

## Documentation

Expand Down
15 changes: 14 additions & 1 deletion index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ type TemplateKey =
| "browser-use"
| "stagehand"
| "advanced-sample"
| "computer-use";
| "computer-use"
| "cua-sample";
type LanguageInfo = { name: string; shorthand: string };
type TemplateInfo = {
name: string;
Expand All @@ -34,6 +35,7 @@ const TEMPLATE_BROWSER_USE = "browser-use";
const TEMPLATE_STAGEHAND = "stagehand";
const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample";
const TEMPLATE_COMPUTER_USE = "computer-use";
const TEMPLATE_CUA_SAMPLE = "cua-sample";
const LANGUAGE_SHORTHAND_TS = "ts";
const LANGUAGE_SHORTHAND_PY = "py";

Expand Down Expand Up @@ -73,6 +75,11 @@ const TEMPLATES: Record<TemplateKey, TemplateInfo> = {
description: "Implements the Anthropic Computer Use SDK",
languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON],
},
[TEMPLATE_CUA_SAMPLE]: {
name: "CUA Sample",
description: "Implements a Computer Use Agent (OpenAI CUA) sample",
languages: [LANGUAGE_TYPESCRIPT],
},
};

const INVOKE_SAMPLES: Record<
Expand All @@ -88,6 +95,8 @@ const INVOKE_SAMPLES: Record<
'kernel invoke ts-advanced test-captcha-solver',
[TEMPLATE_COMPUTER_USE]:
'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'',
[TEMPLATE_CUA_SAMPLE]:
'kernel invoke ts-cua cua-task --payload \'{"query": "open hackernews and get the top 5 articles"}\'',
},
[LANGUAGE_PYTHON]: {
[TEMPLATE_SAMPLE_APP]:
Expand All @@ -114,6 +123,8 @@ const REGISTERED_APP_NAMES: Record<
'ts-advanced',
[TEMPLATE_COMPUTER_USE]:
'ts-cu',
[TEMPLATE_CUA_SAMPLE]:
'ts-cua',
},
[LANGUAGE_PYTHON]: {
[TEMPLATE_SAMPLE_APP]:
Expand Down Expand Up @@ -354,6 +365,8 @@ function printNextSteps(
? "kernel deploy index.ts --env OPENAI_API_KEY=XXX"
: language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE
? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX"
: language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA_SAMPLE
? "kernel deploy index.ts --env OPENAI_API_KEY=XXX"
: language === LANGUAGE_PYTHON && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_ADVANCED_SAMPLE)
? "kernel deploy main.py"
: language === LANGUAGE_PYTHON && template === TEMPLATE_BROWSER_USE
Expand Down
127 changes: 125 additions & 2 deletions templates/python/browser-use/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,130 @@ class TaskInput(TypedDict):

# LLM API Keys are set in the environment during `kernel deploy <filename> -e OPENAI_API_KEY=XXX`
# See https://docs.onkernel.com/launch/deploy#environment-variables
llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o-mini")


# Define a subclass of BrowserSession that overrides _setup_viewports (which mishandles resizing when connecting via CDP)
class BrowserSessionCustomResize(BrowserSession):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this! Can you move it to a separate file so we can keep main.py nice and clean?

async def _setup_viewports(self) -> None:
    """Resize any existing page viewports to match the configured size, set up storage_state, permissions, geolocation, etc.

    Forces the profile's window, viewport and screen size to fixed values, then
    applies permissions, timeouts, extra HTTP headers and geolocation from the
    profile to ``self.browser_context``. Each of those steps is best-effort:
    a failure is logged as a warning and does not abort the setup.

    Returns:
        None. Returns early when the JavaScript resize fallback succeeds.

    Raises:
        AssertionError: if ``self.browser_context`` is not set.
    """

    assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'

    # Force one fixed geometry regardless of what the profile was configured with.
    # NOTE(review): 786 looks like it may be a typo for 768 (1024x768) - confirm.
    self.browser_profile.window_size = {"width": 1024, "height": 786}
    self.browser_profile.viewport = {"width": 1024, "height": 786}
    self.browser_profile.screen = {"width": 1024, "height": 786}
    self.browser_profile.device_scale_factor = 1.0

    # log the viewport settings to terminal
    viewport = self.browser_profile.viewport
    print(
        '📐 Setting up viewport: '
        + f'headless={self.browser_profile.headless} '
        + (
            f'window={self.browser_profile.window_size["width"]}x{self.browser_profile.window_size["height"]}px '
            if self.browser_profile.window_size
            else '(no window) '
        )
        + (
            f'screen={self.browser_profile.screen["width"]}x{self.browser_profile.screen["height"]}px '
            if self.browser_profile.screen
            else ''
        )
        + (f'viewport={viewport["width"]}x{viewport["height"]}px ' if viewport else '(no viewport) ')
        + f'device_scale_factor={self.browser_profile.device_scale_factor or 1.0} '
        + f'is_mobile={self.browser_profile.is_mobile} '
        + (f'color_scheme={self.browser_profile.color_scheme.value} ' if self.browser_profile.color_scheme else '')
        + (f'locale={self.browser_profile.locale} ' if self.browser_profile.locale else '')
        + (f'timezone_id={self.browser_profile.timezone_id} ' if self.browser_profile.timezone_id else '')
        + (f'geolocation={self.browser_profile.geolocation} ' if self.browser_profile.geolocation else '')
        + (f'permissions={",".join(self.browser_profile.permissions or ["<none>"])} ')
    )

    # if we have any viewport settings in the profile, make sure to apply them to the entire browser_context as defaults
    if self.browser_profile.permissions:
        try:
            await self.browser_context.grant_permissions(self.browser_profile.permissions)
        except Exception as e:
            self.logger.warning(
                f'⚠️ Failed to grant browser permissions {self.browser_profile.permissions}: {type(e).__name__}: {e}'
            )
    try:
        if self.browser_profile.default_timeout:
            self.browser_context.set_default_timeout(self.browser_profile.default_timeout)
        if self.browser_profile.default_navigation_timeout:
            self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout)
    except Exception as e:
        self.logger.warning(
            f'⚠️ Failed to set playwright timeout settings '
            f'cdp_api={self.browser_profile.default_timeout} '
            f'navigation={self.browser_profile.default_navigation_timeout}: {type(e).__name__}: {e}'
        )
    try:
        if self.browser_profile.extra_http_headers:
            self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers)
    except Exception as e:
        self.logger.warning(
            f'⚠️ Failed to setup playwright extra_http_headers: {type(e).__name__}: {e}'
        )  # dont print the secret header contents in the logs!

    try:
        if self.browser_profile.geolocation:
            await self.browser_context.set_geolocation(self.browser_profile.geolocation)
    except Exception as e:
        self.logger.warning(
            f'⚠️ Failed to update browser geolocation {self.browser_profile.geolocation}: {type(e).__name__}: {e}'
        )

    await self.load_storage_state()

    page = None

    for page in self.browser_context.pages:
        # apply viewport size settings to any existing pages
        if viewport:
            await page.set_viewport_size(viewport)

        # show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
        if page.url == 'about:blank':
            await self._show_dvd_screensaver_loading_animation(page)

    # Reuse the last existing page, or open one when the context had none.
    page = page or (await self.browser_context.new_page())

    # NOTE(review): viewport is assigned a non-empty dict above, so this branch looks
    # unreachable unless the profile drops or rewrites that assignment - confirm.
    if (not viewport) and (self.browser_profile.window_size is not None) and not self.browser_profile.headless:
        # attempt to resize the actual browser window

        # cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
        try:
            cdp_session = await page.context.new_cdp_session(page)
            window_id_result = await cdp_session.send('Browser.getWindowForTarget')
            await cdp_session.send(
                'Browser.setWindowBounds',
                {
                    'windowId': window_id_result['windowId'],
                    'bounds': {
                        **self.browser_profile.window_size,
                        'windowState': 'normal',  # Ensure window is not minimized/maximized
                    },
                },
            )
            await cdp_session.detach()
        except Exception as cdp_error:
            # The CDP failure gets its own name: a nested `except ... as e` unbinds `e`
            # when it exits, which made the warning below raise UnboundLocalError.
            size = self.browser_profile.window_size
            try:
                # fallback to javascript resize if cdp setWindowBounds fails
                # page.evaluate takes one optional argument, so the size is passed
                # as a single object and destructured on the JavaScript side.
                await page.evaluate(
                    """({width, height}) => {window.resizeTo(width, height)}""",
                    dict(size),
                )
                return
            except Exception as js_error:
                # Report the fallback failure alongside the CDP one instead of dropping it.
                fallback_note = f' (window.resizeTo fallback also failed: {type(js_error).__name__}: {js_error})'

            self.logger.warning(
                f'⚠️ Failed to resize browser window to {size["width"]}x{size["height"]}px using CDP setWindowBounds: {type(cdp_error).__name__}: {cdp_error}'
                + fallback_note
            )


@app.action("bu-task")
async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
Expand All @@ -37,7 +160,7 @@ async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
#task="Compare the price of gpt-4o and DeepSeek-V3",
task=input_data["task"],
llm=llm,
browser_session=BrowserSession(cdp_url=kernel_browser.cdp_ws_url)
browser_session=BrowserSessionCustomResize(cdp_url=kernel_browser.cdp_ws_url)
)
result = await agent.run()
if result.final_result() is not None:
Expand Down
2 changes: 2 additions & 0 deletions templates/typescript/cua-sample/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename the folder /cua-sample/ to just /cua/ so it matches other examples

bun.lockb
1 change: 1 addition & 0 deletions templates/typescript/cua-sample/.prettierignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
4 changes: 4 additions & 0 deletions templates/typescript/cua-sample/.prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we update this (this is what we're using in other repos)?

{
"semi": true,
"trailingComma": "all",
"singleQuote": true,
"printWidth": 100,
"tabWidth": 2
}

"tabWidth": 1,
"useTabs": true
}
8 changes: 8 additions & 0 deletions templates/typescript/cua-sample/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Kernel Typescript Sample App - CUA

This is a Kernel application that demonstrates using the Computer Using Agent (CUA) from OpenAI.

It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation.
It also uses the latest OpenAI SDK format and provides local equivalents of the Kernel methods, so you can test locally before deploying on Kernel.

See the [docs](https://docs.onkernel.com/quickstart) for information.
110 changes: 110 additions & 0 deletions templates/typescript/cua-sample/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// @ts-nocheck
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove and update type errors as needed


import "dotenv/config";
import { Kernel, type KernelContext } from "@onkernel/sdk";
import { chromium } from "playwright";
import { Agent } from "./lib/agent";
import computers from "./lib/computers";

// Kernel SDK client and the app that the action below is registered on.
const kernel = new Kernel();
const app = kernel.app("ts-cua");

// LLM API Keys are set in the environment during `kernel deploy <filename> -e OPENAI_API_KEY=XXX`
// See https://docs.onkernel.com/launch/deploy#environment-variables
// Throws at module load when the key is missing.
if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set');

/**
* Example app that runs an agent using the OpenAI CUA
* Args:
* ctx: Kernel context containing invocation information
* payload: An object with a `query` property
* Returns:
* An answer to the query, elapsed time and optionally the messages stack
* Invoke this via CLI:
* export KERNEL_API_KEY=<your_api_key>
* kernel deploy index.ts -e OPENAI_API_KEY=XXXXX --force
* kernel invoke ts-cua agent-run -p "{\"query\":\"current market price range for a used dreamcast\"}"
* kernel logs ts-cua -f # Open in separate tab
*/

/** Payload accepted by the `agent-run` action. */
interface CuaInput {
/** Task text that is sent to the agent as the user's `input_text` message. */
query: string;
}

/**
 * Result returned by the `agent-run` action.
 *
 * The shape mirrors what the handler actually returns: `response` is optional
 * because the handler currently leaves it out, and `answer` may be `null`.
 */
interface CuaOutput {
  /** Elapsed time of the run in seconds, rounded to two decimals. */
  elapsed: number;
  /** Full message stack from the agent run, when the handler includes it. */
  response?: Array<object>;
  /**
   * Text of the first content item of the last message, or `null` when the
   * agent produced none. NOTE(review): presumably always a string - confirm
   * against the agent's message types.
   */
  answer: string | null;
}

app.action<CuaInput, CuaOutput>(
"agent-run",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to "cua-task" to conform to other examples

async (ctx: KernelContext, payload?: CuaInput): Promise<CuaOutput> => {
const startTime = Date.now();
const kernelBrowser = await kernel.browsers.create({
invocation_id: ctx.invocation_id,
});
console.log(
"> Kernel browser live view url: ",
kernelBrowser.browser_live_view_url,
);

try {

// kernel browser
const { computer } = await computers.create({
type: "kernel",
cdp_ws_url: kernelBrowser.cdp_ws_url,
});

// setup agent
const agent = new Agent(
"computer-use-preview",
computer,
[], // additional tools
(message: string) => {
console.log(`> safety check: ${message}`);
return true; // Auto-acknowledge all safety checks for testing
},
);

// start agent run
const response = await agent.runFullTurn(
[
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would prefer runFullTurn to accept an obj with named vars

{
role: "system",
content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString("en-US", { weekday: "long" })})`,
},
{
type: "message",
role: "user",
content: [
{
type: "input_text",
text: payload.query,
// text: "go to https://news.ycombinator.com , open top article , describe the target website design (in yaml format)"
},
],
},
],
true, // print_steps
true, // debug
false, // show_images
);

console.log("> agent run done");

const endTime = Date.now();
const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds

return {
// response, // full messages stack trace
elapsed: parseFloat(timeElapsed.toFixed(2)),
answer: response?.slice(-1)?.[0]?.content?.[0]?.text ?? null,
};
} finally {
// Note: KernelPlaywrightComputer handles browser cleanup internally
// No need to manually close browser here
}
},
);
Loading