Skip to content

Commit d628368

Browse files
author
raidendotai
committed
ts-cua updates, python-bu cleanup
2 parents 7ff9af9 + 3fe0b0d commit d628368

File tree

30 files changed

+320
-283
lines changed

30 files changed

+320
-283
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ create-kernel-app [app-name] [options]
4747
- `stagehand`: Template with Stagehand SDK (Typescript only)
4848
- `advanced-sample`: Implements sample apps using advanced Kernel configs
4949
- `computer-use`: Implements a prompt loop using Anthropic Computer Use
50-
- `cua-sample`: Implements a Computer Use Agent (OpenAI CUA) sample (Typescript only)
50+
- `cua`: Implements a Computer Use Agent (OpenAI CUA) sample (Typescript only)
5151

5252
### Examples
5353

@@ -124,7 +124,7 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google
124124
kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}'
125125

126126
# Typescript + CUA Sample
127-
kernel invoke ts-cua agent-run --payload '{"query": "open hackernews and get the top 5 articles"}'
127+
kernel invoke ts-cua cua-task --payload '{"query": "open hackernews and get the top 5 articles"}'
128128
```
129129

130130
## Sample apps reference
@@ -138,7 +138,7 @@ These are the sample apps currently available when you run `npx @onkernel/create
138138
| **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` |
139139
| **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a | n/a |
140140
| **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` |
141-
| **cua-sample** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ query }` |
141+
| **cua** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ query }` |
142142

143143
## Documentation
144144

index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ type TemplateKey =
1919
| "stagehand"
2020
| "advanced-sample"
2121
| "computer-use"
22-
| "cua-sample";
22+
| "cua";
2323
type LanguageInfo = { name: string; shorthand: string };
2424
type TemplateInfo = {
2525
name: string;
@@ -35,7 +35,7 @@ const TEMPLATE_BROWSER_USE = "browser-use";
3535
const TEMPLATE_STAGEHAND = "stagehand";
3636
const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample";
3737
const TEMPLATE_COMPUTER_USE = "computer-use";
38-
const TEMPLATE_CUA_SAMPLE = "cua-sample";
38+
const TEMPLATE_CUA_SAMPLE = "cua";
3939
const LANGUAGE_SHORTHAND_TS = "ts";
4040
const LANGUAGE_SHORTHAND_PY = "py";
4141

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@onkernel/create-kernel-app",
3-
"version": "0.1.19",
3+
"version": "0.1.22",
44
"description": "Create Kernel sample applications",
55
"main": "dist/index.js",
66
"type": "module",

templates/python/advanced-sample/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ description = "Sample application implementing advanced Kernel configs"
55
readme = "README.md"
66
requires-python = ">=3.11"
77
dependencies = [
8-
"kernel>=0.5.0",
8+
"kernel>=0.6.0",
99
"playwright>=1.52.0"
1010
]
1111

templates/python/browser-use/main.py

Lines changed: 2 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from langchain_openai import ChatOpenAI
2-
from browser_use import Agent, BrowserSession
2+
from browser_use import Agent
33
import kernel
44
from kernel import Kernel
55
from typing import TypedDict
6+
from session import BrowserSessionCustomResize
67

78
client = Kernel()
89

@@ -15,129 +16,6 @@ class TaskInput(TypedDict):
1516
# See https://docs.onkernel.com/launch/deploy#environment-variables
1617
llm = ChatOpenAI(model="gpt-4o-mini")
1718

18-
19-
# Define a subclass of BrowserSession that overrides _setup_viewports (which mishandles resizing when connecting via CDP)
20-
class BrowserSessionCustomResize(BrowserSession):
21-
async def _setup_viewports(self) -> None:
22-
"""Resize any existing page viewports to match the configured size, set up storage_state, permissions, geolocation, etc."""
23-
24-
assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'
25-
26-
self.browser_profile.window_size = {"width": 1024, "height": 786}
27-
self.browser_profile.viewport = {"width": 1024, "height": 786}
28-
self.browser_profile.screen = {"width": 1024, "height": 786}
29-
self.browser_profile.device_scale_factor = 1.0
30-
31-
# log the viewport settings to terminal
32-
viewport = self.browser_profile.viewport
33-
print(
34-
'📐 Setting up viewport: '
35-
+ f'headless={self.browser_profile.headless} '
36-
+ (
37-
f'window={self.browser_profile.window_size["width"]}x{self.browser_profile.window_size["height"]}px '
38-
if self.browser_profile.window_size
39-
else '(no window) '
40-
)
41-
+ (
42-
f'screen={self.browser_profile.screen["width"]}x{self.browser_profile.screen["height"]}px '
43-
if self.browser_profile.screen
44-
else ''
45-
)
46-
+ (f'viewport={viewport["width"]}x{viewport["height"]}px ' if viewport else '(no viewport) ')
47-
+ f'device_scale_factor={self.browser_profile.device_scale_factor or 1.0} '
48-
+ f'is_mobile={self.browser_profile.is_mobile} '
49-
+ (f'color_scheme={self.browser_profile.color_scheme.value} ' if self.browser_profile.color_scheme else '')
50-
+ (f'locale={self.browser_profile.locale} ' if self.browser_profile.locale else '')
51-
+ (f'timezone_id={self.browser_profile.timezone_id} ' if self.browser_profile.timezone_id else '')
52-
+ (f'geolocation={self.browser_profile.geolocation} ' if self.browser_profile.geolocation else '')
53-
+ (f'permissions={",".join(self.browser_profile.permissions or ["<none>"])} ')
54-
)
55-
56-
# if we have any viewport settings in the profile, make sure to apply them to the entire browser_context as defaults
57-
if self.browser_profile.permissions:
58-
try:
59-
await self.browser_context.grant_permissions(self.browser_profile.permissions)
60-
except Exception as e:
61-
self.logger.warning(
62-
f'⚠️ Failed to grant browser permissions {self.browser_profile.permissions}: {type(e).__name__}: {e}'
63-
)
64-
try:
65-
if self.browser_profile.default_timeout:
66-
self.browser_context.set_default_timeout(self.browser_profile.default_timeout)
67-
if self.browser_profile.default_navigation_timeout:
68-
self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout)
69-
except Exception as e:
70-
self.logger.warning(
71-
f'⚠️ Failed to set playwright timeout settings '
72-
f'cdp_api={self.browser_profile.default_timeout} '
73-
f'navigation={self.browser_profile.default_navigation_timeout}: {type(e).__name__}: {e}'
74-
)
75-
try:
76-
if self.browser_profile.extra_http_headers:
77-
self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers)
78-
except Exception as e:
79-
self.logger.warning(
80-
f'⚠️ Failed to setup playwright extra_http_headers: {type(e).__name__}: {e}'
81-
) # dont print the secret header contents in the logs!
82-
83-
try:
84-
if self.browser_profile.geolocation:
85-
await self.browser_context.set_geolocation(self.browser_profile.geolocation)
86-
except Exception as e:
87-
self.logger.warning(
88-
f'⚠️ Failed to update browser geolocation {self.browser_profile.geolocation}: {type(e).__name__}: {e}'
89-
)
90-
91-
await self.load_storage_state()
92-
93-
page = None
94-
95-
for page in self.browser_context.pages:
96-
# apply viewport size settings to any existing pages
97-
if viewport:
98-
await page.set_viewport_size(viewport)
99-
100-
# show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
101-
if page.url == 'about:blank':
102-
await self._show_dvd_screensaver_loading_animation(page)
103-
104-
page = page or (await self.browser_context.new_page())
105-
106-
if (not viewport) and (self.browser_profile.window_size is not None) and not self.browser_profile.headless:
107-
# attempt to resize the actual browser window
108-
109-
# cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
110-
try:
111-
cdp_session = await page.context.new_cdp_session(page)
112-
window_id_result = await cdp_session.send('Browser.getWindowForTarget')
113-
await cdp_session.send(
114-
'Browser.setWindowBounds',
115-
{
116-
'windowId': window_id_result['windowId'],
117-
'bounds': {
118-
**self.browser_profile.window_size,
119-
'windowState': 'normal', # Ensure window is not minimized/maximized
120-
},
121-
},
122-
)
123-
await cdp_session.detach()
124-
except Exception as e:
125-
_log_size = lambda size: f'{size["width"]}x{size["height"]}px'
126-
try:
127-
# fallback to javascript resize if cdp setWindowBounds fails
128-
await page.evaluate(
129-
"""(width, height) => {window.resizeTo(width, height)}""",
130-
**self.browser_profile.window_size,
131-
)
132-
return
133-
except Exception as e:
134-
pass
135-
136-
self.logger.warning(
137-
f'⚠️ Failed to resize browser window to {_log_size(self.browser_profile.window_size)} using CDP setWindowBounds: {type(e).__name__}: {e}'
138-
)
139-
140-
14119
@app.action("bu-task")
14220
async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
14321
"""

templates/python/browser-use/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ readme = "README.md"
66
requires-python = ">=3.11"
77
dependencies = [
88
"browser-use~=0.2.4",
9-
"kernel>=0.5.0",
9+
"kernel>=0.6.0",
1010
"langchain-openai>=0.3.11",
1111
"pydantic>=2.10.6",
1212
]
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
from browser_use import BrowserSession
2+
3+
# Define a subclass of BrowserSession that overrides _setup_viewports (which mishandles resizing when connecting via CDP)
4+
class BrowserSessionCustomResize(BrowserSession):
5+
async def _setup_viewports(self) -> None:
6+
"""Resize any existing page viewports to match the configured size, set up storage_state, permissions, geolocation, etc."""
7+
8+
assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'
9+
10+
self.browser_profile.window_size = {"width": 1024, "height": 786}
11+
self.browser_profile.viewport = {"width": 1024, "height": 786}
12+
self.browser_profile.screen = {"width": 1024, "height": 786}
13+
self.browser_profile.device_scale_factor = 1.0
14+
15+
# keep a reference to the configured viewport; it is applied to every existing page further down
16+
viewport = self.browser_profile.viewport
17+
# apply the profile's permissions, timeouts, extra HTTP headers and geolocation to the entire browser_context as defaults
18+
if self.browser_profile.permissions:
19+
try:
20+
await self.browser_context.grant_permissions(self.browser_profile.permissions)
21+
except Exception as e:
22+
print(e)
23+
try:
24+
if self.browser_profile.default_timeout:
25+
self.browser_context.set_default_timeout(self.browser_profile.default_timeout)
26+
if self.browser_profile.default_navigation_timeout:
27+
self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout)
28+
except Exception as e:
29+
print(e)
30+
try:
31+
if self.browser_profile.extra_http_headers:
32+
self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers)
33+
except Exception as e:
34+
print(e)
35+
36+
try:
37+
if self.browser_profile.geolocation:
38+
await self.browser_context.set_geolocation(self.browser_profile.geolocation)
39+
except Exception as e:
40+
print(e)
41+
42+
await self.load_storage_state()
43+
44+
page = None
45+
46+
for page in self.browser_context.pages:
47+
# apply viewport size settings to any existing pages
48+
if viewport:
49+
await page.set_viewport_size(viewport)
50+
51+
# show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
52+
if page.url == 'about:blank':
53+
await self._show_dvd_screensaver_loading_animation(page)
54+
55+
page = page or (await self.browser_context.new_page())
56+
57+
if (not viewport) and (self.browser_profile.window_size is not None) and not self.browser_profile.headless:
58+
# attempt to resize the actual browser window
59+
60+
# cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
61+
try:
62+
cdp_session = await page.context.new_cdp_session(page)
63+
window_id_result = await cdp_session.send('Browser.getWindowForTarget')
64+
await cdp_session.send(
65+
'Browser.setWindowBounds',
66+
{
67+
'windowId': window_id_result['windowId'],
68+
'bounds': {
69+
**self.browser_profile.window_size,
70+
'windowState': 'normal', # Ensure window is not minimized/maximized
71+
},
72+
},
73+
)
74+
await cdp_session.detach()
75+
except Exception as e:
76+
_log_size = lambda size: f'{size["width"]}x{size["height"]}px'
77+
try:
78+
# fallback to javascript resize if cdp setWindowBounds fails
79+
await page.evaluate(
80+
"""(width, height) => {window.resizeTo(width, height)}""",
81+
**self.browser_profile.window_size,
82+
)
83+
return
84+
except Exception as e:
85+
pass

templates/python/computer-use/tools/computer.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -322,16 +322,26 @@ async def __call__(
322322
await self.page.mouse.move(x, y)
323323

324324
# Map scroll directions to Playwright's wheel events
325+
page_dimensions = await self.page.evaluate(
326+
"() => Promise.resolve({ h: window.innerHeight, w: window.innerWidth })"
327+
)
328+
page_partitions = 25
329+
scroll_factor = scroll_amount / page_partitions
330+
page_width = page_dimensions['w']
331+
page_height = page_dimensions['h']
332+
325333
delta_x = 0
326334
delta_y = 0
327335
if scroll_direction == "up":
328-
delta_y = -scroll_amount * 100
336+
delta_y = -scroll_factor * page_height
329337
elif scroll_direction == "down":
330-
delta_y = scroll_amount * 100
338+
delta_y = scroll_factor * page_height
331339
elif scroll_direction == "left":
332-
delta_x = -scroll_amount * 100
340+
delta_x = -scroll_factor * page_width
333341
elif scroll_direction == "right":
334-
delta_x = scroll_amount * 100
342+
delta_x = scroll_factor * page_width
343+
344+
print(f"Scrolling {abs(delta_x) if delta_x != 0 else abs(delta_y):.02f} pixels {scroll_direction}")
335345

336346
await self.page.mouse.wheel(delta_x=delta_x, delta_y=delta_y)
337347
return await self.screenshot()
@@ -391,4 +401,4 @@ async def __call__(
391401

392402
return await super().__call__(
393403
action=action, text=text, coordinate=coordinate, key=key, **kwargs
394-
)
404+
)

templates/python/sample-app/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ version = "0.1.0"
44
description = "Kernel application template - Python"
55
readme = "README.md"
66
requires-python = ">=3.11"
7-
dependencies = ["kernel>=0.5.0", "playwright>=1.52.0"]
7+
dependencies = ["kernel>=0.6.0", "playwright>=1.52.0"]
88

99
[dependency-groups]
1010
dev = ["mypy>=1.15.0"]

templates/typescript/advanced-sample/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"typescript": "^5"
88
},
99
"dependencies": {
10-
"@onkernel/sdk": ">=0.5.0",
10+
"@onkernel/sdk": ">=0.6.0",
1111
"playwright": "^1.52.0"
1212
}
1313
}

0 commit comments

Comments
 (0)