Skip to content

Commit 7ff9af9

Browse files
author
raidendotai
committed
python/browser-use : fixed viewport & window resize
1 parent 27dbcca commit 7ff9af9

File tree

2 files changed

+126
-3
lines changed

2 files changed

+126
-3
lines changed

templates/python/browser-use/main.py

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,130 @@ class TaskInput(TypedDict):
1313

1414
# LLM API Keys are set in the environment during `kernel deploy <filename> -e OPENAI_API_KEY=XXX`
1515
# See https://docs.onkernel.com/launch/deploy#environment-variables
16-
llm = ChatOpenAI(model="gpt-4o")
16+
llm = ChatOpenAI(model="gpt-4o-mini")
17+
18+
19+
# Define a subclass of BrowserSession that overrides _setup_viewports (which mishandles resizing when connecting via CDP)
20+
class BrowserSessionCustomResize(BrowserSession):
    """BrowserSession variant that pins window, viewport and screen to a fixed size.

    Only ``_setup_viewports`` is overridden: it forces the profile geometry to
    1024x786 at a device scale factor of 1.0 before applying the context
    settings, so every existing page is resized to that viewport.
    """

    async def _setup_viewports(self) -> None:
        """Resize existing page viewports and apply profile settings to the context.

        In order, this method:

        1. overwrites the profile's window size, viewport, screen and device
           scale factor with fixed values;
        2. prints a one-line summary of the resulting settings;
        3. applies permissions, timeouts, extra HTTP headers and geolocation to
           the browser context, logging a warning (never raising) on failure;
        4. loads the storage state;
        5. sets the viewport size on every existing page, opening a new page if
           the context has none;
        6. when no viewport is configured and the browser is not headless,
           tries to resize the real window via CDP, falling back to
           ``window.resizeTo`` in the page.

        Raises:
            AssertionError: if ``self.browser_context`` is not set yet.
        """
        assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()'

        profile = self.browser_profile
        context = self.browser_context

        # Force a fixed geometry on the profile before anything below reads it.
        # NOTE(review): 786 may be a typo for 768 (1024x768) - confirm before changing.
        profile.window_size = {"width": 1024, "height": 786}
        profile.viewport = {"width": 1024, "height": 786}
        profile.screen = {"width": 1024, "height": 786}
        profile.device_scale_factor = 1.0

        # log the viewport settings to terminal
        viewport = profile.viewport
        print(
            '📐 Setting up viewport: '
            + f'headless={profile.headless} '
            + (
                f'window={profile.window_size["width"]}x{profile.window_size["height"]}px '
                if profile.window_size
                else '(no window) '
            )
            + (
                f'screen={profile.screen["width"]}x{profile.screen["height"]}px '
                if profile.screen
                else ''
            )
            + (f'viewport={viewport["width"]}x{viewport["height"]}px ' if viewport else '(no viewport) ')
            + f'device_scale_factor={profile.device_scale_factor or 1.0} '
            + f'is_mobile={profile.is_mobile} '
            + (f'color_scheme={profile.color_scheme.value} ' if profile.color_scheme else '')
            + (f'locale={profile.locale} ' if profile.locale else '')
            + (f'timezone_id={profile.timezone_id} ' if profile.timezone_id else '')
            + (f'geolocation={profile.geolocation} ' if profile.geolocation else '')
            + (f'permissions={",".join(profile.permissions or ["<none>"])} ')
        )

        # Apply profile settings to the whole browser context as defaults.
        # Each step is best-effort: a failure is logged and setup continues.
        if profile.permissions:
            try:
                await context.grant_permissions(profile.permissions)
            except Exception as e:
                self.logger.warning(
                    f'⚠️ Failed to grant browser permissions {profile.permissions}: {type(e).__name__}: {e}'
                )
        try:
            if profile.default_timeout:
                context.set_default_timeout(profile.default_timeout)
            if profile.default_navigation_timeout:
                context.set_default_navigation_timeout(profile.default_navigation_timeout)
        except Exception as e:
            self.logger.warning(
                f'⚠️ Failed to set playwright timeout settings '
                f'cdp_api={profile.default_timeout} '
                f'navigation={profile.default_navigation_timeout}: {type(e).__name__}: {e}'
            )
        try:
            if profile.extra_http_headers:
                context.set_extra_http_headers(profile.extra_http_headers)
        except Exception as e:
            self.logger.warning(
                f'⚠️ Failed to setup playwright extra_http_headers: {type(e).__name__}: {e}'
            )  # don't print the secret header contents in the logs!

        try:
            if profile.geolocation:
                await context.set_geolocation(profile.geolocation)
        except Exception as e:
            self.logger.warning(
                f'⚠️ Failed to update browser geolocation {profile.geolocation}: {type(e).__name__}: {e}'
            )

        await self.load_storage_state()

        # After the loop, `page` is the last existing page, or None if there were none.
        page = None

        for page in context.pages:
            # apply viewport size settings to any existing pages
            if viewport:
                await page.set_viewport_size(viewport)

            # show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages
            if page.url == 'about:blank':
                await self._show_dvd_screensaver_loading_animation(page)

        page = page or (await context.new_page())

        # NOTE(review): the viewport is always assigned above, so this branch
        # appears to be unreachable in this subclass - confirm whether the real
        # window should also be resized when a viewport is set.
        if (not viewport) and (profile.window_size is not None) and not profile.headless:
            # attempt to resize the actual browser window
            window_size = profile.window_size

            # cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds
            try:
                cdp_session = await page.context.new_cdp_session(page)
                try:
                    window_id_result = await cdp_session.send('Browser.getWindowForTarget')
                    await cdp_session.send(
                        'Browser.setWindowBounds',
                        {
                            'windowId': window_id_result['windowId'],
                            'bounds': {
                                **window_size,
                                'windowState': 'normal',  # Ensure window is not minimized/maximized
                            },
                        },
                    )
                finally:
                    # Always release the session, even when a CDP command fails.
                    await cdp_session.detach()
            except Exception as cdp_error:
                # The CDP error gets its own name: reusing one name for both
                # handlers would leave it unbound after the inner handler exits.
                try:
                    # fallback to javascript resize if cdp setWindowBounds fails;
                    # evaluate() accepts a single argument, so pass both values as one list
                    await page.evaluate(
                        """([width, height]) => {window.resizeTo(width, height)}""",
                        [window_size['width'], window_size['height']],
                    )
                except Exception:
                    self.logger.warning(
                        f'⚠️ Failed to resize browser window to {window_size["width"]}x{window_size["height"]}px using CDP setWindowBounds: {type(cdp_error).__name__}: {cdp_error}'
                    )
139+
17140

18141
@app.action("bu-task")
19142
async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
@@ -37,7 +160,7 @@ async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput):
37160
#task="Compare the price of gpt-4o and DeepSeek-V3",
38161
task=input_data["task"],
39162
llm=llm,
40-
browser_session=BrowserSession(cdp_url=kernel_browser.cdp_ws_url)
163+
browser_session=BrowserSessionCustomResize(cdp_url=kernel_browser.cdp_ws_url)
41164
)
42165
result = await agent.run()
43166
if result.final_result() is not None:

templates/typescript/cua-sample/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
This is a Kernel application that demonstrates using the Computer Using Agent (CUA) from OpenAI.
44

55
It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation.
6-
Also makes use of the latest OpenAI SDK format.
6+
Also makes use of the latest OpenAI SDK format, and includes local equivalents of the Kernel methods so you can test locally before deploying on Kernel.
77

88
See the [docs](https://docs.onkernel.com/quickstart) for information.

0 commit comments

Comments
 (0)