Skip to content

Commit 9e4e07c

Browse files
authored
Miniwob zoom (#351)
* Set device scale factor to 1.0 in extract_screenshot for consistent rendering. This prevents screenshot sizes from being affected by environments such as "retina" displays on MacBooks * Add examples to the description * Zoom miniwob to 150% by default (doesn't change the viewport, just makes better use of the viewport) * Adjust version * Update extract_screenshot to use actual window dimensions for device metrics * Enhance zoom functionality in miniwob for better viewport usage and precision in visual agents * Enhance zoom_page docstring with detailed explanation of CSS transform scaling benefits * Adjust zoom factor in miniwob to improve viewport usage and precision for visual agents * Add type * Remap the coordinate actions based on the zoom factor * Apply the zooming at the screenshot level * Change the zoom method
1 parent 6bc4c51 commit 9e4e07c

File tree

14 files changed

+119
-29
lines changed

14 files changed

+119
-29
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
browsergym-core==0.14.0
1+
browsergym-core==0.14.1
22
datasets
33
scipy
44
numpy

browsergym/core/src/browsergym/core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.14.0"
1+
__version__ = "0.14.1"
22

33
import playwright.sync_api
44

browsergym/core/src/browsergym/core/action/functions.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
call_fun,
1010
get_elem_by_bid,
1111
highlight_by_box,
12+
map_coordinates,
1213
smooth_move_visual_cursor_to,
1314
)
1415

@@ -277,6 +278,9 @@ def scroll(delta_x: float, delta_y: float):
277278
scroll(0, 200)
278279
scroll(-50.2, -100.5)
279280
"""
281+
delta_x, delta_y = map_coordinates(
282+
page, delta_x, delta_y
283+
) # map coordinates to page coordinates
280284
page.mouse.wheel(delta_x, delta_y)
281285

282286

@@ -287,6 +291,8 @@ def scroll_at(x: int, y: int, dx: int, dy: int):
287291
Examples:
288292
scroll_at(50, 100, -50, -100)
289293
"""
294+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
295+
dx, dy = map_coordinates(page, dx, dy) # map coordinates to page coordinates
290296
page.mouse.move(x, y) # position pointer
291297
page.mouse.wheel(dx, dy)
292298

@@ -300,6 +306,7 @@ def mouse_move(x: float, y: float):
300306
Examples:
301307
mouse_move(65.2, 158.5)
302308
"""
309+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
303310
if demo_mode != "off":
304311
smooth_move_visual_cursor_to(page, x, y)
305312
page.mouse.move(x, y)
@@ -315,6 +322,7 @@ def mouse_up(x: float, y: float, button: Literal["left", "middle", "right"] = "l
315322
mouse_up(250, 120)
316323
mouse_up(47, 252, 'right')
317324
"""
325+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
318326
if demo_mode != "off":
319327
smooth_move_visual_cursor_to(page, x, y)
320328
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
@@ -332,6 +340,7 @@ def mouse_down(x: float, y: float, button: Literal["left", "middle", "right"] =
332340
mouse_down(140.2, 580.1)
333341
mouse_down(458, 254.5, 'middle')
334342
"""
343+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
335344
if demo_mode != "off":
336345
smooth_move_visual_cursor_to(page, x, y)
337346
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
@@ -349,6 +358,7 @@ def mouse_click(x: float, y: float, button: Literal["left", "middle", "right"] =
349358
mouse_click(887.2, 68)
350359
mouse_click(56, 712.56, 'right')
351360
"""
361+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
352362
if demo_mode != "off":
353363
smooth_move_visual_cursor_to(page, x, y)
354364
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
@@ -365,6 +375,7 @@ def mouse_dblclick(x: float, y: float, button: Literal["left", "middle", "right"
365375
mouse_dblclick(5, 236)
366376
mouse_dblclick(87.5, 354, 'right')
367377
"""
378+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
368379
if demo_mode != "off":
369380
smooth_move_visual_cursor_to(page, x, y)
370381
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
@@ -380,6 +391,9 @@ def mouse_drag_and_drop(from_x: float, from_y: float, to_x: float, to_y: float):
380391
Examples:
381392
mouse_drag_and_drop(10.7, 325, 235.6, 24.54)
382393
"""
394+
from_x, from_y = map_coordinates(page, from_x, from_y) # map coordinates to page coordinates
395+
to_x, to_y = map_coordinates(page, to_x, to_y) # map coordinates to page coordinates
396+
383397
if demo_mode != "off":
384398
x, y = from_x, from_y
385399
smooth_move_visual_cursor_to(page, x, y)
@@ -624,6 +638,7 @@ def mouse_upload_file(x: float, y: float, file: str | list[str]):
624638
mouse_upload_file(132.1, 547, "my_receipt.pdf")
625639
mouse_upload_file(328, 812, ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
626640
"""
641+
x, y = map_coordinates(page, x, y) # map coordinates to page coordinates
627642
if demo_mode != "off":
628643
smooth_move_visual_cursor_to(page, x, y)
629644
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})

browsergym/core/src/browsergym/core/action/highlevel.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ def to_python_code(self, action):
515515
# return the constructed python code
516516
return python_code
517517

518-
def to_tool_description(self, api="openai") -> list[dict]:
518+
def to_tool_description(self, api="openai", add_examples=True) -> list[dict]:
519519
"""
520520
Translates actions to tool descriptions following the OpenAI API format.
521521
@@ -535,16 +535,16 @@ def to_tool_description(self, api="openai") -> list[dict]:
535535
for param_name, param in signature.parameters.items():
536536
param_type = "string" # Default to string if type is not specified
537537
if param.annotation != inspect.Parameter.empty:
538-
if param.annotation is str:
539-
param_type = "string"
540-
elif param.annotation is float or param.annotation is int:
541-
param_type = "number"
542-
elif param.annotation is bool:
543-
param_type = "boolean"
544-
elif param.annotation is dict:
545-
param_type = "object"
546-
elif param.annotation is list:
547-
param_type = "array"
538+
539+
type_map = {
540+
str: "string",
541+
float: "number",
542+
int: "number",
543+
bool: "boolean",
544+
dict: "object",
545+
list: "array",
546+
}
547+
param_type = type_map.get(param.annotation, "string")
548548

549549
parameters["properties"][param_name] = {
550550
"type": param_type,
@@ -554,9 +554,15 @@ def to_tool_description(self, api="openai") -> list[dict]:
554554
parameters["required"].append(param_name)
555555

556556
# Construct the tool descriptor
557+
description = action.description
558+
if add_examples and action.examples:
559+
description += "\n\nExamples:\n"
560+
for example in action.examples:
561+
description += f"- {example}\n"
562+
557563
tool = {
558564
"name": tool_name,
559-
"description": action.description,
565+
"description": description,
560566
schema: parameters,
561567
}
562568
if api == "openai":

browsergym/core/src/browsergym/core/action/utils.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,32 @@ def call_fun(fun: callable, retry_with_force: bool):
286286
fun(force=True)
287287
else:
288288
raise e
289+
290+
291+
def map_coordinates(page, x: float, y: float) -> tuple[float, float]:
    """
    Maps screenshot coordinates back to page coordinates based on bgym_scale_factor.

    The scale factor is read from the ``_bgym_scale_factor`` attribute of ``page``;
    when the attribute is absent, 1.0 is used and the coordinates pass through
    unchanged (as floats). Both coordinates are divided by the scale factor, so
    with a factor > 1.0 the values coming from a higher-resolution screenshot are
    scaled down to the page coordinate system.

    Args:
        page: Playwright page object (may carry a ``_bgym_scale_factor`` attribute).
        x: X coordinate (or horizontal delta) expressed in screenshot pixels.
        y: Y coordinate (or vertical delta) expressed in screenshot pixels.

    Returns:
        tuple: (mapped_x, mapped_y) coordinates for use with Playwright.

    Examples:
        # If bgym_scale_factor = 1.5 and the VLM returns coordinates (300, 450)
        # from the scaled screenshot, map them back to page coordinates:
        page_x, page_y = map_coordinates(page, 300, 450)  # Returns (200.0, 300.0)
    """
    # Missing attribute means no zoom was configured for this page.
    scale_factor = getattr(page, "_bgym_scale_factor", 1.0)

    # True division: the result is always a float pair, even for int inputs.
    return x / scale_factor, y / scale_factor

browsergym/core/src/browsergym/core/observation.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def _post_extract(page: playwright.sync_api.Page):
115115
def extract_screenshot(page: playwright.sync_api.Page):
116116
"""
117117
Extracts the screenshot image of a Playwright page using Chrome DevTools Protocol.
118+
Uses the bgym_scale_factor to capture higher resolution screenshots for better VLM understanding.
118119
119120
Args:
120121
page: the playwright page of which to extract the screenshot.
@@ -124,13 +125,45 @@ def extract_screenshot(page: playwright.sync_api.Page):
124125
125126
"""
126127

128+
scale_factor = getattr(page, "_bgym_scale_factor", 1.0)
127129
cdp = page.context.new_cdp_session(page)
130+
viewport = page.viewport_size
131+
132+
if viewport is None:
133+
dimensions = page.evaluate(
134+
"""() => ({
135+
width: window.innerWidth,
136+
height: window.innerHeight,
137+
devicePixelRatio: window.devicePixelRatio
138+
})"""
139+
)
140+
else:
141+
dimensions = {
142+
"width": viewport["width"],
143+
"height": viewport["height"],
144+
"devicePixelRatio": 1.0, # Override system DPR
145+
}
146+
147+
# Apply scale factor to device metrics for higher resolution capture
148+
cdp.send(
149+
"Emulation.setDeviceMetricsOverride",
150+
{
151+
"width": dimensions["width"],
152+
"height": dimensions["height"],
153+
"deviceScaleFactor": scale_factor, # Use bgym_scale_factor here
154+
"mobile": False,
155+
},
156+
)
157+
128158
cdp_answer = cdp.send(
129159
"Page.captureScreenshot",
130160
{
131161
"format": "png",
132162
},
133163
)
164+
165+
# Reset device metrics
166+
cdp.send("Emulation.clearDeviceMetricsOverride")
134167
cdp.detach()
135168

136169
# bytes of a png file
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
browsergym-core==0.14.0
1+
browsergym-core==0.14.1
22
tiktoken>=0.4
33
dataclasses-json

browsergym/experiments/src/browsergym/experiments/loop.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pathlib import Path
1818
from typing import Optional
1919

20+
from browsergym.core.env import BrowserEnv
2021
import gymnasium as gym
2122
import numpy as np
2223
from dataclasses_json import DataClassJsonMixin
@@ -49,7 +50,7 @@ class EnvArgs(DataClassJsonMixin):
4950

5051
def make_env(
5152
self, action_mapping, exp_dir, exp_task_kwargs: dict = {}, use_raw_page_output=False
52-
):
53+
) -> BrowserEnv:
5354
"""
5455
Instantiates the BrowserGym environment corresponding to the arguments (with some tweaks).
5556

browsergym/miniwob/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ authors = [
99
{name = "Rim Assouel"},
1010
{name = "Maxime Gasse"},
1111
{name = "Tom Marty"},
12+
{name = "Alexandre Lacoste"},
1213
]
1314
readme = "README.md"
1415
requires-python = ">3.7"
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
browsergym-core==0.14.0
1+
browsergym-core==0.14.1

0 commit comments

Comments
 (0)