Skip to content

Commit ee01ebb

Browse files
TLSDC, Copilot, recursix
authored
Some changes in preparation for agentlab's new ToolUseAgent and the new APIs (ServiceNow#340)
* added openai/anthropic compatible tool json descriptions
* added raw page as possible output to reduce overhead
* moving the benchmark section out of bgym (to agentlab)
* Update browsergym/core/src/browsergym/core/action/highlevel.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* adding benchmark stuff back in
* backtracking
* backtracking everything
* brought back the 'eval' bc we need it actually
* fixing broken imports
* version bump
* switching eval func to globals()
* Update loop.py
* Update browsergym/experiments/src/browsergym/experiments/loop.py
* Add scroll_at function and update references in highlevel.py; refactor browser launch args in env.py; clean up utils.py
* Replace 'scroll' with 'scroll_at' in ACTION_SUBSETS for clarity and consistency
* Refactor ACTION_SUBSETS to replace 'scroll_at' with 'scroll' for consistency; update browser launch args comments in env.py
* Update test_scroll to include 'bid' in action_set subsets for improved functionality; add main execution block for direct testing
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: recursix <alex.lacoste.shmu@gmail.com>
1 parent bbf3ad6 commit ee01ebb

File tree

16 files changed

+201
-71
lines changed

16 files changed

+201
-71
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
browsergym-core==0.13.4
1+
browsergym-core==0.14.0
22
datasets
33
scipy
44
numpy

browsergym/core/src/browsergym/core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.13.4"
1+
__version__ = "0.14.0"
22

33
import playwright.sync_api
44

browsergym/core/src/browsergym/core/action/base.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from abc import ABC, abstractmethod
2+
from typing import Any
23

34
import playwright.sync_api
45

@@ -33,6 +34,15 @@ def to_python_code(self, action) -> str:
3334
Executable python code that performs the action in a browsergym environment.
3435
"""
3536

def to_tool_descriptor(self) -> list[Any]:
    """
    Converts the action set to a tool descriptor.

    If the action set provides a `to_tool_description()` method, its result is returned,
    so both spellings of the method name give the same answer.

    Returns:
        A list of dictionaries describing the actions in the action set, or None when the
        action set does not provide any tool description.
    """
    describe_tools = getattr(self, "to_tool_description", None)
    if callable(describe_tools):
        return describe_tools()
    # no tool support on this action set: keep the previous behaviour of the stub (None)
    return None
45+
3646

3747
def execute_python_code(
3848
code: str,

browsergym/core/src/browsergym/core/action/functions.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,17 @@ def scroll(delta_x: float, delta_y: float):
280280
page.mouse.wheel(delta_x, delta_y)
281281

282282

# x, y: viewport coordinates the pointer is moved to before scrolling.
# dx, dy: horizontal / vertical scroll amounts handed to the wheel event.
def scroll_at(x: int, y: int, dx: int, dy: int):
    """
    Scroll horizontally and vertically at the position (x, y). The mouse pointer is first moved to (x, y), then a wheel event is dispatched there. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling.

    Examples:
        scroll_at(50, 100, -50, -100)
    """
    page.mouse.move(x, y)  # position pointer, so the wheel event targets what lies under (x, y)
    page.mouse.wheel(dx, dy)
292+
293+
283294
# https://playwright.dev/python/docs/api/class-mouse#mouse-move
284295
def mouse_move(x: float, y: float):
285296
"""

browsergym/core/src/browsergym/core/action/highlevel.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
press,
3434
report_infeasible,
3535
scroll,
36+
scroll_at,
3637
select_option,
3738
send_msg_to_user,
3839
tab_close,
@@ -61,7 +62,7 @@
6162
upload_file,
6263
],
6364
"coord": [
64-
scroll,
65+
scroll_at,
6566
mouse_move,
6667
mouse_up,
6768
mouse_down,
@@ -514,6 +515,56 @@ def to_python_code(self, action):
514515
# return the constructed python code
515516
return python_code
516517

def to_tool_description(self, api="openai") -> list[dict]:
    """
    Translates the actions of this action set to LLM tool descriptions.

    Each action becomes one tool. Its parameter schema is derived from the signature of the
    function with the same name found in this module's globals.

    Args:
        api: target tool format. "openai" stores the schema under "parameters" and adds
            "type": "function"; "anthropic" stores it under "input_schema". Any other value
            falls back to the "parameters" key and adds no "type" entry.

    Returns:
        A list of tool descriptors, one per action in the action set.

    Raises:
        KeyError: if an action has no function of the same name in this module's globals.
    """
    import typing  # local import, so the module-level import block does not need to change

    schema_keys = {
        "openai": "parameters",
        "anthropic": "input_schema",
    }
    schema_key = schema_keys.get(api, "parameters")

    # identity-based mapping from python annotations to JSON-schema type names
    json_types = (
        (str, "string"),
        (float, "number"),
        (int, "number"),
        (bool, "boolean"),
        (dict, "object"),
        (list, "array"),
    )

    def json_type_of(annotation) -> str:
        """Maps a parameter annotation to a JSON-schema type name ("string" if unknown)."""
        # unwrap parameterized generics: list[str] -> list, dict[str, int] -> dict
        base = typing.get_origin(annotation) or annotation
        if base is typing.Literal:
            # a Literal is typed after its first allowed value
            choices = typing.get_args(annotation)
            base = type(choices[0]) if choices else str
        for python_type, type_name in json_types:
            if base is python_type:
                return type_name
        return "string"  # missing, string-valued or unsupported annotations

    tools = []
    for tool_name, action in self.action_set.items():
        # parse the signature to extract parameter names and types
        signature = inspect.signature(globals()[tool_name])
        properties = {}
        required = []
        for param_name, param in signature.parameters.items():
            properties[param_name] = {"type": json_type_of(param.annotation)}
            # parameters without a default value are mandatory
            if param.default is inspect.Parameter.empty:
                required.append(param_name)

        # construct the tool descriptor
        tool = {
            "name": tool_name,
            "description": action.description,
            schema_key: {"type": "object", "properties": properties, "required": required},
        }
        if api == "openai":
            tool["type"] = "function"
        tools.append(tool)

    return tools
567+
517568

518569
# consistency checks
519570
assert "custom" not in ACTION_SUBSETS

browsergym/core/src/browsergym/core/env.py

Lines changed: 95 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
extract_merged_axtree,
2626
extract_screenshot,
2727
)
28-
from .spaces import AnyBox, AnyDict, Float, Unicode
28+
from .spaces import AnyBox, AnyDict, Anything, Float, Unicode
2929
from .task import AbstractBrowserTask
3030

3131
logger = logging.getLogger(__name__)
@@ -76,6 +76,7 @@ def __init__(
7676
pw_context_kwargs: dict = {},
7777
# agent-related arguments
7878
action_mapping: Optional[callable] = HighLevelActionSet().to_python_code,
79+
use_raw_page_output: bool = False,
7980
):
8081
"""
8182
Instantiate a ready to use BrowserEnv gym environment.
@@ -96,6 +97,7 @@ def __init__(
9697
pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing.
9798
pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing.
9899
action_mapping: if set, the environment will use this function to map every received action to executable Python code.
100+
use_raw_page_output: if set, observations expose the raw playwright page under the "page" key and the default extraction (screenshot, DOM, AXTree, extra element properties, focused element) is skipped.
99101
100102
"""
101103
super().__init__()
@@ -115,6 +117,7 @@ def __init__(
115117
self.pw_chromium_kwargs = pw_chromium_kwargs
116118
self.pw_context_kwargs = pw_context_kwargs
117119
self.action_mapping = action_mapping
120+
self.use_raw_page_output = use_raw_page_output
118121

119122
# check argument values
120123
assert tags_to_mark in ("all", "standard_html")
@@ -132,42 +135,67 @@ def __init__(
132135
self.chat: Chat = None
133136

134137
# observation space
135-
self.observation_space = gym.spaces.Dict(
136-
{
137-
"chat_messages": gym.spaces.Sequence(
138-
gym.spaces.Dict(
139-
{
140-
"role": Unicode(),
141-
"timestamp": Float(),
142-
"message": Unicode(),
143-
}
144-
)
145-
),
146-
"goal": Unicode(),
147-
"goal_object": gym.spaces.Sequence(AnyDict()),
148-
"open_pages_urls": gym.spaces.Sequence(Unicode()),
149-
"open_pages_titles": gym.spaces.Sequence(Unicode()),
150-
"active_page_index": gym.spaces.Box(
151-
low=0, high=255, dtype=int
152-
), # TODO: change to an Integer (breaking change for users)
153-
"url": Unicode(),
154-
"screenshot": AnyBox(
155-
low=0,
156-
high=255,
157-
shape=(-1, -1, 3),
158-
dtype=np.uint8,
159-
), # swapped axes (height, width, RGB)
160-
"dom_object": AnyDict(),
161-
"axtree_object": AnyDict(),
162-
"extra_element_properties": AnyDict(),
163-
"focused_element_bid": Unicode(),
164-
"last_action": Unicode(),
165-
"last_action_error": Unicode(),
166-
"elapsed_time": gym.spaces.Box(
167-
low=0, high=np.inf, dtype=float
168-
), # TODO: change to a Float (breaking change for users)
169-
}
170-
)
138+
if use_raw_page_output:
139+
self.observation_space = gym.spaces.Dict(
140+
{
141+
"page": Anything(),
142+
"chat_messages": gym.spaces.Sequence(
143+
gym.spaces.Dict(
144+
{
145+
"role": Unicode(),
146+
"timestamp": Float(),
147+
"message": Unicode(),
148+
}
149+
)
150+
),
151+
"goal": Unicode(),
152+
"goal_object": gym.spaces.Sequence(AnyDict()),
153+
"open_pages_urls": gym.spaces.Sequence(Unicode()),
154+
"open_pages_titles": gym.spaces.Sequence(Unicode()),
155+
"active_page_index": gym.spaces.Box(low=0, high=255, dtype=int),
156+
"url": Unicode(),
157+
"last_action": Unicode(),
158+
"last_action_error": Unicode(),
159+
"elapsed_time": gym.spaces.Box(low=0, high=np.inf, dtype=float),
160+
}
161+
)
162+
else:
163+
self.observation_space = gym.spaces.Dict(
164+
{
165+
"chat_messages": gym.spaces.Sequence(
166+
gym.spaces.Dict(
167+
{
168+
"role": Unicode(),
169+
"timestamp": Float(),
170+
"message": Unicode(),
171+
}
172+
)
173+
),
174+
"goal": Unicode(),
175+
"goal_object": gym.spaces.Sequence(AnyDict()),
176+
"open_pages_urls": gym.spaces.Sequence(Unicode()),
177+
"open_pages_titles": gym.spaces.Sequence(Unicode()),
178+
"active_page_index": gym.spaces.Box(
179+
low=0, high=255, dtype=int
180+
), # TODO: change to an Integer (breaking change for users)
181+
"url": Unicode(),
182+
"screenshot": AnyBox(
183+
low=0,
184+
high=255,
185+
shape=(-1, -1, 3),
186+
dtype=np.uint8,
187+
), # swapped axes (height, width, RGB)
188+
"dom_object": AnyDict(),
189+
"axtree_object": AnyDict(),
190+
"extra_element_properties": AnyDict(),
191+
"focused_element_bid": Unicode(),
192+
"last_action": Unicode(),
193+
"last_action_error": Unicode(),
194+
"elapsed_time": gym.spaces.Box(
195+
low=0, high=np.inf, dtype=float
196+
), # TODO: change to a Float (breaking change for users)
197+
}
198+
)
171199

172200
# action space
173201
self.action_space = Unicode()
@@ -227,16 +255,24 @@ def override_property(task, env, property):
227255
pw: playwright.sync_api.Playwright = _get_global_playwright()
228256
# important: change playwright's test id attribute from "data-testid" to "bid"
229257
pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE)
258+
args = [
259+
(
260+
f"--window-size={viewport['width']},{viewport['height']}"
261+
if self.resizeable_window
262+
else None
263+
),
264+
"--disable-features=OverlayScrollbars,ExtendedOverlayScrollbars", # otherwise the screenshot doesn't see the scrollbars
265+
]
266+
args = [arg for arg in args if arg is not None] # Remove None values
230267

231268
# create a new browser
232269
self.browser = pw.chromium.launch(
233270
headless=self.headless,
234271
slow_mo=slow_mo,
235-
args=(
236-
[f"--window-size={viewport['width']},{viewport['height']}"]
237-
if self.resizeable_window
238-
else None
239-
),
272+
args=args,
273+
ignore_default_args=[
274+
"--hide-scrollbars"
275+
], # otherwise the screenshot doesn't see the scrollbars
240276
# will raise an Exception if above args are overridden
241277
**self.pw_chromium_kwargs,
242278
)
@@ -566,7 +602,23 @@ def _active_page_check(self):
566602
raise RuntimeError(f"Unexpected: active page has been closed ({self.page}).")
567603

568604
def _get_obs(self):
569-
605+
if self.use_raw_page_output:
606+
obs = {
607+
"page": self.page,
608+
"chat_messages": tuple(copy.deepcopy(self.chat.messages)),
609+
"goal": _try_to_extract_legacy_goal(self.goal_object), # legacy goal, deprecated
610+
"goal_object": tuple(
611+
copy.deepcopy(self.goal_object)
612+
), # new goal format, list of messages openai style
613+
"open_pages_urls": tuple(page.url for page in self.context.pages),
614+
"open_pages_titles": tuple(page.title() for page in self.context.pages),
615+
"active_page_index": np.asarray([self.context.pages.index(self.page)]),
616+
"url": self.page.url, # redundant with "open_pages_urls" and "active_page_index"
617+
"last_action": self.last_action,
618+
"last_action_error": self.last_action_error,
619+
"elapsed_time": np.asarray([time.time() - self.start_time]),
620+
}
621+
return obs
570622
for retries_left in reversed(range(EXTRACT_OBS_MAX_TRIES)):
571623
try:
572624
# pre-extraction, mark dom elements (set bid, set dynamic attributes like value and checked)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
browsergym-core==0.13.4
1+
browsergym-core==0.14.0
22
tiktoken>=0.4
33
dataclasses-json

browsergym/experiments/src/bgym/__init__.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,7 @@
22
from browsergym.core.action.highlevel import HighLevelActionSet
33
from browsergym.core.action.python import PythonActionSet
44
from browsergym.experiments.agent import Agent, AgentInfo
5-
from browsergym.experiments.benchmark import (
6-
DEFAULT_BENCHMARKS,
7-
Benchmark,
8-
HighLevelActionSetArgs,
9-
)
5+
from browsergym.experiments.benchmark import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs
106
from browsergym.experiments.loop import (
117
AbstractAgentArgs,
128
EnvArgs,

browsergym/experiments/src/browsergym/experiments/benchmark/configs.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
import numpy as np
22

3-
from browsergym.experiments.benchmark.metadata.utils import (
4-
task_list_from_metadata,
5-
task_metadata,
6-
)
3+
from browsergym.experiments.benchmark.metadata.utils import task_list_from_metadata, task_metadata
74
from browsergym.experiments.benchmark.utils import (
85
make_env_args_list_from_fixed_seeds,
96
make_env_args_list_from_repeat_tasks,

browsergym/experiments/src/browsergym/experiments/benchmark/utils.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from typing import Literal
77

88
import numpy as np
9-
109
from browsergym.experiments.loop import SEED_MAX, EnvArgs
1110

1211
logger = logging.getLogger(__name__)
@@ -46,7 +45,11 @@ def make_env_args_list_from_workarena_curriculum(
4645

4746

4847
def make_env_args_list_from_repeat_tasks(
49-
task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
48+
task_list: list[str],
49+
max_steps: int,
50+
n_repeats: int,
51+
seeds_rng: np.random.RandomState,
52+
viewport=None,
5053
):
5154
"""
5255
Generates a list of `len(task_list)` times `n_repeats` environment arguments, using randomly generated seeds.
@@ -62,7 +65,7 @@ def make_env_args_list_from_repeat_tasks(
6265
headless=True,
6366
record_video=False,
6467
wait_for_user_message=False,
65-
viewport=None,
68+
viewport=viewport,
6669
slow_mo=None,
6770
storage_state=None,
6871
task_kwargs=None,

0 commit comments

Comments
 (0)