Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
79d4141
added openai/anthropic compatible tool json descriptions
TLSDC Apr 22, 2025
80e8843
added raw page as possible output to reduce overhead
TLSDC Apr 23, 2025
31c3224
moving the benchmark section out of bgym (to agentlab)
TLSDC Apr 23, 2025
3d19bea
Update browsergym/core/src/browsergym/core/action/highlevel.py
TLSDC Apr 24, 2025
6de7a02
adding benchmark stuff back in
TLSDC May 2, 2025
5911c1e
backtracking
TLSDC May 2, 2025
be39f6f
Merge branch 'tlsdc/tool_use_agents' of github.com:ServiceNow/Browser…
TLSDC May 2, 2025
2703be9
Merge branch 'main' into tlsdc/tool_use_agents
TLSDC May 2, 2025
ecb4632
backtracking everything
TLSDC May 2, 2025
26ac1e9
Merge branch 'tlsdc/tool_use_agents' of github.com:ServiceNow/Browser…
TLSDC May 2, 2025
be0e45d
brought back the 'eval' bc we need it actually
TLSDC May 2, 2025
da73bf2
fixing broken imports
TLSDC May 2, 2025
709332a
version bump
TLSDC May 2, 2025
fca18ff
switching eval func to globals()
TLSDC May 2, 2025
3dca1b7
Update loop.py
TLSDC May 7, 2025
808ce72
Update browsergym/experiments/src/browsergym/experiments/loop.py
TLSDC May 7, 2025
02978dd
Add scroll_at function and update references in highlevel.py; refacto…
recursix May 9, 2025
67e7302
Replace 'scroll' with 'scroll_at' in ACTION_SUBSETS for clarity and c…
recursix May 9, 2025
b2583f9
Refactor ACTION_SUBSETS to replace 'scroll_at' with 'scroll' for cons…
recursix May 20, 2025
616f6b9
Update test_scroll to include 'bid' in action_set subsets for improve…
recursix May 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion browsergym/assistantbench/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
browsergym-core==0.13.4
browsergym-core==0.14.0
datasets
scipy
numpy
2 changes: 1 addition & 1 deletion browsergym/core/src/browsergym/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.13.4"
__version__ = "0.14.0"

import playwright.sync_api

Expand Down
10 changes: 10 additions & 0 deletions browsergym/core/src/browsergym/core/action/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any

import playwright.sync_api

Expand Down Expand Up @@ -33,6 +34,15 @@ def to_python_code(self, action) -> str:
Executable python code that performs the action in a browsergym environment.
"""

def to_tool_descriptor(self) -> list[Any]:
"""
Converts the action set to a tool descriptor.

Returns:
A list of dictionaries describing the actions in the action set.
"""
pass


def execute_python_code(
code: str,
Expand Down
11 changes: 11 additions & 0 deletions browsergym/core/src/browsergym/core/action/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,17 @@ def scroll(delta_x: float, delta_y: float):
page.mouse.wheel(delta_x, delta_y)


def scroll_at(x: int, y: int, dx: int, dy: int):
"""
Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.

Examples:
scroll_at(50, 100, -50, -100)
"""
page.mouse.move(x, y) # position pointer
page.mouse.wheel(dx, dy)


# https://playwright.dev/python/docs/api/class-mouse#mouse-move
def mouse_move(x: float, y: float):
"""
Expand Down
53 changes: 52 additions & 1 deletion browsergym/core/src/browsergym/core/action/highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
press,
report_infeasible,
scroll,
scroll_at,
select_option,
send_msg_to_user,
tab_close,
Expand Down Expand Up @@ -61,7 +62,7 @@
upload_file,
],
"coord": [
scroll,
scroll_at,
mouse_move,
mouse_up,
mouse_down,
Expand Down Expand Up @@ -514,6 +515,56 @@ def to_python_code(self, action):
# return the constructed python code
return python_code

def to_tool_description(self, api="openai") -> list[dict]:
Copy link
Collaborator

@recursix recursix Apr 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like a good candidate for an easy unit test. But perhaps we could isolate it into a separate function first. Perhaps tool = get_tool(tool_name, action).

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing API Validation in Tool Description Generation category Functionality

Tell me more
What is the issue?

The to_tool_description method defaults to 'openai' API but lacks validation for unsupported API values.

Why this matters

Invalid API values could lead to unexpected behavior as the method will use 'parameters' as the default schema key for any unrecognized API value.

Suggested change ∙ Feature Preview

Add validation for supported API values:

def to_tool_description(self, api="openai") -> list[dict]:
    supported_apis = {"openai", "anthropic"}
    if api not in supported_apis:
        raise ValueError(f"Unsupported API: {api}. Must be one of {supported_apis}")
    
    schema_keys = {
        "openai": "parameters",
        "anthropic": "input_schema",
    }
    schema = schema_keys[api]  # Will now always be valid
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

"""
Translates actions to tool descriptions following the OpenAI API format.

Returns:
A list of tool descriptors.
"""
schema_keys = {
"openai": "parameters",
"anthropic": "input_schema",
}
schema = schema_keys.get(api, "parameters")
tools = []
for tool_name, action in self.action_set.items():
# Parse the signature to extract parameter names and types
parameters = {"type": "object", "properties": {}, "required": []}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isolate from here into a function get_tool(name, action) -> tool and unit test it.

signature = inspect.signature(globals()[tool_name])
for param_name, param in signature.parameters.items():
Comment on lines +533 to +535
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uncached Function Introspection category Performance

Tell me more
What is the issue?

The to_tool_description method performs introspection inside a loop for each action, which is an expensive operation that could be cached.

Why this matters

Repeatedly calling inspect.signature() for the same functions across multiple calls creates unnecessary overhead, especially if this method is called frequently.

Suggested change ∙ Feature Preview

Cache the function signatures and parameter information during initialization or as a class variable to avoid repeated introspection:

def __init__(self, ...):
    # existing code
    self._cached_signatures = {}
    for tool_name in self.action_set:
        self._cached_signatures[tool_name] = {
            'signature': inspect.signature(globals()[tool_name]),
            'parameters': self._extract_parameters(globals()[tool_name])
        }
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

param_type = "string" # Default to string if type is not specified
if param.annotation != inspect.Parameter.empty:
if param.annotation is str:
param_type = "string"
elif param.annotation is float or param.annotation is int:
param_type = "number"
elif param.annotation is bool:
param_type = "boolean"
elif param.annotation is dict:
param_type = "object"
elif param.annotation is list:
param_type = "array"
Comment on lines +536 to +547
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replace type mapping chain with dictionary category Readability

Tell me more
What is the issue?

Long type mapping chain using if-elif statements could be replaced with a more readable mapping dictionary.

Why this matters

The current nested if-else structure makes the code harder to scan and maintain. Future type additions would require extending the if-else chain.

Suggested change ∙ Feature Preview
TYPE_MAPPING = {
    str: "string",
    float: "number",
    int: "number",
    bool: "boolean",
    dict: "object",
    list: "array"
}

param_type = "string"  # Default to string if type is not specified
if param.annotation != inspect.Parameter.empty:
    param_type = TYPE_MAPPING.get(param.annotation, "string")
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


parameters["properties"][param_name] = {
"type": param_type,
# "description": f"Parameter {param_name} of type {param_type}",
}
if param.default == inspect.Parameter.empty:
parameters["required"].append(param_name)

# Construct the tool descriptor
tool = {
"name": tool_name,
"description": action.description,
schema: parameters,
}
if api == "openai":
tool["type"] = "function"
tools.append(tool)

return tools


# consistency checks
assert "custom" not in ACTION_SUBSETS
Expand Down
138 changes: 95 additions & 43 deletions browsergym/core/src/browsergym/core/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
extract_merged_axtree,
extract_screenshot,
)
from .spaces import AnyBox, AnyDict, Float, Unicode
from .spaces import AnyBox, AnyDict, Anything, Float, Unicode
from .task import AbstractBrowserTask

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -76,6 +76,7 @@ def __init__(
pw_context_kwargs: dict = {},
# agent-related arguments
action_mapping: Optional[callable] = HighLevelActionSet().to_python_code,
use_raw_page_output: bool = False,
):
"""
Instantiate a ready to use BrowserEnv gym environment.
Expand All @@ -96,6 +97,7 @@ def __init__(
pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing.
pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing.
action_mapping: if set, the environment will use this function to map every received action to executable Python code.
use_raw_page_output: if set, the environment will use the raw page output instead of the default processing.

"""
super().__init__()
Expand All @@ -115,6 +117,7 @@ def __init__(
self.pw_chromium_kwargs = pw_chromium_kwargs
self.pw_context_kwargs = pw_context_kwargs
self.action_mapping = action_mapping
self.use_raw_page_output = use_raw_page_output

# check argument values
assert tags_to_mark in ("all", "standard_html")
Expand All @@ -132,42 +135,67 @@ def __init__(
self.chat: Chat = None

# observation space
self.observation_space = gym.spaces.Dict(
{
"chat_messages": gym.spaces.Sequence(
gym.spaces.Dict(
{
"role": Unicode(),
"timestamp": Float(),
"message": Unicode(),
}
)
),
"goal": Unicode(),
"goal_object": gym.spaces.Sequence(AnyDict()),
"open_pages_urls": gym.spaces.Sequence(Unicode()),
"open_pages_titles": gym.spaces.Sequence(Unicode()),
"active_page_index": gym.spaces.Box(
low=0, high=255, dtype=int
), # TODO: change to an Integer (breaking change for users)
"url": Unicode(),
"screenshot": AnyBox(
low=0,
high=255,
shape=(-1, -1, 3),
dtype=np.uint8,
), # swapped axes (height, width, RGB)
"dom_object": AnyDict(),
"axtree_object": AnyDict(),
"extra_element_properties": AnyDict(),
"focused_element_bid": Unicode(),
"last_action": Unicode(),
"last_action_error": Unicode(),
"elapsed_time": gym.spaces.Box(
low=0, high=np.inf, dtype=float
), # TODO: change to a Float (breaking change for users)
}
)
if use_raw_page_output:
self.observation_space = gym.spaces.Dict(
{
"page": Anything(),
"chat_messages": gym.spaces.Sequence(
gym.spaces.Dict(
{
"role": Unicode(),
"timestamp": Float(),
"message": Unicode(),
}
)
),
"goal": Unicode(),
"goal_object": gym.spaces.Sequence(AnyDict()),
"open_pages_urls": gym.spaces.Sequence(Unicode()),
"open_pages_titles": gym.spaces.Sequence(Unicode()),
"active_page_index": gym.spaces.Box(low=0, high=255, dtype=int),
"url": Unicode(),
"last_action": Unicode(),
"last_action_error": Unicode(),
"elapsed_time": gym.spaces.Box(low=0, high=np.inf, dtype=float),
}
)
else:
self.observation_space = gym.spaces.Dict(
{
"chat_messages": gym.spaces.Sequence(
gym.spaces.Dict(
{
"role": Unicode(),
"timestamp": Float(),
"message": Unicode(),
}
)
),
"goal": Unicode(),
"goal_object": gym.spaces.Sequence(AnyDict()),
"open_pages_urls": gym.spaces.Sequence(Unicode()),
"open_pages_titles": gym.spaces.Sequence(Unicode()),
"active_page_index": gym.spaces.Box(
low=0, high=255, dtype=int
), # TODO: change to an Integer (breaking change for users)
"url": Unicode(),
"screenshot": AnyBox(
low=0,
high=255,
shape=(-1, -1, 3),
dtype=np.uint8,
), # swapped axes (height, width, RGB)
"dom_object": AnyDict(),
"axtree_object": AnyDict(),
"extra_element_properties": AnyDict(),
"focused_element_bid": Unicode(),
"last_action": Unicode(),
"last_action_error": Unicode(),
"elapsed_time": gym.spaces.Box(
low=0, high=np.inf, dtype=float
), # TODO: change to a Float (breaking change for users)
}
)

# action space
self.action_space = Unicode()
Expand Down Expand Up @@ -227,16 +255,24 @@ def override_property(task, env, property):
pw: playwright.sync_api.Playwright = _get_global_playwright()
# important: change playwright's test id attribute from "data-testid" to "bid"
pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE)
args = [
(
f"--window-size={viewport['width']},{viewport['height']}"
if self.resizeable_window
else None
),
"--disable-features=OverlayScrollbars,ExtendedOverlayScrollbars", # otherwise the screenshot doesn't see the scrollbars
]
args = [arg for arg in args if arg is not None] # Remove None values

# create a new browser
self.browser = pw.chromium.launch(
headless=self.headless,
slow_mo=slow_mo,
args=(
[f"--window-size={viewport['width']},{viewport['height']}"]
if self.resizeable_window
else None
),
args=args,
ignore_default_args=[
"--hide-scrollbars"
], # otherwise the screenshot doesn't see the scrollbars
# will raise an Exception if above args are overriden
**self.pw_chromium_kwargs,
)
Expand Down Expand Up @@ -566,7 +602,23 @@ def _active_page_check(self):
raise RuntimeError(f"Unexpected: active page has been closed ({self.page}).")

def _get_obs(self):

if self.use_raw_page_output:
obs = {
"page": self.page,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unsafe Raw Page Object Exposure category Security

Tell me more
What is the issue?

Direct exposure of the raw Playwright page object in the observation space when use_raw_page_output=True.

Why this matters

Exposing the raw browser page object gives unrestricted access to potentially sensitive page data and browser control methods that could be exploited maliciously.

Suggested change ∙ Feature Preview

Instead of exposing the raw page object, create a restricted interface that only exposes necessary page properties and methods:

"page": {
    "url": self.page.url,
    "title": self.page.title(),
    # Add other needed properties with proper sanitization
}
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

"chat_messages": tuple(copy.deepcopy(self.chat.messages)),
"goal": _try_to_extract_legacy_goal(self.goal_object), # legacy goal, deprecated
"goal_object": tuple(
copy.deepcopy(self.goal_object)
), # new goal format, list of messages openai style
"open_pages_urls": tuple(page.url for page in self.context.pages),
"open_pages_titles": tuple(page.title() for page in self.context.pages),
"active_page_index": np.asarray([self.context.pages.index(self.page)]),
"url": self.page.url, # redundant with "open_pages_urls" and "active_page_index"
"last_action": self.last_action,
"last_action_error": self.last_action_error,
"elapsed_time": np.asarray([time.time() - self.start_time]),
}
return obs
for retries_left in reversed(range(EXTRACT_OBS_MAX_TRIES)):
try:
Comment on lines 604 to 623
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Observation extraction logic lacks clear structure category Readability

Tell me more
What is the issue?

The _get_obs method has a complex structure with two very different code paths and duplicated field extraction logic.

Why this matters

The method's structure makes it hard to understand what fields are common between the two observation types and obscures the relationship between the observation_space definition and actual observation creation.

Suggested change ∙ Feature Preview

Split the method into smaller, focused functions:

def _get_obs(self):
    base_obs = self._get_base_observation()
    if self.use_raw_page_output:
        return base_obs
    return {**base_obs, **self._get_extended_observation()}

def _get_base_observation(self):
    return {
        "chat_messages": tuple(copy.deepcopy(self.chat.messages)),
        # ... other common fields
    }

def _get_extended_observation(self):
    return self._extract_with_retries()
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

# pre-extraction, mark dom elements (set bid, set dynamic attributes like value and checked)
Expand Down
2 changes: 1 addition & 1 deletion browsergym/experiments/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
browsergym-core==0.13.4
browsergym-core==0.14.0
tiktoken>=0.4
dataclasses-json
6 changes: 1 addition & 5 deletions browsergym/experiments/src/bgym/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,7 @@
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
from browsergym.experiments.agent import Agent, AgentInfo
from browsergym.experiments.benchmark import (
DEFAULT_BENCHMARKS,
Benchmark,
HighLevelActionSetArgs,
)
from browsergym.experiments.benchmark import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs
from browsergym.experiments.loop import (
AbstractAgentArgs,
EnvArgs,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import numpy as np

from browsergym.experiments.benchmark.metadata.utils import (
task_list_from_metadata,
task_metadata,
)
from browsergym.experiments.benchmark.metadata.utils import task_list_from_metadata, task_metadata
from browsergym.experiments.benchmark.utils import (
make_env_args_list_from_fixed_seeds,
make_env_args_list_from_repeat_tasks,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import Literal

import numpy as np

from browsergym.experiments.loop import SEED_MAX, EnvArgs

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -46,7 +45,11 @@ def make_env_args_list_from_workarena_curriculum(


def make_env_args_list_from_repeat_tasks(
task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
task_list: list[str],
max_steps: int,
n_repeats: int,
seeds_rng: np.random.RandomState,
viewport=None,
):
"""
Generates a list of `len(task_list)` time `n_repeats` environments arguments, using randomly generated seeds.
Expand All @@ -62,7 +65,7 @@ def make_env_args_list_from_repeat_tasks(
headless=True,
record_video=False,
wait_for_user_message=False,
viewport=None,
viewport=viewport,
slow_mo=None,
storage_state=None,
task_kwargs=None,
Expand Down
Loading