def to_tool_descriptor(self) -> list[Any]:
    """
    Describes the actions of this action set as a list of tool descriptors.

    This base implementation is a placeholder: it performs no work and returns
    None. It is not marked abstract, so subclasses are not forced to provide it;
    those that support tool descriptors are presumably meant to override it.

    Returns:
        A list of dictionaries describing the actions in the action set
        (when overridden); None from this default implementation.
    """
    return None
def to_tool_description(self, api="openai") -> list[dict]:
    """
    Translates the actions of this action set into tool descriptions for an LLM tool-calling API.

    Args:
        api: target API flavour. With "openai" the parameter schema is stored under the
            "parameters" key and each tool also gets `"type": "function"`. With "anthropic"
            the schema is stored under "input_schema". Any other value falls back to the
            "parameters" key, without the "type" entry.

    Returns:
        A list of tool descriptors, one per entry of `self.action_set`. Each descriptor holds
        the action name, its description, and a JSON-schema-like object listing the
        parameters of the action function and which of them are required.

    Raises:
        KeyError: if an action name is not bound in this module's globals (the function
            signature is looked up by name there).
    """
    schema_keys = {
        "openai": "parameters",
        "anthropic": "input_schema",
    }
    schema = schema_keys.get(api, "parameters")

    # Python annotation -> JSON schema type name. Annotations are matched by identity, so a
    # missing annotation or a parameterized one (e.g. list[str]) falls back to "string".
    json_types = (
        (str, "string"),
        (float, "number"),
        (int, "number"),
        (bool, "boolean"),
        (dict, "object"),
        (list, "array"),
    )
    empty = inspect.Parameter.empty

    tools = []
    for tool_name, action in self.action_set.items():
        # Parse the signature to extract parameter names and types
        signature = inspect.signature(globals()[tool_name])
        parameters = {"type": "object", "properties": {}, "required": []}
        for param_name, param in signature.parameters.items():
            param_type = next(
                (json_type for py_type, json_type in json_types if param.annotation is py_type),
                "string",
            )
            parameters["properties"][param_name] = {"type": param_type}
            # `empty` is a sentinel: compare by identity so that the default value's own
            # __eq__ (custom classes, arrays, ...) is never invoked
            if param.default is empty:
                parameters["required"].append(param_name)

        # Construct the tool descriptor
        tool = {
            "name": tool_name,
            "description": action.description,
            schema: parameters,
        }
        if api == "openai":
            tool["type"] = "function"
        tools.append(tool)

    return tools


def to_tool_descriptor(self) -> list[dict]:
    """
    Exposes the tool descriptions under the hook name declared in action/base.py.

    base.py declares `to_tool_descriptor`, while the implementation here is named
    `to_tool_description`; this keeps both names usable. Delegates to
    `to_tool_description()` with its default API flavour.

    Returns:
        The same list of tool descriptors as `to_tool_description()`.
    """
    return self.to_tool_description()
HighLevelActionSet().to_python_code, + use_raw_page_output: bool = False, ): """ Instantiate a ready to use BrowserEnv gym environment. @@ -96,6 +97,7 @@ def __init__( pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing. pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing. action_mapping: if set, the environment will use this function to map every received action to executable Python code. + use_raw_page_output: if set, the environment will use the raw page output instead of the default processing. """ super().__init__() @@ -115,6 +117,7 @@ def __init__( self.pw_chromium_kwargs = pw_chromium_kwargs self.pw_context_kwargs = pw_context_kwargs self.action_mapping = action_mapping + self.use_raw_page_output = use_raw_page_output # check argument values assert tags_to_mark in ("all", "standard_html") @@ -132,42 +135,67 @@ def __init__( self.chat: Chat = None # observation space - self.observation_space = gym.spaces.Dict( - { - "chat_messages": gym.spaces.Sequence( - gym.spaces.Dict( - { - "role": Unicode(), - "timestamp": Float(), - "message": Unicode(), - } - ) - ), - "goal": Unicode(), - "goal_object": gym.spaces.Sequence(AnyDict()), - "open_pages_urls": gym.spaces.Sequence(Unicode()), - "open_pages_titles": gym.spaces.Sequence(Unicode()), - "active_page_index": gym.spaces.Box( - low=0, high=255, dtype=int - ), # TODO: change to an Integer (breaking change for users) - "url": Unicode(), - "screenshot": AnyBox( - low=0, - high=255, - shape=(-1, -1, 3), - dtype=np.uint8, - ), # swapped axes (height, width, RGB) - "dom_object": AnyDict(), - "axtree_object": AnyDict(), - "extra_element_properties": AnyDict(), - "focused_element_bid": Unicode(), - "last_action": Unicode(), - "last_action_error": Unicode(), - "elapsed_time": gym.spaces.Box( - low=0, high=np.inf, dtype=float - ), # TODO: change to a Float (breaking change for users) - } - ) + if 
use_raw_page_output: + self.observation_space = gym.spaces.Dict( + { + "page": Anything(), + "chat_messages": gym.spaces.Sequence( + gym.spaces.Dict( + { + "role": Unicode(), + "timestamp": Float(), + "message": Unicode(), + } + ) + ), + "goal": Unicode(), + "goal_object": gym.spaces.Sequence(AnyDict()), + "open_pages_urls": gym.spaces.Sequence(Unicode()), + "open_pages_titles": gym.spaces.Sequence(Unicode()), + "active_page_index": gym.spaces.Box(low=0, high=255, dtype=int), + "url": Unicode(), + "last_action": Unicode(), + "last_action_error": Unicode(), + "elapsed_time": gym.spaces.Box(low=0, high=np.inf, dtype=float), + } + ) + else: + self.observation_space = gym.spaces.Dict( + { + "chat_messages": gym.spaces.Sequence( + gym.spaces.Dict( + { + "role": Unicode(), + "timestamp": Float(), + "message": Unicode(), + } + ) + ), + "goal": Unicode(), + "goal_object": gym.spaces.Sequence(AnyDict()), + "open_pages_urls": gym.spaces.Sequence(Unicode()), + "open_pages_titles": gym.spaces.Sequence(Unicode()), + "active_page_index": gym.spaces.Box( + low=0, high=255, dtype=int + ), # TODO: change to an Integer (breaking change for users) + "url": Unicode(), + "screenshot": AnyBox( + low=0, + high=255, + shape=(-1, -1, 3), + dtype=np.uint8, + ), # swapped axes (height, width, RGB) + "dom_object": AnyDict(), + "axtree_object": AnyDict(), + "extra_element_properties": AnyDict(), + "focused_element_bid": Unicode(), + "last_action": Unicode(), + "last_action_error": Unicode(), + "elapsed_time": gym.spaces.Box( + low=0, high=np.inf, dtype=float + ), # TODO: change to a Float (breaking change for users) + } + ) # action space self.action_space = Unicode() @@ -227,16 +255,24 @@ def override_property(task, env, property): pw: playwright.sync_api.Playwright = _get_global_playwright() # important: change playwright's test id attribute from "data-testid" to "bid" pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE) + args = [ + ( + 
f"--window-size={viewport['width']},{viewport['height']}" + if self.resizeable_window + else None + ), + "--disable-features=OverlayScrollbars,ExtendedOverlayScrollbars", # otherwise the screenshot doesn't see the scrollbars + ] + args = [arg for arg in args if arg is not None] # Remove None values # create a new browser self.browser = pw.chromium.launch( headless=self.headless, slow_mo=slow_mo, - args=( - [f"--window-size={viewport['width']},{viewport['height']}"] - if self.resizeable_window - else None - ), + args=args, + ignore_default_args=[ + "--hide-scrollbars" + ], # otherwise the screenshot doesn't see the scrollbars # will raise an Exception if above args are overriden **self.pw_chromium_kwargs, ) @@ -566,7 +602,23 @@ def _active_page_check(self): raise RuntimeError(f"Unexpected: active page has been closed ({self.page}).") def _get_obs(self): - + if self.use_raw_page_output: + obs = { + "page": self.page, + "chat_messages": tuple(copy.deepcopy(self.chat.messages)), + "goal": _try_to_extract_legacy_goal(self.goal_object), # legacy goal, deprecated + "goal_object": tuple( + copy.deepcopy(self.goal_object) + ), # new goal format, list of messages openai style + "open_pages_urls": tuple(page.url for page in self.context.pages), + "open_pages_titles": tuple(page.title() for page in self.context.pages), + "active_page_index": np.asarray([self.context.pages.index(self.page)]), + "url": self.page.url, # redundant with "open_pages_urls" and "active_page_index" + "last_action": self.last_action, + "last_action_error": self.last_action_error, + "elapsed_time": np.asarray([time.time() - self.start_time]), + } + return obs for retries_left in reversed(range(EXTRACT_OBS_MAX_TRIES)): try: # pre-extraction, mark dom elements (set bid, set dynamic attributes like value and checked) diff --git a/browsergym/experiments/requirements.txt b/browsergym/experiments/requirements.txt index 0542cc0d8..233af8f4b 100644 --- a/browsergym/experiments/requirements.txt +++ 
b/browsergym/experiments/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.13.4 +browsergym-core==0.14.0 tiktoken>=0.4 dataclasses-json diff --git a/browsergym/experiments/src/bgym/__init__.py b/browsergym/experiments/src/bgym/__init__.py index c43f505fc..d4c74cd75 100644 --- a/browsergym/experiments/src/bgym/__init__.py +++ b/browsergym/experiments/src/bgym/__init__.py @@ -2,11 +2,7 @@ from browsergym.core.action.highlevel import HighLevelActionSet from browsergym.core.action.python import PythonActionSet from browsergym.experiments.agent import Agent, AgentInfo -from browsergym.experiments.benchmark import ( - DEFAULT_BENCHMARKS, - Benchmark, - HighLevelActionSetArgs, -) +from browsergym.experiments.benchmark import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs from browsergym.experiments.loop import ( AbstractAgentArgs, EnvArgs, diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index c9c5994e0..d099faa3e 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -1,9 +1,6 @@ import numpy as np -from browsergym.experiments.benchmark.metadata.utils import ( - task_list_from_metadata, - task_metadata, -) +from browsergym.experiments.benchmark.metadata.utils import task_list_from_metadata, task_metadata from browsergym.experiments.benchmark.utils import ( make_env_args_list_from_fixed_seeds, make_env_args_list_from_repeat_tasks, diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index 16e0aa667..c554b15d1 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -6,7 +6,6 @@ from typing import Literal import numpy as np - from 
browsergym.experiments.loop import SEED_MAX, EnvArgs logger = logging.getLogger(__name__) @@ -46,7 +45,11 @@ def make_env_args_list_from_workarena_curriculum( def make_env_args_list_from_repeat_tasks( - task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState + task_list: list[str], + max_steps: int, + n_repeats: int, + seeds_rng: np.random.RandomState, + viewport=None, ): """ Generates a list of `len(task_list)` time `n_repeats` environments arguments, using randomly generated seeds. @@ -62,7 +65,7 @@ def make_env_args_list_from_repeat_tasks( headless=True, record_video=False, wait_for_user_message=False, - viewport=None, + viewport=viewport, slow_mo=None, storage_state=None, task_kwargs=None, diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py index 22c5d924d..f185b8b1d 100644 --- a/browsergym/experiments/src/browsergym/experiments/loop.py +++ b/browsergym/experiments/src/browsergym/experiments/loop.py @@ -47,7 +47,9 @@ class EnvArgs(DataClassJsonMixin): storage_state: Optional[str | Path | dict] = None task_kwargs: Optional[dict] = None # use default value from BrowserGym - def make_env(self, action_mapping, exp_dir, exp_task_kwargs: dict = {}): + def make_env( + self, action_mapping, exp_dir, exp_task_kwargs: dict = {}, use_raw_page_output=False + ): """ Instantiates the BrowserGym environment corresponding to the arguments (with some tweaks). @@ -55,6 +57,9 @@ def make_env(self, action_mapping, exp_dir, exp_task_kwargs: dict = {}): action_mapping: overrides the action mapping of the environment. exp_dir: will set some environment parameters (e.g., record_video_dir) with respect to the directory where the experiment is running. exp_task_kwargs: use with caution! Will override task parameters to experiment-specific values. 
Useful to set different server configs for different experiments, or output file paths within the experiment's folder (e.g., assistantbench). + + Returns: + env: the gym environment. """ extra_kwargs = {} if self.record_video: @@ -84,6 +89,7 @@ def make_env(self, action_mapping, exp_dir, exp_task_kwargs: dict = {}): headless=self.headless, wait_for_user_message=self.wait_for_user_message, action_mapping=action_mapping, # action mapping is provided by the agent + use_raw_page_output=use_raw_page_output, **extra_kwargs, ) diff --git a/browsergym/miniwob/requirements.txt b/browsergym/miniwob/requirements.txt index f0aa29e8b..a69001e8c 100644 --- a/browsergym/miniwob/requirements.txt +++ b/browsergym/miniwob/requirements.txt @@ -1 +1 @@ -browsergym-core==0.13.4 +browsergym-core==0.14.0 diff --git a/browsergym/pyproject.toml b/browsergym/pyproject.toml index 26c9039cb..dc6dfffe9 100644 --- a/browsergym/pyproject.toml +++ b/browsergym/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "License :: OSI Approved :: Apache Software License", ] -version="0.13.4" +version="0.14.0" dependencies = [ - "browsergym-core==0.13.4", - "browsergym-miniwob==0.13.4", - "browsergym-webarena==0.13.4", - "browsergym-visualwebarena==0.13.4", - "browsergym-assistantbench==0.13.4", - "browsergym-experiments==0.13.4", + "browsergym-core==0.14.0", + "browsergym-miniwob==0.14.0", + "browsergym-webarena==0.14.0", + "browsergym-visualwebarena==0.14.0", + "browsergym-assistantbench==0.14.0", + "browsergym-experiments==0.14.0", "browsergym-workarena>=0.4.1", "weblinx-browsergym>=0.0.2", ] diff --git a/browsergym/visualwebarena/requirements.txt b/browsergym/visualwebarena/requirements.txt index f450c295e..70a0f5cab 100644 --- a/browsergym/visualwebarena/requirements.txt +++ b/browsergym/visualwebarena/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.13.4 +browsergym-core==0.14.0 browsergym-webarena libvisualwebarena==0.0.15 requests 
diff --git a/browsergym/webarena/requirements.txt b/browsergym/webarena/requirements.txt index 635396bdd..18ed3d634 100644 --- a/browsergym/webarena/requirements.txt +++ b/browsergym/webarena/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.13.4 +browsergym-core==0.14.0 libwebarena==0.0.4 diff --git a/tests/core/test_actions_highlevel.py b/tests/core/test_actions_highlevel.py index a3a4f56c6..00d9ec221 100644 --- a/tests/core/test_actions_highlevel.py +++ b/tests/core/test_actions_highlevel.py @@ -923,7 +923,7 @@ def test_goto(): def test_scroll(): - action_set = HighLevelActionSet(subsets=["coord"]) + action_set = HighLevelActionSet(subsets=["bid", "coord"]) env = gym.make( "browsergym/openended", @@ -1254,3 +1254,7 @@ def get_checkbox(obs, i): assert checkbox.has_attr("checked") env.close() + + +if __name__ == "__main__": + test_scroll()