diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py index bcdea4cd..161ef43e 100644 --- a/browsergym/core/src/browsergym/core/env.py +++ b/browsergym/core/src/browsergym/core/env.py @@ -627,7 +627,8 @@ def _get_obs(self): dom = extract_dom_snapshot(self.page) axtree = extract_merged_axtree(self.page) focused_element_bid = extract_focused_element_bid(self.page) - extra_properties = extract_dom_extra_properties(dom) + scale_factor = getattr(self.page, "_bgym_scale_factor", 1.0) + extra_properties = extract_dom_extra_properties(dom, scale_factor=scale_factor) except (playwright.sync_api.Error, MarkingError) as e: err_msg = str(e) # try to add robustness to async events (detached / deleted frames) diff --git a/browsergym/core/src/browsergym/core/observation.py b/browsergym/core/src/browsergym/core/observation.py index f8e4e066..f0e9a165 100644 --- a/browsergym/core/src/browsergym/core/observation.py +++ b/browsergym/core/src/browsergym/core/observation.py @@ -144,6 +144,14 @@ def extract_screenshot(page: playwright.sync_api.Page): "devicePixelRatio": 1.0, # Override system DPR } + # CAPTURE ORIGINAL METRICS BEFORE CHANGING + original_metrics = { + "width": dimensions["width"], + "height": dimensions["height"], + "deviceScaleFactor": 1.0, # The original scale factor + "mobile": False, + } + # Apply scale factor to device metrics for higher resolution capture cdp.send( "Emulation.setDeviceMetricsOverride", @@ -162,8 +170,9 @@ def extract_screenshot(page: playwright.sync_api.Page): }, ) - # Reset device metrics - cdp.send("Emulation.clearDeviceMetricsOverride") + # RESTORE ORIGINAL METRICS (don't just clear) + cdp.send("Emulation.setDeviceMetricsOverride", original_metrics) + cdp.detach() # bytes of a png file @@ -281,7 +290,7 @@ def pop_bids_from_attribute(dom_snapshot, attr: str): break -def extract_dom_extra_properties(dom_snapshot): +def extract_dom_extra_properties(dom_snapshot, scale_factor): def to_string(idx): if idx == -1: return None @@ -436,9 +445,16 @@ def to_string(idx): if bid: if bid in extra_properties: logger.warning(f"duplicate {BID_ATTR}={repr(bid)} attribute detected") + + scaled_bbox = None + if node["bbox"]: + scaled_bbox = [coord * scale_factor for coord in node["bbox"]] + extra_properties[bid] = { - extra_prop: node[extra_prop] - for extra_prop in ("visibility", "bbox", "clickable", "set_of_marks") + "visibility": node["visibility"], + "bbox": scaled_bbox, # Use scaled coordinates + "clickable": node["clickable"], + "set_of_marks": node["set_of_marks"], } return extra_properties diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index d099faa3..0a1ff0a1 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -1,6 +1,8 @@ import numpy as np - -from browsergym.experiments.benchmark.metadata.utils import task_list_from_metadata, task_metadata +from browsergym.experiments.benchmark.metadata.utils import ( + task_list_from_metadata, + task_metadata, +) from browsergym.experiments.benchmark.utils import ( make_env_args_list_from_fixed_seeds, make_env_args_list_from_repeat_tasks, @@ -88,7 +90,7 @@ # all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob_all"]()` DEFAULT_BENCHMARKS = { - "miniwob": lambda: Benchmark( + "miniwob": lambda n_repeats=5: Benchmark( name="miniwob", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob_all"], is_multi_tab=False, @@ -97,12 +99,12 @@ env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("miniwob")), max_steps=10, - n_repeats=5, + n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("miniwob"), ), - "miniwob_tiny_test": lambda: Benchmark( + "miniwob_tiny_test": lambda n_repeats=2: Benchmark( name="miniwob_tiny_test", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob_all"], is_multi_tab=False, @@ -111,12 +113,12 @@ env_args_list=make_env_args_list_from_repeat_tasks( task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"], max_steps=5, - n_repeats=2, + n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("miniwob"), ), - "webarena": lambda: Benchmark( + "webarena": lambda n_repeats=1: Benchmark( name="webarena", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], is_multi_tab=True, @@ -125,12 +127,12 @@ env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("webarena")), max_steps=30, - n_repeats=1, + n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("webarena"), ), - "webarena_tiny": lambda: Benchmark( + "webarena_tiny": lambda n_repeats=1: Benchmark( name="webarena_tiny", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], is_multi_tab=True, @@ -150,7 +152,7 @@ ), task_metadata=task_metadata("webarena"), ), - "visualwebarena_tiny": lambda: Benchmark( + "visualwebarena_tiny": lambda n_repeats=10: Benchmark( name="visualwebarena_tiny", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"], is_multi_tab=True, @@ -168,7 +170,7 @@ ), task_metadata=task_metadata("visualwebarena"), ), - "visualwebarena": lambda: Benchmark( + "visualwebarena": lambda n_repeats=1: Benchmark( name="visualwebarena", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"], is_multi_tab=True, @@ -177,12 +179,12 @@ env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("visualwebarena")), max_steps=30, - n_repeats=1, + n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("visualwebarena"), ), - "workarena_l1": lambda: Benchmark( + "workarena_l1": lambda n_repeats=10: Benchmark( name="workarena_l1", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"], is_multi_tab=False, @@ -194,11 +196,11 @@ meta_seed=42, # meta seed for evaluation curriculum max_steps=15, curriculum_type="agent", - seeds_l1=10, + seeds_l1=n_repeats, ), task_metadata=task_metadata("workarena"), ), - "workarena_l2_agent_curriculum_eval": lambda: Benchmark( + "workarena_l2_agent_curriculum_eval": lambda n_repeats=1: Benchmark( name="workarena_l2_agent_curriculum_eval", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena++"], is_multi_tab=True, @@ -213,7 +215,7 @@ ), task_metadata=task_metadata("workarena"), ), - "workarena_l3_agent_curriculum_eval": lambda: Benchmark( + "workarena_l3_agent_curriculum_eval": lambda n_repeats=1: Benchmark( name="workarena_l3_agent_curriculum_eval", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena++"], is_multi_tab=True, @@ -228,7 +230,7 @@ ), task_metadata=task_metadata("workarena"), ), - "assistantbench": lambda: Benchmark( + "assistantbench": lambda n_repeats=1: Benchmark( name="assistantbench", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["assistantbench"], is_multi_tab=True, @@ -239,12 +241,12 @@ metadata=task_metadata("assistantbench"), filter={"browsergym_split": "valid|test"} ), max_steps=30, - n_repeats=1, + n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("assistantbench"), ), - "weblinx": lambda: Benchmark( + "weblinx": lambda n_repeats=1: Benchmark( name="weblinx", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["weblinx"], is_multi_tab=True, @@ -253,7 +255,7 @@ env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("weblinx")), max_steps=1, - n_repeats=1, + n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("weblinx"),