Create a webarena_verified action space with a dedicated submit function (`send_response_to_wav`) that matches the benchmark's expected agent response format

NicolasAG · NicolasAG · commit b7f847a62d6c · 2025-10-28T20:33:36.000Z
diff --git a/browsergym/core/src/browsergym/core/action/functions.py b/browsergym/core/src/browsergym/core/action/functions.py
@@ -1,6 +1,7 @@
 # these are placeholders
 # all these symbols will be available in browsergym actions
-from typing import Literal
+import json
+from typing import Any, Literal
 
 import playwright.sync_api
 
@@ -24,6 +25,53 @@
 inspect.getsource().
 """
 
+def send_response_to_wav(
+    performed_operation: Literal["RETRIEVE", "MUTATE", "NAVIGATE"],
+    status: Literal["SUCCESS", "ACTION_NOT_ALLOWED_ERROR", "NOT_FOUND_ERROR", "PERMISSION_DENIED_ERROR", "DATA_VALIDATION_ERROR", "UNKNOWN_ERROR"],
+    retrieved_data: list[str | int | float | bool | dict[str, object] | None] | None = None,
+    error_details: str | None = None,
+):
+    """Send the final response.
+    Args:
+        performed_operation: The overall type of work performed to attain the task objective.
+            - RETRIEVE: Use when retrieving data is the main objective of the task
+            - MUTATE: Use when creating, updating, or deleting data is the main objective of the task
+            - NAVIGATE: Use when navigating or browsing to show a specific page or location is the main objective of the task
+        status: The outcome of the task execution.
+            - SUCCESS: Use when the task objective was fully achieved
+            - ACTION_NOT_ALLOWED_ERROR: Use when the platform does not support the requested action
+            - NOT_FOUND_ERROR: Use when the target entity or resource could not be located after retry attempts
+            - PERMISSION_DENIED_ERROR: Use when the current user lacks permission to perform the action
+            - DATA_VALIDATION_ERROR: Use when required input data was missing or invalid
+            - UNKNOWN_ERROR: Use when an unexpected failure doesn't match other categories
+        retrieved_data: Array of items for 'retrieve' operations, null for 'mutate' and 'navigate' operations.
+            Returns empty array if no items found. All items must be the same type (either all primitives of the same type, or all objects with the same keys).
+            Use appropriate data type formats (e.g., numbers for amounts/counts, true/false for booleans, not strings).
+            For list of objects, the user instruction contains the format specification.
+        error_details: Null when status is 'SUCCESS'. Otherwise, explains what failed, why it failed, and what was attempted.
+
+    Examples:
+        send_response_to_wav("RETRIEVE", "SUCCESS", ["The city was built in 1751."])
+        send_response_to_wav("RETRIEVE", "SUCCESS", [{"name": "John Doe", "age": 30}])
+        send_response_to_wav("RETRIEVE", "SUCCESS", [0,3])
+        send_response_to_wav("RETRIEVE", "ACTION_NOT_ALLOWED_ERROR", None)
+        send_response_to_wav("RETRIEVE", "NOT_FOUND_ERROR", None, "No city found.")
+        send_response_to_wav("MUTATE", "SUCCESS", None)
+        send_response_to_wav("MUTATE", "PERMISSION_DENIED_ERROR", None, "User lacks permission to build a city.")
+        send_response_to_wav("NAVIGATE", "SUCCESS", None)
+        send_response_to_wav("NAVIGATE", "DATA_VALIDATION_ERROR", None, "Invalid city name.")
+        send_response_to_wav("NAVIGATE", "UNKNOWN_ERROR", None, "Unexpected error.")
+
+    """
+    import json  # local import (and no typing.Any in the signature): this source is reused via inspect.getsource(), where module-level imports may be absent -- TODO confirm
+    final_response_dict = {
+        "performed_operation": performed_operation,
+        "status": status,
+        "retrieved_data": retrieved_data,
+        "error_details": error_details,
+    }
+    send_message_to_user(json.dumps(final_response_dict))  # the whole response travels as one JSON string
+
 
 def send_msg_to_user(text: str):
     """
diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py
@@ -36,6 +36,7 @@
     scroll_at,
     select_option,
     send_msg_to_user,
+    send_response_to_wav,
     tab_close,
     tab_focus,
     upload_file,
@@ -151,6 +152,28 @@
         send_msg_to_user,  #  STOP          | stop(answer)     | stop [answer]
         report_infeasible,  ## explicit unachievable action, equivalent STOP "N/A"
     ],
+    # webarena_verified agent response schema
+    # https://github.com/ServiceNow/platform-labs-webarena-verified/blob/main/src/webarena_verified/types/agent_response.py
+    "webarena_verified": [
+        #                   #     code      |      paper       |      prompt
+        scroll,  #            SCROLL        | scroll(dir)      | scroll [down|up]
+        keyboard_press,  #    KEY_PRESS     | press(key_comb)  | press [key_comb]
+        #                     MOUSE_CLICK   |                  |
+        #                     KEYBOARD_TYPE |                  |
+        #                     MOUSE_HOVER   |                  |
+        click,  #             CLICK         | click(elem)      | click [id]
+        fill,  #              TYPE          | type(elem, text) | type [id] [content]
+        hover,  #             HOVER         | hover(elem)      | hover [id]
+        tab_focus,  #         PAGE_FOCUS    | tab_focus(index) | tab_focus [tab_index]
+        new_tab,  #           NEW_TAB       | new_tab()        | new_tab
+        go_back,  #           GO_BACK       | go_back()        | go_back
+        go_forward,  #        GO_FORWARD    | go_forward()     | go_forward
+        goto,  #              GOTO_URL      | goto(url)        | goto [url]
+        tab_close,  #         PAGE_CLOSE    | tab_close()      | close_tab
+        #                     CHECK         |                  |
+        select_option,  #     SELECT_OPTION |                  |
+        send_response_to_wav,  #  STOP          | stop(answer)     | stop [answer]
+    ],
     # from the visualwebarena paper
     # https://arxiv.org/abs/2401.13649
     # from the visualwebarena source code
@@ -272,6 +295,7 @@ class HighLevelActionSet(AbstractActionSet):
         "miniwob_liu18",
         "miniwob_humphreys22",
         "webarena",
+        "webarena_verified",
         "visualwebarena",
         "workarena",
         "workarena++",
diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py
@@ -65,6 +65,13 @@
         retry_with_force=True,
         demo_mode="off",
     ),
+    "webarena_verified": HighLevelActionSetArgs(
+        subsets=["webarena_verified"],
+        multiaction=False,
+        strict=False,
+        retry_with_force=True,
+        demo_mode="off",
+    ),
     # from https://arxiv.org/abs/2401.13649
     "visualwebarena": HighLevelActionSetArgs(
         subsets=["visualwebarena"],
@@ -135,7 +142,7 @@
     ),
     "webarena_verified": lambda n_repeats=1: Benchmark(
         name="webarena_verified",
-        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
+        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena_verified"],
         is_multi_tab=True,
         supports_parallel_seeds=False,
         backends=["webarena_verified"],
diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py
@@ -137,16 +137,20 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
         goal = self.config["intent"]
 
         # WebArena Verified requires a specific format for the agent response
-        response_schema = FinalAgentResponse.model_json_schema()
-        goal += f"""
-
----
-Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema:
-```
-{json.dumps(response_schema, indent=4)}
-```
-Your message in `send_msg_to_user` will be validated against this schema.
+        goal += """
+
+When you are done, send your final answer to the user with `send_response_to_wav`.
 """
+#         response_schema = FinalAgentResponse.model_json_schema()
+#         goal += f"""
+
+# ---
+# Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema:
+# ```
+# {json.dumps(response_schema, indent=4)}
+# ```
+# Your message in `send_msg_to_user` will be validated against this schema.
+# """
 
         # This note is present in all webarena's agent prompts
         # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34