Skip to content

Commit b7f847a

Browse files
committed
Create a `webarena_verified` action space with a dedicated submit function (`send_response_to_wav`) that matches the agent response format expected by the benchmark
1 parent 63b4b07 commit b7f847a

File tree

4 files changed

+94
-11
lines changed

4 files changed

+94
-11
lines changed

browsergym/core/src/browsergym/core/action/functions.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# these are placeholders
22
# all these symbols will be available in browsergym actions
3-
from typing import Literal
3+
import json
4+
from typing import Any, Literal
45

56
import playwright.sync_api
67

@@ -24,6 +25,53 @@
2425
inspect.getsource().
2526
"""
2627

28+
def send_response_to_wav(
    performed_operation: Literal["RETRIEVE", "MUTATE", "NAVIGATE"],
    status: Literal[
        "SUCCESS",
        "ACTION_NOT_ALLOWED_ERROR",
        "NOT_FOUND_ERROR",
        "PERMISSION_DENIED_ERROR",
        "DATA_VALIDATION_ERROR",
        "UNKNOWN_ERROR",
    ],
    retrieved_data: list[str | int | float | bool | dict[str, Any] | None] | None = None,
    error_details: str | None = None,
):
    """Send the final response.
    Args:
        performed_operation: The overall type of work performed to attain the task objective.
            - RETRIEVE: Use when retrieving data is the main objective of the task
            - MUTATE: Use when creating, updating, or deleting data is the main objective of the task
            - NAVIGATE: Use when navigating or browsing to show a specific page or location is the main objective of the task
        status: The outcome of the task execution.
            - SUCCESS: Use when the task objective was fully achieved
            - ACTION_NOT_ALLOWED_ERROR: Use when the platform does not support the requested action
            - NOT_FOUND_ERROR: Use when the target entity or resource could not be located after retry attempts
            - PERMISSION_DENIED_ERROR: Use when the current user lacks permission to perform the action
            - DATA_VALIDATION_ERROR: Use when required input data was missing or invalid
            - UNKNOWN_ERROR: Use when an unexpected failure doesn't match other categories
        retrieved_data: Array of items for 'retrieve' operations, null for 'mutate' and 'navigate' operations.
            Returns empty array if no items found. All items must be the same type (either all primitives of the same type, or all objects with the same keys).
            Use appropriate data type formats (e.g., numbers for amounts/counts, true/false for booleans, not strings).
            For list of objects, the user instruction contains the format specification.
        error_details: Null when status is 'SUCCESS'. Otherwise, explains what failed, why it failed, and what was attempted.

    Examples:
        send_response_to_wav("RETRIEVE", "SUCCESS", ["The city was built in 1751."])
        send_response_to_wav("RETRIEVE", "SUCCESS", [{"name": "John Doe", "age": 30}])
        send_response_to_wav("RETRIEVE", "SUCCESS", [0,3])
        send_response_to_wav("RETRIEVE", "ACTION_NOT_ALLOWED_ERROR", None)
        send_response_to_wav("RETRIEVE", "NOT_FOUND_ERROR", None, "No city found.")
        send_response_to_wav("MUTATE", "SUCCESS", None)
        send_response_to_wav("MUTATE", "PERMISSION_DENIED_ERROR", None, "User lacks permission to build a city.")
        send_response_to_wav("NAVIGATE", "SUCCESS", None)
        send_response_to_wav("NAVIGATE", "DATA_VALIDATION_ERROR", None, "Invalid city name.")
        send_response_to_wav("NAVIGATE", "UNKNOWN_ERROR", None, "Unexpected error.")

    """
    # Keyword order here fixes the key order of the serialized JSON object.
    payload = dict(
        performed_operation=performed_operation,
        status=status,
        retrieved_data=retrieved_data,
        error_details=error_details,
    )
    # NOTE(review): send_message_to_user is not defined in the visible part of
    # this file; presumably it is one of the placeholder symbols supplied when
    # the action is executed -- confirm against the module header.
    send_message_to_user(json.dumps(payload))
74+
2775

2876
def send_msg_to_user(text: str):
2977
"""

browsergym/core/src/browsergym/core/action/highlevel.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
scroll_at,
3737
select_option,
3838
send_msg_to_user,
39+
send_response_to_wav,
3940
tab_close,
4041
tab_focus,
4142
upload_file,
@@ -151,6 +152,28 @@
151152
send_msg_to_user, # STOP | stop(answer) | stop [answer]
152153
report_infeasible, ## explicit unachievable action, equivalent STOP "N/A"
153154
],
155+
# webarena_verified agent response schema
156+
# https://github.com/ServiceNow/platform-labs-webarena-verified/blob/main/src/webarena_verified/types/agent_response.py
157+
"webarena_verified": [
158+
# # code | paper | prompt
159+
scroll, # SCROLL | scroll(dir) | scroll [down|up]
160+
keyboard_press, # KEY_PRESS | press(key_comb) | press [key_comb]
161+
# MOUSE_CLICK | |
162+
# KEYBOARD_TYPE | |
163+
# MOUSE_HOVER | |
164+
click, # CLICK | click(elem) | click [id]
165+
fill, # TYPE | type(elem, text) | type [id] [content]
166+
hover, # HOVER | hover(elem) | hover [id]
167+
tab_focus, # PAGE_FOCUS | tab_focus(index) | tab_focus [tab_index]
168+
new_tab, # NEW_TAB | new_tab() | new_tab
169+
go_back, # GO_BACK | go_back() | go_back
170+
go_forward, # GO_FORWARD | go_forward() | go_forward
171+
goto, # GOTO_URL | goto(url) | goto [url]
172+
tab_close, # PAGE_CLOSE | tab_close() | close_tab
173+
# CHECK | |
174+
select_option, # SELECT_OPTION | |
175+
send_response_to_wav, # STOP | stop(answer) | stop [answer]
176+
],
154177
# from the visualwebarena paper
155178
# https://arxiv.org/abs/2401.13649
156179
# from the visualwebarena source code
@@ -272,6 +295,7 @@ class HighLevelActionSet(AbstractActionSet):
272295
"miniwob_liu18",
273296
"miniwob_humphreys22",
274297
"webarena",
298+
"webarena_verified",
275299
"visualwebarena",
276300
"workarena",
277301
"workarena++",

browsergym/experiments/src/browsergym/experiments/benchmark/configs.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,13 @@
6565
retry_with_force=True,
6666
demo_mode="off",
6767
),
68+
"webarena_verified": HighLevelActionSetArgs(
69+
subsets=["webarena_verified"],
70+
multiaction=False,
71+
strict=False,
72+
retry_with_force=True,
73+
demo_mode="off",
74+
),
6875
# from https://arxiv.org/abs/2401.13649
6976
"visualwebarena": HighLevelActionSetArgs(
7077
subsets=["visualwebarena"],
@@ -135,7 +142,7 @@
135142
),
136143
"webarena_verified": lambda n_repeats=1: Benchmark(
137144
name="webarena_verified",
138-
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
145+
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena_verified"],
139146
is_multi_tab=True,
140147
supports_parallel_seeds=False,
141148
backends=["webarena_verified"],

browsergym/webarena_verified/src/browsergym/webarena_verified/task.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -137,16 +137,20 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
137137
goal = self.config["intent"]
138138

139139
# WebArena Verified requires a specific format for the agent response
140-
response_schema = FinalAgentResponse.model_json_schema()
141-
goal += f"""
142-
143-
---
144-
Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema:
145-
```
146-
{json.dumps(response_schema, indent=4)}
147-
```
148-
Your message in `send_msg_to_user` will be validated against this schema.
140+
goal += """
141+
142+
When you are done, send your final answer to the user with `send_response_to_wav`.
149143
"""
144+
# response_schema = FinalAgentResponse.model_json_schema()
145+
# goal += f"""
146+
147+
# ---
148+
# Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema:
149+
# ```
150+
# {json.dumps(response_schema, indent=4)}
151+
# ```
152+
# Your message in `send_msg_to_user` will be validated against this schema.
153+
# """
150154

151155
# This note is present in all webarena's agent prompts
152156
# https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34

0 commit comments

Comments
 (0)