Skip to content

Commit 35f149b

Browse files
committed
Merge branch 'main' into new_experiments
2 parents b1bad6e + 7d55bfd commit 35f149b

25 files changed

+3078
-598
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,9 @@ outputs/
172172
miniwob-plusplus/
173173
.miniwob-server.pid
174174
debugging_results/
175+
docker_vm_data/
176+
OSWorld/
177+
175178

176179
# working files
177180
experiments/*

.vscode/settings.json

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,20 @@
33
"editor.formatOnSave": true,
44
"editor.defaultFormatter": "ms-python.black-formatter",
55
"editor.codeActionsOnSave": {
6-
"source.organizeImports": "explicit",
7-
"source.fixAll": "never"
8-
}
6+
"source.organizeImports": "always",
7+
"source.fixAll": "always",
8+
},
99
},
10+
"python.analysis.languageServerMode": "full",
11+
"python.analysis.typeCheckingMode": "standard",
1012
"python.testing.pytestArgs": [
1113
"tests"
1214
],
1315
"python.testing.unittestEnabled": false,
1416
"python.testing.pytestEnabled": true,
17+
"files.watcherExclude": {
18+
"**/.git/objects/**": true,
19+
"**/.git/subtree-cache/**": true,
20+
"**/node_modules/*/**": true
21+
},
1522
}

Makefile

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: test setup miniwob lint stop-miniwob
1+
.PHONY: test setup miniwob lint stop-miniwob osworld
22

33
setup:
44
@pip install -e .
@@ -30,3 +30,23 @@ test: setup miniwob check-miniwob run-tests stop-miniwob
3030
lint: setup
3131
@black src/ --check --diff
3232
@darglint -v 2 -z short src/
33+
34+
# Clone OSWorld when it is missing, relax its pinned requirements, then install it in editable mode.
# - The clone is skipped only when the checkout already exists, so a genuine clone failure stops the target.
# - All substitutions run in one sed call, so requirements.txt.bak holds the file as it was before this run.
osworld:
	@echo "Setting up OSWorld..."
	@[ -d OSWorld ] || git clone https://github.com/xlang-ai/OSWorld
	@echo "Modifying OSWorld requirements.txt to remove pinned versions..."
	@cd OSWorld && \
	sed -i.bak \
		-e 's/numpy~=.*/numpy/' \
		-e 's/torch~=.*/torch/' \
		-e 's/tqdm~=.*/tqdm/' \
		-e 's/pandas~=.*/pandas/' \
		requirements.txt
	@echo "Installing OSWorld requirements..."
	@cd OSWorld && pip install -r requirements.txt
	@echo "Installing OSWorld in development mode..."
	@cd OSWorld && pip install -e .
	@echo "OSWorld setup completed!"
	@echo "Next steps:"
	@echo "1. Configure your VM (VMware/VirtualBox) according to OSWorld documentation"
	@echo "2. Download or set up the Ubuntu VM image"
	@echo "3. Run AgentLab with OSWorld tasks"

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ AgentLab Features:
6161
| [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon |
6262
| [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon |
6363
| [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon |
64+
| [OSWorld](https://os-world.github.io/) | [setup](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/benchmarks/setup.md) | 369 | None | - | - | self hosted | soon |
6465

6566

6667
## 🛠️ Setup AgentLab
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[
2+
{
3+
"id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
4+
"task": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
5+
"complexity": 1
6+
},
7+
{
8+
"id": "59f21cfb-0120-4326-b255-a5b827b38967",
9+
"task": "Could you play the music video that's saved on my desktop for me via vlc?",
10+
"complexity": 1
11+
},
12+
{
13+
"id": "35253b65-1c19-4304-8aa4-6884b8218fc0",
14+
"task": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
15+
"complexity": 1
16+
},
17+
{
18+
"id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
19+
"task": "Please help me change all the places in this document that say \"text\" to \"test\".",
20+
"complexity": 1
21+
},
22+
{
23+
"id": "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
24+
"task": "I am currently using an Ubuntu system, and I have wrongly deleted a poster of party night. Could you help me recover it from the Trash?",
25+
"complexity": 1
26+
},
27+
{
28+
"id": "510f64c8-9bcc-4be1-8d30-638705850618",
29+
"task": "Could you start VS Code in folder ~/Desktop/project from the terminal?",
30+
"complexity": 1
31+
},
32+
{
33+
"id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
34+
"task": "Please help me use VS Code to open the \"project\" in the \"user\" folder under \"home\".",
35+
"complexity": 1
36+
}
37+
]

experiments/osworld_docker_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import logging
2+
3+
from desktop_env.desktop_env import DesktopEnv
4+
5+
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

# Task definition handed to DesktopEnv.reset().
# - "config" runs one setup command: a pyautogui click at (960, 540) followed by a 0.5 s sleep.
# - "evaluator" runs `which spotify` and checks that the output includes "spotify" and
#   excludes "not found".
# NOTE(review): the meaning of each key is defined by OSWorld; confirm against its task schema.
example = {
    "id": "94d95f96-9699-4208-98ba-3c3119edf9c2",
    "instruction": "I want to install Spotify on my current system. Could you please help me?",
    "config": [
        {
            "type": "execute",
            "parameters": {
                "command": [
                    "python",
                    "-c",
                    "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);",
                ]
            },
        }
    ],
    "evaluator": {
        "func": "check_include_exclude",
        "result": {"type": "vm_command_line", "command": "which spotify"},
        "expected": {"type": "rule", "rules": {"include": ["spotify"], "exclude": ["not found"]}},
    },
}

env = DesktopEnv(action_space="pyautogui", provider_name="docker", os_type="Ubuntu")
try:
    # Smoke test: reset on the example task, send a single action, and dump the observation.
    obs = env.reset(task_config=example)
    obs, reward, done, info = env.step("pyautogui.rightClick()")
    print(obs)
finally:
    # Release the environment even when reset/step raises, so the VM is not left running.
    env.close()

experiments/run_osworld.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import json
2+
import logging
3+
import os
4+
5+
from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
6+
from agentlab.benchmarks.osworld import OsworldBenchmark
7+
from agentlab.experiments.study import Study, make_study
8+
9+
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
10+
logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
11+
12+
13+
def get_most_recent_incomplete_study() -> Study:
    """Load the most recent study and prepare it for a relaunch.

    The loaded study has ``find_incomplete(include_errors=True)`` applied, so
    that running it continues the incomplete experiments and relaunches the
    ones that errored.

    Returns:
        The study returned by ``Study.load_most_recent()``.
    """
    latest_study = Study.load_most_recent()
    latest_study.find_incomplete(include_errors=True)
    return latest_study
20+
21+
22+
def get_task_ids(path: str = "experiments/osworld_debug_task_ids.json") -> set[str]:
    """Return the set of task ids listed in a debug task file.

    Args:
        path: JSON file holding a list of objects that each have an ``"id"``
            key. The default is relative to the current working directory, so
            it only resolves when the script is launched from the repository
            root.

    Returns:
        The distinct ``"id"`` values found in the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If an entry has no ``"id"`` key.
    """
    # Explicit encoding: the task descriptions are free text and must not depend on the locale.
    with open(path, "r", encoding="utf-8") as f:
        tasks = json.load(f)
    return {task["id"] for task in tasks}
26+
27+
28+
def main():
    """Build an OSWorld study and run it.

    The settings are local constants meant to be edited in place:
    ``use_vmware`` switches every experiment to the VMware provider and forces
    the sequential backend, and ``relaunch`` replaces the freshly built study
    with the most recent incomplete one just before running.
    """
    n_jobs = 4
    use_vmware = True
    relaunch = False
    parallel_backend = "ray"
    # OSWORLD_OAI is left out on purpose: it performs poorly.
    agent_args = [OSWORLD_CLAUDE]  # type: ignore

    # Debug mode is on by default; a value already present in the environment is kept.
    os.environ.setdefault("AGENTLAB_DEBUG", "1")

    # Use "test_all.json" instead for the full test set.
    benchmark = OsworldBenchmark(test_set_name="test_small.json")  # type: ignore
    study = make_study(
        benchmark=benchmark,
        agent_args=agent_args,  # type: ignore
        comment="osworld debug 2",
        logging_level=logging.INFO,
        logging_level_stdout=logging.INFO,
    )

    if use_vmware:
        for exp_args in study.exp_args_list:
            env_args = exp_args.env_args
            env_args.provider_name = "vmware"  # type: ignore
            env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # type: ignore
        parallel_backend = "sequential"

    # Any non-empty AGENTLAB_DEBUG value (including "0") enables the debug subset.
    if os.environ.get("AGENTLAB_DEBUG"):
        debug_ids = get_task_ids()
        study.exp_args_list = [
            candidate
            for candidate in study.exp_args_list
            if candidate.env_args.task["id"] in debug_ids  # type: ignore
        ]
        print(f"Debug on {len(study.exp_args_list)} experiments")
        n_jobs = 1  # Make sure to use 1 job when debugging in VS

    if relaunch:
        study = get_most_recent_incomplete_study()
    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
63+
64+
65+
if __name__ == "__main__":
66+
main()

main_workarena_debug.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Note: This script is a convenience script to launch experiments instead of using
3+
the command line.
4+
5+
Copy this script and modify at will, but don't push your changes to the
6+
repository.
7+
"""
8+
9+
import logging
10+
from copy import deepcopy
11+
12+
import bgym
13+
14+
from agentlab.agents.tool_use_agent.tool_use_agent import (
15+
DEFAULT_PROMPT_CONFIG,
16+
GPT_4_1,
17+
ToolUseAgentArgs,
18+
)
19+
from agentlab.experiments.study import Study
20+
21+
logging.getLogger().setLevel(logging.INFO)

# Prompt configuration: a private copy of the defaults with `obs.use_som` turned on.
# (Set `config.keep_last_n_obs = 1` here to limit the observation history.)
config = deepcopy(DEFAULT_PROMPT_CONFIG)
config.obs.use_som = True

# Agents to evaluate; append more ToolUseAgentArgs entries to compare configurations.
agent_configs = [ToolUseAgentArgs(model_args=GPT_4_1, config=config)]

for agent_config in agent_configs:
    agent_config.config.action_subsets = ("workarena",)  # use the workarena action set

# Benchmark selection: WorkArena L1 with 4 repeats, restricted to task names matching "*create*".
# Swap the key for "miniwob_tiny_test" to run a small benchmark instead.
# To cap episode length, set `env_args.max_steps` on each entry of `benchmark.env_args_list`.
benchmark = bgym.DEFAULT_BENCHMARKS["workarena_l1"](n_repeats=4).subset_from_glob(
    "task_name", "*create*"
)

relaunch = False

# Number of parallel jobs. Use n_jobs = 1 and parallel_backend = "sequential"
# when debugging in VSCode.
n_jobs = 10
parallel_backend = "ray"

if __name__ == "__main__":  # necessary for dask backend
    if not relaunch:
        study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING)
    else:
        # Relaunch an existing study instead of starting a new one.
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)

    study.run(
        n_jobs=n_jobs,
        parallel_backend=parallel_backend,  # "ray", "joblib" or "sequential"
        strict_reproducibility=False,
        n_relaunch=3,
    )

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@ matplotlib
2626
ray[default]
2727
python-slugify
2828
pillow
29-
gymnasium>=0.27
29+
gymnasium>=0.27

src/agentlab/agents/debug_agent.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from copy import deepcopy
2+
from dataclasses import asdict, dataclass
3+
from functools import partial
4+
5+
import bgym
6+
from browsergym.experiments.agent import Agent, AgentInfo
7+
from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html
8+
9+
from agentlab.agents.agent_args import AgentArgs
10+
from agentlab.llm.chat_api import BaseModelArgs
11+
from agentlab.llm.llm_utils import ParseError, image_to_png_base64_url, parse_html_tags_raise, retry
12+
from agentlab.llm.tracking import cost_tracker_decorator
13+
14+
15+
@dataclass
class DebugAgentArgs(AgentArgs):
    """Arguments for ``DebugAgent``, an agent whose actions are typed in by a human."""

    def __post_init__(self):
        """Set the agent name and default to the ``miniwob_tiny_test`` action set."""
        # Plain literal: the name has no placeholders and no "/" to sanitize,
        # so no f-string, replace() or AttributeError guard is needed.
        self.agent_name = "debug"
        self.action_set_args = bgym.DEFAULT_BENCHMARKS[
            "miniwob_tiny_test"
        ]().high_level_action_set_args
        self.use_html = False

    def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode):
        """Adopt the benchmark's action set; show HTML instead of the AXTree for miniwob.

        Args:
            benchmark: Benchmark the agent is about to run on.
            demo_mode: Accepted for interface compatibility; not used here.
        """
        if benchmark.name.startswith("miniwob"):
            self.use_html = True
        self.action_set_args = benchmark.high_level_action_set_args

    def make_agent(self):
        """Instantiate a ``DebugAgent`` from the current settings."""
        return DebugAgent(self.action_set_args, use_html=self.use_html)
35+
36+
37+
class DebugAgent(Agent):
    """Interactive agent: prints the observation and reads the next action from stdin."""

    def __init__(self, action_set_args, use_html=False):
        """Build the action set and remember which observation view to display.

        Args:
            action_set_args: Object exposing ``make_action_set()``.
            use_html: Show the pruned HTML when true, the AXTree text otherwise.
        """
        self.action_set = action_set_args.make_action_set()
        self.use_html = use_html

    def obs_preprocessor(self, obs):
        """Return a deep copy of ``obs`` enriched with text views and a SoM screenshot.

        Adds the keys ``dom_txt``, ``axtree_txt``, ``pruned_html`` and
        ``screenshot_som``; the caller's ``obs`` is left untouched.
        """
        enriched = deepcopy(obs)
        element_properties = enriched["extra_element_properties"]

        # The DOM and AXTree flatteners are called with exactly the same options.
        flatten_options = dict(
            extra_properties=element_properties,
            with_visible=True,
            with_clickable=True,
            with_center_coords=True,
            with_bounding_box_coords=True,
            filter_visible_only=False,
            filter_with_bid_only=False,
            filter_som_only=False,
        )
        enriched["dom_txt"] = flatten_dom_to_str(enriched["dom_object"], **flatten_options)
        enriched["axtree_txt"] = flatten_axtree_to_str(enriched["axtree_object"], **flatten_options)
        enriched["pruned_html"] = prune_html(enriched["dom_txt"])
        enriched["screenshot_som"] = overlay_som(
            enriched["screenshot"], extra_properties=element_properties
        )
        return enriched

    def get_action(self, obs):
        """Show the selected observation view and return the action typed by the user.

        Returns:
            A ``(action, agent_info)`` pair where ``action`` is the raw line read
            from stdin and ``agent_info`` is a placeholder ``AgentInfo``.
        """
        print("\n")
        view_key = "pruned_html" if self.use_html else "axtree_txt"
        action = input(obs[view_key] + "\n")
        return action, AgentInfo(think="nope", chat_messages=[], stats={})
88+
89+
90+
# Ready-made default arguments for the debug agent. Constructing it here runs
# DebugAgentArgs.__post_init__ when this module is imported.
DEBUG_AGENT = DebugAgentArgs()

0 commit comments

Comments
 (0)