Skip to content

Commit 684f56b

Browse files
Merge pull request #255 from ServiceNow/osworld
OSWorld benchmark
2 parents 7b8d24e + b0d4a99 commit 684f56b

File tree

17 files changed

+1923
-19
lines changed

17 files changed

+1923
-19
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,9 @@ outputs/
172172
miniwob-plusplus/
173173
.miniwob-server.pid
174174
debugging_results/
175+
docker_vm_data/
176+
OSWorld/
177+
175178

176179
# working files
177180
experiments/*

.vscode/settings.json

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,20 @@
33
"editor.formatOnSave": true,
44
"editor.defaultFormatter": "ms-python.black-formatter",
55
"editor.codeActionsOnSave": {
6-
"source.organizeImports": "explicit",
7-
"source.fixAll": "never"
8-
}
6+
"source.organizeImports": "always",
7+
"source.fixAll": "always",
8+
},
99
},
10+
"python.analysis.languageServerMode": "full",
11+
"python.analysis.typeCheckingMode": "standard",
1012
"python.testing.pytestArgs": [
1113
"tests"
1214
],
1315
"python.testing.unittestEnabled": false,
1416
"python.testing.pytestEnabled": true,
17+
"files.watcherExclude": {
18+
"**/.git/objects/**": true,
19+
"**/.git/subtree-cache/**": true,
20+
"**/node_modules/*/**": true
21+
},
1522
}

Makefile

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: test setup miniwob lint stop-miniwob
1+
.PHONY: test setup miniwob lint stop-miniwob osworld
22

33
setup:
44
@pip install -e .
@@ -30,3 +30,23 @@ test: setup miniwob check-miniwob run-tests stop-miniwob
3030
lint: setup
3131
@black src/ --check --diff
3232
@darglint -v 2 -z short src/
33+
34+
# Clone OSWorld next to AgentLab (skipped when the checkout already exists),
# relax the version pins that clash with AgentLab's own dependencies, then
# install OSWorld and its requirements.
#
# All substitutions run in a single sed invocation so that requirements.txt.bak
# holds the file exactly as it was before this run's edits, instead of being
# overwritten by every successive `sed -i.bak` call.
osworld:
	@echo "Setting up OSWorld..."
	@[ -d OSWorld ] || git clone https://github.com/xlang-ai/OSWorld
	@echo "Modifying OSWorld requirements.txt to remove pinned versions..."
	@cd OSWorld && sed -i.bak \
		-e 's/numpy~=.*/numpy/' \
		-e 's/torch~=.*/torch/' \
		-e 's/tqdm~=.*/tqdm/' \
		-e 's/pandas~=.*/pandas/' \
		requirements.txt
	@echo "Installing OSWorld requirements..."
	@cd OSWorld && pip install -r requirements.txt
	@echo "Installing OSWorld in development mode..."
	@cd OSWorld && pip install -e .
	@echo "OSWorld setup completed!"
	@echo "Next steps:"
	@echo "1. Configure your VM (VMware/VirtualBox) according to OSWorld documentation"
	@echo "2. Download or set up the Ubuntu VM image"
	@echo "3. Run AgentLab with OSWorld tasks"

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ AgentLab Features:
6161
| [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon |
6262
| [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon |
6363
| [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon |
64+
| [OSWorld](https://os-world.github.io/) | [setup](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/benchmarks/setup.md) | 369 | None | - | - | self hosted | soon |
6465

6566

6667
## 🛠️ Setup AgentLab
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[
2+
{
3+
"id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
4+
"task": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
5+
"complexity": 1
6+
},
7+
{
8+
"id": "59f21cfb-0120-4326-b255-a5b827b38967",
9+
"task": "Could you play the music video that's saved on my desktop for me via vlc?",
10+
"complexity": 1
11+
},
12+
{
13+
"id": "35253b65-1c19-4304-8aa4-6884b8218fc0",
14+
"task": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
15+
"complexity": 1
16+
},
17+
{
18+
"id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
19+
"task": "Please help me change all the places in this document that say \"text\" to \"test\".",
20+
"complexity": 1
21+
},
22+
{
23+
"id": "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
24+
"task": "I am currently using an Ubuntu system, and I have wrongly deleted a poster of party night. Could you help me recover it from the Trash?",
25+
"complexity": 1
26+
},
27+
{
28+
"id": "510f64c8-9bcc-4be1-8d30-638705850618",
29+
"task": "Could you start VS Code in folder ~/Desktop/project from the terminal?",
30+
"complexity": 1
31+
},
32+
{
33+
"id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
34+
"task": "Please help me use VS Code to open the \"project\" in the \"user\" folder under \"home\".",
35+
"complexity": 1
36+
}
37+
]

experiments/osworld_docker_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import logging
2+
3+
from desktop_env.desktop_env import DesktopEnv
4+
5+
# Single hand-written OSWorld task used as a smoke test: one "execute" setup
# step that runs a pyautogui click at (960, 540) inside the VM, and an
# evaluator config that applies include/exclude rules to the output of
# `which spotify`.
example = {
    "id": "94d95f96-9699-4208-98ba-3c3119edf9c2",
    "instruction": "I want to install Spotify on my current system. Could you please help me?",
    "config": [
        {
            "type": "execute",
            "parameters": {
                "command": [
                    "python",
                    "-c",
                    "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);",
                ]
            },
        }
    ],
    "evaluator": {
        "func": "check_include_exclude",
        "result": {"type": "vm_command_line", "command": "which spotify"},
        "expected": {"type": "rule", "rules": {"include": ["spotify"], "exclude": ["not found"]}},
    },
}


def main() -> None:
    """Start a Docker-backed OSWorld desktop, perform one action and print the observation.

    The environment is closed in a ``finally`` clause so the VM is released
    even when ``reset`` or ``step`` raises.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    env = DesktopEnv(action_space="pyautogui", provider_name="docker", os_type="Ubuntu")
    try:
        # The observation returned by reset() is not needed; only the one
        # produced by the step below is printed.
        env.reset(task_config=example)
        obs, _reward, _done, _info = env.step("pyautogui.rightClick()")
        print(obs)
    finally:
        env.close()


if __name__ == "__main__":
    main()

experiments/run_osworld.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import json
2+
import logging
3+
import os
4+
5+
from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
6+
from agentlab.benchmarks.osworld import OsworldBenchmark
7+
from agentlab.experiments.study import Study, make_study
8+
9+
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
10+
logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
11+
12+
13+
def get_most_recent_incomplete_study() -> Study:
    """Load the most recent study and prepare it for a relaunch.

    Incomplete experiments are continued and errored experiments are
    relaunched (``include_errors=True``).
    """
    latest_study = Study.load_most_recent()
    latest_study.find_incomplete(include_errors=True)
    return latest_study
20+
21+
22+
def get_task_ids(path: str = "experiments/osworld_debug_task_ids.json") -> set[str]:
    """Return the set of task ids listed in a debug task file.

    Args:
        path: JSON file holding a list of objects, each with an ``"id"`` key.
            Defaults to the debug task list shipped with the experiments.

    Returns:
        The distinct ``"id"`` values found in the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If an entry has no ``"id"`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        tasks = json.load(f)
    return {task["id"] for task in tasks}
26+
27+
28+
def main():
    """Run the OSWorld study, or resume the most recent one.

    The flags below are edited by hand:

    * ``relaunch``: resume the most recent study on disk instead of building
      a new one. A fresh study is only built when this is False, so nothing
      is constructed just to be thrown away.
    * ``use_vmware``: run against a local VMware VM; this forces the
      ``"sequential"`` backend.
    * ``AGENTLAB_DEBUG``: defaults to ``"1"``. When set to a non-empty value
      a fresh study is restricted to the ids returned by ``get_task_ids()``
      and a single job is used.
    """
    n_jobs = 4
    use_vmware = True
    relaunch = True
    agent_args = [
        OSWORLD_CLAUDE,
        # OSWORLD_OAI is also available but performs poorly.
    ]  # type: ignore
    parallel_backend = "ray"

    os.environ["AGENTLAB_DEBUG"] = os.environ.get("AGENTLAB_DEBUG", "1")
    # NOTE(review): any non-empty value, including "0", enables debug mode.
    debug = bool(os.environ.get("AGENTLAB_DEBUG"))

    if use_vmware:
        parallel_backend = "sequential"
    if debug:
        n_jobs = 1  # Make sure to use 1 job when debugging in VS

    if relaunch:
        study = get_most_recent_incomplete_study()
    else:
        study = make_study(
            benchmark=OsworldBenchmark(test_set_name="test_small.json"),  # or test_all.json  # type: ignore
            agent_args=agent_args,  # type: ignore
            comment="osworld debug 2",
            logging_level=logging.INFO,
            logging_level_stdout=logging.INFO,
        )

        if use_vmware:
            for exp_args in study.exp_args_list:
                exp_args.env_args.provider_name = "vmware"  # type: ignore
                exp_args.env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # type: ignore

        if debug:
            task_ids = get_task_ids()
            study.exp_args_list = [
                exp_args
                for exp_args in study.exp_args_list
                if exp_args.env_args.task["id"] in task_ids  # type: ignore
            ]
            print(f"Debug on {len(study.exp_args_list)} experiments")

    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
63+
64+
65+
if __name__ == "__main__":
66+
main()

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@ matplotlib
2626
ray[default]
2727
python-slugify
2828
pillow
29-
gymnasium>=0.27
29+
gymnasium>=0.27

src/agentlab/agents/tool_use_agent/tool_use_agent.py

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
from PIL import Image
2020

2121
from agentlab.agents import agent_utils
22+
from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark
23+
from bgym import Benchmark as BgymBenchmark
2224
from agentlab.agents.agent_args import AgentArgs
25+
from agentlab.benchmarks.osworld import OSWorldActionSet
26+
from agentlab.llm.base_api import BaseModelArgs
2327
from agentlab.llm.llm_utils import image_to_png_base64_url
2428
from agentlab.llm.response_api import (
2529
APIPayload,
@@ -36,7 +40,6 @@
3640

3741
@dataclass
3842
class Block(ABC):
39-
4043
def _init(self):
4144
"""Initialize the block."""
4245
pass
@@ -169,6 +172,7 @@ class Obs(Block):
169172
use_tabs: bool = False
170173
# add_mouse_pointer: bool = False
171174
use_zoomed_webpage: bool = False
175+
skip_preprocessing: bool = False
172176

173177
def apply(
174178
self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput
@@ -181,7 +185,6 @@ def apply(
181185
obs_msg.add_text(f"Last action error:\n{obs['last_action_error']}")
182186

183187
if self.use_screenshot:
184-
185188
if self.use_som:
186189
screenshot = obs["screenshot_som"]
187190
else:
@@ -231,7 +234,6 @@ def _format_tabs(obs):
231234

232235
@dataclass
233236
class GeneralHints(Block):
234-
235237
use_hints: bool = True
236238

237239
def apply(self, llm, discussion: StructuredDiscussion) -> dict:
@@ -342,9 +344,10 @@ class PromptConfig:
342344

343345
@dataclass
344346
class ToolUseAgentArgs(AgentArgs):
345-
model_args: OpenAIResponseModelArgs = None
347+
model_args: BaseModelArgs = None
346348
config: PromptConfig = None
347349
use_raw_page_output: bool = False # This attribute is used in loop.py to setup the env.
350+
action_set: bgym.AbstractActionSet | None = None
348351

349352
def __post_init__(self):
350353
try:
@@ -356,8 +359,9 @@ def make_agent(self) -> bgym.Agent:
356359
if self.config is None:
357360
self.config = DEFAULT_PROMPT_CONFIG
358361
return ToolUseAgent(
359-
model_args=self.model_args,
362+
model_args=self.model_args, # type: ignore
360363
config=self.config,
364+
action_set=self.action_set,
361365
)
362366

363367
def prepare(self):
@@ -366,17 +370,24 @@ def prepare(self):
366370
def close(self):
367371
return self.model_args.close_server()
368372

373+
def set_benchmark(self, benchmark: AgentLabBenchmark | BgymBenchmark, demo_mode: bool):
374+
"""Set benchmark specific flags."""
375+
benchmark_name = benchmark.name
376+
if benchmark_name == "osworld":
377+
self.config.obs.skip_preprocessing = True
378+
369379

370380
class ToolUseAgent(bgym.Agent):
371381
def __init__(
372382
self,
373383
model_args: OpenAIResponseModelArgs,
374384
config: PromptConfig = None,
385+
action_set: bgym.AbstractActionSet | None = None,
375386
):
376387
self.model_args = model_args
377388
self.config = config
378-
self.action_set = bgym.HighLevelActionSet(
379-
self.config.action_subsets, multiaction=self.config.multiaction
389+
self.action_set: bgym.AbstractActionSet = action_set or bgym.HighLevelActionSet(
390+
self.config.action_subsets, multiaction=self.config.multiaction # type: ignore
380391
)
381392
self.tools = self.action_set.to_tool_description(api=model_args.api)
382393

@@ -395,7 +406,8 @@ def __init__(
395406

396407
def obs_preprocessor(self, obs):
397408
obs = copy(obs)
398-
409+
if self.config.obs.skip_preprocessing:
410+
return obs
399411
page = obs.pop("page", None)
400412
if page is not None:
401413
obs["screenshot"] = extract_screenshot(page)
@@ -592,3 +604,49 @@ def get_action(self, obs: Any) -> float:
592604
model_args=GPT4_1_OPENROUTER_MODEL,
593605
config=DEFAULT_PROMPT_CONFIG,
594606
)
607+
608+
OSWORLD_CLAUDE = ToolUseAgentArgs(
609+
model_args=CLAUDE_MODEL_CONFIG,
610+
config=PromptConfig(
611+
tag_screenshot=True,
612+
goal=Goal(goal_as_system_msg=True),
613+
obs=Obs(
614+
use_last_error=True,
615+
use_screenshot=True,
616+
use_axtree=True,
617+
use_dom=False,
618+
use_som=False,
619+
use_tabs=False,
620+
),
621+
summarizer=Summarizer(do_summary=True),
622+
general_hints=GeneralHints(use_hints=False),
623+
task_hint=TaskHint(use_task_hint=False),
624+
keep_last_n_obs=None,
625+
multiaction=False, # whether to use multi-action or not
626+
action_subsets=("coord",), # or "bid"
627+
),
628+
action_set=OSWorldActionSet("computer_13"), # or "pyautogui"
629+
)
630+
631+
OSWORLD_OAI = ToolUseAgentArgs(
632+
model_args=OPENAI_MODEL_CONFIG,
633+
config=PromptConfig(
634+
tag_screenshot=True,
635+
goal=Goal(goal_as_system_msg=True),
636+
obs=Obs(
637+
use_last_error=True,
638+
use_screenshot=True,
639+
use_axtree=False,
640+
use_dom=False,
641+
use_som=False,
642+
use_tabs=False,
643+
),
644+
summarizer=Summarizer(do_summary=True),
645+
general_hints=GeneralHints(use_hints=False),
646+
task_hint=TaskHint(use_task_hint=False),
647+
keep_last_n_obs=1, # keep only the last observation in the discussion
648+
multiaction=False, # whether to use multi-action or not
649+
action_subsets=("coord",),
650+
),
651+
action_set=OSWorldActionSet("computer_13"),
652+
)

src/agentlab/analyze/agent_xray.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,7 @@ def dict_msg_to_markdown(d: dict):
712712
case "text":
713713
parts.append(f"\n```\n{item['text']}\n```\n")
714714
case "tool_use":
715-
tool_use = _format_tool_call(item["name"], item["input"], item["call_id"])
715+
tool_use = _format_tool_call(item["name"], item["input"], item["id"])
716716
parts.append(f"\n```\n{tool_use}\n```\n")
717717
case _:
718718
parts.append(f"\n```\n{str(item)}\n```\n")
@@ -1337,7 +1337,7 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
13371337
horizontalalignment="right",
13381338
rotation=0,
13391339
clip_on=True,
1340-
antialiased=True,
1340+
# antialiased=True,
13411341
fontweight=1000,
13421342
backgroundcolor=color,
13431343
)

0 commit comments

Comments
 (0)