Skip to content

Commit d36709a

Browse files
update run_osworld.py with study relaunch capability and setup readme
1 parent 63d141b commit d36709a

File tree

2 files changed

+89
-9
lines changed

2 files changed

+89
-9
lines changed

experiments/run_osworld.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,38 +2,63 @@
22
import logging
33
import os
44

5-
from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
5+
from tapeagents import agent
6+
7+
from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE, OSWORLD_OAI
68
from agentlab.benchmarks.osworld import OsworldBenchmark
7-
from agentlab.experiments.study import make_study
9+
from agentlab.experiments.study import make_study, Study
810

911
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
1012
logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
1113

1214

15+
def get_most_recent_incomplete_study() -> Study:
    """Load the most recent study and prepare it for being resumed.

    The study is obtained through ``Study.load_most_recent()`` and
    ``find_incomplete(include_errors=True)`` is then called on it, so that
    unfinished experiments (and, as requested by the flag, errored ones) are
    selected again. This function does not run anything itself; the caller is
    expected to call ``run`` on the returned study.

    Returns:
        The most recently saved study, after ``find_incomplete`` was applied.
    """
    latest = Study.load_most_recent()
    # NOTE(review): the return value of find_incomplete is ignored here, so it
    # presumably updates the study in place -- confirm against Study.
    latest.find_incomplete(include_errors=True)
    return latest
22+
1323
def get_task_ids() -> set[str]:
    """Return the ids of the tasks listed in the debug task file.

    Reads ``experiments/osworld_debug_task_ids.json`` (resolved relative to
    the current working directory). The file must contain a JSON array of
    objects, each with an ``"id"`` key; duplicate ids collapse into one entry.

    Returns:
        The set of task ids found in the file.

    Raises:
        FileNotFoundError: If the task file does not exist.
        KeyError: If an entry has no ``"id"`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
    with open("experiments/osworld_debug_task_ids.json", "r", encoding="utf-8") as f:
        tasks = json.load(f)
    return {task["id"] for task in tasks}
1727

1828

1929
def main():
20-
n_jobs = 1
21-
os.environ["AGENTLAB_DEBUG"] = "1"
30+
n_jobs = 4
31+
use_vmware = True
32+
relaunch = True
33+
agent_args = [
34+
OSWORLD_CLAUDE,
35+
# OSWORLD_OAI # performs poorly.
36+
] # type: ignore
37+
parallel_backend = "ray"
38+
os.environ["AGENTLAB_DEBUG"] = os.environ.get("AGENTLAB_DEBUG", "1")
39+
2240
study = make_study(
23-
benchmark=OsworldBenchmark(test_set_name="test_small.json"), # type: ignore
24-
agent_args=[OSWORLD_CLAUDE],
41+
benchmark=OsworldBenchmark(test_set_name="test_small.json"), # or test_all.json (Exper) # type: ignore
42+
agent_args=agent_args, # type: ignore
2543
comment="osworld debug 2",
2644
logging_level=logging.INFO,
2745
logging_level_stdout=logging.INFO,
2846
)
2947

48+
if use_vmware:
49+
for exp_args in study.exp_args_list:
50+
exp_args.env_args.provider_name = "vmware" # type: ignore
51+
exp_args.env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx" # type: ignore
52+
parallel_backend = "sequential"
53+
3054
if os.environ.get("AGENTLAB_DEBUG"):
3155
task_ids = get_task_ids()
3256
study.exp_args_list = [exp_args for exp_args in study.exp_args_list if exp_args.env_args.task["id"] in task_ids] # type: ignore
3357
print(f"Debug on {len(study.exp_args_list)} experiments")
34-
study.run(n_jobs=4, n_relaunch=1, parallel_backend="ray")
35-
else:
36-
study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend="ray")
58+
n_jobs = 1 # Make sure to use 1 job when debugging in VS
59+
60+
study = get_most_recent_incomplete_study() if relaunch else study
61+
study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
3762

3863

3964
if __name__ == "__main__":

src/agentlab/benchmarks/setup.md

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Setup OSWorld in AgentLab
2+
3+
This guide walks you through setting up the OSWorld benchmark in AgentLab for GUI automation testing.
4+
5+
## Installation
6+
7+
1. **Clone and install the OSWorld repository:**
8+
```bash
9+
make osworld
10+
```
11+
12+
2. **Complete OSWorld setup:**
13+
- Navigate to the `OSWorld/` directory
14+
- Follow the detailed setup instructions in the OSWorld README
15+
- Download required VM images and configure virtual machines
16+
17+
18+
## Usage
19+
20+
### Entry Point Configuration
21+
22+
The main entry point `experiments/run_osworld.py` is currently configured with hardcoded parameters. To modify the execution:
23+
24+
1. **Edit the script directly** to change:
25+
- `n_jobs`: Number of parallel jobs (default: 4; the script forces it to 1 whenever debug mode is active, see `AGENTLAB_DEBUG` below)
26+
- `use_vmware`: Set to `True` for VMware, `False` for other platforms
27+
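- `relaunch`: When `True`, the most recently saved study is loaded and its incomplete and errored experiments are run again; the study configured in the script is not used in that case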
28+
- `agent_args`: List of agents to test (OSWORLD_CLAUDE, OSWORLD_OAI)
29+
- `test_set_name`: Choose between "test_small.json" or "test_all.json"
30+
31+
2. **Environment Variables:**
32+
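- `AGENTLAB_DEBUG`: Defaults to `1` when unset. Any non-empty value enables debug mode, which restricts the run to the debug subset (7 tasks from `osworld_debug_task_ids.json`) and forces `n_jobs=1`. Set it to an empty string (`AGENTLAB_DEBUG=`) to run the full selected test set.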
33+
34+
### Running OSWorld Tasks
35+
36+
We provide different subsets of tasks:
37+
38+
- **Debug subset:** 7 tasks defined in `experiments/osworld_debug_task_ids.json`
39+
- **Small subset:** Tasks from `test_small.json`
40+
- **Full subset:** All tasks from `test_all.json`
41+
42+
### Example Commands
43+
44+
```bash
45+
# Run with default debug subset (7 tasks)
46+
python experiments/run_osworld.py
47+
```
48+
49+
50+
### Configuration Notes
51+
52+
- **VMware path:** Currently hardcoded to `"OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"`
53+
- **Parallel execution:** Uses the `ray` backend by default and automatically switches to `sequential` when `use_vmware` is `True`
54+
- **Relaunch capability:** Can continue incomplete studies by loading the most recent study
55+

0 commit comments

Comments
 (0)