update run_osworld.py with study relaunch capability and setup readme

amanjaiswal73892 · amanjaiswal73892 · commit d36709a0c659 · 2025-07-10T17:25:26.000-04:00
diff --git a/experiments/run_osworld.py b/experiments/run_osworld.py
@@ -2,38 +2,63 @@
 import logging
 import os
 
-from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
+from tapeagents import agent
+
+from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE, OSWORLD_OAI
 from agentlab.benchmarks.osworld import OsworldBenchmark
-from agentlab.experiments.study import make_study
+from agentlab.experiments.study import make_study, Study
 
 fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
 logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
 
 
+def get_most_recent_incomplete_study() -> Study:
+    """
+    Load the most recent study and prepare it for relaunch. Incomplete experiments will be continued and errored experiments will be relaunched.
+    """
+    study = Study.load_most_recent()
+    study.find_incomplete(include_errors=True)
+    return study
+
 def get_task_ids() -> set[str]:
     with open("experiments/osworld_debug_task_ids.json", "r") as f:
         task_ids = json.load(f)
     return set([task["id"] for task in task_ids])
 
 
 def main():
-    n_jobs = 1
-    os.environ["AGENTLAB_DEBUG"] = "1"
+    n_jobs = 4  # overridden to 1 below whenever AGENTLAB_DEBUG is set
+    use_vmware = True
+    relaunch = True  # when True, the most recent saved study is run instead of the one built below
+    agent_args = [
+        OSWORLD_CLAUDE,
+                #    OSWORLD_OAI # performs poorly. 
+                   ]  # type: ignore
+    parallel_backend = "ray"  # replaced by "sequential" below when use_vmware is True
+    os.environ["AGENTLAB_DEBUG"] = os.environ.get("AGENTLAB_DEBUG", "1")  # debug defaults to on; any non-empty value (even "0") enables it
+
     study = make_study(
-        benchmark=OsworldBenchmark(test_set_name="test_small.json"),  # type: ignore
-        agent_args=[OSWORLD_CLAUDE],
+        benchmark=OsworldBenchmark(test_set_name="test_small.json"), # or test_all.json (Exper)  # type: ignore
+        agent_args=agent_args,  # type: ignore
         comment="osworld debug 2",
         logging_level=logging.INFO,
         logging_level_stdout=logging.INFO,
     )
 
+    if use_vmware:
+        for exp_args in study.exp_args_list:
+            exp_args.env_args.provider_name = "vmware"  # type: ignore
+            exp_args.env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # type: ignore
+        parallel_backend = "sequential"  # presumably because every experiment targets the same VM file -- TODO confirm
+
     if os.environ.get("AGENTLAB_DEBUG"):
         task_ids = get_task_ids()
         study.exp_args_list = [exp_args for exp_args in study.exp_args_list if exp_args.env_args.task["id"] in task_ids]  # type: ignore
         print(f"Debug on {len(study.exp_args_list)} experiments")
-        study.run(n_jobs=4, n_relaunch=1, parallel_backend="ray")
-    else:
-        study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend="ray")
+        n_jobs = 1  # Make sure to use 1 job when debugging in VS
+
+    study = get_most_recent_incomplete_study() if relaunch else study  # NOTE: when relaunch is True, the study built and configured above is discarded
+    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
 
 
 if __name__ == "__main__":
diff --git a/src/agentlab/benchmarks/setup.md b/src/agentlab/benchmarks/setup.md
@@ -0,0 +1,55 @@
+# Setup OSWorld in AgentLab
+
+This guide walks you through setting up the OSWorld benchmark in AgentLab for GUI automation testing.
+
+## Installation
+
+1. **Clone and install OSWorld repository:**
+   ```bash
+   make osworld
+   ```
+
+2. **Complete OSWorld setup:**
+   - Navigate to the `OSWorld/` directory
+   - Follow the detailed setup instructions in the OSWorld README
+   - Download required VM images and configure virtual machines
+
+
+## Usage
+
+### Entry Point Configuration
+
+The main entry point `experiments/run_osworld.py` is currently configured with hardcoded parameters. To modify the execution:
+
+1. **Edit the script directly** to change:
+   - `n_jobs`: Number of parallel jobs (default: 4; automatically forced to 1 whenever `AGENTLAB_DEBUG` is set)
+   - `use_vmware`: Set to `True` for VMware, `False` for other platforms
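+   - `relaunch`: When `True` (the current default), load and continue the most recent study, including errored experiments, instead of running the newly created one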
+   - `agent_args`: List of agents to test (OSWORLD_CLAUDE, OSWORLD_OAI)
+   - `test_set_name`: Choose between "test_small.json" or "test_all.json"
+
+2. **Environment Variables:**
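+   - `AGENTLAB_DEBUG`: Defaults to `1` when unset. Any non-empty value (even `0`) runs only the debug subset (7 tasks from `experiments/osworld_debug_task_ids.json`) with a single job; set it to an empty string to run the whole test set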
+
+### Running OSWorld Tasks
+
+We provide different subsets of tasks:
+
+- **Debug subset:** 7 tasks defined in `experiments/osworld_debug_task_ids.json` 
+- **Small subset:** Tasks from `test_small.json`
+- **Full set:** All tasks from `test_all.json`
+
+### Example Commands
+
+```bash
+# Run with default debug subset (7 tasks)
+python experiments/run_osworld.py
+```
+
+
+### Configuration Notes
+
+- **VMware path:** Currently hardcoded to `"OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"`
+- **Parallel execution:** Uses the `ray` backend by default and automatically switches to the `sequential` backend when `use_vmware` is `True`
+- **Relaunch capability:** Can continue incomplete studies by loading the most recent study
+