ServiceNow
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.vscode/settings.json‎
Lines changed: 10 additions & 3 deletions b/‎.vscode/settings.json‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎Makefile‎
Lines changed: 21 additions & 1 deletion b/‎Makefile‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎experiments/osworld_debug_task_ids.json‎
Lines changed: 37 additions & 0 deletions b/‎experiments/osworld_debug_task_ids.json‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/osworld_docker_test.py‎
Lines changed: 37 additions & 0 deletions b/‎experiments/osworld_docker_test.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/run_osworld.py‎
Lines changed: 66 additions & 0 deletions b/‎experiments/run_osworld.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎main_workarena_debug.py‎
Lines changed: 77 additions & 0 deletions b/‎main_workarena_debug.py‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/agentlab/agents/debug_agent.py‎
Lines changed: 90 additions & 0 deletions b/‎src/agentlab/agents/debug_agent.py‎
Lines changed: 90 additions & 0 deletions
@@ -172,6 +172,9 @@ outputs/
 miniwob-plusplus/
 .miniwob-server.pid
 debugging_results/
+docker_vm_data/
+OSWorld/
+
 
 # working files
 experiments/*
@@ -3,13 +3,20 @@
         "editor.formatOnSave": true,
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.codeActionsOnSave": {
-            "source.organizeImports": "explicit",
-            "source.fixAll": "never"
-        }
+            "source.organizeImports": "always",
+            "source.fixAll": "always",
+        },
     },
+    "python.analysis.languageServerMode": "full",
+    "python.analysis.typeCheckingMode": "standard",
     "python.testing.pytestArgs": [
         "tests"
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
+    "files.watcherExclude": {
+        "**/.git/objects/**": true,
+        "**/.git/subtree-cache/**": true,
+        "**/node_modules/*/**": true
+    },
 }
@@ -1,4 +1,4 @@
-.PHONY: test setup miniwob lint stop-miniwob
+.PHONY: test setup miniwob lint stop-miniwob osworld
 
 setup:
 	@pip install -e .
@@ -30,3 +30,23 @@ test: setup miniwob check-miniwob run-tests stop-miniwob
 lint: setup
 	@black src/ --check --diff
 	@darglint -v 2 -z short src/
+
+# Clone OSWorld next to AgentLab, relax its pinned requirements and install it
+# in editable mode.
+osworld:
+	@echo "Setting up OSWorld..."
+	@# Clone only when the checkout is missing, instead of ignoring every clone error.
+	@[ -d OSWorld ] || git clone https://github.com/xlang-ai/OSWorld
+	@echo "Modifying OSWorld requirements.txt to remove pinned versions..."
+	@# One sed pass: requirements.txt.bak holds the file as it was before this run.
+	@cd OSWorld && \
+		sed -i.bak \
+			-e 's/numpy~=.*/numpy/' \
+			-e 's/torch~=.*/torch/' \
+			-e 's/tqdm~=.*/tqdm/' \
+			-e 's/pandas~=.*/pandas/' \
+			requirements.txt
+	@echo "Installing OSWorld requirements..."
+	@cd OSWorld && pip install -r requirements.txt
+	@echo "Installing OSWorld in development mode..."
+	@cd OSWorld && pip install -e .
+	@echo "OSWorld setup completed!"
+	@echo "Next steps:"
+	@echo "1. Configure your VM (VMware/VirtualBox) according to OSWorld documentation"
+	@echo "2. Download or set up the Ubuntu VM image"
+	@echo "3. Run AgentLab with OSWorld tasks"
@@ -61,6 +61,7 @@ AgentLab Features:
 | [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon |
 | [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon |
 | [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon |
+| [OSWorld](https://os-world.github.io/) | [setup](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/benchmarks/setup.md) | 369 | None | - | - | self hosted  | soon |
 
 
 ## 🛠️ Setup AgentLab
 
@@ -0,0 +1,37 @@
+[
+  {
+    "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
+    "task": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
+    "complexity": 1
+  },
+  {
+    "id": "59f21cfb-0120-4326-b255-a5b827b38967",
+    "task": "Could you play the music video that's saved on my desktop for me via vlc?",
+    "complexity": 1
+  },
+  {
+    "id": "35253b65-1c19-4304-8aa4-6884b8218fc0",
+    "task": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
+    "complexity": 1
+  },
+  {
+    "id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
+    "task": "Please help me change all the places in this document that say \"text\" to \"test\".",
+    "complexity": 1
+  },
+  {
+    "id": "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
+    "task": "I am currently using an Ubuntu system, and I have wrongly deleted a poster of party night. Could you help me recover it from the Trash?",
+    "complexity": 1
+  },
+  {
+    "id": "510f64c8-9bcc-4be1-8d30-638705850618",
+    "task": "Could you start VS Code in folder ~/Desktop/project from the terminal?",
+    "complexity": 1
+  },
+  {
+    "id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
+    "task": "Please help me use VS Code to open the \"project\" in the \"user\" folder under \"home\".",
+    "complexity": 1
+  }
+]
@@ -0,0 +1,37 @@
+import logging
+
+from desktop_env.desktop_env import DesktopEnv
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+example = {
+    "id": "94d95f96-9699-4208-98ba-3c3119edf9c2",
+    "instruction": "I want to install Spotify on my current system. Could you please help me?",
+    "config": [
+        {
+            "type": "execute",
+            "parameters": {
+                "command": [
+                    "python",
+                    "-c",
+                    "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);",
+                ]
+            },
+        }
+    ],
+    "evaluator": {
+        "func": "check_include_exclude",
+        "result": {"type": "vm_command_line", "command": "which spotify"},
+        "expected": {"type": "rule", "rules": {"include": ["spotify"], "exclude": ["not found"]}},
+    },
+}
+
+env = DesktopEnv(action_space="pyautogui", provider_name="docker", os_type="Ubuntu")
+
+obs = env.reset(task_config=example)
+obs, reward, done, info = env.step("pyautogui.rightClick()")
+print(obs)
@@ -0,0 +1,66 @@
+import json
+import logging
+import os
+
+from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
+from agentlab.benchmarks.osworld import OsworldBenchmark
+from agentlab.experiments.study import Study, make_study
+
+# Log format with timestamp, level, logger name, line number and function name.
+# force=True removes handlers already attached to the root logger before
+# installing the stream handler below.
+fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
+logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
+
+
+def get_most_recent_incomplete_study() -> Study:
+    """
+    Relaunch an existing study, this will continue incomplete experiments and relaunch errored experiments.
+    """
+    study = Study.load_most_recent()
+    study.find_incomplete(include_errors=True)
+    return study
+
+
+def get_task_ids() -> set[str]:
+    with open("experiments/osworld_debug_task_ids.json", "r") as f:
+        task_ids = json.load(f)
+    return set([task["id"] for task in task_ids])
+
+
+def main():
+    n_jobs = 4
+    use_vmware = True
+    relaunch = False
+    agent_args = [
+        OSWORLD_CLAUDE,
+        #    OSWORLD_OAI # performs poorly.
+    ]  # type: ignore
+    parallel_backend = "ray"
+    os.environ["AGENTLAB_DEBUG"] = os.environ.get("AGENTLAB_DEBUG", "1")
+
+    study = make_study(
+        benchmark=OsworldBenchmark(
+            test_set_name="test_small.json"
+        ),  # or test_all.json (Exper)  # type: ignore
+        agent_args=agent_args,  # type: ignore
+        comment="osworld debug 2",
+        logging_level=logging.INFO,
+        logging_level_stdout=logging.INFO,
+    )
+
+    if use_vmware:
+        for exp_args in study.exp_args_list:
+            exp_args.env_args.provider_name = "vmware"  # type: ignore
+            exp_args.env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # type: ignore
+        parallel_backend = "sequential"
+
+    if os.environ.get("AGENTLAB_DEBUG"):
+        task_ids = get_task_ids()
+        study.exp_args_list = [exp_args for exp_args in study.exp_args_list if exp_args.env_args.task["id"] in task_ids]  # type: ignore
+        print(f"Debug on {len(study.exp_args_list)} experiments")
+        n_jobs = 1  # Make sure to use 1 job when debugging in VS
+
+    study = get_most_recent_incomplete_study() if relaunch else study
+    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,77 @@
+"""
+Note: This script is a convenience script to launch experiments instead of using
+the command line.
+
+Copy this script and modify at will, but don't push your changes to the
+repository.
+"""
+
+import logging
+from copy import deepcopy
+
+import bgym
+
+from agentlab.agents.tool_use_agent.tool_use_agent import (
+    DEFAULT_PROMPT_CONFIG,
+    GPT_4_1,
+    ToolUseAgentArgs,
+)
+from agentlab.experiments.study import Study
+
+# Raise the root logger to INFO for this launcher script.
+logging.getLogger().setLevel(logging.INFO)
+
+# Work on a private copy so the shared DEFAULT_PROMPT_CONFIG is left untouched.
+config = deepcopy(DEFAULT_PROMPT_CONFIG)
+# config.keep_last_n_obs = 1
+config.obs.use_som = True
+
+
+# Agents to evaluate. Every entry is given the same `config` object.
+# NOTE(review): unless ToolUseAgentArgs copies it, the loop below mutates one
+# shared config instance -- confirm before adding a second agent.
+agent_configs = [
+    ToolUseAgentArgs(
+        model_args=GPT_4_1,
+        config=config,
+    ),
+    # ToolUseAgentArgs(
+    #     model_args=GPT_4_1,
+    #     config=config,
+    # ),
+]
+
+for agent_config in agent_configs:
+    agent_config.config.action_subsets = ("workarena",)  # use the workarena action set
+
+
+# ## select the benchmark to run on
+# benchmark = "miniwob_tiny_test"
+benchmark = "workarena_l1"
+
+
+# `benchmark` is rebound from its name to the benchmark object (4 repeats),
+# then narrowed to tasks whose task_name matches the glob "*create*".
+benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4)  # type: bgym.Benchmark
+benchmark = benchmark.subset_from_glob("task_name", "*create*")
+
+# for env_args in benchmark.env_args_list:
+#     print(env_args.task_name)
+#     env_args.max_steps = 15
+
+# Set to True to continue the most recent study instead of starting a new one.
+relaunch = False
+
+## Number of parallel jobs
+n_jobs = 10  # Make sure to use 1 job when debugging in VSCode
+parallel_backend = "ray"
+# parallel_backend = "sequential"  # activate sequential backend for debugging in VSCode
+
+# NOTE(review): the inline remark mentions dask, but the backends listed at the
+# study.run() call below are "ray", "joblib" and "sequential" -- confirm wording.
+if __name__ == "__main__":  # necessary for dask backend
+
+    if relaunch:
+        #  relaunch an existing study
+        study = Study.load_most_recent(contains=None)
+        study.find_incomplete(include_errors=True)
+
+    else:
+        # New study; stdout logging is limited to WARNING and above.
+        study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING)
+
+    study.run(
+        n_jobs=n_jobs,
+        parallel_backend=parallel_backend,  # "ray", "joblib" or "sequential"
+        strict_reproducibility=False,
+        n_relaunch=3,
+    )
@@ -26,4 +26,4 @@ matplotlib
 ray[default]
 python-slugify
 pillow
-gymnasium>=0.27
+gymnasium>=0.27
@@ -0,0 +1,90 @@
+from copy import deepcopy
+from dataclasses import asdict, dataclass
+from functools import partial
+
+import bgym
+from browsergym.experiments.agent import Agent, AgentInfo
+from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html
+
+from agentlab.agents.agent_args import AgentArgs
+from agentlab.llm.chat_api import BaseModelArgs
+from agentlab.llm.llm_utils import ParseError, image_to_png_base64_url, parse_html_tags_raise, retry
+from agentlab.llm.tracking import cost_tracker_decorator
+
+
+@dataclass
+class DebugAgentArgs(AgentArgs):
+
+    def __post_init__(self):
+        try:  # some attributes might be temporarily args.CrossProd for hyperparameter generation
+            self.agent_name = f"debug".replace("/", "_")
+        except AttributeError:
+            pass
+        self.action_set_args = bgym.DEFAULT_BENCHMARKS[
+            "miniwob_tiny_test"
+        ]().high_level_action_set_args
+        self.use_html = False
+
+    def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode):
+        if benchmark.name.startswith("miniwob"):
+            self.use_html = True
+        self.action_set_args = benchmark.high_level_action_set_args
+
+    def make_agent(self):
+        return DebugAgent(self.action_set_args, use_html=self.use_html)
+
+
+class DebugAgent(Agent):
+    def __init__(
+        self,
+        action_set_args,
+        use_html=False,
+    ):
+        self.action_set = action_set_args.make_action_set()
+        self.use_html = use_html
+
+    def obs_preprocessor(self, obs):
+        obs = deepcopy(obs)
+        obs["dom_txt"] = flatten_dom_to_str(
+            obs["dom_object"],
+            extra_properties=obs["extra_element_properties"],
+            with_visible=True,
+            with_clickable=True,
+            with_center_coords=True,
+            with_bounding_box_coords=True,
+            filter_visible_only=False,
+            filter_with_bid_only=False,
+            filter_som_only=False,
+        )
+        obs["axtree_txt"] = flatten_axtree_to_str(
+            obs["axtree_object"],
+            extra_properties=obs["extra_element_properties"],
+            with_visible=True,
+            with_clickable=True,
+            with_center_coords=True,
+            with_bounding_box_coords=True,
+            filter_visible_only=False,
+            filter_with_bid_only=False,
+            filter_som_only=False,
+        )
+        obs["pruned_html"] = prune_html(obs["dom_txt"])
+        obs["screenshot_som"] = overlay_som(
+            obs["screenshot"], extra_properties=obs["extra_element_properties"]
+        )
+        return obs
+
+    def get_action(self, obs):
+
+        # print(obs["pruned_html"])
+        print("\n")
+        observation = obs["pruned_html"] if self.use_html else obs["axtree_txt"]
+        action = input(observation + "\n")
+        agent_info = AgentInfo(
+            think="nope",
+            chat_messages=[],
+            stats={},
+        )
+        return action, agent_info
+
+
+# Module-level DebugAgentArgs instance created with default settings.
+DEBUG_AGENT = DebugAgentArgs()