ServiceNow
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.vscode/settings.json‎
Lines changed: 10 additions & 3 deletions b/‎.vscode/settings.json‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎Makefile‎
Lines changed: 21 additions & 1 deletion b/‎Makefile‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎experiments/osworld_debug_task_ids.json‎
Lines changed: 37 additions & 0 deletions b/‎experiments/osworld_debug_task_ids.json‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/osworld_docker_test.py‎
Lines changed: 37 additions & 0 deletions b/‎experiments/osworld_docker_test.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/run_osworld.py‎
Lines changed: 66 additions & 0 deletions b/‎experiments/run_osworld.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/agentlab/agents/tool_use_agent/tool_use_agent.py‎
Lines changed: 66 additions & 8 deletions b/‎src/agentlab/agents/tool_use_agent/tool_use_agent.py‎
Lines changed: 66 additions & 8 deletions
diff --git a/‎src/agentlab/analyze/agent_xray.py‎
Lines changed: 2 additions & 2 deletions b/‎src/agentlab/analyze/agent_xray.py‎
Lines changed: 2 additions & 2 deletions
@@ -172,6 +172,9 @@ outputs/
 miniwob-plusplus/
 .miniwob-server.pid
 debugging_results/
+docker_vm_data/
+OSWorld/
+
 
 # working files
 experiments/*
@@ -3,13 +3,20 @@
         "editor.formatOnSave": true,
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.codeActionsOnSave": {
-            "source.organizeImports": "explicit",
-            "source.fixAll": "never"
-        }
+            "source.organizeImports": "always",
+            "source.fixAll": "always",
+        },
     },
+    "python.analysis.languageServerMode": "full",
+    "python.analysis.typeCheckingMode": "standard",
     "python.testing.pytestArgs": [
         "tests"
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
+    "files.watcherExclude": {
+        "**/.git/objects/**": true,
+        "**/.git/subtree-cache/**": true,
+        "**/node_modules/*/**": true
+    },
 }
@@ -1,4 +1,4 @@
-.PHONY: test setup miniwob lint stop-miniwob
+.PHONY: test setup miniwob lint stop-miniwob osworld
 
 setup:
 	@pip install -e .
@@ -30,3 +30,23 @@ test: setup miniwob check-miniwob run-tests stop-miniwob
 lint: setup
 	@black src/ --check --diff
 	@darglint -v 2 -z short src/
+
+# osworld: clone OSWorld into ./OSWorld, relax the `~=` pins on numpy/torch/tqdm/pandas in its
+# requirements.txt, then install its requirements and the package itself in editable mode.
+# `|| true` lets the target be re-run when the clone already exists; any other clone failure
+# surfaces at the following `cd OSWorld`.
+# All substitutions run in a single sed pass so that requirements.txt.bak holds the file as it
+# was before this run instead of being overwritten by each successive sed call.
+osworld:
+	@echo "Setting up OSWorld..."
+	@git clone https://github.com/xlang-ai/OSWorld || true
+	@echo "Modifying OSWorld requirements.txt to remove pinned versions..."
+	@cd OSWorld && \
+		sed -i.bak \
+			-e 's/numpy~=.*/numpy/' \
+			-e 's/torch~=.*/torch/' \
+			-e 's/tqdm~=.*/tqdm/' \
+			-e 's/pandas~=.*/pandas/' \
+			requirements.txt
+	@echo "Installing OSWorld requirements..."
+	@cd OSWorld && pip install -r requirements.txt
+	@echo "Installing OSWorld in development mode..."
+	@cd OSWorld && pip install -e .
+	@echo "OSWorld setup completed!"
+	@echo "Next steps:"
+	@echo "1. Configure your VM (VMware/VirtualBox) according to OSWorld documentation"
+	@echo "2. Download or set up the Ubuntu VM image"
+	@echo "3. Run AgentLab with OSWorld tasks"
@@ -61,6 +61,7 @@ AgentLab Features:
 | [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon |
 | [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon |
 | [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon |
+| [OSWorld](https://os-world.github.io/) | [setup](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/benchmarks/setup.md) | 369 | None | - | - | self hosted  | soon |
 
 
 ## 🛠️ Setup AgentLab
 
@@ -0,0 +1,37 @@
+[
+  {
+    "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
+    "task": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
+    "complexity": 1
+  },
+  {
+    "id": "59f21cfb-0120-4326-b255-a5b827b38967",
+    "task": "Could you play the music video that's saved on my desktop for me via vlc?",
+    "complexity": 1
+  },
+  {
+    "id": "35253b65-1c19-4304-8aa4-6884b8218fc0",
+    "task": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
+    "complexity": 1
+  },
+  {
+    "id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
+    "task": "Please help me change all the places in this document that say \"text\" to \"test\".",
+    "complexity": 1
+  },
+  {
+    "id": "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
+    "task": "I am currently using an Ubuntu system, and I have wrongly deleted a poster of party night. Could you help me recover it from the Trash?",
+    "complexity": 1
+  },
+  {
+    "id": "510f64c8-9bcc-4be1-8d30-638705850618",
+    "task": "Could you start VS Code in folder ~/Desktop/project from the terminal?",
+    "complexity": 1
+  },
+  {
+    "id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
+    "task": "Please help me use VS Code to open the \"project\" in the \"user\" folder under \"home\".",
+    "complexity": 1
+  }
+]
@@ -0,0 +1,37 @@
+"""Drive one hard-coded OSWorld task through `DesktopEnv` using the docker provider.
+
+Everything runs at import time: the environment is created, reset on the task below, stepped
+once with a right-click action, and the resulting observation is printed.
+"""
+
+import logging
+
+from desktop_env.desktop_env import DesktopEnv
+
+# Configure the root logger: INFO and above, written through a stream handler.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+# Task config passed to `env.reset`: the instruction, a setup step and an evaluator description.
+example = {
+    "id": "94d95f96-9699-4208-98ba-3c3119edf9c2",
+    "instruction": "I want to install Spotify on my current system. Could you please help me?",
+    # Setup: execute a short pyautogui snippet that clicks at (960, 540) and then sleeps 0.5 s.
+    "config": [
+        {
+            "type": "execute",
+            "parameters": {
+                "command": [
+                    "python",
+                    "-c",
+                    "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);",
+                ]
+            },
+        }
+    ],
+    # Evaluation: presumably passes when the output of `which spotify` contains "spotify" and
+    # does not contain "not found" -- confirm against OSWorld's `check_include_exclude`.
+    "evaluator": {
+        "func": "check_include_exclude",
+        "result": {"type": "vm_command_line", "command": "which spotify"},
+        "expected": {"type": "rule", "rules": {"include": ["spotify"], "exclude": ["not found"]}},
+    },
+}
+
+# Actions are given as pyautogui code strings; the Ubuntu desktop is requested from the
+# "docker" provider.
+env = DesktopEnv(action_space="pyautogui", provider_name="docker", os_type="Ubuntu")
+
+obs = env.reset(task_config=example)
+# Take a single step (a right click) and dump the observation returned for it.
+obs, reward, done, info = env.step("pyautogui.rightClick()")
+print(obs)
+# NOTE(review): the environment is never shut down here -- confirm whether `DesktopEnv` needs an
+# explicit close to release the docker container.
@@ -0,0 +1,66 @@
+import json
+import logging
+import os
+
+from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
+from agentlab.benchmarks.osworld import OsworldBenchmark
+from agentlab.experiments.study import Study, make_study
+
+fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
+logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
+
+
+def get_most_recent_incomplete_study() -> Study:
+    """Load the latest study and prepare it for a relaunch.
+
+    Returns:
+        The study returned by `Study.load_most_recent()`, after
+        `find_incomplete(include_errors=True)` has been called on it so that incomplete
+        experiments are continued and errored experiments are relaunched.
+    """
+    latest = Study.load_most_recent()
+    latest.find_incomplete(include_errors=True)
+    return latest
+
+
+def get_task_ids() -> set[str]:
+    """Return the ids of the tasks used for debug runs.
+
+    The ids are read from `experiments/osworld_debug_task_ids.json`, a JSON list of objects
+    that each carry an `"id"` key. The path is relative to the current working directory.
+
+    Returns:
+        The set of task ids found in the file.
+    """
+    # Read as UTF-8 explicitly instead of depending on the platform's default encoding.
+    with open("experiments/osworld_debug_task_ids.json", "r", encoding="utf-8") as f:
+        tasks = json.load(f)
+    return {task["id"] for task in tasks}
+
+
+def main():
+    """Run the OSWorld study configured by the flags at the top of this function.
+
+    A study is built with `make_study` for `OSWORLD_CLAUDE` on the `test_small.json` OSWorld
+    test set, optionally switched to the VMware provider and filtered down to the debug task
+    ids. When `relaunch` is true, the most recent study is loaded and run instead of the one
+    built here.
+    """
+    n_jobs = 4
+    use_vmware = True
+    relaunch = True
+    agent_args = [
+        OSWORLD_CLAUDE,
+        #    OSWORLD_OAI # performs poorly.
+    ]  # type: ignore
+    parallel_backend = "ray"
+    # Keep a value the caller already exported; otherwise switch debug mode on.
+    os.environ["AGENTLAB_DEBUG"] = os.environ.get("AGENTLAB_DEBUG", "1")
+
+    study = make_study(
+        benchmark=OsworldBenchmark(
+            test_set_name="test_small.json"
+        ),  # or test_all.json (Exper)  # type: ignore
+        agent_args=agent_args,  # type: ignore
+        comment="osworld debug 2",
+        logging_level=logging.INFO,
+        logging_level_stdout=logging.INFO,
+    )
+
+    # Point every experiment at a VMware VM (path relative to the working directory) and use
+    # the "sequential" backend instead of "ray".
+    if use_vmware:
+        for exp_args in study.exp_args_list:
+            exp_args.env_args.provider_name = "vmware"  # type: ignore
+            exp_args.env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # type: ignore
+        parallel_backend = "sequential"
+
+    # NOTE(review): AGENTLAB_DEBUG defaults to "1" above, so this branch is taken unless the
+    # variable was explicitly set to an empty string; any other value, including "0", is truthy.
+    if os.environ.get("AGENTLAB_DEBUG"):
+        # Keep only the experiments whose task id is listed in the debug task-id file.
+        task_ids = get_task_ids()
+        study.exp_args_list = [exp_args for exp_args in study.exp_args_list if exp_args.env_args.task["id"] in task_ids]  # type: ignore
+        print(f"Debug on {len(study.exp_args_list)} experiments")
+        n_jobs = 1  # Make sure to use 1 job when debugging in VS
+
+    # NOTE(review): with `relaunch` enabled the study built above (including the VMware and
+    # debug changes to its `exp_args_list`) is discarded; only `n_jobs` and `parallel_backend`
+    # carry over to the reloaded study. Confirm this is intended.
+    study = get_most_recent_incomplete_study() if relaunch else study
+    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
+
+
+if __name__ == "__main__":
+    main()
@@ -26,4 +26,4 @@ matplotlib
 ray[default]
 python-slugify
 pillow
-gymnasium>=0.27
+gymnasium>=0.27
@@ -19,7 +19,11 @@
 from PIL import Image
 
 from agentlab.agents import agent_utils
+from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark
+from bgym import Benchmark as BgymBenchmark
 from agentlab.agents.agent_args import AgentArgs
+from agentlab.benchmarks.osworld import OSWorldActionSet
+from agentlab.llm.base_api import BaseModelArgs
 from agentlab.llm.llm_utils import image_to_png_base64_url
 from agentlab.llm.response_api import (
     APIPayload,
@@ -36,7 +40,6 @@
 
 @dataclass
 class Block(ABC):
-
     def _init(self):
         """Initialize the block."""
         pass
@@ -169,6 +172,7 @@ class Obs(Block):
     use_tabs: bool = False
     # add_mouse_pointer: bool = False
     use_zoomed_webpage: bool = False
+    skip_preprocessing: bool = False
 
     def apply(
         self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput
@@ -181,7 +185,6 @@ def apply(
                 obs_msg.add_text(f"Last action error:\n{obs['last_action_error']}")
 
         if self.use_screenshot:
-
             if self.use_som:
                 screenshot = obs["screenshot_som"]
             else:
@@ -231,7 +234,6 @@ def _format_tabs(obs):
 
 @dataclass
 class GeneralHints(Block):
-
     use_hints: bool = True
 
     def apply(self, llm, discussion: StructuredDiscussion) -> dict:
@@ -342,9 +344,10 @@ class PromptConfig:
 
 @dataclass
 class ToolUseAgentArgs(AgentArgs):
-    model_args: OpenAIResponseModelArgs = None
+    model_args: BaseModelArgs = None
     config: PromptConfig = None
     use_raw_page_output: bool = False  # This attribute is used in loop.py to setup the env.
+    action_set: bgym.AbstractActionSet | None = None
 
     def __post_init__(self):
         try:
@@ -356,8 +359,9 @@ def make_agent(self) -> bgym.Agent:
         if self.config is None:
             self.config = DEFAULT_PROMPT_CONFIG
         return ToolUseAgent(
-            model_args=self.model_args,
+            model_args=self.model_args,  # type: ignore
             config=self.config,
+            action_set=self.action_set,
         )
 
     def prepare(self):
@@ -366,17 +370,24 @@ def prepare(self):
     def close(self):
         return self.model_args.close_server()
 
+    def set_benchmark(self, benchmark: AgentLabBenchmark | BgymBenchmark, demo_mode: bool):
+        """Set benchmark specific flags.
+
+        Args:
+            benchmark: Benchmark the agent is about to run on; only its `name` is read.
+            demo_mode: Accepted for interface compatibility; not used here.
+        """
+        if benchmark.name != "osworld":
+            return
+        if self.config is None:
+            # `config` defaults to None and is otherwise only filled in by `make_agent`. Work on
+            # a private copy of the default so the flag set below does not leak into the shared
+            # module-level DEFAULT_PROMPT_CONFIG.
+            from copy import deepcopy
+
+            self.config = deepcopy(DEFAULT_PROMPT_CONFIG)
+        # With this flag set, `obs_preprocessor` returns the observation copy untouched.
+        self.config.obs.skip_preprocessing = True
+
 
 class ToolUseAgent(bgym.Agent):
     def __init__(
         self,
         model_args: OpenAIResponseModelArgs,
         config: PromptConfig = None,
+        action_set: bgym.AbstractActionSet | None = None,
     ):
         self.model_args = model_args
         self.config = config
-        self.action_set = bgym.HighLevelActionSet(
-            self.config.action_subsets, multiaction=self.config.multiaction
+        self.action_set: bgym.AbstractActionSet = action_set or bgym.HighLevelActionSet(
+            self.config.action_subsets, multiaction=self.config.multiaction  # type: ignore
         )
         self.tools = self.action_set.to_tool_description(api=model_args.api)
 
@@ -395,7 +406,8 @@ def __init__(
 
     def obs_preprocessor(self, obs):
         obs = copy(obs)
-
+        if self.config.obs.skip_preprocessing:
+            return obs
         page = obs.pop("page", None)
         if page is not None:
             obs["screenshot"] = extract_screenshot(page)
@@ -592,3 +604,49 @@ def get_action(self, obs: Any) -> float:
     model_args=GPT4_1_OPENROUTER_MODEL,
     config=DEFAULT_PROMPT_CONFIG,
 )
+
+# Agent preset for OSWorld backed by CLAUDE_MODEL_CONFIG: screenshot plus AXTree observations,
+# goal sent as a system message, summarization enabled, no hints, no limit on kept observations.
+# NOTE(review): because `action_set` is passed explicitly, `ToolUseAgent.__init__` does not build
+# a `HighLevelActionSet` from `action_subsets` / `multiaction`; confirm whether those two fields
+# are still read anywhere else for this preset.
+OSWORLD_CLAUDE = ToolUseAgentArgs(
+    model_args=CLAUDE_MODEL_CONFIG,
+    config=PromptConfig(
+        tag_screenshot=True,
+        goal=Goal(goal_as_system_msg=True),
+        obs=Obs(
+            use_last_error=True,
+            use_screenshot=True,
+            use_axtree=True,
+            use_dom=False,
+            use_som=False,
+            use_tabs=False,
+        ),
+        summarizer=Summarizer(do_summary=True),
+        general_hints=GeneralHints(use_hints=False),
+        task_hint=TaskHint(use_task_hint=False),
+        keep_last_n_obs=None,
+        multiaction=False,  # whether to use multi-action or not
+        action_subsets=("coord",),  # or "bid"
+    ),
+    action_set=OSWorldActionSet("computer_13"),  # or "pyautogui"
+)
+
+# Agent preset for OSWorld backed by OPENAI_MODEL_CONFIG: screenshot-only observations (no
+# AXTree, DOM, SoM or tabs), goal sent as a system message, summarization enabled, no hints, and
+# only the last observation kept in the discussion.
+# NOTE(review): because `action_set` is passed explicitly, `ToolUseAgent.__init__` does not build
+# a `HighLevelActionSet` from `action_subsets` / `multiaction`; confirm whether those two fields
+# are still read anywhere else for this preset.
+OSWORLD_OAI = ToolUseAgentArgs(
+    model_args=OPENAI_MODEL_CONFIG,
+    config=PromptConfig(
+        tag_screenshot=True,
+        goal=Goal(goal_as_system_msg=True),
+        obs=Obs(
+            use_last_error=True,
+            use_screenshot=True,
+            use_axtree=False,
+            use_dom=False,
+            use_som=False,
+            use_tabs=False,
+        ),
+        summarizer=Summarizer(do_summary=True),
+        general_hints=GeneralHints(use_hints=False),
+        task_hint=TaskHint(use_task_hint=False),
+        keep_last_n_obs=1,  # keep only the last observation in the discussion
+        multiaction=False,  # whether to use multi-action or not
+        action_subsets=("coord",),
+    ),
+    action_set=OSWorldActionSet("computer_13"),
+)
@@ -712,7 +712,7 @@ def dict_msg_to_markdown(d: dict):
             case "text":
                 parts.append(f"\n```\n{item['text']}\n```\n")
             case "tool_use":
-                tool_use = _format_tool_call(item["name"], item["input"], item["call_id"])
+                tool_use = _format_tool_call(item["name"], item["input"], item["id"])
                 parts.append(f"\n```\n{tool_use}\n```\n")
             case _:
                 parts.append(f"\n```\n{str(item)}\n```\n")
@@ -1337,7 +1337,7 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr
                 horizontalalignment="right",
                 rotation=0,
                 clip_on=True,
-                antialiased=True,
+                # antialiased=True,
                 fontweight=1000,
                 backgroundcolor=color,
             )