ServiceNow
diff --git a/‎.github/workflows/darglint.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/darglint.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/python_version_compatibility.yml‎
Lines changed: 40 additions & 0 deletions b/‎.github/workflows/python_version_compatibility.yml‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 7 additions & 0 deletions b/‎.gitignore‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎.vscode/settings.json‎
Lines changed: 10 additions & 3 deletions b/‎.vscode/settings.json‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎Makefile‎
Lines changed: 21 additions & 1 deletion b/‎Makefile‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 10 additions & 0 deletions b/‎README.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎experiments/osworld_debug_task_ids.json‎
Lines changed: 37 additions & 0 deletions b/‎experiments/osworld_debug_task_ids.json‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/osworld_docker_test.py‎
Lines changed: 37 additions & 0 deletions b/‎experiments/osworld_docker_test.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎experiments/run_osworld.py‎
Lines changed: 66 additions & 0 deletions b/‎experiments/run_osworld.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎main_workarena_debug.py‎
Lines changed: 77 additions & 0 deletions b/‎main_workarena_debug.py‎
Lines changed: 77 additions & 0 deletions
@@ -21,7 +21,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.10'
+          python-version: '3.12'
           cache: 'pip' # caching pip dependencies
 
       - name: Pip install
 
@@ -0,0 +1,40 @@
+name: Python Compatibility (Info Only)
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  info-check:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+    - uses: actions/checkout@v4
+    
+    # Optional: cache the uv binaries for faster runs.
+    # The uv installer places `uv` and `uvx` in ~/.local/bin, and the check step
+    # below invokes `uvx`, so both binaries are cached from that directory.
+    - name: Cache uv
+      uses: actions/cache@v4
+      with:
+        path: |
+          ~/.local/bin/uv
+          ~/.local/bin/uvx
+        key: uv-${{ runner.os }}
+
+    - name: Install uv
+      run: |
+        # Skip the download only when the cached `uvx` binary was restored.
+        if [ ! -x "$HOME/.local/bin/uvx" ]; then
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+        fi
+        # Expose the install directory to every later step of this job.
+        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+        
+    - name: Check Python ${{ matrix.python-version }}
+      continue-on-error: true
+      run: |
+        export PATH="$HOME/.cargo/bin:$PATH"
+        if uvx --python ${{ matrix.python-version }} --from python --with-requirements requirements.txt python -c "print('✅ Compatible')"; then
+          echo "✅ Python ${{ matrix.python-version }} works"
+        else
+          echo "❌ Python ${{ matrix.python-version }} incompatible"
+        fi
@@ -171,3 +171,10 @@ results/
 outputs/
 miniwob-plusplus/
 .miniwob-server.pid
+debugging_results/
+docker_vm_data/
+OSWorld/
+
+
+# working files
+experiments/*
@@ -3,13 +3,20 @@
         "editor.formatOnSave": true,
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.codeActionsOnSave": {
-            "source.organizeImports": "explicit",
-            "source.fixAll": "never"
-        }
+            "source.organizeImports": "always",
+            "source.fixAll": "always",
+        },
     },
+    "python.analysis.languageServerMode": "full",
+    "python.analysis.typeCheckingMode": "standard",
     "python.testing.pytestArgs": [
         "tests"
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
+    "files.watcherExclude": {
+        "**/.git/objects/**": true,
+        "**/.git/subtree-cache/**": true,
+        "**/node_modules/*/**": true
+    },
 }
@@ -1,4 +1,4 @@
-.PHONY: test setup miniwob lint stop-miniwob
+.PHONY: test setup miniwob lint stop-miniwob osworld
 
 setup:
 	@pip install -e .
@@ -30,3 +30,23 @@ test: setup miniwob check-miniwob run-tests stop-miniwob
 lint: setup
 	@black src/ --check --diff
 	@darglint -v 2 -z short src/
+
+# osworld: clone OSWorld next to this Makefile (if not already present), relax
+# the version pins of numpy/torch/tqdm/pandas in its requirements.txt, then
+# install its requirements and the package itself in editable mode.
+# The clone is skipped only when the OSWorld directory exists, so a genuine
+# clone failure aborts the target instead of being silently ignored.
+# All substitutions run in one sed pass, so requirements.txt.bak holds the
+# file exactly as it was before this run.
+osworld:
+	@echo "Setting up OSWorld..."
+	@[ -d OSWorld ] || git clone https://github.com/xlang-ai/OSWorld
+	@echo "Modifying OSWorld requirements.txt to remove pinned versions..."
+	@cd OSWorld && \
+		sed -i.bak \
+			-e 's/numpy~=.*/numpy/' \
+			-e 's/torch~=.*/torch/' \
+			-e 's/tqdm~=.*/tqdm/' \
+			-e 's/pandas~=.*/pandas/' \
+			requirements.txt
+	@echo "Installing OSWorld requirements..."
+	@cd OSWorld && pip install -r requirements.txt
+	@echo "Installing OSWorld in development mode..."
+	@cd OSWorld && pip install -e .
+	@echo "OSWorld setup completed!"
+	@echo "Next steps:"
+	@echo "1. Configure your VM (VMware/VirtualBox) according to OSWorld documentation"
+	@echo "2. Download or set up the Ubuntu VM image"
+	@echo "3. Run AgentLab with OSWorld tasks"
@@ -61,6 +61,7 @@ AgentLab Features:
 | [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) (soon) | - | - | None | - | - | live web | soon |
 | [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon |
 | [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon |
+| [OSWorld](https://os-world.github.io/) | [setup](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/benchmarks/setup.md) | 369 | None | - | - | self hosted  | soon |
 
 
 ## 🛠️ Setup AgentLab
@@ -275,6 +276,15 @@ dynamic benchmarks.
   between the two executions. **Note**: this is a beta feature and will need some adaptation for your
   own agent.
 
+## Variables
+Here's a list of relevant env. variables that are used by AgentLab:
+- `OPENAI_API_KEY`, used by default for OpenAI LLMs.
+- `AZURE_OPENAI_API_KEY`, used by default for AzureOpenAI LLMs.
+- `AZURE_OPENAI_ENDPOINT` to specify your Azure endpoint.
+- `OPENAI_API_VERSION` for the Azure API.
+- `OPENROUTER_API_KEY` for the OpenRouter API.
+- `AGENTLAB_EXP_ROOT`, desired path for your experiments to be stored, defaults to `~/agentlab-results`.
+- `AGENTXRAY_SHARE_GRADIO`, which prompts AgentXRay to open a public tunnel on launch.
 
 ## Misc
 
 
@@ -0,0 +1,37 @@
+[
+  {
+    "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
+    "task": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
+    "complexity": 1
+  },
+  {
+    "id": "59f21cfb-0120-4326-b255-a5b827b38967",
+    "task": "Could you play the music video that's saved on my desktop for me via vlc?",
+    "complexity": 1
+  },
+  {
+    "id": "35253b65-1c19-4304-8aa4-6884b8218fc0",
+    "task": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
+    "complexity": 1
+  },
+  {
+    "id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
+    "task": "Please help me change all the places in this document that say \"text\" to \"test\".",
+    "complexity": 1
+  },
+  {
+    "id": "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
+    "task": "I am currently using an Ubuntu system, and I have wrongly deleted a poster of party night. Could you help me recover it from the Trash?",
+    "complexity": 1
+  },
+  {
+    "id": "510f64c8-9bcc-4be1-8d30-638705850618",
+    "task": "Could you start VS Code in folder ~/Desktop/project from the terminal?",
+    "complexity": 1
+  },
+  {
+    "id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
+    "task": "Please help me use VS Code to open the \"project\" in the \"user\" folder under \"home\".",
+    "complexity": 1
+  }
+]
@@ -0,0 +1,37 @@
+import logging
+
+from desktop_env.desktop_env import DesktopEnv
+
+# Smoke script: create a DesktopEnv with the docker provider, reset it with a
+# single hard-coded task config, perform one step, and print the observation.
+
+# Emit INFO-and-above records through a stream handler, formatted with
+# timestamp, logger name, and level.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+# Task config handed to env.reset() below. It carries an id, an instruction,
+# a "config" list with one "execute" entry (a python command line), and an
+# "evaluator" spec. NOTE(review): how DesktopEnv interprets each key is defined
+# in desktop_env, not here — confirm against the OSWorld documentation.
+example = {
+    "id": "94d95f96-9699-4208-98ba-3c3119edf9c2",
+    "instruction": "I want to install Spotify on my current system. Could you please help me?",
+    "config": [
+        {
+            "type": "execute",
+            "parameters": {
+                "command": [
+                    "python",
+                    "-c",
+                    "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);",
+                ]
+            },
+        }
+    ],
+    "evaluator": {
+        "func": "check_include_exclude",
+        "result": {"type": "vm_command_line", "command": "which spotify"},
+        "expected": {"type": "rule", "rules": {"include": ["spotify"], "exclude": ["not found"]}},
+    },
+}
+
+# Environment using the pyautogui action space, the docker provider, and an Ubuntu guest.
+env = DesktopEnv(action_space="pyautogui", provider_name="docker", os_type="Ubuntu")
+
+# Reset with the task above, send a single pyautogui action string, and print
+# the observation returned by that step.
+obs = env.reset(task_config=example)
+obs, reward, done, info = env.step("pyautogui.rightClick()")
+print(obs)
@@ -0,0 +1,66 @@
+import json
+import logging
+import os
+
+from agentlab.agents.tool_use_agent.tool_use_agent import OSWORLD_CLAUDE
+from agentlab.benchmarks.osworld import OsworldBenchmark
+from agentlab.experiments.study import Study, make_study
+
+fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
+logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
+
+
+def get_most_recent_incomplete_study() -> Study:
+    """
+    Relaunch an existing study, this will continue incomplete experiments and relaunch errored experiments.
+    """
+    study = Study.load_most_recent()
+    study.find_incomplete(include_errors=True)
+    return study
+
+
+def get_task_ids() -> set[str]:
+    with open("experiments/osworld_debug_task_ids.json", "r") as f:
+        task_ids = json.load(f)
+    return set([task["id"] for task in task_ids])
+
+
+def main():
+    n_jobs = 4
+    use_vmware = True
+    relaunch = False
+    agent_args = [
+        OSWORLD_CLAUDE,
+        #    OSWORLD_OAI # performs poorly.
+    ]  # type: ignore
+    parallel_backend = "ray"
+    os.environ["AGENTLAB_DEBUG"] = os.environ.get("AGENTLAB_DEBUG", "1")
+
+    study = make_study(
+        benchmark=OsworldBenchmark(
+            test_set_name="test_small.json"
+        ),  # or test_all.json (Exper)  # type: ignore
+        agent_args=agent_args,  # type: ignore
+        comment="osworld debug 2",
+        logging_level=logging.INFO,
+        logging_level_stdout=logging.INFO,
+    )
+
+    if use_vmware:
+        for exp_args in study.exp_args_list:
+            exp_args.env_args.provider_name = "vmware"  # type: ignore
+            exp_args.env_args.path_to_vm = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # type: ignore
+        parallel_backend = "sequential"
+
+    if os.environ.get("AGENTLAB_DEBUG"):
+        task_ids = get_task_ids()
+        study.exp_args_list = [exp_args for exp_args in study.exp_args_list if exp_args.env_args.task["id"] in task_ids]  # type: ignore
+        print(f"Debug on {len(study.exp_args_list)} experiments")
+        n_jobs = 1  # Make sure to use 1 job when debugging in VS
+
+    study = get_most_recent_incomplete_study() if relaunch else study
+    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,77 @@
+"""
+Note: This script is a convenience script to launch experiments instead of using
+the command line.
+
+Copy this script and modify at will, but don't push your changes to the
+repository.
+"""
+
+import logging
+from copy import deepcopy
+
+import bgym
+
+from agentlab.agents.tool_use_agent.tool_use_agent import (
+    DEFAULT_PROMPT_CONFIG,
+    GPT_4_1,
+    ToolUseAgentArgs,
+)
+from agentlab.experiments.study import Study
+
+logging.getLogger().setLevel(logging.INFO)
+
+# Work on a private copy so the shared DEFAULT_PROMPT_CONFIG is not mutated.
+config = deepcopy(DEFAULT_PROMPT_CONFIG)
+# config.keep_last_n_obs = 1
+# NOTE(review): `use_som` presumably enables set-of-marks observations — confirm.
+config.obs.use_som = True
+
+
+# Agents to evaluate; each entry is passed the same `config` object built above.
+agent_configs = [
+    ToolUseAgentArgs(
+        model_args=GPT_4_1,
+        config=config,
+    ),
+    # ToolUseAgentArgs(
+    #     model_args=GPT_4_1,
+    #     config=config,
+    # ),
+]
+
+for agent_config in agent_configs:
+    agent_config.config.action_subsets = ("workarena",)  # use the workarena action set
+
+
+# ## select the benchmark to run on
+# benchmark = "miniwob_tiny_test"
+benchmark = "workarena_l1"
+
+
+# Instantiate the named benchmark with 4 repeats, then keep only the entries
+# whose task_name matches the glob "*create*".
+benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4)  # type: bgym.Benchmark
+benchmark = benchmark.subset_from_glob("task_name", "*create*")
+
+# for env_args in benchmark.env_args_list:
+#     print(env_args.task_name)
+#     env_args.max_steps = 15
+
+# Set to True to reload the most recent study instead of creating a new one.
+relaunch = False
+
+## Number of parallel jobs
+n_jobs = 10  # Make sure to use 1 job when debugging in VSCode
+parallel_backend = "ray"
+# parallel_backend = "sequential"  # activate sequential backend for debugging in VSCode
+
+# Entry-point guard. The original note says it is necessary for the dask backend;
+# NOTE(review): the backends listed at the run() call below are "ray", "joblib"
+# and "sequential" — confirm whether the dask note is still current.
+if __name__ == "__main__":  # necessary for dask backend
+
+    if relaunch:
+        #  relaunch an existing study
+        study = Study.load_most_recent(contains=None)
+        study.find_incomplete(include_errors=True)
+
+    else:
+        # Fresh study over the agents and benchmark configured above.
+        study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING)
+
+    study.run(
+        n_jobs=n_jobs,
+        parallel_backend=parallel_backend,  # "ray", "joblib" or "sequential"
+        strict_reproducibility=False,
+        n_relaunch=3,
+    )