Skip to content

Commit cad0629

Browse files
authored
Merge pull request #214 from ServiceNow/multitool_envs
Gaia bench with tape agent and multitool env
2 parents 6e75e81 + 58c69c4 commit cad0629

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+2399
-251
lines changed

.github/workflows/darglint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ jobs:
3131
run: pip list
3232

3333
- name: Darglint checks
34-
run: darglint -v 2 -z short .
34+
run: darglint -v 2 -z short src/

.gitignore

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ __pycache__/
33
*.py[cod]
44
*$py.class
55
results/
6-
.vscode
6+
77
# C extensions
88
*.so
99
# Distribution / packaging
@@ -160,11 +160,14 @@ cython_debug/
160160
# MacOS
161161
**/.DS_Store
162162

163-
.vscode
164163

165164
_sandbox.py
166165

167166
results/
168167

169168
# gradio
170-
.gradio/
169+
.gradio/
170+
171+
outputs/
172+
miniwob-plusplus/
173+
.miniwob-server.pid

.vscode/launch.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": "Python Debugger: Current File",
9+
"type": "debugpy",
10+
"request": "launch",
11+
"program": "${file}",
12+
"console": "integratedTerminal",
13+
"justMyCode": false,
14+
"env": {
15+
"AGENTLAB_DEBUG": "1"
16+
}
17+
}
18+
]
19+
}

.vscode/settings.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"[python]": {
3+
"editor.formatOnSave": true,
4+
"editor.defaultFormatter": "ms-python.black-formatter",
5+
"editor.codeActionsOnSave": {
6+
"source.organizeImports": "explicit",
7+
"source.fixAll": "never"
8+
}
9+
},
10+
"python.testing.pytestArgs": [
11+
"tests"
12+
],
13+
"python.testing.unittestEnabled": false,
14+
"python.testing.pytestEnabled": true,
15+
}

Makefile

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
.PHONY: test setup miniwob lint stop-miniwob
2+
3+
setup:
4+
@pip install -e .
5+
@playwright install chromium --with-deps
6+
@python -c 'import nltk; nltk.download("punkt_tab")'
7+
8+
miniwob: stop-miniwob
9+
@git clone https://github.com/Farama-Foundation/miniwob-plusplus.git || true
10+
@cd miniwob-plusplus && git checkout 7fd85d71a4b60325c6585396ec4f48377d049838
11+
@python -m http.server 8080 --directory miniwob-plusplus/miniwob/html & echo $$! > .miniwob-server.pid
12+
@sleep 3
13+
@echo "MiniWob server started on http://localhost:8080"
14+
15+
check-miniwob:
16+
@curl -I "http://localhost:8080/miniwob/" || (echo "MiniWob not reachable" && exit 1)
17+
@echo "MiniWob server is reachable"
18+
19+
stop-miniwob:
20+
@kill -9 `cat .miniwob-server.pid` || true
21+
@rm -f .miniwob-server.pid
22+
@echo "MiniWob server stopped"
23+
24+
run-tests:
25+
@MINIWOB_URL="http://localhost:8080/miniwob/" pytest -n 5 --durations=10 -m 'not pricy' tests/
26+
@echo "Tests completed"
27+
28+
test: setup miniwob check-miniwob run-tests stop-miniwob
29+
30+
lint: setup
31+
@black src/ --check --diff
32+
@darglint -v 2 -z short src/

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@ pytest==7.3.2
55
flaky
66
pytest-xdist
77
pytest-playwright
8+
pydantic~=2.9
89
dask
910
distributed
1011
browsergym>=0.7.1
1112
joblib>=1.2.0
1213
openai>=1.7,<2
1314
langchain_community
1415
tiktoken
16+
tapeagents[converters]
1517
huggingface_hub
1618
contexttimer
1719
ipython
@@ -24,3 +26,4 @@ matplotlib
2426
ray[default]
2527
python-slugify
2628
pillow
29+
gymnasium>=0.27

src/agentlab/agents/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ have to specify the type of each field (You can use Any if it is unknown)*
9999
```python
100100
from dataclasses import dataclass
101101
from browsergym.experiment.agent import Agent
102-
from browsergym.experiment.loop import AgentArgs
102+
from agentlab.experiments.loop import AgentArgs
103103

104104

105105
@dataclass
@@ -116,7 +116,7 @@ class CustomAgentArgs(AgentArgs):
116116
To run experiments with your custom agent, define an instance of `ExpArgs` with the required parameters.
117117

118118
```python
119-
from browsergym.experiment.loop import ExpArgs
119+
from agentlab.experiments.loop import ExpArgs
120120

121121
exp_args = ExpArgs(
122122
agent_args=CustomAgentArgs(custom_param="value"),

src/agentlab/agents/generic_agent/reproducibility_agent.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,10 @@
2020

2121
import bgym
2222
from browsergym.experiments.agent import AgentInfo
23-
from browsergym.experiments.loop import ExpArgs, ExpResult, yield_all_exp_results
2423
from bs4 import BeautifulSoup
25-
from langchain.schema import AIMessage, BaseMessage
26-
from langchain_community.adapters.openai import convert_message_to_dict
2724

2825
from agentlab.agents.agent_args import AgentArgs
29-
from agentlab.agents.dynamic_prompting import ActionFlags
26+
from agentlab.experiments.loop import ExpArgs, ExpResult, yield_all_exp_results
3027
from agentlab.experiments.study import Study
3128
from agentlab.llm.chat_api import make_assistant_message
3229
from agentlab.llm.llm_utils import Discussion, messages_to_dict
@@ -65,7 +62,6 @@ def get_stats(self):
6562

6663
@dataclass
6764
class ReproAgentArgs(GenericAgentArgs):
68-
6965
# starting with "_" will prevent from being part of the index in the load_results function
7066
_repro_dir: str = None
7167

@@ -81,7 +77,6 @@ def make_agent(self):
8177

8278

8379
class ReproAgent(GenericAgent):
84-
8580
def __init__(
8681
self,
8782
chat_model_args,
@@ -93,7 +88,6 @@ def __init__(
9388
super().__init__(chat_model_args, flags, max_retry)
9489

9590
def get_action(self, obs):
96-
9791
# replace the chat model with a reproducible chat that will mimic the
9892
# same answers
9993
step = len(self.actions)

src/agentlab/agents/most_basic_agent/most_basic_agent.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import bgym
66

77
from agentlab.agents.agent_args import AgentArgs
8-
from agentlab.llm.chat_api import make_system_message, make_user_message
8+
from agentlab.experiments.loop import ExpArgs
99
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
1010
from agentlab.llm.llm_utils import (
1111
Discussion,
@@ -133,7 +133,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
133133

134134
# example for 2 experiments testing chain of thoughts on a miniwob task
135135
exp_args = [
136-
bgym.ExpArgs(
136+
ExpArgs(
137137
agent_args=MostBasicAgentArgs(
138138
temperature=0.1,
139139
use_chain_of_thought=True,
@@ -142,7 +142,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
142142
env_args=env_args,
143143
logging_level=logging.INFO,
144144
),
145-
bgym.ExpArgs(
145+
ExpArgs(
146146
agent_args=MostBasicAgentArgs(
147147
temperature=0.1,
148148
use_chain_of_thought=False,

src/agentlab/agents/tapeagent/.gitignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

0 commit comments

Comments
 (0)