Skip to content

Commit a5af135

Browse files
authored
Merge branch 'main' into tlsdc/wa_vwa
2 parents a95a873 + 8dc809c commit a5af135

File tree

6 files changed

+23
-4
lines changed

6 files changed

+23
-4
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
<br>
2020
[🏆 Leaderboard](#-leaderboard) &nbsp;|&nbsp;
2121
[🤖 Build Your Agent](#-implement-a-new-agent) &nbsp;|&nbsp;
22-
[↻ Reproducibility](#-reproducibility)
22+
[↻ Reproducibility](#-reproducibility) &nbsp;|&nbsp;
23+
[💪 BrowserGym](https://github.com/ServiceNow/BrowserGym)
2324

2425

2526
<img src="https://github.com/user-attachments/assets/47a7c425-9763-46e5-be54-adac363be850" alt="agentlab-diagram" width="700"/>
@@ -30,6 +31,10 @@
3031

3132
</div>
3233

34+
> [!WARNING]
35+
> AgentLab is meant to provide an open, easy-to-use and extensible framework to accelerate the field of web agent research.
36+
> It is not meant to be a consumer product. Use with caution!
37+
3338
AgentLab is a framework for developing and evaluating agents on a variety of
3439
[benchmarks](#-supported-benchmarks) supported by
3540
[BrowserGym](https://github.com/ServiceNow/BrowserGym).

reproducibility_journal.csv

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,weblinx_test,0.0.1.de
4646
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.089,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a,
4747
ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.125,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a,
4848
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.079,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a,
49+
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,workarena_l2_agent_curriculum_eval,0.4.1,2024-11-29_14-28-47,528da1f2-1949-41dc-b988-85f19f435af2,0.072,0.017,2,235/235,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,70dac253628c476aff1af6a975f27f8563453ad2,
50+
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,miniwob,0.13.3,2024-11-29_16-14-00,4d748972-6d35-4489-a197-138b656a7db3,0.646,0.019,0,625/625,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,becb4856fb1612f44010fe74ef8155d367ca17fc,,0.13.3,70dac253628c476aff1af6a975f27f8563453ad2,
51+
ThibaultLSDC,GenericAgent-gpt-4o,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.005,0.003,2,213/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
52+
ThibaultLSDC,GenericAgent-gpt-4o-mini,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.002,0.002,1,214/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
53+
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.008,0.003,1,212/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
54+
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.007,0.005,8,206/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
55+
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-8b-instruct,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.001,0.001,15,214/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
56+
ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.007,0.003,1,212/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
57+
ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.009,0.005,1,214/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None,
4958
ThibaultLSDC,GenericAgent-gpt-4o-mini,webarena,0.13.3,2024-11-29_19-25-49,c6bdeb87-9879-4c06-aa70-00d895001156,0.174,0.013,1,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,None,
5059
ThibaultLSDC,GenericAgent-gpt-4o,webarena,0.13.3,2024-11-29_22-28-32,d2eed215-91bb-4603-b69c-8ef8f9d57f34,0.314,0.016,3,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,430fe9456ba766398380454a6335f094004607af,,0.13.3,None,
5160
ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.13.3,2024-11-29_22-37-46,b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae,0.362,0.017,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,7b224971fb7a90fb76924ca9386a1e8bf609dd2a,,0.13.3,None,
@@ -55,3 +64,4 @@ ThibaultLSDC,GenericAgent-gpt-4o-mini_vision,visualwebarena,0.13.3,2024-12-02_02
5564
ThibaultLSDC,GenericAgent-gpt-4o_vision,visualwebarena,0.13.3,2024-12-02_07-17-28,7fb7eac8-4bbd-4ebe-be32-15901a7678f2,0.267,0.015,65,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
5665
ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta_vision,visualwebarena,0.13.3,2024-12-02_09-11-35,22f0611d-aeea-4ee9-a533-b45442b5e080,0.21,0.013,178,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
5766
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,webarena,0.13.3,2024-12-02_23-18-38,fc5747bc-d998-4942-a0eb-e55a3ccc1cb3,0.184,0.014,213,811/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
67+

src/agentlab/experiments/study.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from abc import ABC, abstractmethod
12
import gzip
23
import logging
34
import pickle
@@ -405,7 +406,6 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study":
405406

406407
def _make_study_name(agent_names, benchmark_names, suffix=None):
407408
"""Make a study name from the agent and benchmark names."""
408-
409409
# extract unique agent and benchmark names
410410
agent_names = list(set(agent_names))
411411
benchmark_names = list(set(benchmark_names))

src/agentlab/llm/huggingface_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from transformers import AutoTokenizer, GPT2TokenizerFast
77

88
from agentlab.llm.base_api import AbstractChatModel
9+
from agentlab.llm.llm_utils import Discussion
910
from agentlab.llm.prompt_templates import PromptTemplate, get_prompt_template
1011

1112

@@ -59,6 +60,8 @@ def __call__(
5960
if self.tokenizer:
6061
# messages_formated = _convert_messages_to_dict(messages) ## ?
6162
try:
63+
if isinstance(messages, Discussion):
64+
messages.merge()
6265
prompt = self.tokenizer.apply_chat_template(messages, tokenize=False)
6366
except Exception as e:
6467
if "Conversation roles must alternate" in str(e):

src/agentlab/llm/llm_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,8 @@ def merge(self):
386386
else:
387387
new_content.append(elem)
388388
self["content"] = new_content
389+
if len(self["content"]) == 1:
390+
self["content"] = self["content"][0]["text"]
389391

390392

391393
class SystemMessage(BaseMessage):

tests/llm/test_llm_utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,7 @@ def test_message_merge_only_text():
251251
]
252252
message = llm_utils.BaseMessage(role="system", content=content)
253253
message.merge()
254-
assert len(message["content"]) == 1
255-
assert message["content"][0]["text"] == "Hello, world!\nThis is a test."
254+
assert message["content"] == "Hello, world!\nThis is a test."
256255

257256

258257
def test_message_merge_text_image():

0 commit comments

Comments
 (0)