
Commit 463e0cf

Merge branch 'main' into add-claude-3.7
2 parents 24f48f3 + faccb16 commit 463e0cf

6 files changed, +120 -88 lines changed

src/agentlab/agents/generic_agent/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -17,7 +17,9 @@
     AGENT_4o_MINI,
     AGENT_CLAUDE_SONNET_35,
     AGENT_37_SONNET,
+    AGENT_CLAUDE_SONNET_35_VISION,
     AGENT_4o_VISION,
+    AGENT_4o_MINI_VISION,
     AGENT_o3_MINI,
     AGENT_o1_MINI,
 )
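
For reference, the two newly re-exported vision configurations can be imported next to the existing agents. A minimal sketch, assuming the usual package layout (not part of this diff):

    from agentlab.agents.generic_agent import (
        AGENT_4o_MINI_VISION,
        AGENT_CLAUDE_SONNET_35_VISION,
    )

    # Both are AgentArgs configurations, usable anywhere the non-vision variants are.
    agents = [AGENT_4o_MINI_VISION, AGENT_CLAUDE_SONNET_35_VISION]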

src/agentlab/experiments/study.py

Lines changed: 82 additions & 80 deletions
@@ -1,12 +1,14 @@
-from concurrent.futures import ProcessPoolExecutor
 import gzip
 import logging
 import os
 import pickle
+import random
 import uuid
 from abc import ABC, abstractmethod
+from concurrent.futures import ProcessPoolExecutor
 from dataclasses import dataclass
 from datetime import datetime
+from multiprocessing import Manager, Pool, Queue
 from pathlib import Path
 
 import bgym
@@ -19,8 +21,6 @@
 from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies
 from agentlab.experiments.launch_exp import find_incomplete, non_dummy_count, run_experiments
 from agentlab.experiments.multi_server import BaseServer, WebArenaInstanceVars
-from multiprocessing import Pool, Manager, Queue
-import random
 
 logger = logging.getLogger(__name__)
 
@@ -238,7 +238,7 @@ def __post_init__(self):
 
     def make_exp_args_list(self):
         """Generate the exp_args_list from the agent_args and the benchmark."""
-        self.exp_args_list = _agents_on_benchmark(
+        self.exp_args_list = self.agents_on_benchmark(
             self.agent_args,
             self.benchmark,
             logging_level=self.logging_level,
@@ -424,6 +424,84 @@ def load(dir: Path) -> "Study":
     def load_most_recent(root_dir: Path = None, contains=None) -> "Study":
         return Study.load(get_most_recent_study(root_dir, contains=contains))
 
+    def agents_on_benchmark(
+        self,
+        agents: list[AgentArgs] | AgentArgs,
+        benchmark: bgym.Benchmark,
+        demo_mode=False,
+        logging_level: int = logging.INFO,
+        logging_level_stdout: int = logging.INFO,
+        ignore_dependencies=False,
+    ):
+        """Run one or multiple agents on a benchmark.
+
+        Args:
+            agents: list[AgentArgs] | AgentArgs
+                The agent configuration(s) to run.
+            benchmark: bgym.Benchmark
+                The benchmark to run the agents on.
+            demo_mode: bool
+                If True, the experiments will be run in demo mode.
+            logging_level: int
+                The logging level for individual jobs.
+            logging_level_stdout: int
+                The logging level for the stdout.
+            ignore_dependencies: bool
+                If True, the dependencies will be ignored and all experiments can be run in parallel.
+
+        Returns:
+            list[ExpArgs]: The list of experiments to run.
+
+        Raises:
+            ValueError: If multiple agents are run on a benchmark that requires manual reset.
+        """
+
+        if not isinstance(agents, (list, tuple)):
+            agents = [agents]
+
+        if benchmark.name.startswith("visualwebarena") or benchmark.name.startswith("webarena"):
+            if len(agents) > 1:
+                raise ValueError(
+                    f"Only one agent can be run on {benchmark.name} since the instance requires manual reset after each evaluation."
+                )
+
+        for agent in agents:
+            agent.set_benchmark(
+                benchmark, demo_mode
+            )  # the agent can adapt (lightly?) to the benchmark
+
+        env_args_list = benchmark.env_args_list
+        if demo_mode:
+            set_demo_mode(env_args_list)
+
+        exp_args_list = []
+
+        for agent in agents:
+            for env_args in env_args_list:
+                exp_args = ExpArgs(
+                    agent_args=agent,
+                    env_args=env_args,
+                    logging_level=logging_level,
+                    logging_level_stdout=logging_level_stdout,
+                )
+                exp_args_list.append(exp_args)
+
+        for i, exp_args in enumerate(exp_args_list):
+            exp_args.order = i
+
+        # not required with ray, but keeping around if we would need it for visualwebareana on joblib
+        # _flag_sequential_exp(exp_args_list, benchmark)
+
+        if not ignore_dependencies:
+            # populate the depends_on field based on the task dependencies in the benchmark
+            exp_args_list = add_dependencies(exp_args_list, benchmark.dependency_graph_over_tasks())
+        else:
+            logger.warning(
+                f"Ignoring dependencies for benchmark {benchmark.name}. This could lead to different results."
+            )
+
+        return exp_args_list
+
 
 def _make_study_name(agent_names, benchmark_names, suffix=None):
     """Make a study name from the agent and benchmark names."""
@@ -634,82 +712,6 @@ def set_demo_mode(env_args_list: list[EnvArgs]):
         env_args.slow_mo = 1000
 
 
-def _agents_on_benchmark(
-    agents: list[AgentArgs] | AgentArgs,
-    benchmark: bgym.Benchmark,
-    demo_mode=False,
-    logging_level: int = logging.INFO,
-    logging_level_stdout: int = logging.INFO,
-    ignore_dependencies=False,
-):
-    """Run one or multiple agents on a benchmark.
-
-    Args:
-        agents: list[AgentArgs] | AgentArgs
-            The agent configuration(s) to run.
-        benchmark: bgym.Benchmark
-            The benchmark to run the agents on.
-        demo_mode: bool
-            If True, the experiments will be run in demo mode.
-        logging_level: int
-            The logging level for individual jobs.
-        logging_level_stdout: int
-            The logging level for the stdout.
-        ignore_dependencies: bool
-            If True, the dependencies will be ignored and all experiments can be run in parallel.
-
-    Returns:
-        list[ExpArgs]: The list of experiments to run.
-
-    Raises:
-        ValueError: If multiple agents are run on a benchmark that requires manual reset.
-    """
-
-    if not isinstance(agents, (list, tuple)):
-        agents = [agents]
-
-    if benchmark.name.startswith("visualwebarena") or benchmark.name.startswith("webarena"):
-        if len(agents) > 1:
-            raise ValueError(
-                f"Only one agent can be run on {benchmark.name} since the instance requires manual reset after each evaluation."
-            )
-
-    for agent in agents:
-        agent.set_benchmark(benchmark, demo_mode)  # the agent can adapt (lightly?) to the benchmark
-
-    env_args_list = benchmark.env_args_list
-    if demo_mode:
-        set_demo_mode(env_args_list)
-
-    exp_args_list = []
-
-    for agent in agents:
-        for env_args in env_args_list:
-            exp_args = ExpArgs(
-                agent_args=agent,
-                env_args=env_args,
-                logging_level=logging_level,
-                logging_level_stdout=logging_level_stdout,
-            )
-            exp_args_list.append(exp_args)
-
-    for i, exp_args in enumerate(exp_args_list):
-        exp_args.order = i
-
-    # not required with ray, but keeping around if we would need it for visualwebareana on joblib
-    # _flag_sequential_exp(exp_args_list, benchmark)
-
-    if not ignore_dependencies:
-        # populate the depends_on field based on the task dependencies in the benchmark
-        exp_args_list = add_dependencies(exp_args_list, benchmark.dependency_graph_over_tasks())
-    else:
-        logger.warning(
-            f"Ignoring dependencies for benchmark {benchmark.name}. This could lead to different results."
-        )
-
-    return exp_args_list
-
-
 # def _flag_sequential_exp(exp_args_list: list[ExpArgs], benchmark: Benchmark):
 #     if benchmark.name.startswith("visualwebarena"):
 #         sequential_subset = benchmark.subset_from_glob("requires_reset", "True")
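
Net effect of this file: the module-level _agents_on_benchmark helper is deleted and re-homed as Study.agents_on_benchmark, with make_exp_args_list now calling the bound method. A rough usage sketch follows; the benchmark name and keyword style are illustrative assumptions, not taken from this commit:

    from agentlab.agents.generic_agent import AGENT_4o_MINI
    from agentlab.experiments.study import Study

    # Assumption: Study resolves a benchmark name to a bgym.Benchmark during __post_init__.
    study = Study(agent_args=[AGENT_4o_MINI], benchmark="miniwob")

    # make_exp_args_list() now delegates to the bound method instead of the removed helper.
    study.make_exp_args_list()
    print(len(study.exp_args_list), "experiments, ordered and with task dependencies attached")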

src/agentlab/llm/base_api.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ class BaseModelArgs(ABC):
     max_new_tokens: int = None
     temperature: float = 0.1
     vision_support: bool = False
+    log_probs: bool = False
 
     @abstractmethod
     def make_model(self) -> AbstractChatModel:
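
Because log_probs defaults to False on BaseModelArgs, existing configurations keep their behavior; concrete args classes only need to forward the flag in make_model, as the chat_api.py hunks below do. A hedged sketch, assuming an OpenAI-style args dataclass (OpenAIModelArgs) defined in chat_api.py:

    from agentlab.llm.chat_api import OpenAIModelArgs  # assumed class name

    # Illustrative values; only the log_probs flag is new in this commit.
    args = OpenAIModelArgs(model_name="gpt-4o-mini", max_new_tokens=256, log_probs=True)
    chat_model = args.make_model()  # forwards log_probs into the underlying ChatModel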

src/agentlab/llm/chat_api.py

Lines changed: 20 additions & 3 deletions
@@ -87,6 +87,7 @@ def make_model(self):
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
+            log_probs=self.log_probs,
         )
 
 
@@ -100,6 +101,7 @@ def make_model(self):
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
+            log_probs=self.log_probs,
         )
 
 
@@ -115,6 +117,7 @@ def make_model(self):
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
             deployment_name=self.deployment_name,
+            log_probs=self.log_probs,
         )
 
 
@@ -142,6 +145,7 @@ def make_model(self):
                 temperature=self.temperature,
                 max_new_tokens=self.max_new_tokens,
                 n_retry_server=self.n_retry_server,
+                log_probs=self.log_probs,
             )
         elif self.backend == "vllm":
             return VLLMChatModel(
@@ -232,6 +236,7 @@ def __init__(
         client_class=OpenAI,
         client_args=None,
         pricing_func=None,
+        log_probs=False,
     ):
         assert max_retry > 0, "max_retry should be greater than 0"
 
@@ -240,6 +245,7 @@ def __init__(
         self.max_tokens = max_tokens
         self.max_retry = max_retry
        self.min_retry_wait_time = min_retry_wait_time
+        self.log_probs = log_probs
 
         # Get the API key from the environment variable if not provided
         if api_key_env_var:
@@ -286,6 +292,7 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
             n=n_samples,
             temperature=temperature,
             max_tokens=self.max_tokens,
+            logprobs=self.log_probs,
         )
 
         if completion.usage is None:
@@ -315,7 +322,10 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
         tracking.TRACKER.instance(input_tokens, output_tokens, cost)
 
         if n_samples == 1:
-            return AIMessage(completion.choices[0].message.content)
+            res = AIMessage(completion.choices[0].message.content)
+            if self.log_probs:
+                res["log_probs"] = completion.choices[0].log_probs
+            return res
         else:
             return [AIMessage(c.message.content) for c in completion.choices]
 
@@ -335,6 +345,7 @@ def __init__(
         max_tokens=100,
         max_retry=4,
         min_retry_wait_time=60,
+        log_probs=False,
     ):
         super().__init__(
             model_name=model_name,
@@ -346,6 +357,7 @@ def __init__(
             api_key_env_var="OPENAI_API_KEY",
             client_class=OpenAI,
             pricing_func=tracking.get_pricing_openai,
+            log_probs=log_probs,
         )
 
 
@@ -358,6 +370,7 @@ def __init__(
         max_tokens=100,
         max_retry=4,
         min_retry_wait_time=60,
+        log_probs=False,
     ):
         client_args = {
             "base_url": "https://openrouter.ai/api/v1",
@@ -373,6 +386,7 @@ def __init__(
             client_class=OpenAI,
             client_args=client_args,
             pricing_func=tracking.get_pricing_openrouter,
+            log_probs=log_probs,
        )
 
 
@@ -386,6 +400,7 @@ def __init__(
         max_tokens=100,
         max_retry=4,
         min_retry_wait_time=60,
+        log_probs=False,
     ):
         api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
         endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
@@ -406,6 +421,7 @@ def __init__(
             client_class=AzureOpenAI,
             client_args=client_args,
             pricing_func=tracking.get_pricing_openai,
+            log_probs=log_probs,
         )
 
 
@@ -419,8 +435,9 @@ def __init__(
         temperature: Optional[int] = 1e-1,
         max_new_tokens: Optional[int] = 512,
         n_retry_server: Optional[int] = 4,
+        log_probs: Optional[bool] = False,
     ):
-        super().__init__(model_name, base_model_name, n_retry_server)
+        super().__init__(model_name, base_model_name, n_retry_server, log_probs)
         if temperature < 1e-3:
             logging.warning("Models might behave weirdly when temperature is too low.")
         self.temperature = temperature
@@ -429,7 +446,7 @@ def __init__(
             token = os.environ["TGI_TOKEN"]
 
         client = InferenceClient(model=model_url, token=token)
-        self.llm = partial(client.text_generation, max_new_tokens=max_new_tokens)
+        self.llm = partial(client.text_generation, max_new_tokens=max_new_tokens, details=log_probs)
 
 
 class VLLMChatModel(ChatModel):
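
With the flag threaded through, a single-sample call returns an AIMessage that also carries the log probabilities; the n_samples > 1 branch is unchanged and does not attach them. A hedged sketch, assuming the OpenAI-backed ChatModel subclass shown above is named OpenAIChatModel and that OPENAI_API_KEY is set in the environment:

    from agentlab.llm.chat_api import OpenAIChatModel  # assumed class name

    llm = OpenAIChatModel(model_name="gpt-4o-mini", max_tokens=64, log_probs=True)
    answer = llm([{"role": "user", "content": "Say hello."}])

    print(answer["content"])        # generated text, as before
    if llm.log_probs:
        print(answer["log_probs"])  # populated from completion.choices[0].log_probs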

src/agentlab/llm/huggingface_utils.py

Lines changed: 7 additions & 2 deletions
@@ -40,9 +40,10 @@ class HFBaseChatModel(AbstractChatModel):
         description="The number of times to retry the server if it fails to respond",
     )
 
-    def __init__(self, model_name, base_model_name, n_retry_server):
+    def __init__(self, model_name, base_model_name, n_retry_server, log_probs):
         super().__init__()
         self.n_retry_server = n_retry_server
+        self.log_probs = log_probs
 
         if base_model_name is None:
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -100,7 +101,11 @@ def __call__(
         while True:
             try:
                 temperature = temperature if temperature is not None else self.temperature
-                response = AIMessage(self.llm(prompt, temperature=temperature))
+                answer = self.llm(prompt, temperature=temperature)
+                response = AIMessage(answer)
+                if self.log_probs:
+                    response["content"] = answer.generated_text
+                    response["log_probs"] = answer.details
                 responses.append(response)
                 break
             except Exception as e:
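
The details=log_probs switch on the TGI client (previous file) is what makes answer.generated_text and answer.details available in __call__ above. A minimal sketch of the underlying huggingface_hub call, with a hypothetical endpoint URL:

    from huggingface_hub import InferenceClient

    client = InferenceClient(model="http://localhost:8080")  # hypothetical TGI endpoint
    out = client.text_generation("Say hello.", max_new_tokens=16, details=True)

    print(out.generated_text)             # what ends up in response["content"]
    print(out.details.tokens[0].logprob)  # per-token log-probabilities behind response["log_probs"]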
